fix: server errors on device disconnect

2026-06-17 15:19:31 +08:00
parent 60df0fe196
commit 66d318774a
1 changed files with 297 additions and 57 deletions
--- a/main/bridge_server.py
+++ b/main/bridge_server.py
@ -1,6 +1,5 @@
 import asyncio
 import base64
-import contextlib
 import json
 import os
 import re
@ -12,7 +11,7 @@ import traceback
 import uuid
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Coroutine, Optional

 import httpx
 import opuslib
@ -123,6 +122,9 @@ class DeviceSession:
    last_interrupt_time: float = 0.0
    last_uplink_audible_time: float = 0.0
    agent_dispatch_task: Optional[asyncio.Task] = None
+    room_connect_task: Optional[asyncio.Task] = None
+    background_tasks: set[asyncio.Task[Any]] = field(default_factory=set)
+    forwarding_track_participants: dict[str, str] = field(default_factory=dict)
    closed: bool = False
    captured_frame_count: int = 0
    first_capture_log_time: float = 0.0
@ -209,11 +211,15 @@ class ESP32LiveKitBridge:
        session: DeviceSession,
        task: asyncio.Task[Any],
    ) -> None:
+        if session.room_connect_task is task:
+            session.room_connect_task = None
        if task.cancelled():
            return
        try:
            task.result()
        except Exception as exc:
+            if session.closed:
+                return
            self._log_exception(
                f"LiveKit 房间连接后台任务失败: room={session.room_name}",
                exc,
@ -222,6 +228,48 @@ class ESP32LiveKitBridge:
            if websocket is not None:
                asyncio.create_task(websocket.close(code=1011, reason="livekit connect failed"))

+    def _create_session_task(
+        self,
+        session: DeviceSession,
+        coroutine: Coroutine[Any, Any, Any],
+        description: str,
+    ) -> Optional[asyncio.Task[Any]]:
+        if session.closed:
+            coroutine.close()
+            return None
+
+        task = asyncio.create_task(coroutine)
+        session.background_tasks.add(task)
+        task.add_done_callback(
+            lambda done_task: self._handle_session_task_done(
+                session,
+                done_task,
+                description,
+            )
+        )
+        return task
+
+    def _handle_session_task_done(
+        self,
+        session: DeviceSession,
+        task: asyncio.Task[Any],
+        description: str,
+    ) -> None:
+        session.background_tasks.discard(task)
+        if task.cancelled():
+            return
+        try:
+            task.result()
+        except Exception as exc:
+            if not session.closed:
+                self._log_exception(f"{description} 失败: room={session.room_name}", exc)
+
+    async def _disconnect_room_quietly(self, session: DeviceSession, reason: str) -> None:
+        try:
+            await session.room.disconnect()
+        except Exception as exc:
+            print(f"断开 LiveKit 房间失败: room={session.room_name} reason={reason} error={exc}")
+
    async def _capture_mic_frame(
        self,
        session: DeviceSession,
@ -523,6 +571,8 @@ class ESP32LiveKitBridge:
        return False

    async def _send_agent_interrupt(self, session: DeviceSession, reason: str) -> None:
+        if session.closed:
+            return
        payload = {
            "type": "interrupt",
            "topic": INTERRUPT_TOPIC,
@ -553,6 +603,8 @@ class ESP32LiveKitBridge:
        return path

    async def _publish_vision_frame(self, session: DeviceSession, message: dict[str, Any]) -> None:
+        if session.closed:
+            return
        image = message.get("image")
        if not isinstance(image, str) or not image:
            print("收到 vision frame，但 image 字段为空")
@ -603,6 +655,8 @@ class ESP32LiveKitBridge:
            print(f"发送 vision frame 失败，publish_data 签名不兼容: {last_error}")

    async def _publish_mcp_message(self, session: DeviceSession, message: dict[str, Any]) -> None:
+        if session.closed:
+            return
        payload = message.get("payload")
        if not isinstance(payload, dict):
            print(f"收到 ESP32 MCP 消息但缺少 payload: {message}")
@ -652,6 +706,8 @@ class ESP32LiveKitBridge:
        *,
        source_identity: str,
    ) -> None:
+        if session.closed:
+            return
        if session.websocket is None:
            print("跳过 MCP 请求，ESP32 尚未连接")
            return
@ -671,6 +727,8 @@ class ESP32LiveKitBridge:
        )

    async def _send_tts_state(self, session: DeviceSession, state: str) -> None:
+        if session.closed:
+            return
        if session.websocket is None:
            print(f"跳过 tts {state}，ESP32 尚未连接")
            return
@ -678,6 +736,8 @@ class ESP32LiveKitBridge:
        print(f"已发送 tts {state}: device={session.device_id}")

    async def _send_emotion(self, session: DeviceSession, emotion: str) -> None:
+        if session.closed:
+            return
        if session.websocket is None:
            print(f"跳过 emotion {emotion}，ESP32 尚未连接")
            return
@ -703,6 +763,8 @@ class ESP32LiveKitBridge:
            await self._send_emotion(session, emotion)

    async def _send_tts_text(self, session: DeviceSession, text: str, final: bool) -> None:
+        if session.closed:
+            return
        if session.websocket is None:
            return
        raw_text = text
@ -733,12 +795,18 @@ class ESP32LiveKitBridge:

        if len(text) <= TTS_DISPLAY_SCROLL_WIDTH:
            self._cancel_tts_display_task(session)
-            asyncio.create_task(self._send_tts_text(session, text, final))
+            self._create_session_task(
+                session,
+                self._send_tts_text(session, text, final),
+                "发送 TTS 字幕",
+            )
            return

        if session.tts_display_task is None or session.tts_display_task.done():
-            session.tts_display_task = asyncio.create_task(
-                self._scroll_tts_display_text(session, session.tts_stream_id)
+            session.tts_display_task = self._create_session_task(
+                session,
+                self._scroll_tts_display_text(session, session.tts_stream_id),
+                "滚动 TTS 字幕",
            )

    async def _scroll_tts_display_text(self, session: DeviceSession, stream_id: int) -> None:
@ -857,7 +925,11 @@ class ESP32LiveKitBridge:
            f"[agent-state] room={session.room_name} identity={participant.identity} state={state}"
        )
        if state == "thinking":
-            asyncio.create_task(self._start_thinking(session))
+            self._create_session_task(
+                session,
+                self._start_thinking(session),
+                "处理 agent thinking 状态",
+            )

    async def _stop_tts(self, session: DeviceSession) -> None:
        if not session.tts_active and not session.tts_thinking:
@ -898,14 +970,20 @@ class ESP32LiveKitBridge:
        session.tts_suppressed_until = now + TTS_INTERRUPT_SUPPRESS_SECONDS
        session.tts_waiting_for_user_audio_after_interrupt = True
        await self._force_stop_tts(session, reason)
-        asyncio.create_task(self._send_agent_interrupt(session, reason))
+        self._create_session_task(
+            session,
+            self._send_agent_interrupt(session, reason),
+            "发送 agent interrupt",
+        )

    def _reset_tts_idle_timer(self, session: DeviceSession) -> None:
        session.tts_last_audible_at = time.monotonic()
        if session.tts_idle_task is not None:
            session.tts_idle_task.cancel()
-        session.tts_idle_task = asyncio.create_task(
-            self._tts_idle_watchdog(session, session.tts_stream_id)
+        session.tts_idle_task = self._create_session_task(
+            session,
+            self._tts_idle_watchdog(session, session.tts_stream_id),
+            "TTS idle watchdog",
        )

    async def _tts_idle_watchdog(self, session: DeviceSession, stream_id: int) -> None:
@ -959,6 +1037,8 @@ class ESP32LiveKitBridge:
        participant: rtc.RemoteParticipant,
        source: str,
    ) -> None:
+        if session.closed or session.websocket is None:
+            return
        if track.kind != rtc.TrackKind.KIND_AUDIO:
            return

@ -974,6 +1054,7 @@ class ESP32LiveKitBridge:
        if existing_task is not None and existing_task.done():
            print(f"检测到已结束的音频转发任务，重新创建: sid={track_sid}")
            session.forwarding_tracks.pop(track_sid, None)
+            session.forwarding_track_participants.pop(track_sid, None)

        task = asyncio.create_task(
            self.forward_audio_to_esp32(
@ -983,17 +1064,37 @@ class ESP32LiveKitBridge:
            )
        )
        session.forwarding_tracks[track_sid] = task
+        session.forwarding_track_participants[track_sid] = participant.identity
        print(
            f"收到音频流: {participant.identity} sid={track_sid} "
            f"source={source} room={session.room_name}"
        )

+    def _cancel_forwarding_tracks(
+        self,
+        session: DeviceSession,
+        participant_identity: Optional[str] = None,
+    ) -> list[asyncio.Task[Any]]:
+        cancelled: list[asyncio.Task[Any]] = []
+        for track_sid, task in list(session.forwarding_tracks.items()):
+            track_participant = session.forwarding_track_participants.get(track_sid)
+            if participant_identity is not None and track_participant != participant_identity:
+                continue
+            session.forwarding_tracks.pop(track_sid, None)
+            session.forwarding_track_participants.pop(track_sid, None)
+            if not task.done():
+                task.cancel()
+                cancelled.append(task)
+        return cancelled
+
    def _scan_participant_audio_tracks(
        self,
        session: DeviceSession,
        participant: rtc.RemoteParticipant,
        source: str,
    ) -> None:
+        if session.closed:
+            return
        publications = getattr(participant, "track_publications", None) or {}
        for publication in publications.values():
            track = getattr(publication, "track", None)
@ -1061,6 +1162,76 @@ class ESP32LiveKitBridge:

        return normalized[start:].strip() or normalized

+    async def _handle_room_connected(self, session: DeviceSession) -> None:
+        if session.closed:
+            return
+        print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}")
+        self._log_agent_participants(session, "connected")
+        for participant in list(session.room.remote_participants.values()):
+            if session.closed:
+                return
+            if self._is_agent_participant(participant, session.agent_name):
+                session.agent_ready.set()
+                self._scan_participant_audio_tracks(session, participant, "connected_scan")
+                self._handle_agent_state(session, participant)
+
+    async def _handle_participant_connected(
+        self,
+        session: DeviceSession,
+        participant: rtc.RemoteParticipant,
+    ) -> None:
+        if session.closed:
+            return
+        role = "Agent" if self._is_agent_participant(participant, session.agent_name) else "Remote participant"
+        print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}")
+        self._log_agent_participants(session, "participant_connected")
+        if self._is_agent_participant(participant, session.agent_name):
+            session.agent_ready.set()
+            self._scan_participant_audio_tracks(
+                session, participant, "participant_connected_scan"
+            )
+            self._handle_agent_state(session, participant)
+
+    async def _handle_participant_attributes_changed(
+        self,
+        session: DeviceSession,
+        changed: list[str],
+        participant: rtc.Participant,
+    ) -> None:
+        if session.closed:
+            return
+        if AGENT_STATE_ATTRIBUTE not in changed:
+            return
+        if not isinstance(participant, rtc.RemoteParticipant):
+            return
+        if not self._is_agent_participant(participant, session.agent_name):
+            return
+        self._handle_agent_state(session, participant)
+
+    async def _handle_track_subscribed(
+        self,
+        session: DeviceSession,
+        track: rtc.Track,
+        publication: rtc.TrackPublication,
+        participant: rtc.RemoteParticipant,
+        source: str,
+    ) -> None:
+        if session.closed:
+            return
+        self._maybe_forward_remote_audio(session, track, publication, participant, source)
+
+    async def _handle_track_published(
+        self,
+        session: DeviceSession,
+        publication: rtc.RemoteTrackPublication,
+        participant: rtc.RemoteParticipant,
+    ) -> None:
+        if session.closed:
+            return
+        track = getattr(publication, "track", None)
+        if track is not None:
+            self._maybe_forward_remote_audio(session, track, publication, participant, "published")
+
    def _register_room_handlers(self, session: DeviceSession) -> None:
        @session.room.on("connection_state_changed")
        def on_connection_state_changed(state: int) -> None:
@ -1069,44 +1240,32 @@ class ESP32LiveKitBridge:

        @session.room.on("connected")
        def on_connected() -> None:
-            print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}")
-            self._log_agent_participants(session, "connected")
-            for participant in session.room.remote_participants.values():
-                if self._is_agent_participant(participant, session.agent_name):
-                    session.agent_ready.set()
-                    self._scan_participant_audio_tracks(session, participant, "connected_scan")
-                    self._handle_agent_state(session, participant)
+            self._create_session_task(
+                session,
+                self._handle_room_connected(session),
+                "处理 LiveKit connected 事件",
+            )

        @session.room.on("participant_connected")
        def on_participant_connected(participant: rtc.RemoteParticipant) -> None:
-            role = "Agent" if self._is_agent_participant(participant, session.agent_name) else "Remote participant"
-            print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}")
-            self._log_agent_participants(session, "participant_connected")
-            if self._is_agent_participant(participant, session.agent_name):
-                session.agent_ready.set()
-                self._scan_participant_audio_tracks(
-                    session, participant, "participant_connected_scan"
-                )
-                self._handle_agent_state(session, participant)
+            self._create_session_task(
+                session,
+                self._handle_participant_connected(session, participant),
+                "处理 LiveKit participant_connected 事件",
+            )

        @session.room.on("participant_disconnected")
        def on_participant_disconnected(participant: rtc.RemoteParticipant) -> None:
            print(f"👋 远端参与者离开房间: room={session.room_name} identity={participant.identity}")
-            session.forwarding_tracks = {
-                track_sid: task
-                for track_sid, task in session.forwarding_tracks.items()
-                if not track_sid.endswith(f":{participant.identity}")
-            }
+            self._cancel_forwarding_tracks(session, participant.identity)

        @session.room.on("participant_attributes_changed")
        def on_participant_attributes_changed(changed: list[str], participant: rtc.Participant) -> None:
-            if AGENT_STATE_ATTRIBUTE not in changed:
-                return
-            if not isinstance(participant, rtc.RemoteParticipant):
-                return
-            if not self._is_agent_participant(participant, session.agent_name):
-                return
-            self._handle_agent_state(session, participant)
+            self._create_session_task(
+                session,
+                self._handle_participant_attributes_changed(session, changed, participant),
+                "处理 LiveKit participant_attributes_changed 事件",
+            )

        @session.room.on("data_received")
        def on_data_received(data_packet: rtc.DataPacket) -> None:
@ -1133,12 +1292,14 @@ class ESP32LiveKitBridge:
            ):
                mcp_payload = payload.get("payload")
                if isinstance(mcp_payload, dict):
-                    asyncio.create_task(
+                    self._create_session_task(
+                        session,
                        self._forward_mcp_to_device(
                            session,
                            mcp_payload,
                            source_identity=identity,
-                        )
+                        ),
+                        "转发 MCP 到 ESP32",
                    )
                else:
                    print(f"收到 MCP 数据但缺少 payload: {payload}")
@ -1167,7 +1328,11 @@ class ESP32LiveKitBridge:
                    )
                    if emotion and emotion != session.tts_emotion:
                        session.tts_emotion = emotion
-                        asyncio.create_task(self._send_emotion(session, emotion))
+                        self._create_session_task(
+                            session,
+                            self._send_emotion(session, emotion),
+                            "发送 emotion",
+                        )
                    display_text = self._current_tts_display_text(tts_text)
                    print(f"[livekit-llm] display_text={display_text!r} final={segment.final}")
                    if not display_text or display_text == session.tts_transcript_text:
@ -1177,14 +1342,19 @@ class ESP32LiveKitBridge:
                    if not segment.final:
                        continue
                    display_text = segment.text
-                    asyncio.create_task(self._start_thinking(session))
+                    self._create_session_task(
+                        session,
+                        self._start_thinking(session),
+                        "发送 TTS thinking",
+                    )

                if session.websocket is not None:
                    ws = session.websocket
                    if is_agent:
                        self._update_tts_display_text(session, display_text, segment.final)
                    else:
-                        asyncio.create_task(
+                        self._create_session_task(
+                            session,
                            ws.send(
                                json.dumps(
                                    {
@ -1193,7 +1363,8 @@ class ESP32LiveKitBridge:
                                        "final": segment.final,
                                    }
                                )
-                            )
+                            ),
+                            "发送 STT 到 ESP32",
                        )

        @session.room.on("track_subscribed")
@ -1202,7 +1373,11 @@ class ESP32LiveKitBridge:
            publication: rtc.TrackPublication,
            participant: rtc.RemoteParticipant,
        ) -> None:
-            self._maybe_forward_remote_audio(session, track, publication, participant, "event")
+            self._create_session_task(
+                session,
+                self._handle_track_subscribed(session, track, publication, participant, "event"),
+                "处理 LiveKit track_subscribed 事件",
+            )

        @session.room.on("track_published")
        def on_track_published(
@ -1214,18 +1389,25 @@ class ESP32LiveKitBridge:
            #     f"📡 远端音轨已发布: room={session.room_name} identity={participant.identity} "
            #     f"track_sid={track_sid}"
            # )
-            track = getattr(publication, "track", None)
-            if track is not None:
-                self._maybe_forward_remote_audio(session, track, publication, participant, "published")
+            self._create_session_task(
+                session,
+                self._handle_track_published(session, publication, participant),
+                "处理 LiveKit track_published 事件",
+            )

    async def _connect_session_room(self, session: DeviceSession) -> None:
+        if session.closed:
+            return
        self._register_room_handlers(session)
+        connected = False

        # print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}")
        # print(f"[config] token_url={TOKEN_URL}")
        # print(f"[config] room={session.room_name} identity={session.identity}")
        # print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}")
        token = await fetch_token(session.room_name, session.identity, session.agent_name)
+        if session.closed:
+            return

        try:
            await session.room.connect(
@ -1233,7 +1415,10 @@ class ESP32LiveKitBridge:
                token,
                options=rtc.RoomOptions(connect_timeout=CONNECT_TIMEOUT_SECONDS),
            )
+            connected = True
        except Exception as exc:
+            if session.closed:
+                return
            self._log_exception(
                f"连接 LiveKit 房间失败: room={session.room_name}",
                exc,
@ -1245,6 +1430,9 @@ class ESP32LiveKitBridge:
            )
            raise

+        if session.closed:
+            await self._disconnect_room_quietly(session, "session_closed_after_connect")
+            return
        print(f"已连接到 LiveKit 房间: {session.room.name}")
        # print(f"[livekit] local_identity={session.room.local_participant.identity}")
        # print(f"[livekit] local_sid={session.room.local_participant.sid}")
@ -1252,6 +1440,9 @@ class ESP32LiveKitBridge:
        self._log_agent_participants(session, "after_connect")

        await self.ensure_agent_dispatched(session)
+        if session.closed:
+            await self._disconnect_room_quietly(session, "session_closed_after_dispatch")
+            return

        track = rtc.LocalAudioTrack.create_audio_track(
            f"esp32-mic-{session.device_id}",
@ -1259,6 +1450,9 @@ class ESP32LiveKitBridge:
        )
        options = rtc.TrackPublishOptions(source=rtc.TrackSource.SOURCE_MICROPHONE)
        publication = await session.room.local_participant.publish_track(track, options)
+        if session.closed:
+            await self._disconnect_room_quietly(session, "session_closed_after_publish")
+            return
        publication_sid = getattr(publication, "sid", None)
        track_sid = getattr(track, "sid", None)
        # print(
@ -1278,6 +1472,9 @@ class ESP32LiveKitBridge:
                return
            print(f"⚠️ agent 等待超时: room={session.room_name}")

+        if connected and session.closed:
+            await self._disconnect_room_quietly(session, "session_closed_after_agent_wait")
+
    async def start(self) -> None:
        print(f"[config] websocket_port={WS_PORT}")
        print(f"[config] websocket_max_queue={WS_MAX_QUEUE} websocket_max_size={WS_MAX_SIZE}")
@ -1323,14 +1520,47 @@ class ESP32LiveKitBridge:
        session.tts_active = False
        session.tts_thinking = False
        session.tts_stream_id += 1
+        cleanup_tasks: list[asyncio.Task[Any]] = []
+        current_task = asyncio.current_task()
+        room_connect_pending = (
+            session.room_connect_task is not None
+            and not session.room_connect_task.done()
+        )
+
+        if (
+            not room_connect_pending
+            and session.agent_dispatch_task is not None
+            and not session.agent_dispatch_task.done()
+        ):
+            session.agent_dispatch_task.cancel()
+            if session.agent_dispatch_task is not current_task:
+                cleanup_tasks.append(session.agent_dispatch_task)
+            session.agent_dispatch_task = None
+
+        cleanup_tasks.extend(self._cancel_forwarding_tracks(session))
+
        if session.tts_idle_task is not None:
            session.tts_idle_task.cancel()
+            if session.tts_idle_task is not current_task:
+                cleanup_tasks.append(session.tts_idle_task)
            session.tts_idle_task = None
-        self._cancel_tts_display_task(session)
-        try:
-            await session.room.disconnect()
-        except Exception as exc:
-            print(f"断开 LiveKit 房间失败: room={session.room_name} error={exc}")
+        if session.tts_display_task is not None:
+            session.tts_display_task.cancel()
+            if session.tts_display_task is not current_task:
+                cleanup_tasks.append(session.tts_display_task)
+            session.tts_display_task = None
+
+        for task in list(session.background_tasks):
+            if task is current_task or task.done():
+                continue
+            task.cancel()
+            cleanup_tasks.append(task)
+
+        if cleanup_tasks:
+            await asyncio.gather(*cleanup_tasks, return_exceptions=True)
+
+        if not room_connect_pending:
+            await self._disconnect_room_quietly(session, "session_close")

    async def forward_audio_to_esp32(
        self,
@ -1500,11 +1730,20 @@ class ESP32LiveKitBridge:
        except Exception as exc:
            print(f"音频流处理错误: {exc}")
        finally:
+            close_stream = getattr(audio_stream, "aclose", None) or getattr(audio_stream, "close", None)
+            if close_stream is not None:
+                try:
+                    result = close_stream()
+                    if result is not None and hasattr(result, "__await__"):
+                        await result
+                except Exception as exc:
+                    print(f"关闭 LiveKit 音频流失败: {exc}")
            print("🎧 TTS 音频结束")
            task = session.forwarding_tracks.get(track_sid)
            current_task = asyncio.current_task()
            if task is current_task:
                session.forwarding_tracks.pop(track_sid, None)
+                session.forwarding_track_participants.pop(track_sid, None)
            if stream_id == session.tts_stream_id and session.tts_idle_task is not None:
                session.tts_idle_task.cancel()
                session.tts_idle_task = None
@ -1566,9 +1805,14 @@ class ESP32LiveKitBridge:
                f"已发送 server hello: device={device_id} room={session.room_name} "
                f"audio={OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms"
            )
-            asyncio.create_task(self._run_emotion_test_sequence(session))
+            self._create_session_task(
+                session,
+                self._run_emotion_test_sequence(session),
+                "emotion 测试序列",
+            )

            room_connect_task = asyncio.create_task(self._connect_session_room(session))
+            session.room_connect_task = room_connect_task
            room_connect_task.add_done_callback(
                lambda task: self._track_room_connect_task(session, task)
            )
@ -1656,10 +1900,6 @@ class ESP32LiveKitBridge:
            self._log_exception("WebSocket 其他错误", exc)
        finally:
            print(f"ESP32 断开连接: device={device_id} room={session.room_name}")
-            if room_connect_task is not None and not room_connect_task.done():
-                room_connect_task.cancel()
-                with contextlib.suppress(asyncio.CancelledError):
-                    await room_connect_task
            await self._close_session(session)
            self.device_sessions.pop(device_id, None)