fix: voice interrupt

2026-05-22 10:10:16 +08:00
parent 61ad9dafd9
commit 5223333418
1 changed files with 30 additions and 7 deletions
--- a/main/bridge_server.py
+++ b/main/bridge_server.py
@ -49,6 +49,7 @@ TTS_DISPLAY_SENTENCE_BREAKS = "。！？!?；;"
 TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
 TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
 TTS_DISPLAY_SCROLL_GAP = "   "
+TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8"))


@dataclass
@ -69,6 +70,7 @@ class DeviceSession:
    tts_transcript_text: str = ""
    tts_display_text: str = ""
    tts_display_final: bool = False
+    tts_suppressed_until: float = 0.0
    agent_dispatch_task: Optional[asyncio.Task] = None
    closed: bool = False
    captured_frame_count: int = 0
@ -431,15 +433,24 @@ class ESP32LiveKitBridge:
        session.tts_display_text = ""
        session.tts_display_final = False

-    async def _abort_tts(self, session: DeviceSession, reason: str = "client_abort") -> None:
-        print(f"收到打断请求，停止当前 TTS: device={session.device_id} reason={reason}")
-        session.tts_stream_id += 1
+    async def _force_stop_tts(self, session: DeviceSession, reason: str) -> None:
+        # Stop TTS locally for this session: call _cancel_tts_display_task, cancel
+        # and clear the idle task, reset the session's TTS fields, then send the
+        # "stop" TTS state. `reason` is only used in the log line at the end.
+        self._cancel_tts_display_task(session)
        if session.tts_idle_task is not None:
            session.tts_idle_task.cancel()
            session.tts_idle_task = None
-        self._cancel_tts_display_task(session)
-        await self._send_agent_interrupt(session, reason)
-        await self._stop_tts(session)
+        # Reset the TTS flags and text on the session before announcing the stop.
+        session.tts_active = False
+        session.tts_transcript_text = ""
+        session.tts_display_text = ""
+        session.tts_display_final = False
+        await self._send_tts_state(session, "stop")
+        print(f"已强制停止本地 TTS: device={session.device_id} reason={reason}")
+
+    async def _abort_tts(self, session: DeviceSession, reason: str = "client_abort") -> None:
+        """Handle an interrupt request: stop local TTS now, notify the agent in the background.
+
+        Bumps ``session.tts_stream_id``, sets ``session.tts_suppressed_until`` to
+        ``TTS_INTERRUPT_SUPPRESS_SECONDS`` from now, force-stops local TTS, and
+        then schedules ``_send_agent_interrupt`` without awaiting it.
+
+        Args:
+            session: Device session whose TTS is being interrupted.
+            reason: Label used in the log lines and passed to the agent interrupt.
+        """
+        print(f"收到打断请求，停止当前 TTS: device={session.device_id} reason={reason}")
+        session.tts_stream_id += 1
+        session.tts_suppressed_until = time.monotonic() + TTS_INTERRUPT_SUPPRESS_SECONDS
+        await self._force_stop_tts(session, reason)
+
+        # The event loop keeps only weak references to tasks, so hold a strong one
+        # until the interrupt has been sent; otherwise the task may be collected
+        # before it finishes.
+        interrupt_tasks = getattr(self, "_interrupt_tasks", None)
+        if interrupt_tasks is None:
+            interrupt_tasks = set()
+            self._interrupt_tasks = interrupt_tasks
+        interrupt_task = asyncio.create_task(self._send_agent_interrupt(session, reason))
+        interrupt_tasks.add(interrupt_task)
+
+        def _on_interrupt_done(task: asyncio.Task) -> None:
+            # Release the reference and report failures that nobody awaits.
+            interrupt_tasks.discard(task)
+            if task.cancelled():
+                return
+            error = task.exception()
+            if error is not None:
+                print(f"发送打断信号失败: device={session.device_id} reason={reason} error={error!r}")
+
+        interrupt_task.add_done_callback(_on_interrupt_done)

    def _reset_tts_idle_timer(self, session: DeviceSession) -> None:
        if session.tts_idle_task is not None:
@ -642,6 +653,8 @@ class ESP32LiveKitBridge:
                status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果"
                print(f"🗣️  [{status} | room={session.room_name} | {identity}]: {segment.text}")
                if is_agent:
+                    if time.monotonic() < session.tts_suppressed_until:
+                        continue
                    display_text = self._current_tts_display_text(segment.text)
                    if not display_text or display_text == session.tts_transcript_text:
                        continue
@ -799,7 +812,9 @@ class ESP32LiveKitBridge:
                    pre_roll_pcm.clear()
                    audible_frame_streak = 0
                    silence_frame_streak = 0
-                    waiting_for_post_interrupt_silence = True
+                    waiting_for_post_interrupt_silence = (
+                        time.monotonic() >= session.tts_suppressed_until
+                    )
                    stream_id = session.tts_stream_id
                    if session.tts_active:
                        await self._stop_tts(session)
@ -809,6 +824,14 @@ class ESP32LiveKitBridge:
                pcm_data = frame.data.tobytes()
                has_audible_audio = self._has_audible_audio(pcm_data)

+                if time.monotonic() < session.tts_suppressed_until:
+                    pending_pcm.clear()
+                    pre_roll_pcm.clear()
+                    audible_frame_streak = 0
+                    silence_frame_streak = 0
+                    waiting_for_post_interrupt_silence = False
+                    continue
+
                if waiting_for_post_interrupt_silence:
                    if has_audible_audio:
                        silence_frame_streak = 0