diff --git a/main/bridge_server.py b/main/bridge_server.py
index d198992..80c6230 100644
--- a/main/bridge_server.py
+++ b/main/bridge_server.py
@@ -49,6 +49,7 @@ TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;;"
 TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
 TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
 TTS_DISPLAY_SCROLL_GAP = " "
+TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8"))  # seconds
 
 
 @dataclass
@@ -69,6 +70,7 @@ class DeviceSession:
     tts_transcript_text: str = ""
     tts_display_text: str = ""
     tts_display_final: bool = False
+    tts_suppressed_until: float = 0.0  # time.monotonic() deadline set by _abort_tts
     agent_dispatch_task: Optional[asyncio.Task] = None
     closed: bool = False
     captured_frame_count: int = 0
@@ -431,15 +433,58 @@ class ESP32LiveKitBridge:
         session.tts_display_text = ""
         session.tts_display_final = False
 
-    async def _abort_tts(self, session: DeviceSession, reason: str = "client_abort") -> None:
-        print(f"收到打断请求,停止当前 TTS: device={session.device_id} reason={reason}")
-        session.tts_stream_id += 1
+    async def _force_stop_tts(self, session: DeviceSession, reason: str) -> None:
+        """Reset the session's local TTS state and send the "stop" TTS state.
+
+        Does not change `tts_stream_id` and does not send an agent
+        interrupt; `reason` is only used in the log line.
+        """
+        self._cancel_tts_display_task(session)
         if session.tts_idle_task is not None:
             session.tts_idle_task.cancel()
             session.tts_idle_task = None
-        self._cancel_tts_display_task(session)
-        await self._send_agent_interrupt(session, reason)
-        await self._stop_tts(session)
+        session.tts_active = False
+        session.tts_transcript_text = ""
+        session.tts_display_text = ""
+        session.tts_display_final = False
+        await self._send_tts_state(session, "stop")
+        print(f"已强制停止本地 TTS: device={session.device_id} reason={reason}")
+
+    def _track_background_task(self, task: asyncio.Task) -> None:
+        """Keep a strong reference to `task` until it completes.
+
+        The event loop only holds weak references to tasks, so an otherwise
+        unreferenced fire-and-forget task may be garbage-collected mid-run.
+        """
+        # Created lazily so this helper does not depend on __init__.
+        tasks = getattr(self, "_background_tasks", None)
+        if tasks is None:
+            tasks = self._background_tasks = set()
+        tasks.add(task)
+
+        def _on_done(done: asyncio.Task) -> None:
+            tasks.discard(done)
+            # Retrieve the exception so a failure is logged, not silently lost.
+            if not done.cancelled() and done.exception() is not None:
+                print(f"后台任务失败: {done.exception()!r}")
+
+        task.add_done_callback(_on_done)
+
+    async def _abort_tts(self, session: DeviceSession, reason: str = "client_abort") -> None:
+        """Interrupt the current TTS stream for `session`.
+
+        Bumps `tts_stream_id`, sets `tts_suppressed_until` to now plus
+        `TTS_INTERRUPT_SUPPRESS_SECONDS`, force-stops local TTS, then sends
+        the agent interrupt as a tracked background task.
+        """
+        print(f"收到打断请求,停止当前 TTS: device={session.device_id} reason={reason}")
+        session.tts_stream_id += 1
+        session.tts_suppressed_until = time.monotonic() + TTS_INTERRUPT_SUPPRESS_SECONDS
+        await self._force_stop_tts(session, reason)
+        # Not awaited here; the task is tracked so it cannot be collected early.
+        self._track_background_task(
+            asyncio.create_task(self._send_agent_interrupt(session, reason))
+        )
 
     def _reset_tts_idle_timer(self, session: DeviceSession) -> None:
         if session.tts_idle_task is not None:
@@ -642,6 +687,8 @@ class ESP32LiveKitBridge:
             status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果"
             print(f"🗣️ [{status} | room={session.room_name} | {identity}]: {segment.text}")
             if is_agent:
+                if time.monotonic() < session.tts_suppressed_until:
+                    continue  # skip agent segments while the suppression deadline has not passed
                 display_text = self._current_tts_display_text(segment.text)
                 if not display_text or display_text == session.tts_transcript_text:
                     continue
@@ -799,7 +846,9 @@ class ESP32LiveKitBridge:
                 pre_roll_pcm.clear()
                 audible_frame_streak = 0
                 silence_frame_streak = 0
-                waiting_for_post_interrupt_silence = True
+                waiting_for_post_interrupt_silence = (
+                    time.monotonic() >= session.tts_suppressed_until
+                )
                 stream_id = session.tts_stream_id
                 if session.tts_active:
                     await self._stop_tts(session)
@@ -809,6 +858,14 @@ class ESP32LiveKitBridge:
             pcm_data = frame.data.tobytes()
             has_audible_audio = self._has_audible_audio(pcm_data)
 
+            if time.monotonic() < session.tts_suppressed_until:
+                pending_pcm.clear()
+                pre_roll_pcm.clear()
+                audible_frame_streak = 0
+                silence_frame_streak = 0
+                waiting_for_post_interrupt_silence = False
+                continue  # discard this frame while the suppression deadline has not passed
+
             if waiting_for_post_interrupt_silence:
                 if has_audible_audio:
                     silence_frame_streak = 0