fix: voice interrupt
This commit is contained in:
@ -1,5 +1,6 @@
|
||||
import asyncio
|
||||
import base64
|
||||
import contextlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@ -44,29 +45,40 @@ CHAT_MODE_AGENT_NAMES = {
|
||||
CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0"))
|
||||
AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
|
||||
WS_PORT = 8080
|
||||
WS_MAX_QUEUE = int(os.getenv("BRIDGE_WS_MAX_QUEUE", "128"))
|
||||
WS_MAX_SIZE = int(os.getenv("BRIDGE_WS_MAX_SIZE", str(8 * 1024 * 1024)))
|
||||
AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower()
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||||
VISION_FRAME_SAVE_DIR = Path(os.getenv("VISION_FRAME_SAVE_DIR", str(PROJECT_ROOT / "vision_frames")))
|
||||
|
||||
INPUT_SAMPLE_RATE = 16000
|
||||
OUTPUT_SAMPLE_RATE = 24000
|
||||
INPUT_FRAME_DURATION_MS = 20
|
||||
INPUT_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * INPUT_FRAME_DURATION_MS // 1000
|
||||
INPUT_MAX_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * 60 // 1000
|
||||
OUTPUT_FRAME_DURATION_MS = 20
|
||||
OUTPUT_SAMPLES_PER_OPUS_FRAME = OUTPUT_SAMPLE_RATE * OUTPUT_FRAME_DURATION_MS // 1000
|
||||
TTS_IDLE_TIMEOUT_SECONDS = 0.25
|
||||
INPUT_SAMPLE_RATE = int(os.getenv("BRIDGE_INPUT_SAMPLE_RATE", "16000"))
|
||||
OUTPUT_SAMPLE_RATE = int(os.getenv("BRIDGE_OUTPUT_SAMPLE_RATE", "24000"))
|
||||
INPUT_FRAME_DURATION_MS = int(os.getenv("BRIDGE_INPUT_FRAME_DURATION_MS", "60"))
|
||||
INPUT_MAX_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * 120 // 1000
|
||||
OUTPUT_FRAME_DURATION_MS = int(os.getenv("BRIDGE_OUTPUT_FRAME_DURATION_MS", "60"))
|
||||
AUDIO_STATS_INTERVAL_SECONDS = float(os.getenv("BRIDGE_AUDIO_STATS_INTERVAL_SECONDS", "5.0"))
|
||||
DOWNLINK_SEND_GAP_WARN_MS = float(os.getenv("BRIDGE_DOWNLINK_SEND_GAP_WARN_MS", "180.0"))
|
||||
UPLINK_CAPTURE_TIMEOUT_SECONDS = float(os.getenv("BRIDGE_UPLINK_CAPTURE_TIMEOUT_SECONDS", "0.25"))
|
||||
TTS_IDLE_TIMEOUT_SECONDS = float(os.getenv("TTS_IDLE_TIMEOUT_SECONDS", "1.2"))
|
||||
TTS_MIN_ACTIVE_SECONDS = float(os.getenv("TTS_MIN_ACTIVE_SECONDS", "1.0"))
|
||||
TTS_SILENCE_PEAK_THRESHOLD = 96
|
||||
TTS_PRE_ROLL_MS = 80
|
||||
TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1
|
||||
TTS_PRE_ROLL_MS = int(os.getenv("TTS_PRE_ROLL_MS", "480"))
|
||||
TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = int(os.getenv("TTS_START_CONSECUTIVE_AUDIBLE_FRAMES", "1"))
|
||||
TTS_INTERRUPT_SILENCE_FRAMES = 3
|
||||
INTERRUPT_TOPIC = "lk.interrupt"
|
||||
VISION_FRAME_TOPIC = "vision.frame"
|
||||
AGENT_STATE_ATTRIBUTE = "lk.agent.state"
|
||||
TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;;"
|
||||
TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
|
||||
TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
|
||||
TTS_DISPLAY_SCROLL_GAP = " "
|
||||
TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8"))
|
||||
TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS = float(
|
||||
os.getenv("TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS", "0.25")
|
||||
)
|
||||
TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS = float(
|
||||
os.getenv("TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS", "8.0")
|
||||
)
|
||||
EMOTION_TEXT_PATTERN = re.compile(
|
||||
r"^\s*<?\s*emotion\s*=\s*([^\s>,,;;]+)\s*>?[\s,,;;]*(.*)$",
|
||||
re.DOTALL,
|
||||
@ -95,6 +107,7 @@ class DeviceSession:
|
||||
agent_ready: asyncio.Event
|
||||
forwarding_tracks: dict[str, asyncio.Task[Any]] = field(default_factory=dict)
|
||||
tts_active: bool = False
|
||||
tts_thinking: bool = False
|
||||
tts_idle_task: Optional[asyncio.Task] = None
|
||||
tts_display_task: Optional[asyncio.Task] = None
|
||||
tts_stream_id: int = 0
|
||||
@ -103,6 +116,11 @@ class DeviceSession:
|
||||
tts_display_final: bool = False
|
||||
tts_emotion: str = ""
|
||||
tts_suppressed_until: float = 0.0
|
||||
tts_started_at: float = 0.0
|
||||
tts_last_audible_at: float = 0.0
|
||||
tts_waiting_for_user_audio_after_interrupt: bool = False
|
||||
last_interrupt_time: float = 0.0
|
||||
last_uplink_audible_time: float = 0.0
|
||||
agent_dispatch_task: Optional[asyncio.Task] = None
|
||||
closed: bool = False
|
||||
captured_frame_count: int = 0
|
||||
@ -138,6 +156,100 @@ class ESP32LiveKitBridge:
|
||||
if formatted_tb.strip():
|
||||
print(formatted_tb.rstrip())
|
||||
|
||||
def _audio_duration_ms(self, sample_count: int, sample_rate: int) -> float:
    """Return how many milliseconds of audio ``sample_count`` samples represent.

    Args:
        sample_count: Number of audio samples.
        sample_rate: Samples per second.

    Returns:
        ``sample_count * 1000.0 / sample_rate``, or ``0.0`` when
        ``sample_rate`` is zero or negative.
    """
    # Guard against division by zero (and meaningless negative rates).
    if sample_rate <= 0:
        return 0.0
    return sample_count * 1000.0 / sample_rate
|
||||
|
||||
def _build_server_hello(self, session: DeviceSession) -> dict[str, Any]:
    """Build the server ``hello`` message for ``session``.

    Args:
        session: Device session whose ``room_name`` and ``identity`` are
            reported back to the client.

    Returns:
        A dict with the message type, the transport name, the session's room
        and identity, and the audio parameters taken from the module-level
        ``OUTPUT_SAMPLE_RATE`` / ``OUTPUT_FRAME_DURATION_MS`` settings.
    """
    return {
        "type": "hello",
        "transport": "websocket",
        "session": {
            "room": session.room_name,
            "identity": session.identity,
        },
        "audio_params": {
            "format": "opus",
            "sample_rate": OUTPUT_SAMPLE_RATE,
            "channels": 1,
            "frame_duration": OUTPUT_FRAME_DURATION_MS,
        },
    }
|
||||
|
||||
def _log_client_hello(self, session: DeviceSession, message: dict[str, Any]) -> None:
    """Log the audio parameters announced in a client ``hello`` message.

    Prints the announced format, sample rate, channel count and frame
    duration, then prints a warning for each value that differs from what
    the bridge is configured for (``INPUT_SAMPLE_RATE``, mono,
    ``INPUT_FRAME_DURATION_MS``). Nothing is rejected or changed; this is
    diagnostics only.

    Args:
        session: Device session; only ``device_id`` is read, for the log line.
        message: Decoded client message. Ignored unless its ``audio_params``
            entry is a dict.
    """
    audio_params = message.get("audio_params")
    if not isinstance(audio_params, dict):
        return

    sample_rate = audio_params.get("sample_rate")
    frame_duration = audio_params.get("frame_duration")
    channels = audio_params.get("channels")
    fmt = audio_params.get("format")
    print(
        "[client-audio] "
        f"device={session.device_id} format={fmt} sample_rate={sample_rate} "
        f"channels={channels} frame_duration={frame_duration}"
    )

    # Missing keys come back as None and therefore also trigger the warnings.
    if sample_rate != INPUT_SAMPLE_RATE or channels != 1:
        print(
            "[client-audio] warning: bridge uplink decode expects "
            f"{INPUT_SAMPLE_RATE}Hz mono, got {sample_rate}Hz channels={channels}"
        )
    if frame_duration != INPUT_FRAME_DURATION_MS:
        print(
            "[client-audio] warning: bridge expects "
            f"{INPUT_FRAME_DURATION_MS}ms uplink frames, got {frame_duration}ms"
        )
|
||||
|
||||
def _track_room_connect_task(
    self,
    session: DeviceSession,
    task: asyncio.Task[Any],
) -> None:
    """Inspect a finished room-connect task and react to its failure.

    Intended for use as a task done-callback. A cancelled task is ignored.
    If the task raised, the exception is logged through
    ``self._log_exception`` and, when the session still has a websocket,
    closing it with code 1011 is scheduled.

    Args:
        session: Device session the connect task belongs to.
        task: The completed task to inspect.
    """
    if task.cancelled():
        return
    try:
        # result() re-raises whatever exception the task ended with.
        task.result()
    except Exception as exc:
        self._log_exception(
            f"LiveKit 房间连接后台任务失败: room={session.room_name}",
            exc,
        )
        websocket = session.websocket
        if websocket is not None:
            # close() is a coroutine and this callback is synchronous, so the
            # close is scheduled as a task. NOTE(review): the task reference
            # is not retained anywhere.
            asyncio.create_task(websocket.close(code=1011, reason="livekit connect failed"))
|
||||
|
||||
async def _capture_mic_frame(
    self,
    session: DeviceSession,
    pcm_bytes: bytes,
    num_samples: int,
) -> bool:
    """Push one decoded uplink frame into the session's mic source.

    The capture is bounded by ``UPLINK_CAPTURE_TIMEOUT_SECONDS``; on timeout
    the frame is dropped and a warning is printed.

    Args:
        session: Device session providing ``mic_source`` and ``device_id``.
        pcm_bytes: Raw PCM payload for the frame (presumably 16-bit mono at
            ``INPUT_SAMPLE_RATE`` — verify against the decoder).
        num_samples: Samples per channel contained in ``pcm_bytes``.

    Returns:
        ``True`` if the frame was captured, ``False`` if it timed out and
        was dropped.
    """
    try:
        frame = AudioFrame(pcm_bytes, INPUT_SAMPLE_RATE, 1, num_samples)
    except TypeError:
        # Fallback when the positional constructor is rejected: allocate an
        # empty frame and copy the PCM bytes into its buffer.
        frame = AudioFrame.create(
            sample_rate=INPUT_SAMPLE_RATE,
            num_channels=1,
            samples_per_channel=num_samples,
        )
        memoryview(frame.data).cast("B")[:] = pcm_bytes

    try:
        await asyncio.wait_for(
            session.mic_source.capture_frame(frame),
            timeout=UPLINK_CAPTURE_TIMEOUT_SECONDS,
        )
        return True
    except asyncio.TimeoutError:
        print(
            "[uplink] warning: capture_frame timeout, dropping frame "
            f"device={session.device_id} samples={num_samples}"
        )
        return False
|
||||
|
||||
def _build_device_id(self, websocket: Any) -> str:
|
||||
headers = websocket.request.headers
|
||||
requested_id = headers.get("X-Device-Id") or headers.get("Device-Id")
|
||||
@ -445,7 +557,7 @@ class ESP32LiveKitBridge:
|
||||
print("收到 vision frame,但 image 字段为空")
|
||||
return
|
||||
|
||||
saved_path = self._save_vision_frame(session, image)
|
||||
saved_path = await asyncio.to_thread(self._save_vision_frame, session, image)
|
||||
if saved_path is None:
|
||||
return
|
||||
print(f"已保存 vision frame: {saved_path}")
|
||||
@ -599,6 +711,10 @@ class ESP32LiveKitBridge:
|
||||
if session.tts_active:
|
||||
print("跳过 tts start,当前已处于激活状态")
|
||||
return
|
||||
block_reason = self._tts_resume_block_reason(session, include_user_quiet=False)
|
||||
if block_reason is not None:
|
||||
print(f"跳过 tts start,打断后仍在等待稳定聆听: {block_reason}")
|
||||
return
|
||||
if time.monotonic() < session.tts_suppressed_until:
|
||||
print("跳过 tts start,中断后的残留音频仍在抑制窗口内")
|
||||
return
|
||||
@ -607,16 +723,83 @@ class ESP32LiveKitBridge:
|
||||
session.tts_display_final = False
|
||||
session.tts_emotion = ""
|
||||
self._cancel_tts_display_task(session)
|
||||
now = time.monotonic()
|
||||
session.tts_started_at = now
|
||||
session.tts_last_audible_at = now
|
||||
await self._send_tts_state(session, "start")
|
||||
session.tts_active = True
|
||||
session.tts_thinking = False
|
||||
|
||||
async def _start_thinking(self, session: DeviceSession) -> None:
    """Send the ``thinking`` TTS state to the device, unless it must be skipped.

    The state is skipped (with a log line) when TTS is already active, when
    the session is already marked as thinking, when
    ``_tts_resume_block_reason`` reports a post-interrupt block, or while the
    post-interrupt suppression window (``session.tts_suppressed_until``) is
    still open. Otherwise ``"thinking"`` is sent and ``session.tts_thinking``
    is set to ``True``.

    Args:
        session: Device session whose TTS state is inspected and updated.
    """
    if session.tts_active:
        print("跳过 tts thinking,当前已处于 TTS 播放状态")
        return
    if session.tts_thinking:
        print("跳过 tts thinking,当前已处于思考状态")
        return
    block_reason = self._tts_resume_block_reason(session, include_user_quiet=False)
    if block_reason is not None:
        print(f"跳过 tts thinking,打断后仍在等待稳定聆听: {block_reason}")
        return
    if time.monotonic() < session.tts_suppressed_until:
        print("跳过 tts thinking,中断后的残留音频仍在抑制窗口内")
        return
    await self._send_tts_state(session, "thinking")
    # The flag is only set once the state message has been sent.
    session.tts_thinking = True
|
||||
|
||||
def _tts_resume_block_reason(
    self,
    session: DeviceSession,
    now: Optional[float] = None,
    *,
    include_user_quiet: bool = True,
) -> Optional[str]:
    """Explain why TTS must not resume after an interrupt, if it must not.

    Args:
        session: Device session holding the interrupt / uplink timestamps.
        now: Current ``time.monotonic()`` value; read from the clock when
            omitted.
        include_user_quiet: When ``True``, also block while the user's last
            audible uplink frame is more recent than
            ``TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS``.

    Returns:
        A short reason string when TTS is blocked, otherwise ``None``.
    """
    if now is None:
        now = time.monotonic()

    # Still waiting for the first audible user frame after the interrupt.
    if session.tts_waiting_for_user_audio_after_interrupt:
        return "waiting_for_user_audio_after_interrupt"

    # No interrupt has been recorded on this session.
    if session.last_interrupt_time <= 0.0:
        return None

    # The post-interrupt listen window has already elapsed.
    since_interrupt = now - session.last_interrupt_time
    if since_interrupt > TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS:
        return None

    # The last audible uplink frame predates the interrupt.
    if session.last_uplink_audible_time < session.last_interrupt_time:
        return None

    if not include_user_quiet:
        return None

    # Block until the user has been quiet for at least the grace period.
    quiet_for = now - session.last_uplink_audible_time
    if quiet_for < TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS:
        return f"user_audio_quiet_for={quiet_for:.2f}s"

    return None
|
||||
|
||||
def _handle_agent_state(self, session: DeviceSession, participant: rtc.Participant) -> None:
    """React to the agent state published in a participant's attributes.

    Reads the ``AGENT_STATE_ATTRIBUTE`` attribute, logs it, and schedules
    ``_start_thinking`` when the state is ``"thinking"``. Missing, empty or
    non-string values are ignored.

    Args:
        session: Device session the participant belongs to.
        participant: Participant whose ``attributes`` mapping is inspected.
    """
    state = participant.attributes.get(AGENT_STATE_ATTRIBUTE)
    if not isinstance(state, str) or not state:
        return

    print(
        f"[agent-state] room={session.room_name} identity={participant.identity} state={state}"
    )
    if state == "thinking":
        # This method is synchronous, so the coroutine is scheduled as a task;
        # asyncio.create_task requires a running event loop.
        asyncio.create_task(self._start_thinking(session))
|
||||
|
||||
async def _stop_tts(self, session: DeviceSession) -> None:
|
||||
if not session.tts_active:
|
||||
if not session.tts_active and not session.tts_thinking:
|
||||
print("跳过 tts stop,当前未激活")
|
||||
return
|
||||
self._cancel_tts_display_task(session)
|
||||
await self._send_tts_state(session, "stop")
|
||||
session.tts_active = False
|
||||
session.tts_thinking = False
|
||||
session.tts_started_at = 0.0
|
||||
session.tts_last_audible_at = 0.0
|
||||
session.tts_transcript_text = ""
|
||||
session.tts_display_text = ""
|
||||
session.tts_display_final = False
|
||||
@ -628,6 +811,9 @@ class ESP32LiveKitBridge:
|
||||
session.tts_idle_task.cancel()
|
||||
session.tts_idle_task = None
|
||||
session.tts_active = False
|
||||
session.tts_thinking = False
|
||||
session.tts_started_at = 0.0
|
||||
session.tts_last_audible_at = 0.0
|
||||
session.tts_transcript_text = ""
|
||||
session.tts_display_text = ""
|
||||
session.tts_display_final = False
|
||||
@ -637,12 +823,16 @@ class ESP32LiveKitBridge:
|
||||
|
||||
async def _abort_tts(self, session: DeviceSession, reason: str = "client_abort") -> None:
|
||||
print(f"收到打断请求,停止当前 TTS: device={session.device_id} reason={reason}")
|
||||
now = time.monotonic()
|
||||
session.tts_stream_id += 1
|
||||
session.tts_suppressed_until = time.monotonic() + TTS_INTERRUPT_SUPPRESS_SECONDS
|
||||
session.last_interrupt_time = now
|
||||
session.tts_suppressed_until = now + TTS_INTERRUPT_SUPPRESS_SECONDS
|
||||
session.tts_waiting_for_user_audio_after_interrupt = True
|
||||
await self._force_stop_tts(session, reason)
|
||||
asyncio.create_task(self._send_agent_interrupt(session, reason))
|
||||
|
||||
def _reset_tts_idle_timer(self, session: DeviceSession) -> None:
|
||||
session.tts_last_audible_at = time.monotonic()
|
||||
if session.tts_idle_task is not None:
|
||||
session.tts_idle_task.cancel()
|
||||
session.tts_idle_task = asyncio.create_task(
|
||||
@ -651,11 +841,28 @@ class ESP32LiveKitBridge:
|
||||
|
||||
async def _tts_idle_watchdog(self, session: DeviceSession, stream_id: int) -> None:
|
||||
try:
|
||||
await asyncio.sleep(TTS_IDLE_TIMEOUT_SECONDS)
|
||||
if stream_id != session.tts_stream_id:
|
||||
while True:
|
||||
await asyncio.sleep(TTS_IDLE_TIMEOUT_SECONDS)
|
||||
if stream_id != session.tts_stream_id or not session.tts_active:
|
||||
return
|
||||
|
||||
now = time.monotonic()
|
||||
idle_for = now - session.tts_last_audible_at
|
||||
active_for = now - session.tts_started_at
|
||||
remaining = max(
|
||||
TTS_IDLE_TIMEOUT_SECONDS - idle_for,
|
||||
TTS_MIN_ACTIVE_SECONDS - active_for,
|
||||
)
|
||||
if remaining > 0:
|
||||
await asyncio.sleep(remaining)
|
||||
continue
|
||||
|
||||
print(
|
||||
"TTS 静音达到阈值,切回聆听状态: "
|
||||
f"idle={idle_for:.2f}s active={active_for:.2f}s"
|
||||
)
|
||||
await self._stop_tts(session)
|
||||
return
|
||||
print(f"TTS 空闲超过 {TTS_IDLE_TIMEOUT_SECONDS}s,切回聆听状态")
|
||||
await self._stop_tts(session)
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
@ -799,6 +1006,7 @@ class ESP32LiveKitBridge:
|
||||
if self._is_agent_participant(participant, session.agent_name):
|
||||
session.agent_ready.set()
|
||||
self._scan_participant_audio_tracks(session, participant, "connected_scan")
|
||||
self._handle_agent_state(session, participant)
|
||||
|
||||
@session.room.on("participant_connected")
|
||||
def on_participant_connected(participant: rtc.RemoteParticipant) -> None:
|
||||
@ -810,6 +1018,7 @@ class ESP32LiveKitBridge:
|
||||
self._scan_participant_audio_tracks(
|
||||
session, participant, "participant_connected_scan"
|
||||
)
|
||||
self._handle_agent_state(session, participant)
|
||||
|
||||
@session.room.on("participant_disconnected")
|
||||
def on_participant_disconnected(participant: rtc.RemoteParticipant) -> None:
|
||||
@ -820,6 +1029,16 @@ class ESP32LiveKitBridge:
|
||||
if not track_sid.endswith(f":{participant.identity}")
|
||||
}
|
||||
|
||||
@session.room.on("participant_attributes_changed")
|
||||
def on_participant_attributes_changed(changed: list[str], participant: rtc.Participant) -> None:
|
||||
if AGENT_STATE_ATTRIBUTE not in changed:
|
||||
return
|
||||
if not isinstance(participant, rtc.RemoteParticipant):
|
||||
return
|
||||
if not self._is_agent_participant(participant, session.agent_name):
|
||||
return
|
||||
self._handle_agent_state(session, participant)
|
||||
|
||||
@session.room.on("data_received")
|
||||
def on_data_received(data_packet: rtc.DataPacket) -> None:
|
||||
identity = data_packet.participant.identity if data_packet.participant else "未知"
|
||||
@ -865,6 +1084,7 @@ class ESP32LiveKitBridge:
|
||||
if not segment.final:
|
||||
continue
|
||||
display_text = segment.text
|
||||
asyncio.create_task(self._start_thinking(session))
|
||||
|
||||
if session.websocket is not None:
|
||||
ws = session.websocket
|
||||
@ -967,9 +1187,21 @@ class ESP32LiveKitBridge:
|
||||
|
||||
async def start(self) -> None:
|
||||
print(f"[config] websocket_port={WS_PORT}")
|
||||
print(f"[config] websocket_max_queue={WS_MAX_QUEUE} websocket_max_size={WS_MAX_SIZE}")
|
||||
print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}")
|
||||
print(f"[config] token_url={TOKEN_URL}")
|
||||
print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}")
|
||||
print(
|
||||
"[config] audio="
|
||||
f"uplink_decode:{INPUT_SAMPLE_RATE}Hz/{INPUT_FRAME_DURATION_MS}ms "
|
||||
f"downlink_encode:{OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms "
|
||||
f"stats_interval:{AUDIO_STATS_INTERVAL_SECONDS}s "
|
||||
f"capture_timeout:{UPLINK_CAPTURE_TIMEOUT_SECONDS}s "
|
||||
f"tts_idle:{TTS_IDLE_TIMEOUT_SECONDS}s "
|
||||
f"tts_min_active:{TTS_MIN_ACTIVE_SECONDS}s "
|
||||
f"tts_start_frames:{TTS_START_CONSECUTIVE_AUDIBLE_FRAMES} "
|
||||
f"tts_pre_roll:{TTS_PRE_ROLL_MS}ms"
|
||||
)
|
||||
print(
|
||||
"[config] agents="
|
||||
f"normal:{CHAT_MODE_AGENT_NAMES['normal']} "
|
||||
@ -996,6 +1228,7 @@ class ESP32LiveKitBridge:
|
||||
session.websocket = None
|
||||
session.agent_ready.set()
|
||||
session.tts_active = False
|
||||
session.tts_thinking = False
|
||||
session.tts_stream_id += 1
|
||||
if session.tts_idle_task is not None:
|
||||
session.tts_idle_task.cancel()
|
||||
@ -1016,13 +1249,20 @@ class ESP32LiveKitBridge:
|
||||
pending_pcm = bytearray()
|
||||
pre_roll_pcm = bytearray()
|
||||
pre_roll_max_bytes = OUTPUT_SAMPLE_RATE * TTS_PRE_ROLL_MS // 1000 * 2
|
||||
output_samples_per_opus_frame = OUTPUT_SAMPLE_RATE * OUTPUT_FRAME_DURATION_MS // 1000
|
||||
output_frame_bytes = output_samples_per_opus_frame * 2
|
||||
audible_frame_streak = 0
|
||||
silence_frame_streak = 0
|
||||
waiting_for_post_interrupt_silence = False
|
||||
downlink_packets = 0
|
||||
downlink_audio_ms = 0.0
|
||||
last_downlink_stats_time = time.monotonic()
|
||||
last_send_time: Optional[float] = None
|
||||
stream_id = session.tts_stream_id
|
||||
print(
|
||||
f"启动 TTS 转发: device={session.device_id} room={session.room_name} "
|
||||
f"track_sid={track_sid} stream_id={stream_id}"
|
||||
f"track_sid={track_sid} stream_id={stream_id} "
|
||||
f"opus={OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms"
|
||||
)
|
||||
|
||||
try:
|
||||
@ -1043,7 +1283,8 @@ class ESP32LiveKitBridge:
|
||||
pcm_data = frame.data.tobytes()
|
||||
has_audible_audio = self._has_audible_audio(pcm_data)
|
||||
|
||||
if time.monotonic() < session.tts_suppressed_until:
|
||||
now = time.monotonic()
|
||||
if now < session.tts_suppressed_until:
|
||||
pending_pcm.clear()
|
||||
pre_roll_pcm.clear()
|
||||
audible_frame_streak = 0
|
||||
@ -1051,6 +1292,31 @@ class ESP32LiveKitBridge:
|
||||
waiting_for_post_interrupt_silence = True
|
||||
continue
|
||||
|
||||
block_reason = self._tts_resume_block_reason(
|
||||
session,
|
||||
now,
|
||||
include_user_quiet=False,
|
||||
)
|
||||
if block_reason is not None:
|
||||
pending_pcm.clear()
|
||||
pre_roll_pcm.clear()
|
||||
audible_frame_streak = 0
|
||||
silence_frame_streak = 0
|
||||
if block_reason == "waiting_for_user_audio_after_interrupt":
|
||||
waiting_for_post_interrupt_silence = True
|
||||
continue
|
||||
|
||||
if (
|
||||
waiting_for_post_interrupt_silence
|
||||
and session.last_interrupt_time > 0.0
|
||||
and session.last_uplink_audible_time >= session.last_interrupt_time
|
||||
and now - session.last_uplink_audible_time
|
||||
>= TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS
|
||||
):
|
||||
print("检测到用户打断后语音已结束,允许新 TTS 直接起播")
|
||||
waiting_for_post_interrupt_silence = False
|
||||
silence_frame_streak = 0
|
||||
|
||||
if waiting_for_post_interrupt_silence:
|
||||
if has_audible_audio:
|
||||
silence_frame_streak = 0
|
||||
@ -1098,20 +1364,42 @@ class ESP32LiveKitBridge:
|
||||
|
||||
if not current_frame_buffered:
|
||||
pending_pcm.extend(pcm_data)
|
||||
frame_bytes = OUTPUT_SAMPLES_PER_OPUS_FRAME * 2
|
||||
|
||||
while (
|
||||
len(pending_pcm) >= frame_bytes
|
||||
len(pending_pcm) >= output_frame_bytes
|
||||
and stream_id == session.tts_stream_id
|
||||
and session.websocket is not None
|
||||
):
|
||||
try:
|
||||
now = time.monotonic()
|
||||
if last_send_time is not None:
|
||||
send_gap_ms = (now - last_send_time) * 1000.0
|
||||
if send_gap_ms > DOWNLINK_SEND_GAP_WARN_MS:
|
||||
print(
|
||||
"[downlink] warning: send gap "
|
||||
f"{send_gap_ms:.1f}ms device={session.device_id} "
|
||||
f"pending_ms={self._audio_duration_ms(len(pending_pcm) // 2, OUTPUT_SAMPLE_RATE):.1f}"
|
||||
)
|
||||
last_send_time = now
|
||||
|
||||
opus_packet = encoder.encode(
|
||||
bytes(pending_pcm[:frame_bytes]),
|
||||
OUTPUT_SAMPLES_PER_OPUS_FRAME,
|
||||
bytes(pending_pcm[:output_frame_bytes]),
|
||||
output_samples_per_opus_frame,
|
||||
)
|
||||
del pending_pcm[:frame_bytes]
|
||||
del pending_pcm[:output_frame_bytes]
|
||||
await session.websocket.send(self._wrap_opus_payload(session, opus_packet))
|
||||
downlink_packets += 1
|
||||
downlink_audio_ms += OUTPUT_FRAME_DURATION_MS
|
||||
if now - last_downlink_stats_time >= AUDIO_STATS_INTERVAL_SECONDS:
|
||||
print(
|
||||
"[downlink] "
|
||||
f"device={session.device_id} packets={downlink_packets} "
|
||||
f"audio_ms={downlink_audio_ms:.0f} "
|
||||
f"pending_ms={self._audio_duration_ms(len(pending_pcm) // 2, OUTPUT_SAMPLE_RATE):.1f}"
|
||||
)
|
||||
downlink_packets = 0
|
||||
downlink_audio_ms = 0.0
|
||||
last_downlink_stats_time = now
|
||||
except Exception as exc:
|
||||
print(f"发送回 ESP32 失败: {exc}")
|
||||
break
|
||||
@ -1171,27 +1459,26 @@ class ESP32LiveKitBridge:
|
||||
)
|
||||
session.tts_stream_id += 1
|
||||
opus_decoder = None
|
||||
uplink_packets = 0
|
||||
uplink_audio_ms = 0.0
|
||||
uplink_decode_errors = 0
|
||||
uplink_dropped_frames = 0
|
||||
last_uplink_stats_time = time.monotonic()
|
||||
room_connect_task: Optional[asyncio.Task[Any]] = None
|
||||
|
||||
try:
|
||||
hello_msg = {
|
||||
"type": "hello",
|
||||
"transport": "websocket",
|
||||
"session": {
|
||||
"room": session.room_name,
|
||||
"identity": session.identity,
|
||||
},
|
||||
"audio_params": {
|
||||
"format": "opus",
|
||||
"sample_rate": OUTPUT_SAMPLE_RATE,
|
||||
"channels": 1,
|
||||
"frame_duration": OUTPUT_FRAME_DURATION_MS,
|
||||
},
|
||||
}
|
||||
hello_msg = self._build_server_hello(session)
|
||||
await websocket.send(json.dumps(hello_msg))
|
||||
print(f"已发送 server hello: device={device_id} room={session.room_name}")
|
||||
print(
|
||||
f"已发送 server hello: device={device_id} room={session.room_name} "
|
||||
f"audio={OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms"
|
||||
)
|
||||
asyncio.create_task(self._run_emotion_test_sequence(session))
|
||||
|
||||
await self._connect_session_room(session)
|
||||
room_connect_task = asyncio.create_task(self._connect_session_room(session))
|
||||
room_connect_task.add_done_callback(
|
||||
lambda task: self._track_room_connect_task(session, task)
|
||||
)
|
||||
|
||||
async for message in websocket:
|
||||
if isinstance(message, bytes):
|
||||
@ -1214,6 +1501,16 @@ class ESP32LiveKitBridge:
|
||||
if num_samples > 0:
|
||||
session.captured_frame_count += 1
|
||||
now = time.monotonic()
|
||||
uplink_packets += 1
|
||||
uplink_audio_ms += self._audio_duration_ms(num_samples, INPUT_SAMPLE_RATE)
|
||||
if self._has_audible_audio(pcm_bytes):
|
||||
session.last_uplink_audible_time = now
|
||||
if session.tts_waiting_for_user_audio_after_interrupt:
|
||||
session.tts_waiting_for_user_audio_after_interrupt = False
|
||||
print(
|
||||
f"[uplink] detected user audio after interrupt: "
|
||||
f"device={session.device_id}"
|
||||
)
|
||||
if (
|
||||
session.captured_frame_count <= 5
|
||||
or now - session.first_capture_log_time >= 5.0
|
||||
@ -1224,25 +1521,32 @@ class ESP32LiveKitBridge:
|
||||
# f"bytes={len(pcm_bytes)} samples={num_samples} "
|
||||
# f"room={session.room_name}"
|
||||
# )
|
||||
try:
|
||||
frame = AudioFrame(pcm_bytes, INPUT_SAMPLE_RATE, 1, num_samples)
|
||||
await session.mic_source.capture_frame(frame)
|
||||
except TypeError:
|
||||
frame = AudioFrame.create(
|
||||
sample_rate=INPUT_SAMPLE_RATE,
|
||||
num_channels=1,
|
||||
samples_per_channel=num_samples,
|
||||
if now - last_uplink_stats_time >= AUDIO_STATS_INTERVAL_SECONDS:
|
||||
print(
|
||||
"[uplink] "
|
||||
f"device={session.device_id} packets={uplink_packets} "
|
||||
f"audio_ms={uplink_audio_ms:.0f} "
|
||||
f"decode_errors={uplink_decode_errors} "
|
||||
f"dropped_frames={uplink_dropped_frames}"
|
||||
)
|
||||
memoryview(frame.data).cast("B")[:] = pcm_bytes
|
||||
await session.mic_source.capture_frame(frame)
|
||||
uplink_packets = 0
|
||||
uplink_audio_ms = 0.0
|
||||
uplink_decode_errors = 0
|
||||
uplink_dropped_frames = 0
|
||||
last_uplink_stats_time = now
|
||||
if not await self._capture_mic_frame(session, pcm_bytes, num_samples):
|
||||
uplink_dropped_frames += 1
|
||||
except Exception as exc:
|
||||
uplink_decode_errors += 1
|
||||
print(f"Opus audio decode error ({len(message)} bytes): {exc}")
|
||||
elif isinstance(message, str):
|
||||
try:
|
||||
data = json.loads(message)
|
||||
# print(f"收到 ESP32 JSON 消息: {data}")
|
||||
msg_type = data.get("type")
|
||||
if msg_type == "abort":
|
||||
if msg_type == "hello":
|
||||
self._log_client_hello(session, data)
|
||||
elif msg_type == "abort":
|
||||
reason = data.get("reason")
|
||||
abort_reason = reason if isinstance(reason, str) and reason else "button_abort"
|
||||
print(f"处理 ESP32 打断请求: reason={abort_reason}")
|
||||
@ -1257,6 +1561,10 @@ class ESP32LiveKitBridge:
|
||||
self._log_exception("WebSocket 其他错误", exc)
|
||||
finally:
|
||||
print(f"ESP32 断开连接: device={device_id} room={session.room_name}")
|
||||
if room_connect_task is not None and not room_connect_task.done():
|
||||
room_connect_task.cancel()
|
||||
with contextlib.suppress(asyncio.CancelledError):
|
||||
await room_connect_task
|
||||
await self._close_session(session)
|
||||
self.device_sessions.pop(device_id, None)
|
||||
|
||||
@ -1265,7 +1573,13 @@ async def main() -> None:
|
||||
bridge = ESP32LiveKitBridge()
|
||||
try:
|
||||
await bridge.start()
|
||||
async with websockets.serve(bridge.handle_websocket, "0.0.0.0", WS_PORT):
|
||||
async with websockets.serve(
|
||||
bridge.handle_websocket,
|
||||
"0.0.0.0",
|
||||
WS_PORT,
|
||||
max_queue=WS_MAX_QUEUE,
|
||||
max_size=WS_MAX_SIZE,
|
||||
):
|
||||
print(f"WebSocket 服务器运行在端口 {WS_PORT},等待 ESP32 连接...")
|
||||
await asyncio.Future()
|
||||
finally:
|
||||
|
||||
Reference in New Issue
Block a user