feat: support camera capture to livekit

This commit is contained in:
0Xiao0
2026-05-25 17:21:11 +08:00
parent 4953244c7c
commit fc6302661d
12 changed files with 314 additions and 78 deletions

View File

@ -1,4 +1,5 @@
import asyncio
import base64
import json
import os
import shutil
@ -8,6 +9,7 @@ import time
import traceback
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Optional
import httpx
@ -31,6 +33,8 @@ CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20
# Float read from LIVEKIT_AGENT_READY_TIMEOUT_SECONDS (default "10.0");
# presumably how long to wait for the agent to become ready — confirm at use site.
AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
# NOTE(review): presumably the port the bridge's WebSocket server listens on — confirm.
WS_PORT = 8080
# Value of AGENT_DISPATCH_MODE, lower-cased; defaults to "token".
AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower()
# Parent of the directory that contains this (symlink-resolved) file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Directory where _save_vision_frame writes decoded frames. Overridable via the
# VISION_FRAME_SAVE_DIR environment variable; defaults to PROJECT_ROOT/"vision_frames".
VISION_FRAME_SAVE_DIR = Path(os.getenv("VISION_FRAME_SAVE_DIR", str(PROJECT_ROOT / "vision_frames")))
# NOTE(review): sample rates are presumably in Hz — confirm against the audio pipeline.
INPUT_SAMPLE_RATE = 16000
OUTPUT_SAMPLE_RATE = 24000
@ -45,6 +49,7 @@ TTS_PRE_ROLL_MS = 80
# NOTE(review): the two frame-count thresholds below are presumably used by the
# TTS start/interrupt detection logic, which is not visible here — confirm.
TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1
TTS_INTERRUPT_SILENCE_FRAMES = 3
# NOTE(review): presumably the data topic used for interrupt messages — confirm at use site.
INTERRUPT_TOPIC = "lk.interrupt"
# Topic passed to publish_data (and echoed in the JSON payload) by _publish_vision_frame.
VISION_FRAME_TOPIC = "vision.frame"
# Characters treated as sentence breaks for the TTS display text (per the name;
# the consuming code is not visible here).
TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;"
# Integer read from TTS_DISPLAY_SCROLL_WIDTH (default "18").
TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
# Float read from TTS_DISPLAY_SCROLL_INTERVAL_SECONDS (default "0.18").
TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
@ -336,6 +341,73 @@ class ESP32LiveKitBridge:
if not ok:
print("警告: bridge 已停止 TTS但 agent 侧 interrupt 未确认送出")
def _save_vision_frame(self, session: DeviceSession, image: str) -> Optional[Path]:
    """Decode a base64-encoded image and write it under ``VISION_FRAME_SAVE_DIR``.

    The file is named ``<timestamp_ms>_<sanitized device id>.jpg``; every
    character of ``session.device_id`` that is not alphanumeric, ``-`` or
    ``_`` is replaced with ``_`` so the id cannot introduce path separators.

    Args:
        session: Device session; only ``session.device_id`` is read.
        image: Base64 text of the image bytes.

    Returns:
        The path of the written file, or ``None`` when the payload is not
        valid base64 or the file could not be written. Both failures are
        logged rather than raised.
    """
    try:
        image_bytes = base64.b64decode(image, validate=True)
    except (ValueError, TypeError) as exc:
        # binascii.Error is a ValueError subclass; non-ASCII text also raises
        # ValueError, and a non-str/bytes argument raises TypeError.
        print(f"vision frame base64 解码失败: {exc}")
        return None

    safe_device_id = "".join(
        char if char.isalnum() or char in ("-", "_") else "_"
        for char in session.device_id
    )
    timestamp_ms = int(time.time() * 1000)
    path = VISION_FRAME_SAVE_DIR / f"{timestamp_ms}_{safe_device_id}.jpg"

    try:
        VISION_FRAME_SAVE_DIR.mkdir(parents=True, exist_ok=True)
        path.write_bytes(image_bytes)
    except OSError as exc:
        # A full disk or a permission problem must not propagate into the
        # caller's message loop; report it through the None return instead.
        print(f"vision frame 保存失败: {exc}")
        return None
    return path
async def _publish_vision_frame(self, session: DeviceSession, message: dict[str, Any]) -> None:
    """Persist a vision frame to disk and forward it over the room's data channel.

    The base64 image in ``message["image"]`` is saved via
    ``_save_vision_frame`` and then published as a JSON payload through
    ``session.room.local_participant.publish_data``. When
    ``_get_agent_identities`` returns a non-empty value it is passed as
    ``destination_identities``; otherwise no destination is given (logged
    as ``broadcast``).

    Nothing is published when the image field is missing or empty, when
    saving returns ``None``, or when the room has no local participant.
    Publish errors are logged, not raised.

    Args:
        session: Device session providing the room, identity and device id.
        message: Parsed message from the device; reads ``image`` and the
            optional ``mime_type`` (defaults to ``image/jpeg``).
    """
    image = message.get("image")
    if not isinstance(image, str) or not image:
        print("收到 vision frame但 image 字段为空")
        return

    # Base64 decoding and the file write are blocking; run them in a worker
    # thread so the event loop can keep servicing other coroutines meanwhile.
    saved_path = await asyncio.to_thread(self._save_vision_frame, session, image)
    if saved_path is None:
        return
    print(f"已保存 vision frame: {saved_path}")

    participant = getattr(session.room, "local_participant", None)
    if participant is None:
        print("跳过发送 vision framelocal participant 尚未就绪")
        return

    payload = {
        "type": "vision_frame",
        "topic": VISION_FRAME_TOPIC,
        "room": session.room_name,
        "identity": session.identity,
        "device_id": session.device_id,
        "mime_type": message.get("mime_type", "image/jpeg"),
        "image": image,
        "saved_path": str(saved_path),
    }
    data = json.dumps(payload).encode("utf-8")

    agent_identities = self._get_agent_identities(session)
    kwargs: dict[str, Any] = {}
    if agent_identities:
        kwargs["destination_identities"] = agent_identities

    # First try with the ``topic`` keyword; if publish_data rejects the call
    # with TypeError (signature mismatch), retry once without it.
    last_error: Optional[Exception] = None
    for attempt in ({"topic": VISION_FRAME_TOPIC, **kwargs}, kwargs):
        try:
            await participant.publish_data(data, **attempt)
        except TypeError as exc:
            last_error = exc
            continue
        except Exception as exc:
            print(f"发送 vision frame 失败: {exc}")
            return
        print(
            f"已发送 vision frame: bytes={len(data)} "
            f"targets={agent_identities or 'broadcast'}"
        )
        return

    # Reaching this point means both attempts raised TypeError.
    print(f"发送 vision frame 失败publish_data 签名不兼容: {last_error}")
async def _send_tts_state(self, session: DeviceSession, state: str) -> None:
if session.websocket is None:
print(f"跳过 tts {state}ESP32 尚未连接")
@ -939,14 +1011,12 @@ class ESP32LiveKitBridge:
)
self.device_sessions[device_id] = session
# print(f"ESP32 已连接: device={device_id}")
# print(f"ESP32 协议版本: {session.protocol_version}")
print(f"ESP32 已连接: device={device_id}")
print(f"ESP32 协议版本: {session.protocol_version}")
session.tts_stream_id += 1
opus_decoder = None
try:
await self._connect_session_room(session)
hello_msg = {
"type": "hello",
"transport": "websocket",
@ -962,6 +1032,9 @@ class ESP32LiveKitBridge:
},
}
await websocket.send(json.dumps(hello_msg))
print(f"已发送 server hello: device={device_id} room={session.room_name}")
await self._connect_session_room(session)
async for message in websocket:
if isinstance(message, bytes):
@ -1017,6 +1090,8 @@ class ESP32LiveKitBridge:
abort_reason = reason if isinstance(reason, str) and reason else "button_abort"
print(f"处理 ESP32 打断请求: reason={abort_reason}")
await self._abort_tts(session, abort_reason)
elif msg_type == "vision" and data.get("state") == "frame":
await self._publish_vision_frame(session, data)
except json.JSONDecodeError:
print(f"收到未知的字符消息: {message}")
except ConnectionClosedError as exc: