feat: beaver

This commit is contained in:
0Xiao0
2026-06-04 15:48:10 +08:00
parent b92e6e1b07
commit 9637e09aef
5 changed files with 163 additions and 26 deletions

View File

@@ -29,7 +29,18 @@ TOKEN_URL = "http://172.19.0.240:8000/getToken"
LIVEKIT_WS_URL = "ws://172.19.0.240:7880"
ROOM_PREFIX = "test-livekit"
IDENTITY_PREFIX = "uv-livekit"
AGENT_NAME = "my-agent"
LEGACY_AGENT_NAME = os.getenv("LIVEKIT_AGENT_NAME", "normal-agent")
DEFAULT_AGENT_MODE = os.getenv("LIVEKIT_DEFAULT_AGENT_MODE", "normal").strip().lower()
AGENT_NAMES = {
"normal": os.getenv("LIVEKIT_NORMAL_AGENT_NAME", LEGACY_AGENT_NAME),
"beaver": os.getenv("LIVEKIT_BEAVER_AGENT_NAME", "beaver-agent"),
}
CHAT_MODE_AGENT_NAMES = {
"normal": AGENT_NAMES["normal"],
"beaver": AGENT_NAMES["beaver"],
"vision-normal": os.getenv("LIVEKIT_VISION_NORMAL_AGENT_NAME", "vision-normal-agent"),
"vision-beaver": os.getenv("LIVEKIT_VISION_BEAVER_AGENT_NAME", "vision-beaver-agent"),
}
CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0"))
AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
WS_PORT = 8080
@@ -75,6 +86,10 @@ class DeviceSession:
protocol_version: int
room_name: str
identity: str
chat_mode: str
agent_mode: str
agent_name: str
vision_enabled: bool
room: rtc.Room
mic_source: AudioSource
agent_ready: asyncio.Event
@@ -94,10 +109,10 @@ class DeviceSession:
first_capture_log_time: float = 0.0
async def fetch_token(room_name: str, identity: str) -> str:
async def fetch_token(room_name: str, identity: str, agent_name: str) -> str:
params = {"room": room_name, "identity": identity}
if AGENT_DISPATCH_MODE == "token":
params["agent_name"] = AGENT_NAME
params["agent_name"] = agent_name
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
response = await client.get(TOKEN_URL, params=params)
@@ -137,15 +152,61 @@ class ESP32LiveKitBridge:
print(f"[session] device={device_id} room={room_name} identity={identity}")
return room_name, identity
def _is_agent_participant(self, participant: rtc.RemoteParticipant) -> bool:
def _resolve_agent_selection(self, headers: Any) -> tuple[str, str, str, bool]:
    """Resolve which agent a newly connected device should talk to.

    The decision is driven by request headers, in this order of precedence:

    1. ``Chat-Mode`` / ``X-Chat-Mode``: when it names a known chat mode it
       fixes both the agent mode and whether vision is enabled.
    2. Otherwise ``Agent-Mode`` / ``X-Agent-Mode``, then
       ``DEFAULT_AGENT_MODE``, then ``"normal"``; vision is disabled on
       this path and the chat mode mirrors the agent mode.
    3. ``Agent-Name`` / ``X-Agent-Name``: when present it overrides the
       agent name outright and the agent mode is reported as ``"custom"``.

    Header values are stripped, and blank or whitespace-only values are
    treated as absent, so an empty header can neither be selected as an
    agent name nor shadow its ``X-`` alias or the configured default.

    Args:
        headers: Mapping-like object exposing ``get(name)`` that returns
            the header value or ``None``.

    Returns:
        A ``(chat_mode, agent_mode, agent_name, vision_enabled)`` tuple.
    """

    def first_header(*names: str) -> str:
        # Return the first non-blank, stripped value among ``names``.
        for name in names:
            value = (headers.get(name) or "").strip()
            if value:
                return value
        return ""

    # Chat mode -> (agent mode, vision enabled).
    chat_mode_to_agent = {
        "normal": ("normal", False),
        "beaver": ("beaver", False),
        "vision-normal": ("normal", True),
        "vision-beaver": ("beaver", True),
    }

    requested_chat_mode = first_header("Chat-Mode", "X-Chat-Mode").lower()
    if requested_chat_mode in chat_mode_to_agent:
        requested_mode, vision_enabled = chat_mode_to_agent[requested_chat_mode]
    else:
        if requested_chat_mode:
            print(f"未知 Chat-Mode={requested_chat_mode!r},回退到 Agent-Mode")
        requested_mode = (
            first_header("Agent-Mode", "X-Agent-Mode")
            or DEFAULT_AGENT_MODE
            or "normal"
        ).strip().lower()
        vision_enabled = False
        requested_chat_mode = requested_mode

    # An explicit agent name wins over any mode-based lookup. The mode is
    # not validated on this path because the name is used verbatim.
    requested_name = first_header("Agent-Name", "X-Agent-Name")
    if requested_name:
        return requested_chat_mode, "custom", requested_name, vision_enabled

    if requested_mode not in AGENT_NAMES:
        print(f"未知 Agent-Mode={requested_mode!r},回退到 normal")
        requested_mode = "normal"
        requested_chat_mode = "vision-normal" if vision_enabled else "normal"

    if requested_chat_mode in CHAT_MODE_AGENT_NAMES:
        return (
            requested_chat_mode,
            requested_mode,
            CHAT_MODE_AGENT_NAMES[requested_chat_mode],
            vision_enabled,
        )
    # Defensive fallback in case a chat mode has no dedicated agent name.
    return requested_chat_mode, requested_mode, AGENT_NAMES[requested_mode], vision_enabled
def _is_agent_participant(self, participant: rtc.RemoteParticipant, agent_name: str) -> bool:
identity = getattr(participant, "identity", "") or ""
return identity.startswith("agent-") or AGENT_NAME in identity
return identity.startswith("agent-") or agent_name in identity
def _get_agent_identities(self, session: DeviceSession) -> list[str]:
return [
participant.identity
for participant in session.room.remote_participants.values()
if self._is_agent_participant(participant)
if self._is_agent_participant(participant, session.agent_name)
]
def _log_agent_participants(self, session: DeviceSession, source: str) -> None:
@@ -187,7 +248,7 @@ class ESP32LiveKitBridge:
dispatch_agent_name = getattr(dispatch, "agent_name", None)
dispatch_room = getattr(dispatch, "room", None)
if dispatch_room == session.room_name and (
dispatch_agent_name == AGENT_NAME or dispatch_agent_name is None
dispatch_agent_name == session.agent_name or dispatch_agent_name is None
):
print(
f"检测到已有 dispatch: room={session.room_name} "
@@ -210,7 +271,7 @@ class ESP32LiveKitBridge:
return
for participant in session.room.remote_participants.values():
if self._is_agent_participant(participant):
if self._is_agent_participant(participant, session.agent_name):
# print(f"Agent 已在房间中,跳过 dispatch: {participant.identity}")
return
@@ -222,7 +283,7 @@ class ESP32LiveKitBridge:
await session.agent_dispatch_task
async def _dispatch_agent(self, session: DeviceSession) -> None:
print(f"准备 dispatch agent: room={session.room_name}, agent={AGENT_NAME}")
print(f"准备 dispatch agent: room={session.room_name}, agent={session.agent_name}")
try:
if await self._dispatch_agent_with_sdk(session):
@@ -252,13 +313,17 @@ class ESP32LiveKitBridge:
try:
dispatch = await lkapi.agent_dispatch.create_dispatch(
livekit_api.CreateAgentDispatchRequest(
agent_name=AGENT_NAME,
agent_name=session.agent_name,
room=session.room_name,
metadata=json.dumps(
{
"source": "bridge_server",
"identity": session.identity,
"device_id": session.device_id,
"chat_mode": session.chat_mode,
"agent_mode": session.agent_mode,
"agent_name": session.agent_name,
"vision_enabled": session.vision_enabled,
}
),
)
@@ -281,13 +346,17 @@ class ESP32LiveKitBridge:
"--room",
session.room_name,
"--agent-name",
AGENT_NAME,
session.agent_name,
"--metadata",
json.dumps(
{
"source": "bridge_server",
"identity": session.identity,
"device_id": session.device_id,
"chat_mode": session.chat_mode,
"agent_mode": session.agent_mode,
"agent_name": session.agent_name,
"vision_enabled": session.vision_enabled,
}
),
stdout=asyncio.subprocess.PIPE,
@@ -306,7 +375,7 @@ class ESP32LiveKitBridge:
print(f"lk dispatch create 失败,退出码: {process.returncode}")
return False
print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={AGENT_NAME}")
print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={session.agent_name}")
return True
async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool:
@@ -727,16 +796,16 @@ class ESP32LiveKitBridge:
print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}")
self._log_agent_participants(session, "connected")
for participant in session.room.remote_participants.values():
if self._is_agent_participant(participant):
if self._is_agent_participant(participant, session.agent_name):
session.agent_ready.set()
self._scan_participant_audio_tracks(session, participant, "connected_scan")
@session.room.on("participant_connected")
def on_participant_connected(participant: rtc.RemoteParticipant) -> None:
role = "Agent" if self._is_agent_participant(participant) else "Remote participant"
role = "Agent" if self._is_agent_participant(participant, session.agent_name) else "Remote participant"
print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}")
self._log_agent_participants(session, "participant_connected")
if self._is_agent_participant(participant):
if self._is_agent_participant(participant, session.agent_name):
session.agent_ready.set()
self._scan_participant_audio_tracks(
session, participant, "participant_connected_scan"
@@ -769,7 +838,9 @@ class ESP32LiveKitBridge:
track_pub: rtc.TrackPublication,
) -> None:
identity = participant.identity if participant else "未知"
is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(participant)
is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(
participant, session.agent_name
)
for segment in segments:
status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果"
print(f"🗣️ [{status} | room={session.room_name} | {identity}]: {segment.text}")
@@ -841,7 +912,7 @@ class ESP32LiveKitBridge:
# print(f"[config] token_url={TOKEN_URL}")
# print(f"[config] room={session.room_name} identity={session.identity}")
# print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}")
token = await fetch_token(session.room_name, session.identity)
token = await fetch_token(session.room_name, session.identity, session.agent_name)
try:
await session.room.connect(
@@ -886,8 +957,12 @@ class ESP32LiveKitBridge:
# print(f"等待 agent 加入: room={session.room_name}")
try:
await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS)
if session.closed:
return
# print(f"✅ agent 已就绪: room={session.room_name}")
except asyncio.TimeoutError:
if session.closed:
return
print(f"⚠️ agent 等待超时: room={session.room_name}")
async def start(self) -> None:
@@ -895,6 +970,14 @@ class ESP32LiveKitBridge:
print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}")
print(f"[config] token_url={TOKEN_URL}")
print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}")
print(
"[config] agents="
f"normal:{CHAT_MODE_AGENT_NAMES['normal']} "
f"beaver:{CHAT_MODE_AGENT_NAMES['beaver']} "
f"vision-normal:{CHAT_MODE_AGENT_NAMES['vision-normal']} "
f"vision-beaver:{CHAT_MODE_AGENT_NAMES['vision-beaver']} "
f"default_mode:{DEFAULT_AGENT_MODE}"
)
if EMOTION_TEST_SEQUENCE:
print(
"[config] emotion_test_sequence="
@@ -911,6 +994,7 @@ class ESP32LiveKitBridge:
return
session.closed = True
session.websocket = None
session.agent_ready.set()
session.tts_active = False
session.tts_stream_id += 1
if session.tts_idle_task is not None:
@@ -1060,6 +1144,7 @@ class ESP32LiveKitBridge:
await self._close_session(existing_session)
self.device_sessions.pop(device_id, None)
chat_mode, agent_mode, agent_name, vision_enabled = self._resolve_agent_selection(websocket.request.headers)
room_name, identity = self._build_session_names(device_id)
session = DeviceSession(
device_id=device_id,
@@ -1067,6 +1152,10 @@ class ESP32LiveKitBridge:
protocol_version=protocol_version,
room_name=room_name,
identity=identity,
chat_mode=chat_mode,
agent_mode=agent_mode,
agent_name=agent_name,
vision_enabled=vision_enabled,
room=rtc.Room(),
mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1),
agent_ready=asyncio.Event(),
@@ -1075,6 +1164,11 @@ class ESP32LiveKitBridge:
print(f"ESP32 已连接: device={device_id}")
print(f"ESP32 协议版本: {session.protocol_version}")
print(
f"ESP32 mode: chat={session.chat_mode} "
f"agent={session.agent_mode}/{session.agent_name} "
f"vision={session.vision_enabled}"
)
session.tts_stream_id += 1
opus_decoder = None