From 9637e09aef9d213a7687aac62b7646761e775027 Mon Sep 17 00:00:00 2001 From: 0Xiao0 <511201264@qq.com> Date: Thu, 4 Jun 2026 15:48:10 +0800 Subject: [PATCH] feat: beaver --- main/application.cc | 35 ++++- main/application.h | 12 ++ .../boards/m5stack-core-s3/m5stack_core_s3.cc | 12 +- main/bridge_server.py | 128 +++++++++++++++--- main/protocols/websocket_protocol.cc | 2 + 5 files changed, 163 insertions(+), 26 deletions(-) diff --git a/main/application.cc b/main/application.cc index 448a794..b679152 100644 --- a/main/application.cc +++ b/main/application.cc @@ -533,6 +533,9 @@ void Application::InitializeProtocol() { protocol_->OnAudioChannelClosed([this, &board]() { board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); Schedule([this]() { + if (GetDeviceState() == kDeviceStateConnecting) { + return; + } auto display = Board::GetInstance().GetDisplay(); display->SetChatMessage("system", ""); SetDeviceState(kDeviceStateIdle); @@ -681,13 +684,16 @@ void Application::DismissAlert() { } void Application::ToggleChatState() { - vision_text_mode_enabled_.store(false); - vision_frame_sent_for_current_listen_.store(false); - xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT); + ToggleChatStateForMode(kChatAgentModeNormal, false); } void Application::ToggleChatStateWithVision() { - vision_text_mode_enabled_.store(true); + ToggleChatStateForMode(kChatAgentModeNormal, true); +} + +void Application::ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled) { + chat_agent_mode_.store(agent_mode); + vision_text_mode_enabled_.store(vision_enabled); vision_frame_sent_for_current_listen_.store(false); xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT); } @@ -696,6 +702,18 @@ bool Application::IsVisionTextModeEnabled() const { return vision_text_mode_enabled_.load(); } +const char* Application::GetChatAgentModeName() const { + return chat_agent_mode_.load() == kChatAgentModeBeaver ? "beaver" : "normal"; +} + +const char* Application::GetChatModeName() const { + bool vision_enabled = vision_text_mode_enabled_.load(); + if (chat_agent_mode_.load() == kChatAgentModeBeaver) { + return vision_enabled ? "vision-beaver" : "beaver"; + } + return vision_enabled ? "vision-normal" : "normal"; +} + void Application::StartListening() { vision_text_mode_enabled_.store(false); vision_frame_sent_for_current_listen_.store(false); @@ -729,7 +747,12 @@ void Application::HandleToggleChatEvent() { if (state == kDeviceStateIdle) { ListeningMode mode = GetDefaultListeningMode(); - if (!protocol_->IsAudioChannelOpened()) { + bool agent_mode_changed = chat_agent_mode_.load() != active_chat_agent_mode_.load(); + bool vision_mode_changed = vision_text_mode_enabled_.load() != active_vision_text_mode_enabled_.load(); + if (!protocol_->IsAudioChannelOpened() || agent_mode_changed || vision_mode_changed) { + if (protocol_->IsAudioChannelOpened()) { + protocol_->CloseAudioChannel(); + } SetDeviceState(kDeviceStateConnecting); // Schedule to let the state change be processed first (UI update) Schedule([this, mode]() { @@ -761,6 +784,8 @@ void Application::ContinueOpenAudioChannel(ListeningMode mode) { } } + active_chat_agent_mode_.store(chat_agent_mode_.load()); + active_vision_text_mode_enabled_.store(vision_text_mode_enabled_.load()); SetListeningMode(mode); } diff --git a/main/application.h b/main/application.h index 8eab256..03f7edf 100644 --- a/main/application.h +++ b/main/application.h @@ -41,6 +41,11 @@ enum AecMode { kAecOnServerSide, }; +enum ChatAgentMode { + kChatAgentModeNormal, + kChatAgentModeBeaver, +}; + class Application { public: static Application& GetInstance() { @@ -93,7 +98,11 @@ public: */ void ToggleChatState(); void ToggleChatStateWithVision(); + void ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled); bool IsVisionTextModeEnabled() const; + ChatAgentMode GetChatAgentMode() const { return chat_agent_mode_.load(); } + const char* GetChatAgentModeName() const; + const char* GetChatModeName() const; /** * Start listening (event-based, thread-safe) @@ -147,7 +156,10 @@ private: bool aborted_ = false; bool assets_version_checked_ = false; bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening + std::atomic chat_agent_mode_ = kChatAgentModeNormal; + std::atomic active_chat_agent_mode_ = kChatAgentModeNormal; std::atomic vision_text_mode_enabled_ = false; + std::atomic active_vision_text_mode_enabled_ = false; std::atomic vad_speaking_ = false; std::atomic vision_frame_sent_for_current_listen_ = false; int clock_ticks_ = 0; diff --git a/main/boards/m5stack-core-s3/m5stack_core_s3.cc b/main/boards/m5stack-core-s3/m5stack_core_s3.cc index 6da3198..90978d9 100644 --- a/main/boards/m5stack-core-s3/m5stack_core_s3.cc +++ b/main/boards/m5stack-core-s3/m5stack_core_s3.cc @@ -192,6 +192,7 @@ private: void PollTouchpad() { static bool was_touched = false; static int64_t touch_start_time = 0; + static int touch_start_x = -1; const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值,超过500ms视为长按 ft6336_->UpdateTouchPoint(); @@ -201,11 +202,14 @@ private: if (touch_point.num > 0 && !was_touched) { was_touched = true; touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒 + touch_start_x = touch_point.x; } // 检测触摸释放 else if (touch_point.num == 0 && was_touched) { was_touched = false; int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time; + bool beaver_mode = touch_start_x >= DISPLAY_WIDTH / 2; + auto agent_mode = beaver_mode ? kChatAgentModeBeaver : kChatAgentModeNormal; if (touch_duration < TOUCH_THRESHOLD_MS) { auto& app = Application::GetInstance(); @@ -213,12 +217,12 @@ private: EnterWifiConfigMode(); return; } - ESP_LOGI(TAG, "Touch short: text-only mode"); - app.ToggleChatState(); + ESP_LOGI(TAG, "Touch short: %s text-only mode", beaver_mode ? "beaver" : "normal"); + app.ToggleChatStateForMode(agent_mode, false); } else { auto& app = Application::GetInstance(); - ESP_LOGI(TAG, "Touch long: vision+text mode"); - app.ToggleChatStateWithVision(); + ESP_LOGI(TAG, "Touch long: %s vision+text mode", beaver_mode ? "beaver" : "normal"); + app.ToggleChatStateForMode(agent_mode, true); } } } diff --git a/main/bridge_server.py b/main/bridge_server.py index 352f186..a0881cc 100644 --- a/main/bridge_server.py +++ b/main/bridge_server.py @@ -29,7 +29,18 @@ TOKEN_URL = "http://172.19.0.240:8000/getToken" LIVEKIT_WS_URL = "ws://172.19.0.240:7880" ROOM_PREFIX = "test-livekit" IDENTITY_PREFIX = "uv-livekit" -AGENT_NAME = "my-agent" +LEGACY_AGENT_NAME = os.getenv("LIVEKIT_AGENT_NAME", "normal-agent") +DEFAULT_AGENT_MODE = os.getenv("LIVEKIT_DEFAULT_AGENT_MODE", "normal").strip().lower() +AGENT_NAMES = { + "normal": os.getenv("LIVEKIT_NORMAL_AGENT_NAME", LEGACY_AGENT_NAME), + "beaver": os.getenv("LIVEKIT_BEAVER_AGENT_NAME", "beaver-agent"), +} +CHAT_MODE_AGENT_NAMES = { + "normal": AGENT_NAMES["normal"], + "beaver": AGENT_NAMES["beaver"], + "vision-normal": os.getenv("LIVEKIT_VISION_NORMAL_AGENT_NAME", "vision-normal-agent"), + "vision-beaver": os.getenv("LIVEKIT_VISION_BEAVER_AGENT_NAME", "vision-beaver-agent"), +} CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0")) AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0")) WS_PORT = 8080 @@ -75,6 +86,10 @@ class DeviceSession: protocol_version: int room_name: str identity: str + chat_mode: str + agent_mode: str + agent_name: str + vision_enabled: bool room: rtc.Room mic_source: AudioSource agent_ready: asyncio.Event @@ -94,10 +109,10 @@ class DeviceSession: first_capture_log_time: float = 0.0 -async def fetch_token(room_name: str, identity: str) -> str: +async def fetch_token(room_name: str, identity: str, agent_name: str) -> str: params = {"room": room_name, "identity": identity} if AGENT_DISPATCH_MODE == "token": - params["agent_name"] = AGENT_NAME + params["agent_name"] = agent_name async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client: response = await client.get(TOKEN_URL, params=params) @@ -137,15 +152,61 @@ class ESP32LiveKitBridge: print(f"[session] device={device_id} room={room_name} identity={identity}") return room_name, identity - def _is_agent_participant(self, participant: rtc.RemoteParticipant) -> bool: + def _resolve_agent_selection(self, headers: Any) -> tuple[str, str, str, bool]: + requested_chat_mode = ( + headers.get("Chat-Mode") + or headers.get("X-Chat-Mode") + or "" + ).strip().lower() + chat_mode_to_agent = { + "normal": ("normal", False), + "beaver": ("beaver", False), + "vision-normal": ("normal", True), + "vision-beaver": ("beaver", True), + } + + if requested_chat_mode in chat_mode_to_agent: + requested_mode, vision_enabled = chat_mode_to_agent[requested_chat_mode] + else: + if requested_chat_mode: + print(f"未知 Chat-Mode={requested_chat_mode!r},回退到 Agent-Mode") + requested_mode = ( + headers.get("Agent-Mode") + or headers.get("X-Agent-Mode") + or DEFAULT_AGENT_MODE + or "normal" + ).strip().lower() + vision_enabled = False + requested_chat_mode = requested_mode + + requested_name = headers.get("Agent-Name") or headers.get("X-Agent-Name") + if requested_name: + return requested_chat_mode, "custom", requested_name, vision_enabled + + if requested_mode not in AGENT_NAMES: + print(f"未知 Agent-Mode={requested_mode!r},回退到 normal") + requested_mode = "normal" + requested_chat_mode = "vision-normal" if vision_enabled else "normal" + + if requested_chat_mode in CHAT_MODE_AGENT_NAMES: + return ( + requested_chat_mode, + requested_mode, + CHAT_MODE_AGENT_NAMES[requested_chat_mode], + vision_enabled, + ) + + return requested_chat_mode, requested_mode, AGENT_NAMES[requested_mode], vision_enabled + + def _is_agent_participant(self, participant: rtc.RemoteParticipant, agent_name: str) -> bool: identity = getattr(participant, "identity", "") or "" - return identity.startswith("agent-") or AGENT_NAME in identity + return identity.startswith("agent-") or agent_name in identity def _get_agent_identities(self, session: DeviceSession) -> list[str]: return [ participant.identity for participant in session.room.remote_participants.values() - if self._is_agent_participant(participant) + if self._is_agent_participant(participant, session.agent_name) ] def _log_agent_participants(self, session: DeviceSession, source: str) -> None: @@ -187,7 +248,7 @@ class ESP32LiveKitBridge: dispatch_agent_name = getattr(dispatch, "agent_name", None) dispatch_room = getattr(dispatch, "room", None) if dispatch_room == session.room_name and ( - dispatch_agent_name == AGENT_NAME or dispatch_agent_name is None + dispatch_agent_name == session.agent_name or dispatch_agent_name is None ): print( f"检测到已有 dispatch: room={session.room_name} " @@ -210,7 +271,7 @@ class ESP32LiveKitBridge: return for participant in session.room.remote_participants.values(): - if self._is_agent_participant(participant): + if self._is_agent_participant(participant, session.agent_name): # print(f"Agent 已在房间中,跳过 dispatch: {participant.identity}") return @@ -222,7 +283,7 @@ class ESP32LiveKitBridge: await session.agent_dispatch_task async def _dispatch_agent(self, session: DeviceSession) -> None: - print(f"准备 dispatch agent: room={session.room_name}, agent={AGENT_NAME}") + print(f"准备 dispatch agent: room={session.room_name}, agent={session.agent_name}") try: if await self._dispatch_agent_with_sdk(session): @@ -252,13 +313,17 @@ class ESP32LiveKitBridge: try: dispatch = await lkapi.agent_dispatch.create_dispatch( livekit_api.CreateAgentDispatchRequest( - agent_name=AGENT_NAME, + agent_name=session.agent_name, room=session.room_name, metadata=json.dumps( { "source": "bridge_server", "identity": session.identity, "device_id": session.device_id, + "chat_mode": session.chat_mode, + "agent_mode": session.agent_mode, + "agent_name": session.agent_name, + "vision_enabled": session.vision_enabled, } ), ) @@ -281,13 +346,17 @@ class ESP32LiveKitBridge: "--room", session.room_name, "--agent-name", - AGENT_NAME, + session.agent_name, "--metadata", json.dumps( { "source": "bridge_server", "identity": session.identity, "device_id": session.device_id, + "chat_mode": session.chat_mode, + "agent_mode": session.agent_mode, + "agent_name": session.agent_name, + "vision_enabled": session.vision_enabled, } ), stdout=asyncio.subprocess.PIPE, @@ -306,7 +375,7 @@ class ESP32LiveKitBridge: print(f"lk dispatch create 失败,退出码: {process.returncode}") return False - print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={AGENT_NAME}") + print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={session.agent_name}") return True async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool: @@ -727,16 +796,16 @@ class ESP32LiveKitBridge: print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}") self._log_agent_participants(session, "connected") for participant in session.room.remote_participants.values(): - if self._is_agent_participant(participant): + if self._is_agent_participant(participant, session.agent_name): session.agent_ready.set() self._scan_participant_audio_tracks(session, participant, "connected_scan") @session.room.on("participant_connected") def on_participant_connected(participant: rtc.RemoteParticipant) -> None: - role = "Agent" if self._is_agent_participant(participant) else "Remote participant" + role = "Agent" if self._is_agent_participant(participant, session.agent_name) else "Remote participant" print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}") self._log_agent_participants(session, "participant_connected") - if self._is_agent_participant(participant): + if self._is_agent_participant(participant, session.agent_name): session.agent_ready.set() self._scan_participant_audio_tracks( session, participant, "participant_connected_scan" @@ -769,7 +838,9 @@ class ESP32LiveKitBridge: track_pub: rtc.TrackPublication, ) -> None: identity = participant.identity if participant else "未知" - is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(participant) + is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant( + participant, session.agent_name + ) for segment in segments: status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果" print(f"🗣️ [{status} | room={session.room_name} | {identity}]: {segment.text}") @@ -841,7 +912,7 @@ class ESP32LiveKitBridge: # print(f"[config] token_url={TOKEN_URL}") # print(f"[config] room={session.room_name} identity={session.identity}") # print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}") - token = await fetch_token(session.room_name, session.identity) + token = await fetch_token(session.room_name, session.identity, session.agent_name) try: await session.room.connect( @@ -886,8 +957,12 @@ class ESP32LiveKitBridge: # print(f"等待 agent 加入: room={session.room_name}") try: await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS) + if session.closed: + return # print(f"✅ agent 已就绪: room={session.room_name}") except asyncio.TimeoutError: + if session.closed: + return print(f"⚠️ agent 等待超时: room={session.room_name}") async def start(self) -> None: @@ -895,6 +970,14 @@ class ESP32LiveKitBridge: print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}") print(f"[config] token_url={TOKEN_URL}") print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}") + print( + "[config] agents=" + f"normal:{CHAT_MODE_AGENT_NAMES['normal']} " + f"beaver:{CHAT_MODE_AGENT_NAMES['beaver']} " + f"vision-normal:{CHAT_MODE_AGENT_NAMES['vision-normal']} " + f"vision-beaver:{CHAT_MODE_AGENT_NAMES['vision-beaver']} " + f"default_mode:{DEFAULT_AGENT_MODE}" + ) if EMOTION_TEST_SEQUENCE: print( "[config] emotion_test_sequence=" @@ -911,6 +994,7 @@ class ESP32LiveKitBridge: return session.closed = True session.websocket = None + session.agent_ready.set() session.tts_active = False session.tts_stream_id += 1 if session.tts_idle_task is not None: @@ -1060,6 +1144,7 @@ class ESP32LiveKitBridge: await self._close_session(existing_session) self.device_sessions.pop(device_id, None) + chat_mode, agent_mode, agent_name, vision_enabled = self._resolve_agent_selection(websocket.request.headers) room_name, identity = self._build_session_names(device_id) session = DeviceSession( device_id=device_id, @@ -1067,6 +1152,10 @@ class ESP32LiveKitBridge: protocol_version=protocol_version, room_name=room_name, identity=identity, + chat_mode=chat_mode, + agent_mode=agent_mode, + agent_name=agent_name, + vision_enabled=vision_enabled, room=rtc.Room(), mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1), agent_ready=asyncio.Event(), @@ -1075,6 +1164,11 @@ class ESP32LiveKitBridge: print(f"ESP32 已连接: device={device_id}") print(f"ESP32 协议版本: {session.protocol_version}") + print( + f"ESP32 mode: chat={session.chat_mode} " + f"agent={session.agent_mode}/{session.agent_name} " + f"vision={session.vision_enabled}" + ) session.tts_stream_id += 1 opus_decoder = None diff --git a/main/protocols/websocket_protocol.cc b/main/protocols/websocket_protocol.cc index cd775d1..b1eaf5a 100644 --- a/main/protocols/websocket_protocol.cc +++ b/main/protocols/websocket_protocol.cc @@ -119,6 +119,8 @@ bool WebsocketProtocol::OpenAudioChannel() { websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str()); websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str()); websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str()); + websocket_->SetHeader("Agent-Mode", Application::GetInstance().GetChatAgentModeName()); + websocket_->SetHeader("Chat-Mode", Application::GetInstance().GetChatModeName()); websocket_->OnData([this](const char* data, size_t len, bool binary) { if (binary) {