From 9637e09aef9d213a7687aac62b7646761e775027 Mon Sep 17 00:00:00 2001
From: 0Xiao0 <511201264@qq.com>
Date: Thu, 4 Jun 2026 15:48:10 +0800
Subject: [PATCH] feat: add beaver chat-agent mode with per-mode agent routing

---
 main/application.cc                           |  35 ++++-
 main/application.h                            |  12 ++
 .../boards/m5stack-core-s3/m5stack_core_s3.cc |  12 +-
 main/bridge_server.py                         | 128 +++++++++++++++---
 main/protocols/websocket_protocol.cc          |   2 +
 5 files changed, 163 insertions(+), 26 deletions(-)

diff --git a/main/application.cc b/main/application.cc
index 448a794..b679152 100644
--- a/main/application.cc
+++ b/main/application.cc
@@ -533,6 +533,9 @@ void Application::InitializeProtocol() {
     protocol_->OnAudioChannelClosed([this, &board]() {
         board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);
         Schedule([this]() {
+            if (GetDeviceState() == kDeviceStateConnecting) {
+                return;
+            }
             auto display = Board::GetInstance().GetDisplay();
             display->SetChatMessage("system", "");
             SetDeviceState(kDeviceStateIdle);
@@ -681,13 +684,16 @@ void Application::DismissAlert() {
 }
 
 void Application::ToggleChatState() {
-    vision_text_mode_enabled_.store(false);
-    vision_frame_sent_for_current_listen_.store(false);
-    xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
+    ToggleChatStateForMode(kChatAgentModeNormal, false);  // normal agent, vision disabled
 }
 
 void Application::ToggleChatStateWithVision() {
-    vision_text_mode_enabled_.store(true);
+    ToggleChatStateForMode(kChatAgentModeNormal, true);  // normal agent, vision enabled
+}
+
+void Application::ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled) {
+    chat_agent_mode_.store(agent_mode);  // requested agent; HandleToggleChatEvent compares it with active_chat_agent_mode_
+    vision_text_mode_enabled_.store(vision_enabled);  // requested vision flag; compared with active_vision_text_mode_enabled_
     vision_frame_sent_for_current_listen_.store(false);
     xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
 }
@@ -696,6 +702,18 @@ bool Application::IsVisionTextModeEnabled() const {
     return vision_text_mode_enabled_.load();
 }
 
+const char* Application::GetChatAgentModeName() const {
+    return chat_agent_mode_.load() != kChatAgentModeBeaver ? "normal" : "beaver";  // sent as the "Agent-Mode" header
+}
+
+const char* Application::GetChatModeName() const {
+    // Combines the requested agent mode with the vision flag into one name (sent as the "Chat-Mode" header).
+    const bool with_vision = vision_text_mode_enabled_.load();
+    const bool is_beaver = chat_agent_mode_.load() == kChatAgentModeBeaver;
+    if (with_vision) return is_beaver ? "vision-beaver" : "vision-normal";
+    return is_beaver ? "beaver" : "normal";
+}
+
 void Application::StartListening() {
     vision_text_mode_enabled_.store(false);
     vision_frame_sent_for_current_listen_.store(false);
@@ -729,7 +747,12 @@ void Application::HandleToggleChatEvent() {
 
     if (state == kDeviceStateIdle) {
         ListeningMode mode = GetDefaultListeningMode();
-        if (!protocol_->IsAudioChannelOpened()) {
+        bool agent_mode_changed = chat_agent_mode_.load() != active_chat_agent_mode_.load();
+        bool vision_mode_changed = vision_text_mode_enabled_.load() != active_vision_text_mode_enabled_.load();
+        if (!protocol_->IsAudioChannelOpened() || agent_mode_changed || vision_mode_changed) {
+            if (protocol_->IsAudioChannelOpened()) {
+                protocol_->CloseAudioChannel();
+            }
             SetDeviceState(kDeviceStateConnecting);
             // Schedule to let the state change be processed first (UI update)
             Schedule([this, mode]() {
@@ -761,6 +784,8 @@ void Application::ContinueOpenAudioChannel(ListeningMode mode) {
         }
     }
 
+    active_chat_agent_mode_.store(chat_agent_mode_.load());
+    active_vision_text_mode_enabled_.store(vision_text_mode_enabled_.load());
     SetListeningMode(mode);
 }
 
diff --git a/main/application.h b/main/application.h
index 8eab256..03f7edf 100644
--- a/main/application.h
+++ b/main/application.h
@@ -41,6 +41,11 @@ enum AecMode {
     kAecOnServerSide,
 };
 
+enum ChatAgentMode {  // Agent a chat session is requested to use
+    kChatAgentModeNormal,  // named "normal" by GetChatAgentModeName()
+    kChatAgentModeBeaver,  // named "beaver" by GetChatAgentModeName()
+};
+
 class Application {
 public:
     static Application& GetInstance() {
@@ -93,7 +98,11 @@ public:
      */
     void ToggleChatState();
     void ToggleChatStateWithVision();
+    void ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled);
     bool IsVisionTextModeEnabled() const;
+    ChatAgentMode GetChatAgentMode() const { return chat_agent_mode_.load(); }  // requested mode, as stored by ToggleChatStateForMode()
+    const char* GetChatAgentModeName() const;
+    const char* GetChatModeName() const;
 
     /**
      * Start listening (event-based, thread-safe)
@@ -147,7 +156,10 @@ private:
     bool aborted_ = false;
     bool assets_version_checked_ = false;
     bool play_popup_on_listening_ = false;  // Flag to play popup sound after state changes to listening
+    std::atomic<ChatAgentMode> chat_agent_mode_ = kChatAgentModeNormal;  // Agent mode requested by the latest toggle
+    std::atomic<ChatAgentMode> active_chat_agent_mode_ = kChatAgentModeNormal;  // Agent mode recorded by ContinueOpenAudioChannel
     std::atomic<bool> vision_text_mode_enabled_ = false;
+    std::atomic<bool> active_vision_text_mode_enabled_ = false;  // Vision flag recorded by ContinueOpenAudioChannel
     std::atomic<bool> vad_speaking_ = false;
     std::atomic<bool> vision_frame_sent_for_current_listen_ = false;
     int clock_ticks_ = 0;
diff --git a/main/boards/m5stack-core-s3/m5stack_core_s3.cc b/main/boards/m5stack-core-s3/m5stack_core_s3.cc
index 6da3198..90978d9 100644
--- a/main/boards/m5stack-core-s3/m5stack_core_s3.cc
+++ b/main/boards/m5stack-core-s3/m5stack_core_s3.cc
@@ -192,6 +192,7 @@ private:
     void PollTouchpad() {
         static bool was_touched = false;
         static int64_t touch_start_time = 0;
+        static int touch_start_x = -1;
         const int64_t TOUCH_THRESHOLD_MS = 500;  // 触摸时长阈值，超过500ms视为长按
 
         ft6336_->UpdateTouchPoint();
@@ -201,11 +202,14 @@ private:
         if (touch_point.num > 0 && !was_touched) {
             was_touched = true;
             touch_start_time = esp_timer_get_time() / 1000;  // 转换为毫秒
+            touch_start_x = touch_point.x;
         }
         // 检测触摸释放
         else if (touch_point.num == 0 && was_touched) {
             was_touched = false;
             int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time;
+            bool beaver_mode = touch_start_x >= DISPLAY_WIDTH / 2;
+            auto agent_mode = beaver_mode ? kChatAgentModeBeaver : kChatAgentModeNormal;
 
             if (touch_duration < TOUCH_THRESHOLD_MS) {
                 auto& app = Application::GetInstance();
@@ -213,12 +217,12 @@ private:
                     EnterWifiConfigMode();
                     return;
                 }
-                ESP_LOGI(TAG, "Touch short: text-only mode");
-                app.ToggleChatState();
+                ESP_LOGI(TAG, "Touch short: %s text-only mode", beaver_mode ? "beaver" : "normal");
+                app.ToggleChatStateForMode(agent_mode, false);
             } else {
                 auto& app = Application::GetInstance();
-                ESP_LOGI(TAG, "Touch long: vision+text mode");
-                app.ToggleChatStateWithVision();
+                ESP_LOGI(TAG, "Touch long: %s vision+text mode", beaver_mode ? "beaver" : "normal");
+                app.ToggleChatStateForMode(agent_mode, true);
             }
         }
     }
diff --git a/main/bridge_server.py b/main/bridge_server.py
index 352f186..a0881cc 100644
--- a/main/bridge_server.py
+++ b/main/bridge_server.py
@@ -29,7 +29,18 @@ TOKEN_URL = "http://172.19.0.240:8000/getToken"
 LIVEKIT_WS_URL = "ws://172.19.0.240:7880"
 ROOM_PREFIX = "test-livekit"
 IDENTITY_PREFIX = "uv-livekit"
-AGENT_NAME = "my-agent"
+LEGACY_AGENT_NAME = os.getenv("LIVEKIT_AGENT_NAME", "normal-agent")  # default for the "normal" agent name below
+DEFAULT_AGENT_MODE = os.getenv("LIVEKIT_DEFAULT_AGENT_MODE", "normal").strip().lower()  # agent mode used when no mode header is usable
+AGENT_NAMES = {  # agent mode -> agent name
+    "normal": os.getenv("LIVEKIT_NORMAL_AGENT_NAME", LEGACY_AGENT_NAME),
+    "beaver": os.getenv("LIVEKIT_BEAVER_AGENT_NAME", "beaver-agent"),
+}
+CHAT_MODE_AGENT_NAMES = {  # chat mode -> agent name; the vision modes have their own agents by default
+    "normal": AGENT_NAMES["normal"],
+    "beaver": AGENT_NAMES["beaver"],
+    "vision-normal": os.getenv("LIVEKIT_VISION_NORMAL_AGENT_NAME", "vision-normal-agent"),
+    "vision-beaver": os.getenv("LIVEKIT_VISION_BEAVER_AGENT_NAME", "vision-beaver-agent"),
+}
 CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0"))
 AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
 WS_PORT = 8080
@@ -75,6 +86,10 @@ class DeviceSession:
     protocol_version: int
     room_name: str
     identity: str
+    chat_mode: str
+    agent_mode: str
+    agent_name: str
+    vision_enabled: bool
     room: rtc.Room
     mic_source: AudioSource
     agent_ready: asyncio.Event
@@ -94,10 +109,10 @@ class DeviceSession:
     first_capture_log_time: float = 0.0
 
 
-async def fetch_token(room_name: str, identity: str) -> str:
+async def fetch_token(room_name: str, identity: str, agent_name: str) -> str:
     params = {"room": room_name, "identity": identity}
     if AGENT_DISPATCH_MODE == "token":
-        params["agent_name"] = AGENT_NAME
+        params["agent_name"] = agent_name
 
     async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
         response = await client.get(TOKEN_URL, params=params)
@@ -137,15 +152,61 @@ class ESP32LiveKitBridge:
         print(f"[session] device={device_id} room={room_name} identity={identity}")
         return room_name, identity
 
-    def _is_agent_participant(self, participant: rtc.RemoteParticipant) -> bool:
+    def _resolve_agent_selection(self, headers: Any) -> tuple[str, str, str, bool]:
+        """Choose the chat mode and agent for a connection from its request headers.
+
+        A recognised ``Chat-Mode``/``X-Chat-Mode`` value wins; otherwise the agent
+        mode comes from ``Agent-Mode``/``X-Agent-Mode``, then DEFAULT_AGENT_MODE,
+        then "normal", with vision off. An ``Agent-Name``/``X-Agent-Name`` header
+        overrides the agent name and reports the agent mode as "custom".
+
+        Returns (chat_mode, agent_mode, agent_name, vision_enabled).
+        """
+        mode_table = {
+            "normal": ("normal", False),
+            "beaver": ("beaver", False),
+            "vision-normal": ("normal", True),
+            "vision-beaver": ("beaver", True),
+        }
+
+        chat_mode = (headers.get("Chat-Mode") or headers.get("X-Chat-Mode") or "").strip().lower()
+        known = mode_table.get(chat_mode)
+        if known is not None:
+            agent_mode, with_vision = known
+        else:
+            if chat_mode:
+                print(f"未知 Chat-Mode={chat_mode!r}，回退到 Agent-Mode")
+            fallback = headers.get("Agent-Mode") or headers.get("X-Agent-Mode")
+            agent_mode = (fallback or DEFAULT_AGENT_MODE or "normal").strip().lower()
+            with_vision = False
+            chat_mode = agent_mode  # without a valid Chat-Mode, the chat mode mirrors the agent mode
+
+        custom_name = headers.get("Agent-Name") or headers.get("X-Agent-Name")
+        if custom_name:
+            return chat_mode, "custom", custom_name, with_vision
+
+        if agent_mode not in AGENT_NAMES:
+            print(f"未知 Agent-Mode={agent_mode!r}，回退到 normal")
+            agent_mode = "normal"
+            chat_mode = "vision-normal" if with_vision else "normal"
+
+        # Prefer the per-chat-mode agent name; otherwise use the per-agent-mode name.
+        agent_name = (
+            CHAT_MODE_AGENT_NAMES[chat_mode]
+            if chat_mode in CHAT_MODE_AGENT_NAMES
+            else AGENT_NAMES[agent_mode]
+        )
+        return chat_mode, agent_mode, agent_name, with_vision
+
+    def _is_agent_participant(self, participant: rtc.RemoteParticipant, agent_name: str) -> bool:  # True if identity starts with "agent-" or contains agent_name
         identity = getattr(participant, "identity", "") or ""
-        return identity.startswith("agent-") or AGENT_NAME in identity
+        return identity.startswith("agent-") or bool(agent_name and agent_name in identity)  # "" is in every str; an empty name must not match
 
     def _get_agent_identities(self, session: DeviceSession) -> list[str]:
         return [
             participant.identity
             for participant in session.room.remote_participants.values()
-            if self._is_agent_participant(participant)
+            if self._is_agent_participant(participant, session.agent_name)  # match against this session's agent name
         ]
 
     def _log_agent_participants(self, session: DeviceSession, source: str) -> None:
@@ -187,7 +248,7 @@ class ESP32LiveKitBridge:
                 dispatch_agent_name = getattr(dispatch, "agent_name", None)
                 dispatch_room = getattr(dispatch, "room", None)
                 if dispatch_room == session.room_name and (
-                    dispatch_agent_name == AGENT_NAME or dispatch_agent_name is None
+                    dispatch_agent_name == session.agent_name or dispatch_agent_name is None
                 ):
                     print(
                         f"检测到已有 dispatch: room={session.room_name} "
@@ -210,7 +271,7 @@ class ESP32LiveKitBridge:
             return
 
         for participant in session.room.remote_participants.values():
-            if self._is_agent_participant(participant):
+            if self._is_agent_participant(participant, session.agent_name):
                 # print(f"Agent 已在房间中，跳过 dispatch: {participant.identity}")
                 return
 
@@ -222,7 +283,7 @@ class ESP32LiveKitBridge:
         await session.agent_dispatch_task
 
     async def _dispatch_agent(self, session: DeviceSession) -> None:
-        print(f"准备 dispatch agent: room={session.room_name}, agent={AGENT_NAME}")
+        print(f"准备 dispatch agent: room={session.room_name}, agent={session.agent_name}")
 
         try:
             if await self._dispatch_agent_with_sdk(session):
@@ -252,13 +313,17 @@ class ESP32LiveKitBridge:
         try:
             dispatch = await lkapi.agent_dispatch.create_dispatch(
                 livekit_api.CreateAgentDispatchRequest(
-                    agent_name=AGENT_NAME,
+                    agent_name=session.agent_name,
                     room=session.room_name,
                     metadata=json.dumps(
                         {
                             "source": "bridge_server",
                             "identity": session.identity,
                             "device_id": session.device_id,
+                            "chat_mode": session.chat_mode,
+                            "agent_mode": session.agent_mode,
+                            "agent_name": session.agent_name,
+                            "vision_enabled": session.vision_enabled,
                         }
                     ),
                 )
@@ -281,13 +346,17 @@ class ESP32LiveKitBridge:
             "--room",
             session.room_name,
             "--agent-name",
-            AGENT_NAME,
+            session.agent_name,
             "--metadata",
             json.dumps(
                 {
                     "source": "bridge_server",
                     "identity": session.identity,
                     "device_id": session.device_id,
+                    "chat_mode": session.chat_mode,
+                    "agent_mode": session.agent_mode,
+                    "agent_name": session.agent_name,
+                    "vision_enabled": session.vision_enabled,
                 }
             ),
             stdout=asyncio.subprocess.PIPE,
@@ -306,7 +375,7 @@ class ESP32LiveKitBridge:
             print(f"lk dispatch create 失败，退出码: {process.returncode}")
             return False
 
-        print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={AGENT_NAME}")
+        print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={session.agent_name}")
         return True
 
     async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool:
@@ -727,16 +796,16 @@ class ESP32LiveKitBridge:
             print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}")
             self._log_agent_participants(session, "connected")
             for participant in session.room.remote_participants.values():
-                if self._is_agent_participant(participant):
+                if self._is_agent_participant(participant, session.agent_name):
                     session.agent_ready.set()
                     self._scan_participant_audio_tracks(session, participant, "connected_scan")
 
         @session.room.on("participant_connected")
         def on_participant_connected(participant: rtc.RemoteParticipant) -> None:
-            role = "Agent" if self._is_agent_participant(participant) else "Remote participant"
+            role = "Agent" if self._is_agent_participant(participant, session.agent_name) else "Remote participant"
             print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}")
             self._log_agent_participants(session, "participant_connected")
-            if self._is_agent_participant(participant):
+            if self._is_agent_participant(participant, session.agent_name):
                 session.agent_ready.set()
                 self._scan_participant_audio_tracks(
                     session, participant, "participant_connected_scan"
@@ -769,7 +838,9 @@ class ESP32LiveKitBridge:
             track_pub: rtc.TrackPublication,
         ) -> None:
             identity = participant.identity if participant else "未知"
-            is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(participant)
+            is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(
+                participant, session.agent_name
+            )
             for segment in segments:
                 status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果"
                 print(f"🗣️  [{status} | room={session.room_name} | {identity}]: {segment.text}")
@@ -841,7 +912,7 @@ class ESP32LiveKitBridge:
         # print(f"[config] token_url={TOKEN_URL}")
         # print(f"[config] room={session.room_name} identity={session.identity}")
         # print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}")
-        token = await fetch_token(session.room_name, session.identity)
+        token = await fetch_token(session.room_name, session.identity, session.agent_name)
 
         try:
             await session.room.connect(
@@ -886,8 +957,12 @@ class ESP32LiveKitBridge:
         # print(f"等待 agent 加入: room={session.room_name}")
         try:
             await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS)
+            if session.closed:
+                return
             # print(f"✅ agent 已就绪: room={session.room_name}")
         except asyncio.TimeoutError:
+            if session.closed:
+                return
             print(f"⚠️ agent 等待超时: room={session.room_name}")
 
     async def start(self) -> None:
@@ -895,6 +970,14 @@ class ESP32LiveKitBridge:
         print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}")
         print(f"[config] token_url={TOKEN_URL}")
         print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}")
+        print(
+            "[config] agents="
+            f"normal:{CHAT_MODE_AGENT_NAMES['normal']} "
+            f"beaver:{CHAT_MODE_AGENT_NAMES['beaver']} "
+            f"vision-normal:{CHAT_MODE_AGENT_NAMES['vision-normal']} "
+            f"vision-beaver:{CHAT_MODE_AGENT_NAMES['vision-beaver']} "
+            f"default_mode:{DEFAULT_AGENT_MODE}"
+        )
         if EMOTION_TEST_SEQUENCE:
             print(
                 "[config] emotion_test_sequence="
@@ -911,6 +994,7 @@ class ESP32LiveKitBridge:
             return
         session.closed = True
         session.websocket = None
+        session.agent_ready.set()
         session.tts_active = False
         session.tts_stream_id += 1
         if session.tts_idle_task is not None:
@@ -1060,6 +1144,7 @@ class ESP32LiveKitBridge:
             await self._close_session(existing_session)
             self.device_sessions.pop(device_id, None)
 
+        chat_mode, agent_mode, agent_name, vision_enabled = self._resolve_agent_selection(websocket.request.headers)
         room_name, identity = self._build_session_names(device_id)
         session = DeviceSession(
             device_id=device_id,
@@ -1067,6 +1152,10 @@ class ESP32LiveKitBridge:
             protocol_version=protocol_version,
             room_name=room_name,
             identity=identity,
+            chat_mode=chat_mode,
+            agent_mode=agent_mode,
+            agent_name=agent_name,
+            vision_enabled=vision_enabled,
             room=rtc.Room(),
             mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1),
             agent_ready=asyncio.Event(),
@@ -1075,6 +1164,11 @@ class ESP32LiveKitBridge:
 
         print(f"ESP32 已连接: device={device_id}")
         print(f"ESP32 协议版本: {session.protocol_version}")
+        print(
+            f"ESP32 mode: chat={session.chat_mode} "
+            f"agent={session.agent_mode}/{session.agent_name} "
+            f"vision={session.vision_enabled}"
+        )
         session.tts_stream_id += 1
         opus_decoder = None
 
diff --git a/main/protocols/websocket_protocol.cc b/main/protocols/websocket_protocol.cc
index cd775d1..b1eaf5a 100644
--- a/main/protocols/websocket_protocol.cc
+++ b/main/protocols/websocket_protocol.cc
@@ -119,6 +119,8 @@ bool WebsocketProtocol::OpenAudioChannel() {
     websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str());
     websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
     websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
+    websocket_->SetHeader("Agent-Mode", Application::GetInstance().GetChatAgentModeName());
+    websocket_->SetHeader("Chat-Mode", Application::GetInstance().GetChatModeName());
 
     websocket_->OnData([this](const char* data, size_t len, bool binary) {
         if (binary) {