feat: beaver

feat: remove background cam every time
feat: add icon beaver
2026-06-04 15:48:10 +08:00 · 2026-05-29 14:53:58 +08:00 · 2026-05-29 11:22:31 +08:00 · 2026-05-27 17:16:11 +08:00 · 2026-05-25 17:21:11 +08:00
13 changed files with 557 additions and 98 deletions
--- a/.gitignore
+++ b/.gitignore
@ -10,6 +10,7 @@ sdkconfig
 dependencies.lock
 .env
 releases/
+vision_frames/
 main/assets/lang_config.h
 main/mmap_generate_emoji.h
 .DS_Store
--- a/main/Kconfig.projbuild
+++ b/main/Kconfig.projbuild
@ -15,7 +15,7 @@ config USE_DIRECT_WEBSOCKET
 config WEBSOCKET_URL
    string "Default WebSocket URL"
    depends on USE_DIRECT_WEBSOCKET
-    default "ws://10.6.80.130:8080"
+    default "ws://172.19.0.240:8080"
    help
        The WebSocket server URL used when direct WebSocket mode is enabled.

--- a/main/application.cc
+++ b/main/application.cc
@ -81,6 +81,7 @@ void Application::Initialize() {
        xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED);
    };
    callbacks.on_vad_change = [this](bool speaking) {
+        vad_speaking_.store(speaking);
        xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE);
    };
    audio_service_.SetCallbacks(callbacks);
@ -233,6 +234,13 @@ void Application::Run() {
            if (GetDeviceState() == kDeviceStateListening) {
                auto led = Board::GetInstance().GetLed();
                led->OnStateChanged();
+
+                if (vad_speaking_.load() && vision_text_mode_enabled_.load() &&
+                    !vision_frame_sent_for_current_listen_.exchange(true)) {
+                    if (!SendCurrentVisionFrame()) {
+                        vision_frame_sent_for_current_listen_.store(false);
+                    }
+                }
            }
        }

@ -525,6 +533,9 @@ void Application::InitializeProtocol() {
    protocol_->OnAudioChannelClosed([this, &board]() {
        board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);
        Schedule([this]() {
+            if (GetDeviceState() == kDeviceStateConnecting) {
+                return;
+            }
            auto display = Board::GetInstance().GetDisplay();
            display->SetChatMessage("system", "");
            SetDeviceState(kDeviceStateIdle);
@ -673,10 +684,39 @@ void Application::DismissAlert() {
 }

 void Application::ToggleChatState() {
+    ToggleChatStateForMode(kChatAgentModeNormal, false);
+}
+
+void Application::ToggleChatStateWithVision() {
+    ToggleChatStateForMode(kChatAgentModeNormal, true);
+}
+
+void Application::ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled) {
+    chat_agent_mode_.store(agent_mode);
+    vision_text_mode_enabled_.store(vision_enabled);
+    vision_frame_sent_for_current_listen_.store(false);
    xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
 }

+bool Application::IsVisionTextModeEnabled() const {
+    return vision_text_mode_enabled_.load();
+}
+
+const char* Application::GetChatAgentModeName() const {
+    return chat_agent_mode_.load() == kChatAgentModeBeaver ? "beaver" : "normal";
+}
+
+const char* Application::GetChatModeName() const {
+    bool vision_enabled = vision_text_mode_enabled_.load();
+    if (chat_agent_mode_.load() == kChatAgentModeBeaver) {
+        return vision_enabled ? "vision-beaver" : "beaver";
+    }
+    return vision_enabled ? "vision-normal" : "normal";
+}
+
 void Application::StartListening() {
+    vision_text_mode_enabled_.store(false);
+    vision_frame_sent_for_current_listen_.store(false);
    xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING);
 }

@ -686,7 +726,7 @@ void Application::StopListening() {

 void Application::HandleToggleChatEvent() {
    auto state = GetDeviceState();
-    
+
    if (state == kDeviceStateActivating) {
        SetDeviceState(kDeviceStateIdle);
        return;
@ -707,7 +747,12 @@ void Application::HandleToggleChatEvent() {

    if (state == kDeviceStateIdle) {
        ListeningMode mode = GetDefaultListeningMode();
-        if (!protocol_->IsAudioChannelOpened()) {
+        bool agent_mode_changed = chat_agent_mode_.load() != active_chat_agent_mode_.load();
+        bool vision_mode_changed = vision_text_mode_enabled_.load() != active_vision_text_mode_enabled_.load();
+        if (!protocol_->IsAudioChannelOpened() || agent_mode_changed || vision_mode_changed) {
+            if (protocol_->IsAudioChannelOpened()) {
+                protocol_->CloseAudioChannel();
+            }
            SetDeviceState(kDeviceStateConnecting);
            // Schedule to let the state change be processed first (UI update)
            Schedule([this, mode]() {
@ -739,6 +784,8 @@ void Application::ContinueOpenAudioChannel(ListeningMode mode) {
        }
    }

+    active_chat_agent_mode_.store(chat_agent_mode_.load());
+    active_vision_text_mode_enabled_.store(vision_text_mode_enabled_.load());
    SetListeningMode(mode);
 }

@ -882,6 +929,7 @@ void Application::HandleStateChangedEvent() {
    switch (new_state) {
        case kDeviceStateUnknown:
        case kDeviceStateIdle:
+            vision_frame_sent_for_current_listen_.store(false);
            display->SetStatus(Lang::Strings::STANDBY);
            display->ClearChatMessages();  // Clear messages first
            display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count)
@ -894,6 +942,8 @@ void Application::HandleStateChangedEvent() {
            display->SetChatMessage("system", "");
            break;
        case kDeviceStateListening:
+            vad_speaking_.store(false);
+            vision_frame_sent_for_current_listen_.store(false);
            display->SetStatus(Lang::Strings::LISTENING);
            display->SetEmotion("neutral");

@ -944,6 +994,27 @@ void Application::HandleStateChangedEvent() {
    }
 }

+bool Application::SendCurrentVisionFrame() {
+    if (!protocol_ || !protocol_->IsAudioChannelOpened()) {
+        return false;
+    }
+
+    auto camera = Board::GetInstance().GetCamera();
+    if (camera == nullptr) {
+        return false;
+    }
+
+    std::string jpeg_data;
+    if (!camera->CaptureToJpeg(jpeg_data, true)) {
+        ESP_LOGW(TAG, "Failed to capture vision frame");
+        return false;
+    }
+
+    protocol_->SendVisionFrame(jpeg_data);
+    ESP_LOGI(TAG, "Sent vision frame, size=%u bytes", static_cast<unsigned>(jpeg_data.size()));
+    return true;
+}
+
 void Application::Schedule(std::function<void()>&& callback) {
    {
        std::lock_guard<std::mutex> lock(mutex_);
@ -962,6 +1033,8 @@ void Application::AbortSpeaking(AbortReason reason) {

 void Application::SetListeningMode(ListeningMode mode) {
    listening_mode_ = mode;
+    vad_speaking_.store(false);
+    vision_frame_sent_for_current_listen_.store(false);
    SetDeviceState(kDeviceStateListening);
 }

--- a/main/application.h
+++ b/main/application.h
@ -11,6 +11,7 @@
 #include <deque>
 #include <memory>
 #include <functional>
+#include <atomic>

 #include "protocol.h"
 #include "ota.h"
@ -40,6 +41,11 @@ enum AecMode {
    kAecOnServerSide,
 };

+enum ChatAgentMode {
+    kChatAgentModeNormal,
+    kChatAgentModeBeaver,
+};
+
 class Application {
 public:
    static Application& GetInstance() {
@ -91,6 +97,12 @@ public:
     * Sends MAIN_EVENT_TOGGLE_CHAT to be handled in Run()
     */
    void ToggleChatState();
+    void ToggleChatStateWithVision();
+    void ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled);
+    bool IsVisionTextModeEnabled() const;
+    ChatAgentMode GetChatAgentMode() const { return chat_agent_mode_.load(); }
+    const char* GetChatAgentModeName() const;
+    const char* GetChatModeName() const;

    /**
     * Start listening (event-based, thread-safe)
@ -144,6 +156,12 @@ private:
    bool aborted_ = false;
    bool assets_version_checked_ = false;
    bool play_popup_on_listening_ = false;  // Flag to play popup sound after state changes to listening
+    std::atomic<ChatAgentMode> chat_agent_mode_ = kChatAgentModeNormal;
+    std::atomic<ChatAgentMode> active_chat_agent_mode_ = kChatAgentModeNormal;
+    std::atomic<bool> vision_text_mode_enabled_ = false;
+    std::atomic<bool> active_vision_text_mode_enabled_ = false;
+    std::atomic<bool> vad_speaking_ = false;
+    std::atomic<bool> vision_frame_sent_for_current_listen_ = false;
    int clock_ticks_ = 0;
    TaskHandle_t activation_task_handle_ = nullptr;

@ -159,6 +177,7 @@ private:
    void HandleWakeWordDetectedEvent();
    void ContinueOpenAudioChannel(ListeningMode mode);
    void ContinueWakeWordInvoke(const std::string& wake_word);
+    bool SendCurrentVisionFrame();

    // Activation task (runs in background)
    void ActivationTask();
--- a/main/boards/common/camera.h
+++ b/main/boards/common/camera.h
@ -7,6 +7,8 @@ class Camera {
 public:
    virtual void SetExplainUrl(const std::string& url, const std::string& token) = 0;
    virtual bool Capture() = 0;
+    virtual bool CaptureBackground() { return Capture(); }
+    virtual bool CaptureToJpeg(std::string& jpeg_data, bool show_preview = false) { return false; }
    virtual bool SetHMirror(bool enabled) = 0;
    virtual bool SetVFlip(bool enabled) = 0;
    virtual bool SetSwapBytes(bool enabled) { return false; }  // Optional, default no-op
--- a/main/boards/common/esp_video.cc
+++ b/main/boards/common/esp_video.cc
@ -24,6 +24,7 @@
 #include "lvgl_display.h"
 #include "mcp_server.h"
 #include "system_info.h"
+#include "esp_timer.h"

 #ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
 #undef LOG_LOCAL_LEVEL
@ -55,6 +56,7 @@


 #define TAG "EspVideo"
+#define FOREGROUND_CAPTURE_PROTECTION_US (10 * 1000 * 1000)

 #if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP)
 #warning \
@ -381,11 +383,47 @@ EspVideo::~EspVideo() {
 }

 void EspVideo::SetExplainUrl(const std::string& url, const std::string& token) {
+    std::lock_guard<std::mutex> lock(frame_mutex_);
    explain_url_ = url;
    explain_token_ = token;
 }

 bool EspVideo::Capture() {
+    return CaptureFrame(true);
+}
+
+bool EspVideo::CaptureBackground() {
+    return CaptureFrame(false);
+}
+
+bool EspVideo::CaptureToJpeg(std::string& jpeg_data, bool show_preview) {
+    jpeg_data.clear();
+    if (!CaptureFrame(show_preview)) {
+        return false;
+    }
+
+    std::lock_guard<std::mutex> lock(frame_mutex_);
+    if (frame_.data == nullptr || frame_.len == 0) {
+        return false;
+    }
+
+    uint16_t w = frame_.width ? frame_.width : 320;
+    uint16_t h = frame_.height ? frame_.height : 240;
+    return image_to_jpeg_cb(
+        frame_.data, frame_.len, w, h, frame_.format, 60,
+        [](void* arg, size_t index, const void* data, size_t len) -> size_t {
+            auto jpeg_data = static_cast<std::string*>(arg);
+            if (data != nullptr && len > 0) {
+                jpeg_data->append(static_cast<const char*>(data), len);
+            }
+            return len;
+        },
+        &jpeg_data);
+}
+
+bool EspVideo::CaptureFrame(bool show_preview) {
+    std::lock_guard<std::mutex> lock(frame_mutex_);
+
    if (encoder_thread_.joinable()) {
        encoder_thread_.join();
    }
@ -394,6 +432,10 @@ bool EspVideo::Capture() {
        return false;
    }

+    if (!show_preview && esp_timer_get_time() < foreground_capture_protected_until_us_) {
+        return true;
+    }
+
    for (int i = 0; i < 3; i++) {
        struct v4l2_buffer buf = {};
        buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
@ -729,9 +771,14 @@ bool EspVideo::Capture() {
        }
    }

-    // 显示预览图片
-    auto display = dynamic_cast<LvglDisplay*>(Board::GetInstance().GetDisplay());
-    if (display != nullptr) {
+    if (show_preview) {
+        foreground_capture_protected_until_us_ = esp_timer_get_time() + FOREGROUND_CAPTURE_PROTECTION_US;
+    }
+
+    if (show_preview) {
+        // 显示预览图片
+        auto display = dynamic_cast<LvglDisplay*>(Board::GetInstance().GetDisplay());
+        if (display != nullptr) {
        if (!frame_.data) {
            ESP_LOGE(TAG, "frame.data is null");
            return false;
@ -836,6 +883,7 @@ bool EspVideo::Capture() {

        auto image = std::make_unique<LvglAllocatedImage>(data, lvgl_image_size, w, h, stride, color_format);
        display->SetPreviewImage(std::move(image));
+        }
    }
    return true;
 }
@ -898,10 +946,16 @@ bool EspVideo::SetVFlip(bool enabled) {
 * @warning 如果摄像头缓冲区为空或网络连接失败，将返回错误信息
 */
 std::string EspVideo::Explain(const std::string& question) {
+    std::lock_guard<std::mutex> lock(frame_mutex_);
+
    if (explain_url_.empty()) {
        throw std::runtime_error("Image explain URL or token is not set");
    }

+    if (frame_.data == nullptr || frame_.len == 0) {
+        throw std::runtime_error("No camera frame captured");
+    }
+
    // 创建局部的 JPEG 队列, 40 entries is about to store 512 * 40 = 20480 bytes of JPEG data
    QueueHandle_t jpeg_queue = xQueueCreate(40, sizeof(JpegChunk));
    if (jpeg_queue == nullptr) {
--- a/main/boards/common/esp_video.h
+++ b/main/boards/common/esp_video.h
@ -5,6 +5,8 @@
 #include <thread>
 #include <memory>
 #include <vector>
+#include <mutex>
+#include <cstdint>

 #include <freertos/FreeRTOS.h>
 #include <freertos/queue.h>
@ -39,6 +41,10 @@ private:
    std::string explain_url_;
    std::string explain_token_;
    std::thread encoder_thread_;
+    std::mutex frame_mutex_;
+    int64_t foreground_capture_protected_until_us_ = 0;
+
+    bool CaptureFrame(bool show_preview);

 public:
    EspVideo(const esp_video_init_config_t& config);
@ -46,6 +52,8 @@ public:

    virtual void SetExplainUrl(const std::string& url, const std::string& token);
    virtual bool Capture();
+    virtual bool CaptureBackground() override;
+    virtual bool CaptureToJpeg(std::string& jpeg_data, bool show_preview = false) override;
    // 翻转控制函数
    virtual bool SetHMirror(bool enabled) override;
    virtual bool SetVFlip(bool enabled) override;
--- a/main/boards/m5stack-core-s3/m5stack_core_s3.cc
+++ b/main/boards/m5stack-core-s3/m5stack_core_s3.cc
@ -1,21 +1,23 @@
-#include "wifi_board.h"
+#include "application.h"
+#include "axp2101.h"
+#include "config.h"
 #include "cores3_audio_codec.h"
 #include "display/lcd_display.h"
-#include "application.h"
-#include "config.h"
-#include "power_save_timer.h"
 #include "i2c_device.h"
-#include "axp2101.h"
+#include "power_save_timer.h"
+#include "wifi_board.h"

-#include <esp_log.h>
 #include <driver/i2c_master.h>
+#include <esp_lcd_ili9341.h>
 #include <esp_lcd_panel_io.h>
 #include <esp_lcd_panel_ops.h>
-#include <esp_lcd_ili9341.h>
+#include <esp_log.h>
 #include <esp_timer.h>
 #include "esp_video.h"

 #define TAG "M5StackCoreS3Board"
+#define BACKGROUND_VISION_INITIAL_DELAY_MS 8000
+#define BACKGROUND_VISION_SAMPLE_INTERVAL_MS 100

 class Pmic : public Axp2101 {
 public:
@ -41,7 +43,7 @@ public:

 class CustomBacklight : public Backlight {
 public:
-    CustomBacklight(Pmic *pmic) : pmic_(pmic) {}
+    CustomBacklight(Pmic* pmic) : pmic_(pmic) {}

    void SetBrightnessImpl(uint8_t brightness) override {
        pmic_->SetBrightness(target_brightness_);
@ -49,7 +51,7 @@ public:
    }

 private:
-    Pmic *pmic_;
+    Pmic* pmic_;
 };

 class Aw9523 : public I2cDevice {
@ -89,16 +91,14 @@ public:
        int x = -1;
        int y = -1;
    };
-    
+
    Ft6336(i2c_master_bus_handle_t i2c_bus, uint8_t addr) : I2cDevice(i2c_bus, addr) {
        uint8_t chip_id = ReadReg(0xA3);
        ESP_LOGI(TAG, "Get chip ID: 0x%02X", chip_id);
        read_buffer_ = new uint8_t[6];
    }

-    ~Ft6336() {
-        delete[] read_buffer_;
-    }
+    ~Ft6336() { delete[] read_buffer_; }

    void UpdateTouchPoint() {
        ReadRegs(0x02, read_buffer_, 6);
@ -107,9 +107,7 @@ public:
        tp_.y = ((read_buffer_[3] & 0x0F) << 8) | read_buffer_[4];
    }

-    inline const TouchPoint_t& GetTouchPoint() {
-        return tp_;
-    }
+    inline const TouchPoint_t& GetTouchPoint() { return tp_; }

 private:
    uint8_t* read_buffer_ = nullptr;
@ -137,9 +135,7 @@ private:
            GetDisplay()->SetPowerSaveMode(false);
            GetBacklight()->RestoreBrightness();
        });
-        power_save_timer_->OnShutdownRequest([this]() {
-            pmic_->PowerOff();
-        });
+        power_save_timer_->OnShutdownRequest([this]() { pmic_->PowerOff(); });
        power_save_timer_->SetEnabled(true);
    }

@ -153,9 +149,10 @@ private:
            .glitch_ignore_cnt = 7,
            .intr_priority = 0,
            .trans_queue_depth = 0,
-            .flags = {
-                .enable_internal_pullup = 1,
-            },
+            .flags =
+                {
+                    .enable_internal_pullup = 1,
+                },
        };
        ESP_ERROR_CHECK(i2c_new_master_bus(&i2c_bus_cfg, &i2c_bus_));
    }
@ -195,29 +192,37 @@ private:
    void PollTouchpad() {
        static bool was_touched = false;
        static int64_t touch_start_time = 0;
+        static int touch_start_x = -1;
        const int64_t TOUCH_THRESHOLD_MS = 500;  // 触摸时长阈值，超过500ms视为长按
-        
+
        ft6336_->UpdateTouchPoint();
        auto& touch_point = ft6336_->GetTouchPoint();
-        
+
        // 检测触摸开始
        if (touch_point.num > 0 && !was_touched) {
            was_touched = true;
-            touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒
-        } 
+            touch_start_time = esp_timer_get_time() / 1000;  // 转换为毫秒
+            touch_start_x = touch_point.x;
+        }
        // 检测触摸释放
        else if (touch_point.num == 0 && was_touched) {
            was_touched = false;
            int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time;
-            
-            // 只有短触才触发
+            bool beaver_mode = touch_start_x >= DISPLAY_WIDTH / 2;
+            auto agent_mode = beaver_mode ? kChatAgentModeBeaver : kChatAgentModeNormal;
+
            if (touch_duration < TOUCH_THRESHOLD_MS) {
                auto& app = Application::GetInstance();
                if (app.GetDeviceState() == kDeviceStateStarting) {
                    EnterWifiConfigMode();
                    return;
                }
-                app.ToggleChatState();
+                ESP_LOGI(TAG, "Touch short: %s text-only mode", beaver_mode ? "beaver" : "normal");
+                app.ToggleChatStateForMode(agent_mode, false);
+            } else {
+                auto& app = Application::GetInstance();
+                ESP_LOGI(TAG, "Touch long: %s vision+text mode", beaver_mode ? "beaver" : "normal");
+                app.ToggleChatStateForMode(agent_mode, true);
            }
        }
    }
@ -225,19 +230,20 @@ private:
    void InitializeFt6336TouchPad() {
        ESP_LOGI(TAG, "Init FT6336");
        ft6336_ = new Ft6336(i2c_bus_, 0x38);
-        
+
        // 创建定时器，20ms 间隔
        esp_timer_create_args_t timer_args = {
-            .callback = [](void* arg) {
-                M5StackCoreS3Board* board = (M5StackCoreS3Board*)arg;
-                board->PollTouchpad();
-            },
+            .callback =
+                [](void* arg) {
+                    M5StackCoreS3Board* board = (M5StackCoreS3Board*)arg;
+                    board->PollTouchpad();
+                },
            .arg = this,
            .dispatch_method = ESP_TIMER_TASK,
            .name = "touchpad_timer",
            .skip_unhandled_events = true,
        };
-        
+
        ESP_ERROR_CHECK(esp_timer_create(&timer_args, &touchpad_timer_));
        ESP_ERROR_CHECK(esp_timer_start_periodic(touchpad_timer_, 20 * 1000));
    }
@ -276,7 +282,7 @@ private:
        panel_config.rgb_ele_order = LCD_RGB_ELEMENT_ORDER_BGR;
        panel_config.bits_per_pixel = 16;
        ESP_ERROR_CHECK(esp_lcd_new_panel_ili9341(panel_io, &panel_config, &panel));
-        
+
        esp_lcd_panel_reset(panel);
        aw9523_->ResetIli9342();

@ -285,23 +291,25 @@ private:
        esp_lcd_panel_swap_xy(panel, DISPLAY_SWAP_XY);
        esp_lcd_panel_mirror(panel, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y);

-        display_ = new SpiLcdDisplay(panel_io, panel,
-                                    DISPLAY_WIDTH, DISPLAY_HEIGHT, DISPLAY_OFFSET_X, DISPLAY_OFFSET_Y, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y, DISPLAY_SWAP_XY);
+        display_ = new SpiLcdDisplay(panel_io, panel, DISPLAY_WIDTH, DISPLAY_HEIGHT,
+                                     DISPLAY_OFFSET_X, DISPLAY_OFFSET_Y, DISPLAY_MIRROR_X,
+                                     DISPLAY_MIRROR_Y, DISPLAY_SWAP_XY);
    }

-     void InitializeCamera() {
+    void InitializeCamera() {
        static esp_cam_ctlr_dvp_pin_config_t dvp_pin_config = {
            .data_width = CAM_CTLR_DATA_WIDTH_8,
-            .data_io = {
-                [0] = CAMERA_PIN_D0,
-                [1] = CAMERA_PIN_D1,
-                [2] = CAMERA_PIN_D2,
-                [3] = CAMERA_PIN_D3,
-                [4] = CAMERA_PIN_D4,
-                [5] = CAMERA_PIN_D5,
-                [6] = CAMERA_PIN_D6,
-                [7] = CAMERA_PIN_D7,
-            },
+            .data_io =
+                {
+                    [0] = CAMERA_PIN_D0,
+                    [1] = CAMERA_PIN_D1,
+                    [2] = CAMERA_PIN_D2,
+                    [3] = CAMERA_PIN_D3,
+                    [4] = CAMERA_PIN_D4,
+                    [5] = CAMERA_PIN_D5,
+                    [6] = CAMERA_PIN_D6,
+                    [7] = CAMERA_PIN_D7,
+                },
            .vsync_io = CAMERA_PIN_VSYNC,
            .de_io = CAMERA_PIN_HREF,
            .pclk_io = CAMERA_PIN_PCLK,
@ -330,6 +338,42 @@ private:
        camera_->SetHMirror(false);
    }

+    void InitializeBackgroundVisionSampler() {
+        xTaskCreate(
+            [](void* arg) {
+                auto board = static_cast<M5StackCoreS3Board*>(arg);
+                bool has_logged_success = false;
+                bool has_logged_failure = false;
+
+                vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_INITIAL_DELAY_MS));
+
+                while (true) {
+                    if (!Application::GetInstance().IsVisionTextModeEnabled()) {
+                        vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
+                        continue;
+                    }
+
+                    if (board->camera_ == nullptr) {
+                        vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
+                        continue;
+                    }
+
+                    if (board->camera_->Capture()) {
+                        if (!has_logged_success) {
+                            ESP_LOGI(TAG, "Vision preview sampler started");
+                            has_logged_success = true;
+                        }
+                    } else if (!has_logged_failure) {
+                        ESP_LOGW(TAG, "Vision preview sampler is waiting for camera");
+                        has_logged_failure = true;
+                    }
+
+                    vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
+                }
+            },
+            "BgVisionSampler", 4096, this, 1, nullptr);
+    }
+
 public:
    M5StackCoreS3Board() {
        InitializePowerSaveTimer();
@ -340,34 +384,24 @@ public:
        InitializeSpi();
        InitializeIli9342Display();
        InitializeCamera();
+        InitializeBackgroundVisionSampler();
        InitializeFt6336TouchPad();
        GetBacklight()->RestoreBrightness();
    }

    virtual AudioCodec* GetAudioCodec() override {
-        static CoreS3AudioCodec audio_codec(i2c_bus_,
-            AUDIO_INPUT_SAMPLE_RATE,
-            AUDIO_OUTPUT_SAMPLE_RATE,
-            AUDIO_I2S_GPIO_MCLK,
-            AUDIO_I2S_GPIO_BCLK,
-            AUDIO_I2S_GPIO_WS,
-            AUDIO_I2S_GPIO_DOUT,
-            AUDIO_I2S_GPIO_DIN,
-            AUDIO_CODEC_AW88298_ADDR,
-            AUDIO_CODEC_ES7210_ADDR,
-            AUDIO_INPUT_REFERENCE);
+        static CoreS3AudioCodec audio_codec(
+            i2c_bus_, AUDIO_INPUT_SAMPLE_RATE, AUDIO_OUTPUT_SAMPLE_RATE, AUDIO_I2S_GPIO_MCLK,
+            AUDIO_I2S_GPIO_BCLK, AUDIO_I2S_GPIO_WS, AUDIO_I2S_GPIO_DOUT, AUDIO_I2S_GPIO_DIN,
+            AUDIO_CODEC_AW88298_ADDR, AUDIO_CODEC_ES7210_ADDR, AUDIO_INPUT_REFERENCE);
        return &audio_codec;
    }

-    virtual Display* GetDisplay() override {
-        return display_;
-    }
+    virtual Display* GetDisplay() override { return display_; }

-    virtual Camera* GetCamera() override {
-        return camera_;
-    }
+    virtual Camera* GetCamera() override { return camera_; }

-    virtual bool GetBatteryLevel(int &level, bool& charging, bool& discharging) override {
+    virtual bool GetBatteryLevel(int& level, bool& charging, bool& discharging) override {
        static bool last_discharging = false;
        charging = pmic_->IsCharging();
        discharging = pmic_->IsDischarging();
@ -387,7 +421,7 @@ public:
        WifiBoard::SetPowerSaveLevel(level);
    }

-    virtual Backlight *GetBacklight() override {
+    virtual Backlight* GetBacklight() override {
        static CustomBacklight backlight(pmic_);
        return &backlight;
    }
--- a/main/boards/waveshare/esp32-s3-touch-lcd-4.3c/sdkconfig.4_3c
+++ b/main/boards/waveshare/esp32-s3-touch-lcd-4.3c/sdkconfig.4_3c
@ -599,7 +599,7 @@ CONFIG_PARTITION_TABLE_MD5=y
 #
 CONFIG_OTA_URL="https://api.tenclass.net/xiaozhi/ota/"
 CONFIG_USE_DIRECT_WEBSOCKET=y
-CONFIG_WEBSOCKET_URL="ws://10.6.80.130:8080"
+CONFIG_WEBSOCKET_URL="ws://172.19.0.240:8080"
 CONFIG_WEBSOCKET_TOKEN=""
 CONFIG_WEBSOCKET_PROTOCOL_VERSION=1
 # CONFIG_FLASH_NONE_ASSETS is not set
--- a/main/bridge_server.py
+++ b/main/bridge_server.py
@ -1,6 +1,8 @@
 import asyncio
+import base64
 import json
 import os
+import re
 import shutil
 import struct
 import sys
@ -8,6 +10,7 @@ import time
 import traceback
 import uuid
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import Any, Optional

 import httpx
@ -26,11 +29,24 @@ TOKEN_URL = "http://172.19.0.240:8000/getToken"
 LIVEKIT_WS_URL = "ws://172.19.0.240:7880"
 ROOM_PREFIX = "test-livekit"
 IDENTITY_PREFIX = "uv-livekit"
-AGENT_NAME = "my-agent"
+LEGACY_AGENT_NAME = os.getenv("LIVEKIT_AGENT_NAME", "normal-agent")
+DEFAULT_AGENT_MODE = os.getenv("LIVEKIT_DEFAULT_AGENT_MODE", "normal").strip().lower()
+AGENT_NAMES = {
+    "normal": os.getenv("LIVEKIT_NORMAL_AGENT_NAME", LEGACY_AGENT_NAME),
+    "beaver": os.getenv("LIVEKIT_BEAVER_AGENT_NAME", "beaver-agent"),
+}
+CHAT_MODE_AGENT_NAMES = {
+    "normal": AGENT_NAMES["normal"],
+    "beaver": AGENT_NAMES["beaver"],
+    "vision-normal": os.getenv("LIVEKIT_VISION_NORMAL_AGENT_NAME", "vision-normal-agent"),
+    "vision-beaver": os.getenv("LIVEKIT_VISION_BEAVER_AGENT_NAME", "vision-beaver-agent"),
+}
 CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0"))
 AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
 WS_PORT = 8080
 AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower()
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+VISION_FRAME_SAVE_DIR = Path(os.getenv("VISION_FRAME_SAVE_DIR", str(PROJECT_ROOT / "vision_frames")))

 INPUT_SAMPLE_RATE = 16000
 OUTPUT_SAMPLE_RATE = 24000
@ -45,11 +61,22 @@ TTS_PRE_ROLL_MS = 80
 TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1
 TTS_INTERRUPT_SILENCE_FRAMES = 3
 INTERRUPT_TOPIC = "lk.interrupt"
+VISION_FRAME_TOPIC = "vision.frame"
 TTS_DISPLAY_SENTENCE_BREAKS = "。！？!?；;"
 TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
 TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
 TTS_DISPLAY_SCROLL_GAP = "   "
 TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8"))
+EMOTION_TEXT_PATTERN = re.compile(
+    r"^\s*<?\s*emotion\s*=\s*([^\s>,，;；]+)\s*>?[\s,，;；]*(.*)$",
+    re.DOTALL,
+)
+EMOTION_TEST_SEQUENCE = [
+    emotion.strip()
+    for emotion in os.getenv("BRIDGE_EMOTION_TEST_SEQUENCE", "").split(",")
+    if emotion.strip()
+]
+EMOTION_TEST_INTERVAL_SECONDS = float(os.getenv("BRIDGE_EMOTION_TEST_INTERVAL_SECONDS", "2.0"))


@dataclass
@ -59,6 +86,10 @@ class DeviceSession:
    protocol_version: int
    room_name: str
    identity: str
+    chat_mode: str
+    agent_mode: str
+    agent_name: str
+    vision_enabled: bool
    room: rtc.Room
    mic_source: AudioSource
    agent_ready: asyncio.Event
@ -70,6 +101,7 @@ class DeviceSession:
    tts_transcript_text: str = ""
    tts_display_text: str = ""
    tts_display_final: bool = False
+    tts_emotion: str = ""
    tts_suppressed_until: float = 0.0
    agent_dispatch_task: Optional[asyncio.Task] = None
    closed: bool = False
@ -77,10 +109,10 @@ class DeviceSession:
    first_capture_log_time: float = 0.0


-async def fetch_token(room_name: str, identity: str) -> str:
+async def fetch_token(room_name: str, identity: str, agent_name: str) -> str:
    params = {"room": room_name, "identity": identity}
    if AGENT_DISPATCH_MODE == "token":
-        params["agent_name"] = AGENT_NAME
+        params["agent_name"] = agent_name

    async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
        response = await client.get(TOKEN_URL, params=params)
@ -120,15 +152,61 @@ class ESP32LiveKitBridge:
        print(f"[session] device={device_id} room={room_name} identity={identity}")
        return room_name, identity

-    def _is_agent_participant(self, participant: rtc.RemoteParticipant) -> bool:
+    def _resolve_agent_selection(self, headers: Any) -> tuple[str, str, str, bool]:
+        requested_chat_mode = (
+            headers.get("Chat-Mode")
+            or headers.get("X-Chat-Mode")
+            or ""
+        ).strip().lower()
+        chat_mode_to_agent = {
+            "normal": ("normal", False),
+            "beaver": ("beaver", False),
+            "vision-normal": ("normal", True),
+            "vision-beaver": ("beaver", True),
+        }
+
+        if requested_chat_mode in chat_mode_to_agent:
+            requested_mode, vision_enabled = chat_mode_to_agent[requested_chat_mode]
+        else:
+            if requested_chat_mode:
+                print(f"未知 Chat-Mode={requested_chat_mode!r}，回退到 Agent-Mode")
+            requested_mode = (
+                headers.get("Agent-Mode")
+                or headers.get("X-Agent-Mode")
+                or DEFAULT_AGENT_MODE
+                or "normal"
+            ).strip().lower()
+            vision_enabled = False
+            requested_chat_mode = requested_mode
+
+        requested_name = headers.get("Agent-Name") or headers.get("X-Agent-Name")
+        if requested_name:
+            return requested_chat_mode, "custom", requested_name, vision_enabled
+
+        if requested_mode not in AGENT_NAMES:
+            print(f"未知 Agent-Mode={requested_mode!r}，回退到 normal")
+            requested_mode = "normal"
+            requested_chat_mode = "vision-normal" if vision_enabled else "normal"
+
+        if requested_chat_mode in CHAT_MODE_AGENT_NAMES:
+            return (
+                requested_chat_mode,
+                requested_mode,
+                CHAT_MODE_AGENT_NAMES[requested_chat_mode],
+                vision_enabled,
+            )
+
+        return requested_chat_mode, requested_mode, AGENT_NAMES[requested_mode], vision_enabled
+
+    def _is_agent_participant(self, participant: rtc.RemoteParticipant, agent_name: str) -> bool:
        identity = getattr(participant, "identity", "") or ""
-        return identity.startswith("agent-") or AGENT_NAME in identity
+        return identity.startswith("agent-") or agent_name in identity

    def _get_agent_identities(self, session: DeviceSession) -> list[str]:
        return [
            participant.identity
            for participant in session.room.remote_participants.values()
-            if self._is_agent_participant(participant)
+            if self._is_agent_participant(participant, session.agent_name)
        ]

    def _log_agent_participants(self, session: DeviceSession, source: str) -> None:
@ -170,7 +248,7 @@ class ESP32LiveKitBridge:
                dispatch_agent_name = getattr(dispatch, "agent_name", None)
                dispatch_room = getattr(dispatch, "room", None)
                if dispatch_room == session.room_name and (
-                    dispatch_agent_name == AGENT_NAME or dispatch_agent_name is None
+                    dispatch_agent_name == session.agent_name or dispatch_agent_name is None
                ):
                    print(
                        f"检测到已有 dispatch: room={session.room_name} "
@ -193,7 +271,7 @@ class ESP32LiveKitBridge:
            return

        for participant in session.room.remote_participants.values():
-            if self._is_agent_participant(participant):
+            if self._is_agent_participant(participant, session.agent_name):
                # print(f"Agent 已在房间中，跳过 dispatch: {participant.identity}")
                return

@ -205,7 +283,7 @@ class ESP32LiveKitBridge:
        await session.agent_dispatch_task

    async def _dispatch_agent(self, session: DeviceSession) -> None:
-        print(f"准备 dispatch agent: room={session.room_name}, agent={AGENT_NAME}")
+        print(f"准备 dispatch agent: room={session.room_name}, agent={session.agent_name}")

        try:
            if await self._dispatch_agent_with_sdk(session):
@ -235,13 +313,17 @@ class ESP32LiveKitBridge:
        try:
            dispatch = await lkapi.agent_dispatch.create_dispatch(
                livekit_api.CreateAgentDispatchRequest(
-                    agent_name=AGENT_NAME,
+                    agent_name=session.agent_name,
                    room=session.room_name,
                    metadata=json.dumps(
                        {
                            "source": "bridge_server",
                            "identity": session.identity,
                            "device_id": session.device_id,
+                            "chat_mode": session.chat_mode,
+                            "agent_mode": session.agent_mode,
+                            "agent_name": session.agent_name,
+                            "vision_enabled": session.vision_enabled,
                        }
                    ),
                )
@ -264,13 +346,17 @@ class ESP32LiveKitBridge:
            "--room",
            session.room_name,
            "--agent-name",
-            AGENT_NAME,
+            session.agent_name,
            "--metadata",
            json.dumps(
                {
                    "source": "bridge_server",
                    "identity": session.identity,
                    "device_id": session.device_id,
+                    "chat_mode": session.chat_mode,
+                    "agent_mode": session.agent_mode,
+                    "agent_name": session.agent_name,
+                    "vision_enabled": session.vision_enabled,
                }
            ),
            stdout=asyncio.subprocess.PIPE,
@ -289,7 +375,7 @@ class ESP32LiveKitBridge:
            print(f"lk dispatch create 失败，退出码: {process.returncode}")
            return False

-        print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={AGENT_NAME}")
+        print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={session.agent_name}")
        return True

    async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool:
@ -336,6 +422,73 @@ class ESP32LiveKitBridge:
        if not ok:
            print("警告: bridge 已停止 TTS，但 agent 侧 interrupt 未确认送出")

+    def _save_vision_frame(self, session: DeviceSession, image: str) -> Optional[Path]:
+        try:
+            image_bytes = base64.b64decode(image, validate=True)
+        except Exception as exc:
+            print(f"vision frame base64 解码失败: {exc}")
+            return None
+
+        safe_device_id = "".join(
+            char if char.isalnum() or char in ("-", "_") else "_"
+            for char in session.device_id
+        )
+        timestamp_ms = int(time.time() * 1000)
+        VISION_FRAME_SAVE_DIR.mkdir(parents=True, exist_ok=True)
+        path = VISION_FRAME_SAVE_DIR / f"{timestamp_ms}_{safe_device_id}.jpg"
+        path.write_bytes(image_bytes)
+        return path
+
+    async def _publish_vision_frame(self, session: DeviceSession, message: dict[str, Any]) -> None:
+        image = message.get("image")
+        if not isinstance(image, str) or not image:
+            print("收到 vision frame，但 image 字段为空")
+            return
+
+        saved_path = self._save_vision_frame(session, image)
+        if saved_path is None:
+            return
+        print(f"已保存 vision frame: {saved_path}")
+
+        participant = getattr(session.room, "local_participant", None)
+        if participant is None:
+            print("跳过发送 vision frame，local participant 尚未就绪")
+            return
+
+        payload = {
+            "type": "vision_frame",
+            "topic": VISION_FRAME_TOPIC,
+            "room": session.room_name,
+            "identity": session.identity,
+            "device_id": session.device_id,
+            "mime_type": message.get("mime_type", "image/jpeg"),
+            "image": image,
+            "saved_path": str(saved_path),
+        }
+        data = json.dumps(payload).encode("utf-8")
+        agent_identities = self._get_agent_identities(session)
+        kwargs: dict[str, Any] = {}
+        if agent_identities:
+            kwargs["destination_identities"] = agent_identities
+
+        last_error: Optional[Exception] = None
+        for attempt in ({"topic": VISION_FRAME_TOPIC, **kwargs}, kwargs):
+            try:
+                await participant.publish_data(data, **attempt)
+                print(
+                    f"已发送 vision frame: bytes={len(data)} "
+                    f"targets={agent_identities or 'broadcast'}"
+                )
+                return
+            except TypeError as exc:
+                last_error = exc
+            except Exception as exc:
+                print(f"发送 vision frame 失败: {exc}")
+                return
+
+        if last_error is not None:
+            print(f"发送 vision frame 失败，publish_data 签名不兼容: {last_error}")
+
    async def _send_tts_state(self, session: DeviceSession, state: str) -> None:
        if session.websocket is None:
            print(f"跳过 tts {state}，ESP32 尚未连接")
@ -343,9 +496,40 @@ class ESP32LiveKitBridge:
        await session.websocket.send(json.dumps({"type": "tts", "state": state}))
        print(f"已发送 tts {state}: device={session.device_id}")

+    async def _send_emotion(self, session: DeviceSession, emotion: str) -> None:
+        if session.websocket is None:
+            print(f"跳过 emotion {emotion}，ESP32 尚未连接")
+            return
+        await session.websocket.send(json.dumps({"type": "llm", "emotion": emotion}))
+        print(f"已发送 emotion: device={session.device_id} emotion={emotion}")
+
+    def _parse_emotion_text(self, text: str) -> tuple[Optional[str], str]:
+        match = EMOTION_TEXT_PATTERN.match(text)
+        if match is None:
+            return None, text.strip()
+        emotion, tts_text = match.groups()
+        return emotion.strip(), tts_text.strip()
+
+    async def _run_emotion_test_sequence(self, session: DeviceSession) -> None:
+        if not EMOTION_TEST_SEQUENCE:
+            return
+
+        for index, emotion in enumerate(EMOTION_TEST_SEQUENCE):
+            if session.websocket is None or session.closed:
+                return
+            if index > 0:
+                await asyncio.sleep(EMOTION_TEST_INTERVAL_SECONDS)
+            await self._send_emotion(session, emotion)
+
    async def _send_tts_text(self, session: DeviceSession, text: str, final: bool) -> None:
        if session.websocket is None:
            return
+        raw_text = text
+        _emotion, text = self._parse_emotion_text(text)
+        if not text:
+            print(f"[tts->esp32] skip empty text: raw={raw_text!r} final={final}")
+            return
+        print(f"[tts->esp32] text={text!r} final={final}")
        await session.websocket.send(
            json.dumps(
                {
@ -421,6 +605,7 @@ class ESP32LiveKitBridge:
        if not session.tts_display_text:
            session.tts_transcript_text = ""
            session.tts_display_final = False
+            session.tts_emotion = ""
            self._cancel_tts_display_task(session)
        await self._send_tts_state(session, "start")
        session.tts_active = True
@ -435,6 +620,7 @@ class ESP32LiveKitBridge:
        session.tts_transcript_text = ""
        session.tts_display_text = ""
        session.tts_display_final = False
+        session.tts_emotion = ""

    async def _force_stop_tts(self, session: DeviceSession, reason: str) -> None:
        self._cancel_tts_display_task(session)
@ -445,6 +631,7 @@ class ESP32LiveKitBridge:
        session.tts_transcript_text = ""
        session.tts_display_text = ""
        session.tts_display_final = False
+        session.tts_emotion = ""
        await self._send_tts_state(session, "stop")
        print(f"已强制停止本地 TTS: device={session.device_id} reason={reason}")

@ -609,16 +796,16 @@ class ESP32LiveKitBridge:
            print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}")
            self._log_agent_participants(session, "connected")
            for participant in session.room.remote_participants.values():
-                if self._is_agent_participant(participant):
+                if self._is_agent_participant(participant, session.agent_name):
                    session.agent_ready.set()
                    self._scan_participant_audio_tracks(session, participant, "connected_scan")

        @session.room.on("participant_connected")
        def on_participant_connected(participant: rtc.RemoteParticipant) -> None:
-            role = "Agent" if self._is_agent_participant(participant) else "Remote participant"
+            role = "Agent" if self._is_agent_participant(participant, session.agent_name) else "Remote participant"
            print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}")
            self._log_agent_participants(session, "participant_connected")
-            if self._is_agent_participant(participant):
+            if self._is_agent_participant(participant, session.agent_name):
                session.agent_ready.set()
                self._scan_participant_audio_tracks(
                    session, participant, "participant_connected_scan"
@ -651,14 +838,26 @@ class ESP32LiveKitBridge:
            track_pub: rtc.TrackPublication,
        ) -> None:
            identity = participant.identity if participant else "未知"
-            is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(participant)
+            is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(
+                participant, session.agent_name
+            )
            for segment in segments:
                status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果"
                print(f"🗣️  [{status} | room={session.room_name} | {identity}]: {segment.text}")
                if is_agent:
                    if time.monotonic() < session.tts_suppressed_until:
                        continue
-                    display_text = self._current_tts_display_text(segment.text)
+                    print(f"[livekit-llm] raw={segment.text!r} final={segment.final}")
+                    emotion, tts_text = self._parse_emotion_text(segment.text)
+                    print(
+                        f"[livekit-llm] parsed emotion={emotion!r} "
+                        f"tts_text={tts_text!r} final={segment.final}"
+                    )
+                    if emotion and emotion != session.tts_emotion:
+                        session.tts_emotion = emotion
+                        asyncio.create_task(self._send_emotion(session, emotion))
+                    display_text = self._current_tts_display_text(tts_text)
+                    print(f"[livekit-llm] display_text={display_text!r} final={segment.final}")
                    if not display_text or display_text == session.tts_transcript_text:
                        continue
                    session.tts_transcript_text = display_text
@ -713,7 +912,7 @@ class ESP32LiveKitBridge:
        # print(f"[config] token_url={TOKEN_URL}")
        # print(f"[config] room={session.room_name} identity={session.identity}")
        # print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}")
-        token = await fetch_token(session.room_name, session.identity)
+        token = await fetch_token(session.room_name, session.identity, session.agent_name)

        try:
            await session.room.connect(
@ -758,8 +957,12 @@ class ESP32LiveKitBridge:
        # print(f"等待 agent 加入: room={session.room_name}")
        try:
            await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS)
+            if session.closed:
+                return
            # print(f"✅ agent 已就绪: room={session.room_name}")
        except asyncio.TimeoutError:
+            if session.closed:
+                return
            print(f"⚠️ agent 等待超时: room={session.room_name}")

    async def start(self) -> None:
@ -767,6 +970,20 @@ class ESP32LiveKitBridge:
        print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}")
        print(f"[config] token_url={TOKEN_URL}")
        print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}")
+        print(
+            "[config] agents="
+            f"normal:{CHAT_MODE_AGENT_NAMES['normal']} "
+            f"beaver:{CHAT_MODE_AGENT_NAMES['beaver']} "
+            f"vision-normal:{CHAT_MODE_AGENT_NAMES['vision-normal']} "
+            f"vision-beaver:{CHAT_MODE_AGENT_NAMES['vision-beaver']} "
+            f"default_mode:{DEFAULT_AGENT_MODE}"
+        )
+        if EMOTION_TEST_SEQUENCE:
+            print(
+                "[config] emotion_test_sequence="
+                f"{','.join(EMOTION_TEST_SEQUENCE)} "
+                f"interval={EMOTION_TEST_INTERVAL_SECONDS}s"
+            )

    async def close(self) -> None:
        for session in list(self.device_sessions.values()):
@ -777,6 +994,7 @@ class ESP32LiveKitBridge:
            return
        session.closed = True
        session.websocket = None
+        session.agent_ready.set()
        session.tts_active = False
        session.tts_stream_id += 1
        if session.tts_idle_task is not None:
@ -926,6 +1144,7 @@ class ESP32LiveKitBridge:
            await self._close_session(existing_session)
            self.device_sessions.pop(device_id, None)

+        chat_mode, agent_mode, agent_name, vision_enabled = self._resolve_agent_selection(websocket.request.headers)
        room_name, identity = self._build_session_names(device_id)
        session = DeviceSession(
            device_id=device_id,
@ -933,20 +1152,27 @@ class ESP32LiveKitBridge:
            protocol_version=protocol_version,
            room_name=room_name,
            identity=identity,
+            chat_mode=chat_mode,
+            agent_mode=agent_mode,
+            agent_name=agent_name,
+            vision_enabled=vision_enabled,
            room=rtc.Room(),
            mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1),
            agent_ready=asyncio.Event(),
        )
        self.device_sessions[device_id] = session

-        # print(f"ESP32 已连接: device={device_id}")
-        # print(f"ESP32 协议版本: {session.protocol_version}")
+        print(f"ESP32 已连接: device={device_id}")
+        print(f"ESP32 协议版本: {session.protocol_version}")
+        print(
+            f"ESP32 mode: chat={session.chat_mode} "
+            f"agent={session.agent_mode}/{session.agent_name} "
+            f"vision={session.vision_enabled}"
+        )
        session.tts_stream_id += 1
        opus_decoder = None

        try:
-            await self._connect_session_room(session)
-
            hello_msg = {
                "type": "hello",
                "transport": "websocket",
@ -962,6 +1188,10 @@ class ESP32LiveKitBridge:
                },
            }
            await websocket.send(json.dumps(hello_msg))
+            print(f"已发送 server hello: device={device_id} room={session.room_name}")
+            asyncio.create_task(self._run_emotion_test_sequence(session))
+
+            await self._connect_session_room(session)

            async for message in websocket:
                if isinstance(message, bytes):
@ -1017,6 +1247,8 @@ class ESP32LiveKitBridge:
                            abort_reason = reason if isinstance(reason, str) and reason else "button_abort"
                            print(f"处理 ESP32 打断请求: reason={abort_reason}")
                            await self._abort_tts(session, abort_reason)
+                        elif msg_type == "vision" and data.get("state") == "frame":
+                            await self._publish_vision_frame(session, data)
                    except json.JSONDecodeError:
                        print(f"收到未知的字符消息: {message}")
        except ConnectionClosedError as exc:
--- a/main/protocols/protocol.cc
+++ b/main/protocols/protocol.cc
@ -1,9 +1,22 @@
 #include "protocol.h"

 #include <esp_log.h>
+#include <mbedtls/base64.h>

 #define TAG "Protocol"

+static std::string Base64Encode(const std::string& data) {
+    size_t encoded_length = 0;
+    size_t output_length = 0;
+    mbedtls_base64_encode(nullptr, 0, &encoded_length,
+                          reinterpret_cast<const unsigned char*>(data.data()), data.size());
+    std::string result(encoded_length, 0);
+    mbedtls_base64_encode(reinterpret_cast<unsigned char*>(result.data()), result.size(), &output_length,
+                          reinterpret_cast<const unsigned char*>(data.data()), data.size());
+    result.resize(output_length);
+    return result;
+}
+
 void Protocol::OnIncomingJson(std::function<void(const cJSON* root)> callback) {
    on_incoming_json_ = callback;
 }
@ -78,6 +91,27 @@ void Protocol::SendMcpMessage(const std::string& payload) {
    SendText(message);
 }

+void Protocol::SendVisionFrame(const std::string& jpeg_data) {
+    if (jpeg_data.empty()) {
+        return;
+    }
+
+    cJSON* root = cJSON_CreateObject();
+    cJSON_AddStringToObject(root, "session_id", session_id_.c_str());
+    cJSON_AddStringToObject(root, "type", "vision");
+    cJSON_AddStringToObject(root, "state", "frame");
+    cJSON_AddStringToObject(root, "mime_type", "image/jpeg");
+    auto encoded = Base64Encode(jpeg_data);
+    cJSON_AddStringToObject(root, "image", encoded.c_str());
+
+    char* json_str = cJSON_PrintUnformatted(root);
+    if (json_str != nullptr) {
+        SendText(json_str);
+        cJSON_free(json_str);
+    }
+    cJSON_Delete(root);
+}
+
 bool Protocol::IsTimeout() const {
    const int kTimeoutSeconds = 120;
    auto now = std::chrono::steady_clock::now();
--- a/main/protocols/protocol.h
+++ b/main/protocols/protocol.h
@ -73,6 +73,7 @@ public:
    virtual void SendStopListening();
    virtual void SendAbortSpeaking(AbortReason reason);
    virtual void SendMcpMessage(const std::string& message);
+    virtual void SendVisionFrame(const std::string& jpeg_data);

 protected:
    std::function<void(const cJSON* root)> on_incoming_json_;
@ -95,4 +96,3 @@ protected:
 };

 #endif // PROTOCOL_H
-
--- a/main/protocols/websocket_protocol.cc
+++ b/main/protocols/websocket_protocol.cc
@ -119,6 +119,8 @@ bool WebsocketProtocol::OpenAudioChannel() {
    websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str());
    websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
    websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
+    websocket_->SetHeader("Agent-Mode", Application::GetInstance().GetChatAgentModeName());
+    websocket_->SetHeader("Chat-Mode", Application::GetInstance().GetChatModeName());

    websocket_->OnData([this](const char* data, size_t len, bool binary) {
        if (binary) {
Author	SHA1	Message	Date
0Xiao0	9637e09aef	feat: beaver	2026-06-04 15:48:10 +08:00
0Xiao0	b92e6e1b07	feat: remove background cam every time	2026-05-29 14:53:58 +08:00
0Xiao0	33ee598c21	feat: add icon beaver	2026-05-29 11:22:31 +08:00
0Xiao0	37343ac0fe	feat: icon first commit	2026-05-27 17:16:11 +08:00
0Xiao0	fc6302661d	feat: support camera capture to livekit	2026-05-25 17:21:11 +08:00