From fc6302661d2cb42af3ed8b4d60cd0467830f5219 Mon Sep 17 00:00:00 2001 From: 0Xiao0 <511201264@qq.com> Date: Mon, 25 May 2026 17:21:11 +0800 Subject: [PATCH] feat: support camera capture to livekit --- .gitignore | 1 + main/Kconfig.projbuild | 2 +- main/application.cc | 35 +++- main/application.h | 4 + main/boards/common/camera.h | 2 + main/boards/common/esp_video.cc | 60 ++++++- main/boards/common/esp_video.h | 8 + .../boards/m5stack-core-s3/m5stack_core_s3.cc | 159 ++++++++++-------- .../esp32-s3-touch-lcd-4.3c/sdkconfig.4_3c | 2 +- main/bridge_server.py | 83 ++++++++- main/protocols/protocol.cc | 34 ++++ main/protocols/protocol.h | 2 +- 12 files changed, 314 insertions(+), 78 deletions(-) diff --git a/.gitignore b/.gitignore index 5326d47..0efd9de 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ sdkconfig dependencies.lock .env releases/ +vision_frames/ main/assets/lang_config.h main/mmap_generate_emoji.h .DS_Store diff --git a/main/Kconfig.projbuild b/main/Kconfig.projbuild index c136b4c..d1f1dbf 100644 --- a/main/Kconfig.projbuild +++ b/main/Kconfig.projbuild @@ -15,7 +15,7 @@ config USE_DIRECT_WEBSOCKET config WEBSOCKET_URL string "Default WebSocket URL" depends on USE_DIRECT_WEBSOCKET - default "ws://10.6.80.130:8080" + default "ws://172.19.0.240:8080" help The WebSocket server URL used when direct WebSocket mode is enabled. diff --git a/main/application.cc b/main/application.cc index 23da40f..981a675 100644 --- a/main/application.cc +++ b/main/application.cc @@ -673,10 +673,17 @@ void Application::DismissAlert() { } void Application::ToggleChatState() { + vision_text_mode_enabled_.store(false); + xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT); +} + +void Application::ToggleChatStateWithVision() { + vision_text_mode_enabled_.store(true); xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT); } void Application::StartListening() { + vision_text_mode_enabled_.store(false); xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING); } @@ -686,7 +693,10 @@ void Application::StopListening() { void Application::HandleToggleChatEvent() { auto state = GetDeviceState(); - + if (state != kDeviceStateIdle) { + vision_text_mode_enabled_.store(false); + } + if (state == kDeviceStateActivating) { SetDeviceState(kDeviceStateIdle); return; @@ -905,6 +915,9 @@ void Application::HandleStateChangedEvent() { audio_service_.WaitForPlaybackQueueEmpty(); } + if (vision_text_mode_enabled_.load()) { + SendCurrentVisionFrame(); + } // Send the start listening command protocol_->SendStartListening(listening_mode_); audio_service_.EnableVoiceProcessing(true); @@ -944,6 +957,26 @@ void Application::HandleStateChangedEvent() { } } +void Application::SendCurrentVisionFrame() { + if (!protocol_ || !protocol_->IsAudioChannelOpened()) { + return; + } + + auto camera = Board::GetInstance().GetCamera(); + if (camera == nullptr) { + return; + } + + std::string jpeg_data; + if (!camera->CaptureToJpeg(jpeg_data, false)) { + ESP_LOGW(TAG, "Failed to capture vision frame"); + return; + } + + protocol_->SendVisionFrame(jpeg_data); + ESP_LOGI(TAG, "Sent vision frame, size=%u bytes", static_cast(jpeg_data.size())); +} + void Application::Schedule(std::function&& callback) { { std::lock_guard lock(mutex_); diff --git a/main/application.h b/main/application.h index cb635d4..8aae2c5 100644 --- a/main/application.h +++ b/main/application.h @@ -11,6 +11,7 @@ #include #include #include +#include #include "protocol.h" #include "ota.h" @@ -91,6 +92,7 @@ public: * Sends MAIN_EVENT_TOGGLE_CHAT to be handled in Run() */ void ToggleChatState(); + void ToggleChatStateWithVision(); /** * Start listening (event-based, thread-safe) @@ -144,6 +146,7 @@ private: bool aborted_ = false; bool assets_version_checked_ = false; bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening + std::atomic vision_text_mode_enabled_ = false; int clock_ticks_ = 0; TaskHandle_t activation_task_handle_ = nullptr; @@ -159,6 +162,7 @@ private: void HandleWakeWordDetectedEvent(); void ContinueOpenAudioChannel(ListeningMode mode); void ContinueWakeWordInvoke(const std::string& wake_word); + void SendCurrentVisionFrame(); // Activation task (runs in background) void ActivationTask(); diff --git a/main/boards/common/camera.h b/main/boards/common/camera.h index 09a6a3e..38e795c 100644 --- a/main/boards/common/camera.h +++ b/main/boards/common/camera.h @@ -7,6 +7,8 @@ class Camera { public: virtual void SetExplainUrl(const std::string& url, const std::string& token) = 0; virtual bool Capture() = 0; + virtual bool CaptureBackground() { return Capture(); } + virtual bool CaptureToJpeg(std::string& jpeg_data, bool show_preview = false) { return false; } virtual bool SetHMirror(bool enabled) = 0; virtual bool SetVFlip(bool enabled) = 0; virtual bool SetSwapBytes(bool enabled) { return false; } // Optional, default no-op diff --git a/main/boards/common/esp_video.cc b/main/boards/common/esp_video.cc index 9fd4b12..084395d 100644 --- a/main/boards/common/esp_video.cc +++ b/main/boards/common/esp_video.cc @@ -24,6 +24,7 @@ #include "lvgl_display.h" #include "mcp_server.h" #include "system_info.h" +#include "esp_timer.h" #ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE #undef LOG_LOCAL_LEVEL @@ -55,6 +56,7 @@ #define TAG "EspVideo" +#define FOREGROUND_CAPTURE_PROTECTION_US (10 * 1000 * 1000) #if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP) #warning \ @@ -381,11 +383,47 @@ EspVideo::~EspVideo() { } void EspVideo::SetExplainUrl(const std::string& url, const std::string& token) { + std::lock_guard lock(frame_mutex_); explain_url_ = url; explain_token_ = token; } bool EspVideo::Capture() { + return CaptureFrame(true); +} + +bool EspVideo::CaptureBackground() { + return CaptureFrame(false); +} + +bool EspVideo::CaptureToJpeg(std::string& jpeg_data, bool show_preview) { + jpeg_data.clear(); + if (!CaptureFrame(show_preview)) { + return false; + } + + std::lock_guard lock(frame_mutex_); + if (frame_.data == nullptr || frame_.len == 0) { + return false; + } + + uint16_t w = frame_.width ? frame_.width : 320; + uint16_t h = frame_.height ? frame_.height : 240; + return image_to_jpeg_cb( + frame_.data, frame_.len, w, h, frame_.format, 60, + [](void* arg, size_t index, const void* data, size_t len) -> size_t { + auto jpeg_data = static_cast(arg); + if (data != nullptr && len > 0) { + jpeg_data->append(static_cast(data), len); + } + return len; + }, + &jpeg_data); +} + +bool EspVideo::CaptureFrame(bool show_preview) { + std::lock_guard lock(frame_mutex_); + if (encoder_thread_.joinable()) { encoder_thread_.join(); } @@ -394,6 +432,10 @@ bool EspVideo::Capture() { return false; } + if (!show_preview && esp_timer_get_time() < foreground_capture_protected_until_us_) { + return true; + } + for (int i = 0; i < 3; i++) { struct v4l2_buffer buf = {}; buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; @@ -729,9 +771,14 @@ bool EspVideo::Capture() { } } - // 显示预览图片 - auto display = dynamic_cast(Board::GetInstance().GetDisplay()); - if (display != nullptr) { + if (show_preview) { + foreground_capture_protected_until_us_ = esp_timer_get_time() + FOREGROUND_CAPTURE_PROTECTION_US; + } + + if (show_preview) { + // 显示预览图片 + auto display = dynamic_cast(Board::GetInstance().GetDisplay()); + if (display != nullptr) { if (!frame_.data) { ESP_LOGE(TAG, "frame.data is null"); return false; @@ -836,6 +883,7 @@ bool EspVideo::Capture() { auto image = std::make_unique(data, lvgl_image_size, w, h, stride, color_format); display->SetPreviewImage(std::move(image)); + } } return true; } @@ -898,10 +946,16 @@ bool EspVideo::SetVFlip(bool enabled) { * @warning 如果摄像头缓冲区为空或网络连接失败,将返回错误信息 */ std::string EspVideo::Explain(const std::string& question) { + std::lock_guard lock(frame_mutex_); + if (explain_url_.empty()) { throw std::runtime_error("Image explain URL or token is not set"); } + if (frame_.data == nullptr || frame_.len == 0) { + throw std::runtime_error("No camera frame captured"); + } + // 创建局部的 JPEG 队列, 40 entries is about to store 512 * 40 = 20480 bytes of JPEG data QueueHandle_t jpeg_queue = xQueueCreate(40, sizeof(JpegChunk)); if (jpeg_queue == nullptr) { diff --git a/main/boards/common/esp_video.h b/main/boards/common/esp_video.h index 063276e..1a27a9b 100644 --- a/main/boards/common/esp_video.h +++ b/main/boards/common/esp_video.h @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include @@ -39,6 +41,10 @@ private: std::string explain_url_; std::string explain_token_; std::thread encoder_thread_; + std::mutex frame_mutex_; + int64_t foreground_capture_protected_until_us_ = 0; + + bool CaptureFrame(bool show_preview); public: EspVideo(const esp_video_init_config_t& config); @@ -46,6 +52,8 @@ public: virtual void SetExplainUrl(const std::string& url, const std::string& token); virtual bool Capture(); + virtual bool CaptureBackground() override; + virtual bool CaptureToJpeg(std::string& jpeg_data, bool show_preview = false) override; // 翻转控制函数 virtual bool SetHMirror(bool enabled) override; virtual bool SetVFlip(bool enabled) override; diff --git a/main/boards/m5stack-core-s3/m5stack_core_s3.cc b/main/boards/m5stack-core-s3/m5stack_core_s3.cc index 7ca34b4..0513491 100644 --- a/main/boards/m5stack-core-s3/m5stack_core_s3.cc +++ b/main/boards/m5stack-core-s3/m5stack_core_s3.cc @@ -1,21 +1,23 @@ -#include "wifi_board.h" +#include "application.h" +#include "axp2101.h" +#include "config.h" #include "cores3_audio_codec.h" #include "display/lcd_display.h" -#include "application.h" -#include "config.h" -#include "power_save_timer.h" #include "i2c_device.h" -#include "axp2101.h" +#include "power_save_timer.h" +#include "wifi_board.h" -#include #include +#include #include #include -#include +#include #include #include "esp_video.h" #define TAG "M5StackCoreS3Board" +#define BACKGROUND_VISION_INITIAL_DELAY_MS 8000 +#define BACKGROUND_VISION_SAMPLE_INTERVAL_MS 100 class Pmic : public Axp2101 { public: @@ -41,7 +43,7 @@ public: class CustomBacklight : public Backlight { public: - CustomBacklight(Pmic *pmic) : pmic_(pmic) {} + CustomBacklight(Pmic* pmic) : pmic_(pmic) {} void SetBrightnessImpl(uint8_t brightness) override { pmic_->SetBrightness(target_brightness_); @@ -49,7 +51,7 @@ public: } private: - Pmic *pmic_; + Pmic* pmic_; }; class Aw9523 : public I2cDevice { @@ -89,16 +91,14 @@ public: int x = -1; int y = -1; }; - + Ft6336(i2c_master_bus_handle_t i2c_bus, uint8_t addr) : I2cDevice(i2c_bus, addr) { uint8_t chip_id = ReadReg(0xA3); ESP_LOGI(TAG, "Get chip ID: 0x%02X", chip_id); read_buffer_ = new uint8_t[6]; } - ~Ft6336() { - delete[] read_buffer_; - } + ~Ft6336() { delete[] read_buffer_; } void UpdateTouchPoint() { ReadRegs(0x02, read_buffer_, 6); @@ -107,9 +107,7 @@ public: tp_.y = ((read_buffer_[3] & 0x0F) << 8) | read_buffer_[4]; } - inline const TouchPoint_t& GetTouchPoint() { - return tp_; - } + inline const TouchPoint_t& GetTouchPoint() { return tp_; } private: uint8_t* read_buffer_ = nullptr; @@ -137,9 +135,7 @@ private: GetDisplay()->SetPowerSaveMode(false); GetBacklight()->RestoreBrightness(); }); - power_save_timer_->OnShutdownRequest([this]() { - pmic_->PowerOff(); - }); + power_save_timer_->OnShutdownRequest([this]() { pmic_->PowerOff(); }); power_save_timer_->SetEnabled(true); } @@ -153,9 +149,10 @@ private: .glitch_ignore_cnt = 7, .intr_priority = 0, .trans_queue_depth = 0, - .flags = { - .enable_internal_pullup = 1, - }, + .flags = + { + .enable_internal_pullup = 1, + }, }; ESP_ERROR_CHECK(i2c_new_master_bus(&i2c_bus_cfg, &i2c_bus_)); } @@ -196,28 +193,32 @@ private: static bool was_touched = false; static int64_t touch_start_time = 0; const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值,超过500ms视为长按 - + ft6336_->UpdateTouchPoint(); auto& touch_point = ft6336_->GetTouchPoint(); - + // 检测触摸开始 if (touch_point.num > 0 && !was_touched) { was_touched = true; - touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒 - } + touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒 + } // 检测触摸释放 else if (touch_point.num == 0 && was_touched) { was_touched = false; int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time; - - // 只有短触才触发 + if (touch_duration < TOUCH_THRESHOLD_MS) { auto& app = Application::GetInstance(); if (app.GetDeviceState() == kDeviceStateStarting) { EnterWifiConfigMode(); return; } + ESP_LOGI(TAG, "Touch short: text-only mode"); app.ToggleChatState(); + } else { + auto& app = Application::GetInstance(); + ESP_LOGI(TAG, "Touch long: vision+text mode"); + app.ToggleChatStateWithVision(); } } } @@ -225,19 +226,20 @@ private: void InitializeFt6336TouchPad() { ESP_LOGI(TAG, "Init FT6336"); ft6336_ = new Ft6336(i2c_bus_, 0x38); - + // 创建定时器,20ms 间隔 esp_timer_create_args_t timer_args = { - .callback = [](void* arg) { - M5StackCoreS3Board* board = (M5StackCoreS3Board*)arg; - board->PollTouchpad(); - }, + .callback = + [](void* arg) { + M5StackCoreS3Board* board = (M5StackCoreS3Board*)arg; + board->PollTouchpad(); + }, .arg = this, .dispatch_method = ESP_TIMER_TASK, .name = "touchpad_timer", .skip_unhandled_events = true, }; - + ESP_ERROR_CHECK(esp_timer_create(&timer_args, &touchpad_timer_)); ESP_ERROR_CHECK(esp_timer_start_periodic(touchpad_timer_, 20 * 1000)); } @@ -276,7 +278,7 @@ private: panel_config.rgb_ele_order = LCD_RGB_ELEMENT_ORDER_BGR; panel_config.bits_per_pixel = 16; ESP_ERROR_CHECK(esp_lcd_new_panel_ili9341(panel_io, &panel_config, &panel)); - + esp_lcd_panel_reset(panel); aw9523_->ResetIli9342(); @@ -285,23 +287,25 @@ private: esp_lcd_panel_swap_xy(panel, DISPLAY_SWAP_XY); esp_lcd_panel_mirror(panel, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y); - display_ = new SpiLcdDisplay(panel_io, panel, - DISPLAY_WIDTH, DISPLAY_HEIGHT, DISPLAY_OFFSET_X, DISPLAY_OFFSET_Y, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y, DISPLAY_SWAP_XY); + display_ = new SpiLcdDisplay(panel_io, panel, DISPLAY_WIDTH, DISPLAY_HEIGHT, + DISPLAY_OFFSET_X, DISPLAY_OFFSET_Y, DISPLAY_MIRROR_X, + DISPLAY_MIRROR_Y, DISPLAY_SWAP_XY); } - void InitializeCamera() { + void InitializeCamera() { static esp_cam_ctlr_dvp_pin_config_t dvp_pin_config = { .data_width = CAM_CTLR_DATA_WIDTH_8, - .data_io = { - [0] = CAMERA_PIN_D0, - [1] = CAMERA_PIN_D1, - [2] = CAMERA_PIN_D2, - [3] = CAMERA_PIN_D3, - [4] = CAMERA_PIN_D4, - [5] = CAMERA_PIN_D5, - [6] = CAMERA_PIN_D6, - [7] = CAMERA_PIN_D7, - }, + .data_io = + { + [0] = CAMERA_PIN_D0, + [1] = CAMERA_PIN_D1, + [2] = CAMERA_PIN_D2, + [3] = CAMERA_PIN_D3, + [4] = CAMERA_PIN_D4, + [5] = CAMERA_PIN_D5, + [6] = CAMERA_PIN_D6, + [7] = CAMERA_PIN_D7, + }, .vsync_io = CAMERA_PIN_VSYNC, .de_io = CAMERA_PIN_HREF, .pclk_io = CAMERA_PIN_PCLK, @@ -330,6 +334,37 @@ private: camera_->SetHMirror(false); } + void InitializeBackgroundVisionSampler() { + xTaskCreate( + [](void* arg) { + auto board = static_cast(arg); + bool has_logged_success = false; + bool has_logged_failure = false; + + vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_INITIAL_DELAY_MS)); + + while (true) { + if (board->camera_ == nullptr) { + vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS)); + continue; + } + + if (board->camera_->CaptureBackground()) { + if (!has_logged_success) { + ESP_LOGI(TAG, "Background vision sampler started"); + has_logged_success = true; + } + } else if (!has_logged_failure) { + ESP_LOGW(TAG, "Background vision sampler is waiting for camera"); + has_logged_failure = true; + } + + vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS)); + } + }, + "BgVisionSampler", 4096, this, 1, nullptr); + } + public: M5StackCoreS3Board() { InitializePowerSaveTimer(); @@ -340,34 +375,24 @@ public: InitializeSpi(); InitializeIli9342Display(); InitializeCamera(); + InitializeBackgroundVisionSampler(); InitializeFt6336TouchPad(); GetBacklight()->RestoreBrightness(); } virtual AudioCodec* GetAudioCodec() override { - static CoreS3AudioCodec audio_codec(i2c_bus_, - AUDIO_INPUT_SAMPLE_RATE, - AUDIO_OUTPUT_SAMPLE_RATE, - AUDIO_I2S_GPIO_MCLK, - AUDIO_I2S_GPIO_BCLK, - AUDIO_I2S_GPIO_WS, - AUDIO_I2S_GPIO_DOUT, - AUDIO_I2S_GPIO_DIN, - AUDIO_CODEC_AW88298_ADDR, - AUDIO_CODEC_ES7210_ADDR, - AUDIO_INPUT_REFERENCE); + static CoreS3AudioCodec audio_codec( + i2c_bus_, AUDIO_INPUT_SAMPLE_RATE, AUDIO_OUTPUT_SAMPLE_RATE, AUDIO_I2S_GPIO_MCLK, + AUDIO_I2S_GPIO_BCLK, AUDIO_I2S_GPIO_WS, AUDIO_I2S_GPIO_DOUT, AUDIO_I2S_GPIO_DIN, + AUDIO_CODEC_AW88298_ADDR, AUDIO_CODEC_ES7210_ADDR, AUDIO_INPUT_REFERENCE); return &audio_codec; } - virtual Display* GetDisplay() override { - return display_; - } + virtual Display* GetDisplay() override { return display_; } - virtual Camera* GetCamera() override { - return camera_; - } + virtual Camera* GetCamera() override { return camera_; } - virtual bool GetBatteryLevel(int &level, bool& charging, bool& discharging) override { + virtual bool GetBatteryLevel(int& level, bool& charging, bool& discharging) override { static bool last_discharging = false; charging = pmic_->IsCharging(); discharging = pmic_->IsDischarging(); @@ -387,7 +412,7 @@ public: WifiBoard::SetPowerSaveLevel(level); } - virtual Backlight *GetBacklight() override { + virtual Backlight* GetBacklight() override { static CustomBacklight backlight(pmic_); return &backlight; } diff --git a/main/boards/waveshare/esp32-s3-touch-lcd-4.3c/sdkconfig.4_3c b/main/boards/waveshare/esp32-s3-touch-lcd-4.3c/sdkconfig.4_3c index 2129bc5..cba0dd4 100755 --- a/main/boards/waveshare/esp32-s3-touch-lcd-4.3c/sdkconfig.4_3c +++ b/main/boards/waveshare/esp32-s3-touch-lcd-4.3c/sdkconfig.4_3c @@ -599,7 +599,7 @@ CONFIG_PARTITION_TABLE_MD5=y # CONFIG_OTA_URL="https://api.tenclass.net/xiaozhi/ota/" CONFIG_USE_DIRECT_WEBSOCKET=y -CONFIG_WEBSOCKET_URL="ws://10.6.80.130:8080" +CONFIG_WEBSOCKET_URL="ws://172.19.0.240:8080" CONFIG_WEBSOCKET_TOKEN="" CONFIG_WEBSOCKET_PROTOCOL_VERSION=1 # CONFIG_FLASH_NONE_ASSETS is not set diff --git a/main/bridge_server.py b/main/bridge_server.py index 03beccc..dea6cbf 100644 --- a/main/bridge_server.py +++ b/main/bridge_server.py @@ -1,4 +1,5 @@ import asyncio +import base64 import json import os import shutil @@ -8,6 +9,7 @@ import time import traceback import uuid from dataclasses import dataclass, field +from pathlib import Path from typing import Any, Optional import httpx @@ -31,6 +33,8 @@ CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20 AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0")) WS_PORT = 8080 AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower() +PROJECT_ROOT = Path(__file__).resolve().parent.parent +VISION_FRAME_SAVE_DIR = Path(os.getenv("VISION_FRAME_SAVE_DIR", str(PROJECT_ROOT / "vision_frames"))) INPUT_SAMPLE_RATE = 16000 OUTPUT_SAMPLE_RATE = 24000 @@ -45,6 +49,7 @@ TTS_PRE_ROLL_MS = 80 TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1 TTS_INTERRUPT_SILENCE_FRAMES = 3 INTERRUPT_TOPIC = "lk.interrupt" +VISION_FRAME_TOPIC = "vision.frame" TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;;" TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18")) TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18")) @@ -336,6 +341,73 @@ class ESP32LiveKitBridge: if not ok: print("警告: bridge 已停止 TTS,但 agent 侧 interrupt 未确认送出") + def _save_vision_frame(self, session: DeviceSession, image: str) -> Optional[Path]: + try: + image_bytes = base64.b64decode(image, validate=True) + except Exception as exc: + print(f"vision frame base64 解码失败: {exc}") + return None + + safe_device_id = "".join( + char if char.isalnum() or char in ("-", "_") else "_" + for char in session.device_id + ) + timestamp_ms = int(time.time() * 1000) + VISION_FRAME_SAVE_DIR.mkdir(parents=True, exist_ok=True) + path = VISION_FRAME_SAVE_DIR / f"{timestamp_ms}_{safe_device_id}.jpg" + path.write_bytes(image_bytes) + return path + + async def _publish_vision_frame(self, session: DeviceSession, message: dict[str, Any]) -> None: + image = message.get("image") + if not isinstance(image, str) or not image: + print("收到 vision frame,但 image 字段为空") + return + + saved_path = self._save_vision_frame(session, image) + if saved_path is None: + return + print(f"已保存 vision frame: {saved_path}") + + participant = getattr(session.room, "local_participant", None) + if participant is None: + print("跳过发送 vision frame,local participant 尚未就绪") + return + + payload = { + "type": "vision_frame", + "topic": VISION_FRAME_TOPIC, + "room": session.room_name, + "identity": session.identity, + "device_id": session.device_id, + "mime_type": message.get("mime_type", "image/jpeg"), + "image": image, + "saved_path": str(saved_path), + } + data = json.dumps(payload).encode("utf-8") + agent_identities = self._get_agent_identities(session) + kwargs: dict[str, Any] = {} + if agent_identities: + kwargs["destination_identities"] = agent_identities + + last_error: Optional[Exception] = None + for attempt in ({"topic": VISION_FRAME_TOPIC, **kwargs}, kwargs): + try: + await participant.publish_data(data, **attempt) + print( + f"已发送 vision frame: bytes={len(data)} " + f"targets={agent_identities or 'broadcast'}" + ) + return + except TypeError as exc: + last_error = exc + except Exception as exc: + print(f"发送 vision frame 失败: {exc}") + return + + if last_error is not None: + print(f"发送 vision frame 失败,publish_data 签名不兼容: {last_error}") + async def _send_tts_state(self, session: DeviceSession, state: str) -> None: if session.websocket is None: print(f"跳过 tts {state},ESP32 尚未连接") @@ -939,14 +1011,12 @@ class ESP32LiveKitBridge: ) self.device_sessions[device_id] = session - # print(f"ESP32 已连接: device={device_id}") - # print(f"ESP32 协议版本: {session.protocol_version}") + print(f"ESP32 已连接: device={device_id}") + print(f"ESP32 协议版本: {session.protocol_version}") session.tts_stream_id += 1 opus_decoder = None try: - await self._connect_session_room(session) - hello_msg = { "type": "hello", "transport": "websocket", @@ -962,6 +1032,9 @@ class ESP32LiveKitBridge: }, } await websocket.send(json.dumps(hello_msg)) + print(f"已发送 server hello: device={device_id} room={session.room_name}") + + await self._connect_session_room(session) async for message in websocket: if isinstance(message, bytes): @@ -1017,6 +1090,8 @@ class ESP32LiveKitBridge: abort_reason = reason if isinstance(reason, str) and reason else "button_abort" print(f"处理 ESP32 打断请求: reason={abort_reason}") await self._abort_tts(session, abort_reason) + elif msg_type == "vision" and data.get("state") == "frame": + await self._publish_vision_frame(session, data) except json.JSONDecodeError: print(f"收到未知的字符消息: {message}") except ConnectionClosedError as exc: diff --git a/main/protocols/protocol.cc b/main/protocols/protocol.cc index 470cc91..c31d6b4 100644 --- a/main/protocols/protocol.cc +++ b/main/protocols/protocol.cc @@ -1,9 +1,22 @@ #include "protocol.h" #include +#include #define TAG "Protocol" +static std::string Base64Encode(const std::string& data) { + size_t encoded_length = 0; + size_t output_length = 0; + mbedtls_base64_encode(nullptr, 0, &encoded_length, + reinterpret_cast(data.data()), data.size()); + std::string result(encoded_length, 0); + mbedtls_base64_encode(reinterpret_cast(result.data()), result.size(), &output_length, + reinterpret_cast(data.data()), data.size()); + result.resize(output_length); + return result; +} + void Protocol::OnIncomingJson(std::function callback) { on_incoming_json_ = callback; } @@ -78,6 +91,27 @@ void Protocol::SendMcpMessage(const std::string& payload) { SendText(message); } +void Protocol::SendVisionFrame(const std::string& jpeg_data) { + if (jpeg_data.empty()) { + return; + } + + cJSON* root = cJSON_CreateObject(); + cJSON_AddStringToObject(root, "session_id", session_id_.c_str()); + cJSON_AddStringToObject(root, "type", "vision"); + cJSON_AddStringToObject(root, "state", "frame"); + cJSON_AddStringToObject(root, "mime_type", "image/jpeg"); + auto encoded = Base64Encode(jpeg_data); + cJSON_AddStringToObject(root, "image", encoded.c_str()); + + char* json_str = cJSON_PrintUnformatted(root); + if (json_str != nullptr) { + SendText(json_str); + cJSON_free(json_str); + } + cJSON_Delete(root); +} + bool Protocol::IsTimeout() const { const int kTimeoutSeconds = 120; auto now = std::chrono::steady_clock::now(); diff --git a/main/protocols/protocol.h b/main/protocols/protocol.h index 28ef604..174d97c 100644 --- a/main/protocols/protocol.h +++ b/main/protocols/protocol.h @@ -73,6 +73,7 @@ public: virtual void SendStopListening(); virtual void SendAbortSpeaking(AbortReason reason); virtual void SendMcpMessage(const std::string& message); + virtual void SendVisionFrame(const std::string& jpeg_data); protected: std::function on_incoming_json_; @@ -95,4 +96,3 @@ protected: }; #endif // PROTOCOL_H -