1 Commits
ws ... cam

Author SHA1 Message Date
fc6302661d feat: support camera capture to livekit 2026-05-25 17:21:11 +08:00
12 changed files with 314 additions and 78 deletions

1
.gitignore vendored
View File

@ -10,6 +10,7 @@ sdkconfig
dependencies.lock dependencies.lock
.env .env
releases/ releases/
vision_frames/
main/assets/lang_config.h main/assets/lang_config.h
main/mmap_generate_emoji.h main/mmap_generate_emoji.h
.DS_Store .DS_Store

View File

@ -15,7 +15,7 @@ config USE_DIRECT_WEBSOCKET
config WEBSOCKET_URL config WEBSOCKET_URL
string "Default WebSocket URL" string "Default WebSocket URL"
depends on USE_DIRECT_WEBSOCKET depends on USE_DIRECT_WEBSOCKET
default "ws://10.6.80.130:8080" default "ws://172.19.0.240:8080"
help help
The WebSocket server URL used when direct WebSocket mode is enabled. The WebSocket server URL used when direct WebSocket mode is enabled.

View File

@ -673,10 +673,17 @@ void Application::DismissAlert() {
} }
void Application::ToggleChatState() { void Application::ToggleChatState() {
vision_text_mode_enabled_.store(false);
xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
}
void Application::ToggleChatStateWithVision() {
vision_text_mode_enabled_.store(true);
xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT); xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
} }
void Application::StartListening() { void Application::StartListening() {
vision_text_mode_enabled_.store(false);
xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING); xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING);
} }
@ -686,7 +693,10 @@ void Application::StopListening() {
void Application::HandleToggleChatEvent() { void Application::HandleToggleChatEvent() {
auto state = GetDeviceState(); auto state = GetDeviceState();
if (state != kDeviceStateIdle) {
vision_text_mode_enabled_.store(false);
}
if (state == kDeviceStateActivating) { if (state == kDeviceStateActivating) {
SetDeviceState(kDeviceStateIdle); SetDeviceState(kDeviceStateIdle);
return; return;
@ -905,6 +915,9 @@ void Application::HandleStateChangedEvent() {
audio_service_.WaitForPlaybackQueueEmpty(); audio_service_.WaitForPlaybackQueueEmpty();
} }
if (vision_text_mode_enabled_.load()) {
SendCurrentVisionFrame();
}
// Send the start listening command // Send the start listening command
protocol_->SendStartListening(listening_mode_); protocol_->SendStartListening(listening_mode_);
audio_service_.EnableVoiceProcessing(true); audio_service_.EnableVoiceProcessing(true);
@ -944,6 +957,26 @@ void Application::HandleStateChangedEvent() {
} }
} }
void Application::SendCurrentVisionFrame() {
if (!protocol_ || !protocol_->IsAudioChannelOpened()) {
return;
}
auto camera = Board::GetInstance().GetCamera();
if (camera == nullptr) {
return;
}
std::string jpeg_data;
if (!camera->CaptureToJpeg(jpeg_data, false)) {
ESP_LOGW(TAG, "Failed to capture vision frame");
return;
}
protocol_->SendVisionFrame(jpeg_data);
ESP_LOGI(TAG, "Sent vision frame, size=%u bytes", static_cast<unsigned>(jpeg_data.size()));
}
void Application::Schedule(std::function<void()>&& callback) { void Application::Schedule(std::function<void()>&& callback) {
{ {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);

View File

@ -11,6 +11,7 @@
#include <deque> #include <deque>
#include <memory> #include <memory>
#include <functional> #include <functional>
#include <atomic>
#include "protocol.h" #include "protocol.h"
#include "ota.h" #include "ota.h"
@ -91,6 +92,7 @@ public:
* Sends MAIN_EVENT_TOGGLE_CHAT to be handled in Run() * Sends MAIN_EVENT_TOGGLE_CHAT to be handled in Run()
*/ */
void ToggleChatState(); void ToggleChatState();
void ToggleChatStateWithVision();
/** /**
* Start listening (event-based, thread-safe) * Start listening (event-based, thread-safe)
@ -144,6 +146,7 @@ private:
bool aborted_ = false; bool aborted_ = false;
bool assets_version_checked_ = false; bool assets_version_checked_ = false;
bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening
std::atomic<bool> vision_text_mode_enabled_ = false;
int clock_ticks_ = 0; int clock_ticks_ = 0;
TaskHandle_t activation_task_handle_ = nullptr; TaskHandle_t activation_task_handle_ = nullptr;
@ -159,6 +162,7 @@ private:
void HandleWakeWordDetectedEvent(); void HandleWakeWordDetectedEvent();
void ContinueOpenAudioChannel(ListeningMode mode); void ContinueOpenAudioChannel(ListeningMode mode);
void ContinueWakeWordInvoke(const std::string& wake_word); void ContinueWakeWordInvoke(const std::string& wake_word);
void SendCurrentVisionFrame();
// Activation task (runs in background) // Activation task (runs in background)
void ActivationTask(); void ActivationTask();

View File

@ -7,6 +7,8 @@ class Camera {
public: public:
virtual void SetExplainUrl(const std::string& url, const std::string& token) = 0; virtual void SetExplainUrl(const std::string& url, const std::string& token) = 0;
virtual bool Capture() = 0; virtual bool Capture() = 0;
virtual bool CaptureBackground() { return Capture(); }
virtual bool CaptureToJpeg(std::string& jpeg_data, bool show_preview = false) { return false; }
virtual bool SetHMirror(bool enabled) = 0; virtual bool SetHMirror(bool enabled) = 0;
virtual bool SetVFlip(bool enabled) = 0; virtual bool SetVFlip(bool enabled) = 0;
virtual bool SetSwapBytes(bool enabled) { return false; } // Optional, default no-op virtual bool SetSwapBytes(bool enabled) { return false; } // Optional, default no-op

View File

@ -24,6 +24,7 @@
#include "lvgl_display.h" #include "lvgl_display.h"
#include "mcp_server.h" #include "mcp_server.h"
#include "system_info.h" #include "system_info.h"
#include "esp_timer.h"
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE #ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
#undef LOG_LOCAL_LEVEL #undef LOG_LOCAL_LEVEL
@ -55,6 +56,7 @@
#define TAG "EspVideo" #define TAG "EspVideo"
#define FOREGROUND_CAPTURE_PROTECTION_US (10 * 1000 * 1000)
#if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP) #if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP)
#warning \ #warning \
@ -381,11 +383,47 @@ EspVideo::~EspVideo() {
} }
void EspVideo::SetExplainUrl(const std::string& url, const std::string& token) { void EspVideo::SetExplainUrl(const std::string& url, const std::string& token) {
std::lock_guard<std::mutex> lock(frame_mutex_);
explain_url_ = url; explain_url_ = url;
explain_token_ = token; explain_token_ = token;
} }
bool EspVideo::Capture() { bool EspVideo::Capture() {
return CaptureFrame(true);
}
bool EspVideo::CaptureBackground() {
return CaptureFrame(false);
}
bool EspVideo::CaptureToJpeg(std::string& jpeg_data, bool show_preview) {
jpeg_data.clear();
if (!CaptureFrame(show_preview)) {
return false;
}
std::lock_guard<std::mutex> lock(frame_mutex_);
if (frame_.data == nullptr || frame_.len == 0) {
return false;
}
uint16_t w = frame_.width ? frame_.width : 320;
uint16_t h = frame_.height ? frame_.height : 240;
return image_to_jpeg_cb(
frame_.data, frame_.len, w, h, frame_.format, 60,
[](void* arg, size_t index, const void* data, size_t len) -> size_t {
auto jpeg_data = static_cast<std::string*>(arg);
if (data != nullptr && len > 0) {
jpeg_data->append(static_cast<const char*>(data), len);
}
return len;
},
&jpeg_data);
}
bool EspVideo::CaptureFrame(bool show_preview) {
std::lock_guard<std::mutex> lock(frame_mutex_);
if (encoder_thread_.joinable()) { if (encoder_thread_.joinable()) {
encoder_thread_.join(); encoder_thread_.join();
} }
@ -394,6 +432,10 @@ bool EspVideo::Capture() {
return false; return false;
} }
if (!show_preview && esp_timer_get_time() < foreground_capture_protected_until_us_) {
return true;
}
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
struct v4l2_buffer buf = {}; struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
@ -729,9 +771,14 @@ bool EspVideo::Capture() {
} }
} }
// 显示预览图片 if (show_preview) {
auto display = dynamic_cast<LvglDisplay*>(Board::GetInstance().GetDisplay()); foreground_capture_protected_until_us_ = esp_timer_get_time() + FOREGROUND_CAPTURE_PROTECTION_US;
if (display != nullptr) { }
if (show_preview) {
// 显示预览图片
auto display = dynamic_cast<LvglDisplay*>(Board::GetInstance().GetDisplay());
if (display != nullptr) {
if (!frame_.data) { if (!frame_.data) {
ESP_LOGE(TAG, "frame.data is null"); ESP_LOGE(TAG, "frame.data is null");
return false; return false;
@ -836,6 +883,7 @@ bool EspVideo::Capture() {
auto image = std::make_unique<LvglAllocatedImage>(data, lvgl_image_size, w, h, stride, color_format); auto image = std::make_unique<LvglAllocatedImage>(data, lvgl_image_size, w, h, stride, color_format);
display->SetPreviewImage(std::move(image)); display->SetPreviewImage(std::move(image));
}
} }
return true; return true;
} }
@ -898,10 +946,16 @@ bool EspVideo::SetVFlip(bool enabled) {
* @warning 如果摄像头缓冲区为空或网络连接失败,将返回错误信息 * @warning 如果摄像头缓冲区为空或网络连接失败,将返回错误信息
*/ */
std::string EspVideo::Explain(const std::string& question) { std::string EspVideo::Explain(const std::string& question) {
std::lock_guard<std::mutex> lock(frame_mutex_);
if (explain_url_.empty()) { if (explain_url_.empty()) {
throw std::runtime_error("Image explain URL or token is not set"); throw std::runtime_error("Image explain URL or token is not set");
} }
if (frame_.data == nullptr || frame_.len == 0) {
throw std::runtime_error("No camera frame captured");
}
// 创建局部的 JPEG 队列, 40 entries is about to store 512 * 40 = 20480 bytes of JPEG data // 创建局部的 JPEG 队列, 40 entries is about to store 512 * 40 = 20480 bytes of JPEG data
QueueHandle_t jpeg_queue = xQueueCreate(40, sizeof(JpegChunk)); QueueHandle_t jpeg_queue = xQueueCreate(40, sizeof(JpegChunk));
if (jpeg_queue == nullptr) { if (jpeg_queue == nullptr) {

View File

@ -5,6 +5,8 @@
#include <thread> #include <thread>
#include <memory> #include <memory>
#include <vector> #include <vector>
#include <mutex>
#include <cstdint>
#include <freertos/FreeRTOS.h> #include <freertos/FreeRTOS.h>
#include <freertos/queue.h> #include <freertos/queue.h>
@ -39,6 +41,10 @@ private:
std::string explain_url_; std::string explain_url_;
std::string explain_token_; std::string explain_token_;
std::thread encoder_thread_; std::thread encoder_thread_;
std::mutex frame_mutex_;
int64_t foreground_capture_protected_until_us_ = 0;
bool CaptureFrame(bool show_preview);
public: public:
EspVideo(const esp_video_init_config_t& config); EspVideo(const esp_video_init_config_t& config);
@ -46,6 +52,8 @@ public:
virtual void SetExplainUrl(const std::string& url, const std::string& token); virtual void SetExplainUrl(const std::string& url, const std::string& token);
virtual bool Capture(); virtual bool Capture();
virtual bool CaptureBackground() override;
virtual bool CaptureToJpeg(std::string& jpeg_data, bool show_preview = false) override;
// 翻转控制函数 // 翻转控制函数
virtual bool SetHMirror(bool enabled) override; virtual bool SetHMirror(bool enabled) override;
virtual bool SetVFlip(bool enabled) override; virtual bool SetVFlip(bool enabled) override;

View File

@ -1,21 +1,23 @@
#include "wifi_board.h" #include "application.h"
#include "axp2101.h"
#include "config.h"
#include "cores3_audio_codec.h" #include "cores3_audio_codec.h"
#include "display/lcd_display.h" #include "display/lcd_display.h"
#include "application.h"
#include "config.h"
#include "power_save_timer.h"
#include "i2c_device.h" #include "i2c_device.h"
#include "axp2101.h" #include "power_save_timer.h"
#include "wifi_board.h"
#include <esp_log.h>
#include <driver/i2c_master.h> #include <driver/i2c_master.h>
#include <esp_lcd_ili9341.h>
#include <esp_lcd_panel_io.h> #include <esp_lcd_panel_io.h>
#include <esp_lcd_panel_ops.h> #include <esp_lcd_panel_ops.h>
#include <esp_lcd_ili9341.h> #include <esp_log.h>
#include <esp_timer.h> #include <esp_timer.h>
#include "esp_video.h" #include "esp_video.h"
#define TAG "M5StackCoreS3Board" #define TAG "M5StackCoreS3Board"
#define BACKGROUND_VISION_INITIAL_DELAY_MS 8000
#define BACKGROUND_VISION_SAMPLE_INTERVAL_MS 100
class Pmic : public Axp2101 { class Pmic : public Axp2101 {
public: public:
@ -41,7 +43,7 @@ public:
class CustomBacklight : public Backlight { class CustomBacklight : public Backlight {
public: public:
CustomBacklight(Pmic *pmic) : pmic_(pmic) {} CustomBacklight(Pmic* pmic) : pmic_(pmic) {}
void SetBrightnessImpl(uint8_t brightness) override { void SetBrightnessImpl(uint8_t brightness) override {
pmic_->SetBrightness(target_brightness_); pmic_->SetBrightness(target_brightness_);
@ -49,7 +51,7 @@ public:
} }
private: private:
Pmic *pmic_; Pmic* pmic_;
}; };
class Aw9523 : public I2cDevice { class Aw9523 : public I2cDevice {
@ -89,16 +91,14 @@ public:
int x = -1; int x = -1;
int y = -1; int y = -1;
}; };
Ft6336(i2c_master_bus_handle_t i2c_bus, uint8_t addr) : I2cDevice(i2c_bus, addr) { Ft6336(i2c_master_bus_handle_t i2c_bus, uint8_t addr) : I2cDevice(i2c_bus, addr) {
uint8_t chip_id = ReadReg(0xA3); uint8_t chip_id = ReadReg(0xA3);
ESP_LOGI(TAG, "Get chip ID: 0x%02X", chip_id); ESP_LOGI(TAG, "Get chip ID: 0x%02X", chip_id);
read_buffer_ = new uint8_t[6]; read_buffer_ = new uint8_t[6];
} }
~Ft6336() { ~Ft6336() { delete[] read_buffer_; }
delete[] read_buffer_;
}
void UpdateTouchPoint() { void UpdateTouchPoint() {
ReadRegs(0x02, read_buffer_, 6); ReadRegs(0x02, read_buffer_, 6);
@ -107,9 +107,7 @@ public:
tp_.y = ((read_buffer_[3] & 0x0F) << 8) | read_buffer_[4]; tp_.y = ((read_buffer_[3] & 0x0F) << 8) | read_buffer_[4];
} }
inline const TouchPoint_t& GetTouchPoint() { inline const TouchPoint_t& GetTouchPoint() { return tp_; }
return tp_;
}
private: private:
uint8_t* read_buffer_ = nullptr; uint8_t* read_buffer_ = nullptr;
@ -137,9 +135,7 @@ private:
GetDisplay()->SetPowerSaveMode(false); GetDisplay()->SetPowerSaveMode(false);
GetBacklight()->RestoreBrightness(); GetBacklight()->RestoreBrightness();
}); });
power_save_timer_->OnShutdownRequest([this]() { power_save_timer_->OnShutdownRequest([this]() { pmic_->PowerOff(); });
pmic_->PowerOff();
});
power_save_timer_->SetEnabled(true); power_save_timer_->SetEnabled(true);
} }
@ -153,9 +149,10 @@ private:
.glitch_ignore_cnt = 7, .glitch_ignore_cnt = 7,
.intr_priority = 0, .intr_priority = 0,
.trans_queue_depth = 0, .trans_queue_depth = 0,
.flags = { .flags =
.enable_internal_pullup = 1, {
}, .enable_internal_pullup = 1,
},
}; };
ESP_ERROR_CHECK(i2c_new_master_bus(&i2c_bus_cfg, &i2c_bus_)); ESP_ERROR_CHECK(i2c_new_master_bus(&i2c_bus_cfg, &i2c_bus_));
} }
@ -196,28 +193,32 @@ private:
static bool was_touched = false; static bool was_touched = false;
static int64_t touch_start_time = 0; static int64_t touch_start_time = 0;
const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值超过500ms视为长按 const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值超过500ms视为长按
ft6336_->UpdateTouchPoint(); ft6336_->UpdateTouchPoint();
auto& touch_point = ft6336_->GetTouchPoint(); auto& touch_point = ft6336_->GetTouchPoint();
// 检测触摸开始 // 检测触摸开始
if (touch_point.num > 0 && !was_touched) { if (touch_point.num > 0 && !was_touched) {
was_touched = true; was_touched = true;
touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒 touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒
} }
// 检测触摸释放 // 检测触摸释放
else if (touch_point.num == 0 && was_touched) { else if (touch_point.num == 0 && was_touched) {
was_touched = false; was_touched = false;
int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time; int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time;
// 只有短触才触发
if (touch_duration < TOUCH_THRESHOLD_MS) { if (touch_duration < TOUCH_THRESHOLD_MS) {
auto& app = Application::GetInstance(); auto& app = Application::GetInstance();
if (app.GetDeviceState() == kDeviceStateStarting) { if (app.GetDeviceState() == kDeviceStateStarting) {
EnterWifiConfigMode(); EnterWifiConfigMode();
return; return;
} }
ESP_LOGI(TAG, "Touch short: text-only mode");
app.ToggleChatState(); app.ToggleChatState();
} else {
auto& app = Application::GetInstance();
ESP_LOGI(TAG, "Touch long: vision+text mode");
app.ToggleChatStateWithVision();
} }
} }
} }
@ -225,19 +226,20 @@ private:
void InitializeFt6336TouchPad() { void InitializeFt6336TouchPad() {
ESP_LOGI(TAG, "Init FT6336"); ESP_LOGI(TAG, "Init FT6336");
ft6336_ = new Ft6336(i2c_bus_, 0x38); ft6336_ = new Ft6336(i2c_bus_, 0x38);
// 创建定时器20ms 间隔 // 创建定时器20ms 间隔
esp_timer_create_args_t timer_args = { esp_timer_create_args_t timer_args = {
.callback = [](void* arg) { .callback =
M5StackCoreS3Board* board = (M5StackCoreS3Board*)arg; [](void* arg) {
board->PollTouchpad(); M5StackCoreS3Board* board = (M5StackCoreS3Board*)arg;
}, board->PollTouchpad();
},
.arg = this, .arg = this,
.dispatch_method = ESP_TIMER_TASK, .dispatch_method = ESP_TIMER_TASK,
.name = "touchpad_timer", .name = "touchpad_timer",
.skip_unhandled_events = true, .skip_unhandled_events = true,
}; };
ESP_ERROR_CHECK(esp_timer_create(&timer_args, &touchpad_timer_)); ESP_ERROR_CHECK(esp_timer_create(&timer_args, &touchpad_timer_));
ESP_ERROR_CHECK(esp_timer_start_periodic(touchpad_timer_, 20 * 1000)); ESP_ERROR_CHECK(esp_timer_start_periodic(touchpad_timer_, 20 * 1000));
} }
@ -276,7 +278,7 @@ private:
panel_config.rgb_ele_order = LCD_RGB_ELEMENT_ORDER_BGR; panel_config.rgb_ele_order = LCD_RGB_ELEMENT_ORDER_BGR;
panel_config.bits_per_pixel = 16; panel_config.bits_per_pixel = 16;
ESP_ERROR_CHECK(esp_lcd_new_panel_ili9341(panel_io, &panel_config, &panel)); ESP_ERROR_CHECK(esp_lcd_new_panel_ili9341(panel_io, &panel_config, &panel));
esp_lcd_panel_reset(panel); esp_lcd_panel_reset(panel);
aw9523_->ResetIli9342(); aw9523_->ResetIli9342();
@ -285,23 +287,25 @@ private:
esp_lcd_panel_swap_xy(panel, DISPLAY_SWAP_XY); esp_lcd_panel_swap_xy(panel, DISPLAY_SWAP_XY);
esp_lcd_panel_mirror(panel, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y); esp_lcd_panel_mirror(panel, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y);
display_ = new SpiLcdDisplay(panel_io, panel, display_ = new SpiLcdDisplay(panel_io, panel, DISPLAY_WIDTH, DISPLAY_HEIGHT,
DISPLAY_WIDTH, DISPLAY_HEIGHT, DISPLAY_OFFSET_X, DISPLAY_OFFSET_Y, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y, DISPLAY_SWAP_XY); DISPLAY_OFFSET_X, DISPLAY_OFFSET_Y, DISPLAY_MIRROR_X,
DISPLAY_MIRROR_Y, DISPLAY_SWAP_XY);
} }
void InitializeCamera() { void InitializeCamera() {
static esp_cam_ctlr_dvp_pin_config_t dvp_pin_config = { static esp_cam_ctlr_dvp_pin_config_t dvp_pin_config = {
.data_width = CAM_CTLR_DATA_WIDTH_8, .data_width = CAM_CTLR_DATA_WIDTH_8,
.data_io = { .data_io =
[0] = CAMERA_PIN_D0, {
[1] = CAMERA_PIN_D1, [0] = CAMERA_PIN_D0,
[2] = CAMERA_PIN_D2, [1] = CAMERA_PIN_D1,
[3] = CAMERA_PIN_D3, [2] = CAMERA_PIN_D2,
[4] = CAMERA_PIN_D4, [3] = CAMERA_PIN_D3,
[5] = CAMERA_PIN_D5, [4] = CAMERA_PIN_D4,
[6] = CAMERA_PIN_D6, [5] = CAMERA_PIN_D5,
[7] = CAMERA_PIN_D7, [6] = CAMERA_PIN_D6,
}, [7] = CAMERA_PIN_D7,
},
.vsync_io = CAMERA_PIN_VSYNC, .vsync_io = CAMERA_PIN_VSYNC,
.de_io = CAMERA_PIN_HREF, .de_io = CAMERA_PIN_HREF,
.pclk_io = CAMERA_PIN_PCLK, .pclk_io = CAMERA_PIN_PCLK,
@ -330,6 +334,37 @@ private:
camera_->SetHMirror(false); camera_->SetHMirror(false);
} }
void InitializeBackgroundVisionSampler() {
xTaskCreate(
[](void* arg) {
auto board = static_cast<M5StackCoreS3Board*>(arg);
bool has_logged_success = false;
bool has_logged_failure = false;
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_INITIAL_DELAY_MS));
while (true) {
if (board->camera_ == nullptr) {
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
continue;
}
if (board->camera_->CaptureBackground()) {
if (!has_logged_success) {
ESP_LOGI(TAG, "Background vision sampler started");
has_logged_success = true;
}
} else if (!has_logged_failure) {
ESP_LOGW(TAG, "Background vision sampler is waiting for camera");
has_logged_failure = true;
}
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
}
},
"BgVisionSampler", 4096, this, 1, nullptr);
}
public: public:
M5StackCoreS3Board() { M5StackCoreS3Board() {
InitializePowerSaveTimer(); InitializePowerSaveTimer();
@ -340,34 +375,24 @@ public:
InitializeSpi(); InitializeSpi();
InitializeIli9342Display(); InitializeIli9342Display();
InitializeCamera(); InitializeCamera();
InitializeBackgroundVisionSampler();
InitializeFt6336TouchPad(); InitializeFt6336TouchPad();
GetBacklight()->RestoreBrightness(); GetBacklight()->RestoreBrightness();
} }
virtual AudioCodec* GetAudioCodec() override { virtual AudioCodec* GetAudioCodec() override {
static CoreS3AudioCodec audio_codec(i2c_bus_, static CoreS3AudioCodec audio_codec(
AUDIO_INPUT_SAMPLE_RATE, i2c_bus_, AUDIO_INPUT_SAMPLE_RATE, AUDIO_OUTPUT_SAMPLE_RATE, AUDIO_I2S_GPIO_MCLK,
AUDIO_OUTPUT_SAMPLE_RATE, AUDIO_I2S_GPIO_BCLK, AUDIO_I2S_GPIO_WS, AUDIO_I2S_GPIO_DOUT, AUDIO_I2S_GPIO_DIN,
AUDIO_I2S_GPIO_MCLK, AUDIO_CODEC_AW88298_ADDR, AUDIO_CODEC_ES7210_ADDR, AUDIO_INPUT_REFERENCE);
AUDIO_I2S_GPIO_BCLK,
AUDIO_I2S_GPIO_WS,
AUDIO_I2S_GPIO_DOUT,
AUDIO_I2S_GPIO_DIN,
AUDIO_CODEC_AW88298_ADDR,
AUDIO_CODEC_ES7210_ADDR,
AUDIO_INPUT_REFERENCE);
return &audio_codec; return &audio_codec;
} }
virtual Display* GetDisplay() override { virtual Display* GetDisplay() override { return display_; }
return display_;
}
virtual Camera* GetCamera() override { virtual Camera* GetCamera() override { return camera_; }
return camera_;
}
virtual bool GetBatteryLevel(int &level, bool& charging, bool& discharging) override { virtual bool GetBatteryLevel(int& level, bool& charging, bool& discharging) override {
static bool last_discharging = false; static bool last_discharging = false;
charging = pmic_->IsCharging(); charging = pmic_->IsCharging();
discharging = pmic_->IsDischarging(); discharging = pmic_->IsDischarging();
@ -387,7 +412,7 @@ public:
WifiBoard::SetPowerSaveLevel(level); WifiBoard::SetPowerSaveLevel(level);
} }
virtual Backlight *GetBacklight() override { virtual Backlight* GetBacklight() override {
static CustomBacklight backlight(pmic_); static CustomBacklight backlight(pmic_);
return &backlight; return &backlight;
} }

View File

@ -599,7 +599,7 @@ CONFIG_PARTITION_TABLE_MD5=y
# #
CONFIG_OTA_URL="https://api.tenclass.net/xiaozhi/ota/" CONFIG_OTA_URL="https://api.tenclass.net/xiaozhi/ota/"
CONFIG_USE_DIRECT_WEBSOCKET=y CONFIG_USE_DIRECT_WEBSOCKET=y
CONFIG_WEBSOCKET_URL="ws://10.6.80.130:8080" CONFIG_WEBSOCKET_URL="ws://172.19.0.240:8080"
CONFIG_WEBSOCKET_TOKEN="" CONFIG_WEBSOCKET_TOKEN=""
CONFIG_WEBSOCKET_PROTOCOL_VERSION=1 CONFIG_WEBSOCKET_PROTOCOL_VERSION=1
# CONFIG_FLASH_NONE_ASSETS is not set # CONFIG_FLASH_NONE_ASSETS is not set

View File

@ -1,4 +1,5 @@
import asyncio import asyncio
import base64
import json import json
import os import os
import shutil import shutil
@ -8,6 +9,7 @@ import time
import traceback import traceback
import uuid import uuid
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Optional from typing import Any, Optional
import httpx import httpx
@ -31,6 +33,8 @@ CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20
AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0")) AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
WS_PORT = 8080 WS_PORT = 8080
AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower() AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower()
PROJECT_ROOT = Path(__file__).resolve().parent.parent
VISION_FRAME_SAVE_DIR = Path(os.getenv("VISION_FRAME_SAVE_DIR", str(PROJECT_ROOT / "vision_frames")))
INPUT_SAMPLE_RATE = 16000 INPUT_SAMPLE_RATE = 16000
OUTPUT_SAMPLE_RATE = 24000 OUTPUT_SAMPLE_RATE = 24000
@ -45,6 +49,7 @@ TTS_PRE_ROLL_MS = 80
TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1 TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1
TTS_INTERRUPT_SILENCE_FRAMES = 3 TTS_INTERRUPT_SILENCE_FRAMES = 3
INTERRUPT_TOPIC = "lk.interrupt" INTERRUPT_TOPIC = "lk.interrupt"
VISION_FRAME_TOPIC = "vision.frame"
TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;" TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;"
TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18")) TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18")) TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
@ -336,6 +341,73 @@ class ESP32LiveKitBridge:
if not ok: if not ok:
print("警告: bridge 已停止 TTS但 agent 侧 interrupt 未确认送出") print("警告: bridge 已停止 TTS但 agent 侧 interrupt 未确认送出")
def _save_vision_frame(self, session: DeviceSession, image: str) -> Optional[Path]:
try:
image_bytes = base64.b64decode(image, validate=True)
except Exception as exc:
print(f"vision frame base64 解码失败: {exc}")
return None
safe_device_id = "".join(
char if char.isalnum() or char in ("-", "_") else "_"
for char in session.device_id
)
timestamp_ms = int(time.time() * 1000)
VISION_FRAME_SAVE_DIR.mkdir(parents=True, exist_ok=True)
path = VISION_FRAME_SAVE_DIR / f"{timestamp_ms}_{safe_device_id}.jpg"
path.write_bytes(image_bytes)
return path
async def _publish_vision_frame(self, session: DeviceSession, message: dict[str, Any]) -> None:
image = message.get("image")
if not isinstance(image, str) or not image:
print("收到 vision frame但 image 字段为空")
return
saved_path = self._save_vision_frame(session, image)
if saved_path is None:
return
print(f"已保存 vision frame: {saved_path}")
participant = getattr(session.room, "local_participant", None)
if participant is None:
print("跳过发送 vision framelocal participant 尚未就绪")
return
payload = {
"type": "vision_frame",
"topic": VISION_FRAME_TOPIC,
"room": session.room_name,
"identity": session.identity,
"device_id": session.device_id,
"mime_type": message.get("mime_type", "image/jpeg"),
"image": image,
"saved_path": str(saved_path),
}
data = json.dumps(payload).encode("utf-8")
agent_identities = self._get_agent_identities(session)
kwargs: dict[str, Any] = {}
if agent_identities:
kwargs["destination_identities"] = agent_identities
last_error: Optional[Exception] = None
for attempt in ({"topic": VISION_FRAME_TOPIC, **kwargs}, kwargs):
try:
await participant.publish_data(data, **attempt)
print(
f"已发送 vision frame: bytes={len(data)} "
f"targets={agent_identities or 'broadcast'}"
)
return
except TypeError as exc:
last_error = exc
except Exception as exc:
print(f"发送 vision frame 失败: {exc}")
return
if last_error is not None:
print(f"发送 vision frame 失败publish_data 签名不兼容: {last_error}")
async def _send_tts_state(self, session: DeviceSession, state: str) -> None: async def _send_tts_state(self, session: DeviceSession, state: str) -> None:
if session.websocket is None: if session.websocket is None:
print(f"跳过 tts {state}ESP32 尚未连接") print(f"跳过 tts {state}ESP32 尚未连接")
@ -939,14 +1011,12 @@ class ESP32LiveKitBridge:
) )
self.device_sessions[device_id] = session self.device_sessions[device_id] = session
# print(f"ESP32 已连接: device={device_id}") print(f"ESP32 已连接: device={device_id}")
# print(f"ESP32 协议版本: {session.protocol_version}") print(f"ESP32 协议版本: {session.protocol_version}")
session.tts_stream_id += 1 session.tts_stream_id += 1
opus_decoder = None opus_decoder = None
try: try:
await self._connect_session_room(session)
hello_msg = { hello_msg = {
"type": "hello", "type": "hello",
"transport": "websocket", "transport": "websocket",
@ -962,6 +1032,9 @@ class ESP32LiveKitBridge:
}, },
} }
await websocket.send(json.dumps(hello_msg)) await websocket.send(json.dumps(hello_msg))
print(f"已发送 server hello: device={device_id} room={session.room_name}")
await self._connect_session_room(session)
async for message in websocket: async for message in websocket:
if isinstance(message, bytes): if isinstance(message, bytes):
@ -1017,6 +1090,8 @@ class ESP32LiveKitBridge:
abort_reason = reason if isinstance(reason, str) and reason else "button_abort" abort_reason = reason if isinstance(reason, str) and reason else "button_abort"
print(f"处理 ESP32 打断请求: reason={abort_reason}") print(f"处理 ESP32 打断请求: reason={abort_reason}")
await self._abort_tts(session, abort_reason) await self._abort_tts(session, abort_reason)
elif msg_type == "vision" and data.get("state") == "frame":
await self._publish_vision_frame(session, data)
except json.JSONDecodeError: except json.JSONDecodeError:
print(f"收到未知的字符消息: {message}") print(f"收到未知的字符消息: {message}")
except ConnectionClosedError as exc: except ConnectionClosedError as exc:

View File

@ -1,9 +1,22 @@
#include "protocol.h" #include "protocol.h"
#include <esp_log.h> #include <esp_log.h>
#include <mbedtls/base64.h>
#define TAG "Protocol" #define TAG "Protocol"
static std::string Base64Encode(const std::string& data) {
size_t encoded_length = 0;
size_t output_length = 0;
mbedtls_base64_encode(nullptr, 0, &encoded_length,
reinterpret_cast<const unsigned char*>(data.data()), data.size());
std::string result(encoded_length, 0);
mbedtls_base64_encode(reinterpret_cast<unsigned char*>(result.data()), result.size(), &output_length,
reinterpret_cast<const unsigned char*>(data.data()), data.size());
result.resize(output_length);
return result;
}
void Protocol::OnIncomingJson(std::function<void(const cJSON* root)> callback) { void Protocol::OnIncomingJson(std::function<void(const cJSON* root)> callback) {
on_incoming_json_ = callback; on_incoming_json_ = callback;
} }
@ -78,6 +91,27 @@ void Protocol::SendMcpMessage(const std::string& payload) {
SendText(message); SendText(message);
} }
void Protocol::SendVisionFrame(const std::string& jpeg_data) {
if (jpeg_data.empty()) {
return;
}
cJSON* root = cJSON_CreateObject();
cJSON_AddStringToObject(root, "session_id", session_id_.c_str());
cJSON_AddStringToObject(root, "type", "vision");
cJSON_AddStringToObject(root, "state", "frame");
cJSON_AddStringToObject(root, "mime_type", "image/jpeg");
auto encoded = Base64Encode(jpeg_data);
cJSON_AddStringToObject(root, "image", encoded.c_str());
char* json_str = cJSON_PrintUnformatted(root);
if (json_str != nullptr) {
SendText(json_str);
cJSON_free(json_str);
}
cJSON_Delete(root);
}
bool Protocol::IsTimeout() const { bool Protocol::IsTimeout() const {
const int kTimeoutSeconds = 120; const int kTimeoutSeconds = 120;
auto now = std::chrono::steady_clock::now(); auto now = std::chrono::steady_clock::now();

View File

@ -73,6 +73,7 @@ public:
virtual void SendStopListening(); virtual void SendStopListening();
virtual void SendAbortSpeaking(AbortReason reason); virtual void SendAbortSpeaking(AbortReason reason);
virtual void SendMcpMessage(const std::string& message); virtual void SendMcpMessage(const std::string& message);
virtual void SendVisionFrame(const std::string& jpeg_data);
protected: protected:
std::function<void(const cJSON* root)> on_incoming_json_; std::function<void(const cJSON* root)> on_incoming_json_;
@ -95,4 +96,3 @@ protected:
}; };
#endif // PROTOCOL_H #endif // PROTOCOL_H