Compare commits

5 Commits

Author SHA1 Message Date
9637e09aef feat: beaver 2026-06-04 15:48:10 +08:00
b92e6e1b07 feat: remove background cam every time 2026-05-29 14:53:58 +08:00
33ee598c21 feat: add icon beaver 2026-05-29 11:22:31 +08:00
37343ac0fe feat: icon first commit 2026-05-27 17:16:11 +08:00
fc6302661d feat: support camera capture to livekit 2026-05-25 17:21:11 +08:00
13 changed files with 557 additions and 98 deletions

1
.gitignore vendored
View File

@ -10,6 +10,7 @@ sdkconfig
dependencies.lock dependencies.lock
.env .env
releases/ releases/
vision_frames/
main/assets/lang_config.h main/assets/lang_config.h
main/mmap_generate_emoji.h main/mmap_generate_emoji.h
.DS_Store .DS_Store

View File

@ -15,7 +15,7 @@ config USE_DIRECT_WEBSOCKET
config WEBSOCKET_URL config WEBSOCKET_URL
string "Default WebSocket URL" string "Default WebSocket URL"
depends on USE_DIRECT_WEBSOCKET depends on USE_DIRECT_WEBSOCKET
default "ws://10.6.80.130:8080" default "ws://172.19.0.240:8080"
help help
The WebSocket server URL used when direct WebSocket mode is enabled. The WebSocket server URL used when direct WebSocket mode is enabled.

View File

@ -81,6 +81,7 @@ void Application::Initialize() {
xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED); xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED);
}; };
callbacks.on_vad_change = [this](bool speaking) { callbacks.on_vad_change = [this](bool speaking) {
vad_speaking_.store(speaking);
xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE); xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE);
}; };
audio_service_.SetCallbacks(callbacks); audio_service_.SetCallbacks(callbacks);
@ -233,6 +234,13 @@ void Application::Run() {
if (GetDeviceState() == kDeviceStateListening) { if (GetDeviceState() == kDeviceStateListening) {
auto led = Board::GetInstance().GetLed(); auto led = Board::GetInstance().GetLed();
led->OnStateChanged(); led->OnStateChanged();
if (vad_speaking_.load() && vision_text_mode_enabled_.load() &&
!vision_frame_sent_for_current_listen_.exchange(true)) {
if (!SendCurrentVisionFrame()) {
vision_frame_sent_for_current_listen_.store(false);
}
}
} }
} }
@ -525,6 +533,9 @@ void Application::InitializeProtocol() {
protocol_->OnAudioChannelClosed([this, &board]() { protocol_->OnAudioChannelClosed([this, &board]() {
board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);
Schedule([this]() { Schedule([this]() {
if (GetDeviceState() == kDeviceStateConnecting) {
return;
}
auto display = Board::GetInstance().GetDisplay(); auto display = Board::GetInstance().GetDisplay();
display->SetChatMessage("system", ""); display->SetChatMessage("system", "");
SetDeviceState(kDeviceStateIdle); SetDeviceState(kDeviceStateIdle);
@ -673,10 +684,39 @@ void Application::DismissAlert() {
} }
void Application::ToggleChatState() { void Application::ToggleChatState() {
ToggleChatStateForMode(kChatAgentModeNormal, false);
}
void Application::ToggleChatStateWithVision() {
ToggleChatStateForMode(kChatAgentModeNormal, true);
}
void Application::ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled) {
chat_agent_mode_.store(agent_mode);
vision_text_mode_enabled_.store(vision_enabled);
vision_frame_sent_for_current_listen_.store(false);
xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT); xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
} }
bool Application::IsVisionTextModeEnabled() const {
return vision_text_mode_enabled_.load();
}
const char* Application::GetChatAgentModeName() const {
return chat_agent_mode_.load() == kChatAgentModeBeaver ? "beaver" : "normal";
}
const char* Application::GetChatModeName() const {
bool vision_enabled = vision_text_mode_enabled_.load();
if (chat_agent_mode_.load() == kChatAgentModeBeaver) {
return vision_enabled ? "vision-beaver" : "beaver";
}
return vision_enabled ? "vision-normal" : "normal";
}
void Application::StartListening() { void Application::StartListening() {
vision_text_mode_enabled_.store(false);
vision_frame_sent_for_current_listen_.store(false);
xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING); xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING);
} }
@ -707,7 +747,12 @@ void Application::HandleToggleChatEvent() {
if (state == kDeviceStateIdle) { if (state == kDeviceStateIdle) {
ListeningMode mode = GetDefaultListeningMode(); ListeningMode mode = GetDefaultListeningMode();
if (!protocol_->IsAudioChannelOpened()) { bool agent_mode_changed = chat_agent_mode_.load() != active_chat_agent_mode_.load();
bool vision_mode_changed = vision_text_mode_enabled_.load() != active_vision_text_mode_enabled_.load();
if (!protocol_->IsAudioChannelOpened() || agent_mode_changed || vision_mode_changed) {
if (protocol_->IsAudioChannelOpened()) {
protocol_->CloseAudioChannel();
}
SetDeviceState(kDeviceStateConnecting); SetDeviceState(kDeviceStateConnecting);
// Schedule to let the state change be processed first (UI update) // Schedule to let the state change be processed first (UI update)
Schedule([this, mode]() { Schedule([this, mode]() {
@ -739,6 +784,8 @@ void Application::ContinueOpenAudioChannel(ListeningMode mode) {
} }
} }
active_chat_agent_mode_.store(chat_agent_mode_.load());
active_vision_text_mode_enabled_.store(vision_text_mode_enabled_.load());
SetListeningMode(mode); SetListeningMode(mode);
} }
@ -882,6 +929,7 @@ void Application::HandleStateChangedEvent() {
switch (new_state) { switch (new_state) {
case kDeviceStateUnknown: case kDeviceStateUnknown:
case kDeviceStateIdle: case kDeviceStateIdle:
vision_frame_sent_for_current_listen_.store(false);
display->SetStatus(Lang::Strings::STANDBY); display->SetStatus(Lang::Strings::STANDBY);
display->ClearChatMessages(); // Clear messages first display->ClearChatMessages(); // Clear messages first
display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count) display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count)
@ -894,6 +942,8 @@ void Application::HandleStateChangedEvent() {
display->SetChatMessage("system", ""); display->SetChatMessage("system", "");
break; break;
case kDeviceStateListening: case kDeviceStateListening:
vad_speaking_.store(false);
vision_frame_sent_for_current_listen_.store(false);
display->SetStatus(Lang::Strings::LISTENING); display->SetStatus(Lang::Strings::LISTENING);
display->SetEmotion("neutral"); display->SetEmotion("neutral");
@ -944,6 +994,27 @@ void Application::HandleStateChangedEvent() {
} }
} }
bool Application::SendCurrentVisionFrame() {
if (!protocol_ || !protocol_->IsAudioChannelOpened()) {
return false;
}
auto camera = Board::GetInstance().GetCamera();
if (camera == nullptr) {
return false;
}
std::string jpeg_data;
if (!camera->CaptureToJpeg(jpeg_data, true)) {
ESP_LOGW(TAG, "Failed to capture vision frame");
return false;
}
protocol_->SendVisionFrame(jpeg_data);
ESP_LOGI(TAG, "Sent vision frame, size=%u bytes", static_cast<unsigned>(jpeg_data.size()));
return true;
}
void Application::Schedule(std::function<void()>&& callback) { void Application::Schedule(std::function<void()>&& callback) {
{ {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
@ -962,6 +1033,8 @@ void Application::AbortSpeaking(AbortReason reason) {
void Application::SetListeningMode(ListeningMode mode) { void Application::SetListeningMode(ListeningMode mode) {
listening_mode_ = mode; listening_mode_ = mode;
vad_speaking_.store(false);
vision_frame_sent_for_current_listen_.store(false);
SetDeviceState(kDeviceStateListening); SetDeviceState(kDeviceStateListening);
} }

View File

@ -11,6 +11,7 @@
#include <deque> #include <deque>
#include <memory> #include <memory>
#include <functional> #include <functional>
#include <atomic>
#include "protocol.h" #include "protocol.h"
#include "ota.h" #include "ota.h"
@ -40,6 +41,11 @@ enum AecMode {
kAecOnServerSide, kAecOnServerSide,
}; };
enum ChatAgentMode {
kChatAgentModeNormal,
kChatAgentModeBeaver,
};
class Application { class Application {
public: public:
static Application& GetInstance() { static Application& GetInstance() {
@ -91,6 +97,12 @@ public:
* Sends MAIN_EVENT_TOGGLE_CHAT to be handled in Run() * Sends MAIN_EVENT_TOGGLE_CHAT to be handled in Run()
*/ */
void ToggleChatState(); void ToggleChatState();
void ToggleChatStateWithVision();
void ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled);
bool IsVisionTextModeEnabled() const;
ChatAgentMode GetChatAgentMode() const { return chat_agent_mode_.load(); }
const char* GetChatAgentModeName() const;
const char* GetChatModeName() const;
/** /**
* Start listening (event-based, thread-safe) * Start listening (event-based, thread-safe)
@ -144,6 +156,12 @@ private:
bool aborted_ = false; bool aborted_ = false;
bool assets_version_checked_ = false; bool assets_version_checked_ = false;
bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening
std::atomic<ChatAgentMode> chat_agent_mode_ = kChatAgentModeNormal;
std::atomic<ChatAgentMode> active_chat_agent_mode_ = kChatAgentModeNormal;
std::atomic<bool> vision_text_mode_enabled_ = false;
std::atomic<bool> active_vision_text_mode_enabled_ = false;
std::atomic<bool> vad_speaking_ = false;
std::atomic<bool> vision_frame_sent_for_current_listen_ = false;
int clock_ticks_ = 0; int clock_ticks_ = 0;
TaskHandle_t activation_task_handle_ = nullptr; TaskHandle_t activation_task_handle_ = nullptr;
@ -159,6 +177,7 @@ private:
void HandleWakeWordDetectedEvent(); void HandleWakeWordDetectedEvent();
void ContinueOpenAudioChannel(ListeningMode mode); void ContinueOpenAudioChannel(ListeningMode mode);
void ContinueWakeWordInvoke(const std::string& wake_word); void ContinueWakeWordInvoke(const std::string& wake_word);
bool SendCurrentVisionFrame();
// Activation task (runs in background) // Activation task (runs in background)
void ActivationTask(); void ActivationTask();

View File

@ -7,6 +7,8 @@ class Camera {
public: public:
virtual void SetExplainUrl(const std::string& url, const std::string& token) = 0; virtual void SetExplainUrl(const std::string& url, const std::string& token) = 0;
virtual bool Capture() = 0; virtual bool Capture() = 0;
virtual bool CaptureBackground() { return Capture(); }
virtual bool CaptureToJpeg(std::string& jpeg_data, bool show_preview = false) { return false; }
virtual bool SetHMirror(bool enabled) = 0; virtual bool SetHMirror(bool enabled) = 0;
virtual bool SetVFlip(bool enabled) = 0; virtual bool SetVFlip(bool enabled) = 0;
virtual bool SetSwapBytes(bool enabled) { return false; } // Optional, default no-op virtual bool SetSwapBytes(bool enabled) { return false; } // Optional, default no-op

View File

@ -24,6 +24,7 @@
#include "lvgl_display.h" #include "lvgl_display.h"
#include "mcp_server.h" #include "mcp_server.h"
#include "system_info.h" #include "system_info.h"
#include "esp_timer.h"
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE #ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
#undef LOG_LOCAL_LEVEL #undef LOG_LOCAL_LEVEL
@ -55,6 +56,7 @@
#define TAG "EspVideo" #define TAG "EspVideo"
#define FOREGROUND_CAPTURE_PROTECTION_US (10 * 1000 * 1000)
#if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP) #if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP)
#warning \ #warning \
@ -381,11 +383,47 @@ EspVideo::~EspVideo() {
} }
void EspVideo::SetExplainUrl(const std::string& url, const std::string& token) { void EspVideo::SetExplainUrl(const std::string& url, const std::string& token) {
std::lock_guard<std::mutex> lock(frame_mutex_);
explain_url_ = url; explain_url_ = url;
explain_token_ = token; explain_token_ = token;
} }
bool EspVideo::Capture() { bool EspVideo::Capture() {
return CaptureFrame(true);
}
bool EspVideo::CaptureBackground() {
return CaptureFrame(false);
}
bool EspVideo::CaptureToJpeg(std::string& jpeg_data, bool show_preview) {
jpeg_data.clear();
if (!CaptureFrame(show_preview)) {
return false;
}
std::lock_guard<std::mutex> lock(frame_mutex_);
if (frame_.data == nullptr || frame_.len == 0) {
return false;
}
uint16_t w = frame_.width ? frame_.width : 320;
uint16_t h = frame_.height ? frame_.height : 240;
return image_to_jpeg_cb(
frame_.data, frame_.len, w, h, frame_.format, 60,
[](void* arg, size_t index, const void* data, size_t len) -> size_t {
auto jpeg_data = static_cast<std::string*>(arg);
if (data != nullptr && len > 0) {
jpeg_data->append(static_cast<const char*>(data), len);
}
return len;
},
&jpeg_data);
}
bool EspVideo::CaptureFrame(bool show_preview) {
std::lock_guard<std::mutex> lock(frame_mutex_);
if (encoder_thread_.joinable()) { if (encoder_thread_.joinable()) {
encoder_thread_.join(); encoder_thread_.join();
} }
@ -394,6 +432,10 @@ bool EspVideo::Capture() {
return false; return false;
} }
if (!show_preview && esp_timer_get_time() < foreground_capture_protected_until_us_) {
return true;
}
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
struct v4l2_buffer buf = {}; struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
@ -729,6 +771,11 @@ bool EspVideo::Capture() {
} }
} }
if (show_preview) {
foreground_capture_protected_until_us_ = esp_timer_get_time() + FOREGROUND_CAPTURE_PROTECTION_US;
}
if (show_preview) {
// 显示预览图片 // 显示预览图片
auto display = dynamic_cast<LvglDisplay*>(Board::GetInstance().GetDisplay()); auto display = dynamic_cast<LvglDisplay*>(Board::GetInstance().GetDisplay());
if (display != nullptr) { if (display != nullptr) {
@ -837,6 +884,7 @@ bool EspVideo::Capture() {
auto image = std::make_unique<LvglAllocatedImage>(data, lvgl_image_size, w, h, stride, color_format); auto image = std::make_unique<LvglAllocatedImage>(data, lvgl_image_size, w, h, stride, color_format);
display->SetPreviewImage(std::move(image)); display->SetPreviewImage(std::move(image));
} }
}
return true; return true;
} }
@ -898,10 +946,16 @@ bool EspVideo::SetVFlip(bool enabled) {
* @warning 如果摄像头缓冲区为空或网络连接失败,将返回错误信息 * @warning 如果摄像头缓冲区为空或网络连接失败,将返回错误信息
*/ */
std::string EspVideo::Explain(const std::string& question) { std::string EspVideo::Explain(const std::string& question) {
std::lock_guard<std::mutex> lock(frame_mutex_);
if (explain_url_.empty()) { if (explain_url_.empty()) {
throw std::runtime_error("Image explain URL or token is not set"); throw std::runtime_error("Image explain URL or token is not set");
} }
if (frame_.data == nullptr || frame_.len == 0) {
throw std::runtime_error("No camera frame captured");
}
// 创建局部的 JPEG 队列, 40 entries is about to store 512 * 40 = 20480 bytes of JPEG data // 创建局部的 JPEG 队列, 40 entries is about to store 512 * 40 = 20480 bytes of JPEG data
QueueHandle_t jpeg_queue = xQueueCreate(40, sizeof(JpegChunk)); QueueHandle_t jpeg_queue = xQueueCreate(40, sizeof(JpegChunk));
if (jpeg_queue == nullptr) { if (jpeg_queue == nullptr) {

View File

@ -5,6 +5,8 @@
#include <thread> #include <thread>
#include <memory> #include <memory>
#include <vector> #include <vector>
#include <mutex>
#include <cstdint>
#include <freertos/FreeRTOS.h> #include <freertos/FreeRTOS.h>
#include <freertos/queue.h> #include <freertos/queue.h>
@ -39,6 +41,10 @@ private:
std::string explain_url_; std::string explain_url_;
std::string explain_token_; std::string explain_token_;
std::thread encoder_thread_; std::thread encoder_thread_;
std::mutex frame_mutex_;
int64_t foreground_capture_protected_until_us_ = 0;
bool CaptureFrame(bool show_preview);
public: public:
EspVideo(const esp_video_init_config_t& config); EspVideo(const esp_video_init_config_t& config);
@ -46,6 +52,8 @@ public:
virtual void SetExplainUrl(const std::string& url, const std::string& token); virtual void SetExplainUrl(const std::string& url, const std::string& token);
virtual bool Capture(); virtual bool Capture();
virtual bool CaptureBackground() override;
virtual bool CaptureToJpeg(std::string& jpeg_data, bool show_preview = false) override;
// 翻转控制函数 // 翻转控制函数
virtual bool SetHMirror(bool enabled) override; virtual bool SetHMirror(bool enabled) override;
virtual bool SetVFlip(bool enabled) override; virtual bool SetVFlip(bool enabled) override;

View File

@ -1,21 +1,23 @@
#include "wifi_board.h" #include "application.h"
#include "axp2101.h"
#include "config.h"
#include "cores3_audio_codec.h" #include "cores3_audio_codec.h"
#include "display/lcd_display.h" #include "display/lcd_display.h"
#include "application.h"
#include "config.h"
#include "power_save_timer.h"
#include "i2c_device.h" #include "i2c_device.h"
#include "axp2101.h" #include "power_save_timer.h"
#include "wifi_board.h"
#include <esp_log.h>
#include <driver/i2c_master.h> #include <driver/i2c_master.h>
#include <esp_lcd_ili9341.h>
#include <esp_lcd_panel_io.h> #include <esp_lcd_panel_io.h>
#include <esp_lcd_panel_ops.h> #include <esp_lcd_panel_ops.h>
#include <esp_lcd_ili9341.h> #include <esp_log.h>
#include <esp_timer.h> #include <esp_timer.h>
#include "esp_video.h" #include "esp_video.h"
#define TAG "M5StackCoreS3Board" #define TAG "M5StackCoreS3Board"
#define BACKGROUND_VISION_INITIAL_DELAY_MS 8000
#define BACKGROUND_VISION_SAMPLE_INTERVAL_MS 100
class Pmic : public Axp2101 { class Pmic : public Axp2101 {
public: public:
@ -96,9 +98,7 @@ public:
read_buffer_ = new uint8_t[6]; read_buffer_ = new uint8_t[6];
} }
~Ft6336() { ~Ft6336() { delete[] read_buffer_; }
delete[] read_buffer_;
}
void UpdateTouchPoint() { void UpdateTouchPoint() {
ReadRegs(0x02, read_buffer_, 6); ReadRegs(0x02, read_buffer_, 6);
@ -107,9 +107,7 @@ public:
tp_.y = ((read_buffer_[3] & 0x0F) << 8) | read_buffer_[4]; tp_.y = ((read_buffer_[3] & 0x0F) << 8) | read_buffer_[4];
} }
inline const TouchPoint_t& GetTouchPoint() { inline const TouchPoint_t& GetTouchPoint() { return tp_; }
return tp_;
}
private: private:
uint8_t* read_buffer_ = nullptr; uint8_t* read_buffer_ = nullptr;
@ -137,9 +135,7 @@ private:
GetDisplay()->SetPowerSaveMode(false); GetDisplay()->SetPowerSaveMode(false);
GetBacklight()->RestoreBrightness(); GetBacklight()->RestoreBrightness();
}); });
power_save_timer_->OnShutdownRequest([this]() { power_save_timer_->OnShutdownRequest([this]() { pmic_->PowerOff(); });
pmic_->PowerOff();
});
power_save_timer_->SetEnabled(true); power_save_timer_->SetEnabled(true);
} }
@ -153,7 +149,8 @@ private:
.glitch_ignore_cnt = 7, .glitch_ignore_cnt = 7,
.intr_priority = 0, .intr_priority = 0,
.trans_queue_depth = 0, .trans_queue_depth = 0,
.flags = { .flags =
{
.enable_internal_pullup = 1, .enable_internal_pullup = 1,
}, },
}; };
@ -195,6 +192,7 @@ private:
void PollTouchpad() { void PollTouchpad() {
static bool was_touched = false; static bool was_touched = false;
static int64_t touch_start_time = 0; static int64_t touch_start_time = 0;
static int touch_start_x = -1;
const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值超过500ms视为长按 const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值超过500ms视为长按
ft6336_->UpdateTouchPoint(); ft6336_->UpdateTouchPoint();
@ -204,20 +202,27 @@ private:
if (touch_point.num > 0 && !was_touched) { if (touch_point.num > 0 && !was_touched) {
was_touched = true; was_touched = true;
touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒 touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒
touch_start_x = touch_point.x;
} }
// 检测触摸释放 // 检测触摸释放
else if (touch_point.num == 0 && was_touched) { else if (touch_point.num == 0 && was_touched) {
was_touched = false; was_touched = false;
int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time; int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time;
bool beaver_mode = touch_start_x >= DISPLAY_WIDTH / 2;
auto agent_mode = beaver_mode ? kChatAgentModeBeaver : kChatAgentModeNormal;
// 只有短触才触发
if (touch_duration < TOUCH_THRESHOLD_MS) { if (touch_duration < TOUCH_THRESHOLD_MS) {
auto& app = Application::GetInstance(); auto& app = Application::GetInstance();
if (app.GetDeviceState() == kDeviceStateStarting) { if (app.GetDeviceState() == kDeviceStateStarting) {
EnterWifiConfigMode(); EnterWifiConfigMode();
return; return;
} }
app.ToggleChatState(); ESP_LOGI(TAG, "Touch short: %s text-only mode", beaver_mode ? "beaver" : "normal");
app.ToggleChatStateForMode(agent_mode, false);
} else {
auto& app = Application::GetInstance();
ESP_LOGI(TAG, "Touch long: %s vision+text mode", beaver_mode ? "beaver" : "normal");
app.ToggleChatStateForMode(agent_mode, true);
} }
} }
} }
@ -228,7 +233,8 @@ private:
// 创建定时器20ms 间隔 // 创建定时器20ms 间隔
esp_timer_create_args_t timer_args = { esp_timer_create_args_t timer_args = {
.callback = [](void* arg) { .callback =
[](void* arg) {
M5StackCoreS3Board* board = (M5StackCoreS3Board*)arg; M5StackCoreS3Board* board = (M5StackCoreS3Board*)arg;
board->PollTouchpad(); board->PollTouchpad();
}, },
@ -285,14 +291,16 @@ private:
esp_lcd_panel_swap_xy(panel, DISPLAY_SWAP_XY); esp_lcd_panel_swap_xy(panel, DISPLAY_SWAP_XY);
esp_lcd_panel_mirror(panel, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y); esp_lcd_panel_mirror(panel, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y);
display_ = new SpiLcdDisplay(panel_io, panel, display_ = new SpiLcdDisplay(panel_io, panel, DISPLAY_WIDTH, DISPLAY_HEIGHT,
DISPLAY_WIDTH, DISPLAY_HEIGHT, DISPLAY_OFFSET_X, DISPLAY_OFFSET_Y, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y, DISPLAY_SWAP_XY); DISPLAY_OFFSET_X, DISPLAY_OFFSET_Y, DISPLAY_MIRROR_X,
DISPLAY_MIRROR_Y, DISPLAY_SWAP_XY);
} }
void InitializeCamera() { void InitializeCamera() {
static esp_cam_ctlr_dvp_pin_config_t dvp_pin_config = { static esp_cam_ctlr_dvp_pin_config_t dvp_pin_config = {
.data_width = CAM_CTLR_DATA_WIDTH_8, .data_width = CAM_CTLR_DATA_WIDTH_8,
.data_io = { .data_io =
{
[0] = CAMERA_PIN_D0, [0] = CAMERA_PIN_D0,
[1] = CAMERA_PIN_D1, [1] = CAMERA_PIN_D1,
[2] = CAMERA_PIN_D2, [2] = CAMERA_PIN_D2,
@ -330,6 +338,42 @@ private:
camera_->SetHMirror(false); camera_->SetHMirror(false);
} }
void InitializeBackgroundVisionSampler() {
xTaskCreate(
[](void* arg) {
auto board = static_cast<M5StackCoreS3Board*>(arg);
bool has_logged_success = false;
bool has_logged_failure = false;
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_INITIAL_DELAY_MS));
while (true) {
if (!Application::GetInstance().IsVisionTextModeEnabled()) {
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
continue;
}
if (board->camera_ == nullptr) {
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
continue;
}
if (board->camera_->Capture()) {
if (!has_logged_success) {
ESP_LOGI(TAG, "Vision preview sampler started");
has_logged_success = true;
}
} else if (!has_logged_failure) {
ESP_LOGW(TAG, "Vision preview sampler is waiting for camera");
has_logged_failure = true;
}
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
}
},
"BgVisionSampler", 4096, this, 1, nullptr);
}
public: public:
M5StackCoreS3Board() { M5StackCoreS3Board() {
InitializePowerSaveTimer(); InitializePowerSaveTimer();
@ -340,32 +384,22 @@ public:
InitializeSpi(); InitializeSpi();
InitializeIli9342Display(); InitializeIli9342Display();
InitializeCamera(); InitializeCamera();
InitializeBackgroundVisionSampler();
InitializeFt6336TouchPad(); InitializeFt6336TouchPad();
GetBacklight()->RestoreBrightness(); GetBacklight()->RestoreBrightness();
} }
virtual AudioCodec* GetAudioCodec() override { virtual AudioCodec* GetAudioCodec() override {
static CoreS3AudioCodec audio_codec(i2c_bus_, static CoreS3AudioCodec audio_codec(
AUDIO_INPUT_SAMPLE_RATE, i2c_bus_, AUDIO_INPUT_SAMPLE_RATE, AUDIO_OUTPUT_SAMPLE_RATE, AUDIO_I2S_GPIO_MCLK,
AUDIO_OUTPUT_SAMPLE_RATE, AUDIO_I2S_GPIO_BCLK, AUDIO_I2S_GPIO_WS, AUDIO_I2S_GPIO_DOUT, AUDIO_I2S_GPIO_DIN,
AUDIO_I2S_GPIO_MCLK, AUDIO_CODEC_AW88298_ADDR, AUDIO_CODEC_ES7210_ADDR, AUDIO_INPUT_REFERENCE);
AUDIO_I2S_GPIO_BCLK,
AUDIO_I2S_GPIO_WS,
AUDIO_I2S_GPIO_DOUT,
AUDIO_I2S_GPIO_DIN,
AUDIO_CODEC_AW88298_ADDR,
AUDIO_CODEC_ES7210_ADDR,
AUDIO_INPUT_REFERENCE);
return &audio_codec; return &audio_codec;
} }
virtual Display* GetDisplay() override { virtual Display* GetDisplay() override { return display_; }
return display_;
}
virtual Camera* GetCamera() override { virtual Camera* GetCamera() override { return camera_; }
return camera_;
}
virtual bool GetBatteryLevel(int& level, bool& charging, bool& discharging) override { virtual bool GetBatteryLevel(int& level, bool& charging, bool& discharging) override {
static bool last_discharging = false; static bool last_discharging = false;

View File

@ -599,7 +599,7 @@ CONFIG_PARTITION_TABLE_MD5=y
# #
CONFIG_OTA_URL="https://api.tenclass.net/xiaozhi/ota/" CONFIG_OTA_URL="https://api.tenclass.net/xiaozhi/ota/"
CONFIG_USE_DIRECT_WEBSOCKET=y CONFIG_USE_DIRECT_WEBSOCKET=y
CONFIG_WEBSOCKET_URL="ws://10.6.80.130:8080" CONFIG_WEBSOCKET_URL="ws://172.19.0.240:8080"
CONFIG_WEBSOCKET_TOKEN="" CONFIG_WEBSOCKET_TOKEN=""
CONFIG_WEBSOCKET_PROTOCOL_VERSION=1 CONFIG_WEBSOCKET_PROTOCOL_VERSION=1
# CONFIG_FLASH_NONE_ASSETS is not set # CONFIG_FLASH_NONE_ASSETS is not set

View File

@ -1,6 +1,8 @@
import asyncio import asyncio
import base64
import json import json
import os import os
import re
import shutil import shutil
import struct import struct
import sys import sys
@ -8,6 +10,7 @@ import time
import traceback import traceback
import uuid import uuid
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Optional from typing import Any, Optional
import httpx import httpx
@ -26,11 +29,24 @@ TOKEN_URL = "http://172.19.0.240:8000/getToken"
LIVEKIT_WS_URL = "ws://172.19.0.240:7880" LIVEKIT_WS_URL = "ws://172.19.0.240:7880"
ROOM_PREFIX = "test-livekit" ROOM_PREFIX = "test-livekit"
IDENTITY_PREFIX = "uv-livekit" IDENTITY_PREFIX = "uv-livekit"
AGENT_NAME = "my-agent" LEGACY_AGENT_NAME = os.getenv("LIVEKIT_AGENT_NAME", "normal-agent")
DEFAULT_AGENT_MODE = os.getenv("LIVEKIT_DEFAULT_AGENT_MODE", "normal").strip().lower()
AGENT_NAMES = {
"normal": os.getenv("LIVEKIT_NORMAL_AGENT_NAME", LEGACY_AGENT_NAME),
"beaver": os.getenv("LIVEKIT_BEAVER_AGENT_NAME", "beaver-agent"),
}
CHAT_MODE_AGENT_NAMES = {
"normal": AGENT_NAMES["normal"],
"beaver": AGENT_NAMES["beaver"],
"vision-normal": os.getenv("LIVEKIT_VISION_NORMAL_AGENT_NAME", "vision-normal-agent"),
"vision-beaver": os.getenv("LIVEKIT_VISION_BEAVER_AGENT_NAME", "vision-beaver-agent"),
}
CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0")) CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0"))
AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0")) AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
WS_PORT = 8080 WS_PORT = 8080
AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower() AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower()
PROJECT_ROOT = Path(__file__).resolve().parent.parent
VISION_FRAME_SAVE_DIR = Path(os.getenv("VISION_FRAME_SAVE_DIR", str(PROJECT_ROOT / "vision_frames")))
INPUT_SAMPLE_RATE = 16000 INPUT_SAMPLE_RATE = 16000
OUTPUT_SAMPLE_RATE = 24000 OUTPUT_SAMPLE_RATE = 24000
@ -45,11 +61,22 @@ TTS_PRE_ROLL_MS = 80
TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1 TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1
TTS_INTERRUPT_SILENCE_FRAMES = 3 TTS_INTERRUPT_SILENCE_FRAMES = 3
INTERRUPT_TOPIC = "lk.interrupt" INTERRUPT_TOPIC = "lk.interrupt"
VISION_FRAME_TOPIC = "vision.frame"
TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;" TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;"
TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18")) TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18")) TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
TTS_DISPLAY_SCROLL_GAP = " " TTS_DISPLAY_SCROLL_GAP = " "
TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8")) TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8"))
EMOTION_TEXT_PATTERN = re.compile(
r"^\s*<?\s*emotion\s*=\s*([^\s>,;]+)\s*>?[\s,;]*(.*)$",
re.DOTALL,
)
EMOTION_TEST_SEQUENCE = [
emotion.strip()
for emotion in os.getenv("BRIDGE_EMOTION_TEST_SEQUENCE", "").split(",")
if emotion.strip()
]
EMOTION_TEST_INTERVAL_SECONDS = float(os.getenv("BRIDGE_EMOTION_TEST_INTERVAL_SECONDS", "2.0"))
@dataclass @dataclass
@ -59,6 +86,10 @@ class DeviceSession:
protocol_version: int protocol_version: int
room_name: str room_name: str
identity: str identity: str
chat_mode: str
agent_mode: str
agent_name: str
vision_enabled: bool
room: rtc.Room room: rtc.Room
mic_source: AudioSource mic_source: AudioSource
agent_ready: asyncio.Event agent_ready: asyncio.Event
@ -70,6 +101,7 @@ class DeviceSession:
tts_transcript_text: str = "" tts_transcript_text: str = ""
tts_display_text: str = "" tts_display_text: str = ""
tts_display_final: bool = False tts_display_final: bool = False
tts_emotion: str = ""
tts_suppressed_until: float = 0.0 tts_suppressed_until: float = 0.0
agent_dispatch_task: Optional[asyncio.Task] = None agent_dispatch_task: Optional[asyncio.Task] = None
closed: bool = False closed: bool = False
@ -77,10 +109,10 @@ class DeviceSession:
first_capture_log_time: float = 0.0 first_capture_log_time: float = 0.0
async def fetch_token(room_name: str, identity: str) -> str: async def fetch_token(room_name: str, identity: str, agent_name: str) -> str:
params = {"room": room_name, "identity": identity} params = {"room": room_name, "identity": identity}
if AGENT_DISPATCH_MODE == "token": if AGENT_DISPATCH_MODE == "token":
params["agent_name"] = AGENT_NAME params["agent_name"] = agent_name
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client: async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
response = await client.get(TOKEN_URL, params=params) response = await client.get(TOKEN_URL, params=params)
@ -120,15 +152,61 @@ class ESP32LiveKitBridge:
print(f"[session] device={device_id} room={room_name} identity={identity}") print(f"[session] device={device_id} room={room_name} identity={identity}")
return room_name, identity return room_name, identity
def _is_agent_participant(self, participant: rtc.RemoteParticipant) -> bool: def _resolve_agent_selection(self, headers: Any) -> tuple[str, str, str, bool]:
requested_chat_mode = (
headers.get("Chat-Mode")
or headers.get("X-Chat-Mode")
or ""
).strip().lower()
chat_mode_to_agent = {
"normal": ("normal", False),
"beaver": ("beaver", False),
"vision-normal": ("normal", True),
"vision-beaver": ("beaver", True),
}
if requested_chat_mode in chat_mode_to_agent:
requested_mode, vision_enabled = chat_mode_to_agent[requested_chat_mode]
else:
if requested_chat_mode:
print(f"未知 Chat-Mode={requested_chat_mode!r},回退到 Agent-Mode")
requested_mode = (
headers.get("Agent-Mode")
or headers.get("X-Agent-Mode")
or DEFAULT_AGENT_MODE
or "normal"
).strip().lower()
vision_enabled = False
requested_chat_mode = requested_mode
requested_name = headers.get("Agent-Name") or headers.get("X-Agent-Name")
if requested_name:
return requested_chat_mode, "custom", requested_name, vision_enabled
if requested_mode not in AGENT_NAMES:
print(f"未知 Agent-Mode={requested_mode!r},回退到 normal")
requested_mode = "normal"
requested_chat_mode = "vision-normal" if vision_enabled else "normal"
if requested_chat_mode in CHAT_MODE_AGENT_NAMES:
return (
requested_chat_mode,
requested_mode,
CHAT_MODE_AGENT_NAMES[requested_chat_mode],
vision_enabled,
)
return requested_chat_mode, requested_mode, AGENT_NAMES[requested_mode], vision_enabled
def _is_agent_participant(self, participant: rtc.RemoteParticipant, agent_name: str) -> bool:
identity = getattr(participant, "identity", "") or "" identity = getattr(participant, "identity", "") or ""
return identity.startswith("agent-") or AGENT_NAME in identity return identity.startswith("agent-") or agent_name in identity
def _get_agent_identities(self, session: DeviceSession) -> list[str]: def _get_agent_identities(self, session: DeviceSession) -> list[str]:
return [ return [
participant.identity participant.identity
for participant in session.room.remote_participants.values() for participant in session.room.remote_participants.values()
if self._is_agent_participant(participant) if self._is_agent_participant(participant, session.agent_name)
] ]
def _log_agent_participants(self, session: DeviceSession, source: str) -> None: def _log_agent_participants(self, session: DeviceSession, source: str) -> None:
@ -170,7 +248,7 @@ class ESP32LiveKitBridge:
dispatch_agent_name = getattr(dispatch, "agent_name", None) dispatch_agent_name = getattr(dispatch, "agent_name", None)
dispatch_room = getattr(dispatch, "room", None) dispatch_room = getattr(dispatch, "room", None)
if dispatch_room == session.room_name and ( if dispatch_room == session.room_name and (
dispatch_agent_name == AGENT_NAME or dispatch_agent_name is None dispatch_agent_name == session.agent_name or dispatch_agent_name is None
): ):
print( print(
f"检测到已有 dispatch: room={session.room_name} " f"检测到已有 dispatch: room={session.room_name} "
@ -193,7 +271,7 @@ class ESP32LiveKitBridge:
return return
for participant in session.room.remote_participants.values(): for participant in session.room.remote_participants.values():
if self._is_agent_participant(participant): if self._is_agent_participant(participant, session.agent_name):
# print(f"Agent 已在房间中,跳过 dispatch: {participant.identity}") # print(f"Agent 已在房间中,跳过 dispatch: {participant.identity}")
return return
@ -205,7 +283,7 @@ class ESP32LiveKitBridge:
await session.agent_dispatch_task await session.agent_dispatch_task
async def _dispatch_agent(self, session: DeviceSession) -> None: async def _dispatch_agent(self, session: DeviceSession) -> None:
print(f"准备 dispatch agent: room={session.room_name}, agent={AGENT_NAME}") print(f"准备 dispatch agent: room={session.room_name}, agent={session.agent_name}")
try: try:
if await self._dispatch_agent_with_sdk(session): if await self._dispatch_agent_with_sdk(session):
@ -235,13 +313,17 @@ class ESP32LiveKitBridge:
try: try:
dispatch = await lkapi.agent_dispatch.create_dispatch( dispatch = await lkapi.agent_dispatch.create_dispatch(
livekit_api.CreateAgentDispatchRequest( livekit_api.CreateAgentDispatchRequest(
agent_name=AGENT_NAME, agent_name=session.agent_name,
room=session.room_name, room=session.room_name,
metadata=json.dumps( metadata=json.dumps(
{ {
"source": "bridge_server", "source": "bridge_server",
"identity": session.identity, "identity": session.identity,
"device_id": session.device_id, "device_id": session.device_id,
"chat_mode": session.chat_mode,
"agent_mode": session.agent_mode,
"agent_name": session.agent_name,
"vision_enabled": session.vision_enabled,
} }
), ),
) )
@ -264,13 +346,17 @@ class ESP32LiveKitBridge:
"--room", "--room",
session.room_name, session.room_name,
"--agent-name", "--agent-name",
AGENT_NAME, session.agent_name,
"--metadata", "--metadata",
json.dumps( json.dumps(
{ {
"source": "bridge_server", "source": "bridge_server",
"identity": session.identity, "identity": session.identity,
"device_id": session.device_id, "device_id": session.device_id,
"chat_mode": session.chat_mode,
"agent_mode": session.agent_mode,
"agent_name": session.agent_name,
"vision_enabled": session.vision_enabled,
} }
), ),
stdout=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE,
@ -289,7 +375,7 @@ class ESP32LiveKitBridge:
print(f"lk dispatch create 失败,退出码: {process.returncode}") print(f"lk dispatch create 失败,退出码: {process.returncode}")
return False return False
print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={AGENT_NAME}") print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={session.agent_name}")
return True return True
async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool: async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool:
@ -336,6 +422,73 @@ class ESP32LiveKitBridge:
if not ok: if not ok:
print("警告: bridge 已停止 TTS但 agent 侧 interrupt 未确认送出") print("警告: bridge 已停止 TTS但 agent 侧 interrupt 未确认送出")
def _save_vision_frame(self, session: DeviceSession, image: str) -> Optional[Path]:
try:
image_bytes = base64.b64decode(image, validate=True)
except Exception as exc:
print(f"vision frame base64 解码失败: {exc}")
return None
safe_device_id = "".join(
char if char.isalnum() or char in ("-", "_") else "_"
for char in session.device_id
)
timestamp_ms = int(time.time() * 1000)
VISION_FRAME_SAVE_DIR.mkdir(parents=True, exist_ok=True)
path = VISION_FRAME_SAVE_DIR / f"{timestamp_ms}_{safe_device_id}.jpg"
path.write_bytes(image_bytes)
return path
async def _publish_vision_frame(self, session: DeviceSession, message: dict[str, Any]) -> None:
image = message.get("image")
if not isinstance(image, str) or not image:
print("收到 vision frame但 image 字段为空")
return
saved_path = self._save_vision_frame(session, image)
if saved_path is None:
return
print(f"已保存 vision frame: {saved_path}")
participant = getattr(session.room, "local_participant", None)
if participant is None:
print("跳过发送 vision framelocal participant 尚未就绪")
return
payload = {
"type": "vision_frame",
"topic": VISION_FRAME_TOPIC,
"room": session.room_name,
"identity": session.identity,
"device_id": session.device_id,
"mime_type": message.get("mime_type", "image/jpeg"),
"image": image,
"saved_path": str(saved_path),
}
data = json.dumps(payload).encode("utf-8")
agent_identities = self._get_agent_identities(session)
kwargs: dict[str, Any] = {}
if agent_identities:
kwargs["destination_identities"] = agent_identities
last_error: Optional[Exception] = None
for attempt in ({"topic": VISION_FRAME_TOPIC, **kwargs}, kwargs):
try:
await participant.publish_data(data, **attempt)
print(
f"已发送 vision frame: bytes={len(data)} "
f"targets={agent_identities or 'broadcast'}"
)
return
except TypeError as exc:
last_error = exc
except Exception as exc:
print(f"发送 vision frame 失败: {exc}")
return
if last_error is not None:
print(f"发送 vision frame 失败publish_data 签名不兼容: {last_error}")
async def _send_tts_state(self, session: DeviceSession, state: str) -> None: async def _send_tts_state(self, session: DeviceSession, state: str) -> None:
if session.websocket is None: if session.websocket is None:
print(f"跳过 tts {state}ESP32 尚未连接") print(f"跳过 tts {state}ESP32 尚未连接")
@ -343,9 +496,40 @@ class ESP32LiveKitBridge:
await session.websocket.send(json.dumps({"type": "tts", "state": state})) await session.websocket.send(json.dumps({"type": "tts", "state": state}))
print(f"已发送 tts {state}: device={session.device_id}") print(f"已发送 tts {state}: device={session.device_id}")
async def _send_emotion(self, session: DeviceSession, emotion: str) -> None:
if session.websocket is None:
print(f"跳过 emotion {emotion}ESP32 尚未连接")
return
await session.websocket.send(json.dumps({"type": "llm", "emotion": emotion}))
print(f"已发送 emotion: device={session.device_id} emotion={emotion}")
def _parse_emotion_text(self, text: str) -> tuple[Optional[str], str]:
match = EMOTION_TEXT_PATTERN.match(text)
if match is None:
return None, text.strip()
emotion, tts_text = match.groups()
return emotion.strip(), tts_text.strip()
async def _run_emotion_test_sequence(self, session: DeviceSession) -> None:
if not EMOTION_TEST_SEQUENCE:
return
for index, emotion in enumerate(EMOTION_TEST_SEQUENCE):
if session.websocket is None or session.closed:
return
if index > 0:
await asyncio.sleep(EMOTION_TEST_INTERVAL_SECONDS)
await self._send_emotion(session, emotion)
async def _send_tts_text(self, session: DeviceSession, text: str, final: bool) -> None: async def _send_tts_text(self, session: DeviceSession, text: str, final: bool) -> None:
if session.websocket is None: if session.websocket is None:
return return
raw_text = text
_emotion, text = self._parse_emotion_text(text)
if not text:
print(f"[tts->esp32] skip empty text: raw={raw_text!r} final={final}")
return
print(f"[tts->esp32] text={text!r} final={final}")
await session.websocket.send( await session.websocket.send(
json.dumps( json.dumps(
{ {
@ -421,6 +605,7 @@ class ESP32LiveKitBridge:
if not session.tts_display_text: if not session.tts_display_text:
session.tts_transcript_text = "" session.tts_transcript_text = ""
session.tts_display_final = False session.tts_display_final = False
session.tts_emotion = ""
self._cancel_tts_display_task(session) self._cancel_tts_display_task(session)
await self._send_tts_state(session, "start") await self._send_tts_state(session, "start")
session.tts_active = True session.tts_active = True
@ -435,6 +620,7 @@ class ESP32LiveKitBridge:
session.tts_transcript_text = "" session.tts_transcript_text = ""
session.tts_display_text = "" session.tts_display_text = ""
session.tts_display_final = False session.tts_display_final = False
session.tts_emotion = ""
async def _force_stop_tts(self, session: DeviceSession, reason: str) -> None: async def _force_stop_tts(self, session: DeviceSession, reason: str) -> None:
self._cancel_tts_display_task(session) self._cancel_tts_display_task(session)
@ -445,6 +631,7 @@ class ESP32LiveKitBridge:
session.tts_transcript_text = "" session.tts_transcript_text = ""
session.tts_display_text = "" session.tts_display_text = ""
session.tts_display_final = False session.tts_display_final = False
session.tts_emotion = ""
await self._send_tts_state(session, "stop") await self._send_tts_state(session, "stop")
print(f"已强制停止本地 TTS: device={session.device_id} reason={reason}") print(f"已强制停止本地 TTS: device={session.device_id} reason={reason}")
@ -609,16 +796,16 @@ class ESP32LiveKitBridge:
print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}") print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}")
self._log_agent_participants(session, "connected") self._log_agent_participants(session, "connected")
for participant in session.room.remote_participants.values(): for participant in session.room.remote_participants.values():
if self._is_agent_participant(participant): if self._is_agent_participant(participant, session.agent_name):
session.agent_ready.set() session.agent_ready.set()
self._scan_participant_audio_tracks(session, participant, "connected_scan") self._scan_participant_audio_tracks(session, participant, "connected_scan")
@session.room.on("participant_connected") @session.room.on("participant_connected")
def on_participant_connected(participant: rtc.RemoteParticipant) -> None: def on_participant_connected(participant: rtc.RemoteParticipant) -> None:
role = "Agent" if self._is_agent_participant(participant) else "Remote participant" role = "Agent" if self._is_agent_participant(participant, session.agent_name) else "Remote participant"
print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}") print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}")
self._log_agent_participants(session, "participant_connected") self._log_agent_participants(session, "participant_connected")
if self._is_agent_participant(participant): if self._is_agent_participant(participant, session.agent_name):
session.agent_ready.set() session.agent_ready.set()
self._scan_participant_audio_tracks( self._scan_participant_audio_tracks(
session, participant, "participant_connected_scan" session, participant, "participant_connected_scan"
@ -651,14 +838,26 @@ class ESP32LiveKitBridge:
track_pub: rtc.TrackPublication, track_pub: rtc.TrackPublication,
) -> None: ) -> None:
identity = participant.identity if participant else "未知" identity = participant.identity if participant else "未知"
is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(participant) is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(
participant, session.agent_name
)
for segment in segments: for segment in segments:
status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果" status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果"
print(f"🗣️ [{status} | room={session.room_name} | {identity}]: {segment.text}") print(f"🗣️ [{status} | room={session.room_name} | {identity}]: {segment.text}")
if is_agent: if is_agent:
if time.monotonic() < session.tts_suppressed_until: if time.monotonic() < session.tts_suppressed_until:
continue continue
display_text = self._current_tts_display_text(segment.text) print(f"[livekit-llm] raw={segment.text!r} final={segment.final}")
emotion, tts_text = self._parse_emotion_text(segment.text)
print(
f"[livekit-llm] parsed emotion={emotion!r} "
f"tts_text={tts_text!r} final={segment.final}"
)
if emotion and emotion != session.tts_emotion:
session.tts_emotion = emotion
asyncio.create_task(self._send_emotion(session, emotion))
display_text = self._current_tts_display_text(tts_text)
print(f"[livekit-llm] display_text={display_text!r} final={segment.final}")
if not display_text or display_text == session.tts_transcript_text: if not display_text or display_text == session.tts_transcript_text:
continue continue
session.tts_transcript_text = display_text session.tts_transcript_text = display_text
@ -713,7 +912,7 @@ class ESP32LiveKitBridge:
# print(f"[config] token_url={TOKEN_URL}") # print(f"[config] token_url={TOKEN_URL}")
# print(f"[config] room={session.room_name} identity={session.identity}") # print(f"[config] room={session.room_name} identity={session.identity}")
# print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}") # print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}")
token = await fetch_token(session.room_name, session.identity) token = await fetch_token(session.room_name, session.identity, session.agent_name)
try: try:
await session.room.connect( await session.room.connect(
@ -758,8 +957,12 @@ class ESP32LiveKitBridge:
# print(f"等待 agent 加入: room={session.room_name}") # print(f"等待 agent 加入: room={session.room_name}")
try: try:
await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS) await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS)
if session.closed:
return
# print(f"✅ agent 已就绪: room={session.room_name}") # print(f"✅ agent 已就绪: room={session.room_name}")
except asyncio.TimeoutError: except asyncio.TimeoutError:
if session.closed:
return
print(f"⚠️ agent 等待超时: room={session.room_name}") print(f"⚠️ agent 等待超时: room={session.room_name}")
async def start(self) -> None: async def start(self) -> None:
@ -767,6 +970,20 @@ class ESP32LiveKitBridge:
print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}") print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}")
print(f"[config] token_url={TOKEN_URL}") print(f"[config] token_url={TOKEN_URL}")
print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}") print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}")
print(
"[config] agents="
f"normal:{CHAT_MODE_AGENT_NAMES['normal']} "
f"beaver:{CHAT_MODE_AGENT_NAMES['beaver']} "
f"vision-normal:{CHAT_MODE_AGENT_NAMES['vision-normal']} "
f"vision-beaver:{CHAT_MODE_AGENT_NAMES['vision-beaver']} "
f"default_mode:{DEFAULT_AGENT_MODE}"
)
if EMOTION_TEST_SEQUENCE:
print(
"[config] emotion_test_sequence="
f"{','.join(EMOTION_TEST_SEQUENCE)} "
f"interval={EMOTION_TEST_INTERVAL_SECONDS}s"
)
async def close(self) -> None: async def close(self) -> None:
for session in list(self.device_sessions.values()): for session in list(self.device_sessions.values()):
@ -777,6 +994,7 @@ class ESP32LiveKitBridge:
return return
session.closed = True session.closed = True
session.websocket = None session.websocket = None
session.agent_ready.set()
session.tts_active = False session.tts_active = False
session.tts_stream_id += 1 session.tts_stream_id += 1
if session.tts_idle_task is not None: if session.tts_idle_task is not None:
@ -926,6 +1144,7 @@ class ESP32LiveKitBridge:
await self._close_session(existing_session) await self._close_session(existing_session)
self.device_sessions.pop(device_id, None) self.device_sessions.pop(device_id, None)
chat_mode, agent_mode, agent_name, vision_enabled = self._resolve_agent_selection(websocket.request.headers)
room_name, identity = self._build_session_names(device_id) room_name, identity = self._build_session_names(device_id)
session = DeviceSession( session = DeviceSession(
device_id=device_id, device_id=device_id,
@ -933,20 +1152,27 @@ class ESP32LiveKitBridge:
protocol_version=protocol_version, protocol_version=protocol_version,
room_name=room_name, room_name=room_name,
identity=identity, identity=identity,
chat_mode=chat_mode,
agent_mode=agent_mode,
agent_name=agent_name,
vision_enabled=vision_enabled,
room=rtc.Room(), room=rtc.Room(),
mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1), mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1),
agent_ready=asyncio.Event(), agent_ready=asyncio.Event(),
) )
self.device_sessions[device_id] = session self.device_sessions[device_id] = session
# print(f"ESP32 已连接: device={device_id}") print(f"ESP32 已连接: device={device_id}")
# print(f"ESP32 协议版本: {session.protocol_version}") print(f"ESP32 协议版本: {session.protocol_version}")
print(
f"ESP32 mode: chat={session.chat_mode} "
f"agent={session.agent_mode}/{session.agent_name} "
f"vision={session.vision_enabled}"
)
session.tts_stream_id += 1 session.tts_stream_id += 1
opus_decoder = None opus_decoder = None
try: try:
await self._connect_session_room(session)
hello_msg = { hello_msg = {
"type": "hello", "type": "hello",
"transport": "websocket", "transport": "websocket",
@ -962,6 +1188,10 @@ class ESP32LiveKitBridge:
}, },
} }
await websocket.send(json.dumps(hello_msg)) await websocket.send(json.dumps(hello_msg))
print(f"已发送 server hello: device={device_id} room={session.room_name}")
asyncio.create_task(self._run_emotion_test_sequence(session))
await self._connect_session_room(session)
async for message in websocket: async for message in websocket:
if isinstance(message, bytes): if isinstance(message, bytes):
@ -1017,6 +1247,8 @@ class ESP32LiveKitBridge:
abort_reason = reason if isinstance(reason, str) and reason else "button_abort" abort_reason = reason if isinstance(reason, str) and reason else "button_abort"
print(f"处理 ESP32 打断请求: reason={abort_reason}") print(f"处理 ESP32 打断请求: reason={abort_reason}")
await self._abort_tts(session, abort_reason) await self._abort_tts(session, abort_reason)
elif msg_type == "vision" and data.get("state") == "frame":
await self._publish_vision_frame(session, data)
except json.JSONDecodeError: except json.JSONDecodeError:
print(f"收到未知的字符消息: {message}") print(f"收到未知的字符消息: {message}")
except ConnectionClosedError as exc: except ConnectionClosedError as exc:

View File

@ -1,9 +1,22 @@
#include "protocol.h" #include "protocol.h"
#include <esp_log.h> #include <esp_log.h>
#include <mbedtls/base64.h>
#define TAG "Protocol" #define TAG "Protocol"
static std::string Base64Encode(const std::string& data) {
size_t encoded_length = 0;
size_t output_length = 0;
mbedtls_base64_encode(nullptr, 0, &encoded_length,
reinterpret_cast<const unsigned char*>(data.data()), data.size());
std::string result(encoded_length, 0);
mbedtls_base64_encode(reinterpret_cast<unsigned char*>(result.data()), result.size(), &output_length,
reinterpret_cast<const unsigned char*>(data.data()), data.size());
result.resize(output_length);
return result;
}
void Protocol::OnIncomingJson(std::function<void(const cJSON* root)> callback) { void Protocol::OnIncomingJson(std::function<void(const cJSON* root)> callback) {
on_incoming_json_ = callback; on_incoming_json_ = callback;
} }
@ -78,6 +91,27 @@ void Protocol::SendMcpMessage(const std::string& payload) {
SendText(message); SendText(message);
} }
void Protocol::SendVisionFrame(const std::string& jpeg_data) {
if (jpeg_data.empty()) {
return;
}
cJSON* root = cJSON_CreateObject();
cJSON_AddStringToObject(root, "session_id", session_id_.c_str());
cJSON_AddStringToObject(root, "type", "vision");
cJSON_AddStringToObject(root, "state", "frame");
cJSON_AddStringToObject(root, "mime_type", "image/jpeg");
auto encoded = Base64Encode(jpeg_data);
cJSON_AddStringToObject(root, "image", encoded.c_str());
char* json_str = cJSON_PrintUnformatted(root);
if (json_str != nullptr) {
SendText(json_str);
cJSON_free(json_str);
}
cJSON_Delete(root);
}
bool Protocol::IsTimeout() const { bool Protocol::IsTimeout() const {
const int kTimeoutSeconds = 120; const int kTimeoutSeconds = 120;
auto now = std::chrono::steady_clock::now(); auto now = std::chrono::steady_clock::now();

View File

@ -73,6 +73,7 @@ public:
virtual void SendStopListening(); virtual void SendStopListening();
virtual void SendAbortSpeaking(AbortReason reason); virtual void SendAbortSpeaking(AbortReason reason);
virtual void SendMcpMessage(const std::string& message); virtual void SendMcpMessage(const std::string& message);
virtual void SendVisionFrame(const std::string& jpeg_data);
protected: protected:
std::function<void(const cJSON* root)> on_incoming_json_; std::function<void(const cJSON* root)> on_incoming_json_;
@ -95,4 +96,3 @@ protected:
}; };
#endif // PROTOCOL_H #endif // PROTOCOL_H

View File

@ -119,6 +119,8 @@ bool WebsocketProtocol::OpenAudioChannel() {
websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str()); websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str());
websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str()); websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str()); websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
websocket_->SetHeader("Agent-Mode", Application::GetInstance().GetChatAgentModeName());
websocket_->SetHeader("Chat-Mode", Application::GetInstance().GetChatModeName());
websocket_->OnData([this](const char* data, size_t len, bool binary) { websocket_->OnData([this](const char* data, size_t len, bool binary) {
if (binary) { if (binary) {