Compare commits
5 Commits
ws
...
9637e09aef
| Author | SHA1 | Date | |
|---|---|---|---|
| 9637e09aef | |||
| b92e6e1b07 | |||
| 33ee598c21 | |||
| 37343ac0fe | |||
| fc6302661d |
1
.gitignore
vendored
1
.gitignore
vendored
@ -10,6 +10,7 @@ sdkconfig
|
|||||||
dependencies.lock
|
dependencies.lock
|
||||||
.env
|
.env
|
||||||
releases/
|
releases/
|
||||||
|
vision_frames/
|
||||||
main/assets/lang_config.h
|
main/assets/lang_config.h
|
||||||
main/mmap_generate_emoji.h
|
main/mmap_generate_emoji.h
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
|||||||
@ -15,7 +15,7 @@ config USE_DIRECT_WEBSOCKET
|
|||||||
config WEBSOCKET_URL
|
config WEBSOCKET_URL
|
||||||
string "Default WebSocket URL"
|
string "Default WebSocket URL"
|
||||||
depends on USE_DIRECT_WEBSOCKET
|
depends on USE_DIRECT_WEBSOCKET
|
||||||
default "ws://10.6.80.130:8080"
|
default "ws://172.19.0.240:8080"
|
||||||
help
|
help
|
||||||
The WebSocket server URL used when direct WebSocket mode is enabled.
|
The WebSocket server URL used when direct WebSocket mode is enabled.
|
||||||
|
|
||||||
|
|||||||
@ -81,6 +81,7 @@ void Application::Initialize() {
|
|||||||
xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED);
|
xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED);
|
||||||
};
|
};
|
||||||
callbacks.on_vad_change = [this](bool speaking) {
|
callbacks.on_vad_change = [this](bool speaking) {
|
||||||
|
vad_speaking_.store(speaking);
|
||||||
xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE);
|
xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE);
|
||||||
};
|
};
|
||||||
audio_service_.SetCallbacks(callbacks);
|
audio_service_.SetCallbacks(callbacks);
|
||||||
@ -233,6 +234,13 @@ void Application::Run() {
|
|||||||
if (GetDeviceState() == kDeviceStateListening) {
|
if (GetDeviceState() == kDeviceStateListening) {
|
||||||
auto led = Board::GetInstance().GetLed();
|
auto led = Board::GetInstance().GetLed();
|
||||||
led->OnStateChanged();
|
led->OnStateChanged();
|
||||||
|
|
||||||
|
if (vad_speaking_.load() && vision_text_mode_enabled_.load() &&
|
||||||
|
!vision_frame_sent_for_current_listen_.exchange(true)) {
|
||||||
|
if (!SendCurrentVisionFrame()) {
|
||||||
|
vision_frame_sent_for_current_listen_.store(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -525,6 +533,9 @@ void Application::InitializeProtocol() {
|
|||||||
protocol_->OnAudioChannelClosed([this, &board]() {
|
protocol_->OnAudioChannelClosed([this, &board]() {
|
||||||
board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);
|
board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);
|
||||||
Schedule([this]() {
|
Schedule([this]() {
|
||||||
|
if (GetDeviceState() == kDeviceStateConnecting) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
auto display = Board::GetInstance().GetDisplay();
|
auto display = Board::GetInstance().GetDisplay();
|
||||||
display->SetChatMessage("system", "");
|
display->SetChatMessage("system", "");
|
||||||
SetDeviceState(kDeviceStateIdle);
|
SetDeviceState(kDeviceStateIdle);
|
||||||
@ -673,10 +684,39 @@ void Application::DismissAlert() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void Application::ToggleChatState() {
|
void Application::ToggleChatState() {
|
||||||
|
ToggleChatStateForMode(kChatAgentModeNormal, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Application::ToggleChatStateWithVision() {
|
||||||
|
ToggleChatStateForMode(kChatAgentModeNormal, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Application::ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled) {
|
||||||
|
chat_agent_mode_.store(agent_mode);
|
||||||
|
vision_text_mode_enabled_.store(vision_enabled);
|
||||||
|
vision_frame_sent_for_current_listen_.store(false);
|
||||||
xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
|
xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Application::IsVisionTextModeEnabled() const {
|
||||||
|
return vision_text_mode_enabled_.load();
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* Application::GetChatAgentModeName() const {
|
||||||
|
return chat_agent_mode_.load() == kChatAgentModeBeaver ? "beaver" : "normal";
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* Application::GetChatModeName() const {
|
||||||
|
bool vision_enabled = vision_text_mode_enabled_.load();
|
||||||
|
if (chat_agent_mode_.load() == kChatAgentModeBeaver) {
|
||||||
|
return vision_enabled ? "vision-beaver" : "beaver";
|
||||||
|
}
|
||||||
|
return vision_enabled ? "vision-normal" : "normal";
|
||||||
|
}
|
||||||
|
|
||||||
void Application::StartListening() {
|
void Application::StartListening() {
|
||||||
|
vision_text_mode_enabled_.store(false);
|
||||||
|
vision_frame_sent_for_current_listen_.store(false);
|
||||||
xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING);
|
xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -707,7 +747,12 @@ void Application::HandleToggleChatEvent() {
|
|||||||
|
|
||||||
if (state == kDeviceStateIdle) {
|
if (state == kDeviceStateIdle) {
|
||||||
ListeningMode mode = GetDefaultListeningMode();
|
ListeningMode mode = GetDefaultListeningMode();
|
||||||
if (!protocol_->IsAudioChannelOpened()) {
|
bool agent_mode_changed = chat_agent_mode_.load() != active_chat_agent_mode_.load();
|
||||||
|
bool vision_mode_changed = vision_text_mode_enabled_.load() != active_vision_text_mode_enabled_.load();
|
||||||
|
if (!protocol_->IsAudioChannelOpened() || agent_mode_changed || vision_mode_changed) {
|
||||||
|
if (protocol_->IsAudioChannelOpened()) {
|
||||||
|
protocol_->CloseAudioChannel();
|
||||||
|
}
|
||||||
SetDeviceState(kDeviceStateConnecting);
|
SetDeviceState(kDeviceStateConnecting);
|
||||||
// Schedule to let the state change be processed first (UI update)
|
// Schedule to let the state change be processed first (UI update)
|
||||||
Schedule([this, mode]() {
|
Schedule([this, mode]() {
|
||||||
@ -739,6 +784,8 @@ void Application::ContinueOpenAudioChannel(ListeningMode mode) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
active_chat_agent_mode_.store(chat_agent_mode_.load());
|
||||||
|
active_vision_text_mode_enabled_.store(vision_text_mode_enabled_.load());
|
||||||
SetListeningMode(mode);
|
SetListeningMode(mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -882,6 +929,7 @@ void Application::HandleStateChangedEvent() {
|
|||||||
switch (new_state) {
|
switch (new_state) {
|
||||||
case kDeviceStateUnknown:
|
case kDeviceStateUnknown:
|
||||||
case kDeviceStateIdle:
|
case kDeviceStateIdle:
|
||||||
|
vision_frame_sent_for_current_listen_.store(false);
|
||||||
display->SetStatus(Lang::Strings::STANDBY);
|
display->SetStatus(Lang::Strings::STANDBY);
|
||||||
display->ClearChatMessages(); // Clear messages first
|
display->ClearChatMessages(); // Clear messages first
|
||||||
display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count)
|
display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count)
|
||||||
@ -894,6 +942,8 @@ void Application::HandleStateChangedEvent() {
|
|||||||
display->SetChatMessage("system", "");
|
display->SetChatMessage("system", "");
|
||||||
break;
|
break;
|
||||||
case kDeviceStateListening:
|
case kDeviceStateListening:
|
||||||
|
vad_speaking_.store(false);
|
||||||
|
vision_frame_sent_for_current_listen_.store(false);
|
||||||
display->SetStatus(Lang::Strings::LISTENING);
|
display->SetStatus(Lang::Strings::LISTENING);
|
||||||
display->SetEmotion("neutral");
|
display->SetEmotion("neutral");
|
||||||
|
|
||||||
@ -944,6 +994,27 @@ void Application::HandleStateChangedEvent() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Application::SendCurrentVisionFrame() {
|
||||||
|
if (!protocol_ || !protocol_->IsAudioChannelOpened()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto camera = Board::GetInstance().GetCamera();
|
||||||
|
if (camera == nullptr) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string jpeg_data;
|
||||||
|
if (!camera->CaptureToJpeg(jpeg_data, true)) {
|
||||||
|
ESP_LOGW(TAG, "Failed to capture vision frame");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
protocol_->SendVisionFrame(jpeg_data);
|
||||||
|
ESP_LOGI(TAG, "Sent vision frame, size=%u bytes", static_cast<unsigned>(jpeg_data.size()));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
void Application::Schedule(std::function<void()>&& callback) {
|
void Application::Schedule(std::function<void()>&& callback) {
|
||||||
{
|
{
|
||||||
std::lock_guard<std::mutex> lock(mutex_);
|
std::lock_guard<std::mutex> lock(mutex_);
|
||||||
@ -962,6 +1033,8 @@ void Application::AbortSpeaking(AbortReason reason) {
|
|||||||
|
|
||||||
void Application::SetListeningMode(ListeningMode mode) {
|
void Application::SetListeningMode(ListeningMode mode) {
|
||||||
listening_mode_ = mode;
|
listening_mode_ = mode;
|
||||||
|
vad_speaking_.store(false);
|
||||||
|
vision_frame_sent_for_current_listen_.store(false);
|
||||||
SetDeviceState(kDeviceStateListening);
|
SetDeviceState(kDeviceStateListening);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -11,6 +11,7 @@
|
|||||||
#include <deque>
|
#include <deque>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
#include <atomic>
|
||||||
|
|
||||||
#include "protocol.h"
|
#include "protocol.h"
|
||||||
#include "ota.h"
|
#include "ota.h"
|
||||||
@ -40,6 +41,11 @@ enum AecMode {
|
|||||||
kAecOnServerSide,
|
kAecOnServerSide,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum ChatAgentMode {
|
||||||
|
kChatAgentModeNormal,
|
||||||
|
kChatAgentModeBeaver,
|
||||||
|
};
|
||||||
|
|
||||||
class Application {
|
class Application {
|
||||||
public:
|
public:
|
||||||
static Application& GetInstance() {
|
static Application& GetInstance() {
|
||||||
@ -91,6 +97,12 @@ public:
|
|||||||
* Sends MAIN_EVENT_TOGGLE_CHAT to be handled in Run()
|
* Sends MAIN_EVENT_TOGGLE_CHAT to be handled in Run()
|
||||||
*/
|
*/
|
||||||
void ToggleChatState();
|
void ToggleChatState();
|
||||||
|
void ToggleChatStateWithVision();
|
||||||
|
void ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled);
|
||||||
|
bool IsVisionTextModeEnabled() const;
|
||||||
|
ChatAgentMode GetChatAgentMode() const { return chat_agent_mode_.load(); }
|
||||||
|
const char* GetChatAgentModeName() const;
|
||||||
|
const char* GetChatModeName() const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Start listening (event-based, thread-safe)
|
* Start listening (event-based, thread-safe)
|
||||||
@ -144,6 +156,12 @@ private:
|
|||||||
bool aborted_ = false;
|
bool aborted_ = false;
|
||||||
bool assets_version_checked_ = false;
|
bool assets_version_checked_ = false;
|
||||||
bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening
|
bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening
|
||||||
|
std::atomic<ChatAgentMode> chat_agent_mode_ = kChatAgentModeNormal;
|
||||||
|
std::atomic<ChatAgentMode> active_chat_agent_mode_ = kChatAgentModeNormal;
|
||||||
|
std::atomic<bool> vision_text_mode_enabled_ = false;
|
||||||
|
std::atomic<bool> active_vision_text_mode_enabled_ = false;
|
||||||
|
std::atomic<bool> vad_speaking_ = false;
|
||||||
|
std::atomic<bool> vision_frame_sent_for_current_listen_ = false;
|
||||||
int clock_ticks_ = 0;
|
int clock_ticks_ = 0;
|
||||||
TaskHandle_t activation_task_handle_ = nullptr;
|
TaskHandle_t activation_task_handle_ = nullptr;
|
||||||
|
|
||||||
@ -159,6 +177,7 @@ private:
|
|||||||
void HandleWakeWordDetectedEvent();
|
void HandleWakeWordDetectedEvent();
|
||||||
void ContinueOpenAudioChannel(ListeningMode mode);
|
void ContinueOpenAudioChannel(ListeningMode mode);
|
||||||
void ContinueWakeWordInvoke(const std::string& wake_word);
|
void ContinueWakeWordInvoke(const std::string& wake_word);
|
||||||
|
bool SendCurrentVisionFrame();
|
||||||
|
|
||||||
// Activation task (runs in background)
|
// Activation task (runs in background)
|
||||||
void ActivationTask();
|
void ActivationTask();
|
||||||
|
|||||||
@ -7,6 +7,8 @@ class Camera {
|
|||||||
public:
|
public:
|
||||||
virtual void SetExplainUrl(const std::string& url, const std::string& token) = 0;
|
virtual void SetExplainUrl(const std::string& url, const std::string& token) = 0;
|
||||||
virtual bool Capture() = 0;
|
virtual bool Capture() = 0;
|
||||||
|
virtual bool CaptureBackground() { return Capture(); }
|
||||||
|
virtual bool CaptureToJpeg(std::string& jpeg_data, bool show_preview = false) { return false; }
|
||||||
virtual bool SetHMirror(bool enabled) = 0;
|
virtual bool SetHMirror(bool enabled) = 0;
|
||||||
virtual bool SetVFlip(bool enabled) = 0;
|
virtual bool SetVFlip(bool enabled) = 0;
|
||||||
virtual bool SetSwapBytes(bool enabled) { return false; } // Optional, default no-op
|
virtual bool SetSwapBytes(bool enabled) { return false; } // Optional, default no-op
|
||||||
|
|||||||
@ -24,6 +24,7 @@
|
|||||||
#include "lvgl_display.h"
|
#include "lvgl_display.h"
|
||||||
#include "mcp_server.h"
|
#include "mcp_server.h"
|
||||||
#include "system_info.h"
|
#include "system_info.h"
|
||||||
|
#include "esp_timer.h"
|
||||||
|
|
||||||
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
|
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
|
||||||
#undef LOG_LOCAL_LEVEL
|
#undef LOG_LOCAL_LEVEL
|
||||||
@ -55,6 +56,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#define TAG "EspVideo"
|
#define TAG "EspVideo"
|
||||||
|
#define FOREGROUND_CAPTURE_PROTECTION_US (10 * 1000 * 1000)
|
||||||
|
|
||||||
#if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP)
|
#if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP)
|
||||||
#warning \
|
#warning \
|
||||||
@ -381,11 +383,47 @@ EspVideo::~EspVideo() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void EspVideo::SetExplainUrl(const std::string& url, const std::string& token) {
|
void EspVideo::SetExplainUrl(const std::string& url, const std::string& token) {
|
||||||
|
std::lock_guard<std::mutex> lock(frame_mutex_);
|
||||||
explain_url_ = url;
|
explain_url_ = url;
|
||||||
explain_token_ = token;
|
explain_token_ = token;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool EspVideo::Capture() {
|
bool EspVideo::Capture() {
|
||||||
|
return CaptureFrame(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool EspVideo::CaptureBackground() {
|
||||||
|
return CaptureFrame(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool EspVideo::CaptureToJpeg(std::string& jpeg_data, bool show_preview) {
|
||||||
|
jpeg_data.clear();
|
||||||
|
if (!CaptureFrame(show_preview)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::lock_guard<std::mutex> lock(frame_mutex_);
|
||||||
|
if (frame_.data == nullptr || frame_.len == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint16_t w = frame_.width ? frame_.width : 320;
|
||||||
|
uint16_t h = frame_.height ? frame_.height : 240;
|
||||||
|
return image_to_jpeg_cb(
|
||||||
|
frame_.data, frame_.len, w, h, frame_.format, 60,
|
||||||
|
[](void* arg, size_t index, const void* data, size_t len) -> size_t {
|
||||||
|
auto jpeg_data = static_cast<std::string*>(arg);
|
||||||
|
if (data != nullptr && len > 0) {
|
||||||
|
jpeg_data->append(static_cast<const char*>(data), len);
|
||||||
|
}
|
||||||
|
return len;
|
||||||
|
},
|
||||||
|
&jpeg_data);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool EspVideo::CaptureFrame(bool show_preview) {
|
||||||
|
std::lock_guard<std::mutex> lock(frame_mutex_);
|
||||||
|
|
||||||
if (encoder_thread_.joinable()) {
|
if (encoder_thread_.joinable()) {
|
||||||
encoder_thread_.join();
|
encoder_thread_.join();
|
||||||
}
|
}
|
||||||
@ -394,6 +432,10 @@ bool EspVideo::Capture() {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!show_preview && esp_timer_get_time() < foreground_capture_protected_until_us_) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < 3; i++) {
|
for (int i = 0; i < 3; i++) {
|
||||||
struct v4l2_buffer buf = {};
|
struct v4l2_buffer buf = {};
|
||||||
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
|
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
|
||||||
@ -729,6 +771,11 @@ bool EspVideo::Capture() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (show_preview) {
|
||||||
|
foreground_capture_protected_until_us_ = esp_timer_get_time() + FOREGROUND_CAPTURE_PROTECTION_US;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (show_preview) {
|
||||||
// 显示预览图片
|
// 显示预览图片
|
||||||
auto display = dynamic_cast<LvglDisplay*>(Board::GetInstance().GetDisplay());
|
auto display = dynamic_cast<LvglDisplay*>(Board::GetInstance().GetDisplay());
|
||||||
if (display != nullptr) {
|
if (display != nullptr) {
|
||||||
@ -837,6 +884,7 @@ bool EspVideo::Capture() {
|
|||||||
auto image = std::make_unique<LvglAllocatedImage>(data, lvgl_image_size, w, h, stride, color_format);
|
auto image = std::make_unique<LvglAllocatedImage>(data, lvgl_image_size, w, h, stride, color_format);
|
||||||
display->SetPreviewImage(std::move(image));
|
display->SetPreviewImage(std::move(image));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -898,10 +946,16 @@ bool EspVideo::SetVFlip(bool enabled) {
|
|||||||
* @warning 如果摄像头缓冲区为空或网络连接失败,将返回错误信息
|
* @warning 如果摄像头缓冲区为空或网络连接失败,将返回错误信息
|
||||||
*/
|
*/
|
||||||
std::string EspVideo::Explain(const std::string& question) {
|
std::string EspVideo::Explain(const std::string& question) {
|
||||||
|
std::lock_guard<std::mutex> lock(frame_mutex_);
|
||||||
|
|
||||||
if (explain_url_.empty()) {
|
if (explain_url_.empty()) {
|
||||||
throw std::runtime_error("Image explain URL or token is not set");
|
throw std::runtime_error("Image explain URL or token is not set");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (frame_.data == nullptr || frame_.len == 0) {
|
||||||
|
throw std::runtime_error("No camera frame captured");
|
||||||
|
}
|
||||||
|
|
||||||
// 创建局部的 JPEG 队列, 40 entries is about to store 512 * 40 = 20480 bytes of JPEG data
|
// 创建局部的 JPEG 队列, 40 entries is about to store 512 * 40 = 20480 bytes of JPEG data
|
||||||
QueueHandle_t jpeg_queue = xQueueCreate(40, sizeof(JpegChunk));
|
QueueHandle_t jpeg_queue = xQueueCreate(40, sizeof(JpegChunk));
|
||||||
if (jpeg_queue == nullptr) {
|
if (jpeg_queue == nullptr) {
|
||||||
|
|||||||
@ -5,6 +5,8 @@
|
|||||||
#include <thread>
|
#include <thread>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <mutex>
|
||||||
|
#include <cstdint>
|
||||||
|
|
||||||
#include <freertos/FreeRTOS.h>
|
#include <freertos/FreeRTOS.h>
|
||||||
#include <freertos/queue.h>
|
#include <freertos/queue.h>
|
||||||
@ -39,6 +41,10 @@ private:
|
|||||||
std::string explain_url_;
|
std::string explain_url_;
|
||||||
std::string explain_token_;
|
std::string explain_token_;
|
||||||
std::thread encoder_thread_;
|
std::thread encoder_thread_;
|
||||||
|
std::mutex frame_mutex_;
|
||||||
|
int64_t foreground_capture_protected_until_us_ = 0;
|
||||||
|
|
||||||
|
bool CaptureFrame(bool show_preview);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
EspVideo(const esp_video_init_config_t& config);
|
EspVideo(const esp_video_init_config_t& config);
|
||||||
@ -46,6 +52,8 @@ public:
|
|||||||
|
|
||||||
virtual void SetExplainUrl(const std::string& url, const std::string& token);
|
virtual void SetExplainUrl(const std::string& url, const std::string& token);
|
||||||
virtual bool Capture();
|
virtual bool Capture();
|
||||||
|
virtual bool CaptureBackground() override;
|
||||||
|
virtual bool CaptureToJpeg(std::string& jpeg_data, bool show_preview = false) override;
|
||||||
// 翻转控制函数
|
// 翻转控制函数
|
||||||
virtual bool SetHMirror(bool enabled) override;
|
virtual bool SetHMirror(bool enabled) override;
|
||||||
virtual bool SetVFlip(bool enabled) override;
|
virtual bool SetVFlip(bool enabled) override;
|
||||||
|
|||||||
@ -1,21 +1,23 @@
|
|||||||
#include "wifi_board.h"
|
#include "application.h"
|
||||||
|
#include "axp2101.h"
|
||||||
|
#include "config.h"
|
||||||
#include "cores3_audio_codec.h"
|
#include "cores3_audio_codec.h"
|
||||||
#include "display/lcd_display.h"
|
#include "display/lcd_display.h"
|
||||||
#include "application.h"
|
|
||||||
#include "config.h"
|
|
||||||
#include "power_save_timer.h"
|
|
||||||
#include "i2c_device.h"
|
#include "i2c_device.h"
|
||||||
#include "axp2101.h"
|
#include "power_save_timer.h"
|
||||||
|
#include "wifi_board.h"
|
||||||
|
|
||||||
#include <esp_log.h>
|
|
||||||
#include <driver/i2c_master.h>
|
#include <driver/i2c_master.h>
|
||||||
|
#include <esp_lcd_ili9341.h>
|
||||||
#include <esp_lcd_panel_io.h>
|
#include <esp_lcd_panel_io.h>
|
||||||
#include <esp_lcd_panel_ops.h>
|
#include <esp_lcd_panel_ops.h>
|
||||||
#include <esp_lcd_ili9341.h>
|
#include <esp_log.h>
|
||||||
#include <esp_timer.h>
|
#include <esp_timer.h>
|
||||||
#include "esp_video.h"
|
#include "esp_video.h"
|
||||||
|
|
||||||
#define TAG "M5StackCoreS3Board"
|
#define TAG "M5StackCoreS3Board"
|
||||||
|
#define BACKGROUND_VISION_INITIAL_DELAY_MS 8000
|
||||||
|
#define BACKGROUND_VISION_SAMPLE_INTERVAL_MS 100
|
||||||
|
|
||||||
class Pmic : public Axp2101 {
|
class Pmic : public Axp2101 {
|
||||||
public:
|
public:
|
||||||
@ -96,9 +98,7 @@ public:
|
|||||||
read_buffer_ = new uint8_t[6];
|
read_buffer_ = new uint8_t[6];
|
||||||
}
|
}
|
||||||
|
|
||||||
~Ft6336() {
|
~Ft6336() { delete[] read_buffer_; }
|
||||||
delete[] read_buffer_;
|
|
||||||
}
|
|
||||||
|
|
||||||
void UpdateTouchPoint() {
|
void UpdateTouchPoint() {
|
||||||
ReadRegs(0x02, read_buffer_, 6);
|
ReadRegs(0x02, read_buffer_, 6);
|
||||||
@ -107,9 +107,7 @@ public:
|
|||||||
tp_.y = ((read_buffer_[3] & 0x0F) << 8) | read_buffer_[4];
|
tp_.y = ((read_buffer_[3] & 0x0F) << 8) | read_buffer_[4];
|
||||||
}
|
}
|
||||||
|
|
||||||
inline const TouchPoint_t& GetTouchPoint() {
|
inline const TouchPoint_t& GetTouchPoint() { return tp_; }
|
||||||
return tp_;
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
uint8_t* read_buffer_ = nullptr;
|
uint8_t* read_buffer_ = nullptr;
|
||||||
@ -137,9 +135,7 @@ private:
|
|||||||
GetDisplay()->SetPowerSaveMode(false);
|
GetDisplay()->SetPowerSaveMode(false);
|
||||||
GetBacklight()->RestoreBrightness();
|
GetBacklight()->RestoreBrightness();
|
||||||
});
|
});
|
||||||
power_save_timer_->OnShutdownRequest([this]() {
|
power_save_timer_->OnShutdownRequest([this]() { pmic_->PowerOff(); });
|
||||||
pmic_->PowerOff();
|
|
||||||
});
|
|
||||||
power_save_timer_->SetEnabled(true);
|
power_save_timer_->SetEnabled(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -153,7 +149,8 @@ private:
|
|||||||
.glitch_ignore_cnt = 7,
|
.glitch_ignore_cnt = 7,
|
||||||
.intr_priority = 0,
|
.intr_priority = 0,
|
||||||
.trans_queue_depth = 0,
|
.trans_queue_depth = 0,
|
||||||
.flags = {
|
.flags =
|
||||||
|
{
|
||||||
.enable_internal_pullup = 1,
|
.enable_internal_pullup = 1,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
@ -195,6 +192,7 @@ private:
|
|||||||
void PollTouchpad() {
|
void PollTouchpad() {
|
||||||
static bool was_touched = false;
|
static bool was_touched = false;
|
||||||
static int64_t touch_start_time = 0;
|
static int64_t touch_start_time = 0;
|
||||||
|
static int touch_start_x = -1;
|
||||||
const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值,超过500ms视为长按
|
const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值,超过500ms视为长按
|
||||||
|
|
||||||
ft6336_->UpdateTouchPoint();
|
ft6336_->UpdateTouchPoint();
|
||||||
@ -204,20 +202,27 @@ private:
|
|||||||
if (touch_point.num > 0 && !was_touched) {
|
if (touch_point.num > 0 && !was_touched) {
|
||||||
was_touched = true;
|
was_touched = true;
|
||||||
touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒
|
touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒
|
||||||
|
touch_start_x = touch_point.x;
|
||||||
}
|
}
|
||||||
// 检测触摸释放
|
// 检测触摸释放
|
||||||
else if (touch_point.num == 0 && was_touched) {
|
else if (touch_point.num == 0 && was_touched) {
|
||||||
was_touched = false;
|
was_touched = false;
|
||||||
int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time;
|
int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time;
|
||||||
|
bool beaver_mode = touch_start_x >= DISPLAY_WIDTH / 2;
|
||||||
|
auto agent_mode = beaver_mode ? kChatAgentModeBeaver : kChatAgentModeNormal;
|
||||||
|
|
||||||
// 只有短触才触发
|
|
||||||
if (touch_duration < TOUCH_THRESHOLD_MS) {
|
if (touch_duration < TOUCH_THRESHOLD_MS) {
|
||||||
auto& app = Application::GetInstance();
|
auto& app = Application::GetInstance();
|
||||||
if (app.GetDeviceState() == kDeviceStateStarting) {
|
if (app.GetDeviceState() == kDeviceStateStarting) {
|
||||||
EnterWifiConfigMode();
|
EnterWifiConfigMode();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
app.ToggleChatState();
|
ESP_LOGI(TAG, "Touch short: %s text-only mode", beaver_mode ? "beaver" : "normal");
|
||||||
|
app.ToggleChatStateForMode(agent_mode, false);
|
||||||
|
} else {
|
||||||
|
auto& app = Application::GetInstance();
|
||||||
|
ESP_LOGI(TAG, "Touch long: %s vision+text mode", beaver_mode ? "beaver" : "normal");
|
||||||
|
app.ToggleChatStateForMode(agent_mode, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -228,7 +233,8 @@ private:
|
|||||||
|
|
||||||
// 创建定时器,20ms 间隔
|
// 创建定时器,20ms 间隔
|
||||||
esp_timer_create_args_t timer_args = {
|
esp_timer_create_args_t timer_args = {
|
||||||
.callback = [](void* arg) {
|
.callback =
|
||||||
|
[](void* arg) {
|
||||||
M5StackCoreS3Board* board = (M5StackCoreS3Board*)arg;
|
M5StackCoreS3Board* board = (M5StackCoreS3Board*)arg;
|
||||||
board->PollTouchpad();
|
board->PollTouchpad();
|
||||||
},
|
},
|
||||||
@ -285,14 +291,16 @@ private:
|
|||||||
esp_lcd_panel_swap_xy(panel, DISPLAY_SWAP_XY);
|
esp_lcd_panel_swap_xy(panel, DISPLAY_SWAP_XY);
|
||||||
esp_lcd_panel_mirror(panel, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y);
|
esp_lcd_panel_mirror(panel, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y);
|
||||||
|
|
||||||
display_ = new SpiLcdDisplay(panel_io, panel,
|
display_ = new SpiLcdDisplay(panel_io, panel, DISPLAY_WIDTH, DISPLAY_HEIGHT,
|
||||||
DISPLAY_WIDTH, DISPLAY_HEIGHT, DISPLAY_OFFSET_X, DISPLAY_OFFSET_Y, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y, DISPLAY_SWAP_XY);
|
DISPLAY_OFFSET_X, DISPLAY_OFFSET_Y, DISPLAY_MIRROR_X,
|
||||||
|
DISPLAY_MIRROR_Y, DISPLAY_SWAP_XY);
|
||||||
}
|
}
|
||||||
|
|
||||||
void InitializeCamera() {
|
void InitializeCamera() {
|
||||||
static esp_cam_ctlr_dvp_pin_config_t dvp_pin_config = {
|
static esp_cam_ctlr_dvp_pin_config_t dvp_pin_config = {
|
||||||
.data_width = CAM_CTLR_DATA_WIDTH_8,
|
.data_width = CAM_CTLR_DATA_WIDTH_8,
|
||||||
.data_io = {
|
.data_io =
|
||||||
|
{
|
||||||
[0] = CAMERA_PIN_D0,
|
[0] = CAMERA_PIN_D0,
|
||||||
[1] = CAMERA_PIN_D1,
|
[1] = CAMERA_PIN_D1,
|
||||||
[2] = CAMERA_PIN_D2,
|
[2] = CAMERA_PIN_D2,
|
||||||
@ -330,6 +338,42 @@ private:
|
|||||||
camera_->SetHMirror(false);
|
camera_->SetHMirror(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void InitializeBackgroundVisionSampler() {
|
||||||
|
xTaskCreate(
|
||||||
|
[](void* arg) {
|
||||||
|
auto board = static_cast<M5StackCoreS3Board*>(arg);
|
||||||
|
bool has_logged_success = false;
|
||||||
|
bool has_logged_failure = false;
|
||||||
|
|
||||||
|
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_INITIAL_DELAY_MS));
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
if (!Application::GetInstance().IsVisionTextModeEnabled()) {
|
||||||
|
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (board->camera_ == nullptr) {
|
||||||
|
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (board->camera_->Capture()) {
|
||||||
|
if (!has_logged_success) {
|
||||||
|
ESP_LOGI(TAG, "Vision preview sampler started");
|
||||||
|
has_logged_success = true;
|
||||||
|
}
|
||||||
|
} else if (!has_logged_failure) {
|
||||||
|
ESP_LOGW(TAG, "Vision preview sampler is waiting for camera");
|
||||||
|
has_logged_failure = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"BgVisionSampler", 4096, this, 1, nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
M5StackCoreS3Board() {
|
M5StackCoreS3Board() {
|
||||||
InitializePowerSaveTimer();
|
InitializePowerSaveTimer();
|
||||||
@ -340,32 +384,22 @@ public:
|
|||||||
InitializeSpi();
|
InitializeSpi();
|
||||||
InitializeIli9342Display();
|
InitializeIli9342Display();
|
||||||
InitializeCamera();
|
InitializeCamera();
|
||||||
|
InitializeBackgroundVisionSampler();
|
||||||
InitializeFt6336TouchPad();
|
InitializeFt6336TouchPad();
|
||||||
GetBacklight()->RestoreBrightness();
|
GetBacklight()->RestoreBrightness();
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual AudioCodec* GetAudioCodec() override {
|
virtual AudioCodec* GetAudioCodec() override {
|
||||||
static CoreS3AudioCodec audio_codec(i2c_bus_,
|
static CoreS3AudioCodec audio_codec(
|
||||||
AUDIO_INPUT_SAMPLE_RATE,
|
i2c_bus_, AUDIO_INPUT_SAMPLE_RATE, AUDIO_OUTPUT_SAMPLE_RATE, AUDIO_I2S_GPIO_MCLK,
|
||||||
AUDIO_OUTPUT_SAMPLE_RATE,
|
AUDIO_I2S_GPIO_BCLK, AUDIO_I2S_GPIO_WS, AUDIO_I2S_GPIO_DOUT, AUDIO_I2S_GPIO_DIN,
|
||||||
AUDIO_I2S_GPIO_MCLK,
|
AUDIO_CODEC_AW88298_ADDR, AUDIO_CODEC_ES7210_ADDR, AUDIO_INPUT_REFERENCE);
|
||||||
AUDIO_I2S_GPIO_BCLK,
|
|
||||||
AUDIO_I2S_GPIO_WS,
|
|
||||||
AUDIO_I2S_GPIO_DOUT,
|
|
||||||
AUDIO_I2S_GPIO_DIN,
|
|
||||||
AUDIO_CODEC_AW88298_ADDR,
|
|
||||||
AUDIO_CODEC_ES7210_ADDR,
|
|
||||||
AUDIO_INPUT_REFERENCE);
|
|
||||||
return &audio_codec;
|
return &audio_codec;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual Display* GetDisplay() override {
|
virtual Display* GetDisplay() override { return display_; }
|
||||||
return display_;
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual Camera* GetCamera() override {
|
virtual Camera* GetCamera() override { return camera_; }
|
||||||
return camera_;
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual bool GetBatteryLevel(int& level, bool& charging, bool& discharging) override {
|
virtual bool GetBatteryLevel(int& level, bool& charging, bool& discharging) override {
|
||||||
static bool last_discharging = false;
|
static bool last_discharging = false;
|
||||||
|
|||||||
@ -599,7 +599,7 @@ CONFIG_PARTITION_TABLE_MD5=y
|
|||||||
#
|
#
|
||||||
CONFIG_OTA_URL="https://api.tenclass.net/xiaozhi/ota/"
|
CONFIG_OTA_URL="https://api.tenclass.net/xiaozhi/ota/"
|
||||||
CONFIG_USE_DIRECT_WEBSOCKET=y
|
CONFIG_USE_DIRECT_WEBSOCKET=y
|
||||||
CONFIG_WEBSOCKET_URL="ws://10.6.80.130:8080"
|
CONFIG_WEBSOCKET_URL="ws://172.19.0.240:8080"
|
||||||
CONFIG_WEBSOCKET_TOKEN=""
|
CONFIG_WEBSOCKET_TOKEN=""
|
||||||
CONFIG_WEBSOCKET_PROTOCOL_VERSION=1
|
CONFIG_WEBSOCKET_PROTOCOL_VERSION=1
|
||||||
# CONFIG_FLASH_NONE_ASSETS is not set
|
# CONFIG_FLASH_NONE_ASSETS is not set
|
||||||
|
|||||||
@ -1,6 +1,8 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
|
import base64
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import struct
|
import struct
|
||||||
import sys
|
import sys
|
||||||
@ -8,6 +10,7 @@ import time
|
|||||||
import traceback
|
import traceback
|
||||||
import uuid
|
import uuid
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
@ -26,11 +29,24 @@ TOKEN_URL = "http://172.19.0.240:8000/getToken"
|
|||||||
LIVEKIT_WS_URL = "ws://172.19.0.240:7880"
|
LIVEKIT_WS_URL = "ws://172.19.0.240:7880"
|
||||||
ROOM_PREFIX = "test-livekit"
|
ROOM_PREFIX = "test-livekit"
|
||||||
IDENTITY_PREFIX = "uv-livekit"
|
IDENTITY_PREFIX = "uv-livekit"
|
||||||
AGENT_NAME = "my-agent"
|
LEGACY_AGENT_NAME = os.getenv("LIVEKIT_AGENT_NAME", "normal-agent")
|
||||||
|
DEFAULT_AGENT_MODE = os.getenv("LIVEKIT_DEFAULT_AGENT_MODE", "normal").strip().lower()
|
||||||
|
AGENT_NAMES = {
|
||||||
|
"normal": os.getenv("LIVEKIT_NORMAL_AGENT_NAME", LEGACY_AGENT_NAME),
|
||||||
|
"beaver": os.getenv("LIVEKIT_BEAVER_AGENT_NAME", "beaver-agent"),
|
||||||
|
}
|
||||||
|
CHAT_MODE_AGENT_NAMES = {
|
||||||
|
"normal": AGENT_NAMES["normal"],
|
||||||
|
"beaver": AGENT_NAMES["beaver"],
|
||||||
|
"vision-normal": os.getenv("LIVEKIT_VISION_NORMAL_AGENT_NAME", "vision-normal-agent"),
|
||||||
|
"vision-beaver": os.getenv("LIVEKIT_VISION_BEAVER_AGENT_NAME", "vision-beaver-agent"),
|
||||||
|
}
|
||||||
CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0"))
|
CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0"))
|
||||||
AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
|
AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
|
||||||
WS_PORT = 8080
|
WS_PORT = 8080
|
||||||
AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower()
|
AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower()
|
||||||
|
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
VISION_FRAME_SAVE_DIR = Path(os.getenv("VISION_FRAME_SAVE_DIR", str(PROJECT_ROOT / "vision_frames")))
|
||||||
|
|
||||||
INPUT_SAMPLE_RATE = 16000
|
INPUT_SAMPLE_RATE = 16000
|
||||||
OUTPUT_SAMPLE_RATE = 24000
|
OUTPUT_SAMPLE_RATE = 24000
|
||||||
@ -45,11 +61,22 @@ TTS_PRE_ROLL_MS = 80
|
|||||||
TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1
|
TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1
|
||||||
TTS_INTERRUPT_SILENCE_FRAMES = 3
|
TTS_INTERRUPT_SILENCE_FRAMES = 3
|
||||||
INTERRUPT_TOPIC = "lk.interrupt"
|
INTERRUPT_TOPIC = "lk.interrupt"
|
||||||
|
VISION_FRAME_TOPIC = "vision.frame"
|
||||||
TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;;"
|
TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;;"
|
||||||
TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
|
TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
|
||||||
TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
|
TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
|
||||||
TTS_DISPLAY_SCROLL_GAP = " "
|
TTS_DISPLAY_SCROLL_GAP = " "
|
||||||
TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8"))
|
TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8"))
|
||||||
|
EMOTION_TEXT_PATTERN = re.compile(
|
||||||
|
r"^\s*<?\s*emotion\s*=\s*([^\s>,,;;]+)\s*>?[\s,,;;]*(.*)$",
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
EMOTION_TEST_SEQUENCE = [
|
||||||
|
emotion.strip()
|
||||||
|
for emotion in os.getenv("BRIDGE_EMOTION_TEST_SEQUENCE", "").split(",")
|
||||||
|
if emotion.strip()
|
||||||
|
]
|
||||||
|
EMOTION_TEST_INTERVAL_SECONDS = float(os.getenv("BRIDGE_EMOTION_TEST_INTERVAL_SECONDS", "2.0"))
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -59,6 +86,10 @@ class DeviceSession:
|
|||||||
protocol_version: int
|
protocol_version: int
|
||||||
room_name: str
|
room_name: str
|
||||||
identity: str
|
identity: str
|
||||||
|
chat_mode: str
|
||||||
|
agent_mode: str
|
||||||
|
agent_name: str
|
||||||
|
vision_enabled: bool
|
||||||
room: rtc.Room
|
room: rtc.Room
|
||||||
mic_source: AudioSource
|
mic_source: AudioSource
|
||||||
agent_ready: asyncio.Event
|
agent_ready: asyncio.Event
|
||||||
@ -70,6 +101,7 @@ class DeviceSession:
|
|||||||
tts_transcript_text: str = ""
|
tts_transcript_text: str = ""
|
||||||
tts_display_text: str = ""
|
tts_display_text: str = ""
|
||||||
tts_display_final: bool = False
|
tts_display_final: bool = False
|
||||||
|
tts_emotion: str = ""
|
||||||
tts_suppressed_until: float = 0.0
|
tts_suppressed_until: float = 0.0
|
||||||
agent_dispatch_task: Optional[asyncio.Task] = None
|
agent_dispatch_task: Optional[asyncio.Task] = None
|
||||||
closed: bool = False
|
closed: bool = False
|
||||||
@ -77,10 +109,10 @@ class DeviceSession:
|
|||||||
first_capture_log_time: float = 0.0
|
first_capture_log_time: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
async def fetch_token(room_name: str, identity: str) -> str:
|
async def fetch_token(room_name: str, identity: str, agent_name: str) -> str:
|
||||||
params = {"room": room_name, "identity": identity}
|
params = {"room": room_name, "identity": identity}
|
||||||
if AGENT_DISPATCH_MODE == "token":
|
if AGENT_DISPATCH_MODE == "token":
|
||||||
params["agent_name"] = AGENT_NAME
|
params["agent_name"] = agent_name
|
||||||
|
|
||||||
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
|
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
|
||||||
response = await client.get(TOKEN_URL, params=params)
|
response = await client.get(TOKEN_URL, params=params)
|
||||||
@ -120,15 +152,61 @@ class ESP32LiveKitBridge:
|
|||||||
print(f"[session] device={device_id} room={room_name} identity={identity}")
|
print(f"[session] device={device_id} room={room_name} identity={identity}")
|
||||||
return room_name, identity
|
return room_name, identity
|
||||||
|
|
||||||
def _is_agent_participant(self, participant: rtc.RemoteParticipant) -> bool:
|
def _resolve_agent_selection(self, headers: Any) -> tuple[str, str, str, bool]:
|
||||||
|
requested_chat_mode = (
|
||||||
|
headers.get("Chat-Mode")
|
||||||
|
or headers.get("X-Chat-Mode")
|
||||||
|
or ""
|
||||||
|
).strip().lower()
|
||||||
|
chat_mode_to_agent = {
|
||||||
|
"normal": ("normal", False),
|
||||||
|
"beaver": ("beaver", False),
|
||||||
|
"vision-normal": ("normal", True),
|
||||||
|
"vision-beaver": ("beaver", True),
|
||||||
|
}
|
||||||
|
|
||||||
|
if requested_chat_mode in chat_mode_to_agent:
|
||||||
|
requested_mode, vision_enabled = chat_mode_to_agent[requested_chat_mode]
|
||||||
|
else:
|
||||||
|
if requested_chat_mode:
|
||||||
|
print(f"未知 Chat-Mode={requested_chat_mode!r},回退到 Agent-Mode")
|
||||||
|
requested_mode = (
|
||||||
|
headers.get("Agent-Mode")
|
||||||
|
or headers.get("X-Agent-Mode")
|
||||||
|
or DEFAULT_AGENT_MODE
|
||||||
|
or "normal"
|
||||||
|
).strip().lower()
|
||||||
|
vision_enabled = False
|
||||||
|
requested_chat_mode = requested_mode
|
||||||
|
|
||||||
|
requested_name = headers.get("Agent-Name") or headers.get("X-Agent-Name")
|
||||||
|
if requested_name:
|
||||||
|
return requested_chat_mode, "custom", requested_name, vision_enabled
|
||||||
|
|
||||||
|
if requested_mode not in AGENT_NAMES:
|
||||||
|
print(f"未知 Agent-Mode={requested_mode!r},回退到 normal")
|
||||||
|
requested_mode = "normal"
|
||||||
|
requested_chat_mode = "vision-normal" if vision_enabled else "normal"
|
||||||
|
|
||||||
|
if requested_chat_mode in CHAT_MODE_AGENT_NAMES:
|
||||||
|
return (
|
||||||
|
requested_chat_mode,
|
||||||
|
requested_mode,
|
||||||
|
CHAT_MODE_AGENT_NAMES[requested_chat_mode],
|
||||||
|
vision_enabled,
|
||||||
|
)
|
||||||
|
|
||||||
|
return requested_chat_mode, requested_mode, AGENT_NAMES[requested_mode], vision_enabled
|
||||||
|
|
||||||
|
def _is_agent_participant(self, participant: rtc.RemoteParticipant, agent_name: str) -> bool:
|
||||||
identity = getattr(participant, "identity", "") or ""
|
identity = getattr(participant, "identity", "") or ""
|
||||||
return identity.startswith("agent-") or AGENT_NAME in identity
|
return identity.startswith("agent-") or agent_name in identity
|
||||||
|
|
||||||
def _get_agent_identities(self, session: DeviceSession) -> list[str]:
|
def _get_agent_identities(self, session: DeviceSession) -> list[str]:
|
||||||
return [
|
return [
|
||||||
participant.identity
|
participant.identity
|
||||||
for participant in session.room.remote_participants.values()
|
for participant in session.room.remote_participants.values()
|
||||||
if self._is_agent_participant(participant)
|
if self._is_agent_participant(participant, session.agent_name)
|
||||||
]
|
]
|
||||||
|
|
||||||
def _log_agent_participants(self, session: DeviceSession, source: str) -> None:
|
def _log_agent_participants(self, session: DeviceSession, source: str) -> None:
|
||||||
@ -170,7 +248,7 @@ class ESP32LiveKitBridge:
|
|||||||
dispatch_agent_name = getattr(dispatch, "agent_name", None)
|
dispatch_agent_name = getattr(dispatch, "agent_name", None)
|
||||||
dispatch_room = getattr(dispatch, "room", None)
|
dispatch_room = getattr(dispatch, "room", None)
|
||||||
if dispatch_room == session.room_name and (
|
if dispatch_room == session.room_name and (
|
||||||
dispatch_agent_name == AGENT_NAME or dispatch_agent_name is None
|
dispatch_agent_name == session.agent_name or dispatch_agent_name is None
|
||||||
):
|
):
|
||||||
print(
|
print(
|
||||||
f"检测到已有 dispatch: room={session.room_name} "
|
f"检测到已有 dispatch: room={session.room_name} "
|
||||||
@ -193,7 +271,7 @@ class ESP32LiveKitBridge:
|
|||||||
return
|
return
|
||||||
|
|
||||||
for participant in session.room.remote_participants.values():
|
for participant in session.room.remote_participants.values():
|
||||||
if self._is_agent_participant(participant):
|
if self._is_agent_participant(participant, session.agent_name):
|
||||||
# print(f"Agent 已在房间中,跳过 dispatch: {participant.identity}")
|
# print(f"Agent 已在房间中,跳过 dispatch: {participant.identity}")
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -205,7 +283,7 @@ class ESP32LiveKitBridge:
|
|||||||
await session.agent_dispatch_task
|
await session.agent_dispatch_task
|
||||||
|
|
||||||
async def _dispatch_agent(self, session: DeviceSession) -> None:
|
async def _dispatch_agent(self, session: DeviceSession) -> None:
|
||||||
print(f"准备 dispatch agent: room={session.room_name}, agent={AGENT_NAME}")
|
print(f"准备 dispatch agent: room={session.room_name}, agent={session.agent_name}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if await self._dispatch_agent_with_sdk(session):
|
if await self._dispatch_agent_with_sdk(session):
|
||||||
@ -235,13 +313,17 @@ class ESP32LiveKitBridge:
|
|||||||
try:
|
try:
|
||||||
dispatch = await lkapi.agent_dispatch.create_dispatch(
|
dispatch = await lkapi.agent_dispatch.create_dispatch(
|
||||||
livekit_api.CreateAgentDispatchRequest(
|
livekit_api.CreateAgentDispatchRequest(
|
||||||
agent_name=AGENT_NAME,
|
agent_name=session.agent_name,
|
||||||
room=session.room_name,
|
room=session.room_name,
|
||||||
metadata=json.dumps(
|
metadata=json.dumps(
|
||||||
{
|
{
|
||||||
"source": "bridge_server",
|
"source": "bridge_server",
|
||||||
"identity": session.identity,
|
"identity": session.identity,
|
||||||
"device_id": session.device_id,
|
"device_id": session.device_id,
|
||||||
|
"chat_mode": session.chat_mode,
|
||||||
|
"agent_mode": session.agent_mode,
|
||||||
|
"agent_name": session.agent_name,
|
||||||
|
"vision_enabled": session.vision_enabled,
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
@ -264,13 +346,17 @@ class ESP32LiveKitBridge:
|
|||||||
"--room",
|
"--room",
|
||||||
session.room_name,
|
session.room_name,
|
||||||
"--agent-name",
|
"--agent-name",
|
||||||
AGENT_NAME,
|
session.agent_name,
|
||||||
"--metadata",
|
"--metadata",
|
||||||
json.dumps(
|
json.dumps(
|
||||||
{
|
{
|
||||||
"source": "bridge_server",
|
"source": "bridge_server",
|
||||||
"identity": session.identity,
|
"identity": session.identity,
|
||||||
"device_id": session.device_id,
|
"device_id": session.device_id,
|
||||||
|
"chat_mode": session.chat_mode,
|
||||||
|
"agent_mode": session.agent_mode,
|
||||||
|
"agent_name": session.agent_name,
|
||||||
|
"vision_enabled": session.vision_enabled,
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
stdout=asyncio.subprocess.PIPE,
|
stdout=asyncio.subprocess.PIPE,
|
||||||
@ -289,7 +375,7 @@ class ESP32LiveKitBridge:
|
|||||||
print(f"lk dispatch create 失败,退出码: {process.returncode}")
|
print(f"lk dispatch create 失败,退出码: {process.returncode}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={AGENT_NAME}")
|
print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={session.agent_name}")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool:
|
async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool:
|
||||||
@ -336,6 +422,73 @@ class ESP32LiveKitBridge:
|
|||||||
if not ok:
|
if not ok:
|
||||||
print("警告: bridge 已停止 TTS,但 agent 侧 interrupt 未确认送出")
|
print("警告: bridge 已停止 TTS,但 agent 侧 interrupt 未确认送出")
|
||||||
|
|
||||||
|
def _save_vision_frame(self, session: DeviceSession, image: str) -> Optional[Path]:
|
||||||
|
try:
|
||||||
|
image_bytes = base64.b64decode(image, validate=True)
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"vision frame base64 解码失败: {exc}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
safe_device_id = "".join(
|
||||||
|
char if char.isalnum() or char in ("-", "_") else "_"
|
||||||
|
for char in session.device_id
|
||||||
|
)
|
||||||
|
timestamp_ms = int(time.time() * 1000)
|
||||||
|
VISION_FRAME_SAVE_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
path = VISION_FRAME_SAVE_DIR / f"{timestamp_ms}_{safe_device_id}.jpg"
|
||||||
|
path.write_bytes(image_bytes)
|
||||||
|
return path
|
||||||
|
|
||||||
|
async def _publish_vision_frame(self, session: DeviceSession, message: dict[str, Any]) -> None:
|
||||||
|
image = message.get("image")
|
||||||
|
if not isinstance(image, str) or not image:
|
||||||
|
print("收到 vision frame,但 image 字段为空")
|
||||||
|
return
|
||||||
|
|
||||||
|
saved_path = self._save_vision_frame(session, image)
|
||||||
|
if saved_path is None:
|
||||||
|
return
|
||||||
|
print(f"已保存 vision frame: {saved_path}")
|
||||||
|
|
||||||
|
participant = getattr(session.room, "local_participant", None)
|
||||||
|
if participant is None:
|
||||||
|
print("跳过发送 vision frame,local participant 尚未就绪")
|
||||||
|
return
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"type": "vision_frame",
|
||||||
|
"topic": VISION_FRAME_TOPIC,
|
||||||
|
"room": session.room_name,
|
||||||
|
"identity": session.identity,
|
||||||
|
"device_id": session.device_id,
|
||||||
|
"mime_type": message.get("mime_type", "image/jpeg"),
|
||||||
|
"image": image,
|
||||||
|
"saved_path": str(saved_path),
|
||||||
|
}
|
||||||
|
data = json.dumps(payload).encode("utf-8")
|
||||||
|
agent_identities = self._get_agent_identities(session)
|
||||||
|
kwargs: dict[str, Any] = {}
|
||||||
|
if agent_identities:
|
||||||
|
kwargs["destination_identities"] = agent_identities
|
||||||
|
|
||||||
|
last_error: Optional[Exception] = None
|
||||||
|
for attempt in ({"topic": VISION_FRAME_TOPIC, **kwargs}, kwargs):
|
||||||
|
try:
|
||||||
|
await participant.publish_data(data, **attempt)
|
||||||
|
print(
|
||||||
|
f"已发送 vision frame: bytes={len(data)} "
|
||||||
|
f"targets={agent_identities or 'broadcast'}"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
except TypeError as exc:
|
||||||
|
last_error = exc
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"发送 vision frame 失败: {exc}")
|
||||||
|
return
|
||||||
|
|
||||||
|
if last_error is not None:
|
||||||
|
print(f"发送 vision frame 失败,publish_data 签名不兼容: {last_error}")
|
||||||
|
|
||||||
async def _send_tts_state(self, session: DeviceSession, state: str) -> None:
|
async def _send_tts_state(self, session: DeviceSession, state: str) -> None:
|
||||||
if session.websocket is None:
|
if session.websocket is None:
|
||||||
print(f"跳过 tts {state},ESP32 尚未连接")
|
print(f"跳过 tts {state},ESP32 尚未连接")
|
||||||
@ -343,9 +496,40 @@ class ESP32LiveKitBridge:
|
|||||||
await session.websocket.send(json.dumps({"type": "tts", "state": state}))
|
await session.websocket.send(json.dumps({"type": "tts", "state": state}))
|
||||||
print(f"已发送 tts {state}: device={session.device_id}")
|
print(f"已发送 tts {state}: device={session.device_id}")
|
||||||
|
|
||||||
|
async def _send_emotion(self, session: DeviceSession, emotion: str) -> None:
|
||||||
|
if session.websocket is None:
|
||||||
|
print(f"跳过 emotion {emotion},ESP32 尚未连接")
|
||||||
|
return
|
||||||
|
await session.websocket.send(json.dumps({"type": "llm", "emotion": emotion}))
|
||||||
|
print(f"已发送 emotion: device={session.device_id} emotion={emotion}")
|
||||||
|
|
||||||
|
def _parse_emotion_text(self, text: str) -> tuple[Optional[str], str]:
|
||||||
|
match = EMOTION_TEXT_PATTERN.match(text)
|
||||||
|
if match is None:
|
||||||
|
return None, text.strip()
|
||||||
|
emotion, tts_text = match.groups()
|
||||||
|
return emotion.strip(), tts_text.strip()
|
||||||
|
|
||||||
|
async def _run_emotion_test_sequence(self, session: DeviceSession) -> None:
|
||||||
|
if not EMOTION_TEST_SEQUENCE:
|
||||||
|
return
|
||||||
|
|
||||||
|
for index, emotion in enumerate(EMOTION_TEST_SEQUENCE):
|
||||||
|
if session.websocket is None or session.closed:
|
||||||
|
return
|
||||||
|
if index > 0:
|
||||||
|
await asyncio.sleep(EMOTION_TEST_INTERVAL_SECONDS)
|
||||||
|
await self._send_emotion(session, emotion)
|
||||||
|
|
||||||
async def _send_tts_text(self, session: DeviceSession, text: str, final: bool) -> None:
|
async def _send_tts_text(self, session: DeviceSession, text: str, final: bool) -> None:
|
||||||
if session.websocket is None:
|
if session.websocket is None:
|
||||||
return
|
return
|
||||||
|
raw_text = text
|
||||||
|
_emotion, text = self._parse_emotion_text(text)
|
||||||
|
if not text:
|
||||||
|
print(f"[tts->esp32] skip empty text: raw={raw_text!r} final={final}")
|
||||||
|
return
|
||||||
|
print(f"[tts->esp32] text={text!r} final={final}")
|
||||||
await session.websocket.send(
|
await session.websocket.send(
|
||||||
json.dumps(
|
json.dumps(
|
||||||
{
|
{
|
||||||
@ -421,6 +605,7 @@ class ESP32LiveKitBridge:
|
|||||||
if not session.tts_display_text:
|
if not session.tts_display_text:
|
||||||
session.tts_transcript_text = ""
|
session.tts_transcript_text = ""
|
||||||
session.tts_display_final = False
|
session.tts_display_final = False
|
||||||
|
session.tts_emotion = ""
|
||||||
self._cancel_tts_display_task(session)
|
self._cancel_tts_display_task(session)
|
||||||
await self._send_tts_state(session, "start")
|
await self._send_tts_state(session, "start")
|
||||||
session.tts_active = True
|
session.tts_active = True
|
||||||
@ -435,6 +620,7 @@ class ESP32LiveKitBridge:
|
|||||||
session.tts_transcript_text = ""
|
session.tts_transcript_text = ""
|
||||||
session.tts_display_text = ""
|
session.tts_display_text = ""
|
||||||
session.tts_display_final = False
|
session.tts_display_final = False
|
||||||
|
session.tts_emotion = ""
|
||||||
|
|
||||||
async def _force_stop_tts(self, session: DeviceSession, reason: str) -> None:
|
async def _force_stop_tts(self, session: DeviceSession, reason: str) -> None:
|
||||||
self._cancel_tts_display_task(session)
|
self._cancel_tts_display_task(session)
|
||||||
@ -445,6 +631,7 @@ class ESP32LiveKitBridge:
|
|||||||
session.tts_transcript_text = ""
|
session.tts_transcript_text = ""
|
||||||
session.tts_display_text = ""
|
session.tts_display_text = ""
|
||||||
session.tts_display_final = False
|
session.tts_display_final = False
|
||||||
|
session.tts_emotion = ""
|
||||||
await self._send_tts_state(session, "stop")
|
await self._send_tts_state(session, "stop")
|
||||||
print(f"已强制停止本地 TTS: device={session.device_id} reason={reason}")
|
print(f"已强制停止本地 TTS: device={session.device_id} reason={reason}")
|
||||||
|
|
||||||
@ -609,16 +796,16 @@ class ESP32LiveKitBridge:
|
|||||||
print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}")
|
print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}")
|
||||||
self._log_agent_participants(session, "connected")
|
self._log_agent_participants(session, "connected")
|
||||||
for participant in session.room.remote_participants.values():
|
for participant in session.room.remote_participants.values():
|
||||||
if self._is_agent_participant(participant):
|
if self._is_agent_participant(participant, session.agent_name):
|
||||||
session.agent_ready.set()
|
session.agent_ready.set()
|
||||||
self._scan_participant_audio_tracks(session, participant, "connected_scan")
|
self._scan_participant_audio_tracks(session, participant, "connected_scan")
|
||||||
|
|
||||||
@session.room.on("participant_connected")
|
@session.room.on("participant_connected")
|
||||||
def on_participant_connected(participant: rtc.RemoteParticipant) -> None:
|
def on_participant_connected(participant: rtc.RemoteParticipant) -> None:
|
||||||
role = "Agent" if self._is_agent_participant(participant) else "Remote participant"
|
role = "Agent" if self._is_agent_participant(participant, session.agent_name) else "Remote participant"
|
||||||
print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}")
|
print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}")
|
||||||
self._log_agent_participants(session, "participant_connected")
|
self._log_agent_participants(session, "participant_connected")
|
||||||
if self._is_agent_participant(participant):
|
if self._is_agent_participant(participant, session.agent_name):
|
||||||
session.agent_ready.set()
|
session.agent_ready.set()
|
||||||
self._scan_participant_audio_tracks(
|
self._scan_participant_audio_tracks(
|
||||||
session, participant, "participant_connected_scan"
|
session, participant, "participant_connected_scan"
|
||||||
@ -651,14 +838,26 @@ class ESP32LiveKitBridge:
|
|||||||
track_pub: rtc.TrackPublication,
|
track_pub: rtc.TrackPublication,
|
||||||
) -> None:
|
) -> None:
|
||||||
identity = participant.identity if participant else "未知"
|
identity = participant.identity if participant else "未知"
|
||||||
is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(participant)
|
is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(
|
||||||
|
participant, session.agent_name
|
||||||
|
)
|
||||||
for segment in segments:
|
for segment in segments:
|
||||||
status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果"
|
status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果"
|
||||||
print(f"🗣️ [{status} | room={session.room_name} | {identity}]: {segment.text}")
|
print(f"🗣️ [{status} | room={session.room_name} | {identity}]: {segment.text}")
|
||||||
if is_agent:
|
if is_agent:
|
||||||
if time.monotonic() < session.tts_suppressed_until:
|
if time.monotonic() < session.tts_suppressed_until:
|
||||||
continue
|
continue
|
||||||
display_text = self._current_tts_display_text(segment.text)
|
print(f"[livekit-llm] raw={segment.text!r} final={segment.final}")
|
||||||
|
emotion, tts_text = self._parse_emotion_text(segment.text)
|
||||||
|
print(
|
||||||
|
f"[livekit-llm] parsed emotion={emotion!r} "
|
||||||
|
f"tts_text={tts_text!r} final={segment.final}"
|
||||||
|
)
|
||||||
|
if emotion and emotion != session.tts_emotion:
|
||||||
|
session.tts_emotion = emotion
|
||||||
|
asyncio.create_task(self._send_emotion(session, emotion))
|
||||||
|
display_text = self._current_tts_display_text(tts_text)
|
||||||
|
print(f"[livekit-llm] display_text={display_text!r} final={segment.final}")
|
||||||
if not display_text or display_text == session.tts_transcript_text:
|
if not display_text or display_text == session.tts_transcript_text:
|
||||||
continue
|
continue
|
||||||
session.tts_transcript_text = display_text
|
session.tts_transcript_text = display_text
|
||||||
@ -713,7 +912,7 @@ class ESP32LiveKitBridge:
|
|||||||
# print(f"[config] token_url={TOKEN_URL}")
|
# print(f"[config] token_url={TOKEN_URL}")
|
||||||
# print(f"[config] room={session.room_name} identity={session.identity}")
|
# print(f"[config] room={session.room_name} identity={session.identity}")
|
||||||
# print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}")
|
# print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}")
|
||||||
token = await fetch_token(session.room_name, session.identity)
|
token = await fetch_token(session.room_name, session.identity, session.agent_name)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await session.room.connect(
|
await session.room.connect(
|
||||||
@ -758,8 +957,12 @@ class ESP32LiveKitBridge:
|
|||||||
# print(f"等待 agent 加入: room={session.room_name}")
|
# print(f"等待 agent 加入: room={session.room_name}")
|
||||||
try:
|
try:
|
||||||
await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS)
|
await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS)
|
||||||
|
if session.closed:
|
||||||
|
return
|
||||||
# print(f"✅ agent 已就绪: room={session.room_name}")
|
# print(f"✅ agent 已就绪: room={session.room_name}")
|
||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
|
if session.closed:
|
||||||
|
return
|
||||||
print(f"⚠️ agent 等待超时: room={session.room_name}")
|
print(f"⚠️ agent 等待超时: room={session.room_name}")
|
||||||
|
|
||||||
async def start(self) -> None:
|
async def start(self) -> None:
|
||||||
@ -767,6 +970,20 @@ class ESP32LiveKitBridge:
|
|||||||
print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}")
|
print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}")
|
||||||
print(f"[config] token_url={TOKEN_URL}")
|
print(f"[config] token_url={TOKEN_URL}")
|
||||||
print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}")
|
print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}")
|
||||||
|
print(
|
||||||
|
"[config] agents="
|
||||||
|
f"normal:{CHAT_MODE_AGENT_NAMES['normal']} "
|
||||||
|
f"beaver:{CHAT_MODE_AGENT_NAMES['beaver']} "
|
||||||
|
f"vision-normal:{CHAT_MODE_AGENT_NAMES['vision-normal']} "
|
||||||
|
f"vision-beaver:{CHAT_MODE_AGENT_NAMES['vision-beaver']} "
|
||||||
|
f"default_mode:{DEFAULT_AGENT_MODE}"
|
||||||
|
)
|
||||||
|
if EMOTION_TEST_SEQUENCE:
|
||||||
|
print(
|
||||||
|
"[config] emotion_test_sequence="
|
||||||
|
f"{','.join(EMOTION_TEST_SEQUENCE)} "
|
||||||
|
f"interval={EMOTION_TEST_INTERVAL_SECONDS}s"
|
||||||
|
)
|
||||||
|
|
||||||
async def close(self) -> None:
|
async def close(self) -> None:
|
||||||
for session in list(self.device_sessions.values()):
|
for session in list(self.device_sessions.values()):
|
||||||
@ -777,6 +994,7 @@ class ESP32LiveKitBridge:
|
|||||||
return
|
return
|
||||||
session.closed = True
|
session.closed = True
|
||||||
session.websocket = None
|
session.websocket = None
|
||||||
|
session.agent_ready.set()
|
||||||
session.tts_active = False
|
session.tts_active = False
|
||||||
session.tts_stream_id += 1
|
session.tts_stream_id += 1
|
||||||
if session.tts_idle_task is not None:
|
if session.tts_idle_task is not None:
|
||||||
@ -926,6 +1144,7 @@ class ESP32LiveKitBridge:
|
|||||||
await self._close_session(existing_session)
|
await self._close_session(existing_session)
|
||||||
self.device_sessions.pop(device_id, None)
|
self.device_sessions.pop(device_id, None)
|
||||||
|
|
||||||
|
chat_mode, agent_mode, agent_name, vision_enabled = self._resolve_agent_selection(websocket.request.headers)
|
||||||
room_name, identity = self._build_session_names(device_id)
|
room_name, identity = self._build_session_names(device_id)
|
||||||
session = DeviceSession(
|
session = DeviceSession(
|
||||||
device_id=device_id,
|
device_id=device_id,
|
||||||
@ -933,20 +1152,27 @@ class ESP32LiveKitBridge:
|
|||||||
protocol_version=protocol_version,
|
protocol_version=protocol_version,
|
||||||
room_name=room_name,
|
room_name=room_name,
|
||||||
identity=identity,
|
identity=identity,
|
||||||
|
chat_mode=chat_mode,
|
||||||
|
agent_mode=agent_mode,
|
||||||
|
agent_name=agent_name,
|
||||||
|
vision_enabled=vision_enabled,
|
||||||
room=rtc.Room(),
|
room=rtc.Room(),
|
||||||
mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1),
|
mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1),
|
||||||
agent_ready=asyncio.Event(),
|
agent_ready=asyncio.Event(),
|
||||||
)
|
)
|
||||||
self.device_sessions[device_id] = session
|
self.device_sessions[device_id] = session
|
||||||
|
|
||||||
# print(f"ESP32 已连接: device={device_id}")
|
print(f"ESP32 已连接: device={device_id}")
|
||||||
# print(f"ESP32 协议版本: {session.protocol_version}")
|
print(f"ESP32 协议版本: {session.protocol_version}")
|
||||||
|
print(
|
||||||
|
f"ESP32 mode: chat={session.chat_mode} "
|
||||||
|
f"agent={session.agent_mode}/{session.agent_name} "
|
||||||
|
f"vision={session.vision_enabled}"
|
||||||
|
)
|
||||||
session.tts_stream_id += 1
|
session.tts_stream_id += 1
|
||||||
opus_decoder = None
|
opus_decoder = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await self._connect_session_room(session)
|
|
||||||
|
|
||||||
hello_msg = {
|
hello_msg = {
|
||||||
"type": "hello",
|
"type": "hello",
|
||||||
"transport": "websocket",
|
"transport": "websocket",
|
||||||
@ -962,6 +1188,10 @@ class ESP32LiveKitBridge:
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
await websocket.send(json.dumps(hello_msg))
|
await websocket.send(json.dumps(hello_msg))
|
||||||
|
print(f"已发送 server hello: device={device_id} room={session.room_name}")
|
||||||
|
asyncio.create_task(self._run_emotion_test_sequence(session))
|
||||||
|
|
||||||
|
await self._connect_session_room(session)
|
||||||
|
|
||||||
async for message in websocket:
|
async for message in websocket:
|
||||||
if isinstance(message, bytes):
|
if isinstance(message, bytes):
|
||||||
@ -1017,6 +1247,8 @@ class ESP32LiveKitBridge:
|
|||||||
abort_reason = reason if isinstance(reason, str) and reason else "button_abort"
|
abort_reason = reason if isinstance(reason, str) and reason else "button_abort"
|
||||||
print(f"处理 ESP32 打断请求: reason={abort_reason}")
|
print(f"处理 ESP32 打断请求: reason={abort_reason}")
|
||||||
await self._abort_tts(session, abort_reason)
|
await self._abort_tts(session, abort_reason)
|
||||||
|
elif msg_type == "vision" and data.get("state") == "frame":
|
||||||
|
await self._publish_vision_frame(session, data)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
print(f"收到未知的字符消息: {message}")
|
print(f"收到未知的字符消息: {message}")
|
||||||
except ConnectionClosedError as exc:
|
except ConnectionClosedError as exc:
|
||||||
|
|||||||
@ -1,9 +1,22 @@
|
|||||||
#include "protocol.h"
|
#include "protocol.h"
|
||||||
|
|
||||||
#include <esp_log.h>
|
#include <esp_log.h>
|
||||||
|
#include <mbedtls/base64.h>
|
||||||
|
|
||||||
#define TAG "Protocol"
|
#define TAG "Protocol"
|
||||||
|
|
||||||
|
static std::string Base64Encode(const std::string& data) {
|
||||||
|
size_t encoded_length = 0;
|
||||||
|
size_t output_length = 0;
|
||||||
|
mbedtls_base64_encode(nullptr, 0, &encoded_length,
|
||||||
|
reinterpret_cast<const unsigned char*>(data.data()), data.size());
|
||||||
|
std::string result(encoded_length, 0);
|
||||||
|
mbedtls_base64_encode(reinterpret_cast<unsigned char*>(result.data()), result.size(), &output_length,
|
||||||
|
reinterpret_cast<const unsigned char*>(data.data()), data.size());
|
||||||
|
result.resize(output_length);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
void Protocol::OnIncomingJson(std::function<void(const cJSON* root)> callback) {
|
void Protocol::OnIncomingJson(std::function<void(const cJSON* root)> callback) {
|
||||||
on_incoming_json_ = callback;
|
on_incoming_json_ = callback;
|
||||||
}
|
}
|
||||||
@ -78,6 +91,27 @@ void Protocol::SendMcpMessage(const std::string& payload) {
|
|||||||
SendText(message);
|
SendText(message);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Protocol::SendVisionFrame(const std::string& jpeg_data) {
|
||||||
|
if (jpeg_data.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
cJSON* root = cJSON_CreateObject();
|
||||||
|
cJSON_AddStringToObject(root, "session_id", session_id_.c_str());
|
||||||
|
cJSON_AddStringToObject(root, "type", "vision");
|
||||||
|
cJSON_AddStringToObject(root, "state", "frame");
|
||||||
|
cJSON_AddStringToObject(root, "mime_type", "image/jpeg");
|
||||||
|
auto encoded = Base64Encode(jpeg_data);
|
||||||
|
cJSON_AddStringToObject(root, "image", encoded.c_str());
|
||||||
|
|
||||||
|
char* json_str = cJSON_PrintUnformatted(root);
|
||||||
|
if (json_str != nullptr) {
|
||||||
|
SendText(json_str);
|
||||||
|
cJSON_free(json_str);
|
||||||
|
}
|
||||||
|
cJSON_Delete(root);
|
||||||
|
}
|
||||||
|
|
||||||
bool Protocol::IsTimeout() const {
|
bool Protocol::IsTimeout() const {
|
||||||
const int kTimeoutSeconds = 120;
|
const int kTimeoutSeconds = 120;
|
||||||
auto now = std::chrono::steady_clock::now();
|
auto now = std::chrono::steady_clock::now();
|
||||||
|
|||||||
@ -73,6 +73,7 @@ public:
|
|||||||
virtual void SendStopListening();
|
virtual void SendStopListening();
|
||||||
virtual void SendAbortSpeaking(AbortReason reason);
|
virtual void SendAbortSpeaking(AbortReason reason);
|
||||||
virtual void SendMcpMessage(const std::string& message);
|
virtual void SendMcpMessage(const std::string& message);
|
||||||
|
virtual void SendVisionFrame(const std::string& jpeg_data);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
std::function<void(const cJSON* root)> on_incoming_json_;
|
std::function<void(const cJSON* root)> on_incoming_json_;
|
||||||
@ -95,4 +96,3 @@ protected:
|
|||||||
};
|
};
|
||||||
|
|
||||||
#endif // PROTOCOL_H
|
#endif // PROTOCOL_H
|
||||||
|
|
||||||
|
|||||||
@ -119,6 +119,8 @@ bool WebsocketProtocol::OpenAudioChannel() {
|
|||||||
websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str());
|
websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str());
|
||||||
websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
|
websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
|
||||||
websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
|
websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
|
||||||
|
websocket_->SetHeader("Agent-Mode", Application::GetInstance().GetChatAgentModeName());
|
||||||
|
websocket_->SetHeader("Chat-Mode", Application::GetInstance().GetChatModeName());
|
||||||
|
|
||||||
websocket_->OnData([this](const char* data, size_t len, bool binary) {
|
websocket_->OnData([this](const char* data, size_t len, bool binary) {
|
||||||
if (binary) {
|
if (binary) {
|
||||||
|
|||||||
Reference in New Issue
Block a user