Compare commits

5 Commits

Author SHA1 Message Date
2c4329fd84 fix: voice interupt 2026-06-12 11:38:47 +08:00
9637e09aef feat: beaver 2026-06-04 15:48:10 +08:00
b92e6e1b07 feat: remove background cam every time 2026-05-29 14:53:58 +08:00
33ee598c21 feat: add icon beaver 2026-05-29 11:22:31 +08:00
37343ac0fe feat: icon first commit 2026-05-27 17:16:11 +08:00
24 changed files with 1075 additions and 276 deletions

1
.gitignore vendored
View File

@ -19,3 +19,4 @@ main/mmap_generate_emoji.h
*.bin *.bin
mmap_generate_*.h mmap_generate_*.h
.clangd .clangd
background_frames/

View File

@ -1,25 +1,24 @@
#include "application.h" #include "application.h"
#include "assets.h"
#include "assets/lang_config.h"
#include "audio_codec.h"
#include "board.h" #include "board.h"
#include "display.h" #include "display.h"
#include "system_info.h"
#include "audio_codec.h"
#include "mqtt_protocol.h"
#include "websocket_protocol.h"
#include "assets/lang_config.h"
#include "mcp_server.h" #include "mcp_server.h"
#include "assets.h" #include "mqtt_protocol.h"
#include "settings.h" #include "settings.h"
#include "system_info.h"
#include "websocket_protocol.h"
#include <cstring>
#include <esp_log.h>
#include <cJSON.h>
#include <driver/gpio.h> #include <driver/gpio.h>
#include <esp_log.h>
#include <arpa/inet.h> #include <arpa/inet.h>
#include <cJSON.h>
#include <font_awesome.h> #include <font_awesome.h>
#include <cstring>
#define TAG "Application" #define TAG "Application"
Application::Application() { Application::Application() {
event_group_ = xEventGroupCreate(); event_group_ = xEventGroupCreate();
@ -33,16 +32,16 @@ Application::Application() {
aec_mode_ = kAecOff; aec_mode_ = kAecOff;
#endif #endif
esp_timer_create_args_t clock_timer_args = { esp_timer_create_args_t clock_timer_args = {.callback =
.callback = [](void* arg) { [](void* arg) {
Application* app = (Application*)arg; Application* app = (Application*)arg;
xEventGroupSetBits(app->event_group_, MAIN_EVENT_CLOCK_TICK); xEventGroupSetBits(app->event_group_,
MAIN_EVENT_CLOCK_TICK);
}, },
.arg = this, .arg = this,
.dispatch_method = ESP_TIMER_TASK, .dispatch_method = ESP_TIMER_TASK,
.name = "clock_timer", .name = "clock_timer",
.skip_unhandled_events = true .skip_unhandled_events = true};
};
esp_timer_create(&clock_timer_args, &clock_timer_handle_); esp_timer_create(&clock_timer_args, &clock_timer_handle_);
} }
@ -54,9 +53,7 @@ Application::~Application() {
vEventGroupDelete(event_group_); vEventGroupDelete(event_group_);
} }
bool Application::SetDeviceState(DeviceState state) { bool Application::SetDeviceState(DeviceState state) { return state_machine_.TransitionTo(state); }
return state_machine_.TransitionTo(state);
}
void Application::Initialize() { void Application::Initialize() {
auto& board = Board::GetInstance(); auto& board = Board::GetInstance();
@ -81,6 +78,7 @@ void Application::Initialize() {
xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED); xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED);
}; };
callbacks.on_vad_change = [this](bool speaking) { callbacks.on_vad_change = [this](bool speaking) {
vad_speaking_.store(speaking);
xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE); xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE);
}; };
audio_service_.SetCallbacks(callbacks); audio_service_.SetCallbacks(callbacks);
@ -141,13 +139,16 @@ void Application::Initialize() {
display->SetStatus(Lang::Strings::DETECTING_MODULE); display->SetStatus(Lang::Strings::DETECTING_MODULE);
break; break;
case NetworkEvent::ModemErrorNoSim: case NetworkEvent::ModemErrorNoSim:
Alert(Lang::Strings::ERROR, Lang::Strings::PIN_ERROR, "triangle_exclamation", Lang::Sounds::OGG_ERR_PIN); Alert(Lang::Strings::ERROR, Lang::Strings::PIN_ERROR, "triangle_exclamation",
Lang::Sounds::OGG_ERR_PIN);
break; break;
case NetworkEvent::ModemErrorRegDenied: case NetworkEvent::ModemErrorRegDenied:
Alert(Lang::Strings::ERROR, Lang::Strings::REG_ERROR, "triangle_exclamation", Lang::Sounds::OGG_ERR_REG); Alert(Lang::Strings::ERROR, Lang::Strings::REG_ERROR, "triangle_exclamation",
Lang::Sounds::OGG_ERR_REG);
break; break;
case NetworkEvent::ModemErrorInitFailed: case NetworkEvent::ModemErrorInitFailed:
Alert(Lang::Strings::ERROR, Lang::Strings::MODEM_INIT_ERROR, "triangle_exclamation", Lang::Sounds::OGG_EXCLAMATION); Alert(Lang::Strings::ERROR, Lang::Strings::MODEM_INIT_ERROR, "triangle_exclamation",
Lang::Sounds::OGG_EXCLAMATION);
break; break;
case NetworkEvent::ModemErrorTimeout: case NetworkEvent::ModemErrorTimeout:
display->SetStatus(Lang::Strings::REGISTERING_NETWORK); display->SetStatus(Lang::Strings::REGISTERING_NETWORK);
@ -167,18 +168,10 @@ void Application::Run() {
vTaskPrioritySet(nullptr, 10); vTaskPrioritySet(nullptr, 10);
const EventBits_t ALL_EVENTS = const EventBits_t ALL_EVENTS =
MAIN_EVENT_SCHEDULE | MAIN_EVENT_SCHEDULE | MAIN_EVENT_SEND_AUDIO | MAIN_EVENT_WAKE_WORD_DETECTED |
MAIN_EVENT_SEND_AUDIO | MAIN_EVENT_VAD_CHANGE | MAIN_EVENT_CLOCK_TICK | MAIN_EVENT_ERROR |
MAIN_EVENT_WAKE_WORD_DETECTED | MAIN_EVENT_NETWORK_CONNECTED | MAIN_EVENT_NETWORK_DISCONNECTED | MAIN_EVENT_TOGGLE_CHAT |
MAIN_EVENT_VAD_CHANGE | MAIN_EVENT_START_LISTENING | MAIN_EVENT_STOP_LISTENING | MAIN_EVENT_ACTIVATION_DONE |
MAIN_EVENT_CLOCK_TICK |
MAIN_EVENT_ERROR |
MAIN_EVENT_NETWORK_CONNECTED |
MAIN_EVENT_NETWORK_DISCONNECTED |
MAIN_EVENT_TOGGLE_CHAT |
MAIN_EVENT_START_LISTENING |
MAIN_EVENT_STOP_LISTENING |
MAIN_EVENT_ACTIVATION_DONE |
MAIN_EVENT_STATE_CHANGED; MAIN_EVENT_STATE_CHANGED;
while (true) { while (true) {
@ -186,7 +179,8 @@ void Application::Run() {
if (bits & MAIN_EVENT_ERROR) { if (bits & MAIN_EVENT_ERROR) {
SetDeviceState(kDeviceStateIdle); SetDeviceState(kDeviceStateIdle);
Alert(Lang::Strings::ERROR, last_error_message_.c_str(), "circle_xmark", Lang::Sounds::OGG_EXCLAMATION); Alert(Lang::Strings::ERROR, last_error_message_.c_str(), "circle_xmark",
Lang::Sounds::OGG_EXCLAMATION);
} }
if (bits & MAIN_EVENT_NETWORK_CONNECTED) { if (bits & MAIN_EVENT_NETWORK_CONNECTED) {
@ -233,6 +227,13 @@ void Application::Run() {
if (GetDeviceState() == kDeviceStateListening) { if (GetDeviceState() == kDeviceStateListening) {
auto led = Board::GetInstance().GetLed(); auto led = Board::GetInstance().GetLed();
led->OnStateChanged(); led->OnStateChanged();
if (vad_speaking_.load() && vision_text_mode_enabled_.load() &&
!vision_frame_sent_for_current_listen_.exchange(true)) {
if (!SendCurrentVisionFrame()) {
vision_frame_sent_for_current_listen_.store(false);
}
}
} }
} }
@ -270,12 +271,14 @@ void Application::HandleNetworkConnectedEvent() {
return; return;
} }
xTaskCreate([](void* arg) { xTaskCreate(
[](void* arg) {
Application* app = static_cast<Application*>(arg); Application* app = static_cast<Application*>(arg);
app->ActivationTask(); app->ActivationTask();
app->activation_task_handle_ = nullptr; app->activation_task_handle_ = nullptr;
vTaskDelete(NULL); vTaskDelete(NULL);
}, "activation", 4096 * 2, this, 2, &activation_task_handle_); },
"activation", 4096 * 2, this, 2, &activation_task_handle_);
} }
// Update the status bar immediately to show the network state // Update the status bar immediately to show the network state
@ -286,7 +289,8 @@ void Application::HandleNetworkConnectedEvent() {
void Application::HandleNetworkDisconnectedEvent() { void Application::HandleNetworkDisconnectedEvent() {
// Close current conversation when network disconnected // Close current conversation when network disconnected
auto state = GetDeviceState(); auto state = GetDeviceState();
if (state == kDeviceStateConnecting || state == kDeviceStateListening || state == kDeviceStateSpeaking) { if (state == kDeviceStateConnecting || state == kDeviceStateListening ||
state == kDeviceStateThinking || state == kDeviceStateSpeaking) {
ESP_LOGI(TAG, "Closing audio channel due to network disconnection"); ESP_LOGI(TAG, "Closing audio channel due to network disconnection");
protocol_->CloseAudioChannel(); protocol_->CloseAudioChannel();
} }
@ -371,7 +375,8 @@ void Application::CheckAssetsVersion() {
char message[256]; char message[256];
snprintf(message, sizeof(message), Lang::Strings::FOUND_NEW_ASSETS, download_url.c_str()); snprintf(message, sizeof(message), Lang::Strings::FOUND_NEW_ASSETS, download_url.c_str());
Alert(Lang::Strings::LOADING_ASSETS, message, "cloud_arrow_down", Lang::Sounds::OGG_UPGRADE); Alert(Lang::Strings::LOADING_ASSETS, message, "cloud_arrow_down",
Lang::Sounds::OGG_UPGRADE);
// Wait for the audio service to be idle for 3 seconds // Wait for the audio service to be idle for 3 seconds
vTaskDelay(pdMS_TO_TICKS(3000)); vTaskDelay(pdMS_TO_TICKS(3000));
@ -379,7 +384,8 @@ void Application::CheckAssetsVersion() {
board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE); board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE);
display->SetChatMessage("system", Lang::Strings::PLEASE_WAIT); display->SetChatMessage("system", Lang::Strings::PLEASE_WAIT);
bool success = assets.Download(download_url, [this, display](int progress, size_t speed) -> void { bool success =
assets.Download(download_url, [this, display](int progress, size_t speed) -> void {
char buffer[32]; char buffer[32];
snprintf(buffer, sizeof(buffer), "%d%% %uKB/s", progress, speed / 1024); snprintf(buffer, sizeof(buffer), "%d%% %uKB/s", progress, speed / 1024);
Schedule([display, message = std::string(buffer)]() { Schedule([display, message = std::string(buffer)]() {
@ -391,7 +397,8 @@ void Application::CheckAssetsVersion() {
vTaskDelay(pdMS_TO_TICKS(1000)); vTaskDelay(pdMS_TO_TICKS(1000));
if (!success) { if (!success) {
Alert(Lang::Strings::ERROR, Lang::Strings::DOWNLOAD_ASSETS_FAILED, "circle_xmark", Lang::Sounds::OGG_EXCLAMATION); Alert(Lang::Strings::ERROR, Lang::Strings::DOWNLOAD_ASSETS_FAILED, "circle_xmark",
Lang::Sounds::OGG_EXCLAMATION);
vTaskDelay(pdMS_TO_TICKS(2000)); vTaskDelay(pdMS_TO_TICKS(2000));
SetDeviceState(kDeviceStateActivating); SetDeviceState(kDeviceStateActivating);
return; return;
@ -423,12 +430,15 @@ void Application::CheckNewVersion() {
} }
char error_message[128]; char error_message[128];
snprintf(error_message, sizeof(error_message), "code=%d, url=%s", err, ota_->GetCheckVersionUrl().c_str()); snprintf(error_message, sizeof(error_message), "code=%d, url=%s", err,
ota_->GetCheckVersionUrl().c_str());
char buffer[256]; char buffer[256];
snprintf(buffer, sizeof(buffer), Lang::Strings::CHECK_NEW_VERSION_FAILED, retry_delay, error_message); snprintf(buffer, sizeof(buffer), Lang::Strings::CHECK_NEW_VERSION_FAILED, retry_delay,
error_message);
Alert(Lang::Strings::ERROR, buffer, "cloud_slash", Lang::Sounds::OGG_EXCLAMATION); Alert(Lang::Strings::ERROR, buffer, "cloud_slash", Lang::Sounds::OGG_EXCLAMATION);
ESP_LOGW(TAG, "Check new version failed, retry in %d seconds (%d/%d)", retry_delay, retry_count, MAX_RETRY); ESP_LOGW(TAG, "Check new version failed, retry in %d seconds (%d/%d)", retry_delay,
retry_count, MAX_RETRY);
for (int i = 0; i < retry_delay; i++) { for (int i = 0; i < retry_delay; i++) {
vTaskDelay(pdMS_TO_TICKS(1000)); vTaskDelay(pdMS_TO_TICKS(1000));
if (GetDeviceState() == kDeviceStateIdle) { if (GetDeviceState() == kDeviceStateIdle) {
@ -499,9 +509,7 @@ void Application::InitializeProtocol() {
} }
#endif #endif
protocol_->OnConnected([this]() { protocol_->OnConnected([this]() { DismissAlert(); });
DismissAlert();
});
protocol_->OnNetworkError([this](const std::string& message) { protocol_->OnNetworkError([this](const std::string& message) {
last_error_message_ = message; last_error_message_ = message;
@ -509,7 +517,7 @@ void Application::InitializeProtocol() {
}); });
protocol_->OnIncomingAudio([this](std::unique_ptr<AudioStreamPacket> packet) { protocol_->OnIncomingAudio([this](std::unique_ptr<AudioStreamPacket> packet) {
if (GetDeviceState() == kDeviceStateSpeaking) { if (accepting_tts_audio_.load() || GetDeviceState() == kDeviceStateSpeaking) {
audio_service_.PushPacketToDecodeQueue(std::move(packet)); audio_service_.PushPacketToDecodeQueue(std::move(packet));
} }
}); });
@ -517,14 +525,20 @@ void Application::InitializeProtocol() {
protocol_->OnAudioChannelOpened([this, codec, &board]() { protocol_->OnAudioChannelOpened([this, codec, &board]() {
board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE); board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE);
if (protocol_->server_sample_rate() != codec->output_sample_rate()) { if (protocol_->server_sample_rate() != codec->output_sample_rate()) {
ESP_LOGW(TAG, "Server sample rate %d does not match device output sample rate %d, resampling may cause distortion", ESP_LOGW(TAG,
"Server sample rate %d does not match device output sample rate %d, "
"resampling may cause distortion",
protocol_->server_sample_rate(), codec->output_sample_rate()); protocol_->server_sample_rate(), codec->output_sample_rate());
} }
}); });
protocol_->OnAudioChannelClosed([this, &board]() { protocol_->OnAudioChannelClosed([this, &board]() {
board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);
accepting_tts_audio_.store(false);
Schedule([this]() { Schedule([this]() {
if (GetDeviceState() == kDeviceStateConnecting) {
return;
}
auto display = Board::GetInstance().GetDisplay(); auto display = Board::GetInstance().GetDisplay();
display->SetChatMessage("system", ""); display->SetChatMessage("system", "");
SetDeviceState(kDeviceStateIdle); SetDeviceState(kDeviceStateIdle);
@ -536,14 +550,20 @@ void Application::InitializeProtocol() {
auto type = cJSON_GetObjectItem(root, "type"); auto type = cJSON_GetObjectItem(root, "type");
if (strcmp(type->valuestring, "tts") == 0) { if (strcmp(type->valuestring, "tts") == 0) {
auto state = cJSON_GetObjectItem(root, "state"); auto state = cJSON_GetObjectItem(root, "state");
if (strcmp(state->valuestring, "start") == 0) { if (strcmp(state->valuestring, "thinking") == 0) {
Schedule([this]() { SetDeviceState(kDeviceStateThinking); });
} else if (strcmp(state->valuestring, "start") == 0) {
audio_service_.ResetDecoder();
accepting_tts_audio_.store(true);
Schedule([this]() { Schedule([this]() {
aborted_ = false; aborted_ = false;
SetDeviceState(kDeviceStateSpeaking); SetDeviceState(kDeviceStateSpeaking);
}); });
} else if (strcmp(state->valuestring, "stop") == 0) { } else if (strcmp(state->valuestring, "stop") == 0) {
accepting_tts_audio_.store(false);
Schedule([this]() { Schedule([this]() {
if (GetDeviceState() == kDeviceStateSpeaking) { auto state = GetDeviceState();
if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
if (listening_mode_ == kListeningModeManualStop) { if (listening_mode_ == kListeningModeManualStop) {
SetDeviceState(kDeviceStateIdle); SetDeviceState(kDeviceStateIdle);
} else { } else {
@ -586,9 +606,7 @@ void Application::InitializeProtocol() {
ESP_LOGI(TAG, "System command: %s", command->valuestring); ESP_LOGI(TAG, "System command: %s", command->valuestring);
if (strcmp(command->valuestring, "reboot") == 0) { if (strcmp(command->valuestring, "reboot") == 0) {
// Do a reboot if user requests a OTA update // Do a reboot if user requests a OTA update
Schedule([this]() { Schedule([this]() { Reboot(); });
Reboot();
});
} else { } else {
ESP_LOGW(TAG, "Unknown system command: %s", command->valuestring); ESP_LOGW(TAG, "Unknown system command: %s", command->valuestring);
} }
@ -598,7 +616,8 @@ void Application::InitializeProtocol() {
auto message = cJSON_GetObjectItem(root, "message"); auto message = cJSON_GetObjectItem(root, "message");
auto emotion = cJSON_GetObjectItem(root, "emotion"); auto emotion = cJSON_GetObjectItem(root, "emotion");
if (cJSON_IsString(status) && cJSON_IsString(message) && cJSON_IsString(emotion)) { if (cJSON_IsString(status) && cJSON_IsString(message) && cJSON_IsString(emotion)) {
Alert(status->valuestring, message->valuestring, emotion->valuestring, Lang::Sounds::OGG_VIBRATION); Alert(status->valuestring, message->valuestring, emotion->valuestring,
Lang::Sounds::OGG_VIBRATION);
} else { } else {
ESP_LOGW(TAG, "Alert command requires status, message and emotion"); ESP_LOGW(TAG, "Alert command requires status, message and emotion");
} }
@ -607,7 +626,8 @@ void Application::InitializeProtocol() {
auto payload = cJSON_GetObjectItem(root, "payload"); auto payload = cJSON_GetObjectItem(root, "payload");
ESP_LOGI(TAG, "Received custom message: %s", cJSON_PrintUnformatted(root)); ESP_LOGI(TAG, "Received custom message: %s", cJSON_PrintUnformatted(root));
if (cJSON_IsObject(payload)) { if (cJSON_IsObject(payload)) {
Schedule([this, display, payload_str = std::string(cJSON_PrintUnformatted(payload))]() { Schedule(
[this, display, payload_str = std::string(cJSON_PrintUnformatted(payload))]() {
display->SetChatMessage("system", payload_str.c_str()); display->SetChatMessage("system", payload_str.c_str());
}); });
} else { } else {
@ -627,18 +647,12 @@ void Application::ShowActivationCode(const std::string& code, const std::string&
char digit; char digit;
const std::string_view& sound; const std::string_view& sound;
}; };
static const std::array<digit_sound, 10> digit_sounds{{ static const std::array<digit_sound, 10> digit_sounds{
digit_sound{'0', Lang::Sounds::OGG_0}, {digit_sound{'0', Lang::Sounds::OGG_0}, digit_sound{'1', Lang::Sounds::OGG_1},
digit_sound{'1', Lang::Sounds::OGG_1}, digit_sound{'2', Lang::Sounds::OGG_2}, digit_sound{'3', Lang::Sounds::OGG_3},
digit_sound{'2', Lang::Sounds::OGG_2}, digit_sound{'4', Lang::Sounds::OGG_4}, digit_sound{'5', Lang::Sounds::OGG_5},
digit_sound{'3', Lang::Sounds::OGG_3}, digit_sound{'6', Lang::Sounds::OGG_6}, digit_sound{'7', Lang::Sounds::OGG_7},
digit_sound{'4', Lang::Sounds::OGG_4}, digit_sound{'8', Lang::Sounds::OGG_8}, digit_sound{'9', Lang::Sounds::OGG_9}}};
digit_sound{'5', Lang::Sounds::OGG_5},
digit_sound{'6', Lang::Sounds::OGG_6},
digit_sound{'7', Lang::Sounds::OGG_7},
digit_sound{'8', Lang::Sounds::OGG_8},
digit_sound{'9', Lang::Sounds::OGG_9}
}};
// This sentence uses 9KB of SRAM, so we need to wait for it to finish // This sentence uses 9KB of SRAM, so we need to wait for it to finish
Alert(Lang::Strings::ACTIVATION, message.c_str(), "link", Lang::Sounds::OGG_ACTIVATION); Alert(Lang::Strings::ACTIVATION, message.c_str(), "link", Lang::Sounds::OGG_ACTIVATION);
@ -652,7 +666,8 @@ void Application::ShowActivationCode(const std::string& code, const std::string&
} }
} }
void Application::Alert(const char* status, const char* message, const char* emotion, const std::string_view& sound) { void Application::Alert(const char* status, const char* message, const char* emotion,
const std::string_view& sound) {
ESP_LOGW(TAG, "Alert [%s] %s: %s", emotion, status, message); ESP_LOGW(TAG, "Alert [%s] %s: %s", emotion, status, message);
auto display = Board::GetInstance().GetDisplay(); auto display = Board::GetInstance().GetDisplay();
display->SetStatus(status); display->SetStatus(status);
@ -672,30 +687,43 @@ void Application::DismissAlert() {
} }
} }
void Application::ToggleChatState() { void Application::ToggleChatState() { ToggleChatStateForMode(kChatAgentModeNormal, false); }
vision_text_mode_enabled_.store(false);
void Application::ToggleChatStateWithVision() {
ToggleChatStateForMode(kChatAgentModeNormal, true);
}
void Application::ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled) {
chat_agent_mode_.store(agent_mode);
vision_text_mode_enabled_.store(vision_enabled);
vision_frame_sent_for_current_listen_.store(false);
xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT); xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
} }
void Application::ToggleChatStateWithVision() { bool Application::IsVisionTextModeEnabled() const { return vision_text_mode_enabled_.load(); }
vision_text_mode_enabled_.store(true);
xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT); const char* Application::GetChatAgentModeName() const {
return chat_agent_mode_.load() == kChatAgentModeBeaver ? "beaver" : "normal";
}
const char* Application::GetChatModeName() const {
bool vision_enabled = vision_text_mode_enabled_.load();
if (chat_agent_mode_.load() == kChatAgentModeBeaver) {
return vision_enabled ? "vision-beaver" : "beaver";
}
return vision_enabled ? "vision-normal" : "normal";
} }
void Application::StartListening() { void Application::StartListening() {
vision_text_mode_enabled_.store(false); vision_text_mode_enabled_.store(false);
vision_frame_sent_for_current_listen_.store(false);
xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING); xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING);
} }
void Application::StopListening() { void Application::StopListening() { xEventGroupSetBits(event_group_, MAIN_EVENT_STOP_LISTENING); }
xEventGroupSetBits(event_group_, MAIN_EVENT_STOP_LISTENING);
}
void Application::HandleToggleChatEvent() { void Application::HandleToggleChatEvent() {
auto state = GetDeviceState(); auto state = GetDeviceState();
if (state != kDeviceStateIdle) {
vision_text_mode_enabled_.store(false);
}
if (state == kDeviceStateActivating) { if (state == kDeviceStateActivating) {
SetDeviceState(kDeviceStateIdle); SetDeviceState(kDeviceStateIdle);
@ -717,17 +745,22 @@ void Application::HandleToggleChatEvent() {
if (state == kDeviceStateIdle) { if (state == kDeviceStateIdle) {
ListeningMode mode = GetDefaultListeningMode(); ListeningMode mode = GetDefaultListeningMode();
if (!protocol_->IsAudioChannelOpened()) { bool agent_mode_changed = chat_agent_mode_.load() != active_chat_agent_mode_.load();
bool vision_mode_changed =
vision_text_mode_enabled_.load() != active_vision_text_mode_enabled_.load();
if (!protocol_->IsAudioChannelOpened() || agent_mode_changed || vision_mode_changed) {
if (protocol_->IsAudioChannelOpened()) {
protocol_->CloseAudioChannel();
}
SetDeviceState(kDeviceStateConnecting); SetDeviceState(kDeviceStateConnecting);
// Schedule to let the state change be processed first (UI update) // Schedule to let the state change be processed first (UI update)
Schedule([this, mode]() { Schedule([this, mode]() { ContinueOpenAudioChannel(mode); });
ContinueOpenAudioChannel(mode);
});
return; return;
} }
SetListeningMode(mode); SetListeningMode(mode);
} else if (state == kDeviceStateSpeaking) { } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
AbortSpeaking(kAbortReasonNone); AbortSpeaking(kAbortReasonNone);
SetListeningMode(GetDefaultListeningMode());
} else if (state == kDeviceStateListening) { } else if (state == kDeviceStateListening) {
protocol_->CloseAudioChannel(); protocol_->CloseAudioChannel();
} }
@ -749,6 +782,8 @@ void Application::ContinueOpenAudioChannel(ListeningMode mode) {
} }
} }
active_chat_agent_mode_.store(chat_agent_mode_.load());
active_vision_text_mode_enabled_.store(vision_text_mode_enabled_.load());
SetListeningMode(mode); SetListeningMode(mode);
} }
@ -773,13 +808,11 @@ void Application::HandleStartListeningEvent() {
if (!protocol_->IsAudioChannelOpened()) { if (!protocol_->IsAudioChannelOpened()) {
SetDeviceState(kDeviceStateConnecting); SetDeviceState(kDeviceStateConnecting);
// Schedule to let the state change be processed first (UI update) // Schedule to let the state change be processed first (UI update)
Schedule([this]() { Schedule([this]() { ContinueOpenAudioChannel(kListeningModeManualStop); });
ContinueOpenAudioChannel(kListeningModeManualStop);
});
return; return;
} }
SetListeningMode(kListeningModeManualStop); SetListeningMode(kListeningModeManualStop);
} else if (state == kDeviceStateSpeaking) { } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
AbortSpeaking(kAbortReasonNone); AbortSpeaking(kAbortReasonNone);
SetListeningMode(kListeningModeManualStop); SetListeningMode(kListeningModeManualStop);
} }
@ -817,17 +850,14 @@ void Application::HandleWakeWordDetectedEvent() {
SetDeviceState(kDeviceStateConnecting); SetDeviceState(kDeviceStateConnecting);
// Schedule to let the state change be processed first (UI update), // Schedule to let the state change be processed first (UI update),
// then continue with OpenAudioChannel which may block for ~1 second // then continue with OpenAudioChannel which may block for ~1 second
Schedule([this, wake_word]() { Schedule([this, wake_word]() { ContinueWakeWordInvoke(wake_word); });
ContinueWakeWordInvoke(wake_word);
});
return; return;
} }
// Channel already opened, continue directly // Channel already opened, continue directly
ContinueWakeWordInvoke(wake_word); ContinueWakeWordInvoke(wake_word);
} else if (state == kDeviceStateSpeaking || state == kDeviceStateListening) { } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking ||
state == kDeviceStateListening) {
AbortSpeaking(kAbortReasonWakeWordDetected); AbortSpeaking(kAbortReasonWakeWordDetected);
// Clear send queue to avoid sending residues to server
while (audio_service_.PopPacketFromSendQueue());
if (state == kDeviceStateListening) { if (state == kDeviceStateListening) {
protocol_->SendStartListening(GetDefaultListeningMode()); protocol_->SendStartListening(GetDefaultListeningMode());
@ -892,6 +922,7 @@ void Application::HandleStateChangedEvent() {
switch (new_state) { switch (new_state) {
case kDeviceStateUnknown: case kDeviceStateUnknown:
case kDeviceStateIdle: case kDeviceStateIdle:
vision_frame_sent_for_current_listen_.store(false);
display->SetStatus(Lang::Strings::STANDBY); display->SetStatus(Lang::Strings::STANDBY);
display->ClearChatMessages(); // Clear messages first display->ClearChatMessages(); // Clear messages first
display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count) display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count)
@ -904,24 +935,19 @@ void Application::HandleStateChangedEvent() {
display->SetChatMessage("system", ""); display->SetChatMessage("system", "");
break; break;
case kDeviceStateListening: case kDeviceStateListening:
vad_speaking_.store(false);
vision_frame_sent_for_current_listen_.store(false);
display->SetStatus(Lang::Strings::LISTENING); display->SetStatus(Lang::Strings::LISTENING);
display->SetEmotion("neutral"); display->SetEmotion("neutral");
// Make sure the audio processor is running // Re-entering listening after an interrupt must restart the capture path even if the
if (play_popup_on_listening_ || !audio_service_.IsAudioProcessorRunning()) { // processor task is still marked running, otherwise realtime mode can show Listening
// For auto mode, wait for playback queue to be empty before enabling voice processing // while no fresh mic frames are sent.
// This prevents audio truncation when STOP arrives late due to network jitter
if (listening_mode_ == kListeningModeAutoStop) { if (listening_mode_ == kListeningModeAutoStop) {
audio_service_.WaitForPlaybackQueueEmpty(); audio_service_.WaitForPlaybackQueueEmpty();
} }
if (vision_text_mode_enabled_.load()) {
SendCurrentVisionFrame();
}
// Send the start listening command
protocol_->SendStartListening(listening_mode_); protocol_->SendStartListening(listening_mode_);
audio_service_.EnableVoiceProcessing(true); audio_service_.EnableVoiceProcessing(true);
}
#ifdef CONFIG_WAKE_WORD_DETECTION_IN_LISTENING #ifdef CONFIG_WAKE_WORD_DETECTION_IN_LISTENING
// Enable wake word detection in listening mode (configured via Kconfig) // Enable wake word detection in listening mode (configured via Kconfig)
@ -937,6 +963,16 @@ void Application::HandleStateChangedEvent() {
audio_service_.PlaySound(Lang::Sounds::OGG_POPUP); audio_service_.PlaySound(Lang::Sounds::OGG_POPUP);
} }
break; break;
case kDeviceStateThinking:
vad_speaking_.store(false);
display->SetStatus(Lang::Strings::THINKING);
display->SetEmotion("thinking");
if (listening_mode_ != kListeningModeRealtime) {
audio_service_.EnableVoiceProcessing(false);
audio_service_.EnableWakeWordDetection(audio_service_.IsAfeWakeWord());
}
break;
case kDeviceStateSpeaking: case kDeviceStateSpeaking:
display->SetStatus(Lang::Strings::SPEAKING); display->SetStatus(Lang::Strings::SPEAKING);
@ -945,7 +981,9 @@ void Application::HandleStateChangedEvent() {
// Only AFE wake word can be detected in speaking mode // Only AFE wake word can be detected in speaking mode
audio_service_.EnableWakeWordDetection(audio_service_.IsAfeWakeWord()); audio_service_.EnableWakeWordDetection(audio_service_.IsAfeWakeWord());
} }
if (!accepting_tts_audio_.load()) {
audio_service_.ResetDecoder(); audio_service_.ResetDecoder();
}
break; break;
case kDeviceStateWifiConfiguring: case kDeviceStateWifiConfiguring:
audio_service_.EnableVoiceProcessing(false); audio_service_.EnableVoiceProcessing(false);
@ -957,24 +995,25 @@ void Application::HandleStateChangedEvent() {
} }
} }
void Application::SendCurrentVisionFrame() { bool Application::SendCurrentVisionFrame() {
if (!protocol_ || !protocol_->IsAudioChannelOpened()) { if (!protocol_ || !protocol_->IsAudioChannelOpened()) {
return; return false;
} }
auto camera = Board::GetInstance().GetCamera(); auto camera = Board::GetInstance().GetCamera();
if (camera == nullptr) { if (camera == nullptr) {
return; return false;
} }
std::string jpeg_data; std::string jpeg_data;
if (!camera->CaptureToJpeg(jpeg_data, false)) { if (!camera->CaptureToJpeg(jpeg_data, true)) {
ESP_LOGW(TAG, "Failed to capture vision frame"); ESP_LOGW(TAG, "Failed to capture vision frame");
return; return false;
} }
protocol_->SendVisionFrame(jpeg_data); protocol_->SendVisionFrame(jpeg_data);
ESP_LOGI(TAG, "Sent vision frame, size=%u bytes", static_cast<unsigned>(jpeg_data.size())); ESP_LOGI(TAG, "Sent vision frame, size=%u bytes", static_cast<unsigned>(jpeg_data.size()));
return true;
} }
void Application::Schedule(std::function<void()>&& callback) { void Application::Schedule(std::function<void()>&& callback) {
@ -988,6 +1027,8 @@ void Application::Schedule(std::function<void()>&& callback) {
void Application::AbortSpeaking(AbortReason reason) { void Application::AbortSpeaking(AbortReason reason) {
ESP_LOGI(TAG, "Abort speaking"); ESP_LOGI(TAG, "Abort speaking");
aborted_ = true; aborted_ = true;
accepting_tts_audio_.store(false);
audio_service_.ResetDecoder();
if (protocol_) { if (protocol_) {
protocol_->SendAbortSpeaking(reason); protocol_->SendAbortSpeaking(reason);
} }
@ -995,6 +1036,8 @@ void Application::AbortSpeaking(AbortReason reason) {
void Application::SetListeningMode(ListeningMode mode) { void Application::SetListeningMode(ListeningMode mode) {
listening_mode_ = mode; listening_mode_ = mode;
vad_speaking_.store(false);
vision_frame_sent_for_current_listen_.store(false);
SetDeviceState(kDeviceStateListening); SetDeviceState(kDeviceStateListening);
} }
@ -1029,7 +1072,8 @@ bool Application::UpgradeFirmware(const std::string& url, const std::string& ver
} }
ESP_LOGI(TAG, "Starting firmware upgrade from URL: %s", upgrade_url.c_str()); ESP_LOGI(TAG, "Starting firmware upgrade from URL: %s", upgrade_url.c_str());
Alert(Lang::Strings::OTA_UPGRADE, Lang::Strings::UPGRADING, "download", Lang::Sounds::OGG_UPGRADE); Alert(Lang::Strings::OTA_UPGRADE, Lang::Strings::UPGRADING, "download",
Lang::Sounds::OGG_UPGRADE);
vTaskDelay(pdMS_TO_TICKS(3000)); vTaskDelay(pdMS_TO_TICKS(3000));
SetDeviceState(kDeviceStateUpgrading); SetDeviceState(kDeviceStateUpgrading);
@ -1051,10 +1095,12 @@ bool Application::UpgradeFirmware(const std::string& url, const std::string& ver
if (!upgrade_success) { if (!upgrade_success) {
// Upgrade failed, restart audio service and continue running // Upgrade failed, restart audio service and continue running
ESP_LOGE(TAG, "Firmware upgrade failed, restarting audio service and continuing operation..."); ESP_LOGE(TAG,
"Firmware upgrade failed, restarting audio service and continuing operation...");
audio_service_.Start(); // Restart audio service audio_service_.Start(); // Restart audio service
board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); // Restore power save level board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); // Restore power save level
Alert(Lang::Strings::ERROR, Lang::Strings::UPGRADE_FAILED, "circle_xmark", Lang::Sounds::OGG_EXCLAMATION); Alert(Lang::Strings::ERROR, Lang::Strings::UPGRADE_FAILED, "circle_xmark",
Lang::Sounds::OGG_EXCLAMATION);
vTaskDelay(pdMS_TO_TICKS(3000)); vTaskDelay(pdMS_TO_TICKS(3000));
return false; return false;
} else { } else {
@ -1080,17 +1126,13 @@ void Application::WakeWordInvoke(const std::string& wake_word) {
if (!protocol_->IsAudioChannelOpened()) { if (!protocol_->IsAudioChannelOpened()) {
SetDeviceState(kDeviceStateConnecting); SetDeviceState(kDeviceStateConnecting);
// Schedule to let the state change be processed first (UI update) // Schedule to let the state change be processed first (UI update)
Schedule([this, wake_word]() { Schedule([this, wake_word]() { ContinueWakeWordInvoke(wake_word); });
ContinueWakeWordInvoke(wake_word);
});
return; return;
} }
// Channel already opened, continue directly // Channel already opened, continue directly
ContinueWakeWordInvoke(wake_word); ContinueWakeWordInvoke(wake_word);
} else if (state == kDeviceStateSpeaking) { } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
Schedule([this]() { Schedule([this]() { AbortSpeaking(kAbortReasonNone); });
AbortSpeaking(kAbortReasonNone);
});
} else if (state == kDeviceStateListening) { } else if (state == kDeviceStateListening) {
Schedule([this]() { Schedule([this]() {
if (protocol_) { if (protocol_) {
@ -1160,9 +1202,7 @@ void Application::SetAecMode(AecMode mode) {
}); });
} }
void Application::PlaySound(const std::string_view& sound) { void Application::PlaySound(const std::string_view& sound) { audio_service_.PlaySound(sound); }
audio_service_.PlaySound(sound);
}
void Application::ResetProtocol() { void Application::ResetProtocol() {
Schedule([this]() { Schedule([this]() {

View File

@ -41,6 +41,11 @@ enum AecMode {
kAecOnServerSide, kAecOnServerSide,
}; };
enum ChatAgentMode {
kChatAgentModeNormal,
kChatAgentModeBeaver,
};
class Application { class Application {
public: public:
static Application& GetInstance() { static Application& GetInstance() {
@ -93,6 +98,11 @@ public:
*/ */
void ToggleChatState(); void ToggleChatState();
void ToggleChatStateWithVision(); void ToggleChatStateWithVision();
void ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled);
bool IsVisionTextModeEnabled() const;
ChatAgentMode GetChatAgentMode() const { return chat_agent_mode_.load(); }
const char* GetChatAgentModeName() const;
const char* GetChatModeName() const;
/** /**
* Start listening (event-based, thread-safe) * Start listening (event-based, thread-safe)
@ -146,7 +156,13 @@ private:
bool aborted_ = false; bool aborted_ = false;
bool assets_version_checked_ = false; bool assets_version_checked_ = false;
bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening
std::atomic<ChatAgentMode> chat_agent_mode_ = kChatAgentModeNormal;
std::atomic<ChatAgentMode> active_chat_agent_mode_ = kChatAgentModeNormal;
std::atomic<bool> vision_text_mode_enabled_ = false; std::atomic<bool> vision_text_mode_enabled_ = false;
std::atomic<bool> active_vision_text_mode_enabled_ = false;
std::atomic<bool> vad_speaking_ = false;
std::atomic<bool> vision_frame_sent_for_current_listen_ = false;
std::atomic<bool> accepting_tts_audio_ = false;
int clock_ticks_ = 0; int clock_ticks_ = 0;
TaskHandle_t activation_task_handle_ = nullptr; TaskHandle_t activation_task_handle_ = nullptr;
@ -162,7 +178,7 @@ private:
void HandleWakeWordDetectedEvent(); void HandleWakeWordDetectedEvent();
void ContinueOpenAudioChannel(ListeningMode mode); void ContinueOpenAudioChannel(ListeningMode mode);
void ContinueWakeWordInvoke(const std::string& wake_word); void ContinueWakeWordInvoke(const std::string& wake_word);
void SendCurrentVisionFrame(); bool SendCurrentVisionFrame();
// Activation task (runs in background) // Activation task (runs in background)
void ActivationTask(); void ActivationTask();

View File

@ -26,6 +26,7 @@
"CONNECTION_SUCCESSFUL": "Connection Successful", "CONNECTION_SUCCESSFUL": "Connection Successful",
"CONNECTED_TO": "Connected to ", "CONNECTED_TO": "Connected to ",
"LISTENING": "Listening...", "LISTENING": "Listening...",
"THINKING": "Thinking...",
"SPEAKING": "Speaking...", "SPEAKING": "Speaking...",
"SERVER_NOT_FOUND": "Looking for available service", "SERVER_NOT_FOUND": "Looking for available service",
"SERVER_NOT_CONNECTED": "Unable to connect to service, please try again later", "SERVER_NOT_CONNECTED": "Unable to connect to service, please try again later",

View File

@ -23,6 +23,7 @@
"CONNECTING": "连接中...", "CONNECTING": "连接中...",
"CONNECTED_TO": "已连接 ", "CONNECTED_TO": "已连接 ",
"LISTENING": "聆听中...", "LISTENING": "聆听中...",
"THINKING": "思考中...",
"SPEAKING": "说话中...", "SPEAKING": "说话中...",
"SERVER_NOT_FOUND": "正在寻找可用服务", "SERVER_NOT_FOUND": "正在寻找可用服务",
"SERVER_NOT_CONNECTED": "无法连接服务,请稍后再试", "SERVER_NOT_CONNECTED": "无法连接服务,请稍后再试",

View File

@ -579,6 +579,7 @@ void AudioService::EnableWakeWordDetection(bool enable) {
void AudioService::EnableVoiceProcessing(bool enable) { void AudioService::EnableVoiceProcessing(bool enable) {
ESP_LOGD(TAG, "%s voice processing", enable ? "Enabling" : "Disabling"); ESP_LOGD(TAG, "%s voice processing", enable ? "Enabling" : "Disabling");
if (enable) { if (enable) {
bool was_running = IsAudioProcessorRunning();
if (!audio_processor_initialized_) { if (!audio_processor_initialized_) {
audio_processor_->Initialize(codec_, OPUS_FRAME_DURATION_MS, models_list_); audio_processor_->Initialize(codec_, OPUS_FRAME_DURATION_MS, models_list_);
audio_processor_initialized_ = true; audio_processor_initialized_ = true;
@ -586,7 +587,7 @@ void AudioService::EnableVoiceProcessing(bool enable) {
/* We should make sure no audio is playing */ /* We should make sure no audio is playing */
ResetDecoder(); ResetDecoder();
audio_input_need_warmup_ = true; audio_input_need_warmup_ = !was_running;
// Reset input resampler to clear cached data from previous mode (e.g. WakeWord) // Reset input resampler to clear cached data from previous mode (e.g. WakeWord)
// This prevents buffer overflow when switching between different feed sizes // This prevents buffer overflow when switching between different feed sizes
{ {

View File

@ -0,0 +1,177 @@
#include "background_capture_service.h"
#include "board.h"
#include "camera.h"
#include <algorithm>
#include <esp_heap_caps.h>
#include <esp_log.h>
#define TAG "BgCapture"
BackgroundCaptureService::BackgroundCaptureService() = default;
BackgroundCaptureService::~BackgroundCaptureService() {
Stop();
}
void BackgroundCaptureService::Start() {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
if (running_.exchange(true)) {
return;
}
auto result = xTaskCreate(
&BackgroundCaptureService::TaskEntry,
"bg_capture",
CONFIG_BACKGROUND_CAPTURE_TASK_STACK_SIZE,
this,
CONFIG_BACKGROUND_CAPTURE_TASK_PRIORITY,
&task_handle_);
if (result != pdPASS) {
running_.store(false);
task_handle_ = nullptr;
ESP_LOGE(TAG, "Failed to create background capture task");
}
#endif
}
void BackgroundCaptureService::Stop() {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
if (!running_.exchange(false)) {
return;
}
while (task_handle_ != nullptr) {
vTaskDelay(pdMS_TO_TICKS(20));
}
#endif
}
void BackgroundCaptureService::TaskEntry(void* arg) {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
auto* service = static_cast<BackgroundCaptureService*>(arg);
service->Run();
service->task_handle_ = nullptr;
#else
(void)arg;
#endif
vTaskDelete(nullptr);
}
void BackgroundCaptureService::Run() {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
ESP_LOGI(TAG, "Background capture task started");
while (running_.load()) {
if (!CaptureAndSendFrame()) {
consecutive_failures_++;
auto delay_ms = GetFailureDelayMs();
ESP_LOGW(TAG, "Background capture retry in %u ms, failures=%u",
delay_ms, consecutive_failures_);
vTaskDelay(pdMS_TO_TICKS(delay_ms));
continue;
}
consecutive_failures_ = 0;
vTaskDelay(pdMS_TO_TICKS(CONFIG_BACKGROUND_CAPTURE_FRAME_INTERVAL_MS));
}
ESP_LOGI(TAG, "Background capture task stopped");
#endif
}
bool BackgroundCaptureService::CaptureAndSendFrame() {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
const size_t free_internal_heap = heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
if (free_internal_heap < CONFIG_BACKGROUND_CAPTURE_MIN_FREE_INTERNAL_HEAP) {
ESP_LOGW(TAG, "Skip background capture, low internal heap: free=%u threshold=%u",
static_cast<unsigned>(free_internal_heap),
static_cast<unsigned>(CONFIG_BACKGROUND_CAPTURE_MIN_FREE_INTERNAL_HEAP));
return false;
}
auto camera = Board::GetInstance().GetCamera();
if (camera == nullptr) {
ESP_LOGW(TAG, "No camera available for background capture");
return false;
}
std::string jpeg_data;
if (!camera->CaptureToJpeg(jpeg_data, false)) {
ESP_LOGW(TAG, "Failed to capture background frame");
return false;
}
if (jpeg_data.empty()) {
ESP_LOGW(TAG, "Captured empty background frame");
return false;
}
return UploadJpegFrame(jpeg_data);
#else
return false;
#endif
}
uint32_t BackgroundCaptureService::GetFailureDelayMs() const {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
const uint32_t base_delay_ms = CONFIG_BACKGROUND_CAPTURE_RETRY_INTERVAL_MS;
const uint32_t max_delay_ms = CONFIG_BACKGROUND_CAPTURE_MAX_BACKOFF_MS;
const uint32_t shift = std::min<uint32_t>(consecutive_failures_ - 1, 4);
return std::min<uint32_t>(base_delay_ms << shift, max_delay_ms);
#else
return 0;
#endif
}
bool BackgroundCaptureService::UploadJpegFrame(const std::string& jpeg_data) {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
const std::string url = CONFIG_BACKGROUND_CAPTURE_UPLOAD_URL;
if (url.empty()) {
ESP_LOGI(TAG, "Captured background frame: %u bytes", static_cast<unsigned>(jpeg_data.size()));
return true;
}
auto network = Board::GetInstance().GetNetwork();
if (network == nullptr) {
ESP_LOGW(TAG, "No network available for background upload");
return false;
}
const std::string boundary = "----XIAOZHI_BACKGROUND_CAPTURE_BOUNDARY";
auto http = network->CreateHttp(3);
http->SetHeader("Content-Type", "multipart/form-data; boundary=" + boundary);
if (!http->Open("POST", url)) {
ESP_LOGW(TAG, "Failed to open background upload URL: %s", url.c_str());
return false;
}
std::string file_header;
file_header += "--" + boundary + "\r\n";
file_header += "Content-Disposition: form-data; name=\"file\"; filename=\"frame.jpg\"\r\n";
file_header += "Content-Type: image/jpeg\r\n\r\n";
http->Write(file_header.c_str(), file_header.size());
http->Write(jpeg_data.data(), jpeg_data.size());
std::string footer;
footer += "\r\n--" + boundary + "--\r\n";
http->Write(footer.c_str(), footer.size());
http->Write("", 0);
const int status_code = http->GetStatusCode();
http->Close();
if (status_code < 200 || status_code >= 300) {
ESP_LOGW(TAG, "Background upload failed, status=%d", status_code);
return false;
}
ESP_LOGI(TAG, "Uploaded background frame: %u bytes", static_cast<unsigned>(jpeg_data.size()));
return true;
#else
(void)jpeg_data;
return false;
#endif
}

View File

@ -0,0 +1,32 @@
#ifndef BACKGROUND_CAPTURE_SERVICE_H
#define BACKGROUND_CAPTURE_SERVICE_H
#include <freertos/FreeRTOS.h>
#include <freertos/task.h>
#include <atomic>
#include <cstdint>
#include <string>
class BackgroundCaptureService {
public:
BackgroundCaptureService();
~BackgroundCaptureService();
void Start();
void Stop();
bool IsRunning() const { return running_.load(); }
private:
TaskHandle_t task_handle_ = nullptr;
std::atomic<bool> running_ = false;
uint32_t consecutive_failures_ = 0;
static void TaskEntry(void* arg);
void Run();
bool CaptureAndSendFrame();
bool UploadJpegFrame(const std::string& jpeg_data);
uint32_t GetFailureDelayMs() const;
};
#endif // BACKGROUND_CAPTURE_SERVICE_H

View File

@ -214,6 +214,9 @@ public:
case kDeviceStateSpeaking: case kDeviceStateSpeaking:
ctrl_->SetStatusColor(64, 0, 0); // red ctrl_->SetStatusColor(64, 0, 0); // red
break; break;
case kDeviceStateThinking:
ctrl_->SetStatusColor(0, 0, 64); // blue
break;
default: default:
ctrl_->SetStatusColor(0, 0, 64); // blue ctrl_->SetStatusColor(0, 0, 64); // blue
break; break;

View File

@ -203,7 +203,10 @@ void WifiBoard::EnterWifiConfigMode() {
auto& app = Application::GetInstance(); auto& app = Application::GetInstance();
auto state = app.GetDeviceState(); auto state = app.GetDeviceState();
if (state == kDeviceStateSpeaking || state == kDeviceStateListening || state == kDeviceStateIdle) { if (state == kDeviceStateSpeaking ||
state == kDeviceStateThinking ||
state == kDeviceStateListening ||
state == kDeviceStateIdle) {
// Reset protocol (close audio channel, reset protocol) // Reset protocol (close audio channel, reset protocol)
Application::GetInstance().ResetProtocol(); Application::GetInstance().ResetProtocol();

View File

@ -85,6 +85,13 @@ void ElectronEmojiDisplay::SetStatus(const char* status) {
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN); lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN); lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
return; return;
} else if (strcmp(status, Lang::Strings::THINKING) == 0) {
lv_obj_set_style_text_font(status_label_, text_font, 0);
lv_label_set_text(status_label_, status);
lv_obj_clear_flag(status_label_, LV_OBJ_FLAG_HIDDEN);
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
return;
} else if (strcmp(status, Lang::Strings::CONNECTING) == 0) { } else if (strcmp(status, Lang::Strings::CONNECTING) == 0) {
lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0); lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0);
lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标 lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标

View File

@ -155,6 +155,8 @@ void EmojiWidget::SetStatus(const char* status)
if (player_) { if (player_) {
if (strcmp(status, Lang::Strings::LISTENING) == 0) { if (strcmp(status, Lang::Strings::LISTENING) == 0) {
player_->StartPlayer("asking", true, 15); player_->StartPlayer("asking", true, 15);
} else if (strcmp(status, Lang::Strings::THINKING) == 0) {
player_->StartPlayer("thinking", true, 15);
} else if (strcmp(status, Lang::Strings::STANDBY) == 0) { } else if (strcmp(status, Lang::Strings::STANDBY) == 0) {
player_->StartPlayer("wake", true, 15); player_->StartPlayer("wake", true, 15);
} }

View File

@ -231,9 +231,9 @@ private:
// 如果当前是聆听状态,切换到待命状态 // 如果当前是聆听状态,切换到待命状态
ESP_LOGI(TAG, "从聆听状态切换到待命状态"); ESP_LOGI(TAG, "从聆听状态切换到待命状态");
app.ToggleChatState(); // 切换到待命状态 app.ToggleChatState(); // 切换到待命状态
} else if (current_state == kDeviceStateSpeaking) { } else if (current_state == kDeviceStateSpeaking || current_state == kDeviceStateThinking) {
// 如果当前是说话状态,终止说话并切换到待命状态 // 如果当前是说话或思考状态,终止并切换到待命状态
ESP_LOGI(TAG, "从说话状态切换到待命状态"); ESP_LOGI(TAG, "从说话/思考状态切换到待命状态");
app.ToggleChatState(); // 终止说话 app.ToggleChatState(); // 终止说话
} else { } else {
// 其他状态下只唤醒设备 // 其他状态下只唤醒设备

View File

@ -192,6 +192,7 @@ private:
void PollTouchpad() { void PollTouchpad() {
static bool was_touched = false; static bool was_touched = false;
static int64_t touch_start_time = 0; static int64_t touch_start_time = 0;
static int touch_start_x = -1;
const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值超过500ms视为长按 const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值超过500ms视为长按
ft6336_->UpdateTouchPoint(); ft6336_->UpdateTouchPoint();
@ -201,11 +202,14 @@ private:
if (touch_point.num > 0 && !was_touched) { if (touch_point.num > 0 && !was_touched) {
was_touched = true; was_touched = true;
touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒 touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒
touch_start_x = touch_point.x;
} }
// 检测触摸释放 // 检测触摸释放
else if (touch_point.num == 0 && was_touched) { else if (touch_point.num == 0 && was_touched) {
was_touched = false; was_touched = false;
int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time; int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time;
bool beaver_mode = touch_start_x >= DISPLAY_WIDTH / 2;
auto agent_mode = beaver_mode ? kChatAgentModeBeaver : kChatAgentModeNormal;
if (touch_duration < TOUCH_THRESHOLD_MS) { if (touch_duration < TOUCH_THRESHOLD_MS) {
auto& app = Application::GetInstance(); auto& app = Application::GetInstance();
@ -213,12 +217,12 @@ private:
EnterWifiConfigMode(); EnterWifiConfigMode();
return; return;
} }
ESP_LOGI(TAG, "Touch short: text-only mode"); ESP_LOGI(TAG, "Touch short: %s text-only mode", beaver_mode ? "beaver" : "normal");
app.ToggleChatState(); app.ToggleChatStateForMode(agent_mode, false);
} else { } else {
auto& app = Application::GetInstance(); auto& app = Application::GetInstance();
ESP_LOGI(TAG, "Touch long: vision+text mode"); ESP_LOGI(TAG, "Touch long: %s vision+text mode", beaver_mode ? "beaver" : "normal");
app.ToggleChatStateWithVision(); app.ToggleChatStateForMode(agent_mode, true);
} }
} }
} }
@ -344,18 +348,23 @@ private:
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_INITIAL_DELAY_MS)); vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_INITIAL_DELAY_MS));
while (true) { while (true) {
if (!Application::GetInstance().IsVisionTextModeEnabled()) {
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
continue;
}
if (board->camera_ == nullptr) { if (board->camera_ == nullptr) {
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS)); vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
continue; continue;
} }
if (board->camera_->CaptureBackground()) { if (board->camera_->Capture()) {
if (!has_logged_success) { if (!has_logged_success) {
ESP_LOGI(TAG, "Background vision sampler started"); ESP_LOGI(TAG, "Vision preview sampler started");
has_logged_success = true; has_logged_success = true;
} }
} else if (!has_logged_failure) { } else if (!has_logged_failure) {
ESP_LOGW(TAG, "Background vision sampler is waiting for camera"); ESP_LOGW(TAG, "Vision preview sampler is waiting for camera");
has_logged_failure = true; has_logged_failure = true;
} }

View File

@ -77,6 +77,13 @@ void OttoEmojiDisplay::SetStatus(const char* status) {
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN); lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN); lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
return; return;
} else if (strcmp(status, Lang::Strings::THINKING) == 0) {
lv_obj_set_style_text_font(status_label_, text_font, 0);
lv_label_set_text(status_label_, status);
lv_obj_clear_flag(status_label_, LV_OBJ_FLAG_HIDDEN);
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
return;
} else if (strcmp(status, Lang::Strings::CONNECTING) == 0) { } else if (strcmp(status, Lang::Strings::CONNECTING) == 0) {
lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0); lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0);
lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标 lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标

View File

@ -1,7 +1,9 @@
import asyncio import asyncio
import base64 import base64
import contextlib
import json import json
import os import os
import re
import shutil import shutil
import struct import struct
import sys import sys
@ -28,33 +30,65 @@ TOKEN_URL = "http://172.19.0.240:8000/getToken"
LIVEKIT_WS_URL = "ws://172.19.0.240:7880" LIVEKIT_WS_URL = "ws://172.19.0.240:7880"
ROOM_PREFIX = "test-livekit" ROOM_PREFIX = "test-livekit"
IDENTITY_PREFIX = "uv-livekit" IDENTITY_PREFIX = "uv-livekit"
AGENT_NAME = "my-agent" LEGACY_AGENT_NAME = os.getenv("LIVEKIT_AGENT_NAME", "normal-agent")
DEFAULT_AGENT_MODE = os.getenv("LIVEKIT_DEFAULT_AGENT_MODE", "normal").strip().lower()
AGENT_NAMES = {
"normal": os.getenv("LIVEKIT_NORMAL_AGENT_NAME", LEGACY_AGENT_NAME),
"beaver": os.getenv("LIVEKIT_BEAVER_AGENT_NAME", "beaver-agent"),
}
CHAT_MODE_AGENT_NAMES = {
"normal": AGENT_NAMES["normal"],
"beaver": AGENT_NAMES["beaver"],
"vision-normal": os.getenv("LIVEKIT_VISION_NORMAL_AGENT_NAME", "vision-normal-agent"),
"vision-beaver": os.getenv("LIVEKIT_VISION_BEAVER_AGENT_NAME", "vision-beaver-agent"),
}
CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0")) CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0"))
AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0")) AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
WS_PORT = 8080 WS_PORT = 8080
WS_MAX_QUEUE = int(os.getenv("BRIDGE_WS_MAX_QUEUE", "128"))
WS_MAX_SIZE = int(os.getenv("BRIDGE_WS_MAX_SIZE", str(8 * 1024 * 1024)))
AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower() AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower()
PROJECT_ROOT = Path(__file__).resolve().parent.parent PROJECT_ROOT = Path(__file__).resolve().parent.parent
VISION_FRAME_SAVE_DIR = Path(os.getenv("VISION_FRAME_SAVE_DIR", str(PROJECT_ROOT / "vision_frames"))) VISION_FRAME_SAVE_DIR = Path(os.getenv("VISION_FRAME_SAVE_DIR", str(PROJECT_ROOT / "vision_frames")))
INPUT_SAMPLE_RATE = 16000 INPUT_SAMPLE_RATE = int(os.getenv("BRIDGE_INPUT_SAMPLE_RATE", "16000"))
OUTPUT_SAMPLE_RATE = 24000 OUTPUT_SAMPLE_RATE = int(os.getenv("BRIDGE_OUTPUT_SAMPLE_RATE", "24000"))
INPUT_FRAME_DURATION_MS = 20 INPUT_FRAME_DURATION_MS = int(os.getenv("BRIDGE_INPUT_FRAME_DURATION_MS", "60"))
INPUT_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * INPUT_FRAME_DURATION_MS // 1000 INPUT_MAX_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * 120 // 1000
INPUT_MAX_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * 60 // 1000 OUTPUT_FRAME_DURATION_MS = int(os.getenv("BRIDGE_OUTPUT_FRAME_DURATION_MS", "60"))
OUTPUT_FRAME_DURATION_MS = 20 AUDIO_STATS_INTERVAL_SECONDS = float(os.getenv("BRIDGE_AUDIO_STATS_INTERVAL_SECONDS", "5.0"))
OUTPUT_SAMPLES_PER_OPUS_FRAME = OUTPUT_SAMPLE_RATE * OUTPUT_FRAME_DURATION_MS // 1000 DOWNLINK_SEND_GAP_WARN_MS = float(os.getenv("BRIDGE_DOWNLINK_SEND_GAP_WARN_MS", "180.0"))
TTS_IDLE_TIMEOUT_SECONDS = 0.25 UPLINK_CAPTURE_TIMEOUT_SECONDS = float(os.getenv("BRIDGE_UPLINK_CAPTURE_TIMEOUT_SECONDS", "0.25"))
TTS_IDLE_TIMEOUT_SECONDS = float(os.getenv("TTS_IDLE_TIMEOUT_SECONDS", "1.2"))
TTS_MIN_ACTIVE_SECONDS = float(os.getenv("TTS_MIN_ACTIVE_SECONDS", "1.0"))
TTS_SILENCE_PEAK_THRESHOLD = 96 TTS_SILENCE_PEAK_THRESHOLD = 96
TTS_PRE_ROLL_MS = 80 TTS_PRE_ROLL_MS = int(os.getenv("TTS_PRE_ROLL_MS", "480"))
TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1 TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = int(os.getenv("TTS_START_CONSECUTIVE_AUDIBLE_FRAMES", "1"))
TTS_INTERRUPT_SILENCE_FRAMES = 3 TTS_INTERRUPT_SILENCE_FRAMES = 3
INTERRUPT_TOPIC = "lk.interrupt" INTERRUPT_TOPIC = "lk.interrupt"
VISION_FRAME_TOPIC = "vision.frame" VISION_FRAME_TOPIC = "vision.frame"
AGENT_STATE_ATTRIBUTE = "lk.agent.state"
TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;" TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;"
TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18")) TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18")) TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
TTS_DISPLAY_SCROLL_GAP = " " TTS_DISPLAY_SCROLL_GAP = " "
TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8")) TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8"))
TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS = float(
os.getenv("TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS", "0.25")
)
TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS = float(
os.getenv("TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS", "8.0")
)
EMOTION_TEXT_PATTERN = re.compile(
r"^\s*<?\s*emotion\s*=\s*([^\s>,;]+)\s*>?[\s,;]*(.*)$",
re.DOTALL,
)
EMOTION_TEST_SEQUENCE = [
emotion.strip()
for emotion in os.getenv("BRIDGE_EMOTION_TEST_SEQUENCE", "").split(",")
if emotion.strip()
]
EMOTION_TEST_INTERVAL_SECONDS = float(os.getenv("BRIDGE_EMOTION_TEST_INTERVAL_SECONDS", "2.0"))
@dataclass @dataclass
@ -64,28 +98,39 @@ class DeviceSession:
protocol_version: int protocol_version: int
room_name: str room_name: str
identity: str identity: str
chat_mode: str
agent_mode: str
agent_name: str
vision_enabled: bool
room: rtc.Room room: rtc.Room
mic_source: AudioSource mic_source: AudioSource
agent_ready: asyncio.Event agent_ready: asyncio.Event
forwarding_tracks: dict[str, asyncio.Task[Any]] = field(default_factory=dict) forwarding_tracks: dict[str, asyncio.Task[Any]] = field(default_factory=dict)
tts_active: bool = False tts_active: bool = False
tts_thinking: bool = False
tts_idle_task: Optional[asyncio.Task] = None tts_idle_task: Optional[asyncio.Task] = None
tts_display_task: Optional[asyncio.Task] = None tts_display_task: Optional[asyncio.Task] = None
tts_stream_id: int = 0 tts_stream_id: int = 0
tts_transcript_text: str = "" tts_transcript_text: str = ""
tts_display_text: str = "" tts_display_text: str = ""
tts_display_final: bool = False tts_display_final: bool = False
tts_emotion: str = ""
tts_suppressed_until: float = 0.0 tts_suppressed_until: float = 0.0
tts_started_at: float = 0.0
tts_last_audible_at: float = 0.0
tts_waiting_for_user_audio_after_interrupt: bool = False
last_interrupt_time: float = 0.0
last_uplink_audible_time: float = 0.0
agent_dispatch_task: Optional[asyncio.Task] = None agent_dispatch_task: Optional[asyncio.Task] = None
closed: bool = False closed: bool = False
captured_frame_count: int = 0 captured_frame_count: int = 0
first_capture_log_time: float = 0.0 first_capture_log_time: float = 0.0
async def fetch_token(room_name: str, identity: str) -> str: async def fetch_token(room_name: str, identity: str, agent_name: str) -> str:
params = {"room": room_name, "identity": identity} params = {"room": room_name, "identity": identity}
if AGENT_DISPATCH_MODE == "token": if AGENT_DISPATCH_MODE == "token":
params["agent_name"] = AGENT_NAME params["agent_name"] = agent_name
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client: async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
response = await client.get(TOKEN_URL, params=params) response = await client.get(TOKEN_URL, params=params)
@ -111,6 +156,100 @@ class ESP32LiveKitBridge:
if formatted_tb.strip(): if formatted_tb.strip():
print(formatted_tb.rstrip()) print(formatted_tb.rstrip())
def _audio_duration_ms(self, sample_count: int, sample_rate: int) -> float:
if sample_rate <= 0:
return 0.0
return sample_count * 1000.0 / sample_rate
def _build_server_hello(self, session: DeviceSession) -> dict[str, Any]:
return {
"type": "hello",
"transport": "websocket",
"session": {
"room": session.room_name,
"identity": session.identity,
},
"audio_params": {
"format": "opus",
"sample_rate": OUTPUT_SAMPLE_RATE,
"channels": 1,
"frame_duration": OUTPUT_FRAME_DURATION_MS,
},
}
def _log_client_hello(self, session: DeviceSession, message: dict[str, Any]) -> None:
audio_params = message.get("audio_params")
if not isinstance(audio_params, dict):
return
sample_rate = audio_params.get("sample_rate")
frame_duration = audio_params.get("frame_duration")
channels = audio_params.get("channels")
fmt = audio_params.get("format")
print(
"[client-audio] "
f"device={session.device_id} format={fmt} sample_rate={sample_rate} "
f"channels={channels} frame_duration={frame_duration}"
)
if sample_rate != INPUT_SAMPLE_RATE or channels != 1:
print(
"[client-audio] warning: bridge uplink decode expects "
f"{INPUT_SAMPLE_RATE}Hz mono, got {sample_rate}Hz channels={channels}"
)
if frame_duration != INPUT_FRAME_DURATION_MS:
print(
"[client-audio] warning: bridge expects "
f"{INPUT_FRAME_DURATION_MS}ms uplink frames, got {frame_duration}ms"
)
def _track_room_connect_task(
self,
session: DeviceSession,
task: asyncio.Task[Any],
) -> None:
if task.cancelled():
return
try:
task.result()
except Exception as exc:
self._log_exception(
f"LiveKit 房间连接后台任务失败: room={session.room_name}",
exc,
)
websocket = session.websocket
if websocket is not None:
asyncio.create_task(websocket.close(code=1011, reason="livekit connect failed"))
async def _capture_mic_frame(
self,
session: DeviceSession,
pcm_bytes: bytes,
num_samples: int,
) -> bool:
try:
frame = AudioFrame(pcm_bytes, INPUT_SAMPLE_RATE, 1, num_samples)
except TypeError:
frame = AudioFrame.create(
sample_rate=INPUT_SAMPLE_RATE,
num_channels=1,
samples_per_channel=num_samples,
)
memoryview(frame.data).cast("B")[:] = pcm_bytes
try:
await asyncio.wait_for(
session.mic_source.capture_frame(frame),
timeout=UPLINK_CAPTURE_TIMEOUT_SECONDS,
)
return True
except asyncio.TimeoutError:
print(
"[uplink] warning: capture_frame timeout, dropping frame "
f"device={session.device_id} samples={num_samples}"
)
return False
def _build_device_id(self, websocket: Any) -> str: def _build_device_id(self, websocket: Any) -> str:
headers = websocket.request.headers headers = websocket.request.headers
requested_id = headers.get("X-Device-Id") or headers.get("Device-Id") requested_id = headers.get("X-Device-Id") or headers.get("Device-Id")
@ -125,15 +264,61 @@ class ESP32LiveKitBridge:
print(f"[session] device={device_id} room={room_name} identity={identity}") print(f"[session] device={device_id} room={room_name} identity={identity}")
return room_name, identity return room_name, identity
def _is_agent_participant(self, participant: rtc.RemoteParticipant) -> bool: def _resolve_agent_selection(self, headers: Any) -> tuple[str, str, str, bool]:
requested_chat_mode = (
headers.get("Chat-Mode")
or headers.get("X-Chat-Mode")
or ""
).strip().lower()
chat_mode_to_agent = {
"normal": ("normal", False),
"beaver": ("beaver", False),
"vision-normal": ("normal", True),
"vision-beaver": ("beaver", True),
}
if requested_chat_mode in chat_mode_to_agent:
requested_mode, vision_enabled = chat_mode_to_agent[requested_chat_mode]
else:
if requested_chat_mode:
print(f"未知 Chat-Mode={requested_chat_mode!r},回退到 Agent-Mode")
requested_mode = (
headers.get("Agent-Mode")
or headers.get("X-Agent-Mode")
or DEFAULT_AGENT_MODE
or "normal"
).strip().lower()
vision_enabled = False
requested_chat_mode = requested_mode
requested_name = headers.get("Agent-Name") or headers.get("X-Agent-Name")
if requested_name:
return requested_chat_mode, "custom", requested_name, vision_enabled
if requested_mode not in AGENT_NAMES:
print(f"未知 Agent-Mode={requested_mode!r},回退到 normal")
requested_mode = "normal"
requested_chat_mode = "vision-normal" if vision_enabled else "normal"
if requested_chat_mode in CHAT_MODE_AGENT_NAMES:
return (
requested_chat_mode,
requested_mode,
CHAT_MODE_AGENT_NAMES[requested_chat_mode],
vision_enabled,
)
return requested_chat_mode, requested_mode, AGENT_NAMES[requested_mode], vision_enabled
def _is_agent_participant(self, participant: rtc.RemoteParticipant, agent_name: str) -> bool:
identity = getattr(participant, "identity", "") or "" identity = getattr(participant, "identity", "") or ""
return identity.startswith("agent-") or AGENT_NAME in identity return identity.startswith("agent-") or agent_name in identity
def _get_agent_identities(self, session: DeviceSession) -> list[str]: def _get_agent_identities(self, session: DeviceSession) -> list[str]:
return [ return [
participant.identity participant.identity
for participant in session.room.remote_participants.values() for participant in session.room.remote_participants.values()
if self._is_agent_participant(participant) if self._is_agent_participant(participant, session.agent_name)
] ]
def _log_agent_participants(self, session: DeviceSession, source: str) -> None: def _log_agent_participants(self, session: DeviceSession, source: str) -> None:
@ -175,7 +360,7 @@ class ESP32LiveKitBridge:
dispatch_agent_name = getattr(dispatch, "agent_name", None) dispatch_agent_name = getattr(dispatch, "agent_name", None)
dispatch_room = getattr(dispatch, "room", None) dispatch_room = getattr(dispatch, "room", None)
if dispatch_room == session.room_name and ( if dispatch_room == session.room_name and (
dispatch_agent_name == AGENT_NAME or dispatch_agent_name is None dispatch_agent_name == session.agent_name or dispatch_agent_name is None
): ):
print( print(
f"检测到已有 dispatch: room={session.room_name} " f"检测到已有 dispatch: room={session.room_name} "
@ -198,7 +383,7 @@ class ESP32LiveKitBridge:
return return
for participant in session.room.remote_participants.values(): for participant in session.room.remote_participants.values():
if self._is_agent_participant(participant): if self._is_agent_participant(participant, session.agent_name):
# print(f"Agent 已在房间中,跳过 dispatch: {participant.identity}") # print(f"Agent 已在房间中,跳过 dispatch: {participant.identity}")
return return
@ -210,7 +395,7 @@ class ESP32LiveKitBridge:
await session.agent_dispatch_task await session.agent_dispatch_task
async def _dispatch_agent(self, session: DeviceSession) -> None: async def _dispatch_agent(self, session: DeviceSession) -> None:
print(f"准备 dispatch agent: room={session.room_name}, agent={AGENT_NAME}") print(f"准备 dispatch agent: room={session.room_name}, agent={session.agent_name}")
try: try:
if await self._dispatch_agent_with_sdk(session): if await self._dispatch_agent_with_sdk(session):
@ -240,13 +425,17 @@ class ESP32LiveKitBridge:
try: try:
dispatch = await lkapi.agent_dispatch.create_dispatch( dispatch = await lkapi.agent_dispatch.create_dispatch(
livekit_api.CreateAgentDispatchRequest( livekit_api.CreateAgentDispatchRequest(
agent_name=AGENT_NAME, agent_name=session.agent_name,
room=session.room_name, room=session.room_name,
metadata=json.dumps( metadata=json.dumps(
{ {
"source": "bridge_server", "source": "bridge_server",
"identity": session.identity, "identity": session.identity,
"device_id": session.device_id, "device_id": session.device_id,
"chat_mode": session.chat_mode,
"agent_mode": session.agent_mode,
"agent_name": session.agent_name,
"vision_enabled": session.vision_enabled,
} }
), ),
) )
@ -269,13 +458,17 @@ class ESP32LiveKitBridge:
"--room", "--room",
session.room_name, session.room_name,
"--agent-name", "--agent-name",
AGENT_NAME, session.agent_name,
"--metadata", "--metadata",
json.dumps( json.dumps(
{ {
"source": "bridge_server", "source": "bridge_server",
"identity": session.identity, "identity": session.identity,
"device_id": session.device_id, "device_id": session.device_id,
"chat_mode": session.chat_mode,
"agent_mode": session.agent_mode,
"agent_name": session.agent_name,
"vision_enabled": session.vision_enabled,
} }
), ),
stdout=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE,
@ -294,7 +487,7 @@ class ESP32LiveKitBridge:
print(f"lk dispatch create 失败,退出码: {process.returncode}") print(f"lk dispatch create 失败,退出码: {process.returncode}")
return False return False
print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={AGENT_NAME}") print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={session.agent_name}")
return True return True
async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool: async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool:
@ -364,7 +557,7 @@ class ESP32LiveKitBridge:
print("收到 vision frame但 image 字段为空") print("收到 vision frame但 image 字段为空")
return return
saved_path = self._save_vision_frame(session, image) saved_path = await asyncio.to_thread(self._save_vision_frame, session, image)
if saved_path is None: if saved_path is None:
return return
print(f"已保存 vision frame: {saved_path}") print(f"已保存 vision frame: {saved_path}")
@ -415,9 +608,40 @@ class ESP32LiveKitBridge:
await session.websocket.send(json.dumps({"type": "tts", "state": state})) await session.websocket.send(json.dumps({"type": "tts", "state": state}))
print(f"已发送 tts {state}: device={session.device_id}") print(f"已发送 tts {state}: device={session.device_id}")
async def _send_emotion(self, session: DeviceSession, emotion: str) -> None:
if session.websocket is None:
print(f"跳过 emotion {emotion}ESP32 尚未连接")
return
await session.websocket.send(json.dumps({"type": "llm", "emotion": emotion}))
print(f"已发送 emotion: device={session.device_id} emotion={emotion}")
def _parse_emotion_text(self, text: str) -> tuple[Optional[str], str]:
match = EMOTION_TEXT_PATTERN.match(text)
if match is None:
return None, text.strip()
emotion, tts_text = match.groups()
return emotion.strip(), tts_text.strip()
async def _run_emotion_test_sequence(self, session: DeviceSession) -> None:
if not EMOTION_TEST_SEQUENCE:
return
for index, emotion in enumerate(EMOTION_TEST_SEQUENCE):
if session.websocket is None or session.closed:
return
if index > 0:
await asyncio.sleep(EMOTION_TEST_INTERVAL_SECONDS)
await self._send_emotion(session, emotion)
async def _send_tts_text(self, session: DeviceSession, text: str, final: bool) -> None: async def _send_tts_text(self, session: DeviceSession, text: str, final: bool) -> None:
if session.websocket is None: if session.websocket is None:
return return
raw_text = text
_emotion, text = self._parse_emotion_text(text)
if not text:
print(f"[tts->esp32] skip empty text: raw={raw_text!r} final={final}")
return
print(f"[tts->esp32] text={text!r} final={final}")
await session.websocket.send( await session.websocket.send(
json.dumps( json.dumps(
{ {
@ -487,26 +711,99 @@ class ESP32LiveKitBridge:
if session.tts_active: if session.tts_active:
print("跳过 tts start当前已处于激活状态") print("跳过 tts start当前已处于激活状态")
return return
block_reason = self._tts_resume_block_reason(session, include_user_quiet=False)
if block_reason is not None:
print(f"跳过 tts start打断后仍在等待稳定聆听: {block_reason}")
return
if time.monotonic() < session.tts_suppressed_until: if time.monotonic() < session.tts_suppressed_until:
print("跳过 tts start中断后的残留音频仍在抑制窗口内") print("跳过 tts start中断后的残留音频仍在抑制窗口内")
return return
if not session.tts_display_text: if not session.tts_display_text:
session.tts_transcript_text = "" session.tts_transcript_text = ""
session.tts_display_final = False session.tts_display_final = False
session.tts_emotion = ""
self._cancel_tts_display_task(session) self._cancel_tts_display_task(session)
now = time.monotonic()
session.tts_started_at = now
session.tts_last_audible_at = now
await self._send_tts_state(session, "start") await self._send_tts_state(session, "start")
session.tts_active = True session.tts_active = True
session.tts_thinking = False
async def _start_thinking(self, session: DeviceSession) -> None:
if session.tts_active:
print("跳过 tts thinking当前已处于 TTS 播放状态")
return
if session.tts_thinking:
print("跳过 tts thinking当前已处于思考状态")
return
block_reason = self._tts_resume_block_reason(session, include_user_quiet=False)
if block_reason is not None:
print(f"跳过 tts thinking打断后仍在等待稳定聆听: {block_reason}")
return
if time.monotonic() < session.tts_suppressed_until:
print("跳过 tts thinking中断后的残留音频仍在抑制窗口内")
return
await self._send_tts_state(session, "thinking")
session.tts_thinking = True
def _tts_resume_block_reason(
self,
session: DeviceSession,
now: Optional[float] = None,
*,
include_user_quiet: bool = True,
) -> Optional[str]:
if now is None:
now = time.monotonic()
if session.tts_waiting_for_user_audio_after_interrupt:
return "waiting_for_user_audio_after_interrupt"
if session.last_interrupt_time <= 0.0:
return None
since_interrupt = now - session.last_interrupt_time
if since_interrupt > TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS:
return None
if session.last_uplink_audible_time < session.last_interrupt_time:
return None
if not include_user_quiet:
return None
quiet_for = now - session.last_uplink_audible_time
if quiet_for < TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS:
return f"user_audio_quiet_for={quiet_for:.2f}s"
return None
def _handle_agent_state(self, session: DeviceSession, participant: rtc.Participant) -> None:
state = participant.attributes.get(AGENT_STATE_ATTRIBUTE)
if not isinstance(state, str) or not state:
return
print(
f"[agent-state] room={session.room_name} identity={participant.identity} state={state}"
)
if state == "thinking":
asyncio.create_task(self._start_thinking(session))
async def _stop_tts(self, session: DeviceSession) -> None: async def _stop_tts(self, session: DeviceSession) -> None:
if not session.tts_active: if not session.tts_active and not session.tts_thinking:
print("跳过 tts stop当前未激活") print("跳过 tts stop当前未激活")
return return
self._cancel_tts_display_task(session) self._cancel_tts_display_task(session)
await self._send_tts_state(session, "stop") await self._send_tts_state(session, "stop")
session.tts_active = False session.tts_active = False
session.tts_thinking = False
session.tts_started_at = 0.0
session.tts_last_audible_at = 0.0
session.tts_transcript_text = "" session.tts_transcript_text = ""
session.tts_display_text = "" session.tts_display_text = ""
session.tts_display_final = False session.tts_display_final = False
session.tts_emotion = ""
async def _force_stop_tts(self, session: DeviceSession, reason: str) -> None: async def _force_stop_tts(self, session: DeviceSession, reason: str) -> None:
self._cancel_tts_display_task(session) self._cancel_tts_display_task(session)
@ -514,20 +811,28 @@ class ESP32LiveKitBridge:
session.tts_idle_task.cancel() session.tts_idle_task.cancel()
session.tts_idle_task = None session.tts_idle_task = None
session.tts_active = False session.tts_active = False
session.tts_thinking = False
session.tts_started_at = 0.0
session.tts_last_audible_at = 0.0
session.tts_transcript_text = "" session.tts_transcript_text = ""
session.tts_display_text = "" session.tts_display_text = ""
session.tts_display_final = False session.tts_display_final = False
session.tts_emotion = ""
await self._send_tts_state(session, "stop") await self._send_tts_state(session, "stop")
print(f"已强制停止本地 TTS: device={session.device_id} reason={reason}") print(f"已强制停止本地 TTS: device={session.device_id} reason={reason}")
async def _abort_tts(self, session: DeviceSession, reason: str = "client_abort") -> None: async def _abort_tts(self, session: DeviceSession, reason: str = "client_abort") -> None:
print(f"收到打断请求,停止当前 TTS: device={session.device_id} reason={reason}") print(f"收到打断请求,停止当前 TTS: device={session.device_id} reason={reason}")
now = time.monotonic()
session.tts_stream_id += 1 session.tts_stream_id += 1
session.tts_suppressed_until = time.monotonic() + TTS_INTERRUPT_SUPPRESS_SECONDS session.last_interrupt_time = now
session.tts_suppressed_until = now + TTS_INTERRUPT_SUPPRESS_SECONDS
session.tts_waiting_for_user_audio_after_interrupt = True
await self._force_stop_tts(session, reason) await self._force_stop_tts(session, reason)
asyncio.create_task(self._send_agent_interrupt(session, reason)) asyncio.create_task(self._send_agent_interrupt(session, reason))
def _reset_tts_idle_timer(self, session: DeviceSession) -> None: def _reset_tts_idle_timer(self, session: DeviceSession) -> None:
session.tts_last_audible_at = time.monotonic()
if session.tts_idle_task is not None: if session.tts_idle_task is not None:
session.tts_idle_task.cancel() session.tts_idle_task.cancel()
session.tts_idle_task = asyncio.create_task( session.tts_idle_task = asyncio.create_task(
@ -536,11 +841,28 @@ class ESP32LiveKitBridge:
async def _tts_idle_watchdog(self, session: DeviceSession, stream_id: int) -> None: async def _tts_idle_watchdog(self, session: DeviceSession, stream_id: int) -> None:
try: try:
while True:
await asyncio.sleep(TTS_IDLE_TIMEOUT_SECONDS) await asyncio.sleep(TTS_IDLE_TIMEOUT_SECONDS)
if stream_id != session.tts_stream_id: if stream_id != session.tts_stream_id or not session.tts_active:
return return
print(f"TTS 空闲超过 {TTS_IDLE_TIMEOUT_SECONDS}s切回聆听状态")
now = time.monotonic()
idle_for = now - session.tts_last_audible_at
active_for = now - session.tts_started_at
remaining = max(
TTS_IDLE_TIMEOUT_SECONDS - idle_for,
TTS_MIN_ACTIVE_SECONDS - active_for,
)
if remaining > 0:
await asyncio.sleep(remaining)
continue
print(
"TTS 静音达到阈值,切回聆听状态: "
f"idle={idle_for:.2f}s active={active_for:.2f}s"
)
await self._stop_tts(session) await self._stop_tts(session)
return
except asyncio.CancelledError: except asyncio.CancelledError:
pass pass
@ -681,20 +1003,22 @@ class ESP32LiveKitBridge:
print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}") print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}")
self._log_agent_participants(session, "connected") self._log_agent_participants(session, "connected")
for participant in session.room.remote_participants.values(): for participant in session.room.remote_participants.values():
if self._is_agent_participant(participant): if self._is_agent_participant(participant, session.agent_name):
session.agent_ready.set() session.agent_ready.set()
self._scan_participant_audio_tracks(session, participant, "connected_scan") self._scan_participant_audio_tracks(session, participant, "connected_scan")
self._handle_agent_state(session, participant)
@session.room.on("participant_connected") @session.room.on("participant_connected")
def on_participant_connected(participant: rtc.RemoteParticipant) -> None: def on_participant_connected(participant: rtc.RemoteParticipant) -> None:
role = "Agent" if self._is_agent_participant(participant) else "Remote participant" role = "Agent" if self._is_agent_participant(participant, session.agent_name) else "Remote participant"
print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}") print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}")
self._log_agent_participants(session, "participant_connected") self._log_agent_participants(session, "participant_connected")
if self._is_agent_participant(participant): if self._is_agent_participant(participant, session.agent_name):
session.agent_ready.set() session.agent_ready.set()
self._scan_participant_audio_tracks( self._scan_participant_audio_tracks(
session, participant, "participant_connected_scan" session, participant, "participant_connected_scan"
) )
self._handle_agent_state(session, participant)
@session.room.on("participant_disconnected") @session.room.on("participant_disconnected")
def on_participant_disconnected(participant: rtc.RemoteParticipant) -> None: def on_participant_disconnected(participant: rtc.RemoteParticipant) -> None:
@ -705,6 +1029,16 @@ class ESP32LiveKitBridge:
if not track_sid.endswith(f":{participant.identity}") if not track_sid.endswith(f":{participant.identity}")
} }
@session.room.on("participant_attributes_changed")
def on_participant_attributes_changed(changed: list[str], participant: rtc.Participant) -> None:
if AGENT_STATE_ATTRIBUTE not in changed:
return
if not isinstance(participant, rtc.RemoteParticipant):
return
if not self._is_agent_participant(participant, session.agent_name):
return
self._handle_agent_state(session, participant)
@session.room.on("data_received") @session.room.on("data_received")
def on_data_received(data_packet: rtc.DataPacket) -> None: def on_data_received(data_packet: rtc.DataPacket) -> None:
identity = data_packet.participant.identity if data_packet.participant else "未知" identity = data_packet.participant.identity if data_packet.participant else "未知"
@ -723,14 +1057,26 @@ class ESP32LiveKitBridge:
track_pub: rtc.TrackPublication, track_pub: rtc.TrackPublication,
) -> None: ) -> None:
identity = participant.identity if participant else "未知" identity = participant.identity if participant else "未知"
is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(participant) is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(
participant, session.agent_name
)
for segment in segments: for segment in segments:
status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果" status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果"
print(f"🗣️ [{status} | room={session.room_name} | {identity}]: {segment.text}") print(f"🗣️ [{status} | room={session.room_name} | {identity}]: {segment.text}")
if is_agent: if is_agent:
if time.monotonic() < session.tts_suppressed_until: if time.monotonic() < session.tts_suppressed_until:
continue continue
display_text = self._current_tts_display_text(segment.text) print(f"[livekit-llm] raw={segment.text!r} final={segment.final}")
emotion, tts_text = self._parse_emotion_text(segment.text)
print(
f"[livekit-llm] parsed emotion={emotion!r} "
f"tts_text={tts_text!r} final={segment.final}"
)
if emotion and emotion != session.tts_emotion:
session.tts_emotion = emotion
asyncio.create_task(self._send_emotion(session, emotion))
display_text = self._current_tts_display_text(tts_text)
print(f"[livekit-llm] display_text={display_text!r} final={segment.final}")
if not display_text or display_text == session.tts_transcript_text: if not display_text or display_text == session.tts_transcript_text:
continue continue
session.tts_transcript_text = display_text session.tts_transcript_text = display_text
@ -738,6 +1084,7 @@ class ESP32LiveKitBridge:
if not segment.final: if not segment.final:
continue continue
display_text = segment.text display_text = segment.text
asyncio.create_task(self._start_thinking(session))
if session.websocket is not None: if session.websocket is not None:
ws = session.websocket ws = session.websocket
@ -785,7 +1132,7 @@ class ESP32LiveKitBridge:
# print(f"[config] token_url={TOKEN_URL}") # print(f"[config] token_url={TOKEN_URL}")
# print(f"[config] room={session.room_name} identity={session.identity}") # print(f"[config] room={session.room_name} identity={session.identity}")
# print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}") # print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}")
token = await fetch_token(session.room_name, session.identity) token = await fetch_token(session.room_name, session.identity, session.agent_name)
try: try:
await session.room.connect( await session.room.connect(
@ -830,15 +1177,45 @@ class ESP32LiveKitBridge:
# print(f"等待 agent 加入: room={session.room_name}") # print(f"等待 agent 加入: room={session.room_name}")
try: try:
await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS) await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS)
if session.closed:
return
# print(f"✅ agent 已就绪: room={session.room_name}") # print(f"✅ agent 已就绪: room={session.room_name}")
except asyncio.TimeoutError: except asyncio.TimeoutError:
if session.closed:
return
print(f"⚠️ agent 等待超时: room={session.room_name}") print(f"⚠️ agent 等待超时: room={session.room_name}")
async def start(self) -> None: async def start(self) -> None:
print(f"[config] websocket_port={WS_PORT}") print(f"[config] websocket_port={WS_PORT}")
print(f"[config] websocket_max_queue={WS_MAX_QUEUE} websocket_max_size={WS_MAX_SIZE}")
print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}") print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}")
print(f"[config] token_url={TOKEN_URL}") print(f"[config] token_url={TOKEN_URL}")
print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}") print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}")
print(
"[config] audio="
f"uplink_decode:{INPUT_SAMPLE_RATE}Hz/{INPUT_FRAME_DURATION_MS}ms "
f"downlink_encode:{OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms "
f"stats_interval:{AUDIO_STATS_INTERVAL_SECONDS}s "
f"capture_timeout:{UPLINK_CAPTURE_TIMEOUT_SECONDS}s "
f"tts_idle:{TTS_IDLE_TIMEOUT_SECONDS}s "
f"tts_min_active:{TTS_MIN_ACTIVE_SECONDS}s "
f"tts_start_frames:{TTS_START_CONSECUTIVE_AUDIBLE_FRAMES} "
f"tts_pre_roll:{TTS_PRE_ROLL_MS}ms"
)
print(
"[config] agents="
f"normal:{CHAT_MODE_AGENT_NAMES['normal']} "
f"beaver:{CHAT_MODE_AGENT_NAMES['beaver']} "
f"vision-normal:{CHAT_MODE_AGENT_NAMES['vision-normal']} "
f"vision-beaver:{CHAT_MODE_AGENT_NAMES['vision-beaver']} "
f"default_mode:{DEFAULT_AGENT_MODE}"
)
if EMOTION_TEST_SEQUENCE:
print(
"[config] emotion_test_sequence="
f"{','.join(EMOTION_TEST_SEQUENCE)} "
f"interval={EMOTION_TEST_INTERVAL_SECONDS}s"
)
async def close(self) -> None: async def close(self) -> None:
for session in list(self.device_sessions.values()): for session in list(self.device_sessions.values()):
@ -849,7 +1226,9 @@ class ESP32LiveKitBridge:
return return
session.closed = True session.closed = True
session.websocket = None session.websocket = None
session.agent_ready.set()
session.tts_active = False session.tts_active = False
session.tts_thinking = False
session.tts_stream_id += 1 session.tts_stream_id += 1
if session.tts_idle_task is not None: if session.tts_idle_task is not None:
session.tts_idle_task.cancel() session.tts_idle_task.cancel()
@ -870,13 +1249,20 @@ class ESP32LiveKitBridge:
pending_pcm = bytearray() pending_pcm = bytearray()
pre_roll_pcm = bytearray() pre_roll_pcm = bytearray()
pre_roll_max_bytes = OUTPUT_SAMPLE_RATE * TTS_PRE_ROLL_MS // 1000 * 2 pre_roll_max_bytes = OUTPUT_SAMPLE_RATE * TTS_PRE_ROLL_MS // 1000 * 2
output_samples_per_opus_frame = OUTPUT_SAMPLE_RATE * OUTPUT_FRAME_DURATION_MS // 1000
output_frame_bytes = output_samples_per_opus_frame * 2
audible_frame_streak = 0 audible_frame_streak = 0
silence_frame_streak = 0 silence_frame_streak = 0
waiting_for_post_interrupt_silence = False waiting_for_post_interrupt_silence = False
downlink_packets = 0
downlink_audio_ms = 0.0
last_downlink_stats_time = time.monotonic()
last_send_time: Optional[float] = None
stream_id = session.tts_stream_id stream_id = session.tts_stream_id
print( print(
f"启动 TTS 转发: device={session.device_id} room={session.room_name} " f"启动 TTS 转发: device={session.device_id} room={session.room_name} "
f"track_sid={track_sid} stream_id={stream_id} " f"track_sid={track_sid} stream_id={stream_id} "
f"opus={OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms"
) )
try: try:
@ -897,7 +1283,8 @@ class ESP32LiveKitBridge:
pcm_data = frame.data.tobytes() pcm_data = frame.data.tobytes()
has_audible_audio = self._has_audible_audio(pcm_data) has_audible_audio = self._has_audible_audio(pcm_data)
if time.monotonic() < session.tts_suppressed_until: now = time.monotonic()
if now < session.tts_suppressed_until:
pending_pcm.clear() pending_pcm.clear()
pre_roll_pcm.clear() pre_roll_pcm.clear()
audible_frame_streak = 0 audible_frame_streak = 0
@ -905,6 +1292,31 @@ class ESP32LiveKitBridge:
waiting_for_post_interrupt_silence = True waiting_for_post_interrupt_silence = True
continue continue
block_reason = self._tts_resume_block_reason(
session,
now,
include_user_quiet=False,
)
if block_reason is not None:
pending_pcm.clear()
pre_roll_pcm.clear()
audible_frame_streak = 0
silence_frame_streak = 0
if block_reason == "waiting_for_user_audio_after_interrupt":
waiting_for_post_interrupt_silence = True
continue
if (
waiting_for_post_interrupt_silence
and session.last_interrupt_time > 0.0
and session.last_uplink_audible_time >= session.last_interrupt_time
and now - session.last_uplink_audible_time
>= TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS
):
print("检测到用户打断后语音已结束,允许新 TTS 直接起播")
waiting_for_post_interrupt_silence = False
silence_frame_streak = 0
if waiting_for_post_interrupt_silence: if waiting_for_post_interrupt_silence:
if has_audible_audio: if has_audible_audio:
silence_frame_streak = 0 silence_frame_streak = 0
@ -952,20 +1364,42 @@ class ESP32LiveKitBridge:
if not current_frame_buffered: if not current_frame_buffered:
pending_pcm.extend(pcm_data) pending_pcm.extend(pcm_data)
frame_bytes = OUTPUT_SAMPLES_PER_OPUS_FRAME * 2
while ( while (
len(pending_pcm) >= frame_bytes len(pending_pcm) >= output_frame_bytes
and stream_id == session.tts_stream_id and stream_id == session.tts_stream_id
and session.websocket is not None and session.websocket is not None
): ):
try: try:
opus_packet = encoder.encode( now = time.monotonic()
bytes(pending_pcm[:frame_bytes]), if last_send_time is not None:
OUTPUT_SAMPLES_PER_OPUS_FRAME, send_gap_ms = (now - last_send_time) * 1000.0
if send_gap_ms > DOWNLINK_SEND_GAP_WARN_MS:
print(
"[downlink] warning: send gap "
f"{send_gap_ms:.1f}ms device={session.device_id} "
f"pending_ms={self._audio_duration_ms(len(pending_pcm) // 2, OUTPUT_SAMPLE_RATE):.1f}"
) )
del pending_pcm[:frame_bytes] last_send_time = now
opus_packet = encoder.encode(
bytes(pending_pcm[:output_frame_bytes]),
output_samples_per_opus_frame,
)
del pending_pcm[:output_frame_bytes]
await session.websocket.send(self._wrap_opus_payload(session, opus_packet)) await session.websocket.send(self._wrap_opus_payload(session, opus_packet))
downlink_packets += 1
downlink_audio_ms += OUTPUT_FRAME_DURATION_MS
if now - last_downlink_stats_time >= AUDIO_STATS_INTERVAL_SECONDS:
print(
"[downlink] "
f"device={session.device_id} packets={downlink_packets} "
f"audio_ms={downlink_audio_ms:.0f} "
f"pending_ms={self._audio_duration_ms(len(pending_pcm) // 2, OUTPUT_SAMPLE_RATE):.1f}"
)
downlink_packets = 0
downlink_audio_ms = 0.0
last_downlink_stats_time = now
except Exception as exc: except Exception as exc:
print(f"发送回 ESP32 失败: {exc}") print(f"发送回 ESP32 失败: {exc}")
break break
@ -998,6 +1432,7 @@ class ESP32LiveKitBridge:
await self._close_session(existing_session) await self._close_session(existing_session)
self.device_sessions.pop(device_id, None) self.device_sessions.pop(device_id, None)
chat_mode, agent_mode, agent_name, vision_enabled = self._resolve_agent_selection(websocket.request.headers)
room_name, identity = self._build_session_names(device_id) room_name, identity = self._build_session_names(device_id)
session = DeviceSession( session = DeviceSession(
device_id=device_id, device_id=device_id,
@ -1005,6 +1440,10 @@ class ESP32LiveKitBridge:
protocol_version=protocol_version, protocol_version=protocol_version,
room_name=room_name, room_name=room_name,
identity=identity, identity=identity,
chat_mode=chat_mode,
agent_mode=agent_mode,
agent_name=agent_name,
vision_enabled=vision_enabled,
room=rtc.Room(), room=rtc.Room(),
mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1), mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1),
agent_ready=asyncio.Event(), agent_ready=asyncio.Event(),
@ -1013,28 +1452,33 @@ class ESP32LiveKitBridge:
print(f"ESP32 已连接: device={device_id}") print(f"ESP32 已连接: device={device_id}")
print(f"ESP32 协议版本: {session.protocol_version}") print(f"ESP32 协议版本: {session.protocol_version}")
print(
f"ESP32 mode: chat={session.chat_mode} "
f"agent={session.agent_mode}/{session.agent_name} "
f"vision={session.vision_enabled}"
)
session.tts_stream_id += 1 session.tts_stream_id += 1
opus_decoder = None opus_decoder = None
uplink_packets = 0
uplink_audio_ms = 0.0
uplink_decode_errors = 0
uplink_dropped_frames = 0
last_uplink_stats_time = time.monotonic()
room_connect_task: Optional[asyncio.Task[Any]] = None
try: try:
hello_msg = { hello_msg = self._build_server_hello(session)
"type": "hello",
"transport": "websocket",
"session": {
"room": session.room_name,
"identity": session.identity,
},
"audio_params": {
"format": "opus",
"sample_rate": OUTPUT_SAMPLE_RATE,
"channels": 1,
"frame_duration": OUTPUT_FRAME_DURATION_MS,
},
}
await websocket.send(json.dumps(hello_msg)) await websocket.send(json.dumps(hello_msg))
print(f"已发送 server hello: device={device_id} room={session.room_name}") print(
f"已发送 server hello: device={device_id} room={session.room_name} "
f"audio={OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms"
)
asyncio.create_task(self._run_emotion_test_sequence(session))
await self._connect_session_room(session) room_connect_task = asyncio.create_task(self._connect_session_room(session))
room_connect_task.add_done_callback(
lambda task: self._track_room_connect_task(session, task)
)
async for message in websocket: async for message in websocket:
if isinstance(message, bytes): if isinstance(message, bytes):
@ -1057,6 +1501,16 @@ class ESP32LiveKitBridge:
if num_samples > 0: if num_samples > 0:
session.captured_frame_count += 1 session.captured_frame_count += 1
now = time.monotonic() now = time.monotonic()
uplink_packets += 1
uplink_audio_ms += self._audio_duration_ms(num_samples, INPUT_SAMPLE_RATE)
if self._has_audible_audio(pcm_bytes):
session.last_uplink_audible_time = now
if session.tts_waiting_for_user_audio_after_interrupt:
session.tts_waiting_for_user_audio_after_interrupt = False
print(
f"[uplink] detected user audio after interrupt: "
f"device={session.device_id}"
)
if ( if (
session.captured_frame_count <= 5 session.captured_frame_count <= 5
or now - session.first_capture_log_time >= 5.0 or now - session.first_capture_log_time >= 5.0
@ -1067,25 +1521,32 @@ class ESP32LiveKitBridge:
# f"bytes={len(pcm_bytes)} samples={num_samples} " # f"bytes={len(pcm_bytes)} samples={num_samples} "
# f"room={session.room_name}" # f"room={session.room_name}"
# ) # )
try: if now - last_uplink_stats_time >= AUDIO_STATS_INTERVAL_SECONDS:
frame = AudioFrame(pcm_bytes, INPUT_SAMPLE_RATE, 1, num_samples) print(
await session.mic_source.capture_frame(frame) "[uplink] "
except TypeError: f"device={session.device_id} packets={uplink_packets} "
frame = AudioFrame.create( f"audio_ms={uplink_audio_ms:.0f} "
sample_rate=INPUT_SAMPLE_RATE, f"decode_errors={uplink_decode_errors} "
num_channels=1, f"dropped_frames={uplink_dropped_frames}"
samples_per_channel=num_samples,
) )
memoryview(frame.data).cast("B")[:] = pcm_bytes uplink_packets = 0
await session.mic_source.capture_frame(frame) uplink_audio_ms = 0.0
uplink_decode_errors = 0
uplink_dropped_frames = 0
last_uplink_stats_time = now
if not await self._capture_mic_frame(session, pcm_bytes, num_samples):
uplink_dropped_frames += 1
except Exception as exc: except Exception as exc:
uplink_decode_errors += 1
print(f"Opus audio decode error ({len(message)} bytes): {exc}") print(f"Opus audio decode error ({len(message)} bytes): {exc}")
elif isinstance(message, str): elif isinstance(message, str):
try: try:
data = json.loads(message) data = json.loads(message)
# print(f"收到 ESP32 JSON 消息: {data}") # print(f"收到 ESP32 JSON 消息: {data}")
msg_type = data.get("type") msg_type = data.get("type")
if msg_type == "abort": if msg_type == "hello":
self._log_client_hello(session, data)
elif msg_type == "abort":
reason = data.get("reason") reason = data.get("reason")
abort_reason = reason if isinstance(reason, str) and reason else "button_abort" abort_reason = reason if isinstance(reason, str) and reason else "button_abort"
print(f"处理 ESP32 打断请求: reason={abort_reason}") print(f"处理 ESP32 打断请求: reason={abort_reason}")
@ -1100,6 +1561,10 @@ class ESP32LiveKitBridge:
self._log_exception("WebSocket 其他错误", exc) self._log_exception("WebSocket 其他错误", exc)
finally: finally:
print(f"ESP32 断开连接: device={device_id} room={session.room_name}") print(f"ESP32 断开连接: device={device_id} room={session.room_name}")
if room_connect_task is not None and not room_connect_task.done():
room_connect_task.cancel()
with contextlib.suppress(asyncio.CancelledError):
await room_connect_task
await self._close_session(session) await self._close_session(session)
self.device_sessions.pop(device_id, None) self.device_sessions.pop(device_id, None)
@ -1108,7 +1573,13 @@ async def main() -> None:
bridge = ESP32LiveKitBridge() bridge = ESP32LiveKitBridge()
try: try:
await bridge.start() await bridge.start()
async with websockets.serve(bridge.handle_websocket, "0.0.0.0", WS_PORT): async with websockets.serve(
bridge.handle_websocket,
"0.0.0.0",
WS_PORT,
max_queue=WS_MAX_QUEUE,
max_size=WS_MAX_SIZE,
):
print(f"WebSocket 服务器运行在端口 {WS_PORT},等待 ESP32 连接...") print(f"WebSocket 服务器运行在端口 {WS_PORT},等待 ESP32 连接...")
await asyncio.Future() await asyncio.Future()
finally: finally:

View File

@ -8,6 +8,7 @@ enum DeviceState {
kDeviceStateIdle, kDeviceStateIdle,
kDeviceStateConnecting, kDeviceStateConnecting,
kDeviceStateListening, kDeviceStateListening,
kDeviceStateThinking,
kDeviceStateSpeaking, kDeviceStateSpeaking,
kDeviceStateUpgrading, kDeviceStateUpgrading,
kDeviceStateActivating, kDeviceStateActivating,

View File

@ -13,6 +13,7 @@ static const char* const STATE_STRINGS[] = {
"idle", "idle",
"connecting", "connecting",
"listening", "listening",
"thinking",
"speaking", "speaking",
"upgrading", "upgrading",
"activating", "activating",
@ -69,9 +70,10 @@ bool DeviceStateMachine::IsValidTransition(DeviceState from, DeviceState to) con
to == kDeviceStateActivating; to == kDeviceStateActivating;
case kDeviceStateIdle: case kDeviceStateIdle:
// Can go to connecting, listening (manual mode), speaking, activating, upgrading, or wifi configuring // Can go to connecting, listening (manual mode), thinking, speaking, activating, upgrading, or wifi configuring
return to == kDeviceStateConnecting || return to == kDeviceStateConnecting ||
to == kDeviceStateListening || to == kDeviceStateListening ||
to == kDeviceStateThinking ||
to == kDeviceStateSpeaking || to == kDeviceStateSpeaking ||
to == kDeviceStateActivating || to == kDeviceStateActivating ||
to == kDeviceStateUpgrading || to == kDeviceStateUpgrading ||
@ -83,8 +85,15 @@ bool DeviceStateMachine::IsValidTransition(DeviceState from, DeviceState to) con
to == kDeviceStateListening; to == kDeviceStateListening;
case kDeviceStateListening: case kDeviceStateListening:
// Can go to speaking or idle // Can go to thinking, speaking, or idle
return to == kDeviceStateThinking ||
to == kDeviceStateSpeaking ||
to == kDeviceStateIdle;
case kDeviceStateThinking:
// Can go to speaking, listening, or idle
return to == kDeviceStateSpeaking || return to == kDeviceStateSpeaking ||
to == kDeviceStateListening ||
to == kDeviceStateIdle; to == kDeviceStateIdle;
case kDeviceStateSpeaking: case kDeviceStateSpeaking:

View File

@ -167,6 +167,8 @@ void EmoteDisplay::SetStatus(const char* const status)
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_LISTEN, NULL); emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_LISTEN, NULL);
} else if (std::strcmp(status, Lang::Strings::STANDBY) == 0) { } else if (std::strcmp(status, Lang::Strings::STANDBY) == 0) {
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_IDLE, NULL); emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_IDLE, NULL);
} else if (std::strcmp(status, Lang::Strings::THINKING) == 0) {
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_LISTEN, NULL);
} else if (std::strcmp(status, Lang::Strings::SPEAKING) == 0) { } else if (std::strcmp(status, Lang::Strings::SPEAKING) == 0) {
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_SPEAK, NULL); emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_SPEAK, NULL);
} else if (std::strcmp(status, Lang::Strings::ERROR) == 0) { } else if (std::strcmp(status, Lang::Strings::ERROR) == 0) {

View File

@ -203,6 +203,7 @@ void LvglDisplay::UpdateStatusBar(bool update_all) {
kDeviceStateStarting, kDeviceStateStarting,
kDeviceStateWifiConfiguring, kDeviceStateWifiConfiguring,
kDeviceStateListening, kDeviceStateListening,
kDeviceStateThinking,
kDeviceStateActivating, kDeviceStateActivating,
}; };
if (std::find(allowed_states.begin(), allowed_states.end(), device_state) != allowed_states.end()) { if (std::find(allowed_states.begin(), allowed_states.end(), device_state) != allowed_states.end()) {

View File

@ -228,6 +228,11 @@ void CircularStrip::OnStateChanged() {
SetAllColor(color); SetAllColor(color);
break; break;
} }
case kDeviceStateThinking: {
StripColor color = { low_brightness_, low_brightness_, default_brightness_ };
Blink(color, 300);
break;
}
case kDeviceStateUpgrading: { case kDeviceStateUpgrading: {
StripColor color = { low_brightness_, default_brightness_, low_brightness_ }; StripColor color = { low_brightness_, default_brightness_, low_brightness_ };
Blink(color, 100); Blink(color, 100);

View File

@ -235,6 +235,10 @@ void GpioLed::OnStateChanged() {
// TurnOn(); // TurnOn();
StartFadeTask(); StartFadeTask();
break; break;
case kDeviceStateThinking:
SetBrightness(DEFAULT_BRIGHTNESS);
StartContinuousBlink(300);
break;
case kDeviceStateSpeaking: case kDeviceStateSpeaking:
SetBrightness(SPEAKING_BRIGHTNESS); SetBrightness(SPEAKING_BRIGHTNESS);
TurnOn(); TurnOn();

View File

@ -152,6 +152,10 @@ void SingleLed::OnStateChanged() {
SetColor(0, DEFAULT_BRIGHTNESS, 0); SetColor(0, DEFAULT_BRIGHTNESS, 0);
TurnOn(); TurnOn();
break; break;
case kDeviceStateThinking:
SetColor(0, 0, DEFAULT_BRIGHTNESS);
StartContinuousBlink(300);
break;
case kDeviceStateUpgrading: case kDeviceStateUpgrading:
SetColor(0, DEFAULT_BRIGHTNESS, 0); SetColor(0, DEFAULT_BRIGHTNESS, 0);
StartContinuousBlink(100); StartContinuousBlink(100);

View File

@ -119,6 +119,8 @@ bool WebsocketProtocol::OpenAudioChannel() {
websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str()); websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str());
websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str()); websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str()); websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
websocket_->SetHeader("Agent-Mode", Application::GetInstance().GetChatAgentModeName());
websocket_->SetHeader("Chat-Mode", Application::GetInstance().GetChatModeName());
websocket_->OnData([this](const char* data, size_t len, bool binary) { websocket_->OnData([this](const char* data, size_t len, bool binary) {
if (binary) { if (binary) {