From 2c4329fd84a7c12df34346380bc2253a8fc39843 Mon Sep 17 00:00:00 2001 From: 0Xiao0 <511201264@qq.com> Date: Fri, 12 Jun 2026 11:38:47 +0800 Subject: [PATCH] fix: voice interupt --- .gitignore | 1 + main/application.cc | 348 +++++++-------- main/application.h | 1 + main/assets/locales/en-US/language.json | 3 +- main/assets/locales/zh-CN/language.json | 3 +- main/audio/audio_service.cc | 3 +- main/background_capture_service.cc | 177 ++++++++ main/background_capture_service.h | 32 ++ .../atoms3r_echo_pyramid.cc | 3 + main/boards/common/wifi_board.cc | 5 +- .../electron-bot/electron_emoji_display.cc | 9 +- main/boards/esp-hi/emoji_display.cc | 2 + main/boards/jiuchuan-s3/jiuchuan_dev_board.cc | 6 +- main/boards/otto-robot/otto_emoji_display.cc | 9 +- main/bridge_server.py | 418 +++++++++++++++--- main/device_state.h | 3 +- main/device_state_machine.cc | 13 +- main/display/emote_display.cc | 4 +- main/display/lvgl_display/lvgl_display.cc | 1 + main/led/circular_strip.cc | 5 + main/led/gpio_led.cc | 6 +- main/led/single_led.cc | 4 + 22 files changed, 816 insertions(+), 240 deletions(-) create mode 100644 main/background_capture_service.cc create mode 100644 main/background_capture_service.h diff --git a/.gitignore b/.gitignore index 0efd9de..eec5b95 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ main/mmap_generate_emoji.h *.bin mmap_generate_*.h .clangd +background_frames/ diff --git a/main/application.cc b/main/application.cc index b679152..8d98579 100644 --- a/main/application.cc +++ b/main/application.cc @@ -1,25 +1,24 @@ #include "application.h" +#include "assets.h" +#include "assets/lang_config.h" +#include "audio_codec.h" #include "board.h" #include "display.h" -#include "system_info.h" -#include "audio_codec.h" -#include "mqtt_protocol.h" -#include "websocket_protocol.h" -#include "assets/lang_config.h" #include "mcp_server.h" -#include "assets.h" +#include "mqtt_protocol.h" #include "settings.h" +#include "system_info.h" +#include "websocket_protocol.h" -#include -#include -#include #include +#include #include +#include #include +#include #define TAG "Application" - Application::Application() { event_group_ = xEventGroupCreate(); @@ -33,16 +32,16 @@ Application::Application() { aec_mode_ = kAecOff; #endif - esp_timer_create_args_t clock_timer_args = { - .callback = [](void* arg) { - Application* app = (Application*)arg; - xEventGroupSetBits(app->event_group_, MAIN_EVENT_CLOCK_TICK); - }, - .arg = this, - .dispatch_method = ESP_TIMER_TASK, - .name = "clock_timer", - .skip_unhandled_events = true - }; + esp_timer_create_args_t clock_timer_args = {.callback = + [](void* arg) { + Application* app = (Application*)arg; + xEventGroupSetBits(app->event_group_, + MAIN_EVENT_CLOCK_TICK); + }, + .arg = this, + .dispatch_method = ESP_TIMER_TASK, + .name = "clock_timer", + .skip_unhandled_events = true}; esp_timer_create(&clock_timer_args, &clock_timer_handle_); } @@ -54,9 +53,7 @@ Application::~Application() { vEventGroupDelete(event_group_); } -bool Application::SetDeviceState(DeviceState state) { - return state_machine_.TransitionTo(state); -} +bool Application::SetDeviceState(DeviceState state) { return state_machine_.TransitionTo(state); } void Application::Initialize() { auto& board = Board::GetInstance(); @@ -102,7 +99,7 @@ void Application::Initialize() { // Set network event callback for UI updates and network state handling board.SetNetworkEventCallback([this](NetworkEvent event, const std::string& data) { auto display = Board::GetInstance().GetDisplay(); - + switch (event) { case NetworkEvent::Scanning: display->ShowNotification(Lang::Strings::SCANNING_WIFI, 30000); @@ -142,13 +139,16 @@ void Application::Initialize() { display->SetStatus(Lang::Strings::DETECTING_MODULE); break; case NetworkEvent::ModemErrorNoSim: - Alert(Lang::Strings::ERROR, Lang::Strings::PIN_ERROR, "triangle_exclamation", Lang::Sounds::OGG_ERR_PIN); + Alert(Lang::Strings::ERROR, Lang::Strings::PIN_ERROR, "triangle_exclamation", + Lang::Sounds::OGG_ERR_PIN); break; case NetworkEvent::ModemErrorRegDenied: - Alert(Lang::Strings::ERROR, Lang::Strings::REG_ERROR, "triangle_exclamation", Lang::Sounds::OGG_ERR_REG); + Alert(Lang::Strings::ERROR, Lang::Strings::REG_ERROR, "triangle_exclamation", + Lang::Sounds::OGG_ERR_REG); break; case NetworkEvent::ModemErrorInitFailed: - Alert(Lang::Strings::ERROR, Lang::Strings::MODEM_INIT_ERROR, "triangle_exclamation", Lang::Sounds::OGG_EXCLAMATION); + Alert(Lang::Strings::ERROR, Lang::Strings::MODEM_INIT_ERROR, "triangle_exclamation", + Lang::Sounds::OGG_EXCLAMATION); break; case NetworkEvent::ModemErrorTimeout: display->SetStatus(Lang::Strings::REGISTERING_NETWORK); @@ -167,19 +167,11 @@ void Application::Run() { // Set the priority of the main task to 10 vTaskPrioritySet(nullptr, 10); - const EventBits_t ALL_EVENTS = - MAIN_EVENT_SCHEDULE | - MAIN_EVENT_SEND_AUDIO | - MAIN_EVENT_WAKE_WORD_DETECTED | - MAIN_EVENT_VAD_CHANGE | - MAIN_EVENT_CLOCK_TICK | - MAIN_EVENT_ERROR | - MAIN_EVENT_NETWORK_CONNECTED | - MAIN_EVENT_NETWORK_DISCONNECTED | - MAIN_EVENT_TOGGLE_CHAT | - MAIN_EVENT_START_LISTENING | - MAIN_EVENT_STOP_LISTENING | - MAIN_EVENT_ACTIVATION_DONE | + const EventBits_t ALL_EVENTS = + MAIN_EVENT_SCHEDULE | MAIN_EVENT_SEND_AUDIO | MAIN_EVENT_WAKE_WORD_DETECTED | + MAIN_EVENT_VAD_CHANGE | MAIN_EVENT_CLOCK_TICK | MAIN_EVENT_ERROR | + MAIN_EVENT_NETWORK_CONNECTED | MAIN_EVENT_NETWORK_DISCONNECTED | MAIN_EVENT_TOGGLE_CHAT | + MAIN_EVENT_START_LISTENING | MAIN_EVENT_STOP_LISTENING | MAIN_EVENT_ACTIVATION_DONE | MAIN_EVENT_STATE_CHANGED; while (true) { @@ -187,7 +179,8 @@ void Application::Run() { if (bits & MAIN_EVENT_ERROR) { SetDeviceState(kDeviceStateIdle); - Alert(Lang::Strings::ERROR, last_error_message_.c_str(), "circle_xmark", Lang::Sounds::OGG_EXCLAMATION); + Alert(Lang::Strings::ERROR, last_error_message_.c_str(), "circle_xmark", + Lang::Sounds::OGG_EXCLAMATION); } if (bits & MAIN_EVENT_NETWORK_CONNECTED) { @@ -257,7 +250,7 @@ void Application::Run() { clock_ticks_++; auto display = Board::GetInstance().GetDisplay(); display->UpdateStatusBar(); - + // Print debug info every 10 seconds if (clock_ticks_ % 10 == 0) { SystemInfo::PrintHeapStats(); @@ -278,12 +271,14 @@ void Application::HandleNetworkConnectedEvent() { return; } - xTaskCreate([](void* arg) { - Application* app = static_cast(arg); - app->ActivationTask(); - app->activation_task_handle_ = nullptr; - vTaskDelete(NULL); - }, "activation", 4096 * 2, this, 2, &activation_task_handle_); + xTaskCreate( + [](void* arg) { + Application* app = static_cast(arg); + app->ActivationTask(); + app->activation_task_handle_ = nullptr; + vTaskDelete(NULL); + }, + "activation", 4096 * 2, this, 2, &activation_task_handle_); } // Update the status bar immediately to show the network state @@ -294,7 +289,8 @@ void Application::HandleNetworkConnectedEvent() { void Application::HandleNetworkDisconnectedEvent() { // Close current conversation when network disconnected auto state = GetDeviceState(); - if (state == kDeviceStateConnecting || state == kDeviceStateListening || state == kDeviceStateSpeaking) { + if (state == kDeviceStateConnecting || state == kDeviceStateListening || + state == kDeviceStateThinking || state == kDeviceStateSpeaking) { ESP_LOGI(TAG, "Closing audio channel due to network disconnection"); protocol_->CloseAudioChannel(); } @@ -369,7 +365,7 @@ void Application::CheckAssetsVersion() { ESP_LOGW(TAG, "Assets partition is disabled for board %s", BOARD_NAME); return; } - + Settings settings("assets", true); // Check if there is a new assets need to be downloaded std::string download_url = settings.GetString("download_url"); @@ -379,27 +375,30 @@ void Application::CheckAssetsVersion() { char message[256]; snprintf(message, sizeof(message), Lang::Strings::FOUND_NEW_ASSETS, download_url.c_str()); - Alert(Lang::Strings::LOADING_ASSETS, message, "cloud_arrow_down", Lang::Sounds::OGG_UPGRADE); - + Alert(Lang::Strings::LOADING_ASSETS, message, "cloud_arrow_down", + Lang::Sounds::OGG_UPGRADE); + // Wait for the audio service to be idle for 3 seconds vTaskDelay(pdMS_TO_TICKS(3000)); SetDeviceState(kDeviceStateUpgrading); board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE); display->SetChatMessage("system", Lang::Strings::PLEASE_WAIT); - bool success = assets.Download(download_url, [this, display](int progress, size_t speed) -> void { - char buffer[32]; - snprintf(buffer, sizeof(buffer), "%d%% %uKB/s", progress, speed / 1024); - Schedule([display, message = std::string(buffer)]() { - display->SetChatMessage("system", message.c_str()); + bool success = + assets.Download(download_url, [this, display](int progress, size_t speed) -> void { + char buffer[32]; + snprintf(buffer, sizeof(buffer), "%d%% %uKB/s", progress, speed / 1024); + Schedule([display, message = std::string(buffer)]() { + display->SetChatMessage("system", message.c_str()); + }); }); - }); board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); vTaskDelay(pdMS_TO_TICKS(1000)); if (!success) { - Alert(Lang::Strings::ERROR, Lang::Strings::DOWNLOAD_ASSETS_FAILED, "circle_xmark", Lang::Sounds::OGG_EXCLAMATION); + Alert(Lang::Strings::ERROR, Lang::Strings::DOWNLOAD_ASSETS_FAILED, "circle_xmark", + Lang::Sounds::OGG_EXCLAMATION); vTaskDelay(pdMS_TO_TICKS(2000)); SetDeviceState(kDeviceStateActivating); return; @@ -415,7 +414,7 @@ void Application::CheckAssetsVersion() { void Application::CheckNewVersion() { const int MAX_RETRY = 10; int retry_count = 0; - int retry_delay = 10; // Initial retry delay in seconds + int retry_delay = 10; // Initial retry delay in seconds auto& board = Board::GetInstance(); while (true) { @@ -431,27 +430,30 @@ void Application::CheckNewVersion() { } char error_message[128]; - snprintf(error_message, sizeof(error_message), "code=%d, url=%s", err, ota_->GetCheckVersionUrl().c_str()); + snprintf(error_message, sizeof(error_message), "code=%d, url=%s", err, + ota_->GetCheckVersionUrl().c_str()); char buffer[256]; - snprintf(buffer, sizeof(buffer), Lang::Strings::CHECK_NEW_VERSION_FAILED, retry_delay, error_message); + snprintf(buffer, sizeof(buffer), Lang::Strings::CHECK_NEW_VERSION_FAILED, retry_delay, + error_message); Alert(Lang::Strings::ERROR, buffer, "cloud_slash", Lang::Sounds::OGG_EXCLAMATION); - ESP_LOGW(TAG, "Check new version failed, retry in %d seconds (%d/%d)", retry_delay, retry_count, MAX_RETRY); + ESP_LOGW(TAG, "Check new version failed, retry in %d seconds (%d/%d)", retry_delay, + retry_count, MAX_RETRY); for (int i = 0; i < retry_delay; i++) { vTaskDelay(pdMS_TO_TICKS(1000)); if (GetDeviceState() == kDeviceStateIdle) { break; } } - retry_delay *= 2; // Double the retry delay + retry_delay *= 2; // Double the retry delay continue; } retry_count = 0; - retry_delay = 10; // Reset retry delay + retry_delay = 10; // Reset retry delay if (ota_->HasNewVersion()) { if (UpgradeFirmware(ota_->GetFirmwareUrl(), ota_->GetFirmwareVersion())) { - return; // This line will never be reached after reboot + return; // This line will never be reached after reboot } // If upgrade failed, continue to normal operation } @@ -507,31 +509,32 @@ void Application::InitializeProtocol() { } #endif - protocol_->OnConnected([this]() { - DismissAlert(); - }); + protocol_->OnConnected([this]() { DismissAlert(); }); protocol_->OnNetworkError([this](const std::string& message) { last_error_message_ = message; xEventGroupSetBits(event_group_, MAIN_EVENT_ERROR); }); - + protocol_->OnIncomingAudio([this](std::unique_ptr packet) { - if (GetDeviceState() == kDeviceStateSpeaking) { + if (accepting_tts_audio_.load() || GetDeviceState() == kDeviceStateSpeaking) { audio_service_.PushPacketToDecodeQueue(std::move(packet)); } }); - + protocol_->OnAudioChannelOpened([this, codec, &board]() { board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE); if (protocol_->server_sample_rate() != codec->output_sample_rate()) { - ESP_LOGW(TAG, "Server sample rate %d does not match device output sample rate %d, resampling may cause distortion", - protocol_->server_sample_rate(), codec->output_sample_rate()); + ESP_LOGW(TAG, + "Server sample rate %d does not match device output sample rate %d, " + "resampling may cause distortion", + protocol_->server_sample_rate(), codec->output_sample_rate()); } }); - + protocol_->OnAudioChannelClosed([this, &board]() { board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); + accepting_tts_audio_.store(false); Schedule([this]() { if (GetDeviceState() == kDeviceStateConnecting) { return; @@ -541,20 +544,26 @@ void Application::InitializeProtocol() { SetDeviceState(kDeviceStateIdle); }); }); - + protocol_->OnIncomingJson([this, display](const cJSON* root) { // Parse JSON data auto type = cJSON_GetObjectItem(root, "type"); if (strcmp(type->valuestring, "tts") == 0) { auto state = cJSON_GetObjectItem(root, "state"); - if (strcmp(state->valuestring, "start") == 0) { + if (strcmp(state->valuestring, "thinking") == 0) { + Schedule([this]() { SetDeviceState(kDeviceStateThinking); }); + } else if (strcmp(state->valuestring, "start") == 0) { + audio_service_.ResetDecoder(); + accepting_tts_audio_.store(true); Schedule([this]() { aborted_ = false; SetDeviceState(kDeviceStateSpeaking); }); } else if (strcmp(state->valuestring, "stop") == 0) { + accepting_tts_audio_.store(false); Schedule([this]() { - if (GetDeviceState() == kDeviceStateSpeaking) { + auto state = GetDeviceState(); + if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) { if (listening_mode_ == kListeningModeManualStop) { SetDeviceState(kDeviceStateIdle); } else { @@ -597,9 +606,7 @@ void Application::InitializeProtocol() { ESP_LOGI(TAG, "System command: %s", command->valuestring); if (strcmp(command->valuestring, "reboot") == 0) { // Do a reboot if user requests a OTA update - Schedule([this]() { - Reboot(); - }); + Schedule([this]() { Reboot(); }); } else { ESP_LOGW(TAG, "Unknown system command: %s", command->valuestring); } @@ -609,7 +616,8 @@ void Application::InitializeProtocol() { auto message = cJSON_GetObjectItem(root, "message"); auto emotion = cJSON_GetObjectItem(root, "emotion"); if (cJSON_IsString(status) && cJSON_IsString(message) && cJSON_IsString(emotion)) { - Alert(status->valuestring, message->valuestring, emotion->valuestring, Lang::Sounds::OGG_VIBRATION); + Alert(status->valuestring, message->valuestring, emotion->valuestring, + Lang::Sounds::OGG_VIBRATION); } else { ESP_LOGW(TAG, "Alert command requires status, message and emotion"); } @@ -618,9 +626,10 @@ void Application::InitializeProtocol() { auto payload = cJSON_GetObjectItem(root, "payload"); ESP_LOGI(TAG, "Received custom message: %s", cJSON_PrintUnformatted(root)); if (cJSON_IsObject(payload)) { - Schedule([this, display, payload_str = std::string(cJSON_PrintUnformatted(payload))]() { - display->SetChatMessage("system", payload_str.c_str()); - }); + Schedule( + [this, display, payload_str = std::string(cJSON_PrintUnformatted(payload))]() { + display->SetChatMessage("system", payload_str.c_str()); + }); } else { ESP_LOGW(TAG, "Invalid custom message format: missing payload"); } @@ -629,7 +638,7 @@ void Application::InitializeProtocol() { ESP_LOGW(TAG, "Unknown message type: %s", type->valuestring); } }); - + protocol_->Start(); } @@ -638,32 +647,27 @@ void Application::ShowActivationCode(const std::string& code, const std::string& char digit; const std::string_view& sound; }; - static const std::array digit_sounds{{ - digit_sound{'0', Lang::Sounds::OGG_0}, - digit_sound{'1', Lang::Sounds::OGG_1}, - digit_sound{'2', Lang::Sounds::OGG_2}, - digit_sound{'3', Lang::Sounds::OGG_3}, - digit_sound{'4', Lang::Sounds::OGG_4}, - digit_sound{'5', Lang::Sounds::OGG_5}, - digit_sound{'6', Lang::Sounds::OGG_6}, - digit_sound{'7', Lang::Sounds::OGG_7}, - digit_sound{'8', Lang::Sounds::OGG_8}, - digit_sound{'9', Lang::Sounds::OGG_9} - }}; + static const std::array digit_sounds{ + {digit_sound{'0', Lang::Sounds::OGG_0}, digit_sound{'1', Lang::Sounds::OGG_1}, + digit_sound{'2', Lang::Sounds::OGG_2}, digit_sound{'3', Lang::Sounds::OGG_3}, + digit_sound{'4', Lang::Sounds::OGG_4}, digit_sound{'5', Lang::Sounds::OGG_5}, + digit_sound{'6', Lang::Sounds::OGG_6}, digit_sound{'7', Lang::Sounds::OGG_7}, + digit_sound{'8', Lang::Sounds::OGG_8}, digit_sound{'9', Lang::Sounds::OGG_9}}}; // This sentence uses 9KB of SRAM, so we need to wait for it to finish Alert(Lang::Strings::ACTIVATION, message.c_str(), "link", Lang::Sounds::OGG_ACTIVATION); for (const auto& digit : code) { auto it = std::find_if(digit_sounds.begin(), digit_sounds.end(), - [digit](const digit_sound& ds) { return ds.digit == digit; }); + [digit](const digit_sound& ds) { return ds.digit == digit; }); if (it != digit_sounds.end()) { audio_service_.PlaySound(it->sound); } } } -void Application::Alert(const char* status, const char* message, const char* emotion, const std::string_view& sound) { +void Application::Alert(const char* status, const char* message, const char* emotion, + const std::string_view& sound) { ESP_LOGW(TAG, "Alert [%s] %s: %s", emotion, status, message); auto display = Board::GetInstance().GetDisplay(); display->SetStatus(status); @@ -683,9 +687,7 @@ void Application::DismissAlert() { } } -void Application::ToggleChatState() { - ToggleChatStateForMode(kChatAgentModeNormal, false); -} +void Application::ToggleChatState() { ToggleChatStateForMode(kChatAgentModeNormal, false); } void Application::ToggleChatStateWithVision() { ToggleChatStateForMode(kChatAgentModeNormal, true); @@ -698,9 +700,7 @@ void Application::ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_e xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT); } -bool Application::IsVisionTextModeEnabled() const { - return vision_text_mode_enabled_.load(); -} +bool Application::IsVisionTextModeEnabled() const { return vision_text_mode_enabled_.load(); } const char* Application::GetChatAgentModeName() const { return chat_agent_mode_.load() == kChatAgentModeBeaver ? "beaver" : "normal"; @@ -720,9 +720,7 @@ void Application::StartListening() { xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING); } -void Application::StopListening() { - xEventGroupSetBits(event_group_, MAIN_EVENT_STOP_LISTENING); -} +void Application::StopListening() { xEventGroupSetBits(event_group_, MAIN_EVENT_STOP_LISTENING); } void Application::HandleToggleChatEvent() { auto state = GetDeviceState(); @@ -748,21 +746,21 @@ void Application::HandleToggleChatEvent() { if (state == kDeviceStateIdle) { ListeningMode mode = GetDefaultListeningMode(); bool agent_mode_changed = chat_agent_mode_.load() != active_chat_agent_mode_.load(); - bool vision_mode_changed = vision_text_mode_enabled_.load() != active_vision_text_mode_enabled_.load(); + bool vision_mode_changed = + vision_text_mode_enabled_.load() != active_vision_text_mode_enabled_.load(); if (!protocol_->IsAudioChannelOpened() || agent_mode_changed || vision_mode_changed) { if (protocol_->IsAudioChannelOpened()) { protocol_->CloseAudioChannel(); } SetDeviceState(kDeviceStateConnecting); // Schedule to let the state change be processed first (UI update) - Schedule([this, mode]() { - ContinueOpenAudioChannel(mode); - }); + Schedule([this, mode]() { ContinueOpenAudioChannel(mode); }); return; } SetListeningMode(mode); - } else if (state == kDeviceStateSpeaking) { + } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) { AbortSpeaking(kAbortReasonNone); + SetListeningMode(GetDefaultListeningMode()); } else if (state == kDeviceStateListening) { protocol_->CloseAudioChannel(); } @@ -791,7 +789,7 @@ void Application::ContinueOpenAudioChannel(ListeningMode mode) { void Application::HandleStartListeningEvent() { auto state = GetDeviceState(); - + if (state == kDeviceStateActivating) { SetDeviceState(kDeviceStateIdle); return; @@ -805,18 +803,16 @@ void Application::HandleStartListeningEvent() { ESP_LOGE(TAG, "Protocol not initialized"); return; } - + if (state == kDeviceStateIdle) { if (!protocol_->IsAudioChannelOpened()) { SetDeviceState(kDeviceStateConnecting); // Schedule to let the state change be processed first (UI update) - Schedule([this]() { - ContinueOpenAudioChannel(kListeningModeManualStop); - }); + Schedule([this]() { ContinueOpenAudioChannel(kListeningModeManualStop); }); return; } SetListeningMode(kListeningModeManualStop); - } else if (state == kDeviceStateSpeaking) { + } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) { AbortSpeaking(kAbortReasonNone); SetListeningMode(kListeningModeManualStop); } @@ -824,7 +820,7 @@ void Application::HandleStartListeningEvent() { void Application::HandleStopListeningEvent() { auto state = GetDeviceState(); - + if (state == kDeviceStateAudioTesting) { audio_service_.EnableAudioTesting(false); SetDeviceState(kDeviceStateWifiConfiguring); @@ -854,17 +850,14 @@ void Application::HandleWakeWordDetectedEvent() { SetDeviceState(kDeviceStateConnecting); // Schedule to let the state change be processed first (UI update), // then continue with OpenAudioChannel which may block for ~1 second - Schedule([this, wake_word]() { - ContinueWakeWordInvoke(wake_word); - }); + Schedule([this, wake_word]() { ContinueWakeWordInvoke(wake_word); }); return; } // Channel already opened, continue directly ContinueWakeWordInvoke(wake_word); - } else if (state == kDeviceStateSpeaking || state == kDeviceStateListening) { + } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking || + state == kDeviceStateListening) { AbortSpeaking(kAbortReasonWakeWordDetected); - // Clear send queue to avoid sending residues to server - while (audio_service_.PopPacketFromSendQueue()); if (state == kDeviceStateListening) { protocol_->SendStartListening(GetDefaultListeningMode()); @@ -925,14 +918,14 @@ void Application::HandleStateChangedEvent() { auto display = board.GetDisplay(); auto led = board.GetLed(); led->OnStateChanged(); - + switch (new_state) { case kDeviceStateUnknown: case kDeviceStateIdle: vision_frame_sent_for_current_listen_.store(false); display->SetStatus(Lang::Strings::STANDBY); - display->ClearChatMessages(); // Clear messages first - display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count) + display->ClearChatMessages(); // Clear messages first + display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count) audio_service_.EnableVoiceProcessing(false); audio_service_.EnableWakeWordDetection(true); break; @@ -947,18 +940,14 @@ void Application::HandleStateChangedEvent() { display->SetStatus(Lang::Strings::LISTENING); display->SetEmotion("neutral"); - // Make sure the audio processor is running - if (play_popup_on_listening_ || !audio_service_.IsAudioProcessorRunning()) { - // For auto mode, wait for playback queue to be empty before enabling voice processing - // This prevents audio truncation when STOP arrives late due to network jitter - if (listening_mode_ == kListeningModeAutoStop) { - audio_service_.WaitForPlaybackQueueEmpty(); - } - - // Send the start listening command - protocol_->SendStartListening(listening_mode_); - audio_service_.EnableVoiceProcessing(true); + // Re-entering listening after an interrupt must restart the capture path even if the + // processor task is still marked running, otherwise realtime mode can show Listening + // while no fresh mic frames are sent. + if (listening_mode_ == kListeningModeAutoStop) { + audio_service_.WaitForPlaybackQueueEmpty(); } + protocol_->SendStartListening(listening_mode_); + audio_service_.EnableVoiceProcessing(true); #ifdef CONFIG_WAKE_WORD_DETECTION_IN_LISTENING // Enable wake word detection in listening mode (configured via Kconfig) @@ -967,13 +956,23 @@ void Application::HandleStateChangedEvent() { // Disable wake word detection in listening mode audio_service_.EnableWakeWordDetection(false); #endif - + // Play popup sound after ResetDecoder (in EnableVoiceProcessing) has been called if (play_popup_on_listening_) { play_popup_on_listening_ = false; audio_service_.PlaySound(Lang::Sounds::OGG_POPUP); } break; + case kDeviceStateThinking: + vad_speaking_.store(false); + display->SetStatus(Lang::Strings::THINKING); + display->SetEmotion("thinking"); + + if (listening_mode_ != kListeningModeRealtime) { + audio_service_.EnableVoiceProcessing(false); + audio_service_.EnableWakeWordDetection(audio_service_.IsAfeWakeWord()); + } + break; case kDeviceStateSpeaking: display->SetStatus(Lang::Strings::SPEAKING); @@ -982,7 +981,9 @@ void Application::HandleStateChangedEvent() { // Only AFE wake word can be detected in speaking mode audio_service_.EnableWakeWordDetection(audio_service_.IsAfeWakeWord()); } - audio_service_.ResetDecoder(); + if (!accepting_tts_audio_.load()) { + audio_service_.ResetDecoder(); + } break; case kDeviceStateWifiConfiguring: audio_service_.EnableVoiceProcessing(false); @@ -1026,6 +1027,8 @@ void Application::Schedule(std::function&& callback) { void Application::AbortSpeaking(AbortReason reason) { ESP_LOGI(TAG, "Abort speaking"); aborted_ = true; + accepting_tts_audio_.store(false); + audio_service_.ResetDecoder(); if (protocol_) { protocol_->SendAbortSpeaking(reason); } @@ -1069,7 +1072,8 @@ bool Application::UpgradeFirmware(const std::string& url, const std::string& ver } ESP_LOGI(TAG, "Starting firmware upgrade from URL: %s", upgrade_url.c_str()); - Alert(Lang::Strings::OTA_UPGRADE, Lang::Strings::UPGRADING, "download", Lang::Sounds::OGG_UPGRADE); + Alert(Lang::Strings::OTA_UPGRADE, Lang::Strings::UPGRADING, "download", + Lang::Sounds::OGG_UPGRADE); vTaskDelay(pdMS_TO_TICKS(3000)); SetDeviceState(kDeviceStateUpgrading); @@ -1091,17 +1095,19 @@ bool Application::UpgradeFirmware(const std::string& url, const std::string& ver if (!upgrade_success) { // Upgrade failed, restart audio service and continue running - ESP_LOGE(TAG, "Firmware upgrade failed, restarting audio service and continuing operation..."); - audio_service_.Start(); // Restart audio service - board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); // Restore power save level - Alert(Lang::Strings::ERROR, Lang::Strings::UPGRADE_FAILED, "circle_xmark", Lang::Sounds::OGG_EXCLAMATION); + ESP_LOGE(TAG, + "Firmware upgrade failed, restarting audio service and continuing operation..."); + audio_service_.Start(); // Restart audio service + board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); // Restore power save level + Alert(Lang::Strings::ERROR, Lang::Strings::UPGRADE_FAILED, "circle_xmark", + Lang::Sounds::OGG_EXCLAMATION); vTaskDelay(pdMS_TO_TICKS(3000)); return false; } else { // Upgrade success, reboot immediately ESP_LOGI(TAG, "Firmware upgrade successful, rebooting..."); display->SetChatMessage("system", "Upgrade successful, rebooting..."); - vTaskDelay(pdMS_TO_TICKS(1000)); // Brief pause to show message + vTaskDelay(pdMS_TO_TICKS(1000)); // Brief pause to show message Reboot(); return true; } @@ -1113,25 +1119,21 @@ void Application::WakeWordInvoke(const std::string& wake_word) { } auto state = GetDeviceState(); - + if (state == kDeviceStateIdle) { audio_service_.EncodeWakeWord(); if (!protocol_->IsAudioChannelOpened()) { SetDeviceState(kDeviceStateConnecting); // Schedule to let the state change be processed first (UI update) - Schedule([this, wake_word]() { - ContinueWakeWordInvoke(wake_word); - }); + Schedule([this, wake_word]() { ContinueWakeWordInvoke(wake_word); }); return; } // Channel already opened, continue directly ContinueWakeWordInvoke(wake_word); - } else if (state == kDeviceStateSpeaking) { - Schedule([this]() { - AbortSpeaking(kAbortReasonNone); - }); - } else if (state == kDeviceStateListening) { + } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) { + Schedule([this]() { AbortSpeaking(kAbortReasonNone); }); + } else if (state == kDeviceStateListening) { Schedule([this]() { if (protocol_) { protocol_->CloseAudioChannel(); @@ -1163,7 +1165,7 @@ void Application::RegisterMcpBroadcastCallback(std::functionSendMcpMessage(payload); } @@ -1179,18 +1181,18 @@ void Application::SetAecMode(AecMode mode) { auto& board = Board::GetInstance(); auto display = board.GetDisplay(); switch (aec_mode_) { - case kAecOff: - audio_service_.EnableDeviceAec(false); - display->ShowNotification(Lang::Strings::RTC_MODE_OFF); - break; - case kAecOnServerSide: - audio_service_.EnableDeviceAec(false); - display->ShowNotification(Lang::Strings::RTC_MODE_ON); - break; - case kAecOnDeviceSide: - audio_service_.EnableDeviceAec(true); - display->ShowNotification(Lang::Strings::RTC_MODE_ON); - break; + case kAecOff: + audio_service_.EnableDeviceAec(false); + display->ShowNotification(Lang::Strings::RTC_MODE_OFF); + break; + case kAecOnServerSide: + audio_service_.EnableDeviceAec(false); + display->ShowNotification(Lang::Strings::RTC_MODE_ON); + break; + case kAecOnDeviceSide: + audio_service_.EnableDeviceAec(true); + display->ShowNotification(Lang::Strings::RTC_MODE_ON); + break; } // If the AEC mode is changed, close the audio channel @@ -1200,9 +1202,7 @@ void Application::SetAecMode(AecMode mode) { }); } -void Application::PlaySound(const std::string_view& sound) { - audio_service_.PlaySound(sound); -} +void Application::PlaySound(const std::string_view& sound) { audio_service_.PlaySound(sound); } void Application::ResetProtocol() { Schedule([this]() { diff --git a/main/application.h b/main/application.h index 03f7edf..4ebba78 100644 --- a/main/application.h +++ b/main/application.h @@ -162,6 +162,7 @@ private: std::atomic active_vision_text_mode_enabled_ = false; std::atomic vad_speaking_ = false; std::atomic vision_frame_sent_for_current_listen_ = false; + std::atomic accepting_tts_audio_ = false; int clock_ticks_ = 0; TaskHandle_t activation_task_handle_ = nullptr; diff --git a/main/assets/locales/en-US/language.json b/main/assets/locales/en-US/language.json index 8bb764a..38029ca 100644 --- a/main/assets/locales/en-US/language.json +++ b/main/assets/locales/en-US/language.json @@ -26,6 +26,7 @@ "CONNECTION_SUCCESSFUL": "Connection Successful", "CONNECTED_TO": "Connected to ", "LISTENING": "Listening...", + "THINKING": "Thinking...", "SPEAKING": "Speaking...", "SERVER_NOT_FOUND": "Looking for available service", "SERVER_NOT_CONNECTED": "Unable to connect to service, please try again later", @@ -56,4 +57,4 @@ "LOADING_ASSETS": "Loading assets...", "HELLO_MY_FRIEND": "Hello, my friend!" } -} \ No newline at end of file +} diff --git a/main/assets/locales/zh-CN/language.json b/main/assets/locales/zh-CN/language.json index a21f51a..17b287f 100644 --- a/main/assets/locales/zh-CN/language.json +++ b/main/assets/locales/zh-CN/language.json @@ -23,6 +23,7 @@ "CONNECTING": "连接中...", "CONNECTED_TO": "已连接 ", "LISTENING": "聆听中...", + "THINKING": "思考中...", "SPEAKING": "说话中...", "SERVER_NOT_FOUND": "正在寻找可用服务", "SERVER_NOT_CONNECTED": "无法连接服务,请稍后再试", @@ -56,4 +57,4 @@ "FLIGHT_MODE_OFF": "飞行模式已关闭", "FLIGHT_MODE_ON": "飞行模式已开启" } -} \ No newline at end of file +} diff --git a/main/audio/audio_service.cc b/main/audio/audio_service.cc index 350ac58..7698c6f 100644 --- a/main/audio/audio_service.cc +++ b/main/audio/audio_service.cc @@ -579,6 +579,7 @@ void AudioService::EnableWakeWordDetection(bool enable) { void AudioService::EnableVoiceProcessing(bool enable) { ESP_LOGD(TAG, "%s voice processing", enable ? "Enabling" : "Disabling"); if (enable) { + bool was_running = IsAudioProcessorRunning(); if (!audio_processor_initialized_) { audio_processor_->Initialize(codec_, OPUS_FRAME_DURATION_MS, models_list_); audio_processor_initialized_ = true; @@ -586,7 +587,7 @@ void AudioService::EnableVoiceProcessing(bool enable) { /* We should make sure no audio is playing */ ResetDecoder(); - audio_input_need_warmup_ = true; + audio_input_need_warmup_ = !was_running; // Reset input resampler to clear cached data from previous mode (e.g. WakeWord) // This prevents buffer overflow when switching between different feed sizes { diff --git a/main/background_capture_service.cc b/main/background_capture_service.cc new file mode 100644 index 0000000..ea4c064 --- /dev/null +++ b/main/background_capture_service.cc @@ -0,0 +1,177 @@ +#include "background_capture_service.h" + +#include "board.h" +#include "camera.h" + +#include +#include +#include + +#define TAG "BgCapture" + +BackgroundCaptureService::BackgroundCaptureService() = default; + +BackgroundCaptureService::~BackgroundCaptureService() { + Stop(); +} + +void BackgroundCaptureService::Start() { +#if CONFIG_BACKGROUND_CAPTURE_ENABLE + if (running_.exchange(true)) { + return; + } + + auto result = xTaskCreate( + &BackgroundCaptureService::TaskEntry, + "bg_capture", + CONFIG_BACKGROUND_CAPTURE_TASK_STACK_SIZE, + this, + CONFIG_BACKGROUND_CAPTURE_TASK_PRIORITY, + &task_handle_); + if (result != pdPASS) { + running_.store(false); + task_handle_ = nullptr; + ESP_LOGE(TAG, "Failed to create background capture task"); + } +#endif +} + +void BackgroundCaptureService::Stop() { +#if CONFIG_BACKGROUND_CAPTURE_ENABLE + if (!running_.exchange(false)) { + return; + } + + while (task_handle_ != nullptr) { + vTaskDelay(pdMS_TO_TICKS(20)); + } +#endif +} + +void BackgroundCaptureService::TaskEntry(void* arg) { +#if CONFIG_BACKGROUND_CAPTURE_ENABLE + auto* service = static_cast(arg); + service->Run(); + service->task_handle_ = nullptr; +#else + (void)arg; +#endif + vTaskDelete(nullptr); +} + +void BackgroundCaptureService::Run() { +#if CONFIG_BACKGROUND_CAPTURE_ENABLE + ESP_LOGI(TAG, "Background capture task started"); + + while (running_.load()) { + if (!CaptureAndSendFrame()) { + consecutive_failures_++; + auto delay_ms = GetFailureDelayMs(); + ESP_LOGW(TAG, "Background capture retry in %u ms, failures=%u", + delay_ms, consecutive_failures_); + vTaskDelay(pdMS_TO_TICKS(delay_ms)); + continue; + } + + consecutive_failures_ = 0; + vTaskDelay(pdMS_TO_TICKS(CONFIG_BACKGROUND_CAPTURE_FRAME_INTERVAL_MS)); + } + + ESP_LOGI(TAG, "Background capture task stopped"); +#endif +} + +bool BackgroundCaptureService::CaptureAndSendFrame() { +#if CONFIG_BACKGROUND_CAPTURE_ENABLE + const size_t free_internal_heap = heap_caps_get_free_size(MALLOC_CAP_INTERNAL); + if (free_internal_heap < CONFIG_BACKGROUND_CAPTURE_MIN_FREE_INTERNAL_HEAP) { + ESP_LOGW(TAG, "Skip background capture, low internal heap: free=%u threshold=%u", + static_cast(free_internal_heap), + static_cast(CONFIG_BACKGROUND_CAPTURE_MIN_FREE_INTERNAL_HEAP)); + return false; + } + + auto camera = Board::GetInstance().GetCamera(); + if (camera == nullptr) { + ESP_LOGW(TAG, "No camera available for background capture"); + return false; + } + + std::string jpeg_data; + if (!camera->CaptureToJpeg(jpeg_data, false)) { + ESP_LOGW(TAG, "Failed to capture background frame"); + return false; + } + + if (jpeg_data.empty()) { + ESP_LOGW(TAG, "Captured empty background frame"); + return false; + } + + return UploadJpegFrame(jpeg_data); +#else + return false; +#endif +} + +uint32_t BackgroundCaptureService::GetFailureDelayMs() const { +#if CONFIG_BACKGROUND_CAPTURE_ENABLE + const uint32_t base_delay_ms = CONFIG_BACKGROUND_CAPTURE_RETRY_INTERVAL_MS; + const uint32_t max_delay_ms = CONFIG_BACKGROUND_CAPTURE_MAX_BACKOFF_MS; + const uint32_t shift = std::min(consecutive_failures_ - 1, 4); + return std::min(base_delay_ms << shift, max_delay_ms); +#else + return 0; +#endif +} + +bool BackgroundCaptureService::UploadJpegFrame(const std::string& jpeg_data) { +#if CONFIG_BACKGROUND_CAPTURE_ENABLE + const std::string url = CONFIG_BACKGROUND_CAPTURE_UPLOAD_URL; + if (url.empty()) { + ESP_LOGI(TAG, "Captured background frame: %u bytes", static_cast(jpeg_data.size())); + return true; + } + + auto network = Board::GetInstance().GetNetwork(); + if (network == nullptr) { + ESP_LOGW(TAG, "No network available for background upload"); + return false; + } + + const std::string boundary = "----XIAOZHI_BACKGROUND_CAPTURE_BOUNDARY"; + auto http = network->CreateHttp(3); + http->SetHeader("Content-Type", "multipart/form-data; boundary=" + boundary); + + if (!http->Open("POST", url)) { + ESP_LOGW(TAG, "Failed to open background upload URL: %s", url.c_str()); + return false; + } + + std::string file_header; + file_header += "--" + boundary + "\r\n"; + file_header += "Content-Disposition: form-data; name=\"file\"; filename=\"frame.jpg\"\r\n"; + file_header += "Content-Type: image/jpeg\r\n\r\n"; + http->Write(file_header.c_str(), file_header.size()); + http->Write(jpeg_data.data(), jpeg_data.size()); + + std::string footer; + footer += "\r\n--" + boundary + "--\r\n"; + http->Write(footer.c_str(), footer.size()); + http->Write("", 0); + + const int status_code = http->GetStatusCode(); + http->Close(); + + if (status_code < 200 || status_code >= 300) { + ESP_LOGW(TAG, "Background upload failed, status=%d", status_code); + return false; + } + + ESP_LOGI(TAG, "Uploaded background frame: %u bytes", static_cast(jpeg_data.size())); + return true; +#else + (void)jpeg_data; + return false; +#endif +} diff --git a/main/background_capture_service.h b/main/background_capture_service.h new file mode 100644 index 0000000..a5dcbf5 --- /dev/null +++ b/main/background_capture_service.h @@ -0,0 +1,32 @@ +#ifndef BACKGROUND_CAPTURE_SERVICE_H +#define BACKGROUND_CAPTURE_SERVICE_H + +#include +#include + +#include +#include +#include + +class BackgroundCaptureService { +public: + BackgroundCaptureService(); + ~BackgroundCaptureService(); + + void Start(); + void Stop(); + bool IsRunning() const { return running_.load(); } + +private: + TaskHandle_t task_handle_ = nullptr; + std::atomic running_ = false; + uint32_t consecutive_failures_ = 0; + + static void TaskEntry(void* arg); + void Run(); + bool CaptureAndSendFrame(); + bool UploadJpegFrame(const std::string& jpeg_data); + uint32_t GetFailureDelayMs() const; +}; + +#endif // BACKGROUND_CAPTURE_SERVICE_H diff --git a/main/boards/atoms3r-echo-pyramid/atoms3r_echo_pyramid.cc b/main/boards/atoms3r-echo-pyramid/atoms3r_echo_pyramid.cc index 5fe3fdf..662e7e2 100644 --- a/main/boards/atoms3r-echo-pyramid/atoms3r_echo_pyramid.cc +++ b/main/boards/atoms3r-echo-pyramid/atoms3r_echo_pyramid.cc @@ -214,6 +214,9 @@ public: case kDeviceStateSpeaking: ctrl_->SetStatusColor(64, 0, 0); // red break; + case kDeviceStateThinking: + ctrl_->SetStatusColor(0, 0, 64); // blue + break; default: ctrl_->SetStatusColor(0, 0, 64); // blue break; diff --git a/main/boards/common/wifi_board.cc b/main/boards/common/wifi_board.cc index 377685b..bcab012 100644 --- a/main/boards/common/wifi_board.cc +++ b/main/boards/common/wifi_board.cc @@ -203,7 +203,10 @@ void WifiBoard::EnterWifiConfigMode() { auto& app = Application::GetInstance(); auto state = app.GetDeviceState(); - if (state == kDeviceStateSpeaking || state == kDeviceStateListening || state == kDeviceStateIdle) { + if (state == kDeviceStateSpeaking || + state == kDeviceStateThinking || + state == kDeviceStateListening || + state == kDeviceStateIdle) { // Reset protocol (close audio channel, reset protocol) Application::GetInstance().ResetProtocol(); diff --git a/main/boards/electron-bot/electron_emoji_display.cc b/main/boards/electron-bot/electron_emoji_display.cc index 7020a35..1dceb0c 100644 --- a/main/boards/electron-bot/electron_emoji_display.cc +++ b/main/boards/electron-bot/electron_emoji_display.cc @@ -85,6 +85,13 @@ void ElectronEmojiDisplay::SetStatus(const char* status) { lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN); lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN); return; + } else if (strcmp(status, Lang::Strings::THINKING) == 0) { + lv_obj_set_style_text_font(status_label_, text_font, 0); + lv_label_set_text(status_label_, status); + lv_obj_clear_flag(status_label_, LV_OBJ_FLAG_HIDDEN); + lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN); + lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN); + return; } else if (strcmp(status, Lang::Strings::CONNECTING) == 0) { lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0); lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标 @@ -102,4 +109,4 @@ void ElectronEmojiDisplay::SetStatus(const char* status) { lv_obj_clear_flag(status_label_, LV_OBJ_FLAG_HIDDEN); lv_obj_clear_flag(network_label_, LV_OBJ_FLAG_HIDDEN); lv_obj_clear_flag(battery_label_, LV_OBJ_FLAG_HIDDEN); -} \ No newline at end of file +} diff --git a/main/boards/esp-hi/emoji_display.cc b/main/boards/esp-hi/emoji_display.cc index c494337..6c9d590 100644 --- a/main/boards/esp-hi/emoji_display.cc +++ b/main/boards/esp-hi/emoji_display.cc @@ -155,6 +155,8 @@ void EmojiWidget::SetStatus(const char* status) if (player_) { if (strcmp(status, Lang::Strings::LISTENING) == 0) { player_->StartPlayer("asking", true, 15); + } else if (strcmp(status, Lang::Strings::THINKING) == 0) { + player_->StartPlayer("thinking", true, 15); } else if (strcmp(status, Lang::Strings::STANDBY) == 0) { player_->StartPlayer("wake", true, 15); } diff --git a/main/boards/jiuchuan-s3/jiuchuan_dev_board.cc b/main/boards/jiuchuan-s3/jiuchuan_dev_board.cc index 76c522e..a55a9ae 100644 --- a/main/boards/jiuchuan-s3/jiuchuan_dev_board.cc +++ b/main/boards/jiuchuan-s3/jiuchuan_dev_board.cc @@ -231,9 +231,9 @@ private: // 如果当前是聆听状态,切换到待命状态 ESP_LOGI(TAG, "从聆听状态切换到待命状态"); app.ToggleChatState(); // 切换到待命状态 - } else if (current_state == kDeviceStateSpeaking) { - // 如果当前是说话状态,终止说话并切换到待命状态 - ESP_LOGI(TAG, "从说话状态切换到待命状态"); + } else if (current_state == kDeviceStateSpeaking || current_state == kDeviceStateThinking) { + // 如果当前是说话或思考状态,终止并切换到待命状态 + ESP_LOGI(TAG, "从说话/思考状态切换到待命状态"); app.ToggleChatState(); // 终止说话 } else { // 其他状态下只唤醒设备 diff --git a/main/boards/otto-robot/otto_emoji_display.cc b/main/boards/otto-robot/otto_emoji_display.cc index 7702091..46661b9 100644 --- a/main/boards/otto-robot/otto_emoji_display.cc +++ b/main/boards/otto-robot/otto_emoji_display.cc @@ -77,6 +77,13 @@ void OttoEmojiDisplay::SetStatus(const char* status) { lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN); lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN); return; + } else if (strcmp(status, Lang::Strings::THINKING) == 0) { + lv_obj_set_style_text_font(status_label_, text_font, 0); + lv_label_set_text(status_label_, status); + lv_obj_clear_flag(status_label_, LV_OBJ_FLAG_HIDDEN); + lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN); + lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN); + return; } else if (strcmp(status, Lang::Strings::CONNECTING) == 0) { lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0); lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标 @@ -131,4 +138,4 @@ void OttoEmojiDisplay::SetPreviewImage(std::unique_ptr image) { lv_obj_remove_flag(preview_image_, LV_OBJ_FLAG_HIDDEN); esp_timer_stop(preview_timer_); ESP_ERROR_CHECK(esp_timer_start_once(preview_timer_, PREVIEW_IMAGE_DURATION_MS * 1000)); -} \ No newline at end of file +} diff --git a/main/bridge_server.py b/main/bridge_server.py index a0881cc..5746c57 100644 --- a/main/bridge_server.py +++ b/main/bridge_server.py @@ -1,5 +1,6 @@ import asyncio import base64 +import contextlib import json import os import re @@ -44,29 +45,40 @@ CHAT_MODE_AGENT_NAMES = { CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0")) AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0")) WS_PORT = 8080 +WS_MAX_QUEUE = int(os.getenv("BRIDGE_WS_MAX_QUEUE", "128")) +WS_MAX_SIZE = int(os.getenv("BRIDGE_WS_MAX_SIZE", str(8 * 1024 * 1024))) AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower() PROJECT_ROOT = Path(__file__).resolve().parent.parent VISION_FRAME_SAVE_DIR = Path(os.getenv("VISION_FRAME_SAVE_DIR", str(PROJECT_ROOT / "vision_frames"))) -INPUT_SAMPLE_RATE = 16000 -OUTPUT_SAMPLE_RATE = 24000 -INPUT_FRAME_DURATION_MS = 20 -INPUT_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * INPUT_FRAME_DURATION_MS // 1000 -INPUT_MAX_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * 60 // 1000 -OUTPUT_FRAME_DURATION_MS = 20 -OUTPUT_SAMPLES_PER_OPUS_FRAME = OUTPUT_SAMPLE_RATE * OUTPUT_FRAME_DURATION_MS // 1000 -TTS_IDLE_TIMEOUT_SECONDS = 0.25 +INPUT_SAMPLE_RATE = int(os.getenv("BRIDGE_INPUT_SAMPLE_RATE", "16000")) +OUTPUT_SAMPLE_RATE = int(os.getenv("BRIDGE_OUTPUT_SAMPLE_RATE", "24000")) +INPUT_FRAME_DURATION_MS = int(os.getenv("BRIDGE_INPUT_FRAME_DURATION_MS", "60")) +INPUT_MAX_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * 120 // 1000 +OUTPUT_FRAME_DURATION_MS = int(os.getenv("BRIDGE_OUTPUT_FRAME_DURATION_MS", "60")) +AUDIO_STATS_INTERVAL_SECONDS = float(os.getenv("BRIDGE_AUDIO_STATS_INTERVAL_SECONDS", "5.0")) +DOWNLINK_SEND_GAP_WARN_MS = float(os.getenv("BRIDGE_DOWNLINK_SEND_GAP_WARN_MS", "180.0")) +UPLINK_CAPTURE_TIMEOUT_SECONDS = float(os.getenv("BRIDGE_UPLINK_CAPTURE_TIMEOUT_SECONDS", "0.25")) +TTS_IDLE_TIMEOUT_SECONDS = float(os.getenv("TTS_IDLE_TIMEOUT_SECONDS", "1.2")) +TTS_MIN_ACTIVE_SECONDS = float(os.getenv("TTS_MIN_ACTIVE_SECONDS", "1.0")) TTS_SILENCE_PEAK_THRESHOLD = 96 -TTS_PRE_ROLL_MS = 80 -TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1 +TTS_PRE_ROLL_MS = int(os.getenv("TTS_PRE_ROLL_MS", "480")) +TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = int(os.getenv("TTS_START_CONSECUTIVE_AUDIBLE_FRAMES", "1")) TTS_INTERRUPT_SILENCE_FRAMES = 3 INTERRUPT_TOPIC = "lk.interrupt" VISION_FRAME_TOPIC = "vision.frame" +AGENT_STATE_ATTRIBUTE = "lk.agent.state" TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;;" TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18")) TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18")) TTS_DISPLAY_SCROLL_GAP = " " TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8")) +TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS = float( + os.getenv("TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS", "0.25") +) +TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS = float( + os.getenv("TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS", "8.0") +) EMOTION_TEXT_PATTERN = re.compile( r"^\s*,,;;]+)\s*>?[\s,,;;]*(.*)$", re.DOTALL, @@ -95,6 +107,7 @@ class DeviceSession: agent_ready: asyncio.Event forwarding_tracks: dict[str, asyncio.Task[Any]] = field(default_factory=dict) tts_active: bool = False + tts_thinking: bool = False tts_idle_task: Optional[asyncio.Task] = None tts_display_task: Optional[asyncio.Task] = None tts_stream_id: int = 0 @@ -103,6 +116,11 @@ class DeviceSession: tts_display_final: bool = False tts_emotion: str = "" tts_suppressed_until: float = 0.0 + tts_started_at: float = 0.0 + tts_last_audible_at: float = 0.0 + tts_waiting_for_user_audio_after_interrupt: bool = False + last_interrupt_time: float = 0.0 + last_uplink_audible_time: float = 0.0 agent_dispatch_task: Optional[asyncio.Task] = None closed: bool = False captured_frame_count: int = 0 @@ -138,6 +156,100 @@ class ESP32LiveKitBridge: if formatted_tb.strip(): print(formatted_tb.rstrip()) + def _audio_duration_ms(self, sample_count: int, sample_rate: int) -> float: + if sample_rate <= 0: + return 0.0 + return sample_count * 1000.0 / sample_rate + + def _build_server_hello(self, session: DeviceSession) -> dict[str, Any]: + return { + "type": "hello", + "transport": "websocket", + "session": { + "room": session.room_name, + "identity": session.identity, + }, + "audio_params": { + "format": "opus", + "sample_rate": OUTPUT_SAMPLE_RATE, + "channels": 1, + "frame_duration": OUTPUT_FRAME_DURATION_MS, + }, + } + + def _log_client_hello(self, session: DeviceSession, message: dict[str, Any]) -> None: + audio_params = message.get("audio_params") + if not isinstance(audio_params, dict): + return + + sample_rate = audio_params.get("sample_rate") + frame_duration = audio_params.get("frame_duration") + channels = audio_params.get("channels") + fmt = audio_params.get("format") + print( + "[client-audio] " + f"device={session.device_id} format={fmt} sample_rate={sample_rate} " + f"channels={channels} frame_duration={frame_duration}" + ) + + if sample_rate != INPUT_SAMPLE_RATE or channels != 1: + print( + "[client-audio] warning: bridge uplink decode expects " + f"{INPUT_SAMPLE_RATE}Hz mono, got {sample_rate}Hz channels={channels}" + ) + if frame_duration != INPUT_FRAME_DURATION_MS: + print( + "[client-audio] warning: bridge expects " + f"{INPUT_FRAME_DURATION_MS}ms uplink frames, got {frame_duration}ms" + ) + + def _track_room_connect_task( + self, + session: DeviceSession, + task: asyncio.Task[Any], + ) -> None: + if task.cancelled(): + return + try: + task.result() + except Exception as exc: + self._log_exception( + f"LiveKit 房间连接后台任务失败: room={session.room_name}", + exc, + ) + websocket = session.websocket + if websocket is not None: + asyncio.create_task(websocket.close(code=1011, reason="livekit connect failed")) + + async def _capture_mic_frame( + self, + session: DeviceSession, + pcm_bytes: bytes, + num_samples: int, + ) -> bool: + try: + frame = AudioFrame(pcm_bytes, INPUT_SAMPLE_RATE, 1, num_samples) + except TypeError: + frame = AudioFrame.create( + sample_rate=INPUT_SAMPLE_RATE, + num_channels=1, + samples_per_channel=num_samples, + ) + memoryview(frame.data).cast("B")[:] = pcm_bytes + + try: + await asyncio.wait_for( + session.mic_source.capture_frame(frame), + timeout=UPLINK_CAPTURE_TIMEOUT_SECONDS, + ) + return True + except asyncio.TimeoutError: + print( + "[uplink] warning: capture_frame timeout, dropping frame " + f"device={session.device_id} samples={num_samples}" + ) + return False + def _build_device_id(self, websocket: Any) -> str: headers = websocket.request.headers requested_id = headers.get("X-Device-Id") or headers.get("Device-Id") @@ -445,7 +557,7 @@ class ESP32LiveKitBridge: print("收到 vision frame,但 image 字段为空") return - saved_path = self._save_vision_frame(session, image) + saved_path = await asyncio.to_thread(self._save_vision_frame, session, image) if saved_path is None: return print(f"已保存 vision frame: {saved_path}") @@ -599,6 +711,10 @@ class ESP32LiveKitBridge: if session.tts_active: print("跳过 tts start,当前已处于激活状态") return + block_reason = self._tts_resume_block_reason(session, include_user_quiet=False) + if block_reason is not None: + print(f"跳过 tts start,打断后仍在等待稳定聆听: {block_reason}") + return if time.monotonic() < session.tts_suppressed_until: print("跳过 tts start,中断后的残留音频仍在抑制窗口内") return @@ -607,16 +723,83 @@ class ESP32LiveKitBridge: session.tts_display_final = False session.tts_emotion = "" self._cancel_tts_display_task(session) + now = time.monotonic() + session.tts_started_at = now + session.tts_last_audible_at = now await self._send_tts_state(session, "start") session.tts_active = True + session.tts_thinking = False + + async def _start_thinking(self, session: DeviceSession) -> None: + if session.tts_active: + print("跳过 tts thinking,当前已处于 TTS 播放状态") + return + if session.tts_thinking: + print("跳过 tts thinking,当前已处于思考状态") + return + block_reason = self._tts_resume_block_reason(session, include_user_quiet=False) + if block_reason is not None: + print(f"跳过 tts thinking,打断后仍在等待稳定聆听: {block_reason}") + return + if time.monotonic() < session.tts_suppressed_until: + print("跳过 tts thinking,中断后的残留音频仍在抑制窗口内") + return + await self._send_tts_state(session, "thinking") + session.tts_thinking = True + + def _tts_resume_block_reason( + self, + session: DeviceSession, + now: Optional[float] = None, + *, + include_user_quiet: bool = True, + ) -> Optional[str]: + if now is None: + now = time.monotonic() + + if session.tts_waiting_for_user_audio_after_interrupt: + return "waiting_for_user_audio_after_interrupt" + + if session.last_interrupt_time <= 0.0: + return None + + since_interrupt = now - session.last_interrupt_time + if since_interrupt > TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS: + return None + + if session.last_uplink_audible_time < session.last_interrupt_time: + return None + + if not include_user_quiet: + return None + + quiet_for = now - session.last_uplink_audible_time + if quiet_for < TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS: + return f"user_audio_quiet_for={quiet_for:.2f}s" + + return None + + def _handle_agent_state(self, session: DeviceSession, participant: rtc.Participant) -> None: + state = participant.attributes.get(AGENT_STATE_ATTRIBUTE) + if not isinstance(state, str) or not state: + return + + print( + f"[agent-state] room={session.room_name} identity={participant.identity} state={state}" + ) + if state == "thinking": + asyncio.create_task(self._start_thinking(session)) async def _stop_tts(self, session: DeviceSession) -> None: - if not session.tts_active: + if not session.tts_active and not session.tts_thinking: print("跳过 tts stop,当前未激活") return self._cancel_tts_display_task(session) await self._send_tts_state(session, "stop") session.tts_active = False + session.tts_thinking = False + session.tts_started_at = 0.0 + session.tts_last_audible_at = 0.0 session.tts_transcript_text = "" session.tts_display_text = "" session.tts_display_final = False @@ -628,6 +811,9 @@ class ESP32LiveKitBridge: session.tts_idle_task.cancel() session.tts_idle_task = None session.tts_active = False + session.tts_thinking = False + session.tts_started_at = 0.0 + session.tts_last_audible_at = 0.0 session.tts_transcript_text = "" session.tts_display_text = "" session.tts_display_final = False @@ -637,12 +823,16 @@ class ESP32LiveKitBridge: async def _abort_tts(self, session: DeviceSession, reason: str = "client_abort") -> None: print(f"收到打断请求,停止当前 TTS: device={session.device_id} reason={reason}") + now = time.monotonic() session.tts_stream_id += 1 - session.tts_suppressed_until = time.monotonic() + TTS_INTERRUPT_SUPPRESS_SECONDS + session.last_interrupt_time = now + session.tts_suppressed_until = now + TTS_INTERRUPT_SUPPRESS_SECONDS + session.tts_waiting_for_user_audio_after_interrupt = True await self._force_stop_tts(session, reason) asyncio.create_task(self._send_agent_interrupt(session, reason)) def _reset_tts_idle_timer(self, session: DeviceSession) -> None: + session.tts_last_audible_at = time.monotonic() if session.tts_idle_task is not None: session.tts_idle_task.cancel() session.tts_idle_task = asyncio.create_task( @@ -651,11 +841,28 @@ class ESP32LiveKitBridge: async def _tts_idle_watchdog(self, session: DeviceSession, stream_id: int) -> None: try: - await asyncio.sleep(TTS_IDLE_TIMEOUT_SECONDS) - if stream_id != session.tts_stream_id: + while True: + await asyncio.sleep(TTS_IDLE_TIMEOUT_SECONDS) + if stream_id != session.tts_stream_id or not session.tts_active: + return + + now = time.monotonic() + idle_for = now - session.tts_last_audible_at + active_for = now - session.tts_started_at + remaining = max( + TTS_IDLE_TIMEOUT_SECONDS - idle_for, + TTS_MIN_ACTIVE_SECONDS - active_for, + ) + if remaining > 0: + await asyncio.sleep(remaining) + continue + + print( + "TTS 静音达到阈值,切回聆听状态: " + f"idle={idle_for:.2f}s active={active_for:.2f}s" + ) + await self._stop_tts(session) return - print(f"TTS 空闲超过 {TTS_IDLE_TIMEOUT_SECONDS}s,切回聆听状态") - await self._stop_tts(session) except asyncio.CancelledError: pass @@ -799,6 +1006,7 @@ class ESP32LiveKitBridge: if self._is_agent_participant(participant, session.agent_name): session.agent_ready.set() self._scan_participant_audio_tracks(session, participant, "connected_scan") + self._handle_agent_state(session, participant) @session.room.on("participant_connected") def on_participant_connected(participant: rtc.RemoteParticipant) -> None: @@ -810,6 +1018,7 @@ class ESP32LiveKitBridge: self._scan_participant_audio_tracks( session, participant, "participant_connected_scan" ) + self._handle_agent_state(session, participant) @session.room.on("participant_disconnected") def on_participant_disconnected(participant: rtc.RemoteParticipant) -> None: @@ -820,6 +1029,16 @@ class ESP32LiveKitBridge: if not track_sid.endswith(f":{participant.identity}") } + @session.room.on("participant_attributes_changed") + def on_participant_attributes_changed(changed: list[str], participant: rtc.Participant) -> None: + if AGENT_STATE_ATTRIBUTE not in changed: + return + if not isinstance(participant, rtc.RemoteParticipant): + return + if not self._is_agent_participant(participant, session.agent_name): + return + self._handle_agent_state(session, participant) + @session.room.on("data_received") def on_data_received(data_packet: rtc.DataPacket) -> None: identity = data_packet.participant.identity if data_packet.participant else "未知" @@ -865,6 +1084,7 @@ class ESP32LiveKitBridge: if not segment.final: continue display_text = segment.text + asyncio.create_task(self._start_thinking(session)) if session.websocket is not None: ws = session.websocket @@ -967,9 +1187,21 @@ class ESP32LiveKitBridge: async def start(self) -> None: print(f"[config] websocket_port={WS_PORT}") + print(f"[config] websocket_max_queue={WS_MAX_QUEUE} websocket_max_size={WS_MAX_SIZE}") print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}") print(f"[config] token_url={TOKEN_URL}") print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}") + print( + "[config] audio=" + f"uplink_decode:{INPUT_SAMPLE_RATE}Hz/{INPUT_FRAME_DURATION_MS}ms " + f"downlink_encode:{OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms " + f"stats_interval:{AUDIO_STATS_INTERVAL_SECONDS}s " + f"capture_timeout:{UPLINK_CAPTURE_TIMEOUT_SECONDS}s " + f"tts_idle:{TTS_IDLE_TIMEOUT_SECONDS}s " + f"tts_min_active:{TTS_MIN_ACTIVE_SECONDS}s " + f"tts_start_frames:{TTS_START_CONSECUTIVE_AUDIBLE_FRAMES} " + f"tts_pre_roll:{TTS_PRE_ROLL_MS}ms" + ) print( "[config] agents=" f"normal:{CHAT_MODE_AGENT_NAMES['normal']} " @@ -996,6 +1228,7 @@ class ESP32LiveKitBridge: session.websocket = None session.agent_ready.set() session.tts_active = False + session.tts_thinking = False session.tts_stream_id += 1 if session.tts_idle_task is not None: session.tts_idle_task.cancel() @@ -1016,13 +1249,20 @@ class ESP32LiveKitBridge: pending_pcm = bytearray() pre_roll_pcm = bytearray() pre_roll_max_bytes = OUTPUT_SAMPLE_RATE * TTS_PRE_ROLL_MS // 1000 * 2 + output_samples_per_opus_frame = OUTPUT_SAMPLE_RATE * OUTPUT_FRAME_DURATION_MS // 1000 + output_frame_bytes = output_samples_per_opus_frame * 2 audible_frame_streak = 0 silence_frame_streak = 0 waiting_for_post_interrupt_silence = False + downlink_packets = 0 + downlink_audio_ms = 0.0 + last_downlink_stats_time = time.monotonic() + last_send_time: Optional[float] = None stream_id = session.tts_stream_id print( f"启动 TTS 转发: device={session.device_id} room={session.room_name} " - f"track_sid={track_sid} stream_id={stream_id}" + f"track_sid={track_sid} stream_id={stream_id} " + f"opus={OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms" ) try: @@ -1043,7 +1283,8 @@ class ESP32LiveKitBridge: pcm_data = frame.data.tobytes() has_audible_audio = self._has_audible_audio(pcm_data) - if time.monotonic() < session.tts_suppressed_until: + now = time.monotonic() + if now < session.tts_suppressed_until: pending_pcm.clear() pre_roll_pcm.clear() audible_frame_streak = 0 @@ -1051,6 +1292,31 @@ class ESP32LiveKitBridge: waiting_for_post_interrupt_silence = True continue + block_reason = self._tts_resume_block_reason( + session, + now, + include_user_quiet=False, + ) + if block_reason is not None: + pending_pcm.clear() + pre_roll_pcm.clear() + audible_frame_streak = 0 + silence_frame_streak = 0 + if block_reason == "waiting_for_user_audio_after_interrupt": + waiting_for_post_interrupt_silence = True + continue + + if ( + waiting_for_post_interrupt_silence + and session.last_interrupt_time > 0.0 + and session.last_uplink_audible_time >= session.last_interrupt_time + and now - session.last_uplink_audible_time + >= TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS + ): + print("检测到用户打断后语音已结束,允许新 TTS 直接起播") + waiting_for_post_interrupt_silence = False + silence_frame_streak = 0 + if waiting_for_post_interrupt_silence: if has_audible_audio: silence_frame_streak = 0 @@ -1098,20 +1364,42 @@ class ESP32LiveKitBridge: if not current_frame_buffered: pending_pcm.extend(pcm_data) - frame_bytes = OUTPUT_SAMPLES_PER_OPUS_FRAME * 2 while ( - len(pending_pcm) >= frame_bytes + len(pending_pcm) >= output_frame_bytes and stream_id == session.tts_stream_id and session.websocket is not None ): try: + now = time.monotonic() + if last_send_time is not None: + send_gap_ms = (now - last_send_time) * 1000.0 + if send_gap_ms > DOWNLINK_SEND_GAP_WARN_MS: + print( + "[downlink] warning: send gap " + f"{send_gap_ms:.1f}ms device={session.device_id} " + f"pending_ms={self._audio_duration_ms(len(pending_pcm) // 2, OUTPUT_SAMPLE_RATE):.1f}" + ) + last_send_time = now + opus_packet = encoder.encode( - bytes(pending_pcm[:frame_bytes]), - OUTPUT_SAMPLES_PER_OPUS_FRAME, + bytes(pending_pcm[:output_frame_bytes]), + output_samples_per_opus_frame, ) - del pending_pcm[:frame_bytes] + del pending_pcm[:output_frame_bytes] await session.websocket.send(self._wrap_opus_payload(session, opus_packet)) + downlink_packets += 1 + downlink_audio_ms += OUTPUT_FRAME_DURATION_MS + if now - last_downlink_stats_time >= AUDIO_STATS_INTERVAL_SECONDS: + print( + "[downlink] " + f"device={session.device_id} packets={downlink_packets} " + f"audio_ms={downlink_audio_ms:.0f} " + f"pending_ms={self._audio_duration_ms(len(pending_pcm) // 2, OUTPUT_SAMPLE_RATE):.1f}" + ) + downlink_packets = 0 + downlink_audio_ms = 0.0 + last_downlink_stats_time = now except Exception as exc: print(f"发送回 ESP32 失败: {exc}") break @@ -1171,27 +1459,26 @@ class ESP32LiveKitBridge: ) session.tts_stream_id += 1 opus_decoder = None + uplink_packets = 0 + uplink_audio_ms = 0.0 + uplink_decode_errors = 0 + uplink_dropped_frames = 0 + last_uplink_stats_time = time.monotonic() + room_connect_task: Optional[asyncio.Task[Any]] = None try: - hello_msg = { - "type": "hello", - "transport": "websocket", - "session": { - "room": session.room_name, - "identity": session.identity, - }, - "audio_params": { - "format": "opus", - "sample_rate": OUTPUT_SAMPLE_RATE, - "channels": 1, - "frame_duration": OUTPUT_FRAME_DURATION_MS, - }, - } + hello_msg = self._build_server_hello(session) await websocket.send(json.dumps(hello_msg)) - print(f"已发送 server hello: device={device_id} room={session.room_name}") + print( + f"已发送 server hello: device={device_id} room={session.room_name} " + f"audio={OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms" + ) asyncio.create_task(self._run_emotion_test_sequence(session)) - await self._connect_session_room(session) + room_connect_task = asyncio.create_task(self._connect_session_room(session)) + room_connect_task.add_done_callback( + lambda task: self._track_room_connect_task(session, task) + ) async for message in websocket: if isinstance(message, bytes): @@ -1214,6 +1501,16 @@ class ESP32LiveKitBridge: if num_samples > 0: session.captured_frame_count += 1 now = time.monotonic() + uplink_packets += 1 + uplink_audio_ms += self._audio_duration_ms(num_samples, INPUT_SAMPLE_RATE) + if self._has_audible_audio(pcm_bytes): + session.last_uplink_audible_time = now + if session.tts_waiting_for_user_audio_after_interrupt: + session.tts_waiting_for_user_audio_after_interrupt = False + print( + f"[uplink] detected user audio after interrupt: " + f"device={session.device_id}" + ) if ( session.captured_frame_count <= 5 or now - session.first_capture_log_time >= 5.0 @@ -1224,25 +1521,32 @@ class ESP32LiveKitBridge: # f"bytes={len(pcm_bytes)} samples={num_samples} " # f"room={session.room_name}" # ) - try: - frame = AudioFrame(pcm_bytes, INPUT_SAMPLE_RATE, 1, num_samples) - await session.mic_source.capture_frame(frame) - except TypeError: - frame = AudioFrame.create( - sample_rate=INPUT_SAMPLE_RATE, - num_channels=1, - samples_per_channel=num_samples, + if now - last_uplink_stats_time >= AUDIO_STATS_INTERVAL_SECONDS: + print( + "[uplink] " + f"device={session.device_id} packets={uplink_packets} " + f"audio_ms={uplink_audio_ms:.0f} " + f"decode_errors={uplink_decode_errors} " + f"dropped_frames={uplink_dropped_frames}" ) - memoryview(frame.data).cast("B")[:] = pcm_bytes - await session.mic_source.capture_frame(frame) + uplink_packets = 0 + uplink_audio_ms = 0.0 + uplink_decode_errors = 0 + uplink_dropped_frames = 0 + last_uplink_stats_time = now + if not await self._capture_mic_frame(session, pcm_bytes, num_samples): + uplink_dropped_frames += 1 except Exception as exc: + uplink_decode_errors += 1 print(f"Opus audio decode error ({len(message)} bytes): {exc}") elif isinstance(message, str): try: data = json.loads(message) # print(f"收到 ESP32 JSON 消息: {data}") msg_type = data.get("type") - if msg_type == "abort": + if msg_type == "hello": + self._log_client_hello(session, data) + elif msg_type == "abort": reason = data.get("reason") abort_reason = reason if isinstance(reason, str) and reason else "button_abort" print(f"处理 ESP32 打断请求: reason={abort_reason}") @@ -1257,6 +1561,10 @@ class ESP32LiveKitBridge: self._log_exception("WebSocket 其他错误", exc) finally: print(f"ESP32 断开连接: device={device_id} room={session.room_name}") + if room_connect_task is not None and not room_connect_task.done(): + room_connect_task.cancel() + with contextlib.suppress(asyncio.CancelledError): + await room_connect_task await self._close_session(session) self.device_sessions.pop(device_id, None) @@ -1265,7 +1573,13 @@ async def main() -> None: bridge = ESP32LiveKitBridge() try: await bridge.start() - async with websockets.serve(bridge.handle_websocket, "0.0.0.0", WS_PORT): + async with websockets.serve( + bridge.handle_websocket, + "0.0.0.0", + WS_PORT, + max_queue=WS_MAX_QUEUE, + max_size=WS_MAX_SIZE, + ): print(f"WebSocket 服务器运行在端口 {WS_PORT},等待 ESP32 连接...") await asyncio.Future() finally: diff --git a/main/device_state.h b/main/device_state.h index 4ffafae..8170d35 100644 --- a/main/device_state.h +++ b/main/device_state.h @@ -8,6 +8,7 @@ enum DeviceState { kDeviceStateIdle, kDeviceStateConnecting, kDeviceStateListening, + kDeviceStateThinking, kDeviceStateSpeaking, kDeviceStateUpgrading, kDeviceStateActivating, @@ -15,4 +16,4 @@ enum DeviceState { kDeviceStateFatalError }; -#endif // _DEVICE_STATE_H_ \ No newline at end of file +#endif // _DEVICE_STATE_H_ diff --git a/main/device_state_machine.cc b/main/device_state_machine.cc index 30581de..b251667 100644 --- a/main/device_state_machine.cc +++ b/main/device_state_machine.cc @@ -13,6 +13,7 @@ static const char* const STATE_STRINGS[] = { "idle", "connecting", "listening", + "thinking", "speaking", "upgrading", "activating", @@ -69,9 +70,10 @@ bool DeviceStateMachine::IsValidTransition(DeviceState from, DeviceState to) con to == kDeviceStateActivating; case kDeviceStateIdle: - // Can go to connecting, listening (manual mode), speaking, activating, upgrading, or wifi configuring + // Can go to connecting, listening (manual mode), thinking, speaking, activating, upgrading, or wifi configuring return to == kDeviceStateConnecting || to == kDeviceStateListening || + to == kDeviceStateThinking || to == kDeviceStateSpeaking || to == kDeviceStateActivating || to == kDeviceStateUpgrading || @@ -83,8 +85,15 @@ bool DeviceStateMachine::IsValidTransition(DeviceState from, DeviceState to) con to == kDeviceStateListening; case kDeviceStateListening: - // Can go to speaking or idle + // Can go to thinking, speaking, or idle + return to == kDeviceStateThinking || + to == kDeviceStateSpeaking || + to == kDeviceStateIdle; + + case kDeviceStateThinking: + // Can go to speaking, listening, or idle return to == kDeviceStateSpeaking || + to == kDeviceStateListening || to == kDeviceStateIdle; case kDeviceStateSpeaking: diff --git a/main/display/emote_display.cc b/main/display/emote_display.cc index 7ed920a..8f4f7b3 100644 --- a/main/display/emote_display.cc +++ b/main/display/emote_display.cc @@ -167,6 +167,8 @@ void EmoteDisplay::SetStatus(const char* const status) emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_LISTEN, NULL); } else if (std::strcmp(status, Lang::Strings::STANDBY) == 0) { emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_IDLE, NULL); + } else if (std::strcmp(status, Lang::Strings::THINKING) == 0) { + emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_LISTEN, NULL); } else if (std::strcmp(status, Lang::Strings::SPEAKING) == 0) { emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_SPEAK, NULL); } else if (std::strcmp(status, Lang::Strings::ERROR) == 0) { @@ -247,4 +249,4 @@ void EmoteDisplay::RefreshAll() } } -} // namespace emote \ No newline at end of file +} // namespace emote diff --git a/main/display/lvgl_display/lvgl_display.cc b/main/display/lvgl_display/lvgl_display.cc index 46668dd..b56aa64 100644 --- a/main/display/lvgl_display/lvgl_display.cc +++ b/main/display/lvgl_display/lvgl_display.cc @@ -203,6 +203,7 @@ void LvglDisplay::UpdateStatusBar(bool update_all) { kDeviceStateStarting, kDeviceStateWifiConfiguring, kDeviceStateListening, + kDeviceStateThinking, kDeviceStateActivating, }; if (std::find(allowed_states.begin(), allowed_states.end(), device_state) != allowed_states.end()) { diff --git a/main/led/circular_strip.cc b/main/led/circular_strip.cc index b4111ba..6129f81 100644 --- a/main/led/circular_strip.cc +++ b/main/led/circular_strip.cc @@ -228,6 +228,11 @@ void CircularStrip::OnStateChanged() { SetAllColor(color); break; } + case kDeviceStateThinking: { + StripColor color = { low_brightness_, low_brightness_, default_brightness_ }; + Blink(color, 300); + break; + } case kDeviceStateUpgrading: { StripColor color = { low_brightness_, default_brightness_, low_brightness_ }; Blink(color, 100); diff --git a/main/led/gpio_led.cc b/main/led/gpio_led.cc index 30eeeb2..f1e6ab6 100644 --- a/main/led/gpio_led.cc +++ b/main/led/gpio_led.cc @@ -235,6 +235,10 @@ void GpioLed::OnStateChanged() { // TurnOn(); StartFadeTask(); break; + case kDeviceStateThinking: + SetBrightness(DEFAULT_BRIGHTNESS); + StartContinuousBlink(300); + break; case kDeviceStateSpeaking: SetBrightness(SPEAKING_BRIGHTNESS); TurnOn(); @@ -260,4 +264,4 @@ void GpioLed::EventTask(void* arg) { ulTaskNotifyTake(pdTRUE, portMAX_DELAY); led->OnFadeEnd(); } -} \ No newline at end of file +} diff --git a/main/led/single_led.cc b/main/led/single_led.cc index d107619..c061e2c 100644 --- a/main/led/single_led.cc +++ b/main/led/single_led.cc @@ -152,6 +152,10 @@ void SingleLed::OnStateChanged() { SetColor(0, DEFAULT_BRIGHTNESS, 0); TurnOn(); break; + case kDeviceStateThinking: + SetColor(0, 0, DEFAULT_BRIGHTNESS); + StartContinuousBlink(300); + break; case kDeviceStateUpgrading: SetColor(0, DEFAULT_BRIGHTNESS, 0); StartContinuousBlink(100);