From 2c4329fd84a7c12df34346380bc2253a8fc39843 Mon Sep 17 00:00:00 2001
From: 0Xiao0 <511201264@qq.com>
Date: Fri, 12 Jun 2026 11:38:47 +0800
Subject: [PATCH] fix: voice interupt

---
 .gitignore                                    |   1 +
 main/application.cc                           | 348 +++++++--------
 main/application.h                            |   1 +
 main/assets/locales/en-US/language.json       |   3 +-
 main/assets/locales/zh-CN/language.json       |   3 +-
 main/audio/audio_service.cc                   |   3 +-
 main/background_capture_service.cc            | 177 ++++++++
 main/background_capture_service.h             |  32 ++
 .../atoms3r_echo_pyramid.cc                   |   3 +
 main/boards/common/wifi_board.cc              |   5 +-
 .../electron-bot/electron_emoji_display.cc    |   9 +-
 main/boards/esp-hi/emoji_display.cc           |   2 +
 main/boards/jiuchuan-s3/jiuchuan_dev_board.cc |   6 +-
 main/boards/otto-robot/otto_emoji_display.cc  |   9 +-
 main/bridge_server.py                         | 418 +++++++++++++++---
 main/device_state.h                           |   3 +-
 main/device_state_machine.cc                  |  13 +-
 main/display/emote_display.cc                 |   4 +-
 main/display/lvgl_display/lvgl_display.cc     |   1 +
 main/led/circular_strip.cc                    |   5 +
 main/led/gpio_led.cc                          |   6 +-
 main/led/single_led.cc                        |   4 +
 22 files changed, 816 insertions(+), 240 deletions(-)
 create mode 100644 main/background_capture_service.cc
 create mode 100644 main/background_capture_service.h
diff --git a/.gitignore b/.gitignore
index 0efd9de..eec5b95 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,4 @@ main/mmap_generate_emoji.h
 *.bin
 mmap_generate_*.h
 .clangd
+background_frames/
diff --git a/main/application.cc b/main/application.cc
index b679152..8d98579 100644
--- a/main/application.cc
+++ b/main/application.cc
@@ -1,25 +1,24 @@
 #include "application.h"
+#include "assets.h"
+#include "assets/lang_config.h"
+#include "audio_codec.h"
 #include "board.h"
 #include "display.h"
-#include "system_info.h"
-#include "audio_codec.h"
-#include "mqtt_protocol.h"
-#include "websocket_protocol.h"
-#include "assets/lang_config.h"
 #include "mcp_server.h"
-#include "assets.h"
+#include "mqtt_protocol.h"
 #include "settings.h"
+#include "system_info.h"
+#include "websocket_protocol.h"
 
-#include <cstring>
-#include <esp_log.h>
-#include <cJSON.h>
 #include <driver/gpio.h>
+#include <esp_log.h>
 #include <arpa/inet.h>
+#include <cJSON.h>
 #include <font_awesome.h>
+#include <cstring>
 
 #define TAG "Application"
 
-
 Application::Application() {
     event_group_ = xEventGroupCreate();
 
@@ -33,16 +32,16 @@ Application::Application() {
     aec_mode_ = kAecOff;
 #endif
 
-    esp_timer_create_args_t clock_timer_args = {
-        .callback = [](void* arg) {
-            Application* app = (Application*)arg;
-            xEventGroupSetBits(app->event_group_, MAIN_EVENT_CLOCK_TICK);
-        },
-        .arg = this,
-        .dispatch_method = ESP_TIMER_TASK,
-        .name = "clock_timer",
-        .skip_unhandled_events = true
-    };
+    esp_timer_create_args_t clock_timer_args = {.callback =
+                                                    [](void* arg) {
+                                                        Application* app = (Application*)arg;
+                                                        xEventGroupSetBits(app->event_group_,
+                                                                           MAIN_EVENT_CLOCK_TICK);
+                                                    },
+                                                .arg = this,
+                                                .dispatch_method = ESP_TIMER_TASK,
+                                                .name = "clock_timer",
+                                                .skip_unhandled_events = true};
     esp_timer_create(&clock_timer_args, &clock_timer_handle_);
 }
 
@@ -54,9 +53,7 @@ Application::~Application() {
     vEventGroupDelete(event_group_);
 }
 
-bool Application::SetDeviceState(DeviceState state) {
-    return state_machine_.TransitionTo(state);
-}
+bool Application::SetDeviceState(DeviceState state) { return state_machine_.TransitionTo(state); }
 
 void Application::Initialize() {
     auto& board = Board::GetInstance();
@@ -102,7 +99,7 @@ void Application::Initialize() {
     // Set network event callback for UI updates and network state handling
     board.SetNetworkEventCallback([this](NetworkEvent event, const std::string& data) {
         auto display = Board::GetInstance().GetDisplay();
-        
+
         switch (event) {
             case NetworkEvent::Scanning:
                 display->ShowNotification(Lang::Strings::SCANNING_WIFI, 30000);
@@ -142,13 +139,16 @@ void Application::Initialize() {
                 display->SetStatus(Lang::Strings::DETECTING_MODULE);
                 break;
             case NetworkEvent::ModemErrorNoSim:
-                Alert(Lang::Strings::ERROR, Lang::Strings::PIN_ERROR, "triangle_exclamation", Lang::Sounds::OGG_ERR_PIN);
+                Alert(Lang::Strings::ERROR, Lang::Strings::PIN_ERROR, "triangle_exclamation",
+                      Lang::Sounds::OGG_ERR_PIN);
                 break;
             case NetworkEvent::ModemErrorRegDenied:
-                Alert(Lang::Strings::ERROR, Lang::Strings::REG_ERROR, "triangle_exclamation", Lang::Sounds::OGG_ERR_REG);
+                Alert(Lang::Strings::ERROR, Lang::Strings::REG_ERROR, "triangle_exclamation",
+                      Lang::Sounds::OGG_ERR_REG);
                 break;
             case NetworkEvent::ModemErrorInitFailed:
-                Alert(Lang::Strings::ERROR, Lang::Strings::MODEM_INIT_ERROR, "triangle_exclamation", Lang::Sounds::OGG_EXCLAMATION);
+                Alert(Lang::Strings::ERROR, Lang::Strings::MODEM_INIT_ERROR, "triangle_exclamation",
+                      Lang::Sounds::OGG_EXCLAMATION);
                 break;
             case NetworkEvent::ModemErrorTimeout:
                 display->SetStatus(Lang::Strings::REGISTERING_NETWORK);
@@ -167,19 +167,11 @@ void Application::Run() {
     // Set the priority of the main task to 10
     vTaskPrioritySet(nullptr, 10);
 
-    const EventBits_t ALL_EVENTS = 
-        MAIN_EVENT_SCHEDULE |
-        MAIN_EVENT_SEND_AUDIO |
-        MAIN_EVENT_WAKE_WORD_DETECTED |
-        MAIN_EVENT_VAD_CHANGE |
-        MAIN_EVENT_CLOCK_TICK |
-        MAIN_EVENT_ERROR |
-        MAIN_EVENT_NETWORK_CONNECTED |
-        MAIN_EVENT_NETWORK_DISCONNECTED |
-        MAIN_EVENT_TOGGLE_CHAT |
-        MAIN_EVENT_START_LISTENING |
-        MAIN_EVENT_STOP_LISTENING |
-        MAIN_EVENT_ACTIVATION_DONE |
+    const EventBits_t ALL_EVENTS =
+        MAIN_EVENT_SCHEDULE | MAIN_EVENT_SEND_AUDIO | MAIN_EVENT_WAKE_WORD_DETECTED |
+        MAIN_EVENT_VAD_CHANGE | MAIN_EVENT_CLOCK_TICK | MAIN_EVENT_ERROR |
+        MAIN_EVENT_NETWORK_CONNECTED | MAIN_EVENT_NETWORK_DISCONNECTED | MAIN_EVENT_TOGGLE_CHAT |
+        MAIN_EVENT_START_LISTENING | MAIN_EVENT_STOP_LISTENING | MAIN_EVENT_ACTIVATION_DONE |
         MAIN_EVENT_STATE_CHANGED;
 
     while (true) {
@@ -187,7 +179,8 @@ void Application::Run() {
 
         if (bits & MAIN_EVENT_ERROR) {
             SetDeviceState(kDeviceStateIdle);
-            Alert(Lang::Strings::ERROR, last_error_message_.c_str(), "circle_xmark", Lang::Sounds::OGG_EXCLAMATION);
+            Alert(Lang::Strings::ERROR, last_error_message_.c_str(), "circle_xmark",
+                  Lang::Sounds::OGG_EXCLAMATION);
         }
 
         if (bits & MAIN_EVENT_NETWORK_CONNECTED) {
@@ -257,7 +250,7 @@ void Application::Run() {
             clock_ticks_++;
             auto display = Board::GetInstance().GetDisplay();
             display->UpdateStatusBar();
-        
+
             // Print debug info every 10 seconds
             if (clock_ticks_ % 10 == 0) {
                 SystemInfo::PrintHeapStats();
@@ -278,12 +271,14 @@ void Application::HandleNetworkConnectedEvent() {
             return;
         }
 
-        xTaskCreate([](void* arg) {
-            Application* app = static_cast<Application*>(arg);
-            app->ActivationTask();
-            app->activation_task_handle_ = nullptr;
-            vTaskDelete(NULL);
-        }, "activation", 4096 * 2, this, 2, &activation_task_handle_);
+        xTaskCreate(
+            [](void* arg) {
+                Application* app = static_cast<Application*>(arg);
+                app->ActivationTask();
+                app->activation_task_handle_ = nullptr;
+                vTaskDelete(NULL);
+            },
+            "activation", 4096 * 2, this, 2, &activation_task_handle_);
     }
 
     // Update the status bar immediately to show the network state
@@ -294,7 +289,8 @@ void Application::HandleNetworkConnectedEvent() {
 void Application::HandleNetworkDisconnectedEvent() {
     // Close current conversation when network disconnected
     auto state = GetDeviceState();
-    if (state == kDeviceStateConnecting || state == kDeviceStateListening || state == kDeviceStateSpeaking) {
+    if (state == kDeviceStateConnecting || state == kDeviceStateListening ||
+        state == kDeviceStateThinking || state == kDeviceStateSpeaking) {
         ESP_LOGI(TAG, "Closing audio channel due to network disconnection");
         protocol_->CloseAudioChannel();
     }
@@ -369,7 +365,7 @@ void Application::CheckAssetsVersion() {
         ESP_LOGW(TAG, "Assets partition is disabled for board %s", BOARD_NAME);
         return;
     }
-    
+
     Settings settings("assets", true);
     // Check if there is a new assets need to be downloaded
     std::string download_url = settings.GetString("download_url");
@@ -379,27 +375,30 @@ void Application::CheckAssetsVersion() {
 
         char message[256];
         snprintf(message, sizeof(message), Lang::Strings::FOUND_NEW_ASSETS, download_url.c_str());
-        Alert(Lang::Strings::LOADING_ASSETS, message, "cloud_arrow_down", Lang::Sounds::OGG_UPGRADE);
-        
+        Alert(Lang::Strings::LOADING_ASSETS, message, "cloud_arrow_down",
+              Lang::Sounds::OGG_UPGRADE);
+
         // Wait for the audio service to be idle for 3 seconds
         vTaskDelay(pdMS_TO_TICKS(3000));
         SetDeviceState(kDeviceStateUpgrading);
         board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE);
         display->SetChatMessage("system", Lang::Strings::PLEASE_WAIT);
 
-        bool success = assets.Download(download_url, [this, display](int progress, size_t speed) -> void {
-            char buffer[32];
-            snprintf(buffer, sizeof(buffer), "%d%% %uKB/s", progress, speed / 1024);
-            Schedule([display, message = std::string(buffer)]() {
-                display->SetChatMessage("system", message.c_str());
+        bool success =
+            assets.Download(download_url, [this, display](int progress, size_t speed) -> void {
+                char buffer[32];
+                snprintf(buffer, sizeof(buffer), "%d%% %uKB/s", progress, speed / 1024);
+                Schedule([display, message = std::string(buffer)]() {
+                    display->SetChatMessage("system", message.c_str());
+                });
             });
-        });
 
         board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);
         vTaskDelay(pdMS_TO_TICKS(1000));
 
         if (!success) {
-            Alert(Lang::Strings::ERROR, Lang::Strings::DOWNLOAD_ASSETS_FAILED, "circle_xmark", Lang::Sounds::OGG_EXCLAMATION);
+            Alert(Lang::Strings::ERROR, Lang::Strings::DOWNLOAD_ASSETS_FAILED, "circle_xmark",
+                  Lang::Sounds::OGG_EXCLAMATION);
             vTaskDelay(pdMS_TO_TICKS(2000));
             SetDeviceState(kDeviceStateActivating);
             return;
@@ -415,7 +414,7 @@ void Application::CheckAssetsVersion() {
 void Application::CheckNewVersion() {
     const int MAX_RETRY = 10;
     int retry_count = 0;
-    int retry_delay = 10; // Initial retry delay in seconds
+    int retry_delay = 10;  // Initial retry delay in seconds
 
     auto& board = Board::GetInstance();
     while (true) {
@@ -431,27 +430,30 @@ void Application::CheckNewVersion() {
             }
 
             char error_message[128];
-            snprintf(error_message, sizeof(error_message), "code=%d, url=%s", err, ota_->GetCheckVersionUrl().c_str());
+            snprintf(error_message, sizeof(error_message), "code=%d, url=%s", err,
+                     ota_->GetCheckVersionUrl().c_str());
             char buffer[256];
-            snprintf(buffer, sizeof(buffer), Lang::Strings::CHECK_NEW_VERSION_FAILED, retry_delay, error_message);
+            snprintf(buffer, sizeof(buffer), Lang::Strings::CHECK_NEW_VERSION_FAILED, retry_delay,
+                     error_message);
             Alert(Lang::Strings::ERROR, buffer, "cloud_slash", Lang::Sounds::OGG_EXCLAMATION);
 
-            ESP_LOGW(TAG, "Check new version failed, retry in %d seconds (%d/%d)", retry_delay, retry_count, MAX_RETRY);
+            ESP_LOGW(TAG, "Check new version failed, retry in %d seconds (%d/%d)", retry_delay,
+                     retry_count, MAX_RETRY);
             for (int i = 0; i < retry_delay; i++) {
                 vTaskDelay(pdMS_TO_TICKS(1000));
                 if (GetDeviceState() == kDeviceStateIdle) {
                     break;
                 }
             }
-            retry_delay *= 2; // Double the retry delay
+            retry_delay *= 2;  // Double the retry delay
             continue;
         }
         retry_count = 0;
-        retry_delay = 10; // Reset retry delay
+        retry_delay = 10;  // Reset retry delay
 
         if (ota_->HasNewVersion()) {
             if (UpgradeFirmware(ota_->GetFirmwareUrl(), ota_->GetFirmwareVersion())) {
-                return; // This line will never be reached after reboot
+                return;  // This line will never be reached after reboot
             }
             // If upgrade failed, continue to normal operation
         }
@@ -507,31 +509,32 @@ void Application::InitializeProtocol() {
     }
 #endif
 
-    protocol_->OnConnected([this]() {
-        DismissAlert();
-    });
+    protocol_->OnConnected([this]() { DismissAlert(); });
 
     protocol_->OnNetworkError([this](const std::string& message) {
         last_error_message_ = message;
         xEventGroupSetBits(event_group_, MAIN_EVENT_ERROR);
     });
-    
+
     protocol_->OnIncomingAudio([this](std::unique_ptr<AudioStreamPacket> packet) {
-        if (GetDeviceState() == kDeviceStateSpeaking) {
+        if (accepting_tts_audio_.load() || GetDeviceState() == kDeviceStateSpeaking) {
             audio_service_.PushPacketToDecodeQueue(std::move(packet));
         }
     });
-    
+
     protocol_->OnAudioChannelOpened([this, codec, &board]() {
         board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE);
         if (protocol_->server_sample_rate() != codec->output_sample_rate()) {
-            ESP_LOGW(TAG, "Server sample rate %d does not match device output sample rate %d, resampling may cause distortion",
-                protocol_->server_sample_rate(), codec->output_sample_rate());
+            ESP_LOGW(TAG,
+                     "Server sample rate %d does not match device output sample rate %d, "
+                     "resampling may cause distortion",
+                     protocol_->server_sample_rate(), codec->output_sample_rate());
         }
     });
-    
+
     protocol_->OnAudioChannelClosed([this, &board]() {
         board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);
+        accepting_tts_audio_.store(false);
         Schedule([this]() {
             if (GetDeviceState() == kDeviceStateConnecting) {
                 return;
@@ -541,20 +544,26 @@ void Application::InitializeProtocol() {
             SetDeviceState(kDeviceStateIdle);
         });
     });
-    
+
     protocol_->OnIncomingJson([this, display](const cJSON* root) {
         // Parse JSON data
         auto type = cJSON_GetObjectItem(root, "type");
         if (strcmp(type->valuestring, "tts") == 0) {
             auto state = cJSON_GetObjectItem(root, "state");
-            if (strcmp(state->valuestring, "start") == 0) {
+            if (strcmp(state->valuestring, "thinking") == 0) {
+                Schedule([this]() { SetDeviceState(kDeviceStateThinking); });
+            } else if (strcmp(state->valuestring, "start") == 0) {
+                audio_service_.ResetDecoder();
+                accepting_tts_audio_.store(true);
                 Schedule([this]() {
                     aborted_ = false;
                     SetDeviceState(kDeviceStateSpeaking);
                 });
             } else if (strcmp(state->valuestring, "stop") == 0) {
+                accepting_tts_audio_.store(false);
                 Schedule([this]() {
-                    if (GetDeviceState() == kDeviceStateSpeaking) {
+                    auto state = GetDeviceState();
+                    if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
                         if (listening_mode_ == kListeningModeManualStop) {
                             SetDeviceState(kDeviceStateIdle);
                         } else {
@@ -597,9 +606,7 @@ void Application::InitializeProtocol() {
                 ESP_LOGI(TAG, "System command: %s", command->valuestring);
                 if (strcmp(command->valuestring, "reboot") == 0) {
                     // Do a reboot if user requests a OTA update
-                    Schedule([this]() {
-                        Reboot();
-                    });
+                    Schedule([this]() { Reboot(); });
                 } else {
                     ESP_LOGW(TAG, "Unknown system command: %s", command->valuestring);
                 }
@@ -609,7 +616,8 @@ void Application::InitializeProtocol() {
             auto message = cJSON_GetObjectItem(root, "message");
             auto emotion = cJSON_GetObjectItem(root, "emotion");
             if (cJSON_IsString(status) && cJSON_IsString(message) && cJSON_IsString(emotion)) {
-                Alert(status->valuestring, message->valuestring, emotion->valuestring, Lang::Sounds::OGG_VIBRATION);
+                Alert(status->valuestring, message->valuestring, emotion->valuestring,
+                      Lang::Sounds::OGG_VIBRATION);
             } else {
                 ESP_LOGW(TAG, "Alert command requires status, message and emotion");
             }
@@ -618,9 +626,10 @@ void Application::InitializeProtocol() {
             auto payload = cJSON_GetObjectItem(root, "payload");
             ESP_LOGI(TAG, "Received custom message: %s", cJSON_PrintUnformatted(root));
             if (cJSON_IsObject(payload)) {
-                Schedule([this, display, payload_str = std::string(cJSON_PrintUnformatted(payload))]() {
-                    display->SetChatMessage("system", payload_str.c_str());
-                });
+                Schedule(
+                    [this, display, payload_str = std::string(cJSON_PrintUnformatted(payload))]() {
+                        display->SetChatMessage("system", payload_str.c_str());
+                    });
             } else {
                 ESP_LOGW(TAG, "Invalid custom message format: missing payload");
             }
@@ -629,7 +638,7 @@ void Application::InitializeProtocol() {
             ESP_LOGW(TAG, "Unknown message type: %s", type->valuestring);
         }
     });
-    
+
     protocol_->Start();
 }
 
@@ -638,32 +647,27 @@ void Application::ShowActivationCode(const std::string& code, const std::string&
         char digit;
         const std::string_view& sound;
     };
-    static const std::array<digit_sound, 10> digit_sounds{{
-        digit_sound{'0', Lang::Sounds::OGG_0},
-        digit_sound{'1', Lang::Sounds::OGG_1}, 
-        digit_sound{'2', Lang::Sounds::OGG_2},
-        digit_sound{'3', Lang::Sounds::OGG_3},
-        digit_sound{'4', Lang::Sounds::OGG_4},
-        digit_sound{'5', Lang::Sounds::OGG_5},
-        digit_sound{'6', Lang::Sounds::OGG_6},
-        digit_sound{'7', Lang::Sounds::OGG_7},
-        digit_sound{'8', Lang::Sounds::OGG_8},
-        digit_sound{'9', Lang::Sounds::OGG_9}
-    }};
+    static const std::array<digit_sound, 10> digit_sounds{
+        {digit_sound{'0', Lang::Sounds::OGG_0}, digit_sound{'1', Lang::Sounds::OGG_1},
+         digit_sound{'2', Lang::Sounds::OGG_2}, digit_sound{'3', Lang::Sounds::OGG_3},
+         digit_sound{'4', Lang::Sounds::OGG_4}, digit_sound{'5', Lang::Sounds::OGG_5},
+         digit_sound{'6', Lang::Sounds::OGG_6}, digit_sound{'7', Lang::Sounds::OGG_7},
+         digit_sound{'8', Lang::Sounds::OGG_8}, digit_sound{'9', Lang::Sounds::OGG_9}}};
 
     // This sentence uses 9KB of SRAM, so we need to wait for it to finish
     Alert(Lang::Strings::ACTIVATION, message.c_str(), "link", Lang::Sounds::OGG_ACTIVATION);
 
     for (const auto& digit : code) {
         auto it = std::find_if(digit_sounds.begin(), digit_sounds.end(),
-            [digit](const digit_sound& ds) { return ds.digit == digit; });
+                               [digit](const digit_sound& ds) { return ds.digit == digit; });
         if (it != digit_sounds.end()) {
             audio_service_.PlaySound(it->sound);
         }
     }
 }
 
-void Application::Alert(const char* status, const char* message, const char* emotion, const std::string_view& sound) {
+void Application::Alert(const char* status, const char* message, const char* emotion,
+                        const std::string_view& sound) {
     ESP_LOGW(TAG, "Alert [%s] %s: %s", emotion, status, message);
     auto display = Board::GetInstance().GetDisplay();
     display->SetStatus(status);
@@ -683,9 +687,7 @@ void Application::DismissAlert() {
     }
 }
 
-void Application::ToggleChatState() {
-    ToggleChatStateForMode(kChatAgentModeNormal, false);
-}
+void Application::ToggleChatState() { ToggleChatStateForMode(kChatAgentModeNormal, false); }
 
 void Application::ToggleChatStateWithVision() {
     ToggleChatStateForMode(kChatAgentModeNormal, true);
@@ -698,9 +700,7 @@ void Application::ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_e
     xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
 }
 
-bool Application::IsVisionTextModeEnabled() const {
-    return vision_text_mode_enabled_.load();
-}
+bool Application::IsVisionTextModeEnabled() const { return vision_text_mode_enabled_.load(); }
 
 const char* Application::GetChatAgentModeName() const {
     return chat_agent_mode_.load() == kChatAgentModeBeaver ? "beaver" : "normal";
@@ -720,9 +720,7 @@ void Application::StartListening() {
     xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING);
 }
 
-void Application::StopListening() {
-    xEventGroupSetBits(event_group_, MAIN_EVENT_STOP_LISTENING);
-}
+void Application::StopListening() { xEventGroupSetBits(event_group_, MAIN_EVENT_STOP_LISTENING); }
 
 void Application::HandleToggleChatEvent() {
     auto state = GetDeviceState();
@@ -748,21 +746,21 @@ void Application::HandleToggleChatEvent() {
     if (state == kDeviceStateIdle) {
         ListeningMode mode = GetDefaultListeningMode();
         bool agent_mode_changed = chat_agent_mode_.load() != active_chat_agent_mode_.load();
-        bool vision_mode_changed = vision_text_mode_enabled_.load() != active_vision_text_mode_enabled_.load();
+        bool vision_mode_changed =
+            vision_text_mode_enabled_.load() != active_vision_text_mode_enabled_.load();
         if (!protocol_->IsAudioChannelOpened() || agent_mode_changed || vision_mode_changed) {
             if (protocol_->IsAudioChannelOpened()) {
                 protocol_->CloseAudioChannel();
             }
             SetDeviceState(kDeviceStateConnecting);
             // Schedule to let the state change be processed first (UI update)
-            Schedule([this, mode]() {
-                ContinueOpenAudioChannel(mode);
-            });
+            Schedule([this, mode]() { ContinueOpenAudioChannel(mode); });
             return;
         }
         SetListeningMode(mode);
-    } else if (state == kDeviceStateSpeaking) {
+    } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
         AbortSpeaking(kAbortReasonNone);
+        SetListeningMode(GetDefaultListeningMode());
     } else if (state == kDeviceStateListening) {
         protocol_->CloseAudioChannel();
     }
@@ -791,7 +789,7 @@ void Application::ContinueOpenAudioChannel(ListeningMode mode) {
 
 void Application::HandleStartListeningEvent() {
     auto state = GetDeviceState();
-    
+
     if (state == kDeviceStateActivating) {
         SetDeviceState(kDeviceStateIdle);
         return;
@@ -805,18 +803,16 @@ void Application::HandleStartListeningEvent() {
         ESP_LOGE(TAG, "Protocol not initialized");
         return;
     }
-    
+
     if (state == kDeviceStateIdle) {
         if (!protocol_->IsAudioChannelOpened()) {
             SetDeviceState(kDeviceStateConnecting);
             // Schedule to let the state change be processed first (UI update)
-            Schedule([this]() {
-                ContinueOpenAudioChannel(kListeningModeManualStop);
-            });
+            Schedule([this]() { ContinueOpenAudioChannel(kListeningModeManualStop); });
             return;
         }
         SetListeningMode(kListeningModeManualStop);
-    } else if (state == kDeviceStateSpeaking) {
+    } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
         AbortSpeaking(kAbortReasonNone);
         SetListeningMode(kListeningModeManualStop);
     }
@@ -824,7 +820,7 @@ void Application::HandleStartListeningEvent() {
 
 void Application::HandleStopListeningEvent() {
     auto state = GetDeviceState();
-    
+
     if (state == kDeviceStateAudioTesting) {
         audio_service_.EnableAudioTesting(false);
         SetDeviceState(kDeviceStateWifiConfiguring);
@@ -854,17 +850,14 @@ void Application::HandleWakeWordDetectedEvent() {
             SetDeviceState(kDeviceStateConnecting);
             // Schedule to let the state change be processed first (UI update),
             // then continue with OpenAudioChannel which may block for ~1 second
-            Schedule([this, wake_word]() {
-                ContinueWakeWordInvoke(wake_word);
-            });
+            Schedule([this, wake_word]() { ContinueWakeWordInvoke(wake_word); });
             return;
         }
         // Channel already opened, continue directly
         ContinueWakeWordInvoke(wake_word);
-    } else if (state == kDeviceStateSpeaking || state == kDeviceStateListening) {
+    } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking ||
+               state == kDeviceStateListening) {
         AbortSpeaking(kAbortReasonWakeWordDetected);
-        // Clear send queue to avoid sending residues to server
-        while (audio_service_.PopPacketFromSendQueue());
 
         if (state == kDeviceStateListening) {
             protocol_->SendStartListening(GetDefaultListeningMode());
@@ -925,14 +918,14 @@ void Application::HandleStateChangedEvent() {
     auto display = board.GetDisplay();
     auto led = board.GetLed();
     led->OnStateChanged();
-    
+
     switch (new_state) {
         case kDeviceStateUnknown:
         case kDeviceStateIdle:
             vision_frame_sent_for_current_listen_.store(false);
             display->SetStatus(Lang::Strings::STANDBY);
-            display->ClearChatMessages();  // Clear messages first
-            display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count)
+            display->ClearChatMessages();    // Clear messages first
+            display->SetEmotion("neutral");  // Then set emotion (wechat mode checks child count)
             audio_service_.EnableVoiceProcessing(false);
             audio_service_.EnableWakeWordDetection(true);
             break;
@@ -947,18 +940,14 @@ void Application::HandleStateChangedEvent() {
             display->SetStatus(Lang::Strings::LISTENING);
             display->SetEmotion("neutral");
 
-            // Make sure the audio processor is running
-            if (play_popup_on_listening_ || !audio_service_.IsAudioProcessorRunning()) {
-                // For auto mode, wait for playback queue to be empty before enabling voice processing
-                // This prevents audio truncation when STOP arrives late due to network jitter
-                if (listening_mode_ == kListeningModeAutoStop) {
-                    audio_service_.WaitForPlaybackQueueEmpty();
-                }
-                
-                // Send the start listening command
-                protocol_->SendStartListening(listening_mode_);
-                audio_service_.EnableVoiceProcessing(true);
+            // Re-entering listening after an interrupt must restart the capture path even if the
+            // processor task is still marked running, otherwise realtime mode can show Listening
+            // while no fresh mic frames are sent.
+            if (listening_mode_ == kListeningModeAutoStop) {
+                audio_service_.WaitForPlaybackQueueEmpty();
             }
+            protocol_->SendStartListening(listening_mode_);
+            audio_service_.EnableVoiceProcessing(true);
 
 #ifdef CONFIG_WAKE_WORD_DETECTION_IN_LISTENING
             // Enable wake word detection in listening mode (configured via Kconfig)
@@ -967,13 +956,23 @@ void Application::HandleStateChangedEvent() {
             // Disable wake word detection in listening mode
             audio_service_.EnableWakeWordDetection(false);
 #endif
-            
+
             // Play popup sound after ResetDecoder (in EnableVoiceProcessing) has been called
             if (play_popup_on_listening_) {
                 play_popup_on_listening_ = false;
                 audio_service_.PlaySound(Lang::Sounds::OGG_POPUP);
             }
             break;
+        case kDeviceStateThinking:
+            vad_speaking_.store(false);
+            display->SetStatus(Lang::Strings::THINKING);
+            display->SetEmotion("thinking");
+
+            if (listening_mode_ != kListeningModeRealtime) {
+                audio_service_.EnableVoiceProcessing(false);
+                audio_service_.EnableWakeWordDetection(audio_service_.IsAfeWakeWord());
+            }
+            break;
         case kDeviceStateSpeaking:
             display->SetStatus(Lang::Strings::SPEAKING);
 
@@ -982,7 +981,9 @@ void Application::HandleStateChangedEvent() {
                 // Only AFE wake word can be detected in speaking mode
                 audio_service_.EnableWakeWordDetection(audio_service_.IsAfeWakeWord());
             }
-            audio_service_.ResetDecoder();
+            if (!accepting_tts_audio_.load()) {
+                audio_service_.ResetDecoder();
+            }
             break;
         case kDeviceStateWifiConfiguring:
             audio_service_.EnableVoiceProcessing(false);
@@ -1026,6 +1027,8 @@ void Application::Schedule(std::function<void()>&& callback) {
 void Application::AbortSpeaking(AbortReason reason) {
     ESP_LOGI(TAG, "Abort speaking");
     aborted_ = true;
+    accepting_tts_audio_.store(false);
+    audio_service_.ResetDecoder();
     if (protocol_) {
         protocol_->SendAbortSpeaking(reason);
     }
@@ -1069,7 +1072,8 @@ bool Application::UpgradeFirmware(const std::string& url, const std::string& ver
     }
     ESP_LOGI(TAG, "Starting firmware upgrade from URL: %s", upgrade_url.c_str());
 
-    Alert(Lang::Strings::OTA_UPGRADE, Lang::Strings::UPGRADING, "download", Lang::Sounds::OGG_UPGRADE);
+    Alert(Lang::Strings::OTA_UPGRADE, Lang::Strings::UPGRADING, "download",
+          Lang::Sounds::OGG_UPGRADE);
     vTaskDelay(pdMS_TO_TICKS(3000));
 
     SetDeviceState(kDeviceStateUpgrading);
@@ -1091,17 +1095,19 @@ bool Application::UpgradeFirmware(const std::string& url, const std::string& ver
 
     if (!upgrade_success) {
         // Upgrade failed, restart audio service and continue running
-        ESP_LOGE(TAG, "Firmware upgrade failed, restarting audio service and continuing operation...");
-        audio_service_.Start(); // Restart audio service
-        board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); // Restore power save level
-        Alert(Lang::Strings::ERROR, Lang::Strings::UPGRADE_FAILED, "circle_xmark", Lang::Sounds::OGG_EXCLAMATION);
+        ESP_LOGE(TAG,
+                 "Firmware upgrade failed, restarting audio service and continuing operation...");
+        audio_service_.Start();                              // Restart audio service
+        board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);  // Restore power save level
+        Alert(Lang::Strings::ERROR, Lang::Strings::UPGRADE_FAILED, "circle_xmark",
+              Lang::Sounds::OGG_EXCLAMATION);
         vTaskDelay(pdMS_TO_TICKS(3000));
         return false;
     } else {
         // Upgrade success, reboot immediately
         ESP_LOGI(TAG, "Firmware upgrade successful, rebooting...");
         display->SetChatMessage("system", "Upgrade successful, rebooting...");
-        vTaskDelay(pdMS_TO_TICKS(1000)); // Brief pause to show message
+        vTaskDelay(pdMS_TO_TICKS(1000));  // Brief pause to show message
         Reboot();
         return true;
     }
@@ -1113,25 +1119,21 @@ void Application::WakeWordInvoke(const std::string& wake_word) {
     }
 
     auto state = GetDeviceState();
-    
+
     if (state == kDeviceStateIdle) {
         audio_service_.EncodeWakeWord();
 
         if (!protocol_->IsAudioChannelOpened()) {
             SetDeviceState(kDeviceStateConnecting);
             // Schedule to let the state change be processed first (UI update)
-            Schedule([this, wake_word]() {
-                ContinueWakeWordInvoke(wake_word);
-            });
+            Schedule([this, wake_word]() { ContinueWakeWordInvoke(wake_word); });
             return;
         }
         // Channel already opened, continue directly
         ContinueWakeWordInvoke(wake_word);
-    } else if (state == kDeviceStateSpeaking) {
-        Schedule([this]() {
-            AbortSpeaking(kAbortReasonNone);
-        });
-    } else if (state == kDeviceStateListening) {   
+    } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
+        Schedule([this]() { AbortSpeaking(kAbortReasonNone); });
+    } else if (state == kDeviceStateListening) {
         Schedule([this]() {
             if (protocol_) {
                 protocol_->CloseAudioChannel();
@@ -1163,7 +1165,7 @@ void Application::RegisterMcpBroadcastCallback(std::function<void(const std::str
 
 void Application::SendMcpMessage(const std::string& payload) {
     // Always schedule to run in main task for thread safety
-    Schedule([this, payload](){ 
+    Schedule([this, payload]() {
         if (protocol_) {
             protocol_->SendMcpMessage(payload);
         }
@@ -1179,18 +1181,18 @@ void Application::SetAecMode(AecMode mode) {
         auto& board = Board::GetInstance();
         auto display = board.GetDisplay();
         switch (aec_mode_) {
-        case kAecOff:
-            audio_service_.EnableDeviceAec(false);
-            display->ShowNotification(Lang::Strings::RTC_MODE_OFF);
-            break;
-        case kAecOnServerSide:
-            audio_service_.EnableDeviceAec(false);
-            display->ShowNotification(Lang::Strings::RTC_MODE_ON);
-            break;
-        case kAecOnDeviceSide:
-            audio_service_.EnableDeviceAec(true);
-            display->ShowNotification(Lang::Strings::RTC_MODE_ON);
-            break;
+            case kAecOff:
+                audio_service_.EnableDeviceAec(false);
+                display->ShowNotification(Lang::Strings::RTC_MODE_OFF);
+                break;
+            case kAecOnServerSide:
+                audio_service_.EnableDeviceAec(false);
+                display->ShowNotification(Lang::Strings::RTC_MODE_ON);
+                break;
+            case kAecOnDeviceSide:
+                audio_service_.EnableDeviceAec(true);
+                display->ShowNotification(Lang::Strings::RTC_MODE_ON);
+                break;
         }
 
         // If the AEC mode is changed, close the audio channel
@@ -1200,9 +1202,7 @@ void Application::SetAecMode(AecMode mode) {
     });
 }
 
-void Application::PlaySound(const std::string_view& sound) {
-    audio_service_.PlaySound(sound);
-}
+void Application::PlaySound(const std::string_view& sound) { audio_service_.PlaySound(sound); }
 
 void Application::ResetProtocol() {
     Schedule([this]() {
diff --git a/main/application.h b/main/application.h
index 03f7edf..4ebba78 100644
--- a/main/application.h
+++ b/main/application.h
@@ -162,6 +162,7 @@ private:
     std::atomic<bool> active_vision_text_mode_enabled_ = false;
     std::atomic<bool> vad_speaking_ = false;
     std::atomic<bool> vision_frame_sent_for_current_listen_ = false;
+    std::atomic<bool> accepting_tts_audio_ = false;
     int clock_ticks_ = 0;
     TaskHandle_t activation_task_handle_ = nullptr;
 
diff --git a/main/assets/locales/en-US/language.json b/main/assets/locales/en-US/language.json
index 8bb764a..38029ca 100644
--- a/main/assets/locales/en-US/language.json
+++ b/main/assets/locales/en-US/language.json
@@ -26,6 +26,7 @@
         "CONNECTION_SUCCESSFUL": "Connection Successful",
         "CONNECTED_TO": "Connected to ",
         "LISTENING": "Listening...",
+        "THINKING": "Thinking...",
         "SPEAKING": "Speaking...",
         "SERVER_NOT_FOUND": "Looking for available service",
         "SERVER_NOT_CONNECTED": "Unable to connect to service, please try again later",
@@ -56,4 +57,4 @@
         "LOADING_ASSETS": "Loading assets...",
         "HELLO_MY_FRIEND": "Hello, my friend!"
     }
-}
\ No newline at end of file
+}
diff --git a/main/assets/locales/zh-CN/language.json b/main/assets/locales/zh-CN/language.json
index a21f51a..17b287f 100644
--- a/main/assets/locales/zh-CN/language.json
+++ b/main/assets/locales/zh-CN/language.json
@@ -23,6 +23,7 @@
         "CONNECTING": "连接中...",
         "CONNECTED_TO": "已连接 ",
         "LISTENING": "聆听中...",
+        "THINKING": "思考中...",
         "SPEAKING": "说话中...",
         "SERVER_NOT_FOUND": "正在寻找可用服务",
         "SERVER_NOT_CONNECTED": "无法连接服务，请稍后再试",
@@ -56,4 +57,4 @@
         "FLIGHT_MODE_OFF": "飞行模式已关闭",
         "FLIGHT_MODE_ON": "飞行模式已开启"
     }
-}
\ No newline at end of file
+}
diff --git a/main/audio/audio_service.cc b/main/audio/audio_service.cc
index 350ac58..7698c6f 100644
--- a/main/audio/audio_service.cc
+++ b/main/audio/audio_service.cc
@@ -579,6 +579,7 @@ void AudioService::EnableWakeWordDetection(bool enable) {
 void AudioService::EnableVoiceProcessing(bool enable) {
     ESP_LOGD(TAG, "%s voice processing", enable ? "Enabling" : "Disabling");
     if (enable) {
+        bool was_running = IsAudioProcessorRunning();
         if (!audio_processor_initialized_) {
             audio_processor_->Initialize(codec_, OPUS_FRAME_DURATION_MS, models_list_);
             audio_processor_initialized_ = true;
@@ -586,7 +587,7 @@ void AudioService::EnableVoiceProcessing(bool enable) {
 
         /* We should make sure no audio is playing */
         ResetDecoder();
-        audio_input_need_warmup_ = true;
+        audio_input_need_warmup_ = !was_running;
         // Reset input resampler to clear cached data from previous mode (e.g. WakeWord)
         // This prevents buffer overflow when switching between different feed sizes
         {
diff --git a/main/background_capture_service.cc b/main/background_capture_service.cc
new file mode 100644
index 0000000..ea4c064
--- /dev/null
+++ b/main/background_capture_service.cc
@@ -0,0 +1,177 @@
+#include "background_capture_service.h"
+
+#include "board.h"
+#include "camera.h"
+
+#include <algorithm>
+#include <esp_heap_caps.h>
+#include <esp_log.h>
+
+#define TAG "BgCapture"
+
+BackgroundCaptureService::BackgroundCaptureService() = default;
+
+BackgroundCaptureService::~BackgroundCaptureService() {
+    Stop();
+}
+
+void BackgroundCaptureService::Start() {
+#if CONFIG_BACKGROUND_CAPTURE_ENABLE
+    if (running_.exchange(true)) {
+        return;
+    }
+
+    auto result = xTaskCreate(
+        &BackgroundCaptureService::TaskEntry,
+        "bg_capture",
+        CONFIG_BACKGROUND_CAPTURE_TASK_STACK_SIZE,
+        this,
+        CONFIG_BACKGROUND_CAPTURE_TASK_PRIORITY,
+        &task_handle_);
+    if (result != pdPASS) {
+        running_.store(false);
+        task_handle_ = nullptr;
+        ESP_LOGE(TAG, "Failed to create background capture task");
+    }
+#endif
+}
+
+void BackgroundCaptureService::Stop() {
+#if CONFIG_BACKGROUND_CAPTURE_ENABLE
+    if (!running_.exchange(false)) {
+        return;
+    }
+
+    while (task_handle_ != nullptr) {
+        vTaskDelay(pdMS_TO_TICKS(20));
+    }
+#endif
+}
+
+void BackgroundCaptureService::TaskEntry(void* arg) {
+#if CONFIG_BACKGROUND_CAPTURE_ENABLE
+    auto* service = static_cast<BackgroundCaptureService*>(arg);
+    service->Run();
+    service->task_handle_ = nullptr;
+#else
+    (void)arg;
+#endif
+    vTaskDelete(nullptr);
+}
+
+void BackgroundCaptureService::Run() {
+#if CONFIG_BACKGROUND_CAPTURE_ENABLE
+    ESP_LOGI(TAG, "Background capture task started");
+
+    while (running_.load()) {
+        if (!CaptureAndSendFrame()) {
+            consecutive_failures_++;
+            auto delay_ms = GetFailureDelayMs();
+            ESP_LOGW(TAG, "Background capture retry in %u ms, failures=%u",
+                     delay_ms, consecutive_failures_);
+            vTaskDelay(pdMS_TO_TICKS(delay_ms));
+            continue;
+        }
+
+        consecutive_failures_ = 0;
+        vTaskDelay(pdMS_TO_TICKS(CONFIG_BACKGROUND_CAPTURE_FRAME_INTERVAL_MS));
+    }
+
+    ESP_LOGI(TAG, "Background capture task stopped");
+#endif
+}
+
+bool BackgroundCaptureService::CaptureAndSendFrame() {
+#if CONFIG_BACKGROUND_CAPTURE_ENABLE
+    const size_t free_internal_heap = heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
+    if (free_internal_heap < CONFIG_BACKGROUND_CAPTURE_MIN_FREE_INTERNAL_HEAP) {
+        ESP_LOGW(TAG, "Skip background capture, low internal heap: free=%u threshold=%u",
+                 static_cast<unsigned>(free_internal_heap),
+                 static_cast<unsigned>(CONFIG_BACKGROUND_CAPTURE_MIN_FREE_INTERNAL_HEAP));
+        return false;
+    }
+
+    auto camera = Board::GetInstance().GetCamera();
+    if (camera == nullptr) {
+        ESP_LOGW(TAG, "No camera available for background capture");
+        return false;
+    }
+
+    std::string jpeg_data;
+    if (!camera->CaptureToJpeg(jpeg_data, false)) {
+        ESP_LOGW(TAG, "Failed to capture background frame");
+        return false;
+    }
+
+    if (jpeg_data.empty()) {
+        ESP_LOGW(TAG, "Captured empty background frame");
+        return false;
+    }
+
+    return UploadJpegFrame(jpeg_data);
+#else
+    return false;
+#endif
+}
+
+uint32_t BackgroundCaptureService::GetFailureDelayMs() const {
+#if CONFIG_BACKGROUND_CAPTURE_ENABLE
+    const uint32_t base_delay_ms = CONFIG_BACKGROUND_CAPTURE_RETRY_INTERVAL_MS;
+    const uint32_t max_delay_ms = CONFIG_BACKGROUND_CAPTURE_MAX_BACKOFF_MS;
+    const uint32_t shift = std::min<uint32_t>(consecutive_failures_ - 1, 4);
+    return std::min<uint32_t>(base_delay_ms << shift, max_delay_ms);
+#else
+    return 0;
+#endif
+}
+
+bool BackgroundCaptureService::UploadJpegFrame(const std::string& jpeg_data) {
+#if CONFIG_BACKGROUND_CAPTURE_ENABLE
+    const std::string url = CONFIG_BACKGROUND_CAPTURE_UPLOAD_URL;
+    if (url.empty()) {
+        ESP_LOGI(TAG, "Captured background frame: %u bytes", static_cast<unsigned>(jpeg_data.size()));
+        return true;
+    }
+
+    auto network = Board::GetInstance().GetNetwork();
+    if (network == nullptr) {
+        ESP_LOGW(TAG, "No network available for background upload");
+        return false;
+    }
+
+    const std::string boundary = "----XIAOZHI_BACKGROUND_CAPTURE_BOUNDARY";
+    auto http = network->CreateHttp(3);
+    http->SetHeader("Content-Type", "multipart/form-data; boundary=" + boundary);
+
+    if (!http->Open("POST", url)) {
+        ESP_LOGW(TAG, "Failed to open background upload URL: %s", url.c_str());
+        return false;
+    }
+
+    std::string file_header;
+    file_header += "--" + boundary + "\r\n";
+    file_header += "Content-Disposition: form-data; name=\"file\"; filename=\"frame.jpg\"\r\n";
+    file_header += "Content-Type: image/jpeg\r\n\r\n";
+    http->Write(file_header.c_str(), file_header.size());
+    http->Write(jpeg_data.data(), jpeg_data.size());
+
+    std::string footer;
+    footer += "\r\n--" + boundary + "--\r\n";
+    http->Write(footer.c_str(), footer.size());
+    http->Write("", 0);
+
+    const int status_code = http->GetStatusCode();
+    http->Close();
+
+    if (status_code < 200 || status_code >= 300) {
+        ESP_LOGW(TAG, "Background upload failed, status=%d", status_code);
+        return false;
+    }
+
+    ESP_LOGI(TAG, "Uploaded background frame: %u bytes", static_cast<unsigned>(jpeg_data.size()));
+    return true;
+#else
+    (void)jpeg_data;
+    return false;
+#endif
+}
diff --git a/main/background_capture_service.h b/main/background_capture_service.h
new file mode 100644
index 0000000..a5dcbf5
--- /dev/null
+++ b/main/background_capture_service.h
@@ -0,0 +1,32 @@
+#ifndef BACKGROUND_CAPTURE_SERVICE_H
+#define BACKGROUND_CAPTURE_SERVICE_H
+
+#include <freertos/FreeRTOS.h>
+#include <freertos/task.h>
+
+#include <atomic>
+#include <cstdint>
+#include <string>
+
+class BackgroundCaptureService {
+public:
+    BackgroundCaptureService();
+    ~BackgroundCaptureService();
+
+    void Start();
+    void Stop();
+    bool IsRunning() const { return running_.load(); }
+
+private:
+    TaskHandle_t task_handle_ = nullptr;
+    std::atomic<bool> running_ = false;
+    uint32_t consecutive_failures_ = 0;
+
+    static void TaskEntry(void* arg);
+    void Run();
+    bool CaptureAndSendFrame();
+    bool UploadJpegFrame(const std::string& jpeg_data);
+    uint32_t GetFailureDelayMs() const;
+};
+
+#endif // BACKGROUND_CAPTURE_SERVICE_H
diff --git a/main/boards/atoms3r-echo-pyramid/atoms3r_echo_pyramid.cc b/main/boards/atoms3r-echo-pyramid/atoms3r_echo_pyramid.cc
index 5fe3fdf..662e7e2 100644
--- a/main/boards/atoms3r-echo-pyramid/atoms3r_echo_pyramid.cc
+++ b/main/boards/atoms3r-echo-pyramid/atoms3r_echo_pyramid.cc
@@ -214,6 +214,9 @@ public:
             case kDeviceStateSpeaking:
                 ctrl_->SetStatusColor(64, 0, 0);   // red
                 break;
+            case kDeviceStateThinking:
+                ctrl_->SetStatusColor(0, 0, 64);   // blue
+                break;
             default:
                 ctrl_->SetStatusColor(0, 0, 64);   // blue
                 break;
diff --git a/main/boards/common/wifi_board.cc b/main/boards/common/wifi_board.cc
index 377685b..bcab012 100644
--- a/main/boards/common/wifi_board.cc
+++ b/main/boards/common/wifi_board.cc
@@ -203,7 +203,10 @@ void WifiBoard::EnterWifiConfigMode() {
     auto& app = Application::GetInstance();
     auto state = app.GetDeviceState();
 
-    if (state == kDeviceStateSpeaking || state == kDeviceStateListening || state == kDeviceStateIdle) {
+    if (state == kDeviceStateSpeaking ||
+            state == kDeviceStateThinking ||
+            state == kDeviceStateListening ||
+            state == kDeviceStateIdle) {
         // Reset protocol (close audio channel, reset protocol)
         Application::GetInstance().ResetProtocol();
 
diff --git a/main/boards/electron-bot/electron_emoji_display.cc b/main/boards/electron-bot/electron_emoji_display.cc
index 7020a35..1dceb0c 100644
--- a/main/boards/electron-bot/electron_emoji_display.cc
+++ b/main/boards/electron-bot/electron_emoji_display.cc
@@ -85,6 +85,13 @@ void ElectronEmojiDisplay::SetStatus(const char* status) {
         lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
         lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
         return;
+    } else if (strcmp(status, Lang::Strings::THINKING) == 0) {
+        lv_obj_set_style_text_font(status_label_, text_font, 0);
+        lv_label_set_text(status_label_, status);
+        lv_obj_clear_flag(status_label_, LV_OBJ_FLAG_HIDDEN);
+        lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
+        lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
+        return;
     } else if (strcmp(status, Lang::Strings::CONNECTING) == 0) {
         lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0);
         lv_label_set_text(status_label_, "\xEF\x83\x81");  // U+F0c1 连接图标
@@ -102,4 +109,4 @@ void ElectronEmojiDisplay::SetStatus(const char* status) {
     lv_obj_clear_flag(status_label_, LV_OBJ_FLAG_HIDDEN);
     lv_obj_clear_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
     lv_obj_clear_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
-}
\ No newline at end of file
+}
diff --git a/main/boards/esp-hi/emoji_display.cc b/main/boards/esp-hi/emoji_display.cc
index c494337..6c9d590 100644
--- a/main/boards/esp-hi/emoji_display.cc
+++ b/main/boards/esp-hi/emoji_display.cc
@@ -155,6 +155,8 @@ void EmojiWidget::SetStatus(const char* status)
     if (player_) {
         if (strcmp(status, Lang::Strings::LISTENING) == 0) {
             player_->StartPlayer("asking", true, 15);
+        } else if (strcmp(status, Lang::Strings::THINKING) == 0) {
+            player_->StartPlayer("thinking", true, 15);
         } else if (strcmp(status, Lang::Strings::STANDBY) == 0) {
             player_->StartPlayer("wake", true, 15);
         }
diff --git a/main/boards/jiuchuan-s3/jiuchuan_dev_board.cc b/main/boards/jiuchuan-s3/jiuchuan_dev_board.cc
index 76c522e..a55a9ae 100644
--- a/main/boards/jiuchuan-s3/jiuchuan_dev_board.cc
+++ b/main/boards/jiuchuan-s3/jiuchuan_dev_board.cc
@@ -231,9 +231,9 @@ private:
                 // 如果当前是聆听状态，切换到待命状态
                 ESP_LOGI(TAG, "从聆听状态切换到待命状态");
                 app.ToggleChatState(); // 切换到待命状态
-            } else if (current_state == kDeviceStateSpeaking) {
-                // 如果当前是说话状态，终止说话并切换到待命状态
-                ESP_LOGI(TAG, "从说话状态切换到待命状态");
+            } else if (current_state == kDeviceStateSpeaking || current_state == kDeviceStateThinking) {
+                // 如果当前是说话或思考状态，终止并切换到待命状态
+                ESP_LOGI(TAG, "从说话/思考状态切换到待命状态");
                 app.ToggleChatState(); // 终止说话
             } else {
                 // 其他状态下只唤醒设备
diff --git a/main/boards/otto-robot/otto_emoji_display.cc b/main/boards/otto-robot/otto_emoji_display.cc
index 7702091..46661b9 100644
--- a/main/boards/otto-robot/otto_emoji_display.cc
+++ b/main/boards/otto-robot/otto_emoji_display.cc
@@ -77,6 +77,13 @@ void OttoEmojiDisplay::SetStatus(const char* status) {
         lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
         lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
         return;
+    } else if (strcmp(status, Lang::Strings::THINKING) == 0) {
+        lv_obj_set_style_text_font(status_label_, text_font, 0);
+        lv_label_set_text(status_label_, status);
+        lv_obj_clear_flag(status_label_, LV_OBJ_FLAG_HIDDEN);
+        lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
+        lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
+        return;
     } else if (strcmp(status, Lang::Strings::CONNECTING) == 0) {
         lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0);
         lv_label_set_text(status_label_, "\xEF\x83\x81");  // U+F0c1 连接图标
@@ -131,4 +138,4 @@ void OttoEmojiDisplay::SetPreviewImage(std::unique_ptr<LvglImage> image) {
     lv_obj_remove_flag(preview_image_, LV_OBJ_FLAG_HIDDEN);
     esp_timer_stop(preview_timer_);
     ESP_ERROR_CHECK(esp_timer_start_once(preview_timer_, PREVIEW_IMAGE_DURATION_MS * 1000));
-}
\ No newline at end of file
+}
diff --git a/main/bridge_server.py b/main/bridge_server.py
index a0881cc..5746c57 100644
--- a/main/bridge_server.py
+++ b/main/bridge_server.py
@@ -1,5 +1,6 @@
 import asyncio
 import base64
+import contextlib
 import json
 import os
 import re
@@ -44,29 +45,40 @@ CHAT_MODE_AGENT_NAMES = {
 CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0"))
 AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
 WS_PORT = 8080
+WS_MAX_QUEUE = int(os.getenv("BRIDGE_WS_MAX_QUEUE", "128"))
+WS_MAX_SIZE = int(os.getenv("BRIDGE_WS_MAX_SIZE", str(8 * 1024 * 1024)))
 AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower()
 PROJECT_ROOT = Path(__file__).resolve().parent.parent
 VISION_FRAME_SAVE_DIR = Path(os.getenv("VISION_FRAME_SAVE_DIR", str(PROJECT_ROOT / "vision_frames")))
 
-INPUT_SAMPLE_RATE = 16000
-OUTPUT_SAMPLE_RATE = 24000
-INPUT_FRAME_DURATION_MS = 20
-INPUT_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * INPUT_FRAME_DURATION_MS // 1000
-INPUT_MAX_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * 60 // 1000
-OUTPUT_FRAME_DURATION_MS = 20
-OUTPUT_SAMPLES_PER_OPUS_FRAME = OUTPUT_SAMPLE_RATE * OUTPUT_FRAME_DURATION_MS // 1000
-TTS_IDLE_TIMEOUT_SECONDS = 0.25
+INPUT_SAMPLE_RATE = int(os.getenv("BRIDGE_INPUT_SAMPLE_RATE", "16000"))
+OUTPUT_SAMPLE_RATE = int(os.getenv("BRIDGE_OUTPUT_SAMPLE_RATE", "24000"))
+INPUT_FRAME_DURATION_MS = int(os.getenv("BRIDGE_INPUT_FRAME_DURATION_MS", "60"))
+INPUT_MAX_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * 120 // 1000
+OUTPUT_FRAME_DURATION_MS = int(os.getenv("BRIDGE_OUTPUT_FRAME_DURATION_MS", "60"))
+AUDIO_STATS_INTERVAL_SECONDS = float(os.getenv("BRIDGE_AUDIO_STATS_INTERVAL_SECONDS", "5.0"))
+DOWNLINK_SEND_GAP_WARN_MS = float(os.getenv("BRIDGE_DOWNLINK_SEND_GAP_WARN_MS", "180.0"))
+UPLINK_CAPTURE_TIMEOUT_SECONDS = float(os.getenv("BRIDGE_UPLINK_CAPTURE_TIMEOUT_SECONDS", "0.25"))
+TTS_IDLE_TIMEOUT_SECONDS = float(os.getenv("TTS_IDLE_TIMEOUT_SECONDS", "1.2"))
+TTS_MIN_ACTIVE_SECONDS = float(os.getenv("TTS_MIN_ACTIVE_SECONDS", "1.0"))
 TTS_SILENCE_PEAK_THRESHOLD = 96
-TTS_PRE_ROLL_MS = 80
-TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1
+TTS_PRE_ROLL_MS = int(os.getenv("TTS_PRE_ROLL_MS", "480"))
+TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = int(os.getenv("TTS_START_CONSECUTIVE_AUDIBLE_FRAMES", "1"))
 TTS_INTERRUPT_SILENCE_FRAMES = 3
 INTERRUPT_TOPIC = "lk.interrupt"
 VISION_FRAME_TOPIC = "vision.frame"
+AGENT_STATE_ATTRIBUTE = "lk.agent.state"
 TTS_DISPLAY_SENTENCE_BREAKS = "。！？!?；;"
 TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
 TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
 TTS_DISPLAY_SCROLL_GAP = "   "
 TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8"))
+TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS = float(
+    os.getenv("TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS", "0.25")
+)
+TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS = float(
+    os.getenv("TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS", "8.0")
+)
 EMOTION_TEXT_PATTERN = re.compile(
     r"^\s*<?\s*emotion\s*=\s*([^\s>,，;；]+)\s*>?[\s,，;；]*(.*)$",
     re.DOTALL,
@@ -95,6 +107,7 @@ class DeviceSession:
     agent_ready: asyncio.Event
     forwarding_tracks: dict[str, asyncio.Task[Any]] = field(default_factory=dict)
     tts_active: bool = False
+    tts_thinking: bool = False
     tts_idle_task: Optional[asyncio.Task] = None
     tts_display_task: Optional[asyncio.Task] = None
     tts_stream_id: int = 0
@@ -103,6 +116,11 @@ class DeviceSession:
     tts_display_final: bool = False
     tts_emotion: str = ""
     tts_suppressed_until: float = 0.0
+    tts_started_at: float = 0.0
+    tts_last_audible_at: float = 0.0
+    tts_waiting_for_user_audio_after_interrupt: bool = False
+    last_interrupt_time: float = 0.0
+    last_uplink_audible_time: float = 0.0
     agent_dispatch_task: Optional[asyncio.Task] = None
     closed: bool = False
     captured_frame_count: int = 0
@@ -138,6 +156,100 @@ class ESP32LiveKitBridge:
         if formatted_tb.strip():
             print(formatted_tb.rstrip())
 
+    def _audio_duration_ms(self, sample_count: int, sample_rate: int) -> float:
+        if sample_rate <= 0:
+            return 0.0
+        return sample_count * 1000.0 / sample_rate
+
+    def _build_server_hello(self, session: DeviceSession) -> dict[str, Any]:
+        return {
+            "type": "hello",
+            "transport": "websocket",
+            "session": {
+                "room": session.room_name,
+                "identity": session.identity,
+            },
+            "audio_params": {
+                "format": "opus",
+                "sample_rate": OUTPUT_SAMPLE_RATE,
+                "channels": 1,
+                "frame_duration": OUTPUT_FRAME_DURATION_MS,
+            },
+        }
+
+    def _log_client_hello(self, session: DeviceSession, message: dict[str, Any]) -> None:
+        audio_params = message.get("audio_params")
+        if not isinstance(audio_params, dict):
+            return
+
+        sample_rate = audio_params.get("sample_rate")
+        frame_duration = audio_params.get("frame_duration")
+        channels = audio_params.get("channels")
+        fmt = audio_params.get("format")
+        print(
+            "[client-audio] "
+            f"device={session.device_id} format={fmt} sample_rate={sample_rate} "
+            f"channels={channels} frame_duration={frame_duration}"
+        )
+
+        if sample_rate != INPUT_SAMPLE_RATE or channels != 1:
+            print(
+                "[client-audio] warning: bridge uplink decode expects "
+                f"{INPUT_SAMPLE_RATE}Hz mono, got {sample_rate}Hz channels={channels}"
+            )
+        if frame_duration != INPUT_FRAME_DURATION_MS:
+            print(
+                "[client-audio] warning: bridge expects "
+                f"{INPUT_FRAME_DURATION_MS}ms uplink frames, got {frame_duration}ms"
+            )
+
+    def _track_room_connect_task(
+        self,
+        session: DeviceSession,
+        task: asyncio.Task[Any],
+    ) -> None:
+        if task.cancelled():
+            return
+        try:
+            task.result()
+        except Exception as exc:
+            self._log_exception(
+                f"LiveKit 房间连接后台任务失败: room={session.room_name}",
+                exc,
+            )
+            websocket = session.websocket
+            if websocket is not None:
+                asyncio.create_task(websocket.close(code=1011, reason="livekit connect failed"))
+
+    async def _capture_mic_frame(
+        self,
+        session: DeviceSession,
+        pcm_bytes: bytes,
+        num_samples: int,
+    ) -> bool:
+        try:
+            frame = AudioFrame(pcm_bytes, INPUT_SAMPLE_RATE, 1, num_samples)
+        except TypeError:
+            frame = AudioFrame.create(
+                sample_rate=INPUT_SAMPLE_RATE,
+                num_channels=1,
+                samples_per_channel=num_samples,
+            )
+            memoryview(frame.data).cast("B")[:] = pcm_bytes
+
+        try:
+            await asyncio.wait_for(
+                session.mic_source.capture_frame(frame),
+                timeout=UPLINK_CAPTURE_TIMEOUT_SECONDS,
+            )
+            return True
+        except asyncio.TimeoutError:
+            print(
+                "[uplink] warning: capture_frame timeout, dropping frame "
+                f"device={session.device_id} samples={num_samples}"
+            )
+            return False
+
     def _build_device_id(self, websocket: Any) -> str:
         headers = websocket.request.headers
         requested_id = headers.get("X-Device-Id") or headers.get("Device-Id")
@@ -445,7 +557,7 @@ class ESP32LiveKitBridge:
             print("收到 vision frame，但 image 字段为空")
             return
 
-        saved_path = self._save_vision_frame(session, image)
+        saved_path = await asyncio.to_thread(self._save_vision_frame, session, image)
         if saved_path is None:
             return
         print(f"已保存 vision frame: {saved_path}")
@@ -599,6 +711,10 @@ class ESP32LiveKitBridge:
         if session.tts_active:
             print("跳过 tts start，当前已处于激活状态")
             return
+        block_reason = self._tts_resume_block_reason(session, include_user_quiet=False)
+        if block_reason is not None:
+            print(f"跳过 tts start，打断后仍在等待稳定聆听: {block_reason}")
+            return
         if time.monotonic() < session.tts_suppressed_until:
             print("跳过 tts start，中断后的残留音频仍在抑制窗口内")
             return
@@ -607,16 +723,83 @@ class ESP32LiveKitBridge:
             session.tts_display_final = False
             session.tts_emotion = ""
             self._cancel_tts_display_task(session)
+        now = time.monotonic()
+        session.tts_started_at = now
+        session.tts_last_audible_at = now
         await self._send_tts_state(session, "start")
         session.tts_active = True
+        session.tts_thinking = False
+
+    async def _start_thinking(self, session: DeviceSession) -> None:
+        if session.tts_active:
+            print("跳过 tts thinking，当前已处于 TTS 播放状态")
+            return
+        if session.tts_thinking:
+            print("跳过 tts thinking，当前已处于思考状态")
+            return
+        block_reason = self._tts_resume_block_reason(session, include_user_quiet=False)
+        if block_reason is not None:
+            print(f"跳过 tts thinking，打断后仍在等待稳定聆听: {block_reason}")
+            return
+        if time.monotonic() < session.tts_suppressed_until:
+            print("跳过 tts thinking，中断后的残留音频仍在抑制窗口内")
+            return
+        await self._send_tts_state(session, "thinking")
+        session.tts_thinking = True
+
+    def _tts_resume_block_reason(
+        self,
+        session: DeviceSession,
+        now: Optional[float] = None,
+        *,
+        include_user_quiet: bool = True,
+    ) -> Optional[str]:
+        if now is None:
+            now = time.monotonic()
+
+        if session.tts_waiting_for_user_audio_after_interrupt:
+            return "waiting_for_user_audio_after_interrupt"
+
+        if session.last_interrupt_time <= 0.0:
+            return None
+
+        since_interrupt = now - session.last_interrupt_time
+        if since_interrupt > TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS:
+            return None
+
+        if session.last_uplink_audible_time < session.last_interrupt_time:
+            return None
+
+        if not include_user_quiet:
+            return None
+
+        quiet_for = now - session.last_uplink_audible_time
+        if quiet_for < TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS:
+            return f"user_audio_quiet_for={quiet_for:.2f}s"
+
+        return None
+
+    def _handle_agent_state(self, session: DeviceSession, participant: rtc.Participant) -> None:
+        state = participant.attributes.get(AGENT_STATE_ATTRIBUTE)
+        if not isinstance(state, str) or not state:
+            return
+
+        print(
+            f"[agent-state] room={session.room_name} identity={participant.identity} state={state}"
+        )
+        if state == "thinking":
+            asyncio.create_task(self._start_thinking(session))
 
     async def _stop_tts(self, session: DeviceSession) -> None:
-        if not session.tts_active:
+        if not session.tts_active and not session.tts_thinking:
             print("跳过 tts stop，当前未激活")
             return
         self._cancel_tts_display_task(session)
         await self._send_tts_state(session, "stop")
         session.tts_active = False
+        session.tts_thinking = False
+        session.tts_started_at = 0.0
+        session.tts_last_audible_at = 0.0
         session.tts_transcript_text = ""
         session.tts_display_text = ""
         session.tts_display_final = False
@@ -628,6 +811,9 @@ class ESP32LiveKitBridge:
             session.tts_idle_task.cancel()
             session.tts_idle_task = None
         session.tts_active = False
+        session.tts_thinking = False
+        session.tts_started_at = 0.0
+        session.tts_last_audible_at = 0.0
         session.tts_transcript_text = ""
         session.tts_display_text = ""
         session.tts_display_final = False
@@ -637,12 +823,16 @@ class ESP32LiveKitBridge:
 
     async def _abort_tts(self, session: DeviceSession, reason: str = "client_abort") -> None:
         print(f"收到打断请求，停止当前 TTS: device={session.device_id} reason={reason}")
+        now = time.monotonic()
         session.tts_stream_id += 1
-        session.tts_suppressed_until = time.monotonic() + TTS_INTERRUPT_SUPPRESS_SECONDS
+        session.last_interrupt_time = now
+        session.tts_suppressed_until = now + TTS_INTERRUPT_SUPPRESS_SECONDS
+        session.tts_waiting_for_user_audio_after_interrupt = True
         await self._force_stop_tts(session, reason)
         asyncio.create_task(self._send_agent_interrupt(session, reason))
 
     def _reset_tts_idle_timer(self, session: DeviceSession) -> None:
+        session.tts_last_audible_at = time.monotonic()
         if session.tts_idle_task is not None:
             session.tts_idle_task.cancel()
         session.tts_idle_task = asyncio.create_task(
@@ -651,11 +841,28 @@ class ESP32LiveKitBridge:
 
     async def _tts_idle_watchdog(self, session: DeviceSession, stream_id: int) -> None:
         try:
-            await asyncio.sleep(TTS_IDLE_TIMEOUT_SECONDS)
-            if stream_id != session.tts_stream_id:
+            while True:
+                await asyncio.sleep(TTS_IDLE_TIMEOUT_SECONDS)
+                if stream_id != session.tts_stream_id or not session.tts_active:
+                    return
+
+                now = time.monotonic()
+                idle_for = now - session.tts_last_audible_at
+                active_for = now - session.tts_started_at
+                remaining = max(
+                    TTS_IDLE_TIMEOUT_SECONDS - idle_for,
+                    TTS_MIN_ACTIVE_SECONDS - active_for,
+                )
+                if remaining > 0:
+                    await asyncio.sleep(remaining)
+                    continue
+
+                print(
+                    "TTS 静音达到阈值，切回聆听状态: "
+                    f"idle={idle_for:.2f}s active={active_for:.2f}s"
+                )
+                await self._stop_tts(session)
                 return
-            print(f"TTS 空闲超过 {TTS_IDLE_TIMEOUT_SECONDS}s，切回聆听状态")
-            await self._stop_tts(session)
         except asyncio.CancelledError:
             pass
 
@@ -799,6 +1006,7 @@ class ESP32LiveKitBridge:
                 if self._is_agent_participant(participant, session.agent_name):
                     session.agent_ready.set()
                     self._scan_participant_audio_tracks(session, participant, "connected_scan")
+                    self._handle_agent_state(session, participant)
 
         @session.room.on("participant_connected")
         def on_participant_connected(participant: rtc.RemoteParticipant) -> None:
@@ -810,6 +1018,7 @@ class ESP32LiveKitBridge:
                 self._scan_participant_audio_tracks(
                     session, participant, "participant_connected_scan"
                 )
+                self._handle_agent_state(session, participant)
 
         @session.room.on("participant_disconnected")
         def on_participant_disconnected(participant: rtc.RemoteParticipant) -> None:
@@ -820,6 +1029,16 @@ class ESP32LiveKitBridge:
                 if not track_sid.endswith(f":{participant.identity}")
             }
 
+        @session.room.on("participant_attributes_changed")
+        def on_participant_attributes_changed(changed: list[str], participant: rtc.Participant) -> None:
+            if AGENT_STATE_ATTRIBUTE not in changed:
+                return
+            if not isinstance(participant, rtc.RemoteParticipant):
+                return
+            if not self._is_agent_participant(participant, session.agent_name):
+                return
+            self._handle_agent_state(session, participant)
+
         @session.room.on("data_received")
         def on_data_received(data_packet: rtc.DataPacket) -> None:
             identity = data_packet.participant.identity if data_packet.participant else "未知"
@@ -865,6 +1084,7 @@ class ESP32LiveKitBridge:
                     if not segment.final:
                         continue
                     display_text = segment.text
+                    asyncio.create_task(self._start_thinking(session))
 
                 if session.websocket is not None:
                     ws = session.websocket
@@ -967,9 +1187,21 @@ class ESP32LiveKitBridge:
 
     async def start(self) -> None:
         print(f"[config] websocket_port={WS_PORT}")
+        print(f"[config] websocket_max_queue={WS_MAX_QUEUE} websocket_max_size={WS_MAX_SIZE}")
         print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}")
         print(f"[config] token_url={TOKEN_URL}")
         print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}")
+        print(
+            "[config] audio="
+            f"uplink_decode:{INPUT_SAMPLE_RATE}Hz/{INPUT_FRAME_DURATION_MS}ms "
+            f"downlink_encode:{OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms "
+            f"stats_interval:{AUDIO_STATS_INTERVAL_SECONDS}s "
+            f"capture_timeout:{UPLINK_CAPTURE_TIMEOUT_SECONDS}s "
+            f"tts_idle:{TTS_IDLE_TIMEOUT_SECONDS}s "
+            f"tts_min_active:{TTS_MIN_ACTIVE_SECONDS}s "
+            f"tts_start_frames:{TTS_START_CONSECUTIVE_AUDIBLE_FRAMES} "
+            f"tts_pre_roll:{TTS_PRE_ROLL_MS}ms"
+        )
         print(
             "[config] agents="
             f"normal:{CHAT_MODE_AGENT_NAMES['normal']} "
@@ -996,6 +1228,7 @@ class ESP32LiveKitBridge:
         session.websocket = None
         session.agent_ready.set()
         session.tts_active = False
+        session.tts_thinking = False
         session.tts_stream_id += 1
         if session.tts_idle_task is not None:
             session.tts_idle_task.cancel()
@@ -1016,13 +1249,20 @@ class ESP32LiveKitBridge:
         pending_pcm = bytearray()
         pre_roll_pcm = bytearray()
         pre_roll_max_bytes = OUTPUT_SAMPLE_RATE * TTS_PRE_ROLL_MS // 1000 * 2
+        output_samples_per_opus_frame = OUTPUT_SAMPLE_RATE * OUTPUT_FRAME_DURATION_MS // 1000
+        output_frame_bytes = output_samples_per_opus_frame * 2
         audible_frame_streak = 0
         silence_frame_streak = 0
         waiting_for_post_interrupt_silence = False
+        downlink_packets = 0
+        downlink_audio_ms = 0.0
+        last_downlink_stats_time = time.monotonic()
+        last_send_time: Optional[float] = None
         stream_id = session.tts_stream_id
         print(
             f"启动 TTS 转发: device={session.device_id} room={session.room_name} "
-            f"track_sid={track_sid} stream_id={stream_id}"
+            f"track_sid={track_sid} stream_id={stream_id} "
+            f"opus={OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms"
         )
 
         try:
@@ -1043,7 +1283,8 @@ class ESP32LiveKitBridge:
                 pcm_data = frame.data.tobytes()
                 has_audible_audio = self._has_audible_audio(pcm_data)
 
-                if time.monotonic() < session.tts_suppressed_until:
+                now = time.monotonic()
+                if now < session.tts_suppressed_until:
                     pending_pcm.clear()
                     pre_roll_pcm.clear()
                     audible_frame_streak = 0
@@ -1051,6 +1292,31 @@ class ESP32LiveKitBridge:
                     waiting_for_post_interrupt_silence = True
                     continue
 
+                block_reason = self._tts_resume_block_reason(
+                    session,
+                    now,
+                    include_user_quiet=False,
+                )
+                if block_reason is not None:
+                    pending_pcm.clear()
+                    pre_roll_pcm.clear()
+                    audible_frame_streak = 0
+                    silence_frame_streak = 0
+                    if block_reason == "waiting_for_user_audio_after_interrupt":
+                        waiting_for_post_interrupt_silence = True
+                    continue
+
+                if (
+                    waiting_for_post_interrupt_silence
+                    and session.last_interrupt_time > 0.0
+                    and session.last_uplink_audible_time >= session.last_interrupt_time
+                    and now - session.last_uplink_audible_time
+                    >= TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS
+                ):
+                    print("检测到用户打断后语音已结束，允许新 TTS 直接起播")
+                    waiting_for_post_interrupt_silence = False
+                    silence_frame_streak = 0
+
                 if waiting_for_post_interrupt_silence:
                     if has_audible_audio:
                         silence_frame_streak = 0
@@ -1098,20 +1364,42 @@ class ESP32LiveKitBridge:
 
                 if not current_frame_buffered:
                     pending_pcm.extend(pcm_data)
-                frame_bytes = OUTPUT_SAMPLES_PER_OPUS_FRAME * 2
 
                 while (
-                    len(pending_pcm) >= frame_bytes
+                    len(pending_pcm) >= output_frame_bytes
                     and stream_id == session.tts_stream_id
                     and session.websocket is not None
                 ):
                     try:
+                        now = time.monotonic()
+                        if last_send_time is not None:
+                            send_gap_ms = (now - last_send_time) * 1000.0
+                            if send_gap_ms > DOWNLINK_SEND_GAP_WARN_MS:
+                                print(
+                                    "[downlink] warning: send gap "
+                                    f"{send_gap_ms:.1f}ms device={session.device_id} "
+                                    f"pending_ms={self._audio_duration_ms(len(pending_pcm) // 2, OUTPUT_SAMPLE_RATE):.1f}"
+                                )
+                        last_send_time = now
+
                         opus_packet = encoder.encode(
-                            bytes(pending_pcm[:frame_bytes]),
-                            OUTPUT_SAMPLES_PER_OPUS_FRAME,
+                            bytes(pending_pcm[:output_frame_bytes]),
+                            output_samples_per_opus_frame,
                         )
-                        del pending_pcm[:frame_bytes]
+                        del pending_pcm[:output_frame_bytes]
                         await session.websocket.send(self._wrap_opus_payload(session, opus_packet))
+                        downlink_packets += 1
+                        downlink_audio_ms += OUTPUT_FRAME_DURATION_MS
+                        if now - last_downlink_stats_time >= AUDIO_STATS_INTERVAL_SECONDS:
+                            print(
+                                "[downlink] "
+                                f"device={session.device_id} packets={downlink_packets} "
+                                f"audio_ms={downlink_audio_ms:.0f} "
+                                f"pending_ms={self._audio_duration_ms(len(pending_pcm) // 2, OUTPUT_SAMPLE_RATE):.1f}"
+                            )
+                            downlink_packets = 0
+                            downlink_audio_ms = 0.0
+                            last_downlink_stats_time = now
                     except Exception as exc:
                         print(f"发送回 ESP32 失败: {exc}")
                         break
@@ -1171,27 +1459,26 @@ class ESP32LiveKitBridge:
         )
         session.tts_stream_id += 1
         opus_decoder = None
+        uplink_packets = 0
+        uplink_audio_ms = 0.0
+        uplink_decode_errors = 0
+        uplink_dropped_frames = 0
+        last_uplink_stats_time = time.monotonic()
+        room_connect_task: Optional[asyncio.Task[Any]] = None
 
         try:
-            hello_msg = {
-                "type": "hello",
-                "transport": "websocket",
-                "session": {
-                    "room": session.room_name,
-                    "identity": session.identity,
-                },
-                "audio_params": {
-                    "format": "opus",
-                    "sample_rate": OUTPUT_SAMPLE_RATE,
-                    "channels": 1,
-                    "frame_duration": OUTPUT_FRAME_DURATION_MS,
-                },
-            }
+            hello_msg = self._build_server_hello(session)
             await websocket.send(json.dumps(hello_msg))
-            print(f"已发送 server hello: device={device_id} room={session.room_name}")
+            print(
+                f"已发送 server hello: device={device_id} room={session.room_name} "
+                f"audio={OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms"
+            )
             asyncio.create_task(self._run_emotion_test_sequence(session))
 
-            await self._connect_session_room(session)
+            room_connect_task = asyncio.create_task(self._connect_session_room(session))
+            room_connect_task.add_done_callback(
+                lambda task: self._track_room_connect_task(session, task)
+            )
 
             async for message in websocket:
                 if isinstance(message, bytes):
@@ -1214,6 +1501,16 @@ class ESP32LiveKitBridge:
                         if num_samples > 0:
                             session.captured_frame_count += 1
                             now = time.monotonic()
+                            uplink_packets += 1
+                            uplink_audio_ms += self._audio_duration_ms(num_samples, INPUT_SAMPLE_RATE)
+                            if self._has_audible_audio(pcm_bytes):
+                                session.last_uplink_audible_time = now
+                                if session.tts_waiting_for_user_audio_after_interrupt:
+                                    session.tts_waiting_for_user_audio_after_interrupt = False
+                                    print(
+                                        f"[uplink] detected user audio after interrupt: "
+                                        f"device={session.device_id}"
+                                    )
                             if (
                                 session.captured_frame_count <= 5
                                 or now - session.first_capture_log_time >= 5.0
@@ -1224,25 +1521,32 @@ class ESP32LiveKitBridge:
                                 #     f"bytes={len(pcm_bytes)} samples={num_samples} "
                                 #     f"room={session.room_name}"
                                 # )
-                            try:
-                                frame = AudioFrame(pcm_bytes, INPUT_SAMPLE_RATE, 1, num_samples)
-                                await session.mic_source.capture_frame(frame)
-                            except TypeError:
-                                frame = AudioFrame.create(
-                                    sample_rate=INPUT_SAMPLE_RATE,
-                                    num_channels=1,
-                                    samples_per_channel=num_samples,
+                            if now - last_uplink_stats_time >= AUDIO_STATS_INTERVAL_SECONDS:
+                                print(
+                                    "[uplink] "
+                                    f"device={session.device_id} packets={uplink_packets} "
+                                    f"audio_ms={uplink_audio_ms:.0f} "
+                                    f"decode_errors={uplink_decode_errors} "
+                                    f"dropped_frames={uplink_dropped_frames}"
                                 )
-                                memoryview(frame.data).cast("B")[:] = pcm_bytes
-                                await session.mic_source.capture_frame(frame)
+                                uplink_packets = 0
+                                uplink_audio_ms = 0.0
+                                uplink_decode_errors = 0
+                                uplink_dropped_frames = 0
+                                last_uplink_stats_time = now
+                            if not await self._capture_mic_frame(session, pcm_bytes, num_samples):
+                                uplink_dropped_frames += 1
                     except Exception as exc:
+                        uplink_decode_errors += 1
                         print(f"Opus audio decode error ({len(message)} bytes): {exc}")
                 elif isinstance(message, str):
                     try:
                         data = json.loads(message)
                         # print(f"收到 ESP32 JSON 消息: {data}")
                         msg_type = data.get("type")
-                        if msg_type == "abort":
+                        if msg_type == "hello":
+                            self._log_client_hello(session, data)
+                        elif msg_type == "abort":
                             reason = data.get("reason")
                             abort_reason = reason if isinstance(reason, str) and reason else "button_abort"
                             print(f"处理 ESP32 打断请求: reason={abort_reason}")
@@ -1257,6 +1561,10 @@ class ESP32LiveKitBridge:
             self._log_exception("WebSocket 其他错误", exc)
         finally:
             print(f"ESP32 断开连接: device={device_id} room={session.room_name}")
+            if room_connect_task is not None and not room_connect_task.done():
+                room_connect_task.cancel()
+                with contextlib.suppress(asyncio.CancelledError):
+                    await room_connect_task
             await self._close_session(session)
             self.device_sessions.pop(device_id, None)
 
@@ -1265,7 +1573,13 @@ async def main() -> None:
     bridge = ESP32LiveKitBridge()
     try:
         await bridge.start()
-        async with websockets.serve(bridge.handle_websocket, "0.0.0.0", WS_PORT):
+        async with websockets.serve(
+            bridge.handle_websocket,
+            "0.0.0.0",
+            WS_PORT,
+            max_queue=WS_MAX_QUEUE,
+            max_size=WS_MAX_SIZE,
+        ):
             print(f"WebSocket 服务器运行在端口 {WS_PORT}，等待 ESP32 连接...")
             await asyncio.Future()
     finally:
diff --git a/main/device_state.h b/main/device_state.h
index 4ffafae..8170d35 100644
--- a/main/device_state.h
+++ b/main/device_state.h
@@ -8,6 +8,7 @@ enum DeviceState {
     kDeviceStateIdle,
     kDeviceStateConnecting,
     kDeviceStateListening,
+    kDeviceStateThinking,
     kDeviceStateSpeaking,
     kDeviceStateUpgrading,
     kDeviceStateActivating,
@@ -15,4 +16,4 @@ enum DeviceState {
     kDeviceStateFatalError
 };
 
-#endif // _DEVICE_STATE_H_ 
\ No newline at end of file
+#endif // _DEVICE_STATE_H_
diff --git a/main/device_state_machine.cc b/main/device_state_machine.cc
index 30581de..b251667 100644
--- a/main/device_state_machine.cc
+++ b/main/device_state_machine.cc
@@ -13,6 +13,7 @@ static const char* const STATE_STRINGS[] = {
     "idle",
     "connecting",
     "listening",
+    "thinking",
     "speaking",
     "upgrading",
     "activating",
@@ -69,9 +70,10 @@ bool DeviceStateMachine::IsValidTransition(DeviceState from, DeviceState to) con
                    to == kDeviceStateActivating;
 
         case kDeviceStateIdle:
-            // Can go to connecting, listening (manual mode), speaking, activating, upgrading, or wifi configuring
+            // Can go to connecting, listening (manual mode), thinking, speaking, activating, upgrading, or wifi configuring
             return to == kDeviceStateConnecting ||
                    to == kDeviceStateListening ||
+                   to == kDeviceStateThinking ||
                    to == kDeviceStateSpeaking ||
                    to == kDeviceStateActivating ||
                    to == kDeviceStateUpgrading ||
@@ -83,8 +85,15 @@ bool DeviceStateMachine::IsValidTransition(DeviceState from, DeviceState to) con
                    to == kDeviceStateListening;
 
         case kDeviceStateListening:
-            // Can go to speaking or idle
+            // Can go to thinking, speaking, or idle
+            return to == kDeviceStateThinking ||
+                   to == kDeviceStateSpeaking ||
+                   to == kDeviceStateIdle;
+
+        case kDeviceStateThinking:
+            // Can go to speaking, listening, or idle
             return to == kDeviceStateSpeaking ||
+                   to == kDeviceStateListening ||
                    to == kDeviceStateIdle;
 
         case kDeviceStateSpeaking:
diff --git a/main/display/emote_display.cc b/main/display/emote_display.cc
index 7ed920a..8f4f7b3 100644
--- a/main/display/emote_display.cc
+++ b/main/display/emote_display.cc
@@ -167,6 +167,8 @@ void EmoteDisplay::SetStatus(const char* const status)
             emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_LISTEN, NULL);
         } else if (std::strcmp(status, Lang::Strings::STANDBY) == 0) {
             emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_IDLE, NULL);
+        } else if (std::strcmp(status, Lang::Strings::THINKING) == 0) {
+            emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_LISTEN, NULL);
         } else if (std::strcmp(status, Lang::Strings::SPEAKING) == 0) {
             emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_SPEAK, NULL);
         } else if (std::strcmp(status, Lang::Strings::ERROR) == 0) {
@@ -247,4 +249,4 @@ void EmoteDisplay::RefreshAll()
     }
 }
 
-} // namespace emote
\ No newline at end of file
+} // namespace emote
diff --git a/main/display/lvgl_display/lvgl_display.cc b/main/display/lvgl_display/lvgl_display.cc
index 46668dd..b56aa64 100644
--- a/main/display/lvgl_display/lvgl_display.cc
+++ b/main/display/lvgl_display/lvgl_display.cc
@@ -203,6 +203,7 @@ void LvglDisplay::UpdateStatusBar(bool update_all) {
             kDeviceStateStarting,
             kDeviceStateWifiConfiguring,
             kDeviceStateListening,
+            kDeviceStateThinking,
             kDeviceStateActivating,
         };
         if (std::find(allowed_states.begin(), allowed_states.end(), device_state) != allowed_states.end()) {
diff --git a/main/led/circular_strip.cc b/main/led/circular_strip.cc
index b4111ba..6129f81 100644
--- a/main/led/circular_strip.cc
+++ b/main/led/circular_strip.cc
@@ -228,6 +228,11 @@ void CircularStrip::OnStateChanged() {
             SetAllColor(color);
             break;
         }
+        case kDeviceStateThinking: {
+            StripColor color = { low_brightness_, low_brightness_, default_brightness_ };
+            Blink(color, 300);
+            break;
+        }
         case kDeviceStateUpgrading: {
             StripColor color = { low_brightness_, default_brightness_, low_brightness_ };
             Blink(color, 100);
diff --git a/main/led/gpio_led.cc b/main/led/gpio_led.cc
index 30eeeb2..f1e6ab6 100644
--- a/main/led/gpio_led.cc
+++ b/main/led/gpio_led.cc
@@ -235,6 +235,10 @@ void GpioLed::OnStateChanged() {
             // TurnOn();
             StartFadeTask();
             break;
+        case kDeviceStateThinking:
+            SetBrightness(DEFAULT_BRIGHTNESS);
+            StartContinuousBlink(300);
+            break;
         case kDeviceStateSpeaking:
             SetBrightness(SPEAKING_BRIGHTNESS);
             TurnOn();
@@ -260,4 +264,4 @@ void GpioLed::EventTask(void* arg) {
         ulTaskNotifyTake(pdTRUE, portMAX_DELAY);
         led->OnFadeEnd();
     }
-}
\ No newline at end of file
+}
diff --git a/main/led/single_led.cc b/main/led/single_led.cc
index d107619..c061e2c 100644
--- a/main/led/single_led.cc
+++ b/main/led/single_led.cc
@@ -152,6 +152,10 @@ void SingleLed::OnStateChanged() {
             SetColor(0, DEFAULT_BRIGHTNESS, 0);
             TurnOn();
             break;
+        case kDeviceStateThinking:
+            SetColor(0, 0, DEFAULT_BRIGHTNESS);
+            StartContinuousBlink(300);
+            break;
         case kDeviceStateUpgrading:
             SetColor(0, DEFAULT_BRIGHTNESS, 0);
             StartContinuousBlink(100);