reconstruct application

2024-10-03 06:39:22 +08:00
parent e59be04394
commit 879f1cc21e
26 changed files with 927 additions and 639 deletions
--- a/main/Application.cc
+++ b/main/Application.cc
@ -1,27 +1,28 @@
-#include "Application.h"
-#include "BuiltinLed.h"
-#include "TlsTransport.h"
-#include "Ml307SslTransport.h"
-#include "WifiConfigurationAp.h"
-#include "WifiStation.h"
+#include <BuiltinLed.h>
+#include <TlsTransport.h>
+#include <Ml307SslTransport.h>
+#include <WifiConfigurationAp.h>
+#include <WifiStation.h>
+#include <SystemInfo.h>

 #include <cstring>
-#include "esp_log.h"
-#include "model_path.h"
-#include "SystemInfo.h"
-#include "cJSON.h"
-#include "driver/gpio.h"
+#include <esp_log.h>
+#include <cJSON.h>
+#include <driver/gpio.h>
+
+#include "Application.h"

 #define TAG "Application"


 Application::Application()
+    : button_((gpio_num_t)CONFIG_BOOT_BUTTON_GPIO)
 #ifdef CONFIG_USE_ML307
-    : ml307_at_modem_(CONFIG_ML307_TX_PIN, CONFIG_ML307_RX_PIN, 4096),
+    , ml307_at_modem_(CONFIG_ML307_TX_PIN, CONFIG_ML307_RX_PIN, 4096),
      http_(ml307_at_modem_),
      firmware_upgrade_(http_)
 #else
-    : http_(),
+    , http_(),
    firmware_upgrade_(http_)
 #endif
 #ifdef CONFIG_USE_DISPLAY
@ -29,16 +30,6 @@ Application::Application()
 #endif
 {
    event_group_ = xEventGroupCreate();
-    audio_encode_queue_ = xQueueCreate(100, sizeof(iovec));
-    audio_decode_queue_ = xQueueCreate(100, sizeof(AudioPacket*));
-
-    srmodel_list_t *models = esp_srmodel_init("model");
-    for (int i = 0; i < models->num; i++) {
-        ESP_LOGI(TAG, "Model %d: %s", i, models->model_name[i]);
-        if (strstr(models->model_name[i], ESP_WN_PREFIX) != NULL) {
-            wakenet_model_ = models->model_name[i];
-        }
-    }
    
    opus_encoder_.Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, 1);
    opus_decoder_ = opus_decoder_create(opus_decode_sample_rate_, 1, NULL);
@ -52,32 +43,12 @@ Application::Application()
 }

 Application::~Application() {
-    if (afe_detection_data_ != nullptr) {
-        esp_afe_sr_v1.destroy(afe_detection_data_);
-    }
-
-    if (afe_communication_data_ != nullptr) {
-        esp_afe_vc_v1.destroy(afe_communication_data_);
-    }
-
-    if (wake_word_encode_task_stack_ != nullptr) {
-        free(wake_word_encode_task_stack_);
-    }
-    for (auto& pcm : wake_word_pcm_) {
-        free(pcm.iov_base);
-    }
-    
    if (opus_decoder_ != nullptr) {
        opus_decoder_destroy(opus_decoder_);
    }
    if (audio_encode_task_stack_ != nullptr) {
        free(audio_encode_task_stack_);
    }
-    if (audio_decode_task_stack_ != nullptr) {
-        free(audio_decode_task_stack_);
-    }
-    vQueueDelete(audio_decode_queue_);
-    vQueueDelete(audio_encode_queue_);

    vEventGroupDelete(event_group_);
 }
@ -204,43 +175,139 @@ void Application::Start() {
    }
 #endif

+    audio_device_.OnInputData([this](const int16_t* data, int size) {
+#ifdef CONFIG_USE_AFE_SR
+        if (audio_processor_.IsRunning()) {
+            audio_processor_.Input(data, size);
+        }
+        if (wake_word_detect_.IsDetectionRunning()) {
+            wake_word_detect_.Feed(data, size);
+        }
+#else
+        std::vector<int16_t> pcm(data, data + size);
+        Schedule([this, pcm = std::move(pcm)]() {
+            if (chat_state_ == kChatStateListening) {
+                std::lock_guard<std::mutex> lock(mutex_);
+                audio_encode_queue_.emplace_back(std::move(pcm));
+                cv_.notify_all();
+            }
+        });
+#endif
+    });
+
    // Initialize the audio device
    audio_device_.Start(CONFIG_AUDIO_INPUT_SAMPLE_RATE, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
-    audio_device_.OnStateChanged([this]() {
-        if (audio_device_.playing()) {
-            SetChatState(kChatStateSpeaking);
-        } else {
-            // Check if communication is still running
-            if (xEventGroupGetBits(event_group_) & COMMUNICATION_RUNNING) {
-                SetChatState(kChatStateListening);
-            } else {
-                SetChatState(kChatStateIdle);
-            }
-        }
-    });

    // OPUS encoder / decoder use a lot of stack memory
    const size_t opus_stack_size = 4096 * 8;
    audio_encode_task_stack_ = (StackType_t*)malloc(opus_stack_size);
-    xTaskCreateStatic([](void* arg) {
+    audio_encode_task_ = xTaskCreateStatic([](void* arg) {
        Application* app = (Application*)arg;
        app->AudioEncodeTask();
        vTaskDelete(NULL);
    }, "opus_encode", opus_stack_size, this, 1, audio_encode_task_stack_, &audio_encode_task_buffer_);
-    audio_decode_task_stack_ = (StackType_t*)malloc(opus_stack_size);
-    xTaskCreateStatic([](void* arg) {
-        Application* app = (Application*)arg;
-        app->AudioDecodeTask();
-        vTaskDelete(NULL);
-    }, "opus_decode", opus_stack_size, this, 1, audio_decode_task_stack_, &audio_decode_task_buffer_);

-    StartCommunication();
-    StartDetection();
+    xTaskCreate([](void* arg) {
+        Application* app = (Application*)arg;
+        app->AudioPlayTask();
+        vTaskDelete(NULL);
+    }, "play_audio", 4096 * 2, this, 5, NULL);
+
+#ifdef CONFIG_USE_AFE_SR
+    wake_word_detect_.OnVadStateChange([this](bool speaking) {
+        Schedule([this, speaking]() {
+            auto& builtin_led = BuiltinLed::GetInstance();
+            if (chat_state_ == kChatStateListening) {
+                if (speaking) {
+                    builtin_led.SetRed(32);
+                } else {
+                    builtin_led.SetRed(8);
+                }
+                builtin_led.TurnOn();
+            }
+        });
+    });
+
+    wake_word_detect_.OnWakeWordDetected([this]() {
+        Schedule([this]() {
+            if (chat_state_ == kChatStateIdle) {
+                // Encode the wake word data and start websocket client at the same time
+                // They both consume a lot of time (700ms), so we can do them in parallel
+                wake_word_detect_.EncodeWakeWordData();
+
+                SetChatState(kChatStateConnecting);
+                if (ws_client_ == nullptr) {
+                    StartWebSocketClient();
+                }
+                if (ws_client_ && ws_client_->IsConnected()) {
+                    auto encoded = wake_word_detect_.GetWakeWordStream();
+                    // Send the wake word data to the server
+                    ws_client_->Send(encoded.data(), encoded.size(), true);
+                    opus_encoder_.ResetState();
+                    // Send a ready message to indicate the server that the wake word data is sent
+                    SetChatState(kChatStateWakeWordDetected);
+                    // If connected, the hello message is already sent, so we can start communication
+                    audio_processor_.Start();
+                    ESP_LOGI(TAG, "Audio processor started");
+                } else {
+                    SetChatState(kChatStateIdle);
+                }
+            } else if (chat_state_ == kChatStateSpeaking) {
+                break_speaking_ = true;
+            }
+
+            // Resume detection
+            wake_word_detect_.StartDetection();
+        });
+    });
+    wake_word_detect_.StartDetection();
+
+    audio_processor_.OnOutput([this](std::vector<int16_t>&& data) {
+        Schedule([this, data = std::move(data)]() {
+            if (chat_state_ == kChatStateListening) {
+                std::lock_guard<std::mutex> lock(mutex_);
+                audio_encode_queue_.emplace_back(std::move(data));
+                cv_.notify_all();
+            }
+        });
+    });
+#endif

    // Blink the LED to indicate the device is running
    builtin_led.SetGreen();
    builtin_led.BlinkOnce();
-    xEventGroupSetBits(event_group_, DETECTION_RUNNING);
+
+    button_.OnClick([this]() {
+        Schedule([this]() {
+            if (chat_state_ == kChatStateIdle) {
+                SetChatState(kChatStateConnecting);
+                StartWebSocketClient();
+
+                if (ws_client_ && ws_client_->IsConnected()) {
+                    opus_encoder_.ResetState();
+#ifdef CONFIG_USE_AFE_SR
+                    audio_processor_.Start();
+#endif
+                    SetChatState(kChatStateListening);
+                    ESP_LOGI(TAG, "Communication started");
+                } else {
+                    SetChatState(kChatStateIdle);
+                }
+            } else if (chat_state_ == kChatStateSpeaking) {
+                break_speaking_ = true;
+            } else if (chat_state_ == kChatStateListening) {
+                if (ws_client_ && ws_client_->IsConnected()) {
+                    ws_client_->Close();
+                }
+            }
+        });
+    });
+
+    xTaskCreate([](void* arg) {
+        Application* app = (Application*)arg;
+        app->MainLoop();
+        vTaskDelete(NULL);
+    }, "main_loop", 4096 * 2, this, 5, NULL);

    // Launch a task to check for new firmware version
    xTaskCreate([](void* arg) {
@ -259,6 +326,28 @@ void Application::Start() {
 #endif
 }

+void Application::Schedule(std::function<void()> callback) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    main_tasks_.push_back(callback);
+    cv_.notify_all();
+}
+
+// The Main Loop controls the chat state and websocket connection
+// If other tasks need to access the websocket or chat state,
+// they should use Schedule to call this function
+void Application::MainLoop() {
+    while (true) {
+        std::unique_lock<std::mutex> lock(mutex_);
+        cv_.wait(lock, [this]() {
+            return !main_tasks_.empty();
+        });
+        auto task = std::move(main_tasks_.front());
+        main_tasks_.pop_front();
+        lock.unlock();
+        task();
+    }
+}
+
 void Application::SetChatState(ChatState state) {
    const char* state_str[] = {
        "idle",
@ -294,199 +383,27 @@ void Application::SetChatState(ChatState state) {
            builtin_led.SetBlue();
            builtin_led.TurnOn();
            break;
-        case kChatStateTesting:
-            builtin_led.SetRed();
-            builtin_led.TurnOn();
-            break;
        case kChatStateUpgrading:
            builtin_led.SetGreen();
            builtin_led.StartContinuousBlink(100);
            break;
    }

-    std::lock_guard<std::recursive_mutex> lock(mutex_);
    if (ws_client_ && ws_client_->IsConnected()) {
        cJSON* root = cJSON_CreateObject();
        cJSON_AddStringToObject(root, "type", "state");
        cJSON_AddStringToObject(root, "state", state_str[chat_state_]);
        char* json = cJSON_PrintUnformatted(root);
+
+        std::lock_guard<std::mutex> lock(mutex_);
        ws_client_->Send(json);
        cJSON_Delete(root);
        free(json);
    }
 }

-void Application::StartCommunication() {
-    afe_config_t afe_config = {
-        .aec_init = false,
-        .se_init = true,
-        .vad_init = true,
-        .wakenet_init = false,
-        .voice_communication_init = true,
-        .voice_communication_agc_init = true,
-        .voice_communication_agc_gain = 10,
-        .vad_mode = VAD_MODE_3,
-        .wakenet_model_name = NULL,
-        .wakenet_model_name_2 = NULL,
-        .wakenet_mode = DET_MODE_90,
-        .afe_mode = SR_MODE_HIGH_PERF,
-        .afe_perferred_core = 0,
-        .afe_perferred_priority = 5,
-        .afe_ringbuf_size = 50,
-        .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM,
-        .afe_linear_gain = 1.0,
-        .agc_mode = AFE_MN_PEAK_AGC_MODE_2,
-        .pcm_config = {
-            .total_ch_num = 1,
-            .mic_num = 1,
-            .ref_num = 0,
-            .sample_rate = CONFIG_AUDIO_INPUT_SAMPLE_RATE,
-        },
-        .debug_init = false,
-        .debug_hook = {{ AFE_DEBUG_HOOK_MASE_TASK_IN, NULL }, { AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL }},
-        .afe_ns_mode = NS_MODE_SSP,
-        .afe_ns_model_name = NULL,
-        .fixed_first_channel = true,
-    };
-
-    afe_communication_data_ = esp_afe_vc_v1.create_from_config(&afe_config);
-    
-    xTaskCreate([](void* arg) {
-        Application* app = (Application*)arg;
-        app->AudioCommunicationTask();
-        vTaskDelete(NULL);
-    }, "audio_communication", 4096 * 2, this, 5, NULL);
-}
-
-void Application::StartDetection() {
-    afe_config_t afe_config = {
-        .aec_init = false,
-        .se_init = true,
-        .vad_init = true,
-        .wakenet_init = true,
-        .voice_communication_init = false,
-        .voice_communication_agc_init = false,
-        .voice_communication_agc_gain = 10,
-        .vad_mode = VAD_MODE_3,
-        .wakenet_model_name = wakenet_model_,
-        .wakenet_model_name_2 = NULL,
-        .wakenet_mode = DET_MODE_90,
-        .afe_mode = SR_MODE_HIGH_PERF,
-        .afe_perferred_core = 0,
-        .afe_perferred_priority = 5,
-        .afe_ringbuf_size = 50,
-        .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM,
-        .afe_linear_gain = 1.0,
-        .agc_mode = AFE_MN_PEAK_AGC_MODE_2,
-        .pcm_config = {
-            .total_ch_num = 1,
-            .mic_num = 1,
-            .ref_num = 0,
-            .sample_rate = CONFIG_AUDIO_INPUT_SAMPLE_RATE
-        },
-        .debug_init = false,
-        .debug_hook = {{ AFE_DEBUG_HOOK_MASE_TASK_IN, NULL }, { AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL }},
-        .afe_ns_mode = NS_MODE_SSP,
-        .afe_ns_model_name = NULL,
-        .fixed_first_channel = true,
-    };
-
-    afe_detection_data_ = esp_afe_sr_v1.create_from_config(&afe_config);
-    xTaskCreate([](void* arg) {
-        Application* app = (Application*)arg;
-        app->AudioFeedTask();
-        vTaskDelete(NULL);
-    }, "audio_feed", 4096 * 2, this, 5, NULL);
-
-    xTaskCreate([](void* arg) {
-        Application* app = (Application*)arg;
-        app->AudioDetectionTask();
-        vTaskDelete(NULL);
-    }, "audio_detection", 4096 * 2, this, 5, NULL);
-}
-
-void Application::AudioFeedTask() {
-    int chunk_size = esp_afe_vc_v1.get_feed_chunksize(afe_detection_data_);
-    int16_t buffer[chunk_size];
-    ESP_LOGI(TAG, "Audio feed task started, chunk size: %d", chunk_size);
-
-    while (true) {
-        audio_device_.Read(buffer, chunk_size);
-
-        auto event_bits = xEventGroupGetBits(event_group_);
-        if (event_bits & DETECTION_RUNNING) {
-            esp_afe_sr_v1.feed(afe_detection_data_, buffer);
-        } else if (event_bits & COMMUNICATION_RUNNING) {
-            esp_afe_vc_v1.feed(afe_communication_data_, buffer);
-        }
-    }
-
-    vTaskDelete(NULL);
-}
-
-void Application::StoreWakeWordData(uint8_t* data, size_t size) {
-    // store audio data to wake_word_pcm_
-    auto iov = (iovec){
-        .iov_base = heap_caps_malloc(size, MALLOC_CAP_SPIRAM),
-        .iov_len = size
-    };
-    memcpy(iov.iov_base, data, size);
-    wake_word_pcm_.push_back(iov);
-    // keep about 2 seconds of data, detect duration is 32ms (sample_rate == 16000, chunksize == 512)
-    while (wake_word_pcm_.size() > 2000 / 32) {
-        heap_caps_free(wake_word_pcm_.front().iov_base);
-        wake_word_pcm_.pop_front();
-    }
-}
-
-void Application::EncodeWakeWordData() {
-    if (wake_word_encode_task_stack_ == nullptr) {
-        wake_word_encode_task_stack_ = (StackType_t*)malloc(4096 * 8);
-    }
-    wake_word_encode_task_ = xTaskCreateStatic([](void* arg) {
-        Application* app = (Application*)arg;
-        auto start_time = esp_timer_get_time();
-        // encode detect packets
-        OpusEncoder* encoder = new OpusEncoder();
-        encoder->Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, 1, 60);
-        encoder->SetComplexity(0);
-        app->wake_word_opus_.resize(4096 * 4);
-        size_t offset = 0;
-
-        for (auto& pcm: app->wake_word_pcm_) {
-            encoder->Encode(pcm, [app, &offset](const iovec opus) {
-                size_t protocol_size = sizeof(BinaryProtocol) + opus.iov_len;
-                if (offset + protocol_size < app->wake_word_opus_.size()) {
-                    auto protocol = (BinaryProtocol*)(&app->wake_word_opus_[offset]);
-                    protocol->version = htons(PROTOCOL_VERSION);
-                    protocol->type = htons(0);
-                    protocol->reserved = 0;
-                    protocol->timestamp = htonl(app->audio_device_.playing() ? app->audio_device_.last_timestamp() : 0);
-                    protocol->payload_size = htonl(opus.iov_len);
-                    memcpy(protocol->payload, opus.iov_base, opus.iov_len);
-                    offset += protocol_size;
-                }
-            });
-            heap_caps_free(pcm.iov_base);
-        }
-        app->wake_word_pcm_.clear();
-        app->wake_word_opus_.resize(offset);
-
-        auto end_time = esp_timer_get_time();
-        ESP_LOGI(TAG, "Encode wake word opus: %zu bytes in %lld ms", app->wake_word_opus_.size(), (end_time - start_time) / 1000);
-        xEventGroupSetBits(app->event_group_, WAKE_WORD_ENCODED);
-        delete encoder;
-        vTaskDelete(NULL);
-    }, "encode_detect_packets", 4096 * 8, this, 1, wake_word_encode_task_stack_, &wake_word_encode_task_buffer_);
-}
-
-void Application::SendWakeWordData() {
-    ws_client_->Send(wake_word_opus_.data(), wake_word_opus_.size(), true);
-    wake_word_opus_.clear();
-}
-
-BinaryProtocol* Application::AllocateBinaryProtocol(void* payload, size_t payload_size) {
-    auto last_timestamp = audio_device_.playing() ? audio_device_.last_timestamp() : 0;
+BinaryProtocol* Application::AllocateBinaryProtocol(const uint8_t* payload, size_t payload_size) {
+    auto last_timestamp = 0;
    auto protocol = (BinaryProtocol*)heap_caps_malloc(sizeof(BinaryProtocol) + payload_size, MALLOC_CAP_SPIRAM);
    protocol->version = htons(PROTOCOL_VERSION);
    protocol->type = htons(0);
@ -498,197 +415,34 @@ BinaryProtocol* Application::AllocateBinaryProtocol(void* payload, size_t payloa
    return protocol;
 }

-void Application::CheckTestButton() {
-    if (gpio_get_level(GPIO_NUM_1) == 0) {
-        if (chat_state_ == kChatStateIdle) {
-            SetChatState(kChatStateTesting);
-            test_resampler_.Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
-        }
-    } else {
-        if (chat_state_ == kChatStateTesting) {
-            SetChatState(kChatStateIdle);
-            
-            // 创建新线程来处理音频播放
-            xTaskCreate([](void* arg) {
-                Application* app = static_cast<Application*>(arg);
-                app->PlayTestAudio();
-                vTaskDelete(NULL);
-            }, "play_test_audio", 4096, this, 1, NULL);
-        }
-    }
-}
-
-void Application::PlayTestAudio() {
-    // 写入音频数据到扬声器
-    auto packet = new AudioPacket();
-    packet->type = kAudioPacketTypeStart;
-    audio_device_.QueueAudioPacket(packet);
-
-    for (auto& pcm : test_pcm_) {
-        packet = new AudioPacket();
-        packet->type = kAudioPacketTypeData;
-        packet->pcm.resize(test_resampler_.GetOutputSamples(pcm.iov_len / 2));
-        test_resampler_.Process((int16_t*)pcm.iov_base, pcm.iov_len / 2, packet->pcm.data());
-        audio_device_.QueueAudioPacket(packet);
-        heap_caps_free(pcm.iov_base);
-    }
-    // 清除测试PCM数据
-    test_pcm_.clear();
-
-    // 停止音频设备
-    packet = new AudioPacket();
-    packet->type = kAudioPacketTypeStop;
-    audio_device_.QueueAudioPacket(packet);
-}
-
-void Application::AudioDetectionTask() {
-    auto chunk_size = esp_afe_sr_v1.get_fetch_chunksize(afe_detection_data_);
-    ESP_LOGI(TAG, "Audio detection task started, chunk size: %d", chunk_size);
-
-    while (true) {
-        xEventGroupWaitBits(event_group_, DETECTION_RUNNING, pdFALSE, pdTRUE, portMAX_DELAY);
-
-        auto res = esp_afe_sr_v1.fetch(afe_detection_data_);
-        if (res == nullptr || res->ret_value == ESP_FAIL) {
-            ESP_LOGE(TAG, "Error in AudioDetectionTask");
-            if (res != nullptr) {
-                ESP_LOGI(TAG, "Error code: %d", res->ret_value);
-            }
-            continue;;
-        }
-
-        // Store the wake word data for voice recognition, like who is speaking
-        StoreWakeWordData((uint8_t*)res->data, res->data_size);
-
-        CheckTestButton();
-        if (chat_state_ == kChatStateTesting) {
-            auto& builtin_led = BuiltinLed::GetInstance();
-            if (res->vad_state == AFE_VAD_SPEECH) {
-                iovec iov = {
-                    .iov_base = heap_caps_malloc(res->data_size, MALLOC_CAP_SPIRAM),
-                    .iov_len = (size_t)res->data_size
-                };
-                memcpy(iov.iov_base, res->data, res->data_size);
-                test_pcm_.push_back(iov);
-                builtin_led.SetRed(128);
-            } else {
-                builtin_led.SetRed(32);
-            }
-            builtin_led.TurnOn();
-            continue;
-        }
-
-        if (chat_state_ == kChatStateIdle && res->wakeup_state == WAKENET_DETECTED) {
-            xEventGroupClearBits(event_group_, DETECTION_RUNNING);
-            SetChatState(kChatStateConnecting);
-
-            // Encode the wake word data and start websocket client at the same time
-            // They both consume a lot of time (700ms), so we can do them in parallel
-            EncodeWakeWordData();
-            StartWebSocketClient();
-
-            // Here the websocket is done, and we also wait for the wake word data to be encoded
-            xEventGroupWaitBits(event_group_, WAKE_WORD_ENCODED, pdTRUE, pdTRUE, portMAX_DELAY);
-
-            std::lock_guard<std::recursive_mutex> lock(mutex_);
-            if (ws_client_ && ws_client_->IsConnected()) {
-                // Send the wake word data to the server
-                SendWakeWordData();
-                // Send a ready message to indicate the server that the wake word data is sent
-                SetChatState(kChatStateWakeWordDetected);
-                opus_encoder_.ResetState();
-                // If connected, the hello message is already sent, so we can start communication
-                xEventGroupSetBits(event_group_, COMMUNICATION_RUNNING);
-                ESP_LOGI(TAG, "Communication running");
-            } else {
-                SetChatState(kChatStateIdle);
-                xEventGroupSetBits(event_group_, DETECTION_RUNNING);
-            }
-        }
-    }
-}
-
-void Application::AudioCommunicationTask() {
-    int chunk_size = esp_afe_vc_v1.get_fetch_chunksize(afe_communication_data_);
-    ESP_LOGI(TAG, "Audio communication task started, chunk size: %d", chunk_size);
-
-    while (true) {
-        xEventGroupWaitBits(event_group_, COMMUNICATION_RUNNING, pdFALSE, pdTRUE, portMAX_DELAY);
-
-        auto res = esp_afe_vc_v1.fetch(afe_communication_data_);
-        if (res == nullptr || res->ret_value == ESP_FAIL) {
-            ESP_LOGE(TAG, "Error in AudioCommunicationTask");
-            if (res != nullptr) {
-                ESP_LOGI(TAG, "Error code: %d", res->ret_value);
-            }
-            continue;
-        }
-
-        // Check if the websocket client is disconnected by the server
-        {
-            std::lock_guard<std::recursive_mutex> lock(mutex_);
-            if (ws_client_ == nullptr || !ws_client_->IsConnected()) {
-                xEventGroupClearBits(event_group_, COMMUNICATION_RUNNING);
-                if (audio_device_.playing()) {
-                    audio_device_.Break();
-                }
-                SetChatState(kChatStateIdle);
-                if (ws_client_ != nullptr) {
-                    delete ws_client_;
-                    ws_client_ = nullptr;
-                }
-                xEventGroupSetBits(event_group_, DETECTION_RUNNING);
-                continue;
-            }
-        }
-
-        if (chat_state_ == kChatStateListening) {
-            // Update the LED state based on the VAD state
-            auto& builtin_led = BuiltinLed::GetInstance();
-            if (res->vad_state == AFE_VAD_SPEECH) {
-                builtin_led.SetRed(128);
-            } else {
-                builtin_led.SetRed(32);
-            }
-            builtin_led.TurnOn();
-
-            // Send audio data to server
-            iovec data = {
-                .iov_base = malloc(res->data_size),
-                .iov_len = (size_t)res->data_size
-            };
-            memcpy(data.iov_base, res->data, res->data_size);
-            xQueueSend(audio_encode_queue_, &data, portMAX_DELAY);
-        }
-    }
-}
-
 void Application::AudioEncodeTask() {
    ESP_LOGI(TAG, "Audio encode task started");
    while (true) {
-        iovec pcm;
-        xQueueReceive(audio_encode_queue_, &pcm, portMAX_DELAY);
-
-        // Encode audio data
-        opus_encoder_.Encode(pcm, [this](const iovec opus) {
-            auto protocol = AllocateBinaryProtocol(opus.iov_base, opus.iov_len);
-            std::lock_guard<std::recursive_mutex> lock(mutex_);
-            if (ws_client_ && ws_client_->IsConnected()) {
-                ws_client_->Send(protocol, sizeof(BinaryProtocol) + opus.iov_len, true);
-            }
-            heap_caps_free(protocol);
+        std::unique_lock<std::mutex> lock(mutex_);
+        cv_.wait(lock, [this]() {
+            return !audio_encode_queue_.empty() || !audio_decode_queue_.empty();
        });

-        free(pcm.iov_base);
-    }
-}
+        if (!audio_encode_queue_.empty()) {
+            auto pcm = std::move(audio_encode_queue_.front());
+            audio_encode_queue_.pop_front();
+            lock.unlock();

-void Application::AudioDecodeTask() {
-    while (true) {
-        AudioPacket* packet;
-        xQueueReceive(audio_decode_queue_, &packet, portMAX_DELAY);
+            // Encode audio data
+            opus_encoder_.Encode(pcm, [this](const uint8_t* opus, size_t opus_size) {
+                auto protocol = AllocateBinaryProtocol(opus, opus_size);
+                Schedule([this, protocol, opus_size]() {
+                    if (ws_client_ && ws_client_->IsConnected()) {
+                        ws_client_->Send(protocol, sizeof(BinaryProtocol) + opus_size, true);
+                    }
+                    heap_caps_free(protocol);
+                });
+            });
+        } else if (!audio_decode_queue_.empty()) {
+            auto packet = std::move(audio_decode_queue_.front());
+            audio_decode_queue_.pop_front();
+            lock.unlock();

-        if (packet->type == kAudioPacketTypeData) {
            int frame_size = opus_decode_sample_rate_ / 1000 * opus_duration_ms_;
            packet->pcm.resize(frame_size);

@ -705,9 +459,74 @@ void Application::AudioDecodeTask() {
                opus_resampler_.Process(packet->pcm.data(), frame_size, resampled.data());
                packet->pcm = std::move(resampled);
            }
+
+            std::lock_guard<std::mutex> lock(mutex_);
+            audio_play_queue_.push_back(packet);
+            cv_.notify_all();
+        }
+    }
+}
+
+void Application::HandleAudioPacket(AudioPacket* packet) {
+    switch (packet->type)
+    {
+    case kAudioPacketTypeData: {
+        if (skip_to_end_) {
+            break;
        }

-        audio_device_.QueueAudioPacket(packet);
+        // This will block until the audio device has finished playing the audio
+        audio_device_.OutputData(packet->pcm);
+
+        if (break_speaking_) {
+            break_speaking_ = false;
+            skip_to_end_ = true;
+            
+            // Play a silence and skip to the end
+            int frame_size = opus_decode_sample_rate_ / 1000 * opus_duration_ms_;
+            std::vector<int16_t> silence(frame_size);
+            bzero(silence.data(), silence.size() * sizeof(int16_t));
+            audio_device_.OutputData(silence);
+        }
+        break;
+    }
+    case kAudioPacketTypeStart:
+        Schedule([this]() {
+            SetChatState(kChatStateSpeaking);
+        });
+        break;
+    case kAudioPacketTypeStop:
+        skip_to_end_ = false;
+        Schedule([this]() {
+            SetChatState(kChatStateListening);
+        });
+        break;
+    case kAudioPacketTypeSentenceStart:
+        ESP_LOGI(TAG, "<< %s", packet->text.c_str());
+        break;
+    case kAudioPacketTypeSentenceEnd:
+        break;
+    default:
+        ESP_LOGI(TAG, "Unknown packet type: %d", packet->type);
+        break;
+    }
+
+    delete packet;
+}
+
+void Application::AudioPlayTask() {
+    ESP_LOGI(TAG, "Audio play task started");
+
+    while (true) {
+        std::unique_lock<std::mutex> lock(mutex_);
+        cv_.wait(lock, [this]() {
+            return !audio_play_queue_.empty();
+        });
+        auto packet = std::move(audio_play_queue_.front());
+        audio_play_queue_.pop_front();
+        lock.unlock();
+
+        HandleAudioPacket(packet);
    }
 }

@ -726,6 +545,7 @@ void Application::SetDecodeSampleRate(int sample_rate) {

 void Application::StartWebSocketClient() {
    if (ws_client_ != nullptr) {
+        ESP_LOGW(TAG, "WebSocket client already exists");
        delete ws_client_;
    }

@ -746,7 +566,6 @@ void Application::StartWebSocketClient() {
        // keys: message type, version, wakeup_model, audio_params (format, sample_rate, channels)
        std::string message = "{";
        message += "\"type\":\"hello\",";
-        message += "\"wakeup_model\":\"" + std::string(wakenet_model_) + "\",";
        message += "\"audio_params\":{";
        message += "\"format\":\"opus\", \"sample_rate\":" + std::to_string(CONFIG_AUDIO_INPUT_SAMPLE_RATE) + ", \"channels\":1";
        message += "}}";
@ -763,7 +582,10 @@ void Application::StartWebSocketClient() {
            auto payload_size = ntohl(protocol->payload_size);
            packet->opus.resize(payload_size);
            memcpy(packet->opus.data(), protocol->payload, payload_size);
-            xQueueSend(audio_decode_queue_, &packet, portMAX_DELAY);
+
+            std::lock_guard<std::mutex> lock(mutex_);
+            audio_decode_queue_.push_back(packet);
+            cv_.notify_all();
        } else {
            // Parse JSON data
            auto root = cJSON_Parse(data);
@ -786,7 +608,10 @@ void Application::StartWebSocketClient() {
                        packet->type = kAudioPacketTypeSentenceStart;
                        packet->text = cJSON_GetObjectItem(root, "text")->valuestring;
                    }
-                    xQueueSend(audio_decode_queue_, &packet, portMAX_DELAY);
+
+                    std::lock_guard<std::mutex> lock(mutex_);
+                    audio_decode_queue_.push_back(packet);
+                    cv_.notify_all();
                } else if (strcmp(type->valuestring, "stt") == 0) {
                    auto text = cJSON_GetObjectItem(root, "text");
                    if (text != NULL) {
@ -804,6 +629,14 @@ void Application::StartWebSocketClient() {

    ws_client_->OnDisconnected([this]() {
        ESP_LOGI(TAG, "Websocket disconnected");
+        Schedule([this]() {
+#ifdef CONFIG_USE_AFE_SR
+            audio_processor_.Stop();
+#endif
+            delete ws_client_;
+            ws_client_ = nullptr;
+            SetChatState(kChatStateIdle);
+        });
    });

    if (!ws_client_->Connect(CONFIG_WEBSOCKET_URL)) {