reconstruct application

This commit is contained in:
Terrence
2024-10-03 06:39:22 +08:00
parent e59be04394
commit 879f1cc21e
26 changed files with 927 additions and 639 deletions

View File

@ -1,27 +1,28 @@
#include "Application.h"
#include "BuiltinLed.h"
#include "TlsTransport.h"
#include "Ml307SslTransport.h"
#include "WifiConfigurationAp.h"
#include "WifiStation.h"
#include <BuiltinLed.h>
#include <TlsTransport.h>
#include <Ml307SslTransport.h>
#include <WifiConfigurationAp.h>
#include <WifiStation.h>
#include <SystemInfo.h>
#include <cstring>
#include "esp_log.h"
#include "model_path.h"
#include "SystemInfo.h"
#include "cJSON.h"
#include "driver/gpio.h"
#include <esp_log.h>
#include <cJSON.h>
#include <driver/gpio.h>
#include "Application.h"
#define TAG "Application"
Application::Application()
: button_((gpio_num_t)CONFIG_BOOT_BUTTON_GPIO)
#ifdef CONFIG_USE_ML307
: ml307_at_modem_(CONFIG_ML307_TX_PIN, CONFIG_ML307_RX_PIN, 4096),
, ml307_at_modem_(CONFIG_ML307_TX_PIN, CONFIG_ML307_RX_PIN, 4096),
http_(ml307_at_modem_),
firmware_upgrade_(http_)
#else
: http_(),
, http_(),
firmware_upgrade_(http_)
#endif
#ifdef CONFIG_USE_DISPLAY
@ -29,16 +30,6 @@ Application::Application()
#endif
{
event_group_ = xEventGroupCreate();
audio_encode_queue_ = xQueueCreate(100, sizeof(iovec));
audio_decode_queue_ = xQueueCreate(100, sizeof(AudioPacket*));
srmodel_list_t *models = esp_srmodel_init("model");
for (int i = 0; i < models->num; i++) {
ESP_LOGI(TAG, "Model %d: %s", i, models->model_name[i]);
if (strstr(models->model_name[i], ESP_WN_PREFIX) != NULL) {
wakenet_model_ = models->model_name[i];
}
}
opus_encoder_.Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, 1);
opus_decoder_ = opus_decoder_create(opus_decode_sample_rate_, 1, NULL);
@ -52,32 +43,12 @@ Application::Application()
}
Application::~Application() {
if (afe_detection_data_ != nullptr) {
esp_afe_sr_v1.destroy(afe_detection_data_);
}
if (afe_communication_data_ != nullptr) {
esp_afe_vc_v1.destroy(afe_communication_data_);
}
if (wake_word_encode_task_stack_ != nullptr) {
free(wake_word_encode_task_stack_);
}
for (auto& pcm : wake_word_pcm_) {
free(pcm.iov_base);
}
if (opus_decoder_ != nullptr) {
opus_decoder_destroy(opus_decoder_);
}
if (audio_encode_task_stack_ != nullptr) {
free(audio_encode_task_stack_);
}
if (audio_decode_task_stack_ != nullptr) {
free(audio_decode_task_stack_);
}
vQueueDelete(audio_decode_queue_);
vQueueDelete(audio_encode_queue_);
vEventGroupDelete(event_group_);
}
@ -204,43 +175,139 @@ void Application::Start() {
}
#endif
audio_device_.OnInputData([this](const int16_t* data, int size) {
#ifdef CONFIG_USE_AFE_SR
if (audio_processor_.IsRunning()) {
audio_processor_.Input(data, size);
}
if (wake_word_detect_.IsDetectionRunning()) {
wake_word_detect_.Feed(data, size);
}
#else
std::vector<int16_t> pcm(data, data + size);
Schedule([this, pcm = std::move(pcm)]() {
if (chat_state_ == kChatStateListening) {
std::lock_guard<std::mutex> lock(mutex_);
audio_encode_queue_.emplace_back(std::move(pcm));
cv_.notify_all();
}
});
#endif
});
// Initialize the audio device
audio_device_.Start(CONFIG_AUDIO_INPUT_SAMPLE_RATE, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
audio_device_.OnStateChanged([this]() {
if (audio_device_.playing()) {
SetChatState(kChatStateSpeaking);
} else {
// Check if communication is still running
if (xEventGroupGetBits(event_group_) & COMMUNICATION_RUNNING) {
SetChatState(kChatStateListening);
} else {
SetChatState(kChatStateIdle);
}
}
});
// OPUS encoder / decoder use a lot of stack memory
const size_t opus_stack_size = 4096 * 8;
audio_encode_task_stack_ = (StackType_t*)malloc(opus_stack_size);
xTaskCreateStatic([](void* arg) {
audio_encode_task_ = xTaskCreateStatic([](void* arg) {
Application* app = (Application*)arg;
app->AudioEncodeTask();
vTaskDelete(NULL);
}, "opus_encode", opus_stack_size, this, 1, audio_encode_task_stack_, &audio_encode_task_buffer_);
audio_decode_task_stack_ = (StackType_t*)malloc(opus_stack_size);
xTaskCreateStatic([](void* arg) {
Application* app = (Application*)arg;
app->AudioDecodeTask();
vTaskDelete(NULL);
}, "opus_decode", opus_stack_size, this, 1, audio_decode_task_stack_, &audio_decode_task_buffer_);
StartCommunication();
StartDetection();
xTaskCreate([](void* arg) {
Application* app = (Application*)arg;
app->AudioPlayTask();
vTaskDelete(NULL);
}, "play_audio", 4096 * 2, this, 5, NULL);
#ifdef CONFIG_USE_AFE_SR
wake_word_detect_.OnVadStateChange([this](bool speaking) {
Schedule([this, speaking]() {
auto& builtin_led = BuiltinLed::GetInstance();
if (chat_state_ == kChatStateListening) {
if (speaking) {
builtin_led.SetRed(32);
} else {
builtin_led.SetRed(8);
}
builtin_led.TurnOn();
}
});
});
wake_word_detect_.OnWakeWordDetected([this]() {
Schedule([this]() {
if (chat_state_ == kChatStateIdle) {
// Encode the wake word data and start websocket client at the same time
// They both consume a lot of time (700ms), so we can do them in parallel
wake_word_detect_.EncodeWakeWordData();
SetChatState(kChatStateConnecting);
if (ws_client_ == nullptr) {
StartWebSocketClient();
}
if (ws_client_ && ws_client_->IsConnected()) {
auto encoded = wake_word_detect_.GetWakeWordStream();
// Send the wake word data to the server
ws_client_->Send(encoded.data(), encoded.size(), true);
opus_encoder_.ResetState();
// Send a ready message to indicate the server that the wake word data is sent
SetChatState(kChatStateWakeWordDetected);
// If connected, the hello message is already sent, so we can start communication
audio_processor_.Start();
ESP_LOGI(TAG, "Audio processor started");
} else {
SetChatState(kChatStateIdle);
}
} else if (chat_state_ == kChatStateSpeaking) {
break_speaking_ = true;
}
// Resume detection
wake_word_detect_.StartDetection();
});
});
wake_word_detect_.StartDetection();
audio_processor_.OnOutput([this](std::vector<int16_t>&& data) {
Schedule([this, data = std::move(data)]() {
if (chat_state_ == kChatStateListening) {
std::lock_guard<std::mutex> lock(mutex_);
audio_encode_queue_.emplace_back(std::move(data));
cv_.notify_all();
}
});
});
#endif
// Blink the LED to indicate the device is running
builtin_led.SetGreen();
builtin_led.BlinkOnce();
xEventGroupSetBits(event_group_, DETECTION_RUNNING);
button_.OnClick([this]() {
Schedule([this]() {
if (chat_state_ == kChatStateIdle) {
SetChatState(kChatStateConnecting);
StartWebSocketClient();
if (ws_client_ && ws_client_->IsConnected()) {
opus_encoder_.ResetState();
#ifdef CONFIG_USE_AFE_SR
audio_processor_.Start();
#endif
SetChatState(kChatStateListening);
ESP_LOGI(TAG, "Communication started");
} else {
SetChatState(kChatStateIdle);
}
} else if (chat_state_ == kChatStateSpeaking) {
break_speaking_ = true;
} else if (chat_state_ == kChatStateListening) {
if (ws_client_ && ws_client_->IsConnected()) {
ws_client_->Close();
}
}
});
});
xTaskCreate([](void* arg) {
Application* app = (Application*)arg;
app->MainLoop();
vTaskDelete(NULL);
}, "main_loop", 4096 * 2, this, 5, NULL);
// Launch a task to check for new firmware version
xTaskCreate([](void* arg) {
@ -259,6 +326,28 @@ void Application::Start() {
#endif
}
void Application::Schedule(std::function<void()> callback) {
std::lock_guard<std::mutex> lock(mutex_);
main_tasks_.push_back(callback);
cv_.notify_all();
}
// The Main Loop controls the chat state and websocket connection
// If other tasks need to access the websocket or chat state,
// they should use Schedule to call this function
void Application::MainLoop() {
while (true) {
std::unique_lock<std::mutex> lock(mutex_);
cv_.wait(lock, [this]() {
return !main_tasks_.empty();
});
auto task = std::move(main_tasks_.front());
main_tasks_.pop_front();
lock.unlock();
task();
}
}
void Application::SetChatState(ChatState state) {
const char* state_str[] = {
"idle",
@ -294,199 +383,27 @@ void Application::SetChatState(ChatState state) {
builtin_led.SetBlue();
builtin_led.TurnOn();
break;
case kChatStateTesting:
builtin_led.SetRed();
builtin_led.TurnOn();
break;
case kChatStateUpgrading:
builtin_led.SetGreen();
builtin_led.StartContinuousBlink(100);
break;
}
std::lock_guard<std::recursive_mutex> lock(mutex_);
if (ws_client_ && ws_client_->IsConnected()) {
cJSON* root = cJSON_CreateObject();
cJSON_AddStringToObject(root, "type", "state");
cJSON_AddStringToObject(root, "state", state_str[chat_state_]);
char* json = cJSON_PrintUnformatted(root);
std::lock_guard<std::mutex> lock(mutex_);
ws_client_->Send(json);
cJSON_Delete(root);
free(json);
}
}
void Application::StartCommunication() {
afe_config_t afe_config = {
.aec_init = false,
.se_init = true,
.vad_init = true,
.wakenet_init = false,
.voice_communication_init = true,
.voice_communication_agc_init = true,
.voice_communication_agc_gain = 10,
.vad_mode = VAD_MODE_3,
.wakenet_model_name = NULL,
.wakenet_model_name_2 = NULL,
.wakenet_mode = DET_MODE_90,
.afe_mode = SR_MODE_HIGH_PERF,
.afe_perferred_core = 0,
.afe_perferred_priority = 5,
.afe_ringbuf_size = 50,
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM,
.afe_linear_gain = 1.0,
.agc_mode = AFE_MN_PEAK_AGC_MODE_2,
.pcm_config = {
.total_ch_num = 1,
.mic_num = 1,
.ref_num = 0,
.sample_rate = CONFIG_AUDIO_INPUT_SAMPLE_RATE,
},
.debug_init = false,
.debug_hook = {{ AFE_DEBUG_HOOK_MASE_TASK_IN, NULL }, { AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL }},
.afe_ns_mode = NS_MODE_SSP,
.afe_ns_model_name = NULL,
.fixed_first_channel = true,
};
afe_communication_data_ = esp_afe_vc_v1.create_from_config(&afe_config);
xTaskCreate([](void* arg) {
Application* app = (Application*)arg;
app->AudioCommunicationTask();
vTaskDelete(NULL);
}, "audio_communication", 4096 * 2, this, 5, NULL);
}
void Application::StartDetection() {
afe_config_t afe_config = {
.aec_init = false,
.se_init = true,
.vad_init = true,
.wakenet_init = true,
.voice_communication_init = false,
.voice_communication_agc_init = false,
.voice_communication_agc_gain = 10,
.vad_mode = VAD_MODE_3,
.wakenet_model_name = wakenet_model_,
.wakenet_model_name_2 = NULL,
.wakenet_mode = DET_MODE_90,
.afe_mode = SR_MODE_HIGH_PERF,
.afe_perferred_core = 0,
.afe_perferred_priority = 5,
.afe_ringbuf_size = 50,
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM,
.afe_linear_gain = 1.0,
.agc_mode = AFE_MN_PEAK_AGC_MODE_2,
.pcm_config = {
.total_ch_num = 1,
.mic_num = 1,
.ref_num = 0,
.sample_rate = CONFIG_AUDIO_INPUT_SAMPLE_RATE
},
.debug_init = false,
.debug_hook = {{ AFE_DEBUG_HOOK_MASE_TASK_IN, NULL }, { AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL }},
.afe_ns_mode = NS_MODE_SSP,
.afe_ns_model_name = NULL,
.fixed_first_channel = true,
};
afe_detection_data_ = esp_afe_sr_v1.create_from_config(&afe_config);
xTaskCreate([](void* arg) {
Application* app = (Application*)arg;
app->AudioFeedTask();
vTaskDelete(NULL);
}, "audio_feed", 4096 * 2, this, 5, NULL);
xTaskCreate([](void* arg) {
Application* app = (Application*)arg;
app->AudioDetectionTask();
vTaskDelete(NULL);
}, "audio_detection", 4096 * 2, this, 5, NULL);
}
void Application::AudioFeedTask() {
int chunk_size = esp_afe_vc_v1.get_feed_chunksize(afe_detection_data_);
int16_t buffer[chunk_size];
ESP_LOGI(TAG, "Audio feed task started, chunk size: %d", chunk_size);
while (true) {
audio_device_.Read(buffer, chunk_size);
auto event_bits = xEventGroupGetBits(event_group_);
if (event_bits & DETECTION_RUNNING) {
esp_afe_sr_v1.feed(afe_detection_data_, buffer);
} else if (event_bits & COMMUNICATION_RUNNING) {
esp_afe_vc_v1.feed(afe_communication_data_, buffer);
}
}
vTaskDelete(NULL);
}
void Application::StoreWakeWordData(uint8_t* data, size_t size) {
// store audio data to wake_word_pcm_
auto iov = (iovec){
.iov_base = heap_caps_malloc(size, MALLOC_CAP_SPIRAM),
.iov_len = size
};
memcpy(iov.iov_base, data, size);
wake_word_pcm_.push_back(iov);
// keep about 2 seconds of data, detect duration is 32ms (sample_rate == 16000, chunksize == 512)
while (wake_word_pcm_.size() > 2000 / 32) {
heap_caps_free(wake_word_pcm_.front().iov_base);
wake_word_pcm_.pop_front();
}
}
void Application::EncodeWakeWordData() {
if (wake_word_encode_task_stack_ == nullptr) {
wake_word_encode_task_stack_ = (StackType_t*)malloc(4096 * 8);
}
wake_word_encode_task_ = xTaskCreateStatic([](void* arg) {
Application* app = (Application*)arg;
auto start_time = esp_timer_get_time();
// encode detect packets
OpusEncoder* encoder = new OpusEncoder();
encoder->Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, 1, 60);
encoder->SetComplexity(0);
app->wake_word_opus_.resize(4096 * 4);
size_t offset = 0;
for (auto& pcm: app->wake_word_pcm_) {
encoder->Encode(pcm, [app, &offset](const iovec opus) {
size_t protocol_size = sizeof(BinaryProtocol) + opus.iov_len;
if (offset + protocol_size < app->wake_word_opus_.size()) {
auto protocol = (BinaryProtocol*)(&app->wake_word_opus_[offset]);
protocol->version = htons(PROTOCOL_VERSION);
protocol->type = htons(0);
protocol->reserved = 0;
protocol->timestamp = htonl(app->audio_device_.playing() ? app->audio_device_.last_timestamp() : 0);
protocol->payload_size = htonl(opus.iov_len);
memcpy(protocol->payload, opus.iov_base, opus.iov_len);
offset += protocol_size;
}
});
heap_caps_free(pcm.iov_base);
}
app->wake_word_pcm_.clear();
app->wake_word_opus_.resize(offset);
auto end_time = esp_timer_get_time();
ESP_LOGI(TAG, "Encode wake word opus: %zu bytes in %lld ms", app->wake_word_opus_.size(), (end_time - start_time) / 1000);
xEventGroupSetBits(app->event_group_, WAKE_WORD_ENCODED);
delete encoder;
vTaskDelete(NULL);
}, "encode_detect_packets", 4096 * 8, this, 1, wake_word_encode_task_stack_, &wake_word_encode_task_buffer_);
}
void Application::SendWakeWordData() {
ws_client_->Send(wake_word_opus_.data(), wake_word_opus_.size(), true);
wake_word_opus_.clear();
}
BinaryProtocol* Application::AllocateBinaryProtocol(void* payload, size_t payload_size) {
auto last_timestamp = audio_device_.playing() ? audio_device_.last_timestamp() : 0;
BinaryProtocol* Application::AllocateBinaryProtocol(const uint8_t* payload, size_t payload_size) {
auto last_timestamp = 0;
auto protocol = (BinaryProtocol*)heap_caps_malloc(sizeof(BinaryProtocol) + payload_size, MALLOC_CAP_SPIRAM);
protocol->version = htons(PROTOCOL_VERSION);
protocol->type = htons(0);
@ -498,197 +415,34 @@ BinaryProtocol* Application::AllocateBinaryProtocol(void* payload, size_t payloa
return protocol;
}
void Application::CheckTestButton() {
if (gpio_get_level(GPIO_NUM_1) == 0) {
if (chat_state_ == kChatStateIdle) {
SetChatState(kChatStateTesting);
test_resampler_.Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
}
} else {
if (chat_state_ == kChatStateTesting) {
SetChatState(kChatStateIdle);
// 创建新线程来处理音频播放
xTaskCreate([](void* arg) {
Application* app = static_cast<Application*>(arg);
app->PlayTestAudio();
vTaskDelete(NULL);
}, "play_test_audio", 4096, this, 1, NULL);
}
}
}
void Application::PlayTestAudio() {
// 写入音频数据到扬声器
auto packet = new AudioPacket();
packet->type = kAudioPacketTypeStart;
audio_device_.QueueAudioPacket(packet);
for (auto& pcm : test_pcm_) {
packet = new AudioPacket();
packet->type = kAudioPacketTypeData;
packet->pcm.resize(test_resampler_.GetOutputSamples(pcm.iov_len / 2));
test_resampler_.Process((int16_t*)pcm.iov_base, pcm.iov_len / 2, packet->pcm.data());
audio_device_.QueueAudioPacket(packet);
heap_caps_free(pcm.iov_base);
}
// 清除测试PCM数据
test_pcm_.clear();
// 停止音频设备
packet = new AudioPacket();
packet->type = kAudioPacketTypeStop;
audio_device_.QueueAudioPacket(packet);
}
void Application::AudioDetectionTask() {
auto chunk_size = esp_afe_sr_v1.get_fetch_chunksize(afe_detection_data_);
ESP_LOGI(TAG, "Audio detection task started, chunk size: %d", chunk_size);
while (true) {
xEventGroupWaitBits(event_group_, DETECTION_RUNNING, pdFALSE, pdTRUE, portMAX_DELAY);
auto res = esp_afe_sr_v1.fetch(afe_detection_data_);
if (res == nullptr || res->ret_value == ESP_FAIL) {
ESP_LOGE(TAG, "Error in AudioDetectionTask");
if (res != nullptr) {
ESP_LOGI(TAG, "Error code: %d", res->ret_value);
}
continue;;
}
// Store the wake word data for voice recognition, like who is speaking
StoreWakeWordData((uint8_t*)res->data, res->data_size);
CheckTestButton();
if (chat_state_ == kChatStateTesting) {
auto& builtin_led = BuiltinLed::GetInstance();
if (res->vad_state == AFE_VAD_SPEECH) {
iovec iov = {
.iov_base = heap_caps_malloc(res->data_size, MALLOC_CAP_SPIRAM),
.iov_len = (size_t)res->data_size
};
memcpy(iov.iov_base, res->data, res->data_size);
test_pcm_.push_back(iov);
builtin_led.SetRed(128);
} else {
builtin_led.SetRed(32);
}
builtin_led.TurnOn();
continue;
}
if (chat_state_ == kChatStateIdle && res->wakeup_state == WAKENET_DETECTED) {
xEventGroupClearBits(event_group_, DETECTION_RUNNING);
SetChatState(kChatStateConnecting);
// Encode the wake word data and start websocket client at the same time
// They both consume a lot of time (700ms), so we can do them in parallel
EncodeWakeWordData();
StartWebSocketClient();
// Here the websocket is done, and we also wait for the wake word data to be encoded
xEventGroupWaitBits(event_group_, WAKE_WORD_ENCODED, pdTRUE, pdTRUE, portMAX_DELAY);
std::lock_guard<std::recursive_mutex> lock(mutex_);
if (ws_client_ && ws_client_->IsConnected()) {
// Send the wake word data to the server
SendWakeWordData();
// Send a ready message to indicate the server that the wake word data is sent
SetChatState(kChatStateWakeWordDetected);
opus_encoder_.ResetState();
// If connected, the hello message is already sent, so we can start communication
xEventGroupSetBits(event_group_, COMMUNICATION_RUNNING);
ESP_LOGI(TAG, "Communication running");
} else {
SetChatState(kChatStateIdle);
xEventGroupSetBits(event_group_, DETECTION_RUNNING);
}
}
}
}
void Application::AudioCommunicationTask() {
int chunk_size = esp_afe_vc_v1.get_fetch_chunksize(afe_communication_data_);
ESP_LOGI(TAG, "Audio communication task started, chunk size: %d", chunk_size);
while (true) {
xEventGroupWaitBits(event_group_, COMMUNICATION_RUNNING, pdFALSE, pdTRUE, portMAX_DELAY);
auto res = esp_afe_vc_v1.fetch(afe_communication_data_);
if (res == nullptr || res->ret_value == ESP_FAIL) {
ESP_LOGE(TAG, "Error in AudioCommunicationTask");
if (res != nullptr) {
ESP_LOGI(TAG, "Error code: %d", res->ret_value);
}
continue;
}
// Check if the websocket client is disconnected by the server
{
std::lock_guard<std::recursive_mutex> lock(mutex_);
if (ws_client_ == nullptr || !ws_client_->IsConnected()) {
xEventGroupClearBits(event_group_, COMMUNICATION_RUNNING);
if (audio_device_.playing()) {
audio_device_.Break();
}
SetChatState(kChatStateIdle);
if (ws_client_ != nullptr) {
delete ws_client_;
ws_client_ = nullptr;
}
xEventGroupSetBits(event_group_, DETECTION_RUNNING);
continue;
}
}
if (chat_state_ == kChatStateListening) {
// Update the LED state based on the VAD state
auto& builtin_led = BuiltinLed::GetInstance();
if (res->vad_state == AFE_VAD_SPEECH) {
builtin_led.SetRed(128);
} else {
builtin_led.SetRed(32);
}
builtin_led.TurnOn();
// Send audio data to server
iovec data = {
.iov_base = malloc(res->data_size),
.iov_len = (size_t)res->data_size
};
memcpy(data.iov_base, res->data, res->data_size);
xQueueSend(audio_encode_queue_, &data, portMAX_DELAY);
}
}
}
void Application::AudioEncodeTask() {
ESP_LOGI(TAG, "Audio encode task started");
while (true) {
iovec pcm;
xQueueReceive(audio_encode_queue_, &pcm, portMAX_DELAY);
// Encode audio data
opus_encoder_.Encode(pcm, [this](const iovec opus) {
auto protocol = AllocateBinaryProtocol(opus.iov_base, opus.iov_len);
std::lock_guard<std::recursive_mutex> lock(mutex_);
if (ws_client_ && ws_client_->IsConnected()) {
ws_client_->Send(protocol, sizeof(BinaryProtocol) + opus.iov_len, true);
}
heap_caps_free(protocol);
std::unique_lock<std::mutex> lock(mutex_);
cv_.wait(lock, [this]() {
return !audio_encode_queue_.empty() || !audio_decode_queue_.empty();
});
free(pcm.iov_base);
}
}
if (!audio_encode_queue_.empty()) {
auto pcm = std::move(audio_encode_queue_.front());
audio_encode_queue_.pop_front();
lock.unlock();
void Application::AudioDecodeTask() {
while (true) {
AudioPacket* packet;
xQueueReceive(audio_decode_queue_, &packet, portMAX_DELAY);
// Encode audio data
opus_encoder_.Encode(pcm, [this](const uint8_t* opus, size_t opus_size) {
auto protocol = AllocateBinaryProtocol(opus, opus_size);
Schedule([this, protocol, opus_size]() {
if (ws_client_ && ws_client_->IsConnected()) {
ws_client_->Send(protocol, sizeof(BinaryProtocol) + opus_size, true);
}
heap_caps_free(protocol);
});
});
} else if (!audio_decode_queue_.empty()) {
auto packet = std::move(audio_decode_queue_.front());
audio_decode_queue_.pop_front();
lock.unlock();
if (packet->type == kAudioPacketTypeData) {
int frame_size = opus_decode_sample_rate_ / 1000 * opus_duration_ms_;
packet->pcm.resize(frame_size);
@ -705,9 +459,74 @@ void Application::AudioDecodeTask() {
opus_resampler_.Process(packet->pcm.data(), frame_size, resampled.data());
packet->pcm = std::move(resampled);
}
std::lock_guard<std::mutex> lock(mutex_);
audio_play_queue_.push_back(packet);
cv_.notify_all();
}
}
}
void Application::HandleAudioPacket(AudioPacket* packet) {
switch (packet->type)
{
case kAudioPacketTypeData: {
if (skip_to_end_) {
break;
}
audio_device_.QueueAudioPacket(packet);
// This will block until the audio device has finished playing the audio
audio_device_.OutputData(packet->pcm);
if (break_speaking_) {
break_speaking_ = false;
skip_to_end_ = true;
// Play a silence and skip to the end
int frame_size = opus_decode_sample_rate_ / 1000 * opus_duration_ms_;
std::vector<int16_t> silence(frame_size);
bzero(silence.data(), silence.size() * sizeof(int16_t));
audio_device_.OutputData(silence);
}
break;
}
case kAudioPacketTypeStart:
Schedule([this]() {
SetChatState(kChatStateSpeaking);
});
break;
case kAudioPacketTypeStop:
skip_to_end_ = false;
Schedule([this]() {
SetChatState(kChatStateListening);
});
break;
case kAudioPacketTypeSentenceStart:
ESP_LOGI(TAG, "<< %s", packet->text.c_str());
break;
case kAudioPacketTypeSentenceEnd:
break;
default:
ESP_LOGI(TAG, "Unknown packet type: %d", packet->type);
break;
}
delete packet;
}
void Application::AudioPlayTask() {
ESP_LOGI(TAG, "Audio play task started");
while (true) {
std::unique_lock<std::mutex> lock(mutex_);
cv_.wait(lock, [this]() {
return !audio_play_queue_.empty();
});
auto packet = std::move(audio_play_queue_.front());
audio_play_queue_.pop_front();
lock.unlock();
HandleAudioPacket(packet);
}
}
@ -726,6 +545,7 @@ void Application::SetDecodeSampleRate(int sample_rate) {
void Application::StartWebSocketClient() {
if (ws_client_ != nullptr) {
ESP_LOGW(TAG, "WebSocket client already exists");
delete ws_client_;
}
@ -746,7 +566,6 @@ void Application::StartWebSocketClient() {
// keys: message type, version, wakeup_model, audio_params (format, sample_rate, channels)
std::string message = "{";
message += "\"type\":\"hello\",";
message += "\"wakeup_model\":\"" + std::string(wakenet_model_) + "\",";
message += "\"audio_params\":{";
message += "\"format\":\"opus\", \"sample_rate\":" + std::to_string(CONFIG_AUDIO_INPUT_SAMPLE_RATE) + ", \"channels\":1";
message += "}}";
@ -763,7 +582,10 @@ void Application::StartWebSocketClient() {
auto payload_size = ntohl(protocol->payload_size);
packet->opus.resize(payload_size);
memcpy(packet->opus.data(), protocol->payload, payload_size);
xQueueSend(audio_decode_queue_, &packet, portMAX_DELAY);
std::lock_guard<std::mutex> lock(mutex_);
audio_decode_queue_.push_back(packet);
cv_.notify_all();
} else {
// Parse JSON data
auto root = cJSON_Parse(data);
@ -786,7 +608,10 @@ void Application::StartWebSocketClient() {
packet->type = kAudioPacketTypeSentenceStart;
packet->text = cJSON_GetObjectItem(root, "text")->valuestring;
}
xQueueSend(audio_decode_queue_, &packet, portMAX_DELAY);
std::lock_guard<std::mutex> lock(mutex_);
audio_decode_queue_.push_back(packet);
cv_.notify_all();
} else if (strcmp(type->valuestring, "stt") == 0) {
auto text = cJSON_GetObjectItem(root, "text");
if (text != NULL) {
@ -804,6 +629,14 @@ void Application::StartWebSocketClient() {
ws_client_->OnDisconnected([this]() {
ESP_LOGI(TAG, "Websocket disconnected");
Schedule([this]() {
#ifdef CONFIG_USE_AFE_SR
audio_processor_.Stop();
#endif
delete ws_client_;
ws_client_ = nullptr;
SetChatState(kChatStateIdle);
});
});
if (!ws_client_->Connect(CONFIG_WEBSOCKET_URL)) {