v1.8.0: Audio 代码重构与低功耗优化 (#943)

* Reconstruct Audio Code

* Remove old IoT implementation

* Add MQTT-UDP documentation

* OTA升级失败时,可以继续使用
This commit is contained in:
Xiaoxia
2025-07-19 22:45:22 +08:00
committed by GitHub
parent 0621578f55
commit 3c71558a5f
173 changed files with 2099 additions and 3265 deletions

View File

@ -6,26 +6,8 @@
#include "mqtt_protocol.h"
#include "websocket_protocol.h"
#include "font_awesome_symbols.h"
#include "iot/thing_manager.h"
#include "assets/lang_config.h"
#include "mcp_server.h"
#include "audio_debugger.h"
#if CONFIG_USE_AUDIO_PROCESSOR
#include "afe_audio_processor.h"
#else
#include "no_audio_processor.h"
#endif
#if CONFIG_USE_AFE_WAKE_WORD
#include "afe_wake_word.h"
#elif CONFIG_USE_ESP_WAKE_WORD
#include "esp_wake_word.h"
#elif CONFIG_USE_CUSTOM_WAKE_WORD
#include "custom_wake_word.h"
#else
#include "no_wake_word.h"
#endif
#include <cstring>
#include <esp_log.h>
@ -53,7 +35,6 @@ static const char* const STATE_STRINGS[] = {
Application::Application() {
event_group_ = xEventGroupCreate();
background_task_ = new BackgroundTask(4096 * 7);
#if CONFIG_USE_DEVICE_AEC
aec_mode_ = kAecOnDeviceSide;
@ -63,22 +44,6 @@ Application::Application() {
aec_mode_ = kAecOff;
#endif
#if CONFIG_USE_AUDIO_PROCESSOR
audio_processor_ = std::make_unique<AfeAudioProcessor>();
#else
audio_processor_ = std::make_unique<NoAudioProcessor>();
#endif
#if CONFIG_USE_AFE_WAKE_WORD
wake_word_ = std::make_unique<AfeWakeWord>();
#elif CONFIG_USE_ESP_WAKE_WORD
wake_word_ = std::make_unique<EspWakeWord>();
#elif CONFIG_USE_CUSTOM_WAKE_WORD
wake_word_ = std::make_unique<CustomWakeWord>();
#else
wake_word_ = std::make_unique<NoWakeWord>();
#endif
esp_timer_create_args_t clock_timer_args = {
.callback = [](void* arg) {
Application* app = (Application*)arg;
@ -97,9 +62,6 @@ Application::~Application() {
esp_timer_stop(clock_timer_handle_);
esp_timer_delete(clock_timer_handle_);
}
if (background_task_ != nullptr) {
delete background_task_;
}
vEventGroupDelete(event_group_);
}
@ -108,9 +70,10 @@ void Application::CheckNewVersion(Ota& ota) {
int retry_count = 0;
int retry_delay = 10; // 初始重试延迟为10秒
auto& board = Board::GetInstance();
while (true) {
SetDeviceState(kDeviceStateActivating);
auto display = Board::GetInstance().GetDisplay();
auto display = board.GetDisplay();
display->SetStatus(Lang::Strings::CHECKING_NEW_VERSION);
if (!ota.CheckVersion()) {
@ -148,40 +111,38 @@ void Application::CheckNewVersion(Ota& ota) {
std::string message = std::string(Lang::Strings::NEW_VERSION) + ota.GetFirmwareVersion();
display->SetChatMessage("system", message.c_str());
auto& board = Board::GetInstance();
board.SetPowerSaveMode(false);
wake_word_->StopDetection();
// 预先关闭音频输出,避免升级过程有音频操作
auto codec = board.GetAudioCodec();
codec->EnableInput(false);
codec->EnableOutput(false);
{
std::lock_guard<std::mutex> lock(mutex_);
audio_decode_queue_.clear();
}
background_task_->WaitForCompletion();
delete background_task_;
background_task_ = nullptr;
audio_service_.Stop();
vTaskDelay(pdMS_TO_TICKS(1000));
ota.StartUpgrade([display](int progress, size_t speed) {
bool upgrade_success = ota.StartUpgrade([display](int progress, size_t speed) {
char buffer[64];
snprintf(buffer, sizeof(buffer), "%d%% %uKB/s", progress, speed / 1024);
display->SetChatMessage("system", buffer);
});
// If upgrade success, the device will reboot and never reach here
display->SetStatus(Lang::Strings::UPGRADE_FAILED);
ESP_LOGI(TAG, "Firmware upgrade failed...");
vTaskDelay(pdMS_TO_TICKS(3000));
Reboot();
return;
if (!upgrade_success) {
// Upgrade failed, restart audio service and continue running
ESP_LOGE(TAG, "Firmware upgrade failed, restarting audio service and continuing operation...");
audio_service_.Start(); // Restart audio service
board.SetPowerSaveMode(true); // Restore power save mode
Alert(Lang::Strings::ERROR, Lang::Strings::UPGRADE_FAILED, "sad", Lang::Sounds::P3_EXCLAMATION);
vTaskDelay(pdMS_TO_TICKS(3000));
// Continue to normal operation (don't break, just fall through)
} else {
// Upgrade success, reboot immediately
ESP_LOGI(TAG, "Firmware upgrade successful, rebooting...");
display->SetChatMessage("system", "Upgrade successful, rebooting...");
vTaskDelay(pdMS_TO_TICKS(1000)); // Brief pause to show message
Reboot();
return; // This line will never be reached after reboot
}
}
// No new version, mark the current version as valid
ota.MarkCurrentVersionValid();
if (!ota.HasActivationCode() && !ota.HasActivationChallenge()) {
xEventGroupSetBits(event_group_, CHECK_NEW_VERSION_DONE_EVENT);
xEventGroupSetBits(event_group_, MAIN_EVENT_CHECK_NEW_VERSION_DONE);
// Exit the loop if done checking new version
break;
}
@ -197,7 +158,7 @@ void Application::CheckNewVersion(Ota& ota) {
ESP_LOGI(TAG, "Activating... %d/%d", i + 1, 10);
esp_err_t err = ota.Activate();
if (err == ESP_OK) {
xEventGroupSetBits(event_group_, CHECK_NEW_VERSION_DONE_EVENT);
xEventGroupSetBits(event_group_, MAIN_EVENT_CHECK_NEW_VERSION_DONE);
break;
} else if (err == ESP_ERR_TIMEOUT) {
vTaskDelay(pdMS_TO_TICKS(3000));
@ -236,7 +197,7 @@ void Application::ShowActivationCode(const std::string& code, const std::string&
auto it = std::find_if(digit_sounds.begin(), digit_sounds.end(),
[digit](const digit_sound& ds) { return ds.digit == digit; });
if (it != digit_sounds.end()) {
PlaySound(it->sound);
audio_service_.PlaySound(it->sound);
}
}
}
@ -248,8 +209,7 @@ void Application::Alert(const char* status, const char* message, const char* emo
display->SetEmotion(emotion);
display->SetChatMessage("system", message);
if (!sound.empty()) {
ResetDecoder();
PlaySound(sound);
audio_service_.PlaySound(sound);
}
}
@ -262,59 +222,17 @@ void Application::DismissAlert() {
}
}
void Application::PlaySound(const std::string_view& sound) {
// Wait for the previous sound to finish
{
std::unique_lock<std::mutex> lock(mutex_);
audio_decode_cv_.wait(lock, [this]() {
return audio_decode_queue_.empty();
});
}
background_task_->WaitForCompletion();
const char* data = sound.data();
size_t size = sound.size();
for (const char* p = data; p < data + size; ) {
auto p3 = (BinaryProtocol3*)p;
p += sizeof(BinaryProtocol3);
auto payload_size = ntohs(p3->payload_size);
AudioStreamPacket packet;
packet.sample_rate = 16000;
packet.frame_duration = 60;
packet.payload.resize(payload_size);
memcpy(packet.payload.data(), p3->payload, payload_size);
p += payload_size;
std::lock_guard<std::mutex> lock(mutex_);
audio_decode_queue_.emplace_back(std::move(packet));
}
}
// Switch the device into the audio testing state.
// The decoder is reset first (ResetDecoder() clears the pending decode queue and
// re-enables codec output) and only then is the state changed to
// kDeviceStateAudioTesting.
void Application::EnterAudioTestingMode() {
ESP_LOGI(TAG, "Entering audio testing mode");
ResetDecoder();
SetDeviceState(kDeviceStateAudioTesting);
}
// Leave the audio testing state and hand the recorded packets to the decode queue.
// The state goes back to kDeviceStateWifiConfiguring before mutex_ is taken;
// the lock only guards the queue hand-over below.
void Application::ExitAudioTestingMode() {
ESP_LOGI(TAG, "Exiting audio testing mode");
SetDeviceState(kDeviceStateWifiConfiguring);
// Move (not copy) audio_testing_queue_ into audio_decode_queue_, replacing whatever
// the decode queue held, then wake every waiter on audio_decode_cv_.
// NOTE(review): presumably this is what plays the test recording back — confirm
// against the decode-queue consumer.
std::lock_guard<std::mutex> lock(mutex_);
audio_decode_queue_ = std::move(audio_testing_queue_);
audio_decode_cv_.notify_all();
}
void Application::ToggleChatState() {
if (device_state_ == kDeviceStateActivating) {
SetDeviceState(kDeviceStateIdle);
return;
} else if (device_state_ == kDeviceStateWifiConfiguring) {
EnterAudioTestingMode();
audio_service_.EnableAudioTesting(true);
SetDeviceState(kDeviceStateAudioTesting);
return;
} else if (device_state_ == kDeviceStateAudioTesting) {
ExitAudioTestingMode();
audio_service_.EnableAudioTesting(false);
SetDeviceState(kDeviceStateWifiConfiguring);
return;
}
@ -350,7 +268,8 @@ void Application::StartListening() {
SetDeviceState(kDeviceStateIdle);
return;
} else if (device_state_ == kDeviceStateWifiConfiguring) {
EnterAudioTestingMode();
audio_service_.EnableAudioTesting(true);
SetDeviceState(kDeviceStateAudioTesting);
return;
}
@ -380,7 +299,8 @@ void Application::StartListening() {
void Application::StopListening() {
if (device_state_ == kDeviceStateAudioTesting) {
ExitAudioTestingMode();
audio_service_.EnableAudioTesting(false);
SetDeviceState(kDeviceStateWifiConfiguring);
return;
}
@ -409,43 +329,22 @@ void Application::Start() {
/* Setup the display */
auto display = board.GetDisplay();
/* Setup the audio codec */
/* Setup the audio service */
auto codec = board.GetAudioCodec();
opus_decoder_ = std::make_unique<OpusDecoderWrapper>(codec->output_sample_rate(), 1, OPUS_FRAME_DURATION_MS);
opus_encoder_ = std::make_unique<OpusEncoderWrapper>(16000, 1, OPUS_FRAME_DURATION_MS);
opus_encoder_->SetComplexity(0);
if (aec_mode_ != kAecOff) {
ESP_LOGI(TAG, "AEC mode: %d, setting opus encoder complexity to 0", aec_mode_);
opus_encoder_->SetComplexity(0);
} else {
#if CONFIG_USE_AUDIO_PROCESSOR
ESP_LOGI(TAG, "Audio processor detected, setting opus encoder complexity to 5");
opus_encoder_->SetComplexity(5);
#else
ESP_LOGI(TAG, "Audio processor not detected, setting opus encoder complexity to 0");
opus_encoder_->SetComplexity(0);
#endif
}
audio_service_.Initialize(codec);
audio_service_.Start();
if (codec->input_sample_rate() != 16000) {
input_resampler_.Configure(codec->input_sample_rate(), 16000);
reference_resampler_.Configure(codec->input_sample_rate(), 16000);
}
codec->Start();
#if CONFIG_USE_AUDIO_PROCESSOR
xTaskCreatePinnedToCore([](void* arg) {
Application* app = (Application*)arg;
app->AudioLoop();
vTaskDelete(NULL);
}, "audio_loop", 4096 * 2, this, 8, &audio_loop_task_handle_, 1);
#else
xTaskCreate([](void* arg) {
Application* app = (Application*)arg;
app->AudioLoop();
vTaskDelete(NULL);
}, "audio_loop", 4096 * 2, this, 8, &audio_loop_task_handle_);
#endif
AudioServiceCallbacks callbacks;
callbacks.on_send_queue_available = [this]() {
xEventGroupSetBits(event_group_, MAIN_EVENT_SEND_AUDIO);
};
callbacks.on_wake_word_detected = [this](const std::string& wake_word) {
xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED);
};
callbacks.on_vad_change = [this](bool speaking) {
xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE);
};
audio_service_.SetCallbacks(callbacks);
/* Start the clock timer to update the status bar */
esp_timer_start_periodic(clock_timer_handle_, 1000000);
@ -464,9 +363,7 @@ void Application::Start() {
display->SetStatus(Lang::Strings::LOADING_PROTOCOL);
// Add MCP common tools before initializing the protocol
#if CONFIG_IOT_PROTOCOL_MCP
McpServer::GetInstance().AddCommonTools();
#endif
if (ota.HasMqttConfig()) {
protocol_ = std::make_unique<MqttProtocol>();
@ -478,13 +375,12 @@ void Application::Start() {
}
protocol_->OnNetworkError([this](const std::string& message) {
SetDeviceState(kDeviceStateIdle);
Alert(Lang::Strings::ERROR, message.c_str(), "sad", Lang::Sounds::P3_EXCLAMATION);
last_error_message_ = message;
xEventGroupSetBits(event_group_, MAIN_EVENT_ERROR);
});
protocol_->OnIncomingAudio([this](AudioStreamPacket&& packet) {
std::lock_guard<std::mutex> lock(mutex_);
if (device_state_ == kDeviceStateSpeaking && audio_decode_queue_.size() < MAX_AUDIO_PACKETS_IN_QUEUE) {
audio_decode_queue_.emplace_back(std::move(packet));
protocol_->OnIncomingAudio([this](std::unique_ptr<AudioStreamPacket> packet) {
if (device_state_ == kDeviceStateSpeaking) {
audio_service_.PushPacketToDecodeQueue(std::move(packet));
}
});
protocol_->OnAudioChannelOpened([this, codec, &board]() {
@ -493,15 +389,6 @@ void Application::Start() {
ESP_LOGW(TAG, "Server sample rate %d does not match device output sample rate %d, resampling may cause distortion",
protocol_->server_sample_rate(), codec->output_sample_rate());
}
#if CONFIG_IOT_PROTOCOL_XIAOZHI
auto& thing_manager = iot::ThingManager::GetInstance();
protocol_->SendIotDescriptors(thing_manager.GetDescriptorsJson());
std::string states;
if (thing_manager.GetStatesJson(states, false)) {
protocol_->SendIotStates(states);
}
#endif
});
protocol_->OnAudioChannelClosed([this, &board]() {
board.SetPowerSaveMode(true);
@ -525,7 +412,6 @@ void Application::Start() {
});
} else if (strcmp(state->valuestring, "stop") == 0) {
Schedule([this]() {
background_task_->WaitForCompletion();
if (device_state_ == kDeviceStateSpeaking) {
if (listening_mode_ == kListeningModeManualStop) {
SetDeviceState(kDeviceStateIdle);
@ -558,36 +444,11 @@ void Application::Start() {
display->SetEmotion(emotion_str.c_str());
});
}
#if CONFIG_RECEIVE_CUSTOM_MESSAGE
} else if (strcmp(type->valuestring, "custom") == 0) {
auto payload = cJSON_GetObjectItem(root, "payload");
ESP_LOGI(TAG, "Received custom message: %s", cJSON_PrintUnformatted(root));
if (cJSON_IsObject(payload)) {
Schedule([this, display, payload_str = std::string(cJSON_PrintUnformatted(payload))]() {
display->SetChatMessage("system", payload_str.c_str());
});
} else {
ESP_LOGW(TAG, "Invalid custom message format: missing payload");
}
#endif
#if CONFIG_IOT_PROTOCOL_MCP
} else if (strcmp(type->valuestring, "mcp") == 0) {
auto payload = cJSON_GetObjectItem(root, "payload");
if (cJSON_IsObject(payload)) {
McpServer::GetInstance().ParseMessage(payload);
}
#endif
#if CONFIG_IOT_PROTOCOL_XIAOZHI
} else if (strcmp(type->valuestring, "iot") == 0) {
auto commands = cJSON_GetObjectItem(root, "commands");
if (cJSON_IsArray(commands)) {
auto& thing_manager = iot::ThingManager::GetInstance();
for (int i = 0; i < cJSON_GetArraySize(commands); ++i) {
auto command = cJSON_GetArrayItem(commands, i);
thing_manager.Invoke(command);
}
}
#endif
} else if (strcmp(type->valuestring, "system") == 0) {
auto command = cJSON_GetObjectItem(root, "command");
if (cJSON_IsString(command)) {
@ -610,112 +471,24 @@ void Application::Start() {
} else {
ESP_LOGW(TAG, "Alert command requires status, message and emotion");
}
#if CONFIG_RECEIVE_CUSTOM_MESSAGE
} else if (strcmp(type->valuestring, "custom") == 0) {
auto payload = cJSON_GetObjectItem(root, "payload");
ESP_LOGI(TAG, "Received custom message: %s", cJSON_PrintUnformatted(root));
if (cJSON_IsObject(payload)) {
Schedule([this, display, payload_str = std::string(cJSON_PrintUnformatted(payload))]() {
display->SetChatMessage("system", payload_str.c_str());
});
} else {
ESP_LOGW(TAG, "Invalid custom message format: missing payload");
}
#endif
} else {
ESP_LOGW(TAG, "Unknown message type: %s", type->valuestring);
}
});
bool protocol_started = protocol_->Start();
audio_debugger_ = std::make_unique<AudioDebugger>();
audio_processor_->Initialize(codec);
audio_processor_->OnOutput([this](std::vector<int16_t>&& data) {
{
std::lock_guard<std::mutex> lock(mutex_);
if (audio_send_queue_.size() >= MAX_AUDIO_PACKETS_IN_QUEUE) {
ESP_LOGW(TAG, "Too many audio packets in queue, drop the newest packet");
return;
}
}
background_task_->Schedule([this, data = std::move(data)]() mutable {
opus_encoder_->Encode(std::move(data), [this](std::vector<uint8_t>&& opus) {
AudioStreamPacket packet;
packet.payload = std::move(opus);
#ifdef CONFIG_USE_SERVER_AEC
{
std::lock_guard<std::mutex> lock(timestamp_mutex_);
if (!timestamp_queue_.empty()) {
packet.timestamp = timestamp_queue_.front();
timestamp_queue_.pop_front();
} else {
packet.timestamp = 0;
}
if (timestamp_queue_.size() > 3) { // 限制队列长度3
timestamp_queue_.pop_front(); // 该包发送前先出队保持队列长度
return;
}
}
#endif
std::lock_guard<std::mutex> lock(mutex_);
if (audio_send_queue_.size() >= MAX_AUDIO_PACKETS_IN_QUEUE) {
ESP_LOGW(TAG, "Too many audio packets in queue, drop the oldest packet");
audio_send_queue_.pop_front();
}
audio_send_queue_.emplace_back(std::move(packet));
xEventGroupSetBits(event_group_, SEND_AUDIO_EVENT);
});
});
});
audio_processor_->OnVadStateChange([this](bool speaking) {
if (device_state_ == kDeviceStateListening) {
Schedule([this, speaking]() {
if (speaking) {
voice_detected_ = true;
} else {
voice_detected_ = false;
}
auto led = Board::GetInstance().GetLed();
led->OnStateChanged();
});
}
});
wake_word_->Initialize(codec);
wake_word_->OnWakeWordDetected([this](const std::string& wake_word) {
Schedule([this, &wake_word]() {
if (!protocol_) {
return;
}
if (device_state_ == kDeviceStateIdle) {
wake_word_->EncodeWakeWordData();
if (!protocol_->IsAudioChannelOpened()) {
SetDeviceState(kDeviceStateConnecting);
if (!protocol_->OpenAudioChannel()) {
wake_word_->StartDetection();
return;
}
}
ESP_LOGI(TAG, "Wake word detected: %s", wake_word.c_str());
#if CONFIG_USE_AFE_WAKE_WORD || CONFIG_USE_CUSTOM_WAKE_WORD
AudioStreamPacket packet;
// Encode and send the wake word data to the server
while (wake_word_->GetWakeWordOpus(packet.payload)) {
protocol_->SendAudio(packet);
}
// Set the chat state to wake word detected
protocol_->SendWakeWordDetected(wake_word);
#else
// Play the pop up sound to indicate the wake word is detected
// And wait 60ms to make sure the queue has been processed by audio task
ResetDecoder();
PlaySound(Lang::Sounds::P3_POPUP);
vTaskDelay(pdMS_TO_TICKS(60));
#endif
SetListeningMode(aec_mode_ == kAecOff ? kListeningModeAutoStop : kListeningModeRealtime);
} else if (device_state_ == kDeviceStateSpeaking) {
AbortSpeaking(kAbortReasonWakeWordDetected);
} else if (device_state_ == kDeviceStateActivating) {
SetDeviceState(kDeviceStateIdle);
}
});
});
wake_word_->StartDetection();
// Wait for the new version check to finish
xEventGroupWaitBits(event_group_, CHECK_NEW_VERSION_DONE_EVENT, pdTRUE, pdFALSE, portMAX_DELAY);
SetDeviceState(kDeviceStateIdle);
has_server_time_ = ota.HasServerTime();
@ -724,8 +497,7 @@ void Application::Start() {
display->ShowNotification(message.c_str());
display->SetChatMessage("system", "");
// Play the success sound to indicate the device is ready
ResetDecoder();
PlaySound(Lang::Sounds::P3_SUCCESS);
audio_service_.PlaySound(Lang::Sounds::P3_SUCCESS);
}
// Print heap stats
@ -746,19 +518,6 @@ void Application::OnClockTimer() {
// SystemInfo::PrintTaskCpuUsage(pdMS_TO_TICKS(1000));
// SystemInfo::PrintTaskList();
SystemInfo::PrintHeapStats();
// If we have synchronized server time, set the status to clock "HH:MM" if the device is idle
if (has_server_time_) {
if (device_state_ == kDeviceStateIdle) {
Schedule([this]() {
// Set status to clock "HH:MM"
time_t now = time(NULL);
char time_str[64];
strftime(time_str, sizeof(time_str), "%H:%M ", localtime(&now));
Board::GetInstance().GetDisplay()->SetStatus(time_str);
});
}
}
}
}
@ -768,7 +527,7 @@ void Application::Schedule(std::function<void()> callback) {
std::lock_guard<std::mutex> lock(mutex_);
main_tasks_.push_back(std::move(callback));
}
xEventGroupSetBits(event_group_, SCHEDULE_EVENT);
xEventGroupSetBits(event_group_, MAIN_EVENT_SCHEDULE);
}
// The Main Event Loop controls the chat state and websocket connection
@ -779,20 +538,36 @@ void Application::MainEventLoop() {
vTaskPrioritySet(NULL, 3);
while (true) {
auto bits = xEventGroupWaitBits(event_group_, SCHEDULE_EVENT | SEND_AUDIO_EVENT, pdTRUE, pdFALSE, portMAX_DELAY);
auto bits = xEventGroupWaitBits(event_group_, MAIN_EVENT_SCHEDULE |
MAIN_EVENT_SEND_AUDIO |
MAIN_EVENT_WAKE_WORD_DETECTED |
MAIN_EVENT_VAD_CHANGE |
MAIN_EVENT_ERROR, pdTRUE, pdFALSE, portMAX_DELAY);
if (bits & MAIN_EVENT_ERROR) {
SetDeviceState(kDeviceStateIdle);
Alert(Lang::Strings::ERROR, last_error_message_.c_str(), "sad", Lang::Sounds::P3_EXCLAMATION);
}
if (bits & SEND_AUDIO_EVENT) {
std::unique_lock<std::mutex> lock(mutex_);
auto packets = std::move(audio_send_queue_);
lock.unlock();
for (auto& packet : packets) {
if (!protocol_->SendAudio(packet)) {
if (bits & MAIN_EVENT_SEND_AUDIO) {
while (auto packet = audio_service_.PopPacketFromSendQueue()) {
if (!protocol_->SendAudio(std::move(packet))) {
break;
}
}
}
if (bits & SCHEDULE_EVENT) {
if (bits & MAIN_EVENT_WAKE_WORD_DETECTED) {
OnWakeWordDetected();
}
if (bits & MAIN_EVENT_VAD_CHANGE) {
if (device_state_ == kDeviceStateListening) {
auto led = Board::GetInstance().GetLed();
led->OnStateChanged();
}
}
if (bits & MAIN_EVENT_SCHEDULE) {
std::unique_lock<std::mutex> lock(mutex_);
auto tasks = std::move(main_tasks_);
lock.unlock();
@ -803,170 +578,43 @@ void Application::MainEventLoop() {
}
}
// Endless audio pump: every pass services the input side, and services the
// output side only while the codec reports its output as enabled.
// This function never returns.
void Application::AudioLoop() {
    auto audio_codec = Board::GetInstance().GetAudioCodec();
    for (;;) {
        OnAudioInput();
        if (!audio_codec->output_enabled()) {
            continue;
        }
        OnAudioOutput();
    }
}
void Application::OnAudioOutput() {
if (busy_decoding_audio_) {
void Application::OnWakeWordDetected() {
if (!protocol_) {
return;
}
auto now = std::chrono::steady_clock::now();
auto codec = Board::GetInstance().GetAudioCodec();
const int max_silence_seconds = 10;
if (device_state_ == kDeviceStateIdle) {
audio_service_.EncodeWakeWord();
std::unique_lock<std::mutex> lock(mutex_);
if (audio_decode_queue_.empty()) {
// Disable the output if there is no audio data for a long time
if (device_state_ == kDeviceStateIdle) {
auto duration = std::chrono::duration_cast<std::chrono::seconds>(now - last_output_time_).count();
if (duration > max_silence_seconds) {
codec->EnableOutput(false);
if (!protocol_->IsAudioChannelOpened()) {
SetDeviceState(kDeviceStateConnecting);
if (!protocol_->OpenAudioChannel()) {
audio_service_.EnableWakeWordDetection(true);
return;
}
}
return;
}
auto packet = std::move(audio_decode_queue_.front());
audio_decode_queue_.pop_front();
lock.unlock();
audio_decode_cv_.notify_all();
// Synchronize the sample rate and frame duration
SetDecodeSampleRate(packet.sample_rate, packet.frame_duration);
busy_decoding_audio_ = true;
if (!background_task_->Schedule([this, codec, packet = std::move(packet)]() mutable {
busy_decoding_audio_ = false;
if (aborted_) {
return;
auto wake_word = audio_service_.GetLastWakeWord();
ESP_LOGI(TAG, "Wake word detected: %s", wake_word.c_str());
#if CONFIG_USE_AFE_WAKE_WORD || CONFIG_USE_CUSTOM_WAKE_WORD
// Encode and send the wake word data to the server
while (auto packet = audio_service_.PopWakeWordPacket()) {
protocol_->SendAudio(std::move(packet));
}
std::vector<int16_t> pcm;
if (!opus_decoder_->Decode(std::move(packet.payload), pcm)) {
return;
}
// Resample if the sample rate is different
if (opus_decoder_->sample_rate() != codec->output_sample_rate()) {
int target_size = output_resampler_.GetOutputSamples(pcm.size());
std::vector<int16_t> resampled(target_size);
output_resampler_.Process(pcm.data(), pcm.size(), resampled.data());
pcm = std::move(resampled);
}
codec->OutputData(pcm);
#ifdef CONFIG_USE_SERVER_AEC
std::lock_guard<std::mutex> lock(timestamp_mutex_);
timestamp_queue_.push_back(packet.timestamp);
// Set the chat state to wake word detected
protocol_->SendWakeWordDetected(wake_word);
#else
// Play the pop up sound to indicate the wake word is detected
audio_service_.PlaySound(Lang::Sounds::P3_POPUP);
#endif
last_output_time_ = std::chrono::steady_clock::now();
})) {
busy_decoding_audio_ = false;
SetListeningMode(aec_mode_ == kAecOff ? kListeningModeAutoStop : kListeningModeRealtime);
} else if (device_state_ == kDeviceStateSpeaking) {
AbortSpeaking(kAbortReasonWakeWordDetected);
} else if (device_state_ == kDeviceStateActivating) {
SetDeviceState(kDeviceStateIdle);
}
}
/**
 * Service one round of audio input.
 *
 * Exactly one consumer is fed per call, tried in this order:
 *   1. Audio testing mode: read one Opus frame worth of 16 kHz PCM, encode it on
 *      the background task and append the packet to audio_testing_queue_. Once
 *      the queue holds AUDIO_TESTING_MAX_DURATION_MS worth of frames, testing
 *      mode is exited instead.
 *   2. Wake word detection, when it is running.
 *   3. The audio processor, when it is running.
 * If nothing consumed audio (including when a read fails), the task sleeps for
 * half an Opus frame duration so the loop does not spin.
 */
void Application::OnAudioInput() {
    static constexpr int kInputSampleRate = 16000;

    if (device_state_ == kDeviceStateAudioTesting) {
        // The encoder callback below appends to audio_testing_queue_ under mutex_
        // from the background task, so the size must be read under the same lock.
        bool recording_full = false;
        {
            std::lock_guard<std::mutex> lock(mutex_);
            recording_full = audio_testing_queue_.size() >= AUDIO_TESTING_MAX_DURATION_MS / OPUS_FRAME_DURATION_MS;
        }
        if (recording_full) {
            // Called with mutex_ released: ExitAudioTestingMode() locks mutex_ itself.
            ExitAudioTestingMode();
            return;
        }

        std::vector<int16_t> data;
        int samples = OPUS_FRAME_DURATION_MS * kInputSampleRate / 1000;
        if (ReadAudio(data, kInputSampleRate, samples)) {
            background_task_->Schedule([this, data = std::move(data)]() mutable {
                opus_encoder_->Encode(std::move(data), [this](std::vector<uint8_t>&& opus) {
                    AudioStreamPacket packet;
                    packet.payload = std::move(opus);
                    packet.frame_duration = OPUS_FRAME_DURATION_MS;
                    packet.sample_rate = kInputSampleRate;
                    std::lock_guard<std::mutex> lock(mutex_);
                    audio_testing_queue_.push_back(std::move(packet));
                });
            });
            return;
        }
        // Read failed: fall through so the other consumers (or the sleep) still run.
    }

    if (wake_word_->IsDetectionRunning()) {
        std::vector<int16_t> data;
        int samples = wake_word_->GetFeedSize();
        if (samples > 0) {
            if (ReadAudio(data, kInputSampleRate, samples)) {
                wake_word_->Feed(data);
                return;
            }
        }
    }

    if (audio_processor_->IsRunning()) {
        std::vector<int16_t> data;
        int samples = audio_processor_->GetFeedSize();
        if (samples > 0) {
            if (ReadAudio(data, kInputSampleRate, samples)) {
                audio_processor_->Feed(data);
                return;
            }
        }
    }

    // Nothing was fed this round; yield for half a frame before trying again.
    vTaskDelay(pdMS_TO_TICKS(OPUS_FRAME_DURATION_MS / 2));
}
/**
 * Read one chunk of PCM from the board's audio codec, delivered at `sample_rate`.
 *
 * When the codec's input rate differs from `sample_rate`, a proportionally
 * sized raw chunk is read and resampled. Two-channel input is split into its
 * even-indexed and odd-indexed streams (named mic and reference here), each
 * stream is resampled with its own resampler, and the results are interleaved
 * again. Any other channel count is resampled as a single stream.
 *
 * @param data        Output buffer; resized and overwritten.
 * @param sample_rate Sample rate the caller wants the data in.
 * @param samples     Number of int16 values to request at `sample_rate`.
 * @return false when codec input is disabled or the codec read fails, true otherwise.
 */
bool Application::ReadAudio(std::vector<int16_t>& data, int sample_rate, int samples) {
    auto codec = Board::GetInstance().GetAudioCodec();
    if (!codec->input_enabled()) {
        return false;
    }

    const bool needs_resampling = codec->input_sample_rate() != sample_rate;
    if (needs_resampling) {
        // Scale the request so that the resampled result covers `samples` values.
        data.resize(samples * codec->input_sample_rate() / sample_rate);
    } else {
        data.resize(samples);
    }
    if (!codec->InputData(data)) {
        return false;
    }

    if (needs_resampling && codec->input_channels() == 2) {
        // De-interleave: even indices -> mic, odd indices -> reference.
        const size_t frame_count = data.size() / 2;
        std::vector<int16_t> mic_in(frame_count);
        std::vector<int16_t> ref_in(frame_count);
        for (size_t frame = 0; frame < frame_count; ++frame) {
            mic_in[frame] = data[2 * frame];
            ref_in[frame] = data[2 * frame + 1];
        }

        std::vector<int16_t> mic_out(input_resampler_.GetOutputSamples(mic_in.size()));
        std::vector<int16_t> ref_out(reference_resampler_.GetOutputSamples(ref_in.size()));
        input_resampler_.Process(mic_in.data(), mic_in.size(), mic_out.data());
        reference_resampler_.Process(ref_in.data(), ref_in.size(), ref_out.data());

        // Re-interleave the two resampled streams back into the caller's buffer.
        data.resize(mic_out.size() + ref_out.size());
        for (size_t frame = 0; frame < mic_out.size(); ++frame) {
            data[2 * frame] = mic_out[frame];
            data[2 * frame + 1] = ref_out[frame];
        }
    } else if (needs_resampling) {
        std::vector<int16_t> resampled_out(input_resampler_.GetOutputSamples(data.size()));
        input_resampler_.Process(data.data(), data.size(), resampled_out.data());
        data = std::move(resampled_out);
    }

    // Audio debugging: forward the captured audio data to the debugger, if one exists
    if (audio_debugger_) {
        audio_debugger_->Feed(data);
    }
    return true;
}
void Application::AbortSpeaking(AbortReason reason) {
ESP_LOGI(TAG, "Abort speaking");
aborted_ = true;
@ -987,8 +635,6 @@ void Application::SetDeviceState(DeviceState state) {
auto previous_state = device_state_;
device_state_ = state;
ESP_LOGI(TAG, "STATE: %s", STATE_STRINGS[device_state_]);
// The state is changed, wait for all background tasks to finish
background_task_->WaitForCompletion();
// Send the state change event
DeviceStateEventManager::GetInstance().PostStateChangeEvent(previous_state, state);
@ -1002,51 +648,39 @@ void Application::SetDeviceState(DeviceState state) {
case kDeviceStateIdle:
display->SetStatus(Lang::Strings::STANDBY);
display->SetEmotion("neutral");
audio_processor_->Stop();
wake_word_->StartDetection();
audio_service_.EnableVoiceProcessing(false);
audio_service_.EnableWakeWordDetection(true);
break;
case kDeviceStateConnecting:
display->SetStatus(Lang::Strings::CONNECTING);
display->SetEmotion("neutral");
display->SetChatMessage("system", "");
timestamp_queue_.clear();
break;
case kDeviceStateListening:
display->SetStatus(Lang::Strings::LISTENING);
display->SetEmotion("neutral");
// Update the IoT states before sending the start listening command
#if CONFIG_IOT_PROTOCOL_XIAOZHI
UpdateIotStates();
#endif
// Make sure the audio processor is running
if (!audio_processor_->IsRunning()) {
if (!audio_service_.IsAudioProcessorRunning()) {
// Send the start listening command
protocol_->SendStartListening(listening_mode_);
if (previous_state == kDeviceStateSpeaking) {
audio_decode_queue_.clear();
audio_decode_cv_.notify_all();
// FIXME: Wait for the speaker to empty the buffer
vTaskDelay(pdMS_TO_TICKS(120));
}
opus_encoder_->ResetState();
audio_processor_->Start();
wake_word_->StopDetection();
audio_service_.EnableVoiceProcessing(true);
audio_service_.EnableWakeWordDetection(false);
}
break;
case kDeviceStateSpeaking:
display->SetStatus(Lang::Strings::SPEAKING);
if (listening_mode_ != kListeningModeRealtime) {
audio_processor_->Stop();
audio_service_.EnableVoiceProcessing(false);
// Only AFE wake word can be detected in speaking mode
#if CONFIG_USE_AFE_WAKE_WORD
wake_word_->StartDetection();
audio_service_.EnableWakeWordDetection(true);
#else
wake_word_->StopDetection();
audio_service_.EnableWakeWordDetection(false);
#endif
}
ResetDecoder();
audio_service_.ResetDecoder();
break;
default:
// Do nothing
@ -1054,41 +688,6 @@ void Application::SetDeviceState(DeviceState state) {
}
}
// Bring the playback path back to a clean state.
// Under mutex_: reset the Opus decoder state, discard every queued decode packet,
// wake all waiters on audio_decode_cv_, and restart the last-output timestamp.
// The codec output is then (re-)enabled; the lock is still held at that point
// because the guard lives until the end of the function.
void Application::ResetDecoder() {
std::lock_guard<std::mutex> lock(mutex_);
opus_decoder_->ResetState();
audio_decode_queue_.clear();
audio_decode_cv_.notify_all();
last_output_time_ = std::chrono::steady_clock::now();
auto codec = Board::GetInstance().GetAudioCodec();
codec->EnableOutput(true);
}
/**
 * Make the Opus decoder match the format of the incoming stream.
 *
 * Does nothing when the current decoder already uses `sample_rate` and
 * `frame_duration`. Otherwise the decoder is destroyed and recreated (mono),
 * and the output resampler is reconfigured whenever the new decoder's rate
 * differs from the codec's output rate.
 *
 * @param sample_rate    Sample rate of the incoming packets.
 * @param frame_duration Frame duration of the incoming packets.
 */
void Application::SetDecodeSampleRate(int sample_rate, int frame_duration) {
    const bool format_unchanged = opus_decoder_->sample_rate() == sample_rate
                               && opus_decoder_->duration_ms() == frame_duration;
    if (format_unchanged) {
        return;
    }

    // Release the old decoder before building the replacement.
    opus_decoder_.reset();
    opus_decoder_ = std::make_unique<OpusDecoderWrapper>(sample_rate, 1, frame_duration);

    auto codec = Board::GetInstance().GetAudioCodec();
    if (opus_decoder_->sample_rate() == codec->output_sample_rate()) {
        return;
    }
    ESP_LOGI(TAG, "Resampling audio from %d to %d", opus_decoder_->sample_rate(), codec->output_sample_rate());
    output_resampler_.Configure(opus_decoder_->sample_rate(), codec->output_sample_rate());
}
// Send the current IoT thing states to the server over protocol_.
// The whole body is compiled only when CONFIG_IOT_PROTOCOL_XIAOZHI is set;
// otherwise this function is a no-op.
void Application::UpdateIotStates() {
#if CONFIG_IOT_PROTOCOL_XIAOZHI
auto& thing_manager = iot::ThingManager::GetInstance();
std::string states;
// States are sent only when GetStatesJson() returns true.
// NOTE(review): the `true` argument presumably requests changed states only — confirm
// against ThingManager::GetStatesJson.
if (thing_manager.GetStatesJson(states, true)) {
protocol_->SendIotStates(states);
}
#endif
}
void Application::Reboot() {
ESP_LOGI(TAG, "Rebooting...");
esp_restart();
@ -1124,6 +723,10 @@ bool Application::CanEnterSleepMode() {
return false;
}
if (!audio_service_.IsIdle()) {
return false;
}
// Now it is safe to enter sleep mode
return true;
}
@ -1143,15 +746,15 @@ void Application::SetAecMode(AecMode mode) {
auto display = board.GetDisplay();
switch (aec_mode_) {
case kAecOff:
audio_processor_->EnableDeviceAec(false);
audio_service_.EnableDeviceAec(false);
display->ShowNotification(Lang::Strings::RTC_MODE_OFF);
break;
case kAecOnServerSide:
audio_processor_->EnableDeviceAec(false);
audio_service_.EnableDeviceAec(false);
display->ShowNotification(Lang::Strings::RTC_MODE_ON);
break;
case kAecOnDeviceSide:
audio_processor_->EnableDeviceAec(true);
audio_service_.EnableDeviceAec(true);
display->ShowNotification(Lang::Strings::RTC_MODE_ON);
break;
}
@ -1162,3 +765,7 @@ void Application::SetAecMode(AecMode mode) {
}
});
}
// Play a sound by forwarding it unchanged to the audio service.
// `sound` is the raw sound data, passed straight through to AudioService::PlaySound.
void Application::PlaySound(const std::string_view& sound) {
audio_service_.PlaySound(sound);
}