feat(audio): Use esp_audio_codec and esp_audio_effects to replace 78opus (#1632)

This commit is contained in:
majingjing123
2026-01-07 18:45:34 +08:00
committed by GitHub
parent be88719932
commit 906d819454
7 changed files with 316 additions and 106 deletions

View File

@ -1,6 +1,5 @@
#include "afe_wake_word.h"
#include "audio_service.h"
#include <esp_log.h>
#include <sstream>
@ -157,7 +156,7 @@ void AfeWakeWord::StoreWakeWordData(const int16_t* data, size_t samples) {
}
void AfeWakeWord::EncodeWakeWordData() {
const size_t stack_size = 4096 * 7;
const size_t stack_size = 4096 * 6;
wake_word_opus_.clear();
if (wake_word_encode_task_stack_ == nullptr) {
wake_word_encode_task_stack_ = (StackType_t*)heap_caps_malloc(stack_size, MALLOC_CAP_SPIRAM);
@ -172,20 +171,62 @@ void AfeWakeWord::EncodeWakeWordData() {
auto this_ = (AfeWakeWord*)arg;
{
auto start_time = esp_timer_get_time();
auto encoder = std::make_unique<OpusEncoderWrapper>(16000, 1, OPUS_FRAME_DURATION_MS);
encoder->SetComplexity(0); // 0 is the fastest
// Create encoder
esp_opus_enc_config_t opus_enc_cfg = AS_OPUS_ENC_CONFIG();
void* encoder_handle = nullptr;
auto ret = esp_opus_enc_open(&opus_enc_cfg, sizeof(esp_opus_enc_config_t), &encoder_handle);
if (encoder_handle == nullptr) {
ESP_LOGE(TAG, "Failed to create audio encoder, error code: %d", ret);
std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
this_->wake_word_opus_.push_back(std::vector<uint8_t>());
this_->wake_word_cv_.notify_all();
return;
}
// Get frame size
int frame_size = 0;
int outbuf_size = 0;
esp_opus_enc_get_frame_size(encoder_handle, &frame_size, &outbuf_size);
frame_size = frame_size / sizeof(int16_t);
// Encode all PCM data
int packets = 0;
std::vector<int16_t> in_buffer;
esp_audio_enc_in_frame_t in = {};
esp_audio_enc_out_frame_t out = {};
for (auto& pcm: this_->wake_word_pcm_) {
encoder->Encode(std::move(pcm), [this_](std::vector<uint8_t>&& opus) {
std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
this_->wake_word_opus_.emplace_back(std::move(opus));
this_->wake_word_cv_.notify_all();
});
packets++;
if (in_buffer.empty()) {
in_buffer = std::move(pcm);
} else {
in_buffer.reserve(in_buffer.size() + pcm.size());
in_buffer.insert(in_buffer.end(), pcm.begin(), pcm.end());
}
while (in_buffer.size() >= frame_size) {
std::vector<uint8_t> opus_buf(outbuf_size);
in.buffer = (uint8_t *)(in_buffer.data());
in.len = (uint32_t)(frame_size * sizeof(int16_t));
out.buffer = opus_buf.data();
out.len = outbuf_size;
out.encoded_bytes = 0;
ret = esp_opus_enc_process(encoder_handle, &in, &out);
if (ret == ESP_AUDIO_ERR_OK) {
std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
this_->wake_word_opus_.emplace_back(opus_buf.data(), opus_buf.data() + out.encoded_bytes);
this_->wake_word_cv_.notify_all();
packets++;
} else {
ESP_LOGE(TAG, "Failed to encode audio, error code: %d", ret);
}
in_buffer.erase(in_buffer.begin(), in_buffer.begin() + frame_size);
}
}
this_->wake_word_pcm_.clear();
// Close encoder
esp_opus_enc_close(encoder_handle);
auto end_time = esp_timer_get_time();
ESP_LOGI(TAG, "Encode wake word opus %d packets in %ld ms", packets, (long)((end_time - start_time) / 1000));

View File

@ -9,10 +9,8 @@
#include <esp_mn_speech_commands.h>
#include <cJSON.h>
#define TAG "CustomWakeWord"
CustomWakeWord::CustomWakeWord()
: wake_word_pcm_(), wake_word_opus_() {
}
@ -218,20 +216,56 @@ void CustomWakeWord::EncodeWakeWordData() {
auto this_ = (CustomWakeWord*)arg;
{
auto start_time = esp_timer_get_time();
auto encoder = std::make_unique<OpusEncoderWrapper>(16000, 1, OPUS_FRAME_DURATION_MS);
encoder->SetComplexity(0); // 0 is the fastest
// Create encoder
esp_opus_enc_config_t opus_enc_cfg = AS_OPUS_ENC_CONFIG();
void* encoder_handle = nullptr;
auto ret = esp_opus_enc_open(&opus_enc_cfg, sizeof(esp_opus_enc_config_t), &encoder_handle);
if (encoder_handle == nullptr) {
ESP_LOGE(TAG, "Failed to create audio encoder, error code: %d", ret);
std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
this_->wake_word_opus_.push_back(std::vector<uint8_t>());
this_->wake_word_cv_.notify_all();
return;
}
// Get frame size
int frame_size = 0;
int outbuf_size = 0;
esp_opus_enc_get_frame_size(encoder_handle, &frame_size, &outbuf_size);
frame_size = frame_size / sizeof(int16_t);
// Encode all PCM data
int packets = 0;
std::vector<int16_t> in_buffer;
esp_audio_enc_in_frame_t in = {};
esp_audio_enc_out_frame_t out = {};
for (auto& pcm: this_->wake_word_pcm_) {
encoder->Encode(std::move(pcm), [this_](std::vector<uint8_t>&& opus) {
std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
this_->wake_word_opus_.emplace_back(std::move(opus));
this_->wake_word_cv_.notify_all();
});
packets++;
if (in_buffer.empty()) {
in_buffer = std::move(pcm);
} else {
in_buffer.reserve(in_buffer.size() + pcm.size());
in_buffer.insert(in_buffer.end(), pcm.begin(), pcm.end());
}
while (in_buffer.size() >= frame_size) {
std::vector<uint8_t> opus_buf(outbuf_size);
in.buffer = (uint8_t *)(in_buffer.data());
in.len = (uint32_t)(frame_size * sizeof(int16_t));
out.buffer = opus_buf.data();
out.len = outbuf_size;
out.encoded_bytes = 0;
ret = esp_opus_enc_process(encoder_handle, &in, &out);
if (ret == ESP_AUDIO_ERR_OK) {
std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
this_->wake_word_opus_.emplace_back(opus_buf.data(), opus_buf.data() + out.encoded_bytes);
this_->wake_word_cv_.notify_all();
packets++;
} else {
ESP_LOGE(TAG, "Failed to encode audio, error code: %d", ret);
}
in_buffer.erase(in_buffer.begin(), in_buffer.begin() + frame_size);
}
}
this_->wake_word_pcm_.clear();
// Close encoder
esp_opus_enc_close(encoder_handle);
auto end_time = esp_timer_get_time();
ESP_LOGI(TAG, "Encode wake word opus %d packets in %ld ms", packets, (long)((end_time - start_time) / 1000));