Compare commits
5 Commits
cam
...
2c4329fd84
| Author | SHA1 | Date | |
|---|---|---|---|
| 2c4329fd84 | |||
| 9637e09aef | |||
| b92e6e1b07 | |||
| 33ee598c21 | |||
| 37343ac0fe |
1
.gitignore
vendored
1
.gitignore
vendored
@ -19,3 +19,4 @@ main/mmap_generate_emoji.h
|
|||||||
*.bin
|
*.bin
|
||||||
mmap_generate_*.h
|
mmap_generate_*.h
|
||||||
.clangd
|
.clangd
|
||||||
|
background_frames/
|
||||||
|
|||||||
@ -1,25 +1,24 @@
|
|||||||
#include "application.h"
|
#include "application.h"
|
||||||
|
#include "assets.h"
|
||||||
|
#include "assets/lang_config.h"
|
||||||
|
#include "audio_codec.h"
|
||||||
#include "board.h"
|
#include "board.h"
|
||||||
#include "display.h"
|
#include "display.h"
|
||||||
#include "system_info.h"
|
|
||||||
#include "audio_codec.h"
|
|
||||||
#include "mqtt_protocol.h"
|
|
||||||
#include "websocket_protocol.h"
|
|
||||||
#include "assets/lang_config.h"
|
|
||||||
#include "mcp_server.h"
|
#include "mcp_server.h"
|
||||||
#include "assets.h"
|
#include "mqtt_protocol.h"
|
||||||
#include "settings.h"
|
#include "settings.h"
|
||||||
|
#include "system_info.h"
|
||||||
|
#include "websocket_protocol.h"
|
||||||
|
|
||||||
#include <cstring>
|
|
||||||
#include <esp_log.h>
|
|
||||||
#include <cJSON.h>
|
|
||||||
#include <driver/gpio.h>
|
#include <driver/gpio.h>
|
||||||
|
#include <esp_log.h>
|
||||||
#include <arpa/inet.h>
|
#include <arpa/inet.h>
|
||||||
|
#include <cJSON.h>
|
||||||
#include <font_awesome.h>
|
#include <font_awesome.h>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
#define TAG "Application"
|
#define TAG "Application"
|
||||||
|
|
||||||
|
|
||||||
Application::Application() {
|
Application::Application() {
|
||||||
event_group_ = xEventGroupCreate();
|
event_group_ = xEventGroupCreate();
|
||||||
|
|
||||||
@ -33,16 +32,16 @@ Application::Application() {
|
|||||||
aec_mode_ = kAecOff;
|
aec_mode_ = kAecOff;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
esp_timer_create_args_t clock_timer_args = {
|
esp_timer_create_args_t clock_timer_args = {.callback =
|
||||||
.callback = [](void* arg) {
|
[](void* arg) {
|
||||||
Application* app = (Application*)arg;
|
Application* app = (Application*)arg;
|
||||||
xEventGroupSetBits(app->event_group_, MAIN_EVENT_CLOCK_TICK);
|
xEventGroupSetBits(app->event_group_,
|
||||||
|
MAIN_EVENT_CLOCK_TICK);
|
||||||
},
|
},
|
||||||
.arg = this,
|
.arg = this,
|
||||||
.dispatch_method = ESP_TIMER_TASK,
|
.dispatch_method = ESP_TIMER_TASK,
|
||||||
.name = "clock_timer",
|
.name = "clock_timer",
|
||||||
.skip_unhandled_events = true
|
.skip_unhandled_events = true};
|
||||||
};
|
|
||||||
esp_timer_create(&clock_timer_args, &clock_timer_handle_);
|
esp_timer_create(&clock_timer_args, &clock_timer_handle_);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -54,9 +53,7 @@ Application::~Application() {
|
|||||||
vEventGroupDelete(event_group_);
|
vEventGroupDelete(event_group_);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Application::SetDeviceState(DeviceState state) {
|
bool Application::SetDeviceState(DeviceState state) { return state_machine_.TransitionTo(state); }
|
||||||
return state_machine_.TransitionTo(state);
|
|
||||||
}
|
|
||||||
|
|
||||||
void Application::Initialize() {
|
void Application::Initialize() {
|
||||||
auto& board = Board::GetInstance();
|
auto& board = Board::GetInstance();
|
||||||
@ -81,6 +78,7 @@ void Application::Initialize() {
|
|||||||
xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED);
|
xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED);
|
||||||
};
|
};
|
||||||
callbacks.on_vad_change = [this](bool speaking) {
|
callbacks.on_vad_change = [this](bool speaking) {
|
||||||
|
vad_speaking_.store(speaking);
|
||||||
xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE);
|
xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE);
|
||||||
};
|
};
|
||||||
audio_service_.SetCallbacks(callbacks);
|
audio_service_.SetCallbacks(callbacks);
|
||||||
@ -141,13 +139,16 @@ void Application::Initialize() {
|
|||||||
display->SetStatus(Lang::Strings::DETECTING_MODULE);
|
display->SetStatus(Lang::Strings::DETECTING_MODULE);
|
||||||
break;
|
break;
|
||||||
case NetworkEvent::ModemErrorNoSim:
|
case NetworkEvent::ModemErrorNoSim:
|
||||||
Alert(Lang::Strings::ERROR, Lang::Strings::PIN_ERROR, "triangle_exclamation", Lang::Sounds::OGG_ERR_PIN);
|
Alert(Lang::Strings::ERROR, Lang::Strings::PIN_ERROR, "triangle_exclamation",
|
||||||
|
Lang::Sounds::OGG_ERR_PIN);
|
||||||
break;
|
break;
|
||||||
case NetworkEvent::ModemErrorRegDenied:
|
case NetworkEvent::ModemErrorRegDenied:
|
||||||
Alert(Lang::Strings::ERROR, Lang::Strings::REG_ERROR, "triangle_exclamation", Lang::Sounds::OGG_ERR_REG);
|
Alert(Lang::Strings::ERROR, Lang::Strings::REG_ERROR, "triangle_exclamation",
|
||||||
|
Lang::Sounds::OGG_ERR_REG);
|
||||||
break;
|
break;
|
||||||
case NetworkEvent::ModemErrorInitFailed:
|
case NetworkEvent::ModemErrorInitFailed:
|
||||||
Alert(Lang::Strings::ERROR, Lang::Strings::MODEM_INIT_ERROR, "triangle_exclamation", Lang::Sounds::OGG_EXCLAMATION);
|
Alert(Lang::Strings::ERROR, Lang::Strings::MODEM_INIT_ERROR, "triangle_exclamation",
|
||||||
|
Lang::Sounds::OGG_EXCLAMATION);
|
||||||
break;
|
break;
|
||||||
case NetworkEvent::ModemErrorTimeout:
|
case NetworkEvent::ModemErrorTimeout:
|
||||||
display->SetStatus(Lang::Strings::REGISTERING_NETWORK);
|
display->SetStatus(Lang::Strings::REGISTERING_NETWORK);
|
||||||
@ -167,18 +168,10 @@ void Application::Run() {
|
|||||||
vTaskPrioritySet(nullptr, 10);
|
vTaskPrioritySet(nullptr, 10);
|
||||||
|
|
||||||
const EventBits_t ALL_EVENTS =
|
const EventBits_t ALL_EVENTS =
|
||||||
MAIN_EVENT_SCHEDULE |
|
MAIN_EVENT_SCHEDULE | MAIN_EVENT_SEND_AUDIO | MAIN_EVENT_WAKE_WORD_DETECTED |
|
||||||
MAIN_EVENT_SEND_AUDIO |
|
MAIN_EVENT_VAD_CHANGE | MAIN_EVENT_CLOCK_TICK | MAIN_EVENT_ERROR |
|
||||||
MAIN_EVENT_WAKE_WORD_DETECTED |
|
MAIN_EVENT_NETWORK_CONNECTED | MAIN_EVENT_NETWORK_DISCONNECTED | MAIN_EVENT_TOGGLE_CHAT |
|
||||||
MAIN_EVENT_VAD_CHANGE |
|
MAIN_EVENT_START_LISTENING | MAIN_EVENT_STOP_LISTENING | MAIN_EVENT_ACTIVATION_DONE |
|
||||||
MAIN_EVENT_CLOCK_TICK |
|
|
||||||
MAIN_EVENT_ERROR |
|
|
||||||
MAIN_EVENT_NETWORK_CONNECTED |
|
|
||||||
MAIN_EVENT_NETWORK_DISCONNECTED |
|
|
||||||
MAIN_EVENT_TOGGLE_CHAT |
|
|
||||||
MAIN_EVENT_START_LISTENING |
|
|
||||||
MAIN_EVENT_STOP_LISTENING |
|
|
||||||
MAIN_EVENT_ACTIVATION_DONE |
|
|
||||||
MAIN_EVENT_STATE_CHANGED;
|
MAIN_EVENT_STATE_CHANGED;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
@ -186,7 +179,8 @@ void Application::Run() {
|
|||||||
|
|
||||||
if (bits & MAIN_EVENT_ERROR) {
|
if (bits & MAIN_EVENT_ERROR) {
|
||||||
SetDeviceState(kDeviceStateIdle);
|
SetDeviceState(kDeviceStateIdle);
|
||||||
Alert(Lang::Strings::ERROR, last_error_message_.c_str(), "circle_xmark", Lang::Sounds::OGG_EXCLAMATION);
|
Alert(Lang::Strings::ERROR, last_error_message_.c_str(), "circle_xmark",
|
||||||
|
Lang::Sounds::OGG_EXCLAMATION);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (bits & MAIN_EVENT_NETWORK_CONNECTED) {
|
if (bits & MAIN_EVENT_NETWORK_CONNECTED) {
|
||||||
@ -233,6 +227,13 @@ void Application::Run() {
|
|||||||
if (GetDeviceState() == kDeviceStateListening) {
|
if (GetDeviceState() == kDeviceStateListening) {
|
||||||
auto led = Board::GetInstance().GetLed();
|
auto led = Board::GetInstance().GetLed();
|
||||||
led->OnStateChanged();
|
led->OnStateChanged();
|
||||||
|
|
||||||
|
if (vad_speaking_.load() && vision_text_mode_enabled_.load() &&
|
||||||
|
!vision_frame_sent_for_current_listen_.exchange(true)) {
|
||||||
|
if (!SendCurrentVisionFrame()) {
|
||||||
|
vision_frame_sent_for_current_listen_.store(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -270,12 +271,14 @@ void Application::HandleNetworkConnectedEvent() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
xTaskCreate([](void* arg) {
|
xTaskCreate(
|
||||||
|
[](void* arg) {
|
||||||
Application* app = static_cast<Application*>(arg);
|
Application* app = static_cast<Application*>(arg);
|
||||||
app->ActivationTask();
|
app->ActivationTask();
|
||||||
app->activation_task_handle_ = nullptr;
|
app->activation_task_handle_ = nullptr;
|
||||||
vTaskDelete(NULL);
|
vTaskDelete(NULL);
|
||||||
}, "activation", 4096 * 2, this, 2, &activation_task_handle_);
|
},
|
||||||
|
"activation", 4096 * 2, this, 2, &activation_task_handle_);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update the status bar immediately to show the network state
|
// Update the status bar immediately to show the network state
|
||||||
@ -286,7 +289,8 @@ void Application::HandleNetworkConnectedEvent() {
|
|||||||
void Application::HandleNetworkDisconnectedEvent() {
|
void Application::HandleNetworkDisconnectedEvent() {
|
||||||
// Close current conversation when network disconnected
|
// Close current conversation when network disconnected
|
||||||
auto state = GetDeviceState();
|
auto state = GetDeviceState();
|
||||||
if (state == kDeviceStateConnecting || state == kDeviceStateListening || state == kDeviceStateSpeaking) {
|
if (state == kDeviceStateConnecting || state == kDeviceStateListening ||
|
||||||
|
state == kDeviceStateThinking || state == kDeviceStateSpeaking) {
|
||||||
ESP_LOGI(TAG, "Closing audio channel due to network disconnection");
|
ESP_LOGI(TAG, "Closing audio channel due to network disconnection");
|
||||||
protocol_->CloseAudioChannel();
|
protocol_->CloseAudioChannel();
|
||||||
}
|
}
|
||||||
@ -371,7 +375,8 @@ void Application::CheckAssetsVersion() {
|
|||||||
|
|
||||||
char message[256];
|
char message[256];
|
||||||
snprintf(message, sizeof(message), Lang::Strings::FOUND_NEW_ASSETS, download_url.c_str());
|
snprintf(message, sizeof(message), Lang::Strings::FOUND_NEW_ASSETS, download_url.c_str());
|
||||||
Alert(Lang::Strings::LOADING_ASSETS, message, "cloud_arrow_down", Lang::Sounds::OGG_UPGRADE);
|
Alert(Lang::Strings::LOADING_ASSETS, message, "cloud_arrow_down",
|
||||||
|
Lang::Sounds::OGG_UPGRADE);
|
||||||
|
|
||||||
// Wait for the audio service to be idle for 3 seconds
|
// Wait for the audio service to be idle for 3 seconds
|
||||||
vTaskDelay(pdMS_TO_TICKS(3000));
|
vTaskDelay(pdMS_TO_TICKS(3000));
|
||||||
@ -379,7 +384,8 @@ void Application::CheckAssetsVersion() {
|
|||||||
board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE);
|
board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE);
|
||||||
display->SetChatMessage("system", Lang::Strings::PLEASE_WAIT);
|
display->SetChatMessage("system", Lang::Strings::PLEASE_WAIT);
|
||||||
|
|
||||||
bool success = assets.Download(download_url, [this, display](int progress, size_t speed) -> void {
|
bool success =
|
||||||
|
assets.Download(download_url, [this, display](int progress, size_t speed) -> void {
|
||||||
char buffer[32];
|
char buffer[32];
|
||||||
snprintf(buffer, sizeof(buffer), "%d%% %uKB/s", progress, speed / 1024);
|
snprintf(buffer, sizeof(buffer), "%d%% %uKB/s", progress, speed / 1024);
|
||||||
Schedule([display, message = std::string(buffer)]() {
|
Schedule([display, message = std::string(buffer)]() {
|
||||||
@ -391,7 +397,8 @@ void Application::CheckAssetsVersion() {
|
|||||||
vTaskDelay(pdMS_TO_TICKS(1000));
|
vTaskDelay(pdMS_TO_TICKS(1000));
|
||||||
|
|
||||||
if (!success) {
|
if (!success) {
|
||||||
Alert(Lang::Strings::ERROR, Lang::Strings::DOWNLOAD_ASSETS_FAILED, "circle_xmark", Lang::Sounds::OGG_EXCLAMATION);
|
Alert(Lang::Strings::ERROR, Lang::Strings::DOWNLOAD_ASSETS_FAILED, "circle_xmark",
|
||||||
|
Lang::Sounds::OGG_EXCLAMATION);
|
||||||
vTaskDelay(pdMS_TO_TICKS(2000));
|
vTaskDelay(pdMS_TO_TICKS(2000));
|
||||||
SetDeviceState(kDeviceStateActivating);
|
SetDeviceState(kDeviceStateActivating);
|
||||||
return;
|
return;
|
||||||
@ -423,12 +430,15 @@ void Application::CheckNewVersion() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
char error_message[128];
|
char error_message[128];
|
||||||
snprintf(error_message, sizeof(error_message), "code=%d, url=%s", err, ota_->GetCheckVersionUrl().c_str());
|
snprintf(error_message, sizeof(error_message), "code=%d, url=%s", err,
|
||||||
|
ota_->GetCheckVersionUrl().c_str());
|
||||||
char buffer[256];
|
char buffer[256];
|
||||||
snprintf(buffer, sizeof(buffer), Lang::Strings::CHECK_NEW_VERSION_FAILED, retry_delay, error_message);
|
snprintf(buffer, sizeof(buffer), Lang::Strings::CHECK_NEW_VERSION_FAILED, retry_delay,
|
||||||
|
error_message);
|
||||||
Alert(Lang::Strings::ERROR, buffer, "cloud_slash", Lang::Sounds::OGG_EXCLAMATION);
|
Alert(Lang::Strings::ERROR, buffer, "cloud_slash", Lang::Sounds::OGG_EXCLAMATION);
|
||||||
|
|
||||||
ESP_LOGW(TAG, "Check new version failed, retry in %d seconds (%d/%d)", retry_delay, retry_count, MAX_RETRY);
|
ESP_LOGW(TAG, "Check new version failed, retry in %d seconds (%d/%d)", retry_delay,
|
||||||
|
retry_count, MAX_RETRY);
|
||||||
for (int i = 0; i < retry_delay; i++) {
|
for (int i = 0; i < retry_delay; i++) {
|
||||||
vTaskDelay(pdMS_TO_TICKS(1000));
|
vTaskDelay(pdMS_TO_TICKS(1000));
|
||||||
if (GetDeviceState() == kDeviceStateIdle) {
|
if (GetDeviceState() == kDeviceStateIdle) {
|
||||||
@ -499,9 +509,7 @@ void Application::InitializeProtocol() {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
protocol_->OnConnected([this]() {
|
protocol_->OnConnected([this]() { DismissAlert(); });
|
||||||
DismissAlert();
|
|
||||||
});
|
|
||||||
|
|
||||||
protocol_->OnNetworkError([this](const std::string& message) {
|
protocol_->OnNetworkError([this](const std::string& message) {
|
||||||
last_error_message_ = message;
|
last_error_message_ = message;
|
||||||
@ -509,7 +517,7 @@ void Application::InitializeProtocol() {
|
|||||||
});
|
});
|
||||||
|
|
||||||
protocol_->OnIncomingAudio([this](std::unique_ptr<AudioStreamPacket> packet) {
|
protocol_->OnIncomingAudio([this](std::unique_ptr<AudioStreamPacket> packet) {
|
||||||
if (GetDeviceState() == kDeviceStateSpeaking) {
|
if (accepting_tts_audio_.load() || GetDeviceState() == kDeviceStateSpeaking) {
|
||||||
audio_service_.PushPacketToDecodeQueue(std::move(packet));
|
audio_service_.PushPacketToDecodeQueue(std::move(packet));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -517,14 +525,20 @@ void Application::InitializeProtocol() {
|
|||||||
protocol_->OnAudioChannelOpened([this, codec, &board]() {
|
protocol_->OnAudioChannelOpened([this, codec, &board]() {
|
||||||
board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE);
|
board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE);
|
||||||
if (protocol_->server_sample_rate() != codec->output_sample_rate()) {
|
if (protocol_->server_sample_rate() != codec->output_sample_rate()) {
|
||||||
ESP_LOGW(TAG, "Server sample rate %d does not match device output sample rate %d, resampling may cause distortion",
|
ESP_LOGW(TAG,
|
||||||
|
"Server sample rate %d does not match device output sample rate %d, "
|
||||||
|
"resampling may cause distortion",
|
||||||
protocol_->server_sample_rate(), codec->output_sample_rate());
|
protocol_->server_sample_rate(), codec->output_sample_rate());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
protocol_->OnAudioChannelClosed([this, &board]() {
|
protocol_->OnAudioChannelClosed([this, &board]() {
|
||||||
board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);
|
board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);
|
||||||
|
accepting_tts_audio_.store(false);
|
||||||
Schedule([this]() {
|
Schedule([this]() {
|
||||||
|
if (GetDeviceState() == kDeviceStateConnecting) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
auto display = Board::GetInstance().GetDisplay();
|
auto display = Board::GetInstance().GetDisplay();
|
||||||
display->SetChatMessage("system", "");
|
display->SetChatMessage("system", "");
|
||||||
SetDeviceState(kDeviceStateIdle);
|
SetDeviceState(kDeviceStateIdle);
|
||||||
@ -536,14 +550,20 @@ void Application::InitializeProtocol() {
|
|||||||
auto type = cJSON_GetObjectItem(root, "type");
|
auto type = cJSON_GetObjectItem(root, "type");
|
||||||
if (strcmp(type->valuestring, "tts") == 0) {
|
if (strcmp(type->valuestring, "tts") == 0) {
|
||||||
auto state = cJSON_GetObjectItem(root, "state");
|
auto state = cJSON_GetObjectItem(root, "state");
|
||||||
if (strcmp(state->valuestring, "start") == 0) {
|
if (strcmp(state->valuestring, "thinking") == 0) {
|
||||||
|
Schedule([this]() { SetDeviceState(kDeviceStateThinking); });
|
||||||
|
} else if (strcmp(state->valuestring, "start") == 0) {
|
||||||
|
audio_service_.ResetDecoder();
|
||||||
|
accepting_tts_audio_.store(true);
|
||||||
Schedule([this]() {
|
Schedule([this]() {
|
||||||
aborted_ = false;
|
aborted_ = false;
|
||||||
SetDeviceState(kDeviceStateSpeaking);
|
SetDeviceState(kDeviceStateSpeaking);
|
||||||
});
|
});
|
||||||
} else if (strcmp(state->valuestring, "stop") == 0) {
|
} else if (strcmp(state->valuestring, "stop") == 0) {
|
||||||
|
accepting_tts_audio_.store(false);
|
||||||
Schedule([this]() {
|
Schedule([this]() {
|
||||||
if (GetDeviceState() == kDeviceStateSpeaking) {
|
auto state = GetDeviceState();
|
||||||
|
if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
|
||||||
if (listening_mode_ == kListeningModeManualStop) {
|
if (listening_mode_ == kListeningModeManualStop) {
|
||||||
SetDeviceState(kDeviceStateIdle);
|
SetDeviceState(kDeviceStateIdle);
|
||||||
} else {
|
} else {
|
||||||
@ -586,9 +606,7 @@ void Application::InitializeProtocol() {
|
|||||||
ESP_LOGI(TAG, "System command: %s", command->valuestring);
|
ESP_LOGI(TAG, "System command: %s", command->valuestring);
|
||||||
if (strcmp(command->valuestring, "reboot") == 0) {
|
if (strcmp(command->valuestring, "reboot") == 0) {
|
||||||
// Do a reboot if user requests a OTA update
|
// Do a reboot if user requests a OTA update
|
||||||
Schedule([this]() {
|
Schedule([this]() { Reboot(); });
|
||||||
Reboot();
|
|
||||||
});
|
|
||||||
} else {
|
} else {
|
||||||
ESP_LOGW(TAG, "Unknown system command: %s", command->valuestring);
|
ESP_LOGW(TAG, "Unknown system command: %s", command->valuestring);
|
||||||
}
|
}
|
||||||
@ -598,7 +616,8 @@ void Application::InitializeProtocol() {
|
|||||||
auto message = cJSON_GetObjectItem(root, "message");
|
auto message = cJSON_GetObjectItem(root, "message");
|
||||||
auto emotion = cJSON_GetObjectItem(root, "emotion");
|
auto emotion = cJSON_GetObjectItem(root, "emotion");
|
||||||
if (cJSON_IsString(status) && cJSON_IsString(message) && cJSON_IsString(emotion)) {
|
if (cJSON_IsString(status) && cJSON_IsString(message) && cJSON_IsString(emotion)) {
|
||||||
Alert(status->valuestring, message->valuestring, emotion->valuestring, Lang::Sounds::OGG_VIBRATION);
|
Alert(status->valuestring, message->valuestring, emotion->valuestring,
|
||||||
|
Lang::Sounds::OGG_VIBRATION);
|
||||||
} else {
|
} else {
|
||||||
ESP_LOGW(TAG, "Alert command requires status, message and emotion");
|
ESP_LOGW(TAG, "Alert command requires status, message and emotion");
|
||||||
}
|
}
|
||||||
@ -607,7 +626,8 @@ void Application::InitializeProtocol() {
|
|||||||
auto payload = cJSON_GetObjectItem(root, "payload");
|
auto payload = cJSON_GetObjectItem(root, "payload");
|
||||||
ESP_LOGI(TAG, "Received custom message: %s", cJSON_PrintUnformatted(root));
|
ESP_LOGI(TAG, "Received custom message: %s", cJSON_PrintUnformatted(root));
|
||||||
if (cJSON_IsObject(payload)) {
|
if (cJSON_IsObject(payload)) {
|
||||||
Schedule([this, display, payload_str = std::string(cJSON_PrintUnformatted(payload))]() {
|
Schedule(
|
||||||
|
[this, display, payload_str = std::string(cJSON_PrintUnformatted(payload))]() {
|
||||||
display->SetChatMessage("system", payload_str.c_str());
|
display->SetChatMessage("system", payload_str.c_str());
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
@ -627,18 +647,12 @@ void Application::ShowActivationCode(const std::string& code, const std::string&
|
|||||||
char digit;
|
char digit;
|
||||||
const std::string_view& sound;
|
const std::string_view& sound;
|
||||||
};
|
};
|
||||||
static const std::array<digit_sound, 10> digit_sounds{{
|
static const std::array<digit_sound, 10> digit_sounds{
|
||||||
digit_sound{'0', Lang::Sounds::OGG_0},
|
{digit_sound{'0', Lang::Sounds::OGG_0}, digit_sound{'1', Lang::Sounds::OGG_1},
|
||||||
digit_sound{'1', Lang::Sounds::OGG_1},
|
digit_sound{'2', Lang::Sounds::OGG_2}, digit_sound{'3', Lang::Sounds::OGG_3},
|
||||||
digit_sound{'2', Lang::Sounds::OGG_2},
|
digit_sound{'4', Lang::Sounds::OGG_4}, digit_sound{'5', Lang::Sounds::OGG_5},
|
||||||
digit_sound{'3', Lang::Sounds::OGG_3},
|
digit_sound{'6', Lang::Sounds::OGG_6}, digit_sound{'7', Lang::Sounds::OGG_7},
|
||||||
digit_sound{'4', Lang::Sounds::OGG_4},
|
digit_sound{'8', Lang::Sounds::OGG_8}, digit_sound{'9', Lang::Sounds::OGG_9}}};
|
||||||
digit_sound{'5', Lang::Sounds::OGG_5},
|
|
||||||
digit_sound{'6', Lang::Sounds::OGG_6},
|
|
||||||
digit_sound{'7', Lang::Sounds::OGG_7},
|
|
||||||
digit_sound{'8', Lang::Sounds::OGG_8},
|
|
||||||
digit_sound{'9', Lang::Sounds::OGG_9}
|
|
||||||
}};
|
|
||||||
|
|
||||||
// This sentence uses 9KB of SRAM, so we need to wait for it to finish
|
// This sentence uses 9KB of SRAM, so we need to wait for it to finish
|
||||||
Alert(Lang::Strings::ACTIVATION, message.c_str(), "link", Lang::Sounds::OGG_ACTIVATION);
|
Alert(Lang::Strings::ACTIVATION, message.c_str(), "link", Lang::Sounds::OGG_ACTIVATION);
|
||||||
@ -652,7 +666,8 @@ void Application::ShowActivationCode(const std::string& code, const std::string&
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Application::Alert(const char* status, const char* message, const char* emotion, const std::string_view& sound) {
|
void Application::Alert(const char* status, const char* message, const char* emotion,
|
||||||
|
const std::string_view& sound) {
|
||||||
ESP_LOGW(TAG, "Alert [%s] %s: %s", emotion, status, message);
|
ESP_LOGW(TAG, "Alert [%s] %s: %s", emotion, status, message);
|
||||||
auto display = Board::GetInstance().GetDisplay();
|
auto display = Board::GetInstance().GetDisplay();
|
||||||
display->SetStatus(status);
|
display->SetStatus(status);
|
||||||
@ -672,30 +687,43 @@ void Application::DismissAlert() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Application::ToggleChatState() {
|
void Application::ToggleChatState() { ToggleChatStateForMode(kChatAgentModeNormal, false); }
|
||||||
vision_text_mode_enabled_.store(false);
|
|
||||||
|
void Application::ToggleChatStateWithVision() {
|
||||||
|
ToggleChatStateForMode(kChatAgentModeNormal, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Application::ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled) {
|
||||||
|
chat_agent_mode_.store(agent_mode);
|
||||||
|
vision_text_mode_enabled_.store(vision_enabled);
|
||||||
|
vision_frame_sent_for_current_listen_.store(false);
|
||||||
xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
|
xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Application::ToggleChatStateWithVision() {
|
bool Application::IsVisionTextModeEnabled() const { return vision_text_mode_enabled_.load(); }
|
||||||
vision_text_mode_enabled_.store(true);
|
|
||||||
xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
|
const char* Application::GetChatAgentModeName() const {
|
||||||
|
return chat_agent_mode_.load() == kChatAgentModeBeaver ? "beaver" : "normal";
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* Application::GetChatModeName() const {
|
||||||
|
bool vision_enabled = vision_text_mode_enabled_.load();
|
||||||
|
if (chat_agent_mode_.load() == kChatAgentModeBeaver) {
|
||||||
|
return vision_enabled ? "vision-beaver" : "beaver";
|
||||||
|
}
|
||||||
|
return vision_enabled ? "vision-normal" : "normal";
|
||||||
}
|
}
|
||||||
|
|
||||||
void Application::StartListening() {
|
void Application::StartListening() {
|
||||||
vision_text_mode_enabled_.store(false);
|
vision_text_mode_enabled_.store(false);
|
||||||
|
vision_frame_sent_for_current_listen_.store(false);
|
||||||
xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING);
|
xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Application::StopListening() {
|
void Application::StopListening() { xEventGroupSetBits(event_group_, MAIN_EVENT_STOP_LISTENING); }
|
||||||
xEventGroupSetBits(event_group_, MAIN_EVENT_STOP_LISTENING);
|
|
||||||
}
|
|
||||||
|
|
||||||
void Application::HandleToggleChatEvent() {
|
void Application::HandleToggleChatEvent() {
|
||||||
auto state = GetDeviceState();
|
auto state = GetDeviceState();
|
||||||
if (state != kDeviceStateIdle) {
|
|
||||||
vision_text_mode_enabled_.store(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (state == kDeviceStateActivating) {
|
if (state == kDeviceStateActivating) {
|
||||||
SetDeviceState(kDeviceStateIdle);
|
SetDeviceState(kDeviceStateIdle);
|
||||||
@ -717,17 +745,22 @@ void Application::HandleToggleChatEvent() {
|
|||||||
|
|
||||||
if (state == kDeviceStateIdle) {
|
if (state == kDeviceStateIdle) {
|
||||||
ListeningMode mode = GetDefaultListeningMode();
|
ListeningMode mode = GetDefaultListeningMode();
|
||||||
if (!protocol_->IsAudioChannelOpened()) {
|
bool agent_mode_changed = chat_agent_mode_.load() != active_chat_agent_mode_.load();
|
||||||
|
bool vision_mode_changed =
|
||||||
|
vision_text_mode_enabled_.load() != active_vision_text_mode_enabled_.load();
|
||||||
|
if (!protocol_->IsAudioChannelOpened() || agent_mode_changed || vision_mode_changed) {
|
||||||
|
if (protocol_->IsAudioChannelOpened()) {
|
||||||
|
protocol_->CloseAudioChannel();
|
||||||
|
}
|
||||||
SetDeviceState(kDeviceStateConnecting);
|
SetDeviceState(kDeviceStateConnecting);
|
||||||
// Schedule to let the state change be processed first (UI update)
|
// Schedule to let the state change be processed first (UI update)
|
||||||
Schedule([this, mode]() {
|
Schedule([this, mode]() { ContinueOpenAudioChannel(mode); });
|
||||||
ContinueOpenAudioChannel(mode);
|
|
||||||
});
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
SetListeningMode(mode);
|
SetListeningMode(mode);
|
||||||
} else if (state == kDeviceStateSpeaking) {
|
} else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
|
||||||
AbortSpeaking(kAbortReasonNone);
|
AbortSpeaking(kAbortReasonNone);
|
||||||
|
SetListeningMode(GetDefaultListeningMode());
|
||||||
} else if (state == kDeviceStateListening) {
|
} else if (state == kDeviceStateListening) {
|
||||||
protocol_->CloseAudioChannel();
|
protocol_->CloseAudioChannel();
|
||||||
}
|
}
|
||||||
@ -749,6 +782,8 @@ void Application::ContinueOpenAudioChannel(ListeningMode mode) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
active_chat_agent_mode_.store(chat_agent_mode_.load());
|
||||||
|
active_vision_text_mode_enabled_.store(vision_text_mode_enabled_.load());
|
||||||
SetListeningMode(mode);
|
SetListeningMode(mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -773,13 +808,11 @@ void Application::HandleStartListeningEvent() {
|
|||||||
if (!protocol_->IsAudioChannelOpened()) {
|
if (!protocol_->IsAudioChannelOpened()) {
|
||||||
SetDeviceState(kDeviceStateConnecting);
|
SetDeviceState(kDeviceStateConnecting);
|
||||||
// Schedule to let the state change be processed first (UI update)
|
// Schedule to let the state change be processed first (UI update)
|
||||||
Schedule([this]() {
|
Schedule([this]() { ContinueOpenAudioChannel(kListeningModeManualStop); });
|
||||||
ContinueOpenAudioChannel(kListeningModeManualStop);
|
|
||||||
});
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
SetListeningMode(kListeningModeManualStop);
|
SetListeningMode(kListeningModeManualStop);
|
||||||
} else if (state == kDeviceStateSpeaking) {
|
} else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
|
||||||
AbortSpeaking(kAbortReasonNone);
|
AbortSpeaking(kAbortReasonNone);
|
||||||
SetListeningMode(kListeningModeManualStop);
|
SetListeningMode(kListeningModeManualStop);
|
||||||
}
|
}
|
||||||
@ -817,17 +850,14 @@ void Application::HandleWakeWordDetectedEvent() {
|
|||||||
SetDeviceState(kDeviceStateConnecting);
|
SetDeviceState(kDeviceStateConnecting);
|
||||||
// Schedule to let the state change be processed first (UI update),
|
// Schedule to let the state change be processed first (UI update),
|
||||||
// then continue with OpenAudioChannel which may block for ~1 second
|
// then continue with OpenAudioChannel which may block for ~1 second
|
||||||
Schedule([this, wake_word]() {
|
Schedule([this, wake_word]() { ContinueWakeWordInvoke(wake_word); });
|
||||||
ContinueWakeWordInvoke(wake_word);
|
|
||||||
});
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// Channel already opened, continue directly
|
// Channel already opened, continue directly
|
||||||
ContinueWakeWordInvoke(wake_word);
|
ContinueWakeWordInvoke(wake_word);
|
||||||
} else if (state == kDeviceStateSpeaking || state == kDeviceStateListening) {
|
} else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking ||
|
||||||
|
state == kDeviceStateListening) {
|
||||||
AbortSpeaking(kAbortReasonWakeWordDetected);
|
AbortSpeaking(kAbortReasonWakeWordDetected);
|
||||||
// Clear send queue to avoid sending residues to server
|
|
||||||
while (audio_service_.PopPacketFromSendQueue());
|
|
||||||
|
|
||||||
if (state == kDeviceStateListening) {
|
if (state == kDeviceStateListening) {
|
||||||
protocol_->SendStartListening(GetDefaultListeningMode());
|
protocol_->SendStartListening(GetDefaultListeningMode());
|
||||||
@ -892,6 +922,7 @@ void Application::HandleStateChangedEvent() {
|
|||||||
switch (new_state) {
|
switch (new_state) {
|
||||||
case kDeviceStateUnknown:
|
case kDeviceStateUnknown:
|
||||||
case kDeviceStateIdle:
|
case kDeviceStateIdle:
|
||||||
|
vision_frame_sent_for_current_listen_.store(false);
|
||||||
display->SetStatus(Lang::Strings::STANDBY);
|
display->SetStatus(Lang::Strings::STANDBY);
|
||||||
display->ClearChatMessages(); // Clear messages first
|
display->ClearChatMessages(); // Clear messages first
|
||||||
display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count)
|
display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count)
|
||||||
@ -904,24 +935,19 @@ void Application::HandleStateChangedEvent() {
|
|||||||
display->SetChatMessage("system", "");
|
display->SetChatMessage("system", "");
|
||||||
break;
|
break;
|
||||||
case kDeviceStateListening:
|
case kDeviceStateListening:
|
||||||
|
vad_speaking_.store(false);
|
||||||
|
vision_frame_sent_for_current_listen_.store(false);
|
||||||
display->SetStatus(Lang::Strings::LISTENING);
|
display->SetStatus(Lang::Strings::LISTENING);
|
||||||
display->SetEmotion("neutral");
|
display->SetEmotion("neutral");
|
||||||
|
|
||||||
// Make sure the audio processor is running
|
// Re-entering listening after an interrupt must restart the capture path even if the
|
||||||
if (play_popup_on_listening_ || !audio_service_.IsAudioProcessorRunning()) {
|
// processor task is still marked running, otherwise realtime mode can show Listening
|
||||||
// For auto mode, wait for playback queue to be empty before enabling voice processing
|
// while no fresh mic frames are sent.
|
||||||
// This prevents audio truncation when STOP arrives late due to network jitter
|
|
||||||
if (listening_mode_ == kListeningModeAutoStop) {
|
if (listening_mode_ == kListeningModeAutoStop) {
|
||||||
audio_service_.WaitForPlaybackQueueEmpty();
|
audio_service_.WaitForPlaybackQueueEmpty();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (vision_text_mode_enabled_.load()) {
|
|
||||||
SendCurrentVisionFrame();
|
|
||||||
}
|
|
||||||
// Send the start listening command
|
|
||||||
protocol_->SendStartListening(listening_mode_);
|
protocol_->SendStartListening(listening_mode_);
|
||||||
audio_service_.EnableVoiceProcessing(true);
|
audio_service_.EnableVoiceProcessing(true);
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef CONFIG_WAKE_WORD_DETECTION_IN_LISTENING
|
#ifdef CONFIG_WAKE_WORD_DETECTION_IN_LISTENING
|
||||||
// Enable wake word detection in listening mode (configured via Kconfig)
|
// Enable wake word detection in listening mode (configured via Kconfig)
|
||||||
@ -937,6 +963,16 @@ void Application::HandleStateChangedEvent() {
|
|||||||
audio_service_.PlaySound(Lang::Sounds::OGG_POPUP);
|
audio_service_.PlaySound(Lang::Sounds::OGG_POPUP);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case kDeviceStateThinking:
|
||||||
|
vad_speaking_.store(false);
|
||||||
|
display->SetStatus(Lang::Strings::THINKING);
|
||||||
|
display->SetEmotion("thinking");
|
||||||
|
|
||||||
|
if (listening_mode_ != kListeningModeRealtime) {
|
||||||
|
audio_service_.EnableVoiceProcessing(false);
|
||||||
|
audio_service_.EnableWakeWordDetection(audio_service_.IsAfeWakeWord());
|
||||||
|
}
|
||||||
|
break;
|
||||||
case kDeviceStateSpeaking:
|
case kDeviceStateSpeaking:
|
||||||
display->SetStatus(Lang::Strings::SPEAKING);
|
display->SetStatus(Lang::Strings::SPEAKING);
|
||||||
|
|
||||||
@ -945,7 +981,9 @@ void Application::HandleStateChangedEvent() {
|
|||||||
// Only AFE wake word can be detected in speaking mode
|
// Only AFE wake word can be detected in speaking mode
|
||||||
audio_service_.EnableWakeWordDetection(audio_service_.IsAfeWakeWord());
|
audio_service_.EnableWakeWordDetection(audio_service_.IsAfeWakeWord());
|
||||||
}
|
}
|
||||||
|
if (!accepting_tts_audio_.load()) {
|
||||||
audio_service_.ResetDecoder();
|
audio_service_.ResetDecoder();
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case kDeviceStateWifiConfiguring:
|
case kDeviceStateWifiConfiguring:
|
||||||
audio_service_.EnableVoiceProcessing(false);
|
audio_service_.EnableVoiceProcessing(false);
|
||||||
@ -957,24 +995,25 @@ void Application::HandleStateChangedEvent() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Application::SendCurrentVisionFrame() {
|
bool Application::SendCurrentVisionFrame() {
|
||||||
if (!protocol_ || !protocol_->IsAudioChannelOpened()) {
|
if (!protocol_ || !protocol_->IsAudioChannelOpened()) {
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto camera = Board::GetInstance().GetCamera();
|
auto camera = Board::GetInstance().GetCamera();
|
||||||
if (camera == nullptr) {
|
if (camera == nullptr) {
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string jpeg_data;
|
std::string jpeg_data;
|
||||||
if (!camera->CaptureToJpeg(jpeg_data, false)) {
|
if (!camera->CaptureToJpeg(jpeg_data, true)) {
|
||||||
ESP_LOGW(TAG, "Failed to capture vision frame");
|
ESP_LOGW(TAG, "Failed to capture vision frame");
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
protocol_->SendVisionFrame(jpeg_data);
|
protocol_->SendVisionFrame(jpeg_data);
|
||||||
ESP_LOGI(TAG, "Sent vision frame, size=%u bytes", static_cast<unsigned>(jpeg_data.size()));
|
ESP_LOGI(TAG, "Sent vision frame, size=%u bytes", static_cast<unsigned>(jpeg_data.size()));
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Application::Schedule(std::function<void()>&& callback) {
|
void Application::Schedule(std::function<void()>&& callback) {
|
||||||
@ -988,6 +1027,8 @@ void Application::Schedule(std::function<void()>&& callback) {
|
|||||||
void Application::AbortSpeaking(AbortReason reason) {
|
void Application::AbortSpeaking(AbortReason reason) {
|
||||||
ESP_LOGI(TAG, "Abort speaking");
|
ESP_LOGI(TAG, "Abort speaking");
|
||||||
aborted_ = true;
|
aborted_ = true;
|
||||||
|
accepting_tts_audio_.store(false);
|
||||||
|
audio_service_.ResetDecoder();
|
||||||
if (protocol_) {
|
if (protocol_) {
|
||||||
protocol_->SendAbortSpeaking(reason);
|
protocol_->SendAbortSpeaking(reason);
|
||||||
}
|
}
|
||||||
@ -995,6 +1036,8 @@ void Application::AbortSpeaking(AbortReason reason) {
|
|||||||
|
|
||||||
void Application::SetListeningMode(ListeningMode mode) {
|
void Application::SetListeningMode(ListeningMode mode) {
|
||||||
listening_mode_ = mode;
|
listening_mode_ = mode;
|
||||||
|
vad_speaking_.store(false);
|
||||||
|
vision_frame_sent_for_current_listen_.store(false);
|
||||||
SetDeviceState(kDeviceStateListening);
|
SetDeviceState(kDeviceStateListening);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1029,7 +1072,8 @@ bool Application::UpgradeFirmware(const std::string& url, const std::string& ver
|
|||||||
}
|
}
|
||||||
ESP_LOGI(TAG, "Starting firmware upgrade from URL: %s", upgrade_url.c_str());
|
ESP_LOGI(TAG, "Starting firmware upgrade from URL: %s", upgrade_url.c_str());
|
||||||
|
|
||||||
Alert(Lang::Strings::OTA_UPGRADE, Lang::Strings::UPGRADING, "download", Lang::Sounds::OGG_UPGRADE);
|
Alert(Lang::Strings::OTA_UPGRADE, Lang::Strings::UPGRADING, "download",
|
||||||
|
Lang::Sounds::OGG_UPGRADE);
|
||||||
vTaskDelay(pdMS_TO_TICKS(3000));
|
vTaskDelay(pdMS_TO_TICKS(3000));
|
||||||
|
|
||||||
SetDeviceState(kDeviceStateUpgrading);
|
SetDeviceState(kDeviceStateUpgrading);
|
||||||
@ -1051,10 +1095,12 @@ bool Application::UpgradeFirmware(const std::string& url, const std::string& ver
|
|||||||
|
|
||||||
if (!upgrade_success) {
|
if (!upgrade_success) {
|
||||||
// Upgrade failed, restart audio service and continue running
|
// Upgrade failed, restart audio service and continue running
|
||||||
ESP_LOGE(TAG, "Firmware upgrade failed, restarting audio service and continuing operation...");
|
ESP_LOGE(TAG,
|
||||||
|
"Firmware upgrade failed, restarting audio service and continuing operation...");
|
||||||
audio_service_.Start(); // Restart audio service
|
audio_service_.Start(); // Restart audio service
|
||||||
board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); // Restore power save level
|
board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); // Restore power save level
|
||||||
Alert(Lang::Strings::ERROR, Lang::Strings::UPGRADE_FAILED, "circle_xmark", Lang::Sounds::OGG_EXCLAMATION);
|
Alert(Lang::Strings::ERROR, Lang::Strings::UPGRADE_FAILED, "circle_xmark",
|
||||||
|
Lang::Sounds::OGG_EXCLAMATION);
|
||||||
vTaskDelay(pdMS_TO_TICKS(3000));
|
vTaskDelay(pdMS_TO_TICKS(3000));
|
||||||
return false;
|
return false;
|
||||||
} else {
|
} else {
|
||||||
@ -1080,17 +1126,13 @@ void Application::WakeWordInvoke(const std::string& wake_word) {
|
|||||||
if (!protocol_->IsAudioChannelOpened()) {
|
if (!protocol_->IsAudioChannelOpened()) {
|
||||||
SetDeviceState(kDeviceStateConnecting);
|
SetDeviceState(kDeviceStateConnecting);
|
||||||
// Schedule to let the state change be processed first (UI update)
|
// Schedule to let the state change be processed first (UI update)
|
||||||
Schedule([this, wake_word]() {
|
Schedule([this, wake_word]() { ContinueWakeWordInvoke(wake_word); });
|
||||||
ContinueWakeWordInvoke(wake_word);
|
|
||||||
});
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// Channel already opened, continue directly
|
// Channel already opened, continue directly
|
||||||
ContinueWakeWordInvoke(wake_word);
|
ContinueWakeWordInvoke(wake_word);
|
||||||
} else if (state == kDeviceStateSpeaking) {
|
} else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
|
||||||
Schedule([this]() {
|
Schedule([this]() { AbortSpeaking(kAbortReasonNone); });
|
||||||
AbortSpeaking(kAbortReasonNone);
|
|
||||||
});
|
|
||||||
} else if (state == kDeviceStateListening) {
|
} else if (state == kDeviceStateListening) {
|
||||||
Schedule([this]() {
|
Schedule([this]() {
|
||||||
if (protocol_) {
|
if (protocol_) {
|
||||||
@ -1123,7 +1165,7 @@ void Application::RegisterMcpBroadcastCallback(std::function<void(const std::str
|
|||||||
|
|
||||||
void Application::SendMcpMessage(const std::string& payload) {
|
void Application::SendMcpMessage(const std::string& payload) {
|
||||||
// Always schedule to run in main task for thread safety
|
// Always schedule to run in main task for thread safety
|
||||||
Schedule([this, payload](){
|
Schedule([this, payload]() {
|
||||||
if (protocol_) {
|
if (protocol_) {
|
||||||
protocol_->SendMcpMessage(payload);
|
protocol_->SendMcpMessage(payload);
|
||||||
}
|
}
|
||||||
@ -1160,9 +1202,7 @@ void Application::SetAecMode(AecMode mode) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
void Application::PlaySound(const std::string_view& sound) {
|
void Application::PlaySound(const std::string_view& sound) { audio_service_.PlaySound(sound); }
|
||||||
audio_service_.PlaySound(sound);
|
|
||||||
}
|
|
||||||
|
|
||||||
void Application::ResetProtocol() {
|
void Application::ResetProtocol() {
|
||||||
Schedule([this]() {
|
Schedule([this]() {
|
||||||
|
|||||||
@ -41,6 +41,11 @@ enum AecMode {
|
|||||||
kAecOnServerSide,
|
kAecOnServerSide,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum ChatAgentMode {
|
||||||
|
kChatAgentModeNormal,
|
||||||
|
kChatAgentModeBeaver,
|
||||||
|
};
|
||||||
|
|
||||||
class Application {
|
class Application {
|
||||||
public:
|
public:
|
||||||
static Application& GetInstance() {
|
static Application& GetInstance() {
|
||||||
@ -93,6 +98,11 @@ public:
|
|||||||
*/
|
*/
|
||||||
void ToggleChatState();
|
void ToggleChatState();
|
||||||
void ToggleChatStateWithVision();
|
void ToggleChatStateWithVision();
|
||||||
|
void ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled);
|
||||||
|
bool IsVisionTextModeEnabled() const;
|
||||||
|
ChatAgentMode GetChatAgentMode() const { return chat_agent_mode_.load(); }
|
||||||
|
const char* GetChatAgentModeName() const;
|
||||||
|
const char* GetChatModeName() const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Start listening (event-based, thread-safe)
|
* Start listening (event-based, thread-safe)
|
||||||
@ -146,7 +156,13 @@ private:
|
|||||||
bool aborted_ = false;
|
bool aborted_ = false;
|
||||||
bool assets_version_checked_ = false;
|
bool assets_version_checked_ = false;
|
||||||
bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening
|
bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening
|
||||||
|
std::atomic<ChatAgentMode> chat_agent_mode_ = kChatAgentModeNormal;
|
||||||
|
std::atomic<ChatAgentMode> active_chat_agent_mode_ = kChatAgentModeNormal;
|
||||||
std::atomic<bool> vision_text_mode_enabled_ = false;
|
std::atomic<bool> vision_text_mode_enabled_ = false;
|
||||||
|
std::atomic<bool> active_vision_text_mode_enabled_ = false;
|
||||||
|
std::atomic<bool> vad_speaking_ = false;
|
||||||
|
std::atomic<bool> vision_frame_sent_for_current_listen_ = false;
|
||||||
|
std::atomic<bool> accepting_tts_audio_ = false;
|
||||||
int clock_ticks_ = 0;
|
int clock_ticks_ = 0;
|
||||||
TaskHandle_t activation_task_handle_ = nullptr;
|
TaskHandle_t activation_task_handle_ = nullptr;
|
||||||
|
|
||||||
@ -162,7 +178,7 @@ private:
|
|||||||
void HandleWakeWordDetectedEvent();
|
void HandleWakeWordDetectedEvent();
|
||||||
void ContinueOpenAudioChannel(ListeningMode mode);
|
void ContinueOpenAudioChannel(ListeningMode mode);
|
||||||
void ContinueWakeWordInvoke(const std::string& wake_word);
|
void ContinueWakeWordInvoke(const std::string& wake_word);
|
||||||
void SendCurrentVisionFrame();
|
bool SendCurrentVisionFrame();
|
||||||
|
|
||||||
// Activation task (runs in background)
|
// Activation task (runs in background)
|
||||||
void ActivationTask();
|
void ActivationTask();
|
||||||
|
|||||||
@ -26,6 +26,7 @@
|
|||||||
"CONNECTION_SUCCESSFUL": "Connection Successful",
|
"CONNECTION_SUCCESSFUL": "Connection Successful",
|
||||||
"CONNECTED_TO": "Connected to ",
|
"CONNECTED_TO": "Connected to ",
|
||||||
"LISTENING": "Listening...",
|
"LISTENING": "Listening...",
|
||||||
|
"THINKING": "Thinking...",
|
||||||
"SPEAKING": "Speaking...",
|
"SPEAKING": "Speaking...",
|
||||||
"SERVER_NOT_FOUND": "Looking for available service",
|
"SERVER_NOT_FOUND": "Looking for available service",
|
||||||
"SERVER_NOT_CONNECTED": "Unable to connect to service, please try again later",
|
"SERVER_NOT_CONNECTED": "Unable to connect to service, please try again later",
|
||||||
|
|||||||
@ -23,6 +23,7 @@
|
|||||||
"CONNECTING": "连接中...",
|
"CONNECTING": "连接中...",
|
||||||
"CONNECTED_TO": "已连接 ",
|
"CONNECTED_TO": "已连接 ",
|
||||||
"LISTENING": "聆听中...",
|
"LISTENING": "聆听中...",
|
||||||
|
"THINKING": "思考中...",
|
||||||
"SPEAKING": "说话中...",
|
"SPEAKING": "说话中...",
|
||||||
"SERVER_NOT_FOUND": "正在寻找可用服务",
|
"SERVER_NOT_FOUND": "正在寻找可用服务",
|
||||||
"SERVER_NOT_CONNECTED": "无法连接服务,请稍后再试",
|
"SERVER_NOT_CONNECTED": "无法连接服务,请稍后再试",
|
||||||
|
|||||||
@ -579,6 +579,7 @@ void AudioService::EnableWakeWordDetection(bool enable) {
|
|||||||
void AudioService::EnableVoiceProcessing(bool enable) {
|
void AudioService::EnableVoiceProcessing(bool enable) {
|
||||||
ESP_LOGD(TAG, "%s voice processing", enable ? "Enabling" : "Disabling");
|
ESP_LOGD(TAG, "%s voice processing", enable ? "Enabling" : "Disabling");
|
||||||
if (enable) {
|
if (enable) {
|
||||||
|
bool was_running = IsAudioProcessorRunning();
|
||||||
if (!audio_processor_initialized_) {
|
if (!audio_processor_initialized_) {
|
||||||
audio_processor_->Initialize(codec_, OPUS_FRAME_DURATION_MS, models_list_);
|
audio_processor_->Initialize(codec_, OPUS_FRAME_DURATION_MS, models_list_);
|
||||||
audio_processor_initialized_ = true;
|
audio_processor_initialized_ = true;
|
||||||
@ -586,7 +587,7 @@ void AudioService::EnableVoiceProcessing(bool enable) {
|
|||||||
|
|
||||||
/* We should make sure no audio is playing */
|
/* We should make sure no audio is playing */
|
||||||
ResetDecoder();
|
ResetDecoder();
|
||||||
audio_input_need_warmup_ = true;
|
audio_input_need_warmup_ = !was_running;
|
||||||
// Reset input resampler to clear cached data from previous mode (e.g. WakeWord)
|
// Reset input resampler to clear cached data from previous mode (e.g. WakeWord)
|
||||||
// This prevents buffer overflow when switching between different feed sizes
|
// This prevents buffer overflow when switching between different feed sizes
|
||||||
{
|
{
|
||||||
|
|||||||
177
main/background_capture_service.cc
Normal file
177
main/background_capture_service.cc
Normal file
@ -0,0 +1,177 @@
|
|||||||
|
#include "background_capture_service.h"
|
||||||
|
|
||||||
|
#include "board.h"
|
||||||
|
#include "camera.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <esp_heap_caps.h>
|
||||||
|
#include <esp_log.h>
|
||||||
|
|
||||||
|
#define TAG "BgCapture"
|
||||||
|
|
||||||
|
BackgroundCaptureService::BackgroundCaptureService() = default;
|
||||||
|
|
||||||
|
BackgroundCaptureService::~BackgroundCaptureService() {
|
||||||
|
Stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
void BackgroundCaptureService::Start() {
|
||||||
|
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
|
||||||
|
if (running_.exchange(true)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto result = xTaskCreate(
|
||||||
|
&BackgroundCaptureService::TaskEntry,
|
||||||
|
"bg_capture",
|
||||||
|
CONFIG_BACKGROUND_CAPTURE_TASK_STACK_SIZE,
|
||||||
|
this,
|
||||||
|
CONFIG_BACKGROUND_CAPTURE_TASK_PRIORITY,
|
||||||
|
&task_handle_);
|
||||||
|
if (result != pdPASS) {
|
||||||
|
running_.store(false);
|
||||||
|
task_handle_ = nullptr;
|
||||||
|
ESP_LOGE(TAG, "Failed to create background capture task");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void BackgroundCaptureService::Stop() {
|
||||||
|
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
|
||||||
|
if (!running_.exchange(false)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (task_handle_ != nullptr) {
|
||||||
|
vTaskDelay(pdMS_TO_TICKS(20));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void BackgroundCaptureService::TaskEntry(void* arg) {
|
||||||
|
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
|
||||||
|
auto* service = static_cast<BackgroundCaptureService*>(arg);
|
||||||
|
service->Run();
|
||||||
|
service->task_handle_ = nullptr;
|
||||||
|
#else
|
||||||
|
(void)arg;
|
||||||
|
#endif
|
||||||
|
vTaskDelete(nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
void BackgroundCaptureService::Run() {
|
||||||
|
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
|
||||||
|
ESP_LOGI(TAG, "Background capture task started");
|
||||||
|
|
||||||
|
while (running_.load()) {
|
||||||
|
if (!CaptureAndSendFrame()) {
|
||||||
|
consecutive_failures_++;
|
||||||
|
auto delay_ms = GetFailureDelayMs();
|
||||||
|
ESP_LOGW(TAG, "Background capture retry in %u ms, failures=%u",
|
||||||
|
delay_ms, consecutive_failures_);
|
||||||
|
vTaskDelay(pdMS_TO_TICKS(delay_ms));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
consecutive_failures_ = 0;
|
||||||
|
vTaskDelay(pdMS_TO_TICKS(CONFIG_BACKGROUND_CAPTURE_FRAME_INTERVAL_MS));
|
||||||
|
}
|
||||||
|
|
||||||
|
ESP_LOGI(TAG, "Background capture task stopped");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
bool BackgroundCaptureService::CaptureAndSendFrame() {
|
||||||
|
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
|
||||||
|
const size_t free_internal_heap = heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
|
||||||
|
if (free_internal_heap < CONFIG_BACKGROUND_CAPTURE_MIN_FREE_INTERNAL_HEAP) {
|
||||||
|
ESP_LOGW(TAG, "Skip background capture, low internal heap: free=%u threshold=%u",
|
||||||
|
static_cast<unsigned>(free_internal_heap),
|
||||||
|
static_cast<unsigned>(CONFIG_BACKGROUND_CAPTURE_MIN_FREE_INTERNAL_HEAP));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto camera = Board::GetInstance().GetCamera();
|
||||||
|
if (camera == nullptr) {
|
||||||
|
ESP_LOGW(TAG, "No camera available for background capture");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string jpeg_data;
|
||||||
|
if (!camera->CaptureToJpeg(jpeg_data, false)) {
|
||||||
|
ESP_LOGW(TAG, "Failed to capture background frame");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (jpeg_data.empty()) {
|
||||||
|
ESP_LOGW(TAG, "Captured empty background frame");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return UploadJpegFrame(jpeg_data);
|
||||||
|
#else
|
||||||
|
return false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t BackgroundCaptureService::GetFailureDelayMs() const {
|
||||||
|
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
|
||||||
|
const uint32_t base_delay_ms = CONFIG_BACKGROUND_CAPTURE_RETRY_INTERVAL_MS;
|
||||||
|
const uint32_t max_delay_ms = CONFIG_BACKGROUND_CAPTURE_MAX_BACKOFF_MS;
|
||||||
|
const uint32_t shift = std::min<uint32_t>(consecutive_failures_ - 1, 4);
|
||||||
|
return std::min<uint32_t>(base_delay_ms << shift, max_delay_ms);
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
bool BackgroundCaptureService::UploadJpegFrame(const std::string& jpeg_data) {
|
||||||
|
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
|
||||||
|
const std::string url = CONFIG_BACKGROUND_CAPTURE_UPLOAD_URL;
|
||||||
|
if (url.empty()) {
|
||||||
|
ESP_LOGI(TAG, "Captured background frame: %u bytes", static_cast<unsigned>(jpeg_data.size()));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto network = Board::GetInstance().GetNetwork();
|
||||||
|
if (network == nullptr) {
|
||||||
|
ESP_LOGW(TAG, "No network available for background upload");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string boundary = "----XIAOZHI_BACKGROUND_CAPTURE_BOUNDARY";
|
||||||
|
auto http = network->CreateHttp(3);
|
||||||
|
http->SetHeader("Content-Type", "multipart/form-data; boundary=" + boundary);
|
||||||
|
|
||||||
|
if (!http->Open("POST", url)) {
|
||||||
|
ESP_LOGW(TAG, "Failed to open background upload URL: %s", url.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string file_header;
|
||||||
|
file_header += "--" + boundary + "\r\n";
|
||||||
|
file_header += "Content-Disposition: form-data; name=\"file\"; filename=\"frame.jpg\"\r\n";
|
||||||
|
file_header += "Content-Type: image/jpeg\r\n\r\n";
|
||||||
|
http->Write(file_header.c_str(), file_header.size());
|
||||||
|
http->Write(jpeg_data.data(), jpeg_data.size());
|
||||||
|
|
||||||
|
std::string footer;
|
||||||
|
footer += "\r\n--" + boundary + "--\r\n";
|
||||||
|
http->Write(footer.c_str(), footer.size());
|
||||||
|
http->Write("", 0);
|
||||||
|
|
||||||
|
const int status_code = http->GetStatusCode();
|
||||||
|
http->Close();
|
||||||
|
|
||||||
|
if (status_code < 200 || status_code >= 300) {
|
||||||
|
ESP_LOGW(TAG, "Background upload failed, status=%d", status_code);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
ESP_LOGI(TAG, "Uploaded background frame: %u bytes", static_cast<unsigned>(jpeg_data.size()));
|
||||||
|
return true;
|
||||||
|
#else
|
||||||
|
(void)jpeg_data;
|
||||||
|
return false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
32
main/background_capture_service.h
Normal file
32
main/background_capture_service.h
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
#ifndef BACKGROUND_CAPTURE_SERVICE_H
|
||||||
|
#define BACKGROUND_CAPTURE_SERVICE_H
|
||||||
|
|
||||||
|
#include <freertos/FreeRTOS.h>
|
||||||
|
#include <freertos/task.h>
|
||||||
|
|
||||||
|
#include <atomic>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
class BackgroundCaptureService {
|
||||||
|
public:
|
||||||
|
BackgroundCaptureService();
|
||||||
|
~BackgroundCaptureService();
|
||||||
|
|
||||||
|
void Start();
|
||||||
|
void Stop();
|
||||||
|
bool IsRunning() const { return running_.load(); }
|
||||||
|
|
||||||
|
private:
|
||||||
|
TaskHandle_t task_handle_ = nullptr;
|
||||||
|
std::atomic<bool> running_ = false;
|
||||||
|
uint32_t consecutive_failures_ = 0;
|
||||||
|
|
||||||
|
static void TaskEntry(void* arg);
|
||||||
|
void Run();
|
||||||
|
bool CaptureAndSendFrame();
|
||||||
|
bool UploadJpegFrame(const std::string& jpeg_data);
|
||||||
|
uint32_t GetFailureDelayMs() const;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // BACKGROUND_CAPTURE_SERVICE_H
|
||||||
@ -214,6 +214,9 @@ public:
|
|||||||
case kDeviceStateSpeaking:
|
case kDeviceStateSpeaking:
|
||||||
ctrl_->SetStatusColor(64, 0, 0); // red
|
ctrl_->SetStatusColor(64, 0, 0); // red
|
||||||
break;
|
break;
|
||||||
|
case kDeviceStateThinking:
|
||||||
|
ctrl_->SetStatusColor(0, 0, 64); // blue
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
ctrl_->SetStatusColor(0, 0, 64); // blue
|
ctrl_->SetStatusColor(0, 0, 64); // blue
|
||||||
break;
|
break;
|
||||||
|
|||||||
@ -203,7 +203,10 @@ void WifiBoard::EnterWifiConfigMode() {
|
|||||||
auto& app = Application::GetInstance();
|
auto& app = Application::GetInstance();
|
||||||
auto state = app.GetDeviceState();
|
auto state = app.GetDeviceState();
|
||||||
|
|
||||||
if (state == kDeviceStateSpeaking || state == kDeviceStateListening || state == kDeviceStateIdle) {
|
if (state == kDeviceStateSpeaking ||
|
||||||
|
state == kDeviceStateThinking ||
|
||||||
|
state == kDeviceStateListening ||
|
||||||
|
state == kDeviceStateIdle) {
|
||||||
// Reset protocol (close audio channel, reset protocol)
|
// Reset protocol (close audio channel, reset protocol)
|
||||||
Application::GetInstance().ResetProtocol();
|
Application::GetInstance().ResetProtocol();
|
||||||
|
|
||||||
|
|||||||
@ -85,6 +85,13 @@ void ElectronEmojiDisplay::SetStatus(const char* status) {
|
|||||||
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
|
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
|
||||||
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
|
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
|
||||||
return;
|
return;
|
||||||
|
} else if (strcmp(status, Lang::Strings::THINKING) == 0) {
|
||||||
|
lv_obj_set_style_text_font(status_label_, text_font, 0);
|
||||||
|
lv_label_set_text(status_label_, status);
|
||||||
|
lv_obj_clear_flag(status_label_, LV_OBJ_FLAG_HIDDEN);
|
||||||
|
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
|
||||||
|
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
|
||||||
|
return;
|
||||||
} else if (strcmp(status, Lang::Strings::CONNECTING) == 0) {
|
} else if (strcmp(status, Lang::Strings::CONNECTING) == 0) {
|
||||||
lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0);
|
lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0);
|
||||||
lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标
|
lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标
|
||||||
|
|||||||
@ -155,6 +155,8 @@ void EmojiWidget::SetStatus(const char* status)
|
|||||||
if (player_) {
|
if (player_) {
|
||||||
if (strcmp(status, Lang::Strings::LISTENING) == 0) {
|
if (strcmp(status, Lang::Strings::LISTENING) == 0) {
|
||||||
player_->StartPlayer("asking", true, 15);
|
player_->StartPlayer("asking", true, 15);
|
||||||
|
} else if (strcmp(status, Lang::Strings::THINKING) == 0) {
|
||||||
|
player_->StartPlayer("thinking", true, 15);
|
||||||
} else if (strcmp(status, Lang::Strings::STANDBY) == 0) {
|
} else if (strcmp(status, Lang::Strings::STANDBY) == 0) {
|
||||||
player_->StartPlayer("wake", true, 15);
|
player_->StartPlayer("wake", true, 15);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -231,9 +231,9 @@ private:
|
|||||||
// 如果当前是聆听状态,切换到待命状态
|
// 如果当前是聆听状态,切换到待命状态
|
||||||
ESP_LOGI(TAG, "从聆听状态切换到待命状态");
|
ESP_LOGI(TAG, "从聆听状态切换到待命状态");
|
||||||
app.ToggleChatState(); // 切换到待命状态
|
app.ToggleChatState(); // 切换到待命状态
|
||||||
} else if (current_state == kDeviceStateSpeaking) {
|
} else if (current_state == kDeviceStateSpeaking || current_state == kDeviceStateThinking) {
|
||||||
// 如果当前是说话状态,终止说话并切换到待命状态
|
// 如果当前是说话或思考状态,终止并切换到待命状态
|
||||||
ESP_LOGI(TAG, "从说话状态切换到待命状态");
|
ESP_LOGI(TAG, "从说话/思考状态切换到待命状态");
|
||||||
app.ToggleChatState(); // 终止说话
|
app.ToggleChatState(); // 终止说话
|
||||||
} else {
|
} else {
|
||||||
// 其他状态下只唤醒设备
|
// 其他状态下只唤醒设备
|
||||||
|
|||||||
@ -192,6 +192,7 @@ private:
|
|||||||
void PollTouchpad() {
|
void PollTouchpad() {
|
||||||
static bool was_touched = false;
|
static bool was_touched = false;
|
||||||
static int64_t touch_start_time = 0;
|
static int64_t touch_start_time = 0;
|
||||||
|
static int touch_start_x = -1;
|
||||||
const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值,超过500ms视为长按
|
const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值,超过500ms视为长按
|
||||||
|
|
||||||
ft6336_->UpdateTouchPoint();
|
ft6336_->UpdateTouchPoint();
|
||||||
@ -201,11 +202,14 @@ private:
|
|||||||
if (touch_point.num > 0 && !was_touched) {
|
if (touch_point.num > 0 && !was_touched) {
|
||||||
was_touched = true;
|
was_touched = true;
|
||||||
touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒
|
touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒
|
||||||
|
touch_start_x = touch_point.x;
|
||||||
}
|
}
|
||||||
// 检测触摸释放
|
// 检测触摸释放
|
||||||
else if (touch_point.num == 0 && was_touched) {
|
else if (touch_point.num == 0 && was_touched) {
|
||||||
was_touched = false;
|
was_touched = false;
|
||||||
int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time;
|
int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time;
|
||||||
|
bool beaver_mode = touch_start_x >= DISPLAY_WIDTH / 2;
|
||||||
|
auto agent_mode = beaver_mode ? kChatAgentModeBeaver : kChatAgentModeNormal;
|
||||||
|
|
||||||
if (touch_duration < TOUCH_THRESHOLD_MS) {
|
if (touch_duration < TOUCH_THRESHOLD_MS) {
|
||||||
auto& app = Application::GetInstance();
|
auto& app = Application::GetInstance();
|
||||||
@ -213,12 +217,12 @@ private:
|
|||||||
EnterWifiConfigMode();
|
EnterWifiConfigMode();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
ESP_LOGI(TAG, "Touch short: text-only mode");
|
ESP_LOGI(TAG, "Touch short: %s text-only mode", beaver_mode ? "beaver" : "normal");
|
||||||
app.ToggleChatState();
|
app.ToggleChatStateForMode(agent_mode, false);
|
||||||
} else {
|
} else {
|
||||||
auto& app = Application::GetInstance();
|
auto& app = Application::GetInstance();
|
||||||
ESP_LOGI(TAG, "Touch long: vision+text mode");
|
ESP_LOGI(TAG, "Touch long: %s vision+text mode", beaver_mode ? "beaver" : "normal");
|
||||||
app.ToggleChatStateWithVision();
|
app.ToggleChatStateForMode(agent_mode, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -344,18 +348,23 @@ private:
|
|||||||
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_INITIAL_DELAY_MS));
|
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_INITIAL_DELAY_MS));
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
|
if (!Application::GetInstance().IsVisionTextModeEnabled()) {
|
||||||
|
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (board->camera_ == nullptr) {
|
if (board->camera_ == nullptr) {
|
||||||
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
|
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (board->camera_->CaptureBackground()) {
|
if (board->camera_->Capture()) {
|
||||||
if (!has_logged_success) {
|
if (!has_logged_success) {
|
||||||
ESP_LOGI(TAG, "Background vision sampler started");
|
ESP_LOGI(TAG, "Vision preview sampler started");
|
||||||
has_logged_success = true;
|
has_logged_success = true;
|
||||||
}
|
}
|
||||||
} else if (!has_logged_failure) {
|
} else if (!has_logged_failure) {
|
||||||
ESP_LOGW(TAG, "Background vision sampler is waiting for camera");
|
ESP_LOGW(TAG, "Vision preview sampler is waiting for camera");
|
||||||
has_logged_failure = true;
|
has_logged_failure = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -77,6 +77,13 @@ void OttoEmojiDisplay::SetStatus(const char* status) {
|
|||||||
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
|
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
|
||||||
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
|
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
|
||||||
return;
|
return;
|
||||||
|
} else if (strcmp(status, Lang::Strings::THINKING) == 0) {
|
||||||
|
lv_obj_set_style_text_font(status_label_, text_font, 0);
|
||||||
|
lv_label_set_text(status_label_, status);
|
||||||
|
lv_obj_clear_flag(status_label_, LV_OBJ_FLAG_HIDDEN);
|
||||||
|
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
|
||||||
|
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
|
||||||
|
return;
|
||||||
} else if (strcmp(status, Lang::Strings::CONNECTING) == 0) {
|
} else if (strcmp(status, Lang::Strings::CONNECTING) == 0) {
|
||||||
lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0);
|
lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0);
|
||||||
lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标
|
lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标
|
||||||
|
|||||||
@ -1,7 +1,9 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import base64
|
import base64
|
||||||
|
import contextlib
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import struct
|
import struct
|
||||||
import sys
|
import sys
|
||||||
@ -28,33 +30,65 @@ TOKEN_URL = "http://172.19.0.240:8000/getToken"
|
|||||||
LIVEKIT_WS_URL = "ws://172.19.0.240:7880"
|
LIVEKIT_WS_URL = "ws://172.19.0.240:7880"
|
||||||
ROOM_PREFIX = "test-livekit"
|
ROOM_PREFIX = "test-livekit"
|
||||||
IDENTITY_PREFIX = "uv-livekit"
|
IDENTITY_PREFIX = "uv-livekit"
|
||||||
AGENT_NAME = "my-agent"
|
LEGACY_AGENT_NAME = os.getenv("LIVEKIT_AGENT_NAME", "normal-agent")
|
||||||
|
DEFAULT_AGENT_MODE = os.getenv("LIVEKIT_DEFAULT_AGENT_MODE", "normal").strip().lower()
|
||||||
|
AGENT_NAMES = {
|
||||||
|
"normal": os.getenv("LIVEKIT_NORMAL_AGENT_NAME", LEGACY_AGENT_NAME),
|
||||||
|
"beaver": os.getenv("LIVEKIT_BEAVER_AGENT_NAME", "beaver-agent"),
|
||||||
|
}
|
||||||
|
CHAT_MODE_AGENT_NAMES = {
|
||||||
|
"normal": AGENT_NAMES["normal"],
|
||||||
|
"beaver": AGENT_NAMES["beaver"],
|
||||||
|
"vision-normal": os.getenv("LIVEKIT_VISION_NORMAL_AGENT_NAME", "vision-normal-agent"),
|
||||||
|
"vision-beaver": os.getenv("LIVEKIT_VISION_BEAVER_AGENT_NAME", "vision-beaver-agent"),
|
||||||
|
}
|
||||||
CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0"))
|
CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0"))
|
||||||
AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
|
AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
|
||||||
WS_PORT = 8080
|
WS_PORT = 8080
|
||||||
|
WS_MAX_QUEUE = int(os.getenv("BRIDGE_WS_MAX_QUEUE", "128"))
|
||||||
|
WS_MAX_SIZE = int(os.getenv("BRIDGE_WS_MAX_SIZE", str(8 * 1024 * 1024)))
|
||||||
AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower()
|
AGENT_DISPATCH_MODE = os.getenv("AGENT_DISPATCH_MODE", "token").lower()
|
||||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||||||
VISION_FRAME_SAVE_DIR = Path(os.getenv("VISION_FRAME_SAVE_DIR", str(PROJECT_ROOT / "vision_frames")))
|
VISION_FRAME_SAVE_DIR = Path(os.getenv("VISION_FRAME_SAVE_DIR", str(PROJECT_ROOT / "vision_frames")))
|
||||||
|
|
||||||
INPUT_SAMPLE_RATE = 16000
|
INPUT_SAMPLE_RATE = int(os.getenv("BRIDGE_INPUT_SAMPLE_RATE", "16000"))
|
||||||
OUTPUT_SAMPLE_RATE = 24000
|
OUTPUT_SAMPLE_RATE = int(os.getenv("BRIDGE_OUTPUT_SAMPLE_RATE", "24000"))
|
||||||
INPUT_FRAME_DURATION_MS = 20
|
INPUT_FRAME_DURATION_MS = int(os.getenv("BRIDGE_INPUT_FRAME_DURATION_MS", "60"))
|
||||||
INPUT_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * INPUT_FRAME_DURATION_MS // 1000
|
INPUT_MAX_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * 120 // 1000
|
||||||
INPUT_MAX_SAMPLES_PER_OPUS_FRAME = INPUT_SAMPLE_RATE * 60 // 1000
|
OUTPUT_FRAME_DURATION_MS = int(os.getenv("BRIDGE_OUTPUT_FRAME_DURATION_MS", "60"))
|
||||||
OUTPUT_FRAME_DURATION_MS = 20
|
AUDIO_STATS_INTERVAL_SECONDS = float(os.getenv("BRIDGE_AUDIO_STATS_INTERVAL_SECONDS", "5.0"))
|
||||||
OUTPUT_SAMPLES_PER_OPUS_FRAME = OUTPUT_SAMPLE_RATE * OUTPUT_FRAME_DURATION_MS // 1000
|
DOWNLINK_SEND_GAP_WARN_MS = float(os.getenv("BRIDGE_DOWNLINK_SEND_GAP_WARN_MS", "180.0"))
|
||||||
TTS_IDLE_TIMEOUT_SECONDS = 0.25
|
UPLINK_CAPTURE_TIMEOUT_SECONDS = float(os.getenv("BRIDGE_UPLINK_CAPTURE_TIMEOUT_SECONDS", "0.25"))
|
||||||
|
TTS_IDLE_TIMEOUT_SECONDS = float(os.getenv("TTS_IDLE_TIMEOUT_SECONDS", "1.2"))
|
||||||
|
TTS_MIN_ACTIVE_SECONDS = float(os.getenv("TTS_MIN_ACTIVE_SECONDS", "1.0"))
|
||||||
TTS_SILENCE_PEAK_THRESHOLD = 96
|
TTS_SILENCE_PEAK_THRESHOLD = 96
|
||||||
TTS_PRE_ROLL_MS = 80
|
TTS_PRE_ROLL_MS = int(os.getenv("TTS_PRE_ROLL_MS", "480"))
|
||||||
TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = 1
|
TTS_START_CONSECUTIVE_AUDIBLE_FRAMES = int(os.getenv("TTS_START_CONSECUTIVE_AUDIBLE_FRAMES", "1"))
|
||||||
TTS_INTERRUPT_SILENCE_FRAMES = 3
|
TTS_INTERRUPT_SILENCE_FRAMES = 3
|
||||||
INTERRUPT_TOPIC = "lk.interrupt"
|
INTERRUPT_TOPIC = "lk.interrupt"
|
||||||
VISION_FRAME_TOPIC = "vision.frame"
|
VISION_FRAME_TOPIC = "vision.frame"
|
||||||
|
AGENT_STATE_ATTRIBUTE = "lk.agent.state"
|
||||||
TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;;"
|
TTS_DISPLAY_SENTENCE_BREAKS = "。!?!?;;"
|
||||||
TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
|
TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
|
||||||
TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
|
TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
|
||||||
TTS_DISPLAY_SCROLL_GAP = " "
|
TTS_DISPLAY_SCROLL_GAP = " "
|
||||||
TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8"))
|
TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8"))
|
||||||
|
TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS = float(
|
||||||
|
os.getenv("TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS", "0.25")
|
||||||
|
)
|
||||||
|
TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS = float(
|
||||||
|
os.getenv("TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS", "8.0")
|
||||||
|
)
|
||||||
|
EMOTION_TEXT_PATTERN = re.compile(
|
||||||
|
r"^\s*<?\s*emotion\s*=\s*([^\s>,,;;]+)\s*>?[\s,,;;]*(.*)$",
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
EMOTION_TEST_SEQUENCE = [
|
||||||
|
emotion.strip()
|
||||||
|
for emotion in os.getenv("BRIDGE_EMOTION_TEST_SEQUENCE", "").split(",")
|
||||||
|
if emotion.strip()
|
||||||
|
]
|
||||||
|
EMOTION_TEST_INTERVAL_SECONDS = float(os.getenv("BRIDGE_EMOTION_TEST_INTERVAL_SECONDS", "2.0"))
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -64,28 +98,39 @@ class DeviceSession:
|
|||||||
protocol_version: int
|
protocol_version: int
|
||||||
room_name: str
|
room_name: str
|
||||||
identity: str
|
identity: str
|
||||||
|
chat_mode: str
|
||||||
|
agent_mode: str
|
||||||
|
agent_name: str
|
||||||
|
vision_enabled: bool
|
||||||
room: rtc.Room
|
room: rtc.Room
|
||||||
mic_source: AudioSource
|
mic_source: AudioSource
|
||||||
agent_ready: asyncio.Event
|
agent_ready: asyncio.Event
|
||||||
forwarding_tracks: dict[str, asyncio.Task[Any]] = field(default_factory=dict)
|
forwarding_tracks: dict[str, asyncio.Task[Any]] = field(default_factory=dict)
|
||||||
tts_active: bool = False
|
tts_active: bool = False
|
||||||
|
tts_thinking: bool = False
|
||||||
tts_idle_task: Optional[asyncio.Task] = None
|
tts_idle_task: Optional[asyncio.Task] = None
|
||||||
tts_display_task: Optional[asyncio.Task] = None
|
tts_display_task: Optional[asyncio.Task] = None
|
||||||
tts_stream_id: int = 0
|
tts_stream_id: int = 0
|
||||||
tts_transcript_text: str = ""
|
tts_transcript_text: str = ""
|
||||||
tts_display_text: str = ""
|
tts_display_text: str = ""
|
||||||
tts_display_final: bool = False
|
tts_display_final: bool = False
|
||||||
|
tts_emotion: str = ""
|
||||||
tts_suppressed_until: float = 0.0
|
tts_suppressed_until: float = 0.0
|
||||||
|
tts_started_at: float = 0.0
|
||||||
|
tts_last_audible_at: float = 0.0
|
||||||
|
tts_waiting_for_user_audio_after_interrupt: bool = False
|
||||||
|
last_interrupt_time: float = 0.0
|
||||||
|
last_uplink_audible_time: float = 0.0
|
||||||
agent_dispatch_task: Optional[asyncio.Task] = None
|
agent_dispatch_task: Optional[asyncio.Task] = None
|
||||||
closed: bool = False
|
closed: bool = False
|
||||||
captured_frame_count: int = 0
|
captured_frame_count: int = 0
|
||||||
first_capture_log_time: float = 0.0
|
first_capture_log_time: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
async def fetch_token(room_name: str, identity: str) -> str:
|
async def fetch_token(room_name: str, identity: str, agent_name: str) -> str:
|
||||||
params = {"room": room_name, "identity": identity}
|
params = {"room": room_name, "identity": identity}
|
||||||
if AGENT_DISPATCH_MODE == "token":
|
if AGENT_DISPATCH_MODE == "token":
|
||||||
params["agent_name"] = AGENT_NAME
|
params["agent_name"] = agent_name
|
||||||
|
|
||||||
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
|
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
|
||||||
response = await client.get(TOKEN_URL, params=params)
|
response = await client.get(TOKEN_URL, params=params)
|
||||||
@ -111,6 +156,100 @@ class ESP32LiveKitBridge:
|
|||||||
if formatted_tb.strip():
|
if formatted_tb.strip():
|
||||||
print(formatted_tb.rstrip())
|
print(formatted_tb.rstrip())
|
||||||
|
|
||||||
|
def _audio_duration_ms(self, sample_count: int, sample_rate: int) -> float:
|
||||||
|
if sample_rate <= 0:
|
||||||
|
return 0.0
|
||||||
|
return sample_count * 1000.0 / sample_rate
|
||||||
|
|
||||||
|
def _build_server_hello(self, session: DeviceSession) -> dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"type": "hello",
|
||||||
|
"transport": "websocket",
|
||||||
|
"session": {
|
||||||
|
"room": session.room_name,
|
||||||
|
"identity": session.identity,
|
||||||
|
},
|
||||||
|
"audio_params": {
|
||||||
|
"format": "opus",
|
||||||
|
"sample_rate": OUTPUT_SAMPLE_RATE,
|
||||||
|
"channels": 1,
|
||||||
|
"frame_duration": OUTPUT_FRAME_DURATION_MS,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
def _log_client_hello(self, session: DeviceSession, message: dict[str, Any]) -> None:
|
||||||
|
audio_params = message.get("audio_params")
|
||||||
|
if not isinstance(audio_params, dict):
|
||||||
|
return
|
||||||
|
|
||||||
|
sample_rate = audio_params.get("sample_rate")
|
||||||
|
frame_duration = audio_params.get("frame_duration")
|
||||||
|
channels = audio_params.get("channels")
|
||||||
|
fmt = audio_params.get("format")
|
||||||
|
print(
|
||||||
|
"[client-audio] "
|
||||||
|
f"device={session.device_id} format={fmt} sample_rate={sample_rate} "
|
||||||
|
f"channels={channels} frame_duration={frame_duration}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if sample_rate != INPUT_SAMPLE_RATE or channels != 1:
|
||||||
|
print(
|
||||||
|
"[client-audio] warning: bridge uplink decode expects "
|
||||||
|
f"{INPUT_SAMPLE_RATE}Hz mono, got {sample_rate}Hz channels={channels}"
|
||||||
|
)
|
||||||
|
if frame_duration != INPUT_FRAME_DURATION_MS:
|
||||||
|
print(
|
||||||
|
"[client-audio] warning: bridge expects "
|
||||||
|
f"{INPUT_FRAME_DURATION_MS}ms uplink frames, got {frame_duration}ms"
|
||||||
|
)
|
||||||
|
|
||||||
|
def _track_room_connect_task(
|
||||||
|
self,
|
||||||
|
session: DeviceSession,
|
||||||
|
task: asyncio.Task[Any],
|
||||||
|
) -> None:
|
||||||
|
if task.cancelled():
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
task.result()
|
||||||
|
except Exception as exc:
|
||||||
|
self._log_exception(
|
||||||
|
f"LiveKit 房间连接后台任务失败: room={session.room_name}",
|
||||||
|
exc,
|
||||||
|
)
|
||||||
|
websocket = session.websocket
|
||||||
|
if websocket is not None:
|
||||||
|
asyncio.create_task(websocket.close(code=1011, reason="livekit connect failed"))
|
||||||
|
|
||||||
|
async def _capture_mic_frame(
|
||||||
|
self,
|
||||||
|
session: DeviceSession,
|
||||||
|
pcm_bytes: bytes,
|
||||||
|
num_samples: int,
|
||||||
|
) -> bool:
|
||||||
|
try:
|
||||||
|
frame = AudioFrame(pcm_bytes, INPUT_SAMPLE_RATE, 1, num_samples)
|
||||||
|
except TypeError:
|
||||||
|
frame = AudioFrame.create(
|
||||||
|
sample_rate=INPUT_SAMPLE_RATE,
|
||||||
|
num_channels=1,
|
||||||
|
samples_per_channel=num_samples,
|
||||||
|
)
|
||||||
|
memoryview(frame.data).cast("B")[:] = pcm_bytes
|
||||||
|
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(
|
||||||
|
session.mic_source.capture_frame(frame),
|
||||||
|
timeout=UPLINK_CAPTURE_TIMEOUT_SECONDS,
|
||||||
|
)
|
||||||
|
return True
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
print(
|
||||||
|
"[uplink] warning: capture_frame timeout, dropping frame "
|
||||||
|
f"device={session.device_id} samples={num_samples}"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
def _build_device_id(self, websocket: Any) -> str:
|
def _build_device_id(self, websocket: Any) -> str:
|
||||||
headers = websocket.request.headers
|
headers = websocket.request.headers
|
||||||
requested_id = headers.get("X-Device-Id") or headers.get("Device-Id")
|
requested_id = headers.get("X-Device-Id") or headers.get("Device-Id")
|
||||||
@ -125,15 +264,61 @@ class ESP32LiveKitBridge:
|
|||||||
print(f"[session] device={device_id} room={room_name} identity={identity}")
|
print(f"[session] device={device_id} room={room_name} identity={identity}")
|
||||||
return room_name, identity
|
return room_name, identity
|
||||||
|
|
||||||
def _is_agent_participant(self, participant: rtc.RemoteParticipant) -> bool:
|
def _resolve_agent_selection(self, headers: Any) -> tuple[str, str, str, bool]:
|
||||||
|
requested_chat_mode = (
|
||||||
|
headers.get("Chat-Mode")
|
||||||
|
or headers.get("X-Chat-Mode")
|
||||||
|
or ""
|
||||||
|
).strip().lower()
|
||||||
|
chat_mode_to_agent = {
|
||||||
|
"normal": ("normal", False),
|
||||||
|
"beaver": ("beaver", False),
|
||||||
|
"vision-normal": ("normal", True),
|
||||||
|
"vision-beaver": ("beaver", True),
|
||||||
|
}
|
||||||
|
|
||||||
|
if requested_chat_mode in chat_mode_to_agent:
|
||||||
|
requested_mode, vision_enabled = chat_mode_to_agent[requested_chat_mode]
|
||||||
|
else:
|
||||||
|
if requested_chat_mode:
|
||||||
|
print(f"未知 Chat-Mode={requested_chat_mode!r},回退到 Agent-Mode")
|
||||||
|
requested_mode = (
|
||||||
|
headers.get("Agent-Mode")
|
||||||
|
or headers.get("X-Agent-Mode")
|
||||||
|
or DEFAULT_AGENT_MODE
|
||||||
|
or "normal"
|
||||||
|
).strip().lower()
|
||||||
|
vision_enabled = False
|
||||||
|
requested_chat_mode = requested_mode
|
||||||
|
|
||||||
|
requested_name = headers.get("Agent-Name") or headers.get("X-Agent-Name")
|
||||||
|
if requested_name:
|
||||||
|
return requested_chat_mode, "custom", requested_name, vision_enabled
|
||||||
|
|
||||||
|
if requested_mode not in AGENT_NAMES:
|
||||||
|
print(f"未知 Agent-Mode={requested_mode!r},回退到 normal")
|
||||||
|
requested_mode = "normal"
|
||||||
|
requested_chat_mode = "vision-normal" if vision_enabled else "normal"
|
||||||
|
|
||||||
|
if requested_chat_mode in CHAT_MODE_AGENT_NAMES:
|
||||||
|
return (
|
||||||
|
requested_chat_mode,
|
||||||
|
requested_mode,
|
||||||
|
CHAT_MODE_AGENT_NAMES[requested_chat_mode],
|
||||||
|
vision_enabled,
|
||||||
|
)
|
||||||
|
|
||||||
|
return requested_chat_mode, requested_mode, AGENT_NAMES[requested_mode], vision_enabled
|
||||||
|
|
||||||
|
def _is_agent_participant(self, participant: rtc.RemoteParticipant, agent_name: str) -> bool:
|
||||||
identity = getattr(participant, "identity", "") or ""
|
identity = getattr(participant, "identity", "") or ""
|
||||||
return identity.startswith("agent-") or AGENT_NAME in identity
|
return identity.startswith("agent-") or agent_name in identity
|
||||||
|
|
||||||
def _get_agent_identities(self, session: DeviceSession) -> list[str]:
|
def _get_agent_identities(self, session: DeviceSession) -> list[str]:
|
||||||
return [
|
return [
|
||||||
participant.identity
|
participant.identity
|
||||||
for participant in session.room.remote_participants.values()
|
for participant in session.room.remote_participants.values()
|
||||||
if self._is_agent_participant(participant)
|
if self._is_agent_participant(participant, session.agent_name)
|
||||||
]
|
]
|
||||||
|
|
||||||
def _log_agent_participants(self, session: DeviceSession, source: str) -> None:
|
def _log_agent_participants(self, session: DeviceSession, source: str) -> None:
|
||||||
@ -175,7 +360,7 @@ class ESP32LiveKitBridge:
|
|||||||
dispatch_agent_name = getattr(dispatch, "agent_name", None)
|
dispatch_agent_name = getattr(dispatch, "agent_name", None)
|
||||||
dispatch_room = getattr(dispatch, "room", None)
|
dispatch_room = getattr(dispatch, "room", None)
|
||||||
if dispatch_room == session.room_name and (
|
if dispatch_room == session.room_name and (
|
||||||
dispatch_agent_name == AGENT_NAME or dispatch_agent_name is None
|
dispatch_agent_name == session.agent_name or dispatch_agent_name is None
|
||||||
):
|
):
|
||||||
print(
|
print(
|
||||||
f"检测到已有 dispatch: room={session.room_name} "
|
f"检测到已有 dispatch: room={session.room_name} "
|
||||||
@ -198,7 +383,7 @@ class ESP32LiveKitBridge:
|
|||||||
return
|
return
|
||||||
|
|
||||||
for participant in session.room.remote_participants.values():
|
for participant in session.room.remote_participants.values():
|
||||||
if self._is_agent_participant(participant):
|
if self._is_agent_participant(participant, session.agent_name):
|
||||||
# print(f"Agent 已在房间中,跳过 dispatch: {participant.identity}")
|
# print(f"Agent 已在房间中,跳过 dispatch: {participant.identity}")
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -210,7 +395,7 @@ class ESP32LiveKitBridge:
|
|||||||
await session.agent_dispatch_task
|
await session.agent_dispatch_task
|
||||||
|
|
||||||
async def _dispatch_agent(self, session: DeviceSession) -> None:
|
async def _dispatch_agent(self, session: DeviceSession) -> None:
|
||||||
print(f"准备 dispatch agent: room={session.room_name}, agent={AGENT_NAME}")
|
print(f"准备 dispatch agent: room={session.room_name}, agent={session.agent_name}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if await self._dispatch_agent_with_sdk(session):
|
if await self._dispatch_agent_with_sdk(session):
|
||||||
@ -240,13 +425,17 @@ class ESP32LiveKitBridge:
|
|||||||
try:
|
try:
|
||||||
dispatch = await lkapi.agent_dispatch.create_dispatch(
|
dispatch = await lkapi.agent_dispatch.create_dispatch(
|
||||||
livekit_api.CreateAgentDispatchRequest(
|
livekit_api.CreateAgentDispatchRequest(
|
||||||
agent_name=AGENT_NAME,
|
agent_name=session.agent_name,
|
||||||
room=session.room_name,
|
room=session.room_name,
|
||||||
metadata=json.dumps(
|
metadata=json.dumps(
|
||||||
{
|
{
|
||||||
"source": "bridge_server",
|
"source": "bridge_server",
|
||||||
"identity": session.identity,
|
"identity": session.identity,
|
||||||
"device_id": session.device_id,
|
"device_id": session.device_id,
|
||||||
|
"chat_mode": session.chat_mode,
|
||||||
|
"agent_mode": session.agent_mode,
|
||||||
|
"agent_name": session.agent_name,
|
||||||
|
"vision_enabled": session.vision_enabled,
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
@ -269,13 +458,17 @@ class ESP32LiveKitBridge:
|
|||||||
"--room",
|
"--room",
|
||||||
session.room_name,
|
session.room_name,
|
||||||
"--agent-name",
|
"--agent-name",
|
||||||
AGENT_NAME,
|
session.agent_name,
|
||||||
"--metadata",
|
"--metadata",
|
||||||
json.dumps(
|
json.dumps(
|
||||||
{
|
{
|
||||||
"source": "bridge_server",
|
"source": "bridge_server",
|
||||||
"identity": session.identity,
|
"identity": session.identity,
|
||||||
"device_id": session.device_id,
|
"device_id": session.device_id,
|
||||||
|
"chat_mode": session.chat_mode,
|
||||||
|
"agent_mode": session.agent_mode,
|
||||||
|
"agent_name": session.agent_name,
|
||||||
|
"vision_enabled": session.vision_enabled,
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
stdout=asyncio.subprocess.PIPE,
|
stdout=asyncio.subprocess.PIPE,
|
||||||
@ -294,7 +487,7 @@ class ESP32LiveKitBridge:
|
|||||||
print(f"lk dispatch create 失败,退出码: {process.returncode}")
|
print(f"lk dispatch create 失败,退出码: {process.returncode}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={AGENT_NAME}")
|
print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={session.agent_name}")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool:
|
async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool:
|
||||||
@ -364,7 +557,7 @@ class ESP32LiveKitBridge:
|
|||||||
print("收到 vision frame,但 image 字段为空")
|
print("收到 vision frame,但 image 字段为空")
|
||||||
return
|
return
|
||||||
|
|
||||||
saved_path = self._save_vision_frame(session, image)
|
saved_path = await asyncio.to_thread(self._save_vision_frame, session, image)
|
||||||
if saved_path is None:
|
if saved_path is None:
|
||||||
return
|
return
|
||||||
print(f"已保存 vision frame: {saved_path}")
|
print(f"已保存 vision frame: {saved_path}")
|
||||||
@ -415,9 +608,40 @@ class ESP32LiveKitBridge:
|
|||||||
await session.websocket.send(json.dumps({"type": "tts", "state": state}))
|
await session.websocket.send(json.dumps({"type": "tts", "state": state}))
|
||||||
print(f"已发送 tts {state}: device={session.device_id}")
|
print(f"已发送 tts {state}: device={session.device_id}")
|
||||||
|
|
||||||
|
async def _send_emotion(self, session: DeviceSession, emotion: str) -> None:
|
||||||
|
if session.websocket is None:
|
||||||
|
print(f"跳过 emotion {emotion},ESP32 尚未连接")
|
||||||
|
return
|
||||||
|
await session.websocket.send(json.dumps({"type": "llm", "emotion": emotion}))
|
||||||
|
print(f"已发送 emotion: device={session.device_id} emotion={emotion}")
|
||||||
|
|
||||||
|
def _parse_emotion_text(self, text: str) -> tuple[Optional[str], str]:
|
||||||
|
match = EMOTION_TEXT_PATTERN.match(text)
|
||||||
|
if match is None:
|
||||||
|
return None, text.strip()
|
||||||
|
emotion, tts_text = match.groups()
|
||||||
|
return emotion.strip(), tts_text.strip()
|
||||||
|
|
||||||
|
async def _run_emotion_test_sequence(self, session: DeviceSession) -> None:
|
||||||
|
if not EMOTION_TEST_SEQUENCE:
|
||||||
|
return
|
||||||
|
|
||||||
|
for index, emotion in enumerate(EMOTION_TEST_SEQUENCE):
|
||||||
|
if session.websocket is None or session.closed:
|
||||||
|
return
|
||||||
|
if index > 0:
|
||||||
|
await asyncio.sleep(EMOTION_TEST_INTERVAL_SECONDS)
|
||||||
|
await self._send_emotion(session, emotion)
|
||||||
|
|
||||||
async def _send_tts_text(self, session: DeviceSession, text: str, final: bool) -> None:
|
async def _send_tts_text(self, session: DeviceSession, text: str, final: bool) -> None:
|
||||||
if session.websocket is None:
|
if session.websocket is None:
|
||||||
return
|
return
|
||||||
|
raw_text = text
|
||||||
|
_emotion, text = self._parse_emotion_text(text)
|
||||||
|
if not text:
|
||||||
|
print(f"[tts->esp32] skip empty text: raw={raw_text!r} final={final}")
|
||||||
|
return
|
||||||
|
print(f"[tts->esp32] text={text!r} final={final}")
|
||||||
await session.websocket.send(
|
await session.websocket.send(
|
||||||
json.dumps(
|
json.dumps(
|
||||||
{
|
{
|
||||||
@ -487,26 +711,99 @@ class ESP32LiveKitBridge:
|
|||||||
if session.tts_active:
|
if session.tts_active:
|
||||||
print("跳过 tts start,当前已处于激活状态")
|
print("跳过 tts start,当前已处于激活状态")
|
||||||
return
|
return
|
||||||
|
block_reason = self._tts_resume_block_reason(session, include_user_quiet=False)
|
||||||
|
if block_reason is not None:
|
||||||
|
print(f"跳过 tts start,打断后仍在等待稳定聆听: {block_reason}")
|
||||||
|
return
|
||||||
if time.monotonic() < session.tts_suppressed_until:
|
if time.monotonic() < session.tts_suppressed_until:
|
||||||
print("跳过 tts start,中断后的残留音频仍在抑制窗口内")
|
print("跳过 tts start,中断后的残留音频仍在抑制窗口内")
|
||||||
return
|
return
|
||||||
if not session.tts_display_text:
|
if not session.tts_display_text:
|
||||||
session.tts_transcript_text = ""
|
session.tts_transcript_text = ""
|
||||||
session.tts_display_final = False
|
session.tts_display_final = False
|
||||||
|
session.tts_emotion = ""
|
||||||
self._cancel_tts_display_task(session)
|
self._cancel_tts_display_task(session)
|
||||||
|
now = time.monotonic()
|
||||||
|
session.tts_started_at = now
|
||||||
|
session.tts_last_audible_at = now
|
||||||
await self._send_tts_state(session, "start")
|
await self._send_tts_state(session, "start")
|
||||||
session.tts_active = True
|
session.tts_active = True
|
||||||
|
session.tts_thinking = False
|
||||||
|
|
||||||
|
async def _start_thinking(self, session: DeviceSession) -> None:
|
||||||
|
if session.tts_active:
|
||||||
|
print("跳过 tts thinking,当前已处于 TTS 播放状态")
|
||||||
|
return
|
||||||
|
if session.tts_thinking:
|
||||||
|
print("跳过 tts thinking,当前已处于思考状态")
|
||||||
|
return
|
||||||
|
block_reason = self._tts_resume_block_reason(session, include_user_quiet=False)
|
||||||
|
if block_reason is not None:
|
||||||
|
print(f"跳过 tts thinking,打断后仍在等待稳定聆听: {block_reason}")
|
||||||
|
return
|
||||||
|
if time.monotonic() < session.tts_suppressed_until:
|
||||||
|
print("跳过 tts thinking,中断后的残留音频仍在抑制窗口内")
|
||||||
|
return
|
||||||
|
await self._send_tts_state(session, "thinking")
|
||||||
|
session.tts_thinking = True
|
||||||
|
|
||||||
|
def _tts_resume_block_reason(
|
||||||
|
self,
|
||||||
|
session: DeviceSession,
|
||||||
|
now: Optional[float] = None,
|
||||||
|
*,
|
||||||
|
include_user_quiet: bool = True,
|
||||||
|
) -> Optional[str]:
|
||||||
|
if now is None:
|
||||||
|
now = time.monotonic()
|
||||||
|
|
||||||
|
if session.tts_waiting_for_user_audio_after_interrupt:
|
||||||
|
return "waiting_for_user_audio_after_interrupt"
|
||||||
|
|
||||||
|
if session.last_interrupt_time <= 0.0:
|
||||||
|
return None
|
||||||
|
|
||||||
|
since_interrupt = now - session.last_interrupt_time
|
||||||
|
if since_interrupt > TTS_POST_INTERRUPT_LISTEN_WINDOW_SECONDS:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if session.last_uplink_audible_time < session.last_interrupt_time:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if not include_user_quiet:
|
||||||
|
return None
|
||||||
|
|
||||||
|
quiet_for = now - session.last_uplink_audible_time
|
||||||
|
if quiet_for < TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS:
|
||||||
|
return f"user_audio_quiet_for={quiet_for:.2f}s"
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _handle_agent_state(self, session: DeviceSession, participant: rtc.Participant) -> None:
|
||||||
|
state = participant.attributes.get(AGENT_STATE_ATTRIBUTE)
|
||||||
|
if not isinstance(state, str) or not state:
|
||||||
|
return
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"[agent-state] room={session.room_name} identity={participant.identity} state={state}"
|
||||||
|
)
|
||||||
|
if state == "thinking":
|
||||||
|
asyncio.create_task(self._start_thinking(session))
|
||||||
|
|
||||||
async def _stop_tts(self, session: DeviceSession) -> None:
|
async def _stop_tts(self, session: DeviceSession) -> None:
|
||||||
if not session.tts_active:
|
if not session.tts_active and not session.tts_thinking:
|
||||||
print("跳过 tts stop,当前未激活")
|
print("跳过 tts stop,当前未激活")
|
||||||
return
|
return
|
||||||
self._cancel_tts_display_task(session)
|
self._cancel_tts_display_task(session)
|
||||||
await self._send_tts_state(session, "stop")
|
await self._send_tts_state(session, "stop")
|
||||||
session.tts_active = False
|
session.tts_active = False
|
||||||
|
session.tts_thinking = False
|
||||||
|
session.tts_started_at = 0.0
|
||||||
|
session.tts_last_audible_at = 0.0
|
||||||
session.tts_transcript_text = ""
|
session.tts_transcript_text = ""
|
||||||
session.tts_display_text = ""
|
session.tts_display_text = ""
|
||||||
session.tts_display_final = False
|
session.tts_display_final = False
|
||||||
|
session.tts_emotion = ""
|
||||||
|
|
||||||
async def _force_stop_tts(self, session: DeviceSession, reason: str) -> None:
|
async def _force_stop_tts(self, session: DeviceSession, reason: str) -> None:
|
||||||
self._cancel_tts_display_task(session)
|
self._cancel_tts_display_task(session)
|
||||||
@ -514,20 +811,28 @@ class ESP32LiveKitBridge:
|
|||||||
session.tts_idle_task.cancel()
|
session.tts_idle_task.cancel()
|
||||||
session.tts_idle_task = None
|
session.tts_idle_task = None
|
||||||
session.tts_active = False
|
session.tts_active = False
|
||||||
|
session.tts_thinking = False
|
||||||
|
session.tts_started_at = 0.0
|
||||||
|
session.tts_last_audible_at = 0.0
|
||||||
session.tts_transcript_text = ""
|
session.tts_transcript_text = ""
|
||||||
session.tts_display_text = ""
|
session.tts_display_text = ""
|
||||||
session.tts_display_final = False
|
session.tts_display_final = False
|
||||||
|
session.tts_emotion = ""
|
||||||
await self._send_tts_state(session, "stop")
|
await self._send_tts_state(session, "stop")
|
||||||
print(f"已强制停止本地 TTS: device={session.device_id} reason={reason}")
|
print(f"已强制停止本地 TTS: device={session.device_id} reason={reason}")
|
||||||
|
|
||||||
async def _abort_tts(self, session: DeviceSession, reason: str = "client_abort") -> None:
|
async def _abort_tts(self, session: DeviceSession, reason: str = "client_abort") -> None:
|
||||||
print(f"收到打断请求,停止当前 TTS: device={session.device_id} reason={reason}")
|
print(f"收到打断请求,停止当前 TTS: device={session.device_id} reason={reason}")
|
||||||
|
now = time.monotonic()
|
||||||
session.tts_stream_id += 1
|
session.tts_stream_id += 1
|
||||||
session.tts_suppressed_until = time.monotonic() + TTS_INTERRUPT_SUPPRESS_SECONDS
|
session.last_interrupt_time = now
|
||||||
|
session.tts_suppressed_until = now + TTS_INTERRUPT_SUPPRESS_SECONDS
|
||||||
|
session.tts_waiting_for_user_audio_after_interrupt = True
|
||||||
await self._force_stop_tts(session, reason)
|
await self._force_stop_tts(session, reason)
|
||||||
asyncio.create_task(self._send_agent_interrupt(session, reason))
|
asyncio.create_task(self._send_agent_interrupt(session, reason))
|
||||||
|
|
||||||
def _reset_tts_idle_timer(self, session: DeviceSession) -> None:
|
def _reset_tts_idle_timer(self, session: DeviceSession) -> None:
|
||||||
|
session.tts_last_audible_at = time.monotonic()
|
||||||
if session.tts_idle_task is not None:
|
if session.tts_idle_task is not None:
|
||||||
session.tts_idle_task.cancel()
|
session.tts_idle_task.cancel()
|
||||||
session.tts_idle_task = asyncio.create_task(
|
session.tts_idle_task = asyncio.create_task(
|
||||||
@ -536,11 +841,28 @@ class ESP32LiveKitBridge:
|
|||||||
|
|
||||||
async def _tts_idle_watchdog(self, session: DeviceSession, stream_id: int) -> None:
|
async def _tts_idle_watchdog(self, session: DeviceSession, stream_id: int) -> None:
|
||||||
try:
|
try:
|
||||||
|
while True:
|
||||||
await asyncio.sleep(TTS_IDLE_TIMEOUT_SECONDS)
|
await asyncio.sleep(TTS_IDLE_TIMEOUT_SECONDS)
|
||||||
if stream_id != session.tts_stream_id:
|
if stream_id != session.tts_stream_id or not session.tts_active:
|
||||||
return
|
return
|
||||||
print(f"TTS 空闲超过 {TTS_IDLE_TIMEOUT_SECONDS}s,切回聆听状态")
|
|
||||||
|
now = time.monotonic()
|
||||||
|
idle_for = now - session.tts_last_audible_at
|
||||||
|
active_for = now - session.tts_started_at
|
||||||
|
remaining = max(
|
||||||
|
TTS_IDLE_TIMEOUT_SECONDS - idle_for,
|
||||||
|
TTS_MIN_ACTIVE_SECONDS - active_for,
|
||||||
|
)
|
||||||
|
if remaining > 0:
|
||||||
|
await asyncio.sleep(remaining)
|
||||||
|
continue
|
||||||
|
|
||||||
|
print(
|
||||||
|
"TTS 静音达到阈值,切回聆听状态: "
|
||||||
|
f"idle={idle_for:.2f}s active={active_for:.2f}s"
|
||||||
|
)
|
||||||
await self._stop_tts(session)
|
await self._stop_tts(session)
|
||||||
|
return
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -681,20 +1003,22 @@ class ESP32LiveKitBridge:
|
|||||||
print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}")
|
print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}")
|
||||||
self._log_agent_participants(session, "connected")
|
self._log_agent_participants(session, "connected")
|
||||||
for participant in session.room.remote_participants.values():
|
for participant in session.room.remote_participants.values():
|
||||||
if self._is_agent_participant(participant):
|
if self._is_agent_participant(participant, session.agent_name):
|
||||||
session.agent_ready.set()
|
session.agent_ready.set()
|
||||||
self._scan_participant_audio_tracks(session, participant, "connected_scan")
|
self._scan_participant_audio_tracks(session, participant, "connected_scan")
|
||||||
|
self._handle_agent_state(session, participant)
|
||||||
|
|
||||||
@session.room.on("participant_connected")
|
@session.room.on("participant_connected")
|
||||||
def on_participant_connected(participant: rtc.RemoteParticipant) -> None:
|
def on_participant_connected(participant: rtc.RemoteParticipant) -> None:
|
||||||
role = "Agent" if self._is_agent_participant(participant) else "Remote participant"
|
role = "Agent" if self._is_agent_participant(participant, session.agent_name) else "Remote participant"
|
||||||
print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}")
|
print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}")
|
||||||
self._log_agent_participants(session, "participant_connected")
|
self._log_agent_participants(session, "participant_connected")
|
||||||
if self._is_agent_participant(participant):
|
if self._is_agent_participant(participant, session.agent_name):
|
||||||
session.agent_ready.set()
|
session.agent_ready.set()
|
||||||
self._scan_participant_audio_tracks(
|
self._scan_participant_audio_tracks(
|
||||||
session, participant, "participant_connected_scan"
|
session, participant, "participant_connected_scan"
|
||||||
)
|
)
|
||||||
|
self._handle_agent_state(session, participant)
|
||||||
|
|
||||||
@session.room.on("participant_disconnected")
|
@session.room.on("participant_disconnected")
|
||||||
def on_participant_disconnected(participant: rtc.RemoteParticipant) -> None:
|
def on_participant_disconnected(participant: rtc.RemoteParticipant) -> None:
|
||||||
@ -705,6 +1029,16 @@ class ESP32LiveKitBridge:
|
|||||||
if not track_sid.endswith(f":{participant.identity}")
|
if not track_sid.endswith(f":{participant.identity}")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@session.room.on("participant_attributes_changed")
|
||||||
|
def on_participant_attributes_changed(changed: list[str], participant: rtc.Participant) -> None:
|
||||||
|
if AGENT_STATE_ATTRIBUTE not in changed:
|
||||||
|
return
|
||||||
|
if not isinstance(participant, rtc.RemoteParticipant):
|
||||||
|
return
|
||||||
|
if not self._is_agent_participant(participant, session.agent_name):
|
||||||
|
return
|
||||||
|
self._handle_agent_state(session, participant)
|
||||||
|
|
||||||
@session.room.on("data_received")
|
@session.room.on("data_received")
|
||||||
def on_data_received(data_packet: rtc.DataPacket) -> None:
|
def on_data_received(data_packet: rtc.DataPacket) -> None:
|
||||||
identity = data_packet.participant.identity if data_packet.participant else "未知"
|
identity = data_packet.participant.identity if data_packet.participant else "未知"
|
||||||
@ -723,14 +1057,26 @@ class ESP32LiveKitBridge:
|
|||||||
track_pub: rtc.TrackPublication,
|
track_pub: rtc.TrackPublication,
|
||||||
) -> None:
|
) -> None:
|
||||||
identity = participant.identity if participant else "未知"
|
identity = participant.identity if participant else "未知"
|
||||||
is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(participant)
|
is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(
|
||||||
|
participant, session.agent_name
|
||||||
|
)
|
||||||
for segment in segments:
|
for segment in segments:
|
||||||
status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果"
|
status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果"
|
||||||
print(f"🗣️ [{status} | room={session.room_name} | {identity}]: {segment.text}")
|
print(f"🗣️ [{status} | room={session.room_name} | {identity}]: {segment.text}")
|
||||||
if is_agent:
|
if is_agent:
|
||||||
if time.monotonic() < session.tts_suppressed_until:
|
if time.monotonic() < session.tts_suppressed_until:
|
||||||
continue
|
continue
|
||||||
display_text = self._current_tts_display_text(segment.text)
|
print(f"[livekit-llm] raw={segment.text!r} final={segment.final}")
|
||||||
|
emotion, tts_text = self._parse_emotion_text(segment.text)
|
||||||
|
print(
|
||||||
|
f"[livekit-llm] parsed emotion={emotion!r} "
|
||||||
|
f"tts_text={tts_text!r} final={segment.final}"
|
||||||
|
)
|
||||||
|
if emotion and emotion != session.tts_emotion:
|
||||||
|
session.tts_emotion = emotion
|
||||||
|
asyncio.create_task(self._send_emotion(session, emotion))
|
||||||
|
display_text = self._current_tts_display_text(tts_text)
|
||||||
|
print(f"[livekit-llm] display_text={display_text!r} final={segment.final}")
|
||||||
if not display_text or display_text == session.tts_transcript_text:
|
if not display_text or display_text == session.tts_transcript_text:
|
||||||
continue
|
continue
|
||||||
session.tts_transcript_text = display_text
|
session.tts_transcript_text = display_text
|
||||||
@ -738,6 +1084,7 @@ class ESP32LiveKitBridge:
|
|||||||
if not segment.final:
|
if not segment.final:
|
||||||
continue
|
continue
|
||||||
display_text = segment.text
|
display_text = segment.text
|
||||||
|
asyncio.create_task(self._start_thinking(session))
|
||||||
|
|
||||||
if session.websocket is not None:
|
if session.websocket is not None:
|
||||||
ws = session.websocket
|
ws = session.websocket
|
||||||
@ -785,7 +1132,7 @@ class ESP32LiveKitBridge:
|
|||||||
# print(f"[config] token_url={TOKEN_URL}")
|
# print(f"[config] token_url={TOKEN_URL}")
|
||||||
# print(f"[config] room={session.room_name} identity={session.identity}")
|
# print(f"[config] room={session.room_name} identity={session.identity}")
|
||||||
# print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}")
|
# print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}")
|
||||||
token = await fetch_token(session.room_name, session.identity)
|
token = await fetch_token(session.room_name, session.identity, session.agent_name)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await session.room.connect(
|
await session.room.connect(
|
||||||
@ -830,15 +1177,45 @@ class ESP32LiveKitBridge:
|
|||||||
# print(f"等待 agent 加入: room={session.room_name}")
|
# print(f"等待 agent 加入: room={session.room_name}")
|
||||||
try:
|
try:
|
||||||
await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS)
|
await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS)
|
||||||
|
if session.closed:
|
||||||
|
return
|
||||||
# print(f"✅ agent 已就绪: room={session.room_name}")
|
# print(f"✅ agent 已就绪: room={session.room_name}")
|
||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
|
if session.closed:
|
||||||
|
return
|
||||||
print(f"⚠️ agent 等待超时: room={session.room_name}")
|
print(f"⚠️ agent 等待超时: room={session.room_name}")
|
||||||
|
|
||||||
async def start(self) -> None:
|
async def start(self) -> None:
|
||||||
print(f"[config] websocket_port={WS_PORT}")
|
print(f"[config] websocket_port={WS_PORT}")
|
||||||
|
print(f"[config] websocket_max_queue={WS_MAX_QUEUE} websocket_max_size={WS_MAX_SIZE}")
|
||||||
print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}")
|
print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}")
|
||||||
print(f"[config] token_url={TOKEN_URL}")
|
print(f"[config] token_url={TOKEN_URL}")
|
||||||
print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}")
|
print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}")
|
||||||
|
print(
|
||||||
|
"[config] audio="
|
||||||
|
f"uplink_decode:{INPUT_SAMPLE_RATE}Hz/{INPUT_FRAME_DURATION_MS}ms "
|
||||||
|
f"downlink_encode:{OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms "
|
||||||
|
f"stats_interval:{AUDIO_STATS_INTERVAL_SECONDS}s "
|
||||||
|
f"capture_timeout:{UPLINK_CAPTURE_TIMEOUT_SECONDS}s "
|
||||||
|
f"tts_idle:{TTS_IDLE_TIMEOUT_SECONDS}s "
|
||||||
|
f"tts_min_active:{TTS_MIN_ACTIVE_SECONDS}s "
|
||||||
|
f"tts_start_frames:{TTS_START_CONSECUTIVE_AUDIBLE_FRAMES} "
|
||||||
|
f"tts_pre_roll:{TTS_PRE_ROLL_MS}ms"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
"[config] agents="
|
||||||
|
f"normal:{CHAT_MODE_AGENT_NAMES['normal']} "
|
||||||
|
f"beaver:{CHAT_MODE_AGENT_NAMES['beaver']} "
|
||||||
|
f"vision-normal:{CHAT_MODE_AGENT_NAMES['vision-normal']} "
|
||||||
|
f"vision-beaver:{CHAT_MODE_AGENT_NAMES['vision-beaver']} "
|
||||||
|
f"default_mode:{DEFAULT_AGENT_MODE}"
|
||||||
|
)
|
||||||
|
if EMOTION_TEST_SEQUENCE:
|
||||||
|
print(
|
||||||
|
"[config] emotion_test_sequence="
|
||||||
|
f"{','.join(EMOTION_TEST_SEQUENCE)} "
|
||||||
|
f"interval={EMOTION_TEST_INTERVAL_SECONDS}s"
|
||||||
|
)
|
||||||
|
|
||||||
async def close(self) -> None:
|
async def close(self) -> None:
|
||||||
for session in list(self.device_sessions.values()):
|
for session in list(self.device_sessions.values()):
|
||||||
@ -849,7 +1226,9 @@ class ESP32LiveKitBridge:
|
|||||||
return
|
return
|
||||||
session.closed = True
|
session.closed = True
|
||||||
session.websocket = None
|
session.websocket = None
|
||||||
|
session.agent_ready.set()
|
||||||
session.tts_active = False
|
session.tts_active = False
|
||||||
|
session.tts_thinking = False
|
||||||
session.tts_stream_id += 1
|
session.tts_stream_id += 1
|
||||||
if session.tts_idle_task is not None:
|
if session.tts_idle_task is not None:
|
||||||
session.tts_idle_task.cancel()
|
session.tts_idle_task.cancel()
|
||||||
@ -870,13 +1249,20 @@ class ESP32LiveKitBridge:
|
|||||||
pending_pcm = bytearray()
|
pending_pcm = bytearray()
|
||||||
pre_roll_pcm = bytearray()
|
pre_roll_pcm = bytearray()
|
||||||
pre_roll_max_bytes = OUTPUT_SAMPLE_RATE * TTS_PRE_ROLL_MS // 1000 * 2
|
pre_roll_max_bytes = OUTPUT_SAMPLE_RATE * TTS_PRE_ROLL_MS // 1000 * 2
|
||||||
|
output_samples_per_opus_frame = OUTPUT_SAMPLE_RATE * OUTPUT_FRAME_DURATION_MS // 1000
|
||||||
|
output_frame_bytes = output_samples_per_opus_frame * 2
|
||||||
audible_frame_streak = 0
|
audible_frame_streak = 0
|
||||||
silence_frame_streak = 0
|
silence_frame_streak = 0
|
||||||
waiting_for_post_interrupt_silence = False
|
waiting_for_post_interrupt_silence = False
|
||||||
|
downlink_packets = 0
|
||||||
|
downlink_audio_ms = 0.0
|
||||||
|
last_downlink_stats_time = time.monotonic()
|
||||||
|
last_send_time: Optional[float] = None
|
||||||
stream_id = session.tts_stream_id
|
stream_id = session.tts_stream_id
|
||||||
print(
|
print(
|
||||||
f"启动 TTS 转发: device={session.device_id} room={session.room_name} "
|
f"启动 TTS 转发: device={session.device_id} room={session.room_name} "
|
||||||
f"track_sid={track_sid} stream_id={stream_id}"
|
f"track_sid={track_sid} stream_id={stream_id} "
|
||||||
|
f"opus={OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms"
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -897,7 +1283,8 @@ class ESP32LiveKitBridge:
|
|||||||
pcm_data = frame.data.tobytes()
|
pcm_data = frame.data.tobytes()
|
||||||
has_audible_audio = self._has_audible_audio(pcm_data)
|
has_audible_audio = self._has_audible_audio(pcm_data)
|
||||||
|
|
||||||
if time.monotonic() < session.tts_suppressed_until:
|
now = time.monotonic()
|
||||||
|
if now < session.tts_suppressed_until:
|
||||||
pending_pcm.clear()
|
pending_pcm.clear()
|
||||||
pre_roll_pcm.clear()
|
pre_roll_pcm.clear()
|
||||||
audible_frame_streak = 0
|
audible_frame_streak = 0
|
||||||
@ -905,6 +1292,31 @@ class ESP32LiveKitBridge:
|
|||||||
waiting_for_post_interrupt_silence = True
|
waiting_for_post_interrupt_silence = True
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
block_reason = self._tts_resume_block_reason(
|
||||||
|
session,
|
||||||
|
now,
|
||||||
|
include_user_quiet=False,
|
||||||
|
)
|
||||||
|
if block_reason is not None:
|
||||||
|
pending_pcm.clear()
|
||||||
|
pre_roll_pcm.clear()
|
||||||
|
audible_frame_streak = 0
|
||||||
|
silence_frame_streak = 0
|
||||||
|
if block_reason == "waiting_for_user_audio_after_interrupt":
|
||||||
|
waiting_for_post_interrupt_silence = True
|
||||||
|
continue
|
||||||
|
|
||||||
|
if (
|
||||||
|
waiting_for_post_interrupt_silence
|
||||||
|
and session.last_interrupt_time > 0.0
|
||||||
|
and session.last_uplink_audible_time >= session.last_interrupt_time
|
||||||
|
and now - session.last_uplink_audible_time
|
||||||
|
>= TTS_POST_INTERRUPT_USER_AUDIO_GRACE_SECONDS
|
||||||
|
):
|
||||||
|
print("检测到用户打断后语音已结束,允许新 TTS 直接起播")
|
||||||
|
waiting_for_post_interrupt_silence = False
|
||||||
|
silence_frame_streak = 0
|
||||||
|
|
||||||
if waiting_for_post_interrupt_silence:
|
if waiting_for_post_interrupt_silence:
|
||||||
if has_audible_audio:
|
if has_audible_audio:
|
||||||
silence_frame_streak = 0
|
silence_frame_streak = 0
|
||||||
@ -952,20 +1364,42 @@ class ESP32LiveKitBridge:
|
|||||||
|
|
||||||
if not current_frame_buffered:
|
if not current_frame_buffered:
|
||||||
pending_pcm.extend(pcm_data)
|
pending_pcm.extend(pcm_data)
|
||||||
frame_bytes = OUTPUT_SAMPLES_PER_OPUS_FRAME * 2
|
|
||||||
|
|
||||||
while (
|
while (
|
||||||
len(pending_pcm) >= frame_bytes
|
len(pending_pcm) >= output_frame_bytes
|
||||||
and stream_id == session.tts_stream_id
|
and stream_id == session.tts_stream_id
|
||||||
and session.websocket is not None
|
and session.websocket is not None
|
||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
opus_packet = encoder.encode(
|
now = time.monotonic()
|
||||||
bytes(pending_pcm[:frame_bytes]),
|
if last_send_time is not None:
|
||||||
OUTPUT_SAMPLES_PER_OPUS_FRAME,
|
send_gap_ms = (now - last_send_time) * 1000.0
|
||||||
|
if send_gap_ms > DOWNLINK_SEND_GAP_WARN_MS:
|
||||||
|
print(
|
||||||
|
"[downlink] warning: send gap "
|
||||||
|
f"{send_gap_ms:.1f}ms device={session.device_id} "
|
||||||
|
f"pending_ms={self._audio_duration_ms(len(pending_pcm) // 2, OUTPUT_SAMPLE_RATE):.1f}"
|
||||||
)
|
)
|
||||||
del pending_pcm[:frame_bytes]
|
last_send_time = now
|
||||||
|
|
||||||
|
opus_packet = encoder.encode(
|
||||||
|
bytes(pending_pcm[:output_frame_bytes]),
|
||||||
|
output_samples_per_opus_frame,
|
||||||
|
)
|
||||||
|
del pending_pcm[:output_frame_bytes]
|
||||||
await session.websocket.send(self._wrap_opus_payload(session, opus_packet))
|
await session.websocket.send(self._wrap_opus_payload(session, opus_packet))
|
||||||
|
downlink_packets += 1
|
||||||
|
downlink_audio_ms += OUTPUT_FRAME_DURATION_MS
|
||||||
|
if now - last_downlink_stats_time >= AUDIO_STATS_INTERVAL_SECONDS:
|
||||||
|
print(
|
||||||
|
"[downlink] "
|
||||||
|
f"device={session.device_id} packets={downlink_packets} "
|
||||||
|
f"audio_ms={downlink_audio_ms:.0f} "
|
||||||
|
f"pending_ms={self._audio_duration_ms(len(pending_pcm) // 2, OUTPUT_SAMPLE_RATE):.1f}"
|
||||||
|
)
|
||||||
|
downlink_packets = 0
|
||||||
|
downlink_audio_ms = 0.0
|
||||||
|
last_downlink_stats_time = now
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(f"发送回 ESP32 失败: {exc}")
|
print(f"发送回 ESP32 失败: {exc}")
|
||||||
break
|
break
|
||||||
@ -998,6 +1432,7 @@ class ESP32LiveKitBridge:
|
|||||||
await self._close_session(existing_session)
|
await self._close_session(existing_session)
|
||||||
self.device_sessions.pop(device_id, None)
|
self.device_sessions.pop(device_id, None)
|
||||||
|
|
||||||
|
chat_mode, agent_mode, agent_name, vision_enabled = self._resolve_agent_selection(websocket.request.headers)
|
||||||
room_name, identity = self._build_session_names(device_id)
|
room_name, identity = self._build_session_names(device_id)
|
||||||
session = DeviceSession(
|
session = DeviceSession(
|
||||||
device_id=device_id,
|
device_id=device_id,
|
||||||
@ -1005,6 +1440,10 @@ class ESP32LiveKitBridge:
|
|||||||
protocol_version=protocol_version,
|
protocol_version=protocol_version,
|
||||||
room_name=room_name,
|
room_name=room_name,
|
||||||
identity=identity,
|
identity=identity,
|
||||||
|
chat_mode=chat_mode,
|
||||||
|
agent_mode=agent_mode,
|
||||||
|
agent_name=agent_name,
|
||||||
|
vision_enabled=vision_enabled,
|
||||||
room=rtc.Room(),
|
room=rtc.Room(),
|
||||||
mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1),
|
mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1),
|
||||||
agent_ready=asyncio.Event(),
|
agent_ready=asyncio.Event(),
|
||||||
@ -1013,28 +1452,33 @@ class ESP32LiveKitBridge:
|
|||||||
|
|
||||||
print(f"ESP32 已连接: device={device_id}")
|
print(f"ESP32 已连接: device={device_id}")
|
||||||
print(f"ESP32 协议版本: {session.protocol_version}")
|
print(f"ESP32 协议版本: {session.protocol_version}")
|
||||||
|
print(
|
||||||
|
f"ESP32 mode: chat={session.chat_mode} "
|
||||||
|
f"agent={session.agent_mode}/{session.agent_name} "
|
||||||
|
f"vision={session.vision_enabled}"
|
||||||
|
)
|
||||||
session.tts_stream_id += 1
|
session.tts_stream_id += 1
|
||||||
opus_decoder = None
|
opus_decoder = None
|
||||||
|
uplink_packets = 0
|
||||||
|
uplink_audio_ms = 0.0
|
||||||
|
uplink_decode_errors = 0
|
||||||
|
uplink_dropped_frames = 0
|
||||||
|
last_uplink_stats_time = time.monotonic()
|
||||||
|
room_connect_task: Optional[asyncio.Task[Any]] = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
hello_msg = {
|
hello_msg = self._build_server_hello(session)
|
||||||
"type": "hello",
|
|
||||||
"transport": "websocket",
|
|
||||||
"session": {
|
|
||||||
"room": session.room_name,
|
|
||||||
"identity": session.identity,
|
|
||||||
},
|
|
||||||
"audio_params": {
|
|
||||||
"format": "opus",
|
|
||||||
"sample_rate": OUTPUT_SAMPLE_RATE,
|
|
||||||
"channels": 1,
|
|
||||||
"frame_duration": OUTPUT_FRAME_DURATION_MS,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
await websocket.send(json.dumps(hello_msg))
|
await websocket.send(json.dumps(hello_msg))
|
||||||
print(f"已发送 server hello: device={device_id} room={session.room_name}")
|
print(
|
||||||
|
f"已发送 server hello: device={device_id} room={session.room_name} "
|
||||||
|
f"audio={OUTPUT_SAMPLE_RATE}Hz/{OUTPUT_FRAME_DURATION_MS}ms"
|
||||||
|
)
|
||||||
|
asyncio.create_task(self._run_emotion_test_sequence(session))
|
||||||
|
|
||||||
await self._connect_session_room(session)
|
room_connect_task = asyncio.create_task(self._connect_session_room(session))
|
||||||
|
room_connect_task.add_done_callback(
|
||||||
|
lambda task: self._track_room_connect_task(session, task)
|
||||||
|
)
|
||||||
|
|
||||||
async for message in websocket:
|
async for message in websocket:
|
||||||
if isinstance(message, bytes):
|
if isinstance(message, bytes):
|
||||||
@ -1057,6 +1501,16 @@ class ESP32LiveKitBridge:
|
|||||||
if num_samples > 0:
|
if num_samples > 0:
|
||||||
session.captured_frame_count += 1
|
session.captured_frame_count += 1
|
||||||
now = time.monotonic()
|
now = time.monotonic()
|
||||||
|
uplink_packets += 1
|
||||||
|
uplink_audio_ms += self._audio_duration_ms(num_samples, INPUT_SAMPLE_RATE)
|
||||||
|
if self._has_audible_audio(pcm_bytes):
|
||||||
|
session.last_uplink_audible_time = now
|
||||||
|
if session.tts_waiting_for_user_audio_after_interrupt:
|
||||||
|
session.tts_waiting_for_user_audio_after_interrupt = False
|
||||||
|
print(
|
||||||
|
f"[uplink] detected user audio after interrupt: "
|
||||||
|
f"device={session.device_id}"
|
||||||
|
)
|
||||||
if (
|
if (
|
||||||
session.captured_frame_count <= 5
|
session.captured_frame_count <= 5
|
||||||
or now - session.first_capture_log_time >= 5.0
|
or now - session.first_capture_log_time >= 5.0
|
||||||
@ -1067,25 +1521,32 @@ class ESP32LiveKitBridge:
|
|||||||
# f"bytes={len(pcm_bytes)} samples={num_samples} "
|
# f"bytes={len(pcm_bytes)} samples={num_samples} "
|
||||||
# f"room={session.room_name}"
|
# f"room={session.room_name}"
|
||||||
# )
|
# )
|
||||||
try:
|
if now - last_uplink_stats_time >= AUDIO_STATS_INTERVAL_SECONDS:
|
||||||
frame = AudioFrame(pcm_bytes, INPUT_SAMPLE_RATE, 1, num_samples)
|
print(
|
||||||
await session.mic_source.capture_frame(frame)
|
"[uplink] "
|
||||||
except TypeError:
|
f"device={session.device_id} packets={uplink_packets} "
|
||||||
frame = AudioFrame.create(
|
f"audio_ms={uplink_audio_ms:.0f} "
|
||||||
sample_rate=INPUT_SAMPLE_RATE,
|
f"decode_errors={uplink_decode_errors} "
|
||||||
num_channels=1,
|
f"dropped_frames={uplink_dropped_frames}"
|
||||||
samples_per_channel=num_samples,
|
|
||||||
)
|
)
|
||||||
memoryview(frame.data).cast("B")[:] = pcm_bytes
|
uplink_packets = 0
|
||||||
await session.mic_source.capture_frame(frame)
|
uplink_audio_ms = 0.0
|
||||||
|
uplink_decode_errors = 0
|
||||||
|
uplink_dropped_frames = 0
|
||||||
|
last_uplink_stats_time = now
|
||||||
|
if not await self._capture_mic_frame(session, pcm_bytes, num_samples):
|
||||||
|
uplink_dropped_frames += 1
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
uplink_decode_errors += 1
|
||||||
print(f"Opus audio decode error ({len(message)} bytes): {exc}")
|
print(f"Opus audio decode error ({len(message)} bytes): {exc}")
|
||||||
elif isinstance(message, str):
|
elif isinstance(message, str):
|
||||||
try:
|
try:
|
||||||
data = json.loads(message)
|
data = json.loads(message)
|
||||||
# print(f"收到 ESP32 JSON 消息: {data}")
|
# print(f"收到 ESP32 JSON 消息: {data}")
|
||||||
msg_type = data.get("type")
|
msg_type = data.get("type")
|
||||||
if msg_type == "abort":
|
if msg_type == "hello":
|
||||||
|
self._log_client_hello(session, data)
|
||||||
|
elif msg_type == "abort":
|
||||||
reason = data.get("reason")
|
reason = data.get("reason")
|
||||||
abort_reason = reason if isinstance(reason, str) and reason else "button_abort"
|
abort_reason = reason if isinstance(reason, str) and reason else "button_abort"
|
||||||
print(f"处理 ESP32 打断请求: reason={abort_reason}")
|
print(f"处理 ESP32 打断请求: reason={abort_reason}")
|
||||||
@ -1100,6 +1561,10 @@ class ESP32LiveKitBridge:
|
|||||||
self._log_exception("WebSocket 其他错误", exc)
|
self._log_exception("WebSocket 其他错误", exc)
|
||||||
finally:
|
finally:
|
||||||
print(f"ESP32 断开连接: device={device_id} room={session.room_name}")
|
print(f"ESP32 断开连接: device={device_id} room={session.room_name}")
|
||||||
|
if room_connect_task is not None and not room_connect_task.done():
|
||||||
|
room_connect_task.cancel()
|
||||||
|
with contextlib.suppress(asyncio.CancelledError):
|
||||||
|
await room_connect_task
|
||||||
await self._close_session(session)
|
await self._close_session(session)
|
||||||
self.device_sessions.pop(device_id, None)
|
self.device_sessions.pop(device_id, None)
|
||||||
|
|
||||||
@ -1108,7 +1573,13 @@ async def main() -> None:
|
|||||||
bridge = ESP32LiveKitBridge()
|
bridge = ESP32LiveKitBridge()
|
||||||
try:
|
try:
|
||||||
await bridge.start()
|
await bridge.start()
|
||||||
async with websockets.serve(bridge.handle_websocket, "0.0.0.0", WS_PORT):
|
async with websockets.serve(
|
||||||
|
bridge.handle_websocket,
|
||||||
|
"0.0.0.0",
|
||||||
|
WS_PORT,
|
||||||
|
max_queue=WS_MAX_QUEUE,
|
||||||
|
max_size=WS_MAX_SIZE,
|
||||||
|
):
|
||||||
print(f"WebSocket 服务器运行在端口 {WS_PORT},等待 ESP32 连接...")
|
print(f"WebSocket 服务器运行在端口 {WS_PORT},等待 ESP32 连接...")
|
||||||
await asyncio.Future()
|
await asyncio.Future()
|
||||||
finally:
|
finally:
|
||||||
|
|||||||
@ -8,6 +8,7 @@ enum DeviceState {
|
|||||||
kDeviceStateIdle,
|
kDeviceStateIdle,
|
||||||
kDeviceStateConnecting,
|
kDeviceStateConnecting,
|
||||||
kDeviceStateListening,
|
kDeviceStateListening,
|
||||||
|
kDeviceStateThinking,
|
||||||
kDeviceStateSpeaking,
|
kDeviceStateSpeaking,
|
||||||
kDeviceStateUpgrading,
|
kDeviceStateUpgrading,
|
||||||
kDeviceStateActivating,
|
kDeviceStateActivating,
|
||||||
|
|||||||
@ -13,6 +13,7 @@ static const char* const STATE_STRINGS[] = {
|
|||||||
"idle",
|
"idle",
|
||||||
"connecting",
|
"connecting",
|
||||||
"listening",
|
"listening",
|
||||||
|
"thinking",
|
||||||
"speaking",
|
"speaking",
|
||||||
"upgrading",
|
"upgrading",
|
||||||
"activating",
|
"activating",
|
||||||
@ -69,9 +70,10 @@ bool DeviceStateMachine::IsValidTransition(DeviceState from, DeviceState to) con
|
|||||||
to == kDeviceStateActivating;
|
to == kDeviceStateActivating;
|
||||||
|
|
||||||
case kDeviceStateIdle:
|
case kDeviceStateIdle:
|
||||||
// Can go to connecting, listening (manual mode), speaking, activating, upgrading, or wifi configuring
|
// Can go to connecting, listening (manual mode), thinking, speaking, activating, upgrading, or wifi configuring
|
||||||
return to == kDeviceStateConnecting ||
|
return to == kDeviceStateConnecting ||
|
||||||
to == kDeviceStateListening ||
|
to == kDeviceStateListening ||
|
||||||
|
to == kDeviceStateThinking ||
|
||||||
to == kDeviceStateSpeaking ||
|
to == kDeviceStateSpeaking ||
|
||||||
to == kDeviceStateActivating ||
|
to == kDeviceStateActivating ||
|
||||||
to == kDeviceStateUpgrading ||
|
to == kDeviceStateUpgrading ||
|
||||||
@ -83,8 +85,15 @@ bool DeviceStateMachine::IsValidTransition(DeviceState from, DeviceState to) con
|
|||||||
to == kDeviceStateListening;
|
to == kDeviceStateListening;
|
||||||
|
|
||||||
case kDeviceStateListening:
|
case kDeviceStateListening:
|
||||||
// Can go to speaking or idle
|
// Can go to thinking, speaking, or idle
|
||||||
|
return to == kDeviceStateThinking ||
|
||||||
|
to == kDeviceStateSpeaking ||
|
||||||
|
to == kDeviceStateIdle;
|
||||||
|
|
||||||
|
case kDeviceStateThinking:
|
||||||
|
// Can go to speaking, listening, or idle
|
||||||
return to == kDeviceStateSpeaking ||
|
return to == kDeviceStateSpeaking ||
|
||||||
|
to == kDeviceStateListening ||
|
||||||
to == kDeviceStateIdle;
|
to == kDeviceStateIdle;
|
||||||
|
|
||||||
case kDeviceStateSpeaking:
|
case kDeviceStateSpeaking:
|
||||||
|
|||||||
@ -167,6 +167,8 @@ void EmoteDisplay::SetStatus(const char* const status)
|
|||||||
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_LISTEN, NULL);
|
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_LISTEN, NULL);
|
||||||
} else if (std::strcmp(status, Lang::Strings::STANDBY) == 0) {
|
} else if (std::strcmp(status, Lang::Strings::STANDBY) == 0) {
|
||||||
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_IDLE, NULL);
|
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_IDLE, NULL);
|
||||||
|
} else if (std::strcmp(status, Lang::Strings::THINKING) == 0) {
|
||||||
|
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_LISTEN, NULL);
|
||||||
} else if (std::strcmp(status, Lang::Strings::SPEAKING) == 0) {
|
} else if (std::strcmp(status, Lang::Strings::SPEAKING) == 0) {
|
||||||
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_SPEAK, NULL);
|
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_SPEAK, NULL);
|
||||||
} else if (std::strcmp(status, Lang::Strings::ERROR) == 0) {
|
} else if (std::strcmp(status, Lang::Strings::ERROR) == 0) {
|
||||||
|
|||||||
@ -203,6 +203,7 @@ void LvglDisplay::UpdateStatusBar(bool update_all) {
|
|||||||
kDeviceStateStarting,
|
kDeviceStateStarting,
|
||||||
kDeviceStateWifiConfiguring,
|
kDeviceStateWifiConfiguring,
|
||||||
kDeviceStateListening,
|
kDeviceStateListening,
|
||||||
|
kDeviceStateThinking,
|
||||||
kDeviceStateActivating,
|
kDeviceStateActivating,
|
||||||
};
|
};
|
||||||
if (std::find(allowed_states.begin(), allowed_states.end(), device_state) != allowed_states.end()) {
|
if (std::find(allowed_states.begin(), allowed_states.end(), device_state) != allowed_states.end()) {
|
||||||
|
|||||||
@ -228,6 +228,11 @@ void CircularStrip::OnStateChanged() {
|
|||||||
SetAllColor(color);
|
SetAllColor(color);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case kDeviceStateThinking: {
|
||||||
|
StripColor color = { low_brightness_, low_brightness_, default_brightness_ };
|
||||||
|
Blink(color, 300);
|
||||||
|
break;
|
||||||
|
}
|
||||||
case kDeviceStateUpgrading: {
|
case kDeviceStateUpgrading: {
|
||||||
StripColor color = { low_brightness_, default_brightness_, low_brightness_ };
|
StripColor color = { low_brightness_, default_brightness_, low_brightness_ };
|
||||||
Blink(color, 100);
|
Blink(color, 100);
|
||||||
|
|||||||
@ -235,6 +235,10 @@ void GpioLed::OnStateChanged() {
|
|||||||
// TurnOn();
|
// TurnOn();
|
||||||
StartFadeTask();
|
StartFadeTask();
|
||||||
break;
|
break;
|
||||||
|
case kDeviceStateThinking:
|
||||||
|
SetBrightness(DEFAULT_BRIGHTNESS);
|
||||||
|
StartContinuousBlink(300);
|
||||||
|
break;
|
||||||
case kDeviceStateSpeaking:
|
case kDeviceStateSpeaking:
|
||||||
SetBrightness(SPEAKING_BRIGHTNESS);
|
SetBrightness(SPEAKING_BRIGHTNESS);
|
||||||
TurnOn();
|
TurnOn();
|
||||||
|
|||||||
@ -152,6 +152,10 @@ void SingleLed::OnStateChanged() {
|
|||||||
SetColor(0, DEFAULT_BRIGHTNESS, 0);
|
SetColor(0, DEFAULT_BRIGHTNESS, 0);
|
||||||
TurnOn();
|
TurnOn();
|
||||||
break;
|
break;
|
||||||
|
case kDeviceStateThinking:
|
||||||
|
SetColor(0, 0, DEFAULT_BRIGHTNESS);
|
||||||
|
StartContinuousBlink(300);
|
||||||
|
break;
|
||||||
case kDeviceStateUpgrading:
|
case kDeviceStateUpgrading:
|
||||||
SetColor(0, DEFAULT_BRIGHTNESS, 0);
|
SetColor(0, DEFAULT_BRIGHTNESS, 0);
|
||||||
StartContinuousBlink(100);
|
StartContinuousBlink(100);
|
||||||
|
|||||||
@ -119,6 +119,8 @@ bool WebsocketProtocol::OpenAudioChannel() {
|
|||||||
websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str());
|
websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str());
|
||||||
websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
|
websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
|
||||||
websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
|
websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
|
||||||
|
websocket_->SetHeader("Agent-Mode", Application::GetInstance().GetChatAgentModeName());
|
||||||
|
websocket_->SetHeader("Chat-Mode", Application::GetInstance().GetChatModeName());
|
||||||
|
|
||||||
websocket_->OnData([this](const char* data, size_t len, bool binary) {
|
websocket_->OnData([this](const char* data, size_t len, bool binary) {
|
||||||
if (binary) {
|
if (binary) {
|
||||||
|
|||||||
Reference in New Issue
Block a user