8 Commits
ws ... icon

Author SHA1 Message Date
66d318774a fix: server error to disconnect 2026-06-17 15:19:31 +08:00
60df0fe196 feat: add tools to normal agent 2026-06-12 14:23:41 +08:00
2c4329fd84 fix: voice interupt 2026-06-12 11:38:47 +08:00
9637e09aef feat: beaver 2026-06-04 15:48:10 +08:00
b92e6e1b07 feat: remove background cam every time 2026-05-29 14:53:58 +08:00
33ee598c21 feat: add icon beaver 2026-05-29 11:22:31 +08:00
37343ac0fe feat: icon first commit 2026-05-27 17:16:11 +08:00
fc6302661d feat: support camera capture to livekit 2026-05-25 17:21:11 +08:00
31 changed files with 1735 additions and 365 deletions

2
.gitignore vendored
View File

@ -10,6 +10,7 @@ sdkconfig
dependencies.lock dependencies.lock
.env .env
releases/ releases/
vision_frames/
main/assets/lang_config.h main/assets/lang_config.h
main/mmap_generate_emoji.h main/mmap_generate_emoji.h
.DS_Store .DS_Store
@ -18,3 +19,4 @@ main/mmap_generate_emoji.h
*.bin *.bin
mmap_generate_*.h mmap_generate_*.h
.clangd .clangd
background_frames/

View File

@ -15,7 +15,7 @@ config USE_DIRECT_WEBSOCKET
config WEBSOCKET_URL config WEBSOCKET_URL
string "Default WebSocket URL" string "Default WebSocket URL"
depends on USE_DIRECT_WEBSOCKET depends on USE_DIRECT_WEBSOCKET
default "ws://10.6.80.130:8080" default "ws://172.19.0.240:8080"
help help
The WebSocket server URL used when direct WebSocket mode is enabled. The WebSocket server URL used when direct WebSocket mode is enabled.

View File

@ -1,25 +1,24 @@
#include "application.h" #include "application.h"
#include "assets.h"
#include "assets/lang_config.h"
#include "audio_codec.h"
#include "board.h" #include "board.h"
#include "display.h" #include "display.h"
#include "system_info.h"
#include "audio_codec.h"
#include "mqtt_protocol.h"
#include "websocket_protocol.h"
#include "assets/lang_config.h"
#include "mcp_server.h" #include "mcp_server.h"
#include "assets.h" #include "mqtt_protocol.h"
#include "settings.h" #include "settings.h"
#include "system_info.h"
#include "websocket_protocol.h"
#include <cstring>
#include <esp_log.h>
#include <cJSON.h>
#include <driver/gpio.h> #include <driver/gpio.h>
#include <esp_log.h>
#include <arpa/inet.h> #include <arpa/inet.h>
#include <cJSON.h>
#include <font_awesome.h> #include <font_awesome.h>
#include <cstring>
#define TAG "Application" #define TAG "Application"
Application::Application() { Application::Application() {
event_group_ = xEventGroupCreate(); event_group_ = xEventGroupCreate();
@ -33,16 +32,16 @@ Application::Application() {
aec_mode_ = kAecOff; aec_mode_ = kAecOff;
#endif #endif
esp_timer_create_args_t clock_timer_args = { esp_timer_create_args_t clock_timer_args = {.callback =
.callback = [](void* arg) { [](void* arg) {
Application* app = (Application*)arg; Application* app = (Application*)arg;
xEventGroupSetBits(app->event_group_, MAIN_EVENT_CLOCK_TICK); xEventGroupSetBits(app->event_group_,
}, MAIN_EVENT_CLOCK_TICK);
.arg = this, },
.dispatch_method = ESP_TIMER_TASK, .arg = this,
.name = "clock_timer", .dispatch_method = ESP_TIMER_TASK,
.skip_unhandled_events = true .name = "clock_timer",
}; .skip_unhandled_events = true};
esp_timer_create(&clock_timer_args, &clock_timer_handle_); esp_timer_create(&clock_timer_args, &clock_timer_handle_);
} }
@ -54,9 +53,7 @@ Application::~Application() {
vEventGroupDelete(event_group_); vEventGroupDelete(event_group_);
} }
bool Application::SetDeviceState(DeviceState state) { bool Application::SetDeviceState(DeviceState state) { return state_machine_.TransitionTo(state); }
return state_machine_.TransitionTo(state);
}
void Application::Initialize() { void Application::Initialize() {
auto& board = Board::GetInstance(); auto& board = Board::GetInstance();
@ -81,6 +78,7 @@ void Application::Initialize() {
xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED); xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED);
}; };
callbacks.on_vad_change = [this](bool speaking) { callbacks.on_vad_change = [this](bool speaking) {
vad_speaking_.store(speaking);
xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE); xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE);
}; };
audio_service_.SetCallbacks(callbacks); audio_service_.SetCallbacks(callbacks);
@ -141,13 +139,16 @@ void Application::Initialize() {
display->SetStatus(Lang::Strings::DETECTING_MODULE); display->SetStatus(Lang::Strings::DETECTING_MODULE);
break; break;
case NetworkEvent::ModemErrorNoSim: case NetworkEvent::ModemErrorNoSim:
Alert(Lang::Strings::ERROR, Lang::Strings::PIN_ERROR, "triangle_exclamation", Lang::Sounds::OGG_ERR_PIN); Alert(Lang::Strings::ERROR, Lang::Strings::PIN_ERROR, "triangle_exclamation",
Lang::Sounds::OGG_ERR_PIN);
break; break;
case NetworkEvent::ModemErrorRegDenied: case NetworkEvent::ModemErrorRegDenied:
Alert(Lang::Strings::ERROR, Lang::Strings::REG_ERROR, "triangle_exclamation", Lang::Sounds::OGG_ERR_REG); Alert(Lang::Strings::ERROR, Lang::Strings::REG_ERROR, "triangle_exclamation",
Lang::Sounds::OGG_ERR_REG);
break; break;
case NetworkEvent::ModemErrorInitFailed: case NetworkEvent::ModemErrorInitFailed:
Alert(Lang::Strings::ERROR, Lang::Strings::MODEM_INIT_ERROR, "triangle_exclamation", Lang::Sounds::OGG_EXCLAMATION); Alert(Lang::Strings::ERROR, Lang::Strings::MODEM_INIT_ERROR, "triangle_exclamation",
Lang::Sounds::OGG_EXCLAMATION);
break; break;
case NetworkEvent::ModemErrorTimeout: case NetworkEvent::ModemErrorTimeout:
display->SetStatus(Lang::Strings::REGISTERING_NETWORK); display->SetStatus(Lang::Strings::REGISTERING_NETWORK);
@ -167,18 +168,10 @@ void Application::Run() {
vTaskPrioritySet(nullptr, 10); vTaskPrioritySet(nullptr, 10);
const EventBits_t ALL_EVENTS = const EventBits_t ALL_EVENTS =
MAIN_EVENT_SCHEDULE | MAIN_EVENT_SCHEDULE | MAIN_EVENT_SEND_AUDIO | MAIN_EVENT_WAKE_WORD_DETECTED |
MAIN_EVENT_SEND_AUDIO | MAIN_EVENT_VAD_CHANGE | MAIN_EVENT_CLOCK_TICK | MAIN_EVENT_ERROR |
MAIN_EVENT_WAKE_WORD_DETECTED | MAIN_EVENT_NETWORK_CONNECTED | MAIN_EVENT_NETWORK_DISCONNECTED | MAIN_EVENT_TOGGLE_CHAT |
MAIN_EVENT_VAD_CHANGE | MAIN_EVENT_START_LISTENING | MAIN_EVENT_STOP_LISTENING | MAIN_EVENT_ACTIVATION_DONE |
MAIN_EVENT_CLOCK_TICK |
MAIN_EVENT_ERROR |
MAIN_EVENT_NETWORK_CONNECTED |
MAIN_EVENT_NETWORK_DISCONNECTED |
MAIN_EVENT_TOGGLE_CHAT |
MAIN_EVENT_START_LISTENING |
MAIN_EVENT_STOP_LISTENING |
MAIN_EVENT_ACTIVATION_DONE |
MAIN_EVENT_STATE_CHANGED; MAIN_EVENT_STATE_CHANGED;
while (true) { while (true) {
@ -186,7 +179,8 @@ void Application::Run() {
if (bits & MAIN_EVENT_ERROR) { if (bits & MAIN_EVENT_ERROR) {
SetDeviceState(kDeviceStateIdle); SetDeviceState(kDeviceStateIdle);
Alert(Lang::Strings::ERROR, last_error_message_.c_str(), "circle_xmark", Lang::Sounds::OGG_EXCLAMATION); Alert(Lang::Strings::ERROR, last_error_message_.c_str(), "circle_xmark",
Lang::Sounds::OGG_EXCLAMATION);
} }
if (bits & MAIN_EVENT_NETWORK_CONNECTED) { if (bits & MAIN_EVENT_NETWORK_CONNECTED) {
@ -233,6 +227,13 @@ void Application::Run() {
if (GetDeviceState() == kDeviceStateListening) { if (GetDeviceState() == kDeviceStateListening) {
auto led = Board::GetInstance().GetLed(); auto led = Board::GetInstance().GetLed();
led->OnStateChanged(); led->OnStateChanged();
if (vad_speaking_.load() && vision_text_mode_enabled_.load() &&
!vision_frame_sent_for_current_listen_.exchange(true)) {
if (!SendCurrentVisionFrame()) {
vision_frame_sent_for_current_listen_.store(false);
}
}
} }
} }
@ -270,12 +271,14 @@ void Application::HandleNetworkConnectedEvent() {
return; return;
} }
xTaskCreate([](void* arg) { xTaskCreate(
Application* app = static_cast<Application*>(arg); [](void* arg) {
app->ActivationTask(); Application* app = static_cast<Application*>(arg);
app->activation_task_handle_ = nullptr; app->ActivationTask();
vTaskDelete(NULL); app->activation_task_handle_ = nullptr;
}, "activation", 4096 * 2, this, 2, &activation_task_handle_); vTaskDelete(NULL);
},
"activation", 4096 * 2, this, 2, &activation_task_handle_);
} }
// Update the status bar immediately to show the network state // Update the status bar immediately to show the network state
@ -286,7 +289,8 @@ void Application::HandleNetworkConnectedEvent() {
void Application::HandleNetworkDisconnectedEvent() { void Application::HandleNetworkDisconnectedEvent() {
// Close current conversation when network disconnected // Close current conversation when network disconnected
auto state = GetDeviceState(); auto state = GetDeviceState();
if (state == kDeviceStateConnecting || state == kDeviceStateListening || state == kDeviceStateSpeaking) { if (state == kDeviceStateConnecting || state == kDeviceStateListening ||
state == kDeviceStateThinking || state == kDeviceStateSpeaking) {
ESP_LOGI(TAG, "Closing audio channel due to network disconnection"); ESP_LOGI(TAG, "Closing audio channel due to network disconnection");
protocol_->CloseAudioChannel(); protocol_->CloseAudioChannel();
} }
@ -371,7 +375,8 @@ void Application::CheckAssetsVersion() {
char message[256]; char message[256];
snprintf(message, sizeof(message), Lang::Strings::FOUND_NEW_ASSETS, download_url.c_str()); snprintf(message, sizeof(message), Lang::Strings::FOUND_NEW_ASSETS, download_url.c_str());
Alert(Lang::Strings::LOADING_ASSETS, message, "cloud_arrow_down", Lang::Sounds::OGG_UPGRADE); Alert(Lang::Strings::LOADING_ASSETS, message, "cloud_arrow_down",
Lang::Sounds::OGG_UPGRADE);
// Wait for the audio service to be idle for 3 seconds // Wait for the audio service to be idle for 3 seconds
vTaskDelay(pdMS_TO_TICKS(3000)); vTaskDelay(pdMS_TO_TICKS(3000));
@ -379,19 +384,21 @@ void Application::CheckAssetsVersion() {
board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE); board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE);
display->SetChatMessage("system", Lang::Strings::PLEASE_WAIT); display->SetChatMessage("system", Lang::Strings::PLEASE_WAIT);
bool success = assets.Download(download_url, [this, display](int progress, size_t speed) -> void { bool success =
char buffer[32]; assets.Download(download_url, [this, display](int progress, size_t speed) -> void {
snprintf(buffer, sizeof(buffer), "%d%% %uKB/s", progress, speed / 1024); char buffer[32];
Schedule([display, message = std::string(buffer)]() { snprintf(buffer, sizeof(buffer), "%d%% %uKB/s", progress, speed / 1024);
display->SetChatMessage("system", message.c_str()); Schedule([display, message = std::string(buffer)]() {
display->SetChatMessage("system", message.c_str());
});
}); });
});
board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);
vTaskDelay(pdMS_TO_TICKS(1000)); vTaskDelay(pdMS_TO_TICKS(1000));
if (!success) { if (!success) {
Alert(Lang::Strings::ERROR, Lang::Strings::DOWNLOAD_ASSETS_FAILED, "circle_xmark", Lang::Sounds::OGG_EXCLAMATION); Alert(Lang::Strings::ERROR, Lang::Strings::DOWNLOAD_ASSETS_FAILED, "circle_xmark",
Lang::Sounds::OGG_EXCLAMATION);
vTaskDelay(pdMS_TO_TICKS(2000)); vTaskDelay(pdMS_TO_TICKS(2000));
SetDeviceState(kDeviceStateActivating); SetDeviceState(kDeviceStateActivating);
return; return;
@ -407,7 +414,7 @@ void Application::CheckAssetsVersion() {
void Application::CheckNewVersion() { void Application::CheckNewVersion() {
const int MAX_RETRY = 10; const int MAX_RETRY = 10;
int retry_count = 0; int retry_count = 0;
int retry_delay = 10; // Initial retry delay in seconds int retry_delay = 10; // Initial retry delay in seconds
auto& board = Board::GetInstance(); auto& board = Board::GetInstance();
while (true) { while (true) {
@ -423,27 +430,30 @@ void Application::CheckNewVersion() {
} }
char error_message[128]; char error_message[128];
snprintf(error_message, sizeof(error_message), "code=%d, url=%s", err, ota_->GetCheckVersionUrl().c_str()); snprintf(error_message, sizeof(error_message), "code=%d, url=%s", err,
ota_->GetCheckVersionUrl().c_str());
char buffer[256]; char buffer[256];
snprintf(buffer, sizeof(buffer), Lang::Strings::CHECK_NEW_VERSION_FAILED, retry_delay, error_message); snprintf(buffer, sizeof(buffer), Lang::Strings::CHECK_NEW_VERSION_FAILED, retry_delay,
error_message);
Alert(Lang::Strings::ERROR, buffer, "cloud_slash", Lang::Sounds::OGG_EXCLAMATION); Alert(Lang::Strings::ERROR, buffer, "cloud_slash", Lang::Sounds::OGG_EXCLAMATION);
ESP_LOGW(TAG, "Check new version failed, retry in %d seconds (%d/%d)", retry_delay, retry_count, MAX_RETRY); ESP_LOGW(TAG, "Check new version failed, retry in %d seconds (%d/%d)", retry_delay,
retry_count, MAX_RETRY);
for (int i = 0; i < retry_delay; i++) { for (int i = 0; i < retry_delay; i++) {
vTaskDelay(pdMS_TO_TICKS(1000)); vTaskDelay(pdMS_TO_TICKS(1000));
if (GetDeviceState() == kDeviceStateIdle) { if (GetDeviceState() == kDeviceStateIdle) {
break; break;
} }
} }
retry_delay *= 2; // Double the retry delay retry_delay *= 2; // Double the retry delay
continue; continue;
} }
retry_count = 0; retry_count = 0;
retry_delay = 10; // Reset retry delay retry_delay = 10; // Reset retry delay
if (ota_->HasNewVersion()) { if (ota_->HasNewVersion()) {
if (UpgradeFirmware(ota_->GetFirmwareUrl(), ota_->GetFirmwareVersion())) { if (UpgradeFirmware(ota_->GetFirmwareUrl(), ota_->GetFirmwareVersion())) {
return; // This line will never be reached after reboot return; // This line will never be reached after reboot
} }
// If upgrade failed, continue to normal operation // If upgrade failed, continue to normal operation
} }
@ -499,9 +509,7 @@ void Application::InitializeProtocol() {
} }
#endif #endif
protocol_->OnConnected([this]() { protocol_->OnConnected([this]() { DismissAlert(); });
DismissAlert();
});
protocol_->OnNetworkError([this](const std::string& message) { protocol_->OnNetworkError([this](const std::string& message) {
last_error_message_ = message; last_error_message_ = message;
@ -509,7 +517,7 @@ void Application::InitializeProtocol() {
}); });
protocol_->OnIncomingAudio([this](std::unique_ptr<AudioStreamPacket> packet) { protocol_->OnIncomingAudio([this](std::unique_ptr<AudioStreamPacket> packet) {
if (GetDeviceState() == kDeviceStateSpeaking) { if (accepting_tts_audio_.load() || GetDeviceState() == kDeviceStateSpeaking) {
audio_service_.PushPacketToDecodeQueue(std::move(packet)); audio_service_.PushPacketToDecodeQueue(std::move(packet));
} }
}); });
@ -517,14 +525,20 @@ void Application::InitializeProtocol() {
protocol_->OnAudioChannelOpened([this, codec, &board]() { protocol_->OnAudioChannelOpened([this, codec, &board]() {
board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE); board.SetPowerSaveLevel(PowerSaveLevel::PERFORMANCE);
if (protocol_->server_sample_rate() != codec->output_sample_rate()) { if (protocol_->server_sample_rate() != codec->output_sample_rate()) {
ESP_LOGW(TAG, "Server sample rate %d does not match device output sample rate %d, resampling may cause distortion", ESP_LOGW(TAG,
protocol_->server_sample_rate(), codec->output_sample_rate()); "Server sample rate %d does not match device output sample rate %d, "
"resampling may cause distortion",
protocol_->server_sample_rate(), codec->output_sample_rate());
} }
}); });
protocol_->OnAudioChannelClosed([this, &board]() { protocol_->OnAudioChannelClosed([this, &board]() {
board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);
accepting_tts_audio_.store(false);
Schedule([this]() { Schedule([this]() {
if (GetDeviceState() == kDeviceStateConnecting) {
return;
}
auto display = Board::GetInstance().GetDisplay(); auto display = Board::GetInstance().GetDisplay();
display->SetChatMessage("system", ""); display->SetChatMessage("system", "");
SetDeviceState(kDeviceStateIdle); SetDeviceState(kDeviceStateIdle);
@ -536,14 +550,20 @@ void Application::InitializeProtocol() {
auto type = cJSON_GetObjectItem(root, "type"); auto type = cJSON_GetObjectItem(root, "type");
if (strcmp(type->valuestring, "tts") == 0) { if (strcmp(type->valuestring, "tts") == 0) {
auto state = cJSON_GetObjectItem(root, "state"); auto state = cJSON_GetObjectItem(root, "state");
if (strcmp(state->valuestring, "start") == 0) { if (strcmp(state->valuestring, "thinking") == 0) {
Schedule([this]() { SetDeviceState(kDeviceStateThinking); });
} else if (strcmp(state->valuestring, "start") == 0) {
audio_service_.ResetDecoder();
accepting_tts_audio_.store(true);
Schedule([this]() { Schedule([this]() {
aborted_ = false; aborted_ = false;
SetDeviceState(kDeviceStateSpeaking); SetDeviceState(kDeviceStateSpeaking);
}); });
} else if (strcmp(state->valuestring, "stop") == 0) { } else if (strcmp(state->valuestring, "stop") == 0) {
accepting_tts_audio_.store(false);
Schedule([this]() { Schedule([this]() {
if (GetDeviceState() == kDeviceStateSpeaking) { auto state = GetDeviceState();
if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
if (listening_mode_ == kListeningModeManualStop) { if (listening_mode_ == kListeningModeManualStop) {
SetDeviceState(kDeviceStateIdle); SetDeviceState(kDeviceStateIdle);
} else { } else {
@ -586,9 +606,7 @@ void Application::InitializeProtocol() {
ESP_LOGI(TAG, "System command: %s", command->valuestring); ESP_LOGI(TAG, "System command: %s", command->valuestring);
if (strcmp(command->valuestring, "reboot") == 0) { if (strcmp(command->valuestring, "reboot") == 0) {
// Do a reboot if user requests a OTA update // Do a reboot if user requests a OTA update
Schedule([this]() { Schedule([this]() { Reboot(); });
Reboot();
});
} else { } else {
ESP_LOGW(TAG, "Unknown system command: %s", command->valuestring); ESP_LOGW(TAG, "Unknown system command: %s", command->valuestring);
} }
@ -598,7 +616,8 @@ void Application::InitializeProtocol() {
auto message = cJSON_GetObjectItem(root, "message"); auto message = cJSON_GetObjectItem(root, "message");
auto emotion = cJSON_GetObjectItem(root, "emotion"); auto emotion = cJSON_GetObjectItem(root, "emotion");
if (cJSON_IsString(status) && cJSON_IsString(message) && cJSON_IsString(emotion)) { if (cJSON_IsString(status) && cJSON_IsString(message) && cJSON_IsString(emotion)) {
Alert(status->valuestring, message->valuestring, emotion->valuestring, Lang::Sounds::OGG_VIBRATION); Alert(status->valuestring, message->valuestring, emotion->valuestring,
Lang::Sounds::OGG_VIBRATION);
} else { } else {
ESP_LOGW(TAG, "Alert command requires status, message and emotion"); ESP_LOGW(TAG, "Alert command requires status, message and emotion");
} }
@ -607,9 +626,10 @@ void Application::InitializeProtocol() {
auto payload = cJSON_GetObjectItem(root, "payload"); auto payload = cJSON_GetObjectItem(root, "payload");
ESP_LOGI(TAG, "Received custom message: %s", cJSON_PrintUnformatted(root)); ESP_LOGI(TAG, "Received custom message: %s", cJSON_PrintUnformatted(root));
if (cJSON_IsObject(payload)) { if (cJSON_IsObject(payload)) {
Schedule([this, display, payload_str = std::string(cJSON_PrintUnformatted(payload))]() { Schedule(
display->SetChatMessage("system", payload_str.c_str()); [this, display, payload_str = std::string(cJSON_PrintUnformatted(payload))]() {
}); display->SetChatMessage("system", payload_str.c_str());
});
} else { } else {
ESP_LOGW(TAG, "Invalid custom message format: missing payload"); ESP_LOGW(TAG, "Invalid custom message format: missing payload");
} }
@ -627,32 +647,27 @@ void Application::ShowActivationCode(const std::string& code, const std::string&
char digit; char digit;
const std::string_view& sound; const std::string_view& sound;
}; };
static const std::array<digit_sound, 10> digit_sounds{{ static const std::array<digit_sound, 10> digit_sounds{
digit_sound{'0', Lang::Sounds::OGG_0}, {digit_sound{'0', Lang::Sounds::OGG_0}, digit_sound{'1', Lang::Sounds::OGG_1},
digit_sound{'1', Lang::Sounds::OGG_1}, digit_sound{'2', Lang::Sounds::OGG_2}, digit_sound{'3', Lang::Sounds::OGG_3},
digit_sound{'2', Lang::Sounds::OGG_2}, digit_sound{'4', Lang::Sounds::OGG_4}, digit_sound{'5', Lang::Sounds::OGG_5},
digit_sound{'3', Lang::Sounds::OGG_3}, digit_sound{'6', Lang::Sounds::OGG_6}, digit_sound{'7', Lang::Sounds::OGG_7},
digit_sound{'4', Lang::Sounds::OGG_4}, digit_sound{'8', Lang::Sounds::OGG_8}, digit_sound{'9', Lang::Sounds::OGG_9}}};
digit_sound{'5', Lang::Sounds::OGG_5},
digit_sound{'6', Lang::Sounds::OGG_6},
digit_sound{'7', Lang::Sounds::OGG_7},
digit_sound{'8', Lang::Sounds::OGG_8},
digit_sound{'9', Lang::Sounds::OGG_9}
}};
// This sentence uses 9KB of SRAM, so we need to wait for it to finish // This sentence uses 9KB of SRAM, so we need to wait for it to finish
Alert(Lang::Strings::ACTIVATION, message.c_str(), "link", Lang::Sounds::OGG_ACTIVATION); Alert(Lang::Strings::ACTIVATION, message.c_str(), "link", Lang::Sounds::OGG_ACTIVATION);
for (const auto& digit : code) { for (const auto& digit : code) {
auto it = std::find_if(digit_sounds.begin(), digit_sounds.end(), auto it = std::find_if(digit_sounds.begin(), digit_sounds.end(),
[digit](const digit_sound& ds) { return ds.digit == digit; }); [digit](const digit_sound& ds) { return ds.digit == digit; });
if (it != digit_sounds.end()) { if (it != digit_sounds.end()) {
audio_service_.PlaySound(it->sound); audio_service_.PlaySound(it->sound);
} }
} }
} }
void Application::Alert(const char* status, const char* message, const char* emotion, const std::string_view& sound) { void Application::Alert(const char* status, const char* message, const char* emotion,
const std::string_view& sound) {
ESP_LOGW(TAG, "Alert [%s] %s: %s", emotion, status, message); ESP_LOGW(TAG, "Alert [%s] %s: %s", emotion, status, message);
auto display = Board::GetInstance().GetDisplay(); auto display = Board::GetInstance().GetDisplay();
display->SetStatus(status); display->SetStatus(status);
@ -672,17 +687,40 @@ void Application::DismissAlert() {
} }
} }
void Application::ToggleChatState() { void Application::ToggleChatState() { ToggleChatStateForMode(kChatAgentModeNormal, false); }
// Requests a chat toggle for the normal agent with the vision flag set.
// All of the work is done by ToggleChatStateForMode().
void Application::ToggleChatStateWithVision() {
    constexpr bool kVisionEnabled = true;
    ToggleChatStateForMode(kChatAgentModeNormal, kVisionEnabled);
}
void Application::ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled) {
chat_agent_mode_.store(agent_mode);
vision_text_mode_enabled_.store(vision_enabled);
vision_frame_sent_for_current_listen_.store(false);
xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT); xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
} }
// Returns the current value of the vision_text_mode_enabled_ flag.
bool Application::IsVisionTextModeEnabled() const {
    const bool enabled = vision_text_mode_enabled_.load();
    return enabled;
}
// Returns a static name for the requested chat agent mode:
// "beaver" for kChatAgentModeBeaver, "normal" for every other value.
const char* Application::GetChatAgentModeName() const {
    if (chat_agent_mode_.load() == kChatAgentModeBeaver) {
        return "beaver";
    }
    return "normal";
}
// Returns a static name combining the requested agent mode and the vision flag:
//   beaver + vision -> "vision-beaver"    beaver only -> "beaver"
//   other  + vision -> "vision-normal"    other only  -> "normal"
const char* Application::GetChatModeName() const {
    // Read the vision flag first, then the agent mode (two independent atomic loads).
    const bool with_vision = vision_text_mode_enabled_.load();
    const bool is_beaver = chat_agent_mode_.load() == kChatAgentModeBeaver;

    if (with_vision) {
        return is_beaver ? "vision-beaver" : "vision-normal";
    }
    return is_beaver ? "beaver" : "normal";
}
void Application::StartListening() { void Application::StartListening() {
vision_text_mode_enabled_.store(false);
vision_frame_sent_for_current_listen_.store(false);
xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING); xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING);
} }
void Application::StopListening() { void Application::StopListening() { xEventGroupSetBits(event_group_, MAIN_EVENT_STOP_LISTENING); }
xEventGroupSetBits(event_group_, MAIN_EVENT_STOP_LISTENING);
}
void Application::HandleToggleChatEvent() { void Application::HandleToggleChatEvent() {
auto state = GetDeviceState(); auto state = GetDeviceState();
@ -707,17 +745,22 @@ void Application::HandleToggleChatEvent() {
if (state == kDeviceStateIdle) { if (state == kDeviceStateIdle) {
ListeningMode mode = GetDefaultListeningMode(); ListeningMode mode = GetDefaultListeningMode();
if (!protocol_->IsAudioChannelOpened()) { bool agent_mode_changed = chat_agent_mode_.load() != active_chat_agent_mode_.load();
bool vision_mode_changed =
vision_text_mode_enabled_.load() != active_vision_text_mode_enabled_.load();
if (!protocol_->IsAudioChannelOpened() || agent_mode_changed || vision_mode_changed) {
if (protocol_->IsAudioChannelOpened()) {
protocol_->CloseAudioChannel();
}
SetDeviceState(kDeviceStateConnecting); SetDeviceState(kDeviceStateConnecting);
// Schedule to let the state change be processed first (UI update) // Schedule to let the state change be processed first (UI update)
Schedule([this, mode]() { Schedule([this, mode]() { ContinueOpenAudioChannel(mode); });
ContinueOpenAudioChannel(mode);
});
return; return;
} }
SetListeningMode(mode); SetListeningMode(mode);
} else if (state == kDeviceStateSpeaking) { } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
AbortSpeaking(kAbortReasonNone); AbortSpeaking(kAbortReasonNone);
SetListeningMode(GetDefaultListeningMode());
} else if (state == kDeviceStateListening) { } else if (state == kDeviceStateListening) {
protocol_->CloseAudioChannel(); protocol_->CloseAudioChannel();
} }
@ -739,6 +782,8 @@ void Application::ContinueOpenAudioChannel(ListeningMode mode) {
} }
} }
active_chat_agent_mode_.store(chat_agent_mode_.load());
active_vision_text_mode_enabled_.store(vision_text_mode_enabled_.load());
SetListeningMode(mode); SetListeningMode(mode);
} }
@ -763,13 +808,11 @@ void Application::HandleStartListeningEvent() {
if (!protocol_->IsAudioChannelOpened()) { if (!protocol_->IsAudioChannelOpened()) {
SetDeviceState(kDeviceStateConnecting); SetDeviceState(kDeviceStateConnecting);
// Schedule to let the state change be processed first (UI update) // Schedule to let the state change be processed first (UI update)
Schedule([this]() { Schedule([this]() { ContinueOpenAudioChannel(kListeningModeManualStop); });
ContinueOpenAudioChannel(kListeningModeManualStop);
});
return; return;
} }
SetListeningMode(kListeningModeManualStop); SetListeningMode(kListeningModeManualStop);
} else if (state == kDeviceStateSpeaking) { } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
AbortSpeaking(kAbortReasonNone); AbortSpeaking(kAbortReasonNone);
SetListeningMode(kListeningModeManualStop); SetListeningMode(kListeningModeManualStop);
} }
@ -807,17 +850,14 @@ void Application::HandleWakeWordDetectedEvent() {
SetDeviceState(kDeviceStateConnecting); SetDeviceState(kDeviceStateConnecting);
// Schedule to let the state change be processed first (UI update), // Schedule to let the state change be processed first (UI update),
// then continue with OpenAudioChannel which may block for ~1 second // then continue with OpenAudioChannel which may block for ~1 second
Schedule([this, wake_word]() { Schedule([this, wake_word]() { ContinueWakeWordInvoke(wake_word); });
ContinueWakeWordInvoke(wake_word);
});
return; return;
} }
// Channel already opened, continue directly // Channel already opened, continue directly
ContinueWakeWordInvoke(wake_word); ContinueWakeWordInvoke(wake_word);
} else if (state == kDeviceStateSpeaking || state == kDeviceStateListening) { } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking ||
state == kDeviceStateListening) {
AbortSpeaking(kAbortReasonWakeWordDetected); AbortSpeaking(kAbortReasonWakeWordDetected);
// Clear send queue to avoid sending residues to server
while (audio_service_.PopPacketFromSendQueue());
if (state == kDeviceStateListening) { if (state == kDeviceStateListening) {
protocol_->SendStartListening(GetDefaultListeningMode()); protocol_->SendStartListening(GetDefaultListeningMode());
@ -882,9 +922,10 @@ void Application::HandleStateChangedEvent() {
switch (new_state) { switch (new_state) {
case kDeviceStateUnknown: case kDeviceStateUnknown:
case kDeviceStateIdle: case kDeviceStateIdle:
vision_frame_sent_for_current_listen_.store(false);
display->SetStatus(Lang::Strings::STANDBY); display->SetStatus(Lang::Strings::STANDBY);
display->ClearChatMessages(); // Clear messages first display->ClearChatMessages(); // Clear messages first
display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count) display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count)
audio_service_.EnableVoiceProcessing(false); audio_service_.EnableVoiceProcessing(false);
audio_service_.EnableWakeWordDetection(true); audio_service_.EnableWakeWordDetection(true);
break; break;
@ -894,21 +935,19 @@ void Application::HandleStateChangedEvent() {
display->SetChatMessage("system", ""); display->SetChatMessage("system", "");
break; break;
case kDeviceStateListening: case kDeviceStateListening:
vad_speaking_.store(false);
vision_frame_sent_for_current_listen_.store(false);
display->SetStatus(Lang::Strings::LISTENING); display->SetStatus(Lang::Strings::LISTENING);
display->SetEmotion("neutral"); display->SetEmotion("neutral");
// Make sure the audio processor is running // Re-entering listening after an interrupt must restart the capture path even if the
if (play_popup_on_listening_ || !audio_service_.IsAudioProcessorRunning()) { // processor task is still marked running, otherwise realtime mode can show Listening
// For auto mode, wait for playback queue to be empty before enabling voice processing // while no fresh mic frames are sent.
// This prevents audio truncation when STOP arrives late due to network jitter if (listening_mode_ == kListeningModeAutoStop) {
if (listening_mode_ == kListeningModeAutoStop) { audio_service_.WaitForPlaybackQueueEmpty();
audio_service_.WaitForPlaybackQueueEmpty();
}
// Send the start listening command
protocol_->SendStartListening(listening_mode_);
audio_service_.EnableVoiceProcessing(true);
} }
protocol_->SendStartListening(listening_mode_);
audio_service_.EnableVoiceProcessing(true);
#ifdef CONFIG_WAKE_WORD_DETECTION_IN_LISTENING #ifdef CONFIG_WAKE_WORD_DETECTION_IN_LISTENING
// Enable wake word detection in listening mode (configured via Kconfig) // Enable wake word detection in listening mode (configured via Kconfig)
@ -924,6 +963,16 @@ void Application::HandleStateChangedEvent() {
audio_service_.PlaySound(Lang::Sounds::OGG_POPUP); audio_service_.PlaySound(Lang::Sounds::OGG_POPUP);
} }
break; break;
case kDeviceStateThinking:
vad_speaking_.store(false);
display->SetStatus(Lang::Strings::THINKING);
display->SetEmotion("thinking");
if (listening_mode_ != kListeningModeRealtime) {
audio_service_.EnableVoiceProcessing(false);
audio_service_.EnableWakeWordDetection(audio_service_.IsAfeWakeWord());
}
break;
case kDeviceStateSpeaking: case kDeviceStateSpeaking:
display->SetStatus(Lang::Strings::SPEAKING); display->SetStatus(Lang::Strings::SPEAKING);
@ -932,7 +981,9 @@ void Application::HandleStateChangedEvent() {
// Only AFE wake word can be detected in speaking mode // Only AFE wake word can be detected in speaking mode
audio_service_.EnableWakeWordDetection(audio_service_.IsAfeWakeWord()); audio_service_.EnableWakeWordDetection(audio_service_.IsAfeWakeWord());
} }
audio_service_.ResetDecoder(); if (!accepting_tts_audio_.load()) {
audio_service_.ResetDecoder();
}
break; break;
case kDeviceStateWifiConfiguring: case kDeviceStateWifiConfiguring:
audio_service_.EnableVoiceProcessing(false); audio_service_.EnableVoiceProcessing(false);
@ -944,6 +995,27 @@ void Application::HandleStateChangedEvent() {
} }
} }
// Captures one JPEG frame from the board camera and hands it to the protocol.
//
// Returns false without sending anything when:
//   - there is no protocol object or its audio channel is not opened,
//   - the board reports no camera,
//   - the JPEG capture fails (a warning is logged).
// Returns true once SendVisionFrame() has been called; any result of that call
// is not inspected here.
bool Application::SendCurrentVisionFrame() {
    const bool channel_ready = protocol_ && protocol_->IsAudioChannelOpened();
    if (!channel_ready) {
        return false;
    }

    auto cam = Board::GetInstance().GetCamera();
    if (cam == nullptr) {
        return false;
    }

    std::string frame;
    // NOTE(review): the meaning of the second CaptureToJpeg argument is not visible here.
    const bool captured = cam->CaptureToJpeg(frame, true);
    if (!captured) {
        ESP_LOGW(TAG, "Failed to capture vision frame");
        return false;
    }

    protocol_->SendVisionFrame(frame);
    ESP_LOGI(TAG, "Sent vision frame, size=%u bytes", static_cast<unsigned>(frame.size()));
    return true;
}
void Application::Schedule(std::function<void()>&& callback) { void Application::Schedule(std::function<void()>&& callback) {
{ {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
@ -955,6 +1027,8 @@ void Application::Schedule(std::function<void()>&& callback) {
void Application::AbortSpeaking(AbortReason reason) { void Application::AbortSpeaking(AbortReason reason) {
ESP_LOGI(TAG, "Abort speaking"); ESP_LOGI(TAG, "Abort speaking");
aborted_ = true; aborted_ = true;
accepting_tts_audio_.store(false);
audio_service_.ResetDecoder();
if (protocol_) { if (protocol_) {
protocol_->SendAbortSpeaking(reason); protocol_->SendAbortSpeaking(reason);
} }
@ -962,6 +1036,8 @@ void Application::AbortSpeaking(AbortReason reason) {
void Application::SetListeningMode(ListeningMode mode) { void Application::SetListeningMode(ListeningMode mode) {
listening_mode_ = mode; listening_mode_ = mode;
vad_speaking_.store(false);
vision_frame_sent_for_current_listen_.store(false);
SetDeviceState(kDeviceStateListening); SetDeviceState(kDeviceStateListening);
} }
@ -996,7 +1072,8 @@ bool Application::UpgradeFirmware(const std::string& url, const std::string& ver
} }
ESP_LOGI(TAG, "Starting firmware upgrade from URL: %s", upgrade_url.c_str()); ESP_LOGI(TAG, "Starting firmware upgrade from URL: %s", upgrade_url.c_str());
Alert(Lang::Strings::OTA_UPGRADE, Lang::Strings::UPGRADING, "download", Lang::Sounds::OGG_UPGRADE); Alert(Lang::Strings::OTA_UPGRADE, Lang::Strings::UPGRADING, "download",
Lang::Sounds::OGG_UPGRADE);
vTaskDelay(pdMS_TO_TICKS(3000)); vTaskDelay(pdMS_TO_TICKS(3000));
SetDeviceState(kDeviceStateUpgrading); SetDeviceState(kDeviceStateUpgrading);
@ -1018,17 +1095,19 @@ bool Application::UpgradeFirmware(const std::string& url, const std::string& ver
if (!upgrade_success) { if (!upgrade_success) {
// Upgrade failed, restart audio service and continue running // Upgrade failed, restart audio service and continue running
ESP_LOGE(TAG, "Firmware upgrade failed, restarting audio service and continuing operation..."); ESP_LOGE(TAG,
audio_service_.Start(); // Restart audio service "Firmware upgrade failed, restarting audio service and continuing operation...");
board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); // Restore power save level audio_service_.Start(); // Restart audio service
Alert(Lang::Strings::ERROR, Lang::Strings::UPGRADE_FAILED, "circle_xmark", Lang::Sounds::OGG_EXCLAMATION); board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); // Restore power save level
Alert(Lang::Strings::ERROR, Lang::Strings::UPGRADE_FAILED, "circle_xmark",
Lang::Sounds::OGG_EXCLAMATION);
vTaskDelay(pdMS_TO_TICKS(3000)); vTaskDelay(pdMS_TO_TICKS(3000));
return false; return false;
} else { } else {
// Upgrade success, reboot immediately // Upgrade success, reboot immediately
ESP_LOGI(TAG, "Firmware upgrade successful, rebooting..."); ESP_LOGI(TAG, "Firmware upgrade successful, rebooting...");
display->SetChatMessage("system", "Upgrade successful, rebooting..."); display->SetChatMessage("system", "Upgrade successful, rebooting...");
vTaskDelay(pdMS_TO_TICKS(1000)); // Brief pause to show message vTaskDelay(pdMS_TO_TICKS(1000)); // Brief pause to show message
Reboot(); Reboot();
return true; return true;
} }
@ -1047,17 +1126,13 @@ void Application::WakeWordInvoke(const std::string& wake_word) {
if (!protocol_->IsAudioChannelOpened()) { if (!protocol_->IsAudioChannelOpened()) {
SetDeviceState(kDeviceStateConnecting); SetDeviceState(kDeviceStateConnecting);
// Schedule to let the state change be processed first (UI update) // Schedule to let the state change be processed first (UI update)
Schedule([this, wake_word]() { Schedule([this, wake_word]() { ContinueWakeWordInvoke(wake_word); });
ContinueWakeWordInvoke(wake_word);
});
return; return;
} }
// Channel already opened, continue directly // Channel already opened, continue directly
ContinueWakeWordInvoke(wake_word); ContinueWakeWordInvoke(wake_word);
} else if (state == kDeviceStateSpeaking) { } else if (state == kDeviceStateSpeaking || state == kDeviceStateThinking) {
Schedule([this]() { Schedule([this]() { AbortSpeaking(kAbortReasonNone); });
AbortSpeaking(kAbortReasonNone);
});
} else if (state == kDeviceStateListening) { } else if (state == kDeviceStateListening) {
Schedule([this]() { Schedule([this]() {
if (protocol_) { if (protocol_) {
@ -1090,7 +1165,7 @@ void Application::RegisterMcpBroadcastCallback(std::function<void(const std::str
void Application::SendMcpMessage(const std::string& payload) { void Application::SendMcpMessage(const std::string& payload) {
// Always schedule to run in main task for thread safety // Always schedule to run in main task for thread safety
Schedule([this, payload](){ Schedule([this, payload]() {
if (protocol_) { if (protocol_) {
protocol_->SendMcpMessage(payload); protocol_->SendMcpMessage(payload);
} }
@ -1106,18 +1181,18 @@ void Application::SetAecMode(AecMode mode) {
auto& board = Board::GetInstance(); auto& board = Board::GetInstance();
auto display = board.GetDisplay(); auto display = board.GetDisplay();
switch (aec_mode_) { switch (aec_mode_) {
case kAecOff: case kAecOff:
audio_service_.EnableDeviceAec(false); audio_service_.EnableDeviceAec(false);
display->ShowNotification(Lang::Strings::RTC_MODE_OFF); display->ShowNotification(Lang::Strings::RTC_MODE_OFF);
break; break;
case kAecOnServerSide: case kAecOnServerSide:
audio_service_.EnableDeviceAec(false); audio_service_.EnableDeviceAec(false);
display->ShowNotification(Lang::Strings::RTC_MODE_ON); display->ShowNotification(Lang::Strings::RTC_MODE_ON);
break; break;
case kAecOnDeviceSide: case kAecOnDeviceSide:
audio_service_.EnableDeviceAec(true); audio_service_.EnableDeviceAec(true);
display->ShowNotification(Lang::Strings::RTC_MODE_ON); display->ShowNotification(Lang::Strings::RTC_MODE_ON);
break; break;
} }
// If the AEC mode is changed, close the audio channel // If the AEC mode is changed, close the audio channel
@ -1127,9 +1202,7 @@ void Application::SetAecMode(AecMode mode) {
}); });
} }
void Application::PlaySound(const std::string_view& sound) { void Application::PlaySound(const std::string_view& sound) { audio_service_.PlaySound(sound); }
audio_service_.PlaySound(sound);
}
void Application::ResetProtocol() { void Application::ResetProtocol() {
Schedule([this]() { Schedule([this]() {

View File

@ -11,6 +11,7 @@
#include <deque> #include <deque>
#include <memory> #include <memory>
#include <functional> #include <functional>
#include <atomic>
#include "protocol.h" #include "protocol.h"
#include "ota.h" #include "ota.h"
@ -40,6 +41,11 @@ enum AecMode {
kAecOnServerSide, kAecOnServerSide,
}; };
/**
 * Chat agent selected for a conversation.
 *
 * Kept as an unscoped enum so existing callers can keep using the bare
 * enumerator names.
 */
enum ChatAgentMode {
    kChatAgentModeNormal = 0,  // default agent
    kChatAgentModeBeaver = 1,  // "beaver" agent
};
class Application { class Application {
public: public:
static Application& GetInstance() { static Application& GetInstance() {
@ -91,6 +97,12 @@ public:
* Sends MAIN_EVENT_TOGGLE_CHAT to be handled in Run() * Sends MAIN_EVENT_TOGGLE_CHAT to be handled in Run()
*/ */
void ToggleChatState(); void ToggleChatState();
void ToggleChatStateWithVision();
void ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled);
bool IsVisionTextModeEnabled() const;
ChatAgentMode GetChatAgentMode() const { return chat_agent_mode_.load(); }
const char* GetChatAgentModeName() const;
const char* GetChatModeName() const;
/** /**
* Start listening (event-based, thread-safe) * Start listening (event-based, thread-safe)
@ -144,6 +156,13 @@ private:
bool aborted_ = false; bool aborted_ = false;
bool assets_version_checked_ = false; bool assets_version_checked_ = false;
bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening
std::atomic<ChatAgentMode> chat_agent_mode_ = kChatAgentModeNormal;
std::atomic<ChatAgentMode> active_chat_agent_mode_ = kChatAgentModeNormal;
std::atomic<bool> vision_text_mode_enabled_ = false;
std::atomic<bool> active_vision_text_mode_enabled_ = false;
std::atomic<bool> vad_speaking_ = false;
std::atomic<bool> vision_frame_sent_for_current_listen_ = false;
std::atomic<bool> accepting_tts_audio_ = false;
int clock_ticks_ = 0; int clock_ticks_ = 0;
TaskHandle_t activation_task_handle_ = nullptr; TaskHandle_t activation_task_handle_ = nullptr;
@ -159,6 +178,7 @@ private:
void HandleWakeWordDetectedEvent(); void HandleWakeWordDetectedEvent();
void ContinueOpenAudioChannel(ListeningMode mode); void ContinueOpenAudioChannel(ListeningMode mode);
void ContinueWakeWordInvoke(const std::string& wake_word); void ContinueWakeWordInvoke(const std::string& wake_word);
bool SendCurrentVisionFrame();
// Activation task (runs in background) // Activation task (runs in background)
void ActivationTask(); void ActivationTask();

View File

@ -26,6 +26,7 @@
"CONNECTION_SUCCESSFUL": "Connection Successful", "CONNECTION_SUCCESSFUL": "Connection Successful",
"CONNECTED_TO": "Connected to ", "CONNECTED_TO": "Connected to ",
"LISTENING": "Listening...", "LISTENING": "Listening...",
"THINKING": "Thinking...",
"SPEAKING": "Speaking...", "SPEAKING": "Speaking...",
"SERVER_NOT_FOUND": "Looking for available service", "SERVER_NOT_FOUND": "Looking for available service",
"SERVER_NOT_CONNECTED": "Unable to connect to service, please try again later", "SERVER_NOT_CONNECTED": "Unable to connect to service, please try again later",

View File

@ -23,6 +23,7 @@
"CONNECTING": "连接中...", "CONNECTING": "连接中...",
"CONNECTED_TO": "已连接 ", "CONNECTED_TO": "已连接 ",
"LISTENING": "聆听中...", "LISTENING": "聆听中...",
"THINKING": "思考中...",
"SPEAKING": "说话中...", "SPEAKING": "说话中...",
"SERVER_NOT_FOUND": "正在寻找可用服务", "SERVER_NOT_FOUND": "正在寻找可用服务",
"SERVER_NOT_CONNECTED": "无法连接服务,请稍后再试", "SERVER_NOT_CONNECTED": "无法连接服务,请稍后再试",

View File

@ -579,6 +579,7 @@ void AudioService::EnableWakeWordDetection(bool enable) {
void AudioService::EnableVoiceProcessing(bool enable) { void AudioService::EnableVoiceProcessing(bool enable) {
ESP_LOGD(TAG, "%s voice processing", enable ? "Enabling" : "Disabling"); ESP_LOGD(TAG, "%s voice processing", enable ? "Enabling" : "Disabling");
if (enable) { if (enable) {
bool was_running = IsAudioProcessorRunning();
if (!audio_processor_initialized_) { if (!audio_processor_initialized_) {
audio_processor_->Initialize(codec_, OPUS_FRAME_DURATION_MS, models_list_); audio_processor_->Initialize(codec_, OPUS_FRAME_DURATION_MS, models_list_);
audio_processor_initialized_ = true; audio_processor_initialized_ = true;
@ -586,7 +587,7 @@ void AudioService::EnableVoiceProcessing(bool enable) {
/* We should make sure no audio is playing */ /* We should make sure no audio is playing */
ResetDecoder(); ResetDecoder();
audio_input_need_warmup_ = true; audio_input_need_warmup_ = !was_running;
// Reset input resampler to clear cached data from previous mode (e.g. WakeWord) // Reset input resampler to clear cached data from previous mode (e.g. WakeWord)
// This prevents buffer overflow when switching between different feed sizes // This prevents buffer overflow when switching between different feed sizes
{ {

View File

@ -0,0 +1,177 @@
#include "background_capture_service.h"
#include "board.h"
#include "camera.h"
#include <algorithm>
#include <esp_heap_caps.h>
#include <esp_log.h>
#define TAG "BgCapture"
// Construction does no work; the capture task is only created by Start().
BackgroundCaptureService::BackgroundCaptureService() = default;

// Destruction stops the capture task first: Stop() returns only once the task
// has cleared task_handle_, so the task no longer touches this object.
BackgroundCaptureService::~BackgroundCaptureService() {
    this->Stop();
}
/**
 * Starts the background capture task if it is not already running.
 *
 * Compiles to a no-op unless CONFIG_BACKGROUND_CAPTURE_ENABLE is set. If the
 * task cannot be created, the running flag is rolled back and an error is logged.
 */
void BackgroundCaptureService::Start() {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
    // exchange() returns the previous value: true means a task is already active.
    const bool already_running = running_.exchange(true);
    if (already_running) {
        return;
    }

    const auto created = xTaskCreate(&BackgroundCaptureService::TaskEntry,
                                     "bg_capture",
                                     CONFIG_BACKGROUND_CAPTURE_TASK_STACK_SIZE,
                                     this,
                                     CONFIG_BACKGROUND_CAPTURE_TASK_PRIORITY,
                                     &task_handle_);
    if (created == pdPASS) {
        return;
    }

    // Creation failed: undo the state so a later Start() can retry.
    running_.store(false);
    task_handle_ = nullptr;
    ESP_LOGE(TAG, "Failed to create background capture task");
#endif
}
/**
 * Requests the capture task to stop and blocks until it has exited its loop.
 *
 * Compiles to a no-op unless CONFIG_BACKGROUND_CAPTURE_ENABLE is set; also
 * returns immediately when the service was not running.
 */
void BackgroundCaptureService::Stop() {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
    const bool was_running = running_.exchange(false);
    if (was_running) {
        // TaskEntry() clears task_handle_ after Run() returns; poll for that.
        // NOTE(review): task_handle_ is a plain (non-atomic) member written by the
        // capture task, and calling Stop() from the capture task itself would wait
        // forever — confirm callers only stop the service from another task.
        while (task_handle_ != nullptr) {
            vTaskDelay(pdMS_TO_TICKS(20));
        }
    }
#endif
}
/**
 * FreeRTOS task entry point for the capture task.
 *
 * @param arg  The BackgroundCaptureService instance passed to xTaskCreate().
 */
void BackgroundCaptureService::TaskEntry(void* arg) {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
    auto* self = static_cast<BackgroundCaptureService*>(arg);
    self->Run();
    // Clearing the handle is what releases a caller blocked in Stop().
    self->task_handle_ = nullptr;
#else
    (void)arg;
#endif
    // Passing nullptr deletes the calling task.
    vTaskDelete(nullptr);
}
/**
 * Body of the capture task: captures and uploads frames until running_ is cleared.
 *
 * After a successful frame the task sleeps for the configured frame interval.
 * After a failure it sleeps for the back-off delay returned by
 * GetFailureDelayMs(), which grows with the number of consecutive failures.
 * The running_ flag is only re-checked between sleeps, so a stop request takes
 * effect after the current delay has elapsed.
 */
void BackgroundCaptureService::Run() {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
    ESP_LOGI(TAG, "Background capture task started");

    while (running_.load()) {
        if (!CaptureAndSendFrame()) {
            consecutive_failures_++;
            const uint32_t delay_ms = GetFailureDelayMs();
            // uint32_t is not guaranteed to be `unsigned int` (it is `unsigned long`
            // on ESP-IDF 5 toolchains), so cast explicitly to match "%u" — the same
            // convention the other log statements in this file use.
            ESP_LOGW(TAG, "Background capture retry in %u ms, failures=%u",
                     static_cast<unsigned>(delay_ms),
                     static_cast<unsigned>(consecutive_failures_));
            vTaskDelay(pdMS_TO_TICKS(delay_ms));
            continue;
        }

        consecutive_failures_ = 0;
        vTaskDelay(pdMS_TO_TICKS(CONFIG_BACKGROUND_CAPTURE_FRAME_INTERVAL_MS));
    }

    ESP_LOGI(TAG, "Background capture task stopped");
#endif
}
/**
 * Captures a single frame without preview and forwards it to UploadJpegFrame().
 *
 * @return The result of UploadJpegFrame() on a successful capture; false when
 *         internal heap is below the configured threshold, no camera is
 *         available, the capture fails, or the captured JPEG is empty.
 *         Always false when CONFIG_BACKGROUND_CAPTURE_ENABLE is not set.
 */
bool BackgroundCaptureService::CaptureAndSendFrame() {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
    // Skip the capture entirely when internal RAM is already tight.
    const size_t internal_free = heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
    if (internal_free < CONFIG_BACKGROUND_CAPTURE_MIN_FREE_INTERNAL_HEAP) {
        ESP_LOGW(TAG, "Skip background capture, low internal heap: free=%u threshold=%u",
                 static_cast<unsigned>(internal_free),
                 static_cast<unsigned>(CONFIG_BACKGROUND_CAPTURE_MIN_FREE_INTERNAL_HEAP));
        return false;
    }

    auto cam = Board::GetInstance().GetCamera();
    if (cam == nullptr) {
        ESP_LOGW(TAG, "No camera available for background capture");
        return false;
    }

    std::string frame_jpeg;
    const bool captured = cam->CaptureToJpeg(frame_jpeg, false);
    if (!captured) {
        ESP_LOGW(TAG, "Failed to capture background frame");
        return false;
    }

    // A "successful" capture with no bytes is still treated as a failure.
    if (frame_jpeg.empty()) {
        ESP_LOGW(TAG, "Captured empty background frame");
        return false;
    }

    return UploadJpegFrame(frame_jpeg);
#else
    return false;
#endif
}
/**
 * Computes the retry delay for the current number of consecutive failures.
 *
 * The delay doubles with each consecutive failure, starting at the configured
 * retry interval, for at most four doublings, and is capped at the configured
 * maximum back-off.
 *
 * @return Delay in milliseconds; 0 when CONFIG_BACKGROUND_CAPTURE_ENABLE is not set.
 */
uint32_t BackgroundCaptureService::GetFailureDelayMs() const {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
    const uint32_t base_delay_ms = CONFIG_BACKGROUND_CAPTURE_RETRY_INTERVAL_MS;
    const uint32_t max_delay_ms = CONFIG_BACKGROUND_CAPTURE_MAX_BACKOFF_MS;

    // Treat "no failures yet" like the first failure. Without this, the unsigned
    // subtraction below would wrap around and select the largest shift.
    const uint32_t failures = std::max<uint32_t>(consecutive_failures_, 1);
    const uint32_t shift = std::min<uint32_t>(failures - 1, 4);

    // Scale in 64 bits so a large base interval cannot overflow before the clamp.
    const uint64_t scaled_ms = static_cast<uint64_t>(base_delay_ms) << shift;
    return static_cast<uint32_t>(std::min<uint64_t>(scaled_ms, max_delay_ms));
#else
    return 0;
#endif
}
/**
 * Posts one JPEG frame to the configured upload URL as multipart/form-data.
 *
 * When the configured URL is empty, nothing is uploaded: the frame size is
 * logged and the call counts as a success.
 *
 * @param jpeg_data  Encoded JPEG bytes to send as the "file" form field.
 * @return true on an empty URL or a 2xx response; false when no network is
 *         available, the request cannot be opened, or the status is not 2xx.
 *         Always false when CONFIG_BACKGROUND_CAPTURE_ENABLE is not set.
 */
bool BackgroundCaptureService::UploadJpegFrame(const std::string& jpeg_data) {
#if CONFIG_BACKGROUND_CAPTURE_ENABLE
    const std::string upload_url = CONFIG_BACKGROUND_CAPTURE_UPLOAD_URL;
    if (upload_url.empty()) {
        ESP_LOGI(TAG, "Captured background frame: %u bytes", static_cast<unsigned>(jpeg_data.size()));
        return true;
    }

    auto net = Board::GetInstance().GetNetwork();
    if (net == nullptr) {
        ESP_LOGW(TAG, "No network available for background upload");
        return false;
    }

    const std::string boundary = "----XIAOZHI_BACKGROUND_CAPTURE_BOUNDARY";

    // NOTE(review): only Content-Type is set before Open(); no Content-Length or
    // Transfer-Encoding header is configured here — confirm the Http
    // implementation frames a streamed body correctly in that case.
    auto request = net->CreateHttp(3);
    request->SetHeader("Content-Type", "multipart/form-data; boundary=" + boundary);
    if (!request->Open("POST", upload_url)) {
        ESP_LOGW(TAG, "Failed to open background upload URL: %s", upload_url.c_str());
        return false;
    }

    // Part header introducing the single "file" field.
    std::string part_header;
    part_header += "--" + boundary + "\r\n";
    part_header += "Content-Disposition: form-data; name=\"file\"; filename=\"frame.jpg\"\r\n";
    part_header += "Content-Type: image/jpeg\r\n\r\n";
    request->Write(part_header.c_str(), part_header.size());

    // Raw JPEG payload.
    request->Write(jpeg_data.data(), jpeg_data.size());

    // Closing boundary of the multipart body.
    std::string closing;
    closing += "\r\n--" + boundary + "--\r\n";
    request->Write(closing.c_str(), closing.size());

    // Zero-length write; presumably signals end of body to the Http client.
    request->Write("", 0);

    const int status = request->GetStatusCode();
    request->Close();

    const bool is_success = status >= 200 && status < 300;
    if (!is_success) {
        ESP_LOGW(TAG, "Background upload failed, status=%d", status);
        return false;
    }

    ESP_LOGI(TAG, "Uploaded background frame: %u bytes", static_cast<unsigned>(jpeg_data.size()));
    return true;
#else
    (void)jpeg_data;
    return false;
#endif
}

View File

@ -0,0 +1,32 @@
#ifndef BACKGROUND_CAPTURE_SERVICE_H
#define BACKGROUND_CAPTURE_SERVICE_H
#include <freertos/FreeRTOS.h>
#include <freertos/task.h>
#include <atomic>
#include <cstdint>
#include <string>
/**
 * Periodically captures camera frames in a dedicated FreeRTOS task and uploads
 * them as JPEG. All work is compiled in only when
 * CONFIG_BACKGROUND_CAPTURE_ENABLE is set; otherwise Start()/Stop() do nothing.
 */
class BackgroundCaptureService {
public:
    BackgroundCaptureService();
    // Stops the capture task before the object goes away.
    ~BackgroundCaptureService();

    // Creates the capture task if it is not already running.
    void Start();
    // Requests the task to stop and waits until it has finished.
    void Stop();
    // True between a successful Start() and the next Stop().
    bool IsRunning() const { return running_.load(); }

private:
    // Handle of the capture task; reset to nullptr by the task when it exits.
    TaskHandle_t task_handle_{nullptr};
    // Run/stop flag shared between the controlling task and the capture task.
    std::atomic<bool> running_{false};
    // Failures since the last successful frame; drives the retry back-off.
    uint32_t consecutive_failures_{0};

    // Task entry point; `arg` is the service instance.
    static void TaskEntry(void* arg);
    // Capture loop executed inside the task.
    void Run();
    // Captures one frame and hands it to UploadJpegFrame().
    bool CaptureAndSendFrame();
    // Sends one JPEG frame to the configured upload URL.
    bool UploadJpegFrame(const std::string& jpeg_data);
    // Back-off delay for the current consecutive_failures_ value.
    uint32_t GetFailureDelayMs() const;
};
#endif // BACKGROUND_CAPTURE_SERVICE_H

View File

@ -214,6 +214,9 @@ public:
case kDeviceStateSpeaking: case kDeviceStateSpeaking:
ctrl_->SetStatusColor(64, 0, 0); // red ctrl_->SetStatusColor(64, 0, 0); // red
break; break;
case kDeviceStateThinking:
ctrl_->SetStatusColor(0, 0, 64); // blue
break;
default: default:
ctrl_->SetStatusColor(0, 0, 64); // blue ctrl_->SetStatusColor(0, 0, 64); // blue
break; break;

View File

@ -7,6 +7,8 @@ class Camera {
public: public:
virtual void SetExplainUrl(const std::string& url, const std::string& token) = 0; virtual void SetExplainUrl(const std::string& url, const std::string& token) = 0;
virtual bool Capture() = 0; virtual bool Capture() = 0;
virtual bool CaptureBackground() { return Capture(); }
virtual bool CaptureToJpeg(std::string& jpeg_data, bool show_preview = false) { return false; }
virtual bool SetHMirror(bool enabled) = 0; virtual bool SetHMirror(bool enabled) = 0;
virtual bool SetVFlip(bool enabled) = 0; virtual bool SetVFlip(bool enabled) = 0;
virtual bool SetSwapBytes(bool enabled) { return false; } // Optional, default no-op virtual bool SetSwapBytes(bool enabled) { return false; } // Optional, default no-op

View File

@ -24,6 +24,7 @@
#include "lvgl_display.h" #include "lvgl_display.h"
#include "mcp_server.h" #include "mcp_server.h"
#include "system_info.h" #include "system_info.h"
#include "esp_timer.h"
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE #ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
#undef LOG_LOCAL_LEVEL #undef LOG_LOCAL_LEVEL
@ -55,6 +56,7 @@
#define TAG "EspVideo" #define TAG "EspVideo"
#define FOREGROUND_CAPTURE_PROTECTION_US (10 * 1000 * 1000)
#if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP) #if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP)
#warning \ #warning \
@ -381,11 +383,47 @@ EspVideo::~EspVideo() {
} }
void EspVideo::SetExplainUrl(const std::string& url, const std::string& token) { void EspVideo::SetExplainUrl(const std::string& url, const std::string& token) {
std::lock_guard<std::mutex> lock(frame_mutex_);
explain_url_ = url; explain_url_ = url;
explain_token_ = token; explain_token_ = token;
} }
bool EspVideo::Capture() { bool EspVideo::Capture() {
return CaptureFrame(true);
}
// Captures a frame without updating the display preview.
bool EspVideo::CaptureBackground() {
    constexpr bool kShowPreview = false;
    return CaptureFrame(kShowPreview);
}
/**
 * Captures a frame and encodes it to JPEG (quality 60) into `jpeg_data`.
 *
 * @param jpeg_data     Output buffer; cleared first, then filled chunk by chunk.
 * @param show_preview  Forwarded to CaptureFrame().
 * @return false when the capture fails or no frame data is held; otherwise the
 *         result of image_to_jpeg_cb().
 *
 * NOTE(review): frame_mutex_ is released by CaptureFrame() and re-acquired here,
 * so another capture could replace frame_ in between — confirm that is acceptable.
 * NOTE(review): when the encoder reports failure, `jpeg_data` may keep the chunks
 * appended so far; callers should rely on the return value only.
 */
bool EspVideo::CaptureToJpeg(std::string& jpeg_data, bool show_preview) {
    jpeg_data.clear();

    const bool captured = CaptureFrame(show_preview);
    if (!captured) {
        return false;
    }

    std::lock_guard<std::mutex> guard(frame_mutex_);
    const bool has_frame = frame_.data != nullptr && frame_.len != 0;
    if (!has_frame) {
        return false;
    }

    // Fall back to 320x240 when the frame carries no dimensions.
    const uint16_t width = frame_.width ? frame_.width : 320;
    const uint16_t height = frame_.height ? frame_.height : 240;

    // Encoder output callback: `arg` is the destination std::string.
    auto append_chunk = [](void* arg, size_t index, const void* data, size_t len) -> size_t {
        auto* out = static_cast<std::string*>(arg);
        const bool has_payload = data != nullptr && len > 0;
        if (has_payload) {
            out->append(static_cast<const char*>(data), len);
        }
        return len;
    };

    return image_to_jpeg_cb(frame_.data, frame_.len, width, height, frame_.format, 60,
                            append_chunk, &jpeg_data);
}
bool EspVideo::CaptureFrame(bool show_preview) {
std::lock_guard<std::mutex> lock(frame_mutex_);
if (encoder_thread_.joinable()) { if (encoder_thread_.joinable()) {
encoder_thread_.join(); encoder_thread_.join();
} }
@ -394,6 +432,10 @@ bool EspVideo::Capture() {
return false; return false;
} }
if (!show_preview && esp_timer_get_time() < foreground_capture_protected_until_us_) {
return true;
}
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
struct v4l2_buffer buf = {}; struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
@ -729,9 +771,14 @@ bool EspVideo::Capture() {
} }
} }
// 显示预览图片 if (show_preview) {
auto display = dynamic_cast<LvglDisplay*>(Board::GetInstance().GetDisplay()); foreground_capture_protected_until_us_ = esp_timer_get_time() + FOREGROUND_CAPTURE_PROTECTION_US;
if (display != nullptr) { }
if (show_preview) {
// 显示预览图片
auto display = dynamic_cast<LvglDisplay*>(Board::GetInstance().GetDisplay());
if (display != nullptr) {
if (!frame_.data) { if (!frame_.data) {
ESP_LOGE(TAG, "frame.data is null"); ESP_LOGE(TAG, "frame.data is null");
return false; return false;
@ -836,6 +883,7 @@ bool EspVideo::Capture() {
auto image = std::make_unique<LvglAllocatedImage>(data, lvgl_image_size, w, h, stride, color_format); auto image = std::make_unique<LvglAllocatedImage>(data, lvgl_image_size, w, h, stride, color_format);
display->SetPreviewImage(std::move(image)); display->SetPreviewImage(std::move(image));
}
} }
return true; return true;
} }
@ -898,10 +946,16 @@ bool EspVideo::SetVFlip(bool enabled) {
* @warning 如果摄像头缓冲区为空或网络连接失败,将返回错误信息 * @warning 如果摄像头缓冲区为空或网络连接失败,将返回错误信息
*/ */
std::string EspVideo::Explain(const std::string& question) { std::string EspVideo::Explain(const std::string& question) {
std::lock_guard<std::mutex> lock(frame_mutex_);
if (explain_url_.empty()) { if (explain_url_.empty()) {
throw std::runtime_error("Image explain URL or token is not set"); throw std::runtime_error("Image explain URL or token is not set");
} }
if (frame_.data == nullptr || frame_.len == 0) {
throw std::runtime_error("No camera frame captured");
}
// 创建局部的 JPEG 队列, 40 entries is about to store 512 * 40 = 20480 bytes of JPEG data // 创建局部的 JPEG 队列, 40 entries is about to store 512 * 40 = 20480 bytes of JPEG data
QueueHandle_t jpeg_queue = xQueueCreate(40, sizeof(JpegChunk)); QueueHandle_t jpeg_queue = xQueueCreate(40, sizeof(JpegChunk));
if (jpeg_queue == nullptr) { if (jpeg_queue == nullptr) {

View File

@ -5,6 +5,8 @@
#include <thread> #include <thread>
#include <memory> #include <memory>
#include <vector> #include <vector>
#include <mutex>
#include <cstdint>
#include <freertos/FreeRTOS.h> #include <freertos/FreeRTOS.h>
#include <freertos/queue.h> #include <freertos/queue.h>
@ -39,6 +41,10 @@ private:
std::string explain_url_; std::string explain_url_;
std::string explain_token_; std::string explain_token_;
std::thread encoder_thread_; std::thread encoder_thread_;
std::mutex frame_mutex_;
int64_t foreground_capture_protected_until_us_ = 0;
bool CaptureFrame(bool show_preview);
public: public:
EspVideo(const esp_video_init_config_t& config); EspVideo(const esp_video_init_config_t& config);
@ -46,6 +52,8 @@ public:
virtual void SetExplainUrl(const std::string& url, const std::string& token); virtual void SetExplainUrl(const std::string& url, const std::string& token);
virtual bool Capture(); virtual bool Capture();
virtual bool CaptureBackground() override;
virtual bool CaptureToJpeg(std::string& jpeg_data, bool show_preview = false) override;
// 翻转控制函数 // 翻转控制函数
virtual bool SetHMirror(bool enabled) override; virtual bool SetHMirror(bool enabled) override;
virtual bool SetVFlip(bool enabled) override; virtual bool SetVFlip(bool enabled) override;

View File

@ -203,7 +203,10 @@ void WifiBoard::EnterWifiConfigMode() {
auto& app = Application::GetInstance(); auto& app = Application::GetInstance();
auto state = app.GetDeviceState(); auto state = app.GetDeviceState();
if (state == kDeviceStateSpeaking || state == kDeviceStateListening || state == kDeviceStateIdle) { if (state == kDeviceStateSpeaking ||
state == kDeviceStateThinking ||
state == kDeviceStateListening ||
state == kDeviceStateIdle) {
// Reset protocol (close audio channel, reset protocol) // Reset protocol (close audio channel, reset protocol)
Application::GetInstance().ResetProtocol(); Application::GetInstance().ResetProtocol();

View File

@ -85,6 +85,13 @@ void ElectronEmojiDisplay::SetStatus(const char* status) {
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN); lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN); lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
return; return;
} else if (strcmp(status, Lang::Strings::THINKING) == 0) {
lv_obj_set_style_text_font(status_label_, text_font, 0);
lv_label_set_text(status_label_, status);
lv_obj_clear_flag(status_label_, LV_OBJ_FLAG_HIDDEN);
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
return;
} else if (strcmp(status, Lang::Strings::CONNECTING) == 0) { } else if (strcmp(status, Lang::Strings::CONNECTING) == 0) {
lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0); lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0);
lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标 lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标

View File

@ -155,6 +155,8 @@ void EmojiWidget::SetStatus(const char* status)
if (player_) { if (player_) {
if (strcmp(status, Lang::Strings::LISTENING) == 0) { if (strcmp(status, Lang::Strings::LISTENING) == 0) {
player_->StartPlayer("asking", true, 15); player_->StartPlayer("asking", true, 15);
} else if (strcmp(status, Lang::Strings::THINKING) == 0) {
player_->StartPlayer("thinking", true, 15);
} else if (strcmp(status, Lang::Strings::STANDBY) == 0) { } else if (strcmp(status, Lang::Strings::STANDBY) == 0) {
player_->StartPlayer("wake", true, 15); player_->StartPlayer("wake", true, 15);
} }

View File

@ -231,9 +231,9 @@ private:
// 如果当前是聆听状态,切换到待命状态 // 如果当前是聆听状态,切换到待命状态
ESP_LOGI(TAG, "从聆听状态切换到待命状态"); ESP_LOGI(TAG, "从聆听状态切换到待命状态");
app.ToggleChatState(); // 切换到待命状态 app.ToggleChatState(); // 切换到待命状态
} else if (current_state == kDeviceStateSpeaking) { } else if (current_state == kDeviceStateSpeaking || current_state == kDeviceStateThinking) {
// 如果当前是说话状态,终止说话并切换到待命状态 // 如果当前是说话或思考状态,终止并切换到待命状态
ESP_LOGI(TAG, "从说话状态切换到待命状态"); ESP_LOGI(TAG, "从说话/思考状态切换到待命状态");
app.ToggleChatState(); // 终止说话 app.ToggleChatState(); // 终止说话
} else { } else {
// 其他状态下只唤醒设备 // 其他状态下只唤醒设备

View File

@ -1,21 +1,23 @@
#include "wifi_board.h" #include "application.h"
#include "axp2101.h"
#include "config.h"
#include "cores3_audio_codec.h" #include "cores3_audio_codec.h"
#include "display/lcd_display.h" #include "display/lcd_display.h"
#include "application.h"
#include "config.h"
#include "power_save_timer.h"
#include "i2c_device.h" #include "i2c_device.h"
#include "axp2101.h" #include "power_save_timer.h"
#include "wifi_board.h"
#include <esp_log.h>
#include <driver/i2c_master.h> #include <driver/i2c_master.h>
#include <esp_lcd_ili9341.h>
#include <esp_lcd_panel_io.h> #include <esp_lcd_panel_io.h>
#include <esp_lcd_panel_ops.h> #include <esp_lcd_panel_ops.h>
#include <esp_lcd_ili9341.h> #include <esp_log.h>
#include <esp_timer.h> #include <esp_timer.h>
#include "esp_video.h" #include "esp_video.h"
#define TAG "M5StackCoreS3Board" #define TAG "M5StackCoreS3Board"
#define BACKGROUND_VISION_INITIAL_DELAY_MS 8000
#define BACKGROUND_VISION_SAMPLE_INTERVAL_MS 100
class Pmic : public Axp2101 { class Pmic : public Axp2101 {
public: public:
@ -41,7 +43,7 @@ public:
class CustomBacklight : public Backlight { class CustomBacklight : public Backlight {
public: public:
CustomBacklight(Pmic *pmic) : pmic_(pmic) {} CustomBacklight(Pmic* pmic) : pmic_(pmic) {}
void SetBrightnessImpl(uint8_t brightness) override { void SetBrightnessImpl(uint8_t brightness) override {
pmic_->SetBrightness(target_brightness_); pmic_->SetBrightness(target_brightness_);
@ -49,7 +51,7 @@ public:
} }
private: private:
Pmic *pmic_; Pmic* pmic_;
}; };
class Aw9523 : public I2cDevice { class Aw9523 : public I2cDevice {
@ -96,9 +98,7 @@ public:
read_buffer_ = new uint8_t[6]; read_buffer_ = new uint8_t[6];
} }
~Ft6336() { ~Ft6336() { delete[] read_buffer_; }
delete[] read_buffer_;
}
void UpdateTouchPoint() { void UpdateTouchPoint() {
ReadRegs(0x02, read_buffer_, 6); ReadRegs(0x02, read_buffer_, 6);
@ -107,9 +107,7 @@ public:
tp_.y = ((read_buffer_[3] & 0x0F) << 8) | read_buffer_[4]; tp_.y = ((read_buffer_[3] & 0x0F) << 8) | read_buffer_[4];
} }
inline const TouchPoint_t& GetTouchPoint() { inline const TouchPoint_t& GetTouchPoint() { return tp_; }
return tp_;
}
private: private:
uint8_t* read_buffer_ = nullptr; uint8_t* read_buffer_ = nullptr;
@ -137,9 +135,7 @@ private:
GetDisplay()->SetPowerSaveMode(false); GetDisplay()->SetPowerSaveMode(false);
GetBacklight()->RestoreBrightness(); GetBacklight()->RestoreBrightness();
}); });
power_save_timer_->OnShutdownRequest([this]() { power_save_timer_->OnShutdownRequest([this]() { pmic_->PowerOff(); });
pmic_->PowerOff();
});
power_save_timer_->SetEnabled(true); power_save_timer_->SetEnabled(true);
} }
@ -153,9 +149,10 @@ private:
.glitch_ignore_cnt = 7, .glitch_ignore_cnt = 7,
.intr_priority = 0, .intr_priority = 0,
.trans_queue_depth = 0, .trans_queue_depth = 0,
.flags = { .flags =
.enable_internal_pullup = 1, {
}, .enable_internal_pullup = 1,
},
}; };
ESP_ERROR_CHECK(i2c_new_master_bus(&i2c_bus_cfg, &i2c_bus_)); ESP_ERROR_CHECK(i2c_new_master_bus(&i2c_bus_cfg, &i2c_bus_));
} }
@ -195,6 +192,7 @@ private:
void PollTouchpad() { void PollTouchpad() {
static bool was_touched = false; static bool was_touched = false;
static int64_t touch_start_time = 0; static int64_t touch_start_time = 0;
static int touch_start_x = -1;
const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值超过500ms视为长按 const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值超过500ms视为长按
ft6336_->UpdateTouchPoint(); ft6336_->UpdateTouchPoint();
@ -203,21 +201,28 @@ private:
// 检测触摸开始 // 检测触摸开始
if (touch_point.num > 0 && !was_touched) { if (touch_point.num > 0 && !was_touched) {
was_touched = true; was_touched = true;
touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒 touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒
touch_start_x = touch_point.x;
} }
// 检测触摸释放 // 检测触摸释放
else if (touch_point.num == 0 && was_touched) { else if (touch_point.num == 0 && was_touched) {
was_touched = false; was_touched = false;
int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time; int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time;
bool beaver_mode = touch_start_x >= DISPLAY_WIDTH / 2;
auto agent_mode = beaver_mode ? kChatAgentModeBeaver : kChatAgentModeNormal;
// 只有短触才触发
if (touch_duration < TOUCH_THRESHOLD_MS) { if (touch_duration < TOUCH_THRESHOLD_MS) {
auto& app = Application::GetInstance(); auto& app = Application::GetInstance();
if (app.GetDeviceState() == kDeviceStateStarting) { if (app.GetDeviceState() == kDeviceStateStarting) {
EnterWifiConfigMode(); EnterWifiConfigMode();
return; return;
} }
app.ToggleChatState(); ESP_LOGI(TAG, "Touch short: %s text-only mode", beaver_mode ? "beaver" : "normal");
app.ToggleChatStateForMode(agent_mode, false);
} else {
auto& app = Application::GetInstance();
ESP_LOGI(TAG, "Touch long: %s vision+text mode", beaver_mode ? "beaver" : "normal");
app.ToggleChatStateForMode(agent_mode, true);
} }
} }
} }
@ -228,10 +233,11 @@ private:
// 创建定时器20ms 间隔 // 创建定时器20ms 间隔
esp_timer_create_args_t timer_args = { esp_timer_create_args_t timer_args = {
.callback = [](void* arg) { .callback =
M5StackCoreS3Board* board = (M5StackCoreS3Board*)arg; [](void* arg) {
board->PollTouchpad(); M5StackCoreS3Board* board = (M5StackCoreS3Board*)arg;
}, board->PollTouchpad();
},
.arg = this, .arg = this,
.dispatch_method = ESP_TIMER_TASK, .dispatch_method = ESP_TIMER_TASK,
.name = "touchpad_timer", .name = "touchpad_timer",
@ -285,23 +291,25 @@ private:
esp_lcd_panel_swap_xy(panel, DISPLAY_SWAP_XY); esp_lcd_panel_swap_xy(panel, DISPLAY_SWAP_XY);
esp_lcd_panel_mirror(panel, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y); esp_lcd_panel_mirror(panel, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y);
display_ = new SpiLcdDisplay(panel_io, panel, display_ = new SpiLcdDisplay(panel_io, panel, DISPLAY_WIDTH, DISPLAY_HEIGHT,
DISPLAY_WIDTH, DISPLAY_HEIGHT, DISPLAY_OFFSET_X, DISPLAY_OFFSET_Y, DISPLAY_MIRROR_X, DISPLAY_MIRROR_Y, DISPLAY_SWAP_XY); DISPLAY_OFFSET_X, DISPLAY_OFFSET_Y, DISPLAY_MIRROR_X,
DISPLAY_MIRROR_Y, DISPLAY_SWAP_XY);
} }
void InitializeCamera() { void InitializeCamera() {
static esp_cam_ctlr_dvp_pin_config_t dvp_pin_config = { static esp_cam_ctlr_dvp_pin_config_t dvp_pin_config = {
.data_width = CAM_CTLR_DATA_WIDTH_8, .data_width = CAM_CTLR_DATA_WIDTH_8,
.data_io = { .data_io =
[0] = CAMERA_PIN_D0, {
[1] = CAMERA_PIN_D1, [0] = CAMERA_PIN_D0,
[2] = CAMERA_PIN_D2, [1] = CAMERA_PIN_D1,
[3] = CAMERA_PIN_D3, [2] = CAMERA_PIN_D2,
[4] = CAMERA_PIN_D4, [3] = CAMERA_PIN_D3,
[5] = CAMERA_PIN_D5, [4] = CAMERA_PIN_D4,
[6] = CAMERA_PIN_D6, [5] = CAMERA_PIN_D5,
[7] = CAMERA_PIN_D7, [6] = CAMERA_PIN_D6,
}, [7] = CAMERA_PIN_D7,
},
.vsync_io = CAMERA_PIN_VSYNC, .vsync_io = CAMERA_PIN_VSYNC,
.de_io = CAMERA_PIN_HREF, .de_io = CAMERA_PIN_HREF,
.pclk_io = CAMERA_PIN_PCLK, .pclk_io = CAMERA_PIN_PCLK,
@ -330,6 +338,42 @@ private:
camera_->SetHMirror(false); camera_->SetHMirror(false);
} }
void InitializeBackgroundVisionSampler() {
xTaskCreate(
[](void* arg) {
auto board = static_cast<M5StackCoreS3Board*>(arg);
bool has_logged_success = false;
bool has_logged_failure = false;
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_INITIAL_DELAY_MS));
while (true) {
if (!Application::GetInstance().IsVisionTextModeEnabled()) {
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
continue;
}
if (board->camera_ == nullptr) {
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
continue;
}
if (board->camera_->Capture()) {
if (!has_logged_success) {
ESP_LOGI(TAG, "Vision preview sampler started");
has_logged_success = true;
}
} else if (!has_logged_failure) {
ESP_LOGW(TAG, "Vision preview sampler is waiting for camera");
has_logged_failure = true;
}
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
}
},
"BgVisionSampler", 4096, this, 1, nullptr);
}
public: public:
M5StackCoreS3Board() { M5StackCoreS3Board() {
InitializePowerSaveTimer(); InitializePowerSaveTimer();
@ -340,34 +384,24 @@ public:
InitializeSpi(); InitializeSpi();
InitializeIli9342Display(); InitializeIli9342Display();
InitializeCamera(); InitializeCamera();
InitializeBackgroundVisionSampler();
InitializeFt6336TouchPad(); InitializeFt6336TouchPad();
GetBacklight()->RestoreBrightness(); GetBacklight()->RestoreBrightness();
} }
virtual AudioCodec* GetAudioCodec() override { virtual AudioCodec* GetAudioCodec() override {
static CoreS3AudioCodec audio_codec(i2c_bus_, static CoreS3AudioCodec audio_codec(
AUDIO_INPUT_SAMPLE_RATE, i2c_bus_, AUDIO_INPUT_SAMPLE_RATE, AUDIO_OUTPUT_SAMPLE_RATE, AUDIO_I2S_GPIO_MCLK,
AUDIO_OUTPUT_SAMPLE_RATE, AUDIO_I2S_GPIO_BCLK, AUDIO_I2S_GPIO_WS, AUDIO_I2S_GPIO_DOUT, AUDIO_I2S_GPIO_DIN,
AUDIO_I2S_GPIO_MCLK, AUDIO_CODEC_AW88298_ADDR, AUDIO_CODEC_ES7210_ADDR, AUDIO_INPUT_REFERENCE);
AUDIO_I2S_GPIO_BCLK,
AUDIO_I2S_GPIO_WS,
AUDIO_I2S_GPIO_DOUT,
AUDIO_I2S_GPIO_DIN,
AUDIO_CODEC_AW88298_ADDR,
AUDIO_CODEC_ES7210_ADDR,
AUDIO_INPUT_REFERENCE);
return &audio_codec; return &audio_codec;
} }
virtual Display* GetDisplay() override { virtual Display* GetDisplay() override { return display_; }
return display_;
}
virtual Camera* GetCamera() override { virtual Camera* GetCamera() override { return camera_; }
return camera_;
}
virtual bool GetBatteryLevel(int &level, bool& charging, bool& discharging) override { virtual bool GetBatteryLevel(int& level, bool& charging, bool& discharging) override {
static bool last_discharging = false; static bool last_discharging = false;
charging = pmic_->IsCharging(); charging = pmic_->IsCharging();
discharging = pmic_->IsDischarging(); discharging = pmic_->IsDischarging();
@ -387,7 +421,7 @@ public:
WifiBoard::SetPowerSaveLevel(level); WifiBoard::SetPowerSaveLevel(level);
} }
virtual Backlight *GetBacklight() override { virtual Backlight* GetBacklight() override {
static CustomBacklight backlight(pmic_); static CustomBacklight backlight(pmic_);
return &backlight; return &backlight;
} }

View File

@ -77,6 +77,13 @@ void OttoEmojiDisplay::SetStatus(const char* status) {
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN); lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN); lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
return; return;
} else if (strcmp(status, Lang::Strings::THINKING) == 0) {
lv_obj_set_style_text_font(status_label_, text_font, 0);
lv_label_set_text(status_label_, status);
lv_obj_clear_flag(status_label_, LV_OBJ_FLAG_HIDDEN);
lv_obj_add_flag(network_label_, LV_OBJ_FLAG_HIDDEN);
lv_obj_add_flag(battery_label_, LV_OBJ_FLAG_HIDDEN);
return;
} else if (strcmp(status, Lang::Strings::CONNECTING) == 0) { } else if (strcmp(status, Lang::Strings::CONNECTING) == 0) {
lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0); lv_obj_set_style_text_font(status_label_, &OTTO_ICON_FONT, 0);
lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标 lv_label_set_text(status_label_, "\xEF\x83\x81"); // U+F0c1 连接图标

View File

@ -599,7 +599,7 @@ CONFIG_PARTITION_TABLE_MD5=y
# #
CONFIG_OTA_URL="https://api.tenclass.net/xiaozhi/ota/" CONFIG_OTA_URL="https://api.tenclass.net/xiaozhi/ota/"
CONFIG_USE_DIRECT_WEBSOCKET=y CONFIG_USE_DIRECT_WEBSOCKET=y
CONFIG_WEBSOCKET_URL="ws://10.6.80.130:8080" CONFIG_WEBSOCKET_URL="ws://172.19.0.240:8080"
CONFIG_WEBSOCKET_TOKEN="" CONFIG_WEBSOCKET_TOKEN=""
CONFIG_WEBSOCKET_PROTOCOL_VERSION=1 CONFIG_WEBSOCKET_PROTOCOL_VERSION=1
# CONFIG_FLASH_NONE_ASSETS is not set # CONFIG_FLASH_NONE_ASSETS is not set

File diff suppressed because it is too large Load Diff

View File

@ -8,6 +8,7 @@ enum DeviceState {
kDeviceStateIdle, kDeviceStateIdle,
kDeviceStateConnecting, kDeviceStateConnecting,
kDeviceStateListening, kDeviceStateListening,
kDeviceStateThinking,
kDeviceStateSpeaking, kDeviceStateSpeaking,
kDeviceStateUpgrading, kDeviceStateUpgrading,
kDeviceStateActivating, kDeviceStateActivating,

View File

@ -13,6 +13,7 @@ static const char* const STATE_STRINGS[] = {
"idle", "idle",
"connecting", "connecting",
"listening", "listening",
"thinking",
"speaking", "speaking",
"upgrading", "upgrading",
"activating", "activating",
@ -69,9 +70,10 @@ bool DeviceStateMachine::IsValidTransition(DeviceState from, DeviceState to) con
to == kDeviceStateActivating; to == kDeviceStateActivating;
case kDeviceStateIdle: case kDeviceStateIdle:
// Can go to connecting, listening (manual mode), speaking, activating, upgrading, or wifi configuring // Can go to connecting, listening (manual mode), thinking, speaking, activating, upgrading, or wifi configuring
return to == kDeviceStateConnecting || return to == kDeviceStateConnecting ||
to == kDeviceStateListening || to == kDeviceStateListening ||
to == kDeviceStateThinking ||
to == kDeviceStateSpeaking || to == kDeviceStateSpeaking ||
to == kDeviceStateActivating || to == kDeviceStateActivating ||
to == kDeviceStateUpgrading || to == kDeviceStateUpgrading ||
@ -83,8 +85,15 @@ bool DeviceStateMachine::IsValidTransition(DeviceState from, DeviceState to) con
to == kDeviceStateListening; to == kDeviceStateListening;
case kDeviceStateListening: case kDeviceStateListening:
// Can go to speaking or idle // Can go to thinking, speaking, or idle
return to == kDeviceStateThinking ||
to == kDeviceStateSpeaking ||
to == kDeviceStateIdle;
case kDeviceStateThinking:
// Can go to speaking, listening, or idle
return to == kDeviceStateSpeaking || return to == kDeviceStateSpeaking ||
to == kDeviceStateListening ||
to == kDeviceStateIdle; to == kDeviceStateIdle;
case kDeviceStateSpeaking: case kDeviceStateSpeaking:

View File

@ -167,6 +167,8 @@ void EmoteDisplay::SetStatus(const char* const status)
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_LISTEN, NULL); emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_LISTEN, NULL);
} else if (std::strcmp(status, Lang::Strings::STANDBY) == 0) { } else if (std::strcmp(status, Lang::Strings::STANDBY) == 0) {
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_IDLE, NULL); emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_IDLE, NULL);
} else if (std::strcmp(status, Lang::Strings::THINKING) == 0) {
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_LISTEN, NULL);
} else if (std::strcmp(status, Lang::Strings::SPEAKING) == 0) { } else if (std::strcmp(status, Lang::Strings::SPEAKING) == 0) {
emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_SPEAK, NULL); emote_set_event_msg(emote_handle_, EMOTE_MGR_EVT_SPEAK, NULL);
} else if (std::strcmp(status, Lang::Strings::ERROR) == 0) { } else if (std::strcmp(status, Lang::Strings::ERROR) == 0) {

View File

@ -203,6 +203,7 @@ void LvglDisplay::UpdateStatusBar(bool update_all) {
kDeviceStateStarting, kDeviceStateStarting,
kDeviceStateWifiConfiguring, kDeviceStateWifiConfiguring,
kDeviceStateListening, kDeviceStateListening,
kDeviceStateThinking,
kDeviceStateActivating, kDeviceStateActivating,
}; };
if (std::find(allowed_states.begin(), allowed_states.end(), device_state) != allowed_states.end()) { if (std::find(allowed_states.begin(), allowed_states.end(), device_state) != allowed_states.end()) {

View File

@ -228,6 +228,11 @@ void CircularStrip::OnStateChanged() {
SetAllColor(color); SetAllColor(color);
break; break;
} }
case kDeviceStateThinking: {
StripColor color = { low_brightness_, low_brightness_, default_brightness_ };
Blink(color, 300);
break;
}
case kDeviceStateUpgrading: { case kDeviceStateUpgrading: {
StripColor color = { low_brightness_, default_brightness_, low_brightness_ }; StripColor color = { low_brightness_, default_brightness_, low_brightness_ };
Blink(color, 100); Blink(color, 100);

View File

@ -235,6 +235,10 @@ void GpioLed::OnStateChanged() {
// TurnOn(); // TurnOn();
StartFadeTask(); StartFadeTask();
break; break;
case kDeviceStateThinking:
SetBrightness(DEFAULT_BRIGHTNESS);
StartContinuousBlink(300);
break;
case kDeviceStateSpeaking: case kDeviceStateSpeaking:
SetBrightness(SPEAKING_BRIGHTNESS); SetBrightness(SPEAKING_BRIGHTNESS);
TurnOn(); TurnOn();

View File

@ -152,6 +152,10 @@ void SingleLed::OnStateChanged() {
SetColor(0, DEFAULT_BRIGHTNESS, 0); SetColor(0, DEFAULT_BRIGHTNESS, 0);
TurnOn(); TurnOn();
break; break;
case kDeviceStateThinking:
SetColor(0, 0, DEFAULT_BRIGHTNESS);
StartContinuousBlink(300);
break;
case kDeviceStateUpgrading: case kDeviceStateUpgrading:
SetColor(0, DEFAULT_BRIGHTNESS, 0); SetColor(0, DEFAULT_BRIGHTNESS, 0);
StartContinuousBlink(100); StartContinuousBlink(100);

View File

@ -1,9 +1,22 @@
#include "protocol.h" #include "protocol.h"
#include <esp_log.h> #include <esp_log.h>
#include <mbedtls/base64.h>
#define TAG "Protocol" #define TAG "Protocol"
// Returns the Base64 encoding of `data`.
// Returns an empty string when `data` is empty or when mbedtls reports an error.
static std::string Base64Encode(const std::string& data) {
    const auto* input = reinterpret_cast<const unsigned char*>(data.data());

    // Size probe: called with no destination buffer so that mbedtls stores the
    // required buffer size in encoded_length. The return value of this call is
    // intentionally ignored (per the mbedtls docs it reports "buffer too small").
    size_t encoded_length = 0;
    mbedtls_base64_encode(nullptr, 0, &encoded_length, input, data.size());

    std::string result(encoded_length, '\0');
    size_t output_length = 0;
    const int rc = mbedtls_base64_encode(reinterpret_cast<unsigned char*>(result.data()),
                                         result.size(), &output_length, input, data.size());
    if (rc != 0) {
        // On failure output_length does not describe valid output, so do not
        // hand back a buffer sized from it.
        return std::string();
    }

    // Trim the buffer down to the number of characters actually written.
    result.resize(output_length);
    return result;
}
void Protocol::OnIncomingJson(std::function<void(const cJSON* root)> callback) { void Protocol::OnIncomingJson(std::function<void(const cJSON* root)> callback) {
on_incoming_json_ = callback; on_incoming_json_ = callback;
} }
@ -78,6 +91,27 @@ void Protocol::SendMcpMessage(const std::string& payload) {
SendText(message); SendText(message);
} }
void Protocol::SendVisionFrame(const std::string& jpeg_data) {
if (jpeg_data.empty()) {
return;
}
cJSON* root = cJSON_CreateObject();
cJSON_AddStringToObject(root, "session_id", session_id_.c_str());
cJSON_AddStringToObject(root, "type", "vision");
cJSON_AddStringToObject(root, "state", "frame");
cJSON_AddStringToObject(root, "mime_type", "image/jpeg");
auto encoded = Base64Encode(jpeg_data);
cJSON_AddStringToObject(root, "image", encoded.c_str());
char* json_str = cJSON_PrintUnformatted(root);
if (json_str != nullptr) {
SendText(json_str);
cJSON_free(json_str);
}
cJSON_Delete(root);
}
bool Protocol::IsTimeout() const { bool Protocol::IsTimeout() const {
const int kTimeoutSeconds = 120; const int kTimeoutSeconds = 120;
auto now = std::chrono::steady_clock::now(); auto now = std::chrono::steady_clock::now();

View File

@ -73,6 +73,7 @@ public:
virtual void SendStopListening(); virtual void SendStopListening();
virtual void SendAbortSpeaking(AbortReason reason); virtual void SendAbortSpeaking(AbortReason reason);
virtual void SendMcpMessage(const std::string& message); virtual void SendMcpMessage(const std::string& message);
virtual void SendVisionFrame(const std::string& jpeg_data);
protected: protected:
std::function<void(const cJSON* root)> on_incoming_json_; std::function<void(const cJSON* root)> on_incoming_json_;
@ -95,4 +96,3 @@ protected:
}; };
#endif // PROTOCOL_H #endif // PROTOCOL_H

View File

@ -119,6 +119,8 @@ bool WebsocketProtocol::OpenAudioChannel() {
websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str()); websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str());
websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str()); websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str()); websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
websocket_->SetHeader("Agent-Mode", Application::GetInstance().GetChatAgentModeName());
websocket_->SetHeader("Chat-Mode", Application::GetInstance().GetChatModeName());
websocket_->OnData([this](const char* data, size_t len, bool binary) { websocket_->OnData([this](const char* data, size_t len, bool binary) {
if (binary) { if (binary) {