Compare commits

...

4 Commits

Author SHA1 Message Date
9637e09aef feat: beaver 2026-06-04 15:48:10 +08:00
b92e6e1b07 feat: remove background cam every time 2026-05-29 14:53:58 +08:00
33ee598c21 feat: add icon beaver 2026-05-29 11:22:31 +08:00
37343ac0fe feat: icon first commit 2026-05-27 17:16:11 +08:00
5 changed files with 264 additions and 41 deletions

View File

@ -81,6 +81,7 @@ void Application::Initialize() {
xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED); xEventGroupSetBits(event_group_, MAIN_EVENT_WAKE_WORD_DETECTED);
}; };
callbacks.on_vad_change = [this](bool speaking) { callbacks.on_vad_change = [this](bool speaking) {
vad_speaking_.store(speaking);
xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE); xEventGroupSetBits(event_group_, MAIN_EVENT_VAD_CHANGE);
}; };
audio_service_.SetCallbacks(callbacks); audio_service_.SetCallbacks(callbacks);
@ -233,6 +234,13 @@ void Application::Run() {
if (GetDeviceState() == kDeviceStateListening) { if (GetDeviceState() == kDeviceStateListening) {
auto led = Board::GetInstance().GetLed(); auto led = Board::GetInstance().GetLed();
led->OnStateChanged(); led->OnStateChanged();
if (vad_speaking_.load() && vision_text_mode_enabled_.load() &&
!vision_frame_sent_for_current_listen_.exchange(true)) {
if (!SendCurrentVisionFrame()) {
vision_frame_sent_for_current_listen_.store(false);
}
}
} }
} }
@ -525,6 +533,9 @@ void Application::InitializeProtocol() {
protocol_->OnAudioChannelClosed([this, &board]() { protocol_->OnAudioChannelClosed([this, &board]() {
board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER); board.SetPowerSaveLevel(PowerSaveLevel::LOW_POWER);
Schedule([this]() { Schedule([this]() {
if (GetDeviceState() == kDeviceStateConnecting) {
return;
}
auto display = Board::GetInstance().GetDisplay(); auto display = Board::GetInstance().GetDisplay();
display->SetChatMessage("system", ""); display->SetChatMessage("system", "");
SetDeviceState(kDeviceStateIdle); SetDeviceState(kDeviceStateIdle);
@ -673,17 +684,39 @@ void Application::DismissAlert() {
} }
void Application::ToggleChatState() { void Application::ToggleChatState() {
vision_text_mode_enabled_.store(false); ToggleChatStateForMode(kChatAgentModeNormal, false);
xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
} }
void Application::ToggleChatStateWithVision() { void Application::ToggleChatStateWithVision() {
vision_text_mode_enabled_.store(true); ToggleChatStateForMode(kChatAgentModeNormal, true);
}
void Application::ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled) {
chat_agent_mode_.store(agent_mode);
vision_text_mode_enabled_.store(vision_enabled);
vision_frame_sent_for_current_listen_.store(false);
xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT); xEventGroupSetBits(event_group_, MAIN_EVENT_TOGGLE_CHAT);
} }
bool Application::IsVisionTextModeEnabled() const {
return vision_text_mode_enabled_.load();
}
const char* Application::GetChatAgentModeName() const {
return chat_agent_mode_.load() == kChatAgentModeBeaver ? "beaver" : "normal";
}
const char* Application::GetChatModeName() const {
bool vision_enabled = vision_text_mode_enabled_.load();
if (chat_agent_mode_.load() == kChatAgentModeBeaver) {
return vision_enabled ? "vision-beaver" : "beaver";
}
return vision_enabled ? "vision-normal" : "normal";
}
void Application::StartListening() { void Application::StartListening() {
vision_text_mode_enabled_.store(false); vision_text_mode_enabled_.store(false);
vision_frame_sent_for_current_listen_.store(false);
xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING); xEventGroupSetBits(event_group_, MAIN_EVENT_START_LISTENING);
} }
@ -693,9 +726,6 @@ void Application::StopListening() {
void Application::HandleToggleChatEvent() { void Application::HandleToggleChatEvent() {
auto state = GetDeviceState(); auto state = GetDeviceState();
if (state != kDeviceStateIdle) {
vision_text_mode_enabled_.store(false);
}
if (state == kDeviceStateActivating) { if (state == kDeviceStateActivating) {
SetDeviceState(kDeviceStateIdle); SetDeviceState(kDeviceStateIdle);
@ -717,7 +747,12 @@ void Application::HandleToggleChatEvent() {
if (state == kDeviceStateIdle) { if (state == kDeviceStateIdle) {
ListeningMode mode = GetDefaultListeningMode(); ListeningMode mode = GetDefaultListeningMode();
if (!protocol_->IsAudioChannelOpened()) { bool agent_mode_changed = chat_agent_mode_.load() != active_chat_agent_mode_.load();
bool vision_mode_changed = vision_text_mode_enabled_.load() != active_vision_text_mode_enabled_.load();
if (!protocol_->IsAudioChannelOpened() || agent_mode_changed || vision_mode_changed) {
if (protocol_->IsAudioChannelOpened()) {
protocol_->CloseAudioChannel();
}
SetDeviceState(kDeviceStateConnecting); SetDeviceState(kDeviceStateConnecting);
// Schedule to let the state change be processed first (UI update) // Schedule to let the state change be processed first (UI update)
Schedule([this, mode]() { Schedule([this, mode]() {
@ -749,6 +784,8 @@ void Application::ContinueOpenAudioChannel(ListeningMode mode) {
} }
} }
active_chat_agent_mode_.store(chat_agent_mode_.load());
active_vision_text_mode_enabled_.store(vision_text_mode_enabled_.load());
SetListeningMode(mode); SetListeningMode(mode);
} }
@ -892,6 +929,7 @@ void Application::HandleStateChangedEvent() {
switch (new_state) { switch (new_state) {
case kDeviceStateUnknown: case kDeviceStateUnknown:
case kDeviceStateIdle: case kDeviceStateIdle:
vision_frame_sent_for_current_listen_.store(false);
display->SetStatus(Lang::Strings::STANDBY); display->SetStatus(Lang::Strings::STANDBY);
display->ClearChatMessages(); // Clear messages first display->ClearChatMessages(); // Clear messages first
display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count) display->SetEmotion("neutral"); // Then set emotion (wechat mode checks child count)
@ -904,6 +942,8 @@ void Application::HandleStateChangedEvent() {
display->SetChatMessage("system", ""); display->SetChatMessage("system", "");
break; break;
case kDeviceStateListening: case kDeviceStateListening:
vad_speaking_.store(false);
vision_frame_sent_for_current_listen_.store(false);
display->SetStatus(Lang::Strings::LISTENING); display->SetStatus(Lang::Strings::LISTENING);
display->SetEmotion("neutral"); display->SetEmotion("neutral");
@ -915,9 +955,6 @@ void Application::HandleStateChangedEvent() {
audio_service_.WaitForPlaybackQueueEmpty(); audio_service_.WaitForPlaybackQueueEmpty();
} }
if (vision_text_mode_enabled_.load()) {
SendCurrentVisionFrame();
}
// Send the start listening command // Send the start listening command
protocol_->SendStartListening(listening_mode_); protocol_->SendStartListening(listening_mode_);
audio_service_.EnableVoiceProcessing(true); audio_service_.EnableVoiceProcessing(true);
@ -957,24 +994,25 @@ void Application::HandleStateChangedEvent() {
} }
} }
void Application::SendCurrentVisionFrame() { bool Application::SendCurrentVisionFrame() {
if (!protocol_ || !protocol_->IsAudioChannelOpened()) { if (!protocol_ || !protocol_->IsAudioChannelOpened()) {
return; return false;
} }
auto camera = Board::GetInstance().GetCamera(); auto camera = Board::GetInstance().GetCamera();
if (camera == nullptr) { if (camera == nullptr) {
return; return false;
} }
std::string jpeg_data; std::string jpeg_data;
if (!camera->CaptureToJpeg(jpeg_data, false)) { if (!camera->CaptureToJpeg(jpeg_data, true)) {
ESP_LOGW(TAG, "Failed to capture vision frame"); ESP_LOGW(TAG, "Failed to capture vision frame");
return; return false;
} }
protocol_->SendVisionFrame(jpeg_data); protocol_->SendVisionFrame(jpeg_data);
ESP_LOGI(TAG, "Sent vision frame, size=%u bytes", static_cast<unsigned>(jpeg_data.size())); ESP_LOGI(TAG, "Sent vision frame, size=%u bytes", static_cast<unsigned>(jpeg_data.size()));
return true;
} }
void Application::Schedule(std::function<void()>&& callback) { void Application::Schedule(std::function<void()>&& callback) {
@ -995,6 +1033,8 @@ void Application::AbortSpeaking(AbortReason reason) {
void Application::SetListeningMode(ListeningMode mode) { void Application::SetListeningMode(ListeningMode mode) {
listening_mode_ = mode; listening_mode_ = mode;
vad_speaking_.store(false);
vision_frame_sent_for_current_listen_.store(false);
SetDeviceState(kDeviceStateListening); SetDeviceState(kDeviceStateListening);
} }

View File

@ -41,6 +41,11 @@ enum AecMode {
kAecOnServerSide, kAecOnServerSide,
}; };
enum ChatAgentMode {
kChatAgentModeNormal,
kChatAgentModeBeaver,
};
class Application { class Application {
public: public:
static Application& GetInstance() { static Application& GetInstance() {
@ -93,6 +98,11 @@ public:
*/ */
void ToggleChatState(); void ToggleChatState();
void ToggleChatStateWithVision(); void ToggleChatStateWithVision();
void ToggleChatStateForMode(ChatAgentMode agent_mode, bool vision_enabled);
bool IsVisionTextModeEnabled() const;
ChatAgentMode GetChatAgentMode() const { return chat_agent_mode_.load(); }
const char* GetChatAgentModeName() const;
const char* GetChatModeName() const;
/** /**
* Start listening (event-based, thread-safe) * Start listening (event-based, thread-safe)
@ -146,7 +156,12 @@ private:
bool aborted_ = false; bool aborted_ = false;
bool assets_version_checked_ = false; bool assets_version_checked_ = false;
bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening bool play_popup_on_listening_ = false; // Flag to play popup sound after state changes to listening
std::atomic<ChatAgentMode> chat_agent_mode_ = kChatAgentModeNormal;
std::atomic<ChatAgentMode> active_chat_agent_mode_ = kChatAgentModeNormal;
std::atomic<bool> vision_text_mode_enabled_ = false; std::atomic<bool> vision_text_mode_enabled_ = false;
std::atomic<bool> active_vision_text_mode_enabled_ = false;
std::atomic<bool> vad_speaking_ = false;
std::atomic<bool> vision_frame_sent_for_current_listen_ = false;
int clock_ticks_ = 0; int clock_ticks_ = 0;
TaskHandle_t activation_task_handle_ = nullptr; TaskHandle_t activation_task_handle_ = nullptr;
@ -162,7 +177,7 @@ private:
void HandleWakeWordDetectedEvent(); void HandleWakeWordDetectedEvent();
void ContinueOpenAudioChannel(ListeningMode mode); void ContinueOpenAudioChannel(ListeningMode mode);
void ContinueWakeWordInvoke(const std::string& wake_word); void ContinueWakeWordInvoke(const std::string& wake_word);
void SendCurrentVisionFrame(); bool SendCurrentVisionFrame();
// Activation task (runs in background) // Activation task (runs in background)
void ActivationTask(); void ActivationTask();

View File

@ -192,6 +192,7 @@ private:
void PollTouchpad() { void PollTouchpad() {
static bool was_touched = false; static bool was_touched = false;
static int64_t touch_start_time = 0; static int64_t touch_start_time = 0;
static int touch_start_x = -1;
const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值超过500ms视为长按 const int64_t TOUCH_THRESHOLD_MS = 500; // 触摸时长阈值超过500ms视为长按
ft6336_->UpdateTouchPoint(); ft6336_->UpdateTouchPoint();
@ -201,11 +202,14 @@ private:
if (touch_point.num > 0 && !was_touched) { if (touch_point.num > 0 && !was_touched) {
was_touched = true; was_touched = true;
touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒 touch_start_time = esp_timer_get_time() / 1000; // 转换为毫秒
touch_start_x = touch_point.x;
} }
// 检测触摸释放 // 检测触摸释放
else if (touch_point.num == 0 && was_touched) { else if (touch_point.num == 0 && was_touched) {
was_touched = false; was_touched = false;
int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time; int64_t touch_duration = (esp_timer_get_time() / 1000) - touch_start_time;
bool beaver_mode = touch_start_x >= DISPLAY_WIDTH / 2;
auto agent_mode = beaver_mode ? kChatAgentModeBeaver : kChatAgentModeNormal;
if (touch_duration < TOUCH_THRESHOLD_MS) { if (touch_duration < TOUCH_THRESHOLD_MS) {
auto& app = Application::GetInstance(); auto& app = Application::GetInstance();
@ -213,12 +217,12 @@ private:
EnterWifiConfigMode(); EnterWifiConfigMode();
return; return;
} }
ESP_LOGI(TAG, "Touch short: text-only mode"); ESP_LOGI(TAG, "Touch short: %s text-only mode", beaver_mode ? "beaver" : "normal");
app.ToggleChatState(); app.ToggleChatStateForMode(agent_mode, false);
} else { } else {
auto& app = Application::GetInstance(); auto& app = Application::GetInstance();
ESP_LOGI(TAG, "Touch long: vision+text mode"); ESP_LOGI(TAG, "Touch long: %s vision+text mode", beaver_mode ? "beaver" : "normal");
app.ToggleChatStateWithVision(); app.ToggleChatStateForMode(agent_mode, true);
} }
} }
} }
@ -344,18 +348,23 @@ private:
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_INITIAL_DELAY_MS)); vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_INITIAL_DELAY_MS));
while (true) { while (true) {
if (!Application::GetInstance().IsVisionTextModeEnabled()) {
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
continue;
}
if (board->camera_ == nullptr) { if (board->camera_ == nullptr) {
vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS)); vTaskDelay(pdMS_TO_TICKS(BACKGROUND_VISION_SAMPLE_INTERVAL_MS));
continue; continue;
} }
if (board->camera_->CaptureBackground()) { if (board->camera_->Capture()) {
if (!has_logged_success) { if (!has_logged_success) {
ESP_LOGI(TAG, "Background vision sampler started"); ESP_LOGI(TAG, "Vision preview sampler started");
has_logged_success = true; has_logged_success = true;
} }
} else if (!has_logged_failure) { } else if (!has_logged_failure) {
ESP_LOGW(TAG, "Background vision sampler is waiting for camera"); ESP_LOGW(TAG, "Vision preview sampler is waiting for camera");
has_logged_failure = true; has_logged_failure = true;
} }

View File

@ -2,6 +2,7 @@ import asyncio
import base64 import base64
import json import json
import os import os
import re
import shutil import shutil
import struct import struct
import sys import sys
@ -28,7 +29,18 @@ TOKEN_URL = "http://172.19.0.240:8000/getToken"
LIVEKIT_WS_URL = "ws://172.19.0.240:7880" LIVEKIT_WS_URL = "ws://172.19.0.240:7880"
ROOM_PREFIX = "test-livekit" ROOM_PREFIX = "test-livekit"
IDENTITY_PREFIX = "uv-livekit" IDENTITY_PREFIX = "uv-livekit"
AGENT_NAME = "my-agent" LEGACY_AGENT_NAME = os.getenv("LIVEKIT_AGENT_NAME", "normal-agent")
DEFAULT_AGENT_MODE = os.getenv("LIVEKIT_DEFAULT_AGENT_MODE", "normal").strip().lower()
AGENT_NAMES = {
"normal": os.getenv("LIVEKIT_NORMAL_AGENT_NAME", LEGACY_AGENT_NAME),
"beaver": os.getenv("LIVEKIT_BEAVER_AGENT_NAME", "beaver-agent"),
}
CHAT_MODE_AGENT_NAMES = {
"normal": AGENT_NAMES["normal"],
"beaver": AGENT_NAMES["beaver"],
"vision-normal": os.getenv("LIVEKIT_VISION_NORMAL_AGENT_NAME", "vision-normal-agent"),
"vision-beaver": os.getenv("LIVEKIT_VISION_BEAVER_AGENT_NAME", "vision-beaver-agent"),
}
CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0")) CONNECT_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_CONNECT_TIMEOUT_SECONDS", "20.0"))
AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0")) AGENT_READY_TIMEOUT_SECONDS = float(os.getenv("LIVEKIT_AGENT_READY_TIMEOUT_SECONDS", "10.0"))
WS_PORT = 8080 WS_PORT = 8080
@ -55,6 +67,16 @@ TTS_DISPLAY_SCROLL_WIDTH = int(os.getenv("TTS_DISPLAY_SCROLL_WIDTH", "18"))
TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18")) TTS_DISPLAY_SCROLL_INTERVAL_SECONDS = float(os.getenv("TTS_DISPLAY_SCROLL_INTERVAL_SECONDS", "0.18"))
TTS_DISPLAY_SCROLL_GAP = " " TTS_DISPLAY_SCROLL_GAP = " "
TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8")) TTS_INTERRUPT_SUPPRESS_SECONDS = float(os.getenv("TTS_INTERRUPT_SUPPRESS_SECONDS", "0.8"))
EMOTION_TEXT_PATTERN = re.compile(
r"^\s*<?\s*emotion\s*=\s*([^\s>,;]+)\s*>?[\s,;]*(.*)$",
re.DOTALL,
)
EMOTION_TEST_SEQUENCE = [
emotion.strip()
for emotion in os.getenv("BRIDGE_EMOTION_TEST_SEQUENCE", "").split(",")
if emotion.strip()
]
EMOTION_TEST_INTERVAL_SECONDS = float(os.getenv("BRIDGE_EMOTION_TEST_INTERVAL_SECONDS", "2.0"))
@dataclass @dataclass
@ -64,6 +86,10 @@ class DeviceSession:
protocol_version: int protocol_version: int
room_name: str room_name: str
identity: str identity: str
chat_mode: str
agent_mode: str
agent_name: str
vision_enabled: bool
room: rtc.Room room: rtc.Room
mic_source: AudioSource mic_source: AudioSource
agent_ready: asyncio.Event agent_ready: asyncio.Event
@ -75,6 +101,7 @@ class DeviceSession:
tts_transcript_text: str = "" tts_transcript_text: str = ""
tts_display_text: str = "" tts_display_text: str = ""
tts_display_final: bool = False tts_display_final: bool = False
tts_emotion: str = ""
tts_suppressed_until: float = 0.0 tts_suppressed_until: float = 0.0
agent_dispatch_task: Optional[asyncio.Task] = None agent_dispatch_task: Optional[asyncio.Task] = None
closed: bool = False closed: bool = False
@ -82,10 +109,10 @@ class DeviceSession:
first_capture_log_time: float = 0.0 first_capture_log_time: float = 0.0
async def fetch_token(room_name: str, identity: str) -> str: async def fetch_token(room_name: str, identity: str, agent_name: str) -> str:
params = {"room": room_name, "identity": identity} params = {"room": room_name, "identity": identity}
if AGENT_DISPATCH_MODE == "token": if AGENT_DISPATCH_MODE == "token":
params["agent_name"] = AGENT_NAME params["agent_name"] = agent_name
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client: async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
response = await client.get(TOKEN_URL, params=params) response = await client.get(TOKEN_URL, params=params)
@ -125,15 +152,61 @@ class ESP32LiveKitBridge:
print(f"[session] device={device_id} room={room_name} identity={identity}") print(f"[session] device={device_id} room={room_name} identity={identity}")
return room_name, identity return room_name, identity
def _is_agent_participant(self, participant: rtc.RemoteParticipant) -> bool: def _resolve_agent_selection(self, headers: Any) -> tuple[str, str, str, bool]:
requested_chat_mode = (
headers.get("Chat-Mode")
or headers.get("X-Chat-Mode")
or ""
).strip().lower()
chat_mode_to_agent = {
"normal": ("normal", False),
"beaver": ("beaver", False),
"vision-normal": ("normal", True),
"vision-beaver": ("beaver", True),
}
if requested_chat_mode in chat_mode_to_agent:
requested_mode, vision_enabled = chat_mode_to_agent[requested_chat_mode]
else:
if requested_chat_mode:
print(f"未知 Chat-Mode={requested_chat_mode!r},回退到 Agent-Mode")
requested_mode = (
headers.get("Agent-Mode")
or headers.get("X-Agent-Mode")
or DEFAULT_AGENT_MODE
or "normal"
).strip().lower()
vision_enabled = False
requested_chat_mode = requested_mode
requested_name = headers.get("Agent-Name") or headers.get("X-Agent-Name")
if requested_name:
return requested_chat_mode, "custom", requested_name, vision_enabled
if requested_mode not in AGENT_NAMES:
print(f"未知 Agent-Mode={requested_mode!r},回退到 normal")
requested_mode = "normal"
requested_chat_mode = "vision-normal" if vision_enabled else "normal"
if requested_chat_mode in CHAT_MODE_AGENT_NAMES:
return (
requested_chat_mode,
requested_mode,
CHAT_MODE_AGENT_NAMES[requested_chat_mode],
vision_enabled,
)
return requested_chat_mode, requested_mode, AGENT_NAMES[requested_mode], vision_enabled
def _is_agent_participant(self, participant: rtc.RemoteParticipant, agent_name: str) -> bool:
identity = getattr(participant, "identity", "") or "" identity = getattr(participant, "identity", "") or ""
return identity.startswith("agent-") or AGENT_NAME in identity return identity.startswith("agent-") or agent_name in identity
def _get_agent_identities(self, session: DeviceSession) -> list[str]: def _get_agent_identities(self, session: DeviceSession) -> list[str]:
return [ return [
participant.identity participant.identity
for participant in session.room.remote_participants.values() for participant in session.room.remote_participants.values()
if self._is_agent_participant(participant) if self._is_agent_participant(participant, session.agent_name)
] ]
def _log_agent_participants(self, session: DeviceSession, source: str) -> None: def _log_agent_participants(self, session: DeviceSession, source: str) -> None:
@ -175,7 +248,7 @@ class ESP32LiveKitBridge:
dispatch_agent_name = getattr(dispatch, "agent_name", None) dispatch_agent_name = getattr(dispatch, "agent_name", None)
dispatch_room = getattr(dispatch, "room", None) dispatch_room = getattr(dispatch, "room", None)
if dispatch_room == session.room_name and ( if dispatch_room == session.room_name and (
dispatch_agent_name == AGENT_NAME or dispatch_agent_name is None dispatch_agent_name == session.agent_name or dispatch_agent_name is None
): ):
print( print(
f"检测到已有 dispatch: room={session.room_name} " f"检测到已有 dispatch: room={session.room_name} "
@ -198,7 +271,7 @@ class ESP32LiveKitBridge:
return return
for participant in session.room.remote_participants.values(): for participant in session.room.remote_participants.values():
if self._is_agent_participant(participant): if self._is_agent_participant(participant, session.agent_name):
# print(f"Agent 已在房间中,跳过 dispatch: {participant.identity}") # print(f"Agent 已在房间中,跳过 dispatch: {participant.identity}")
return return
@ -210,7 +283,7 @@ class ESP32LiveKitBridge:
await session.agent_dispatch_task await session.agent_dispatch_task
async def _dispatch_agent(self, session: DeviceSession) -> None: async def _dispatch_agent(self, session: DeviceSession) -> None:
print(f"准备 dispatch agent: room={session.room_name}, agent={AGENT_NAME}") print(f"准备 dispatch agent: room={session.room_name}, agent={session.agent_name}")
try: try:
if await self._dispatch_agent_with_sdk(session): if await self._dispatch_agent_with_sdk(session):
@ -240,13 +313,17 @@ class ESP32LiveKitBridge:
try: try:
dispatch = await lkapi.agent_dispatch.create_dispatch( dispatch = await lkapi.agent_dispatch.create_dispatch(
livekit_api.CreateAgentDispatchRequest( livekit_api.CreateAgentDispatchRequest(
agent_name=AGENT_NAME, agent_name=session.agent_name,
room=session.room_name, room=session.room_name,
metadata=json.dumps( metadata=json.dumps(
{ {
"source": "bridge_server", "source": "bridge_server",
"identity": session.identity, "identity": session.identity,
"device_id": session.device_id, "device_id": session.device_id,
"chat_mode": session.chat_mode,
"agent_mode": session.agent_mode,
"agent_name": session.agent_name,
"vision_enabled": session.vision_enabled,
} }
), ),
) )
@ -269,13 +346,17 @@ class ESP32LiveKitBridge:
"--room", "--room",
session.room_name, session.room_name,
"--agent-name", "--agent-name",
AGENT_NAME, session.agent_name,
"--metadata", "--metadata",
json.dumps( json.dumps(
{ {
"source": "bridge_server", "source": "bridge_server",
"identity": session.identity, "identity": session.identity,
"device_id": session.device_id, "device_id": session.device_id,
"chat_mode": session.chat_mode,
"agent_mode": session.agent_mode,
"agent_name": session.agent_name,
"vision_enabled": session.vision_enabled,
} }
), ),
stdout=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE,
@ -294,7 +375,7 @@ class ESP32LiveKitBridge:
print(f"lk dispatch create 失败,退出码: {process.returncode}") print(f"lk dispatch create 失败,退出码: {process.returncode}")
return False return False
print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={AGENT_NAME}") print(f"Agent dispatch 已通过 lk CLI 创建: room={session.room_name}, agent={session.agent_name}")
return True return True
async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool: async def _publish_agent_event(self, session: DeviceSession, payload: dict[str, Any]) -> bool:
@ -415,9 +496,40 @@ class ESP32LiveKitBridge:
await session.websocket.send(json.dumps({"type": "tts", "state": state})) await session.websocket.send(json.dumps({"type": "tts", "state": state}))
print(f"已发送 tts {state}: device={session.device_id}") print(f"已发送 tts {state}: device={session.device_id}")
async def _send_emotion(self, session: DeviceSession, emotion: str) -> None:
if session.websocket is None:
print(f"跳过 emotion {emotion}ESP32 尚未连接")
return
await session.websocket.send(json.dumps({"type": "llm", "emotion": emotion}))
print(f"已发送 emotion: device={session.device_id} emotion={emotion}")
def _parse_emotion_text(self, text: str) -> tuple[Optional[str], str]:
match = EMOTION_TEXT_PATTERN.match(text)
if match is None:
return None, text.strip()
emotion, tts_text = match.groups()
return emotion.strip(), tts_text.strip()
async def _run_emotion_test_sequence(self, session: DeviceSession) -> None:
if not EMOTION_TEST_SEQUENCE:
return
for index, emotion in enumerate(EMOTION_TEST_SEQUENCE):
if session.websocket is None or session.closed:
return
if index > 0:
await asyncio.sleep(EMOTION_TEST_INTERVAL_SECONDS)
await self._send_emotion(session, emotion)
async def _send_tts_text(self, session: DeviceSession, text: str, final: bool) -> None: async def _send_tts_text(self, session: DeviceSession, text: str, final: bool) -> None:
if session.websocket is None: if session.websocket is None:
return return
raw_text = text
_emotion, text = self._parse_emotion_text(text)
if not text:
print(f"[tts->esp32] skip empty text: raw={raw_text!r} final={final}")
return
print(f"[tts->esp32] text={text!r} final={final}")
await session.websocket.send( await session.websocket.send(
json.dumps( json.dumps(
{ {
@ -493,6 +605,7 @@ class ESP32LiveKitBridge:
if not session.tts_display_text: if not session.tts_display_text:
session.tts_transcript_text = "" session.tts_transcript_text = ""
session.tts_display_final = False session.tts_display_final = False
session.tts_emotion = ""
self._cancel_tts_display_task(session) self._cancel_tts_display_task(session)
await self._send_tts_state(session, "start") await self._send_tts_state(session, "start")
session.tts_active = True session.tts_active = True
@ -507,6 +620,7 @@ class ESP32LiveKitBridge:
session.tts_transcript_text = "" session.tts_transcript_text = ""
session.tts_display_text = "" session.tts_display_text = ""
session.tts_display_final = False session.tts_display_final = False
session.tts_emotion = ""
async def _force_stop_tts(self, session: DeviceSession, reason: str) -> None: async def _force_stop_tts(self, session: DeviceSession, reason: str) -> None:
self._cancel_tts_display_task(session) self._cancel_tts_display_task(session)
@ -517,6 +631,7 @@ class ESP32LiveKitBridge:
session.tts_transcript_text = "" session.tts_transcript_text = ""
session.tts_display_text = "" session.tts_display_text = ""
session.tts_display_final = False session.tts_display_final = False
session.tts_emotion = ""
await self._send_tts_state(session, "stop") await self._send_tts_state(session, "stop")
print(f"已强制停止本地 TTS: device={session.device_id} reason={reason}") print(f"已强制停止本地 TTS: device={session.device_id} reason={reason}")
@ -681,16 +796,16 @@ class ESP32LiveKitBridge:
print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}") print(f"✅ 成功连接到 LiveKit 房间: room={session.room_name}")
self._log_agent_participants(session, "connected") self._log_agent_participants(session, "connected")
for participant in session.room.remote_participants.values(): for participant in session.room.remote_participants.values():
if self._is_agent_participant(participant): if self._is_agent_participant(participant, session.agent_name):
session.agent_ready.set() session.agent_ready.set()
self._scan_participant_audio_tracks(session, participant, "connected_scan") self._scan_participant_audio_tracks(session, participant, "connected_scan")
@session.room.on("participant_connected") @session.room.on("participant_connected")
def on_participant_connected(participant: rtc.RemoteParticipant) -> None: def on_participant_connected(participant: rtc.RemoteParticipant) -> None:
role = "Agent" if self._is_agent_participant(participant) else "Remote participant" role = "Agent" if self._is_agent_participant(participant, session.agent_name) else "Remote participant"
print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}") print(f"👋 {role} ({participant.identity}) 已加入房间: room={session.room_name}")
self._log_agent_participants(session, "participant_connected") self._log_agent_participants(session, "participant_connected")
if self._is_agent_participant(participant): if self._is_agent_participant(participant, session.agent_name):
session.agent_ready.set() session.agent_ready.set()
self._scan_participant_audio_tracks( self._scan_participant_audio_tracks(
session, participant, "participant_connected_scan" session, participant, "participant_connected_scan"
@ -723,14 +838,26 @@ class ESP32LiveKitBridge:
track_pub: rtc.TrackPublication, track_pub: rtc.TrackPublication,
) -> None: ) -> None:
identity = participant.identity if participant else "未知" identity = participant.identity if participant else "未知"
is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(participant) is_agent = isinstance(participant, rtc.RemoteParticipant) and self._is_agent_participant(
participant, session.agent_name
)
for segment in segments: for segment in segments:
status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果" status = "✅ 最终结果" if segment.final else "⏳ 正在思考/中间结果"
print(f"🗣️ [{status} | room={session.room_name} | {identity}]: {segment.text}") print(f"🗣️ [{status} | room={session.room_name} | {identity}]: {segment.text}")
if is_agent: if is_agent:
if time.monotonic() < session.tts_suppressed_until: if time.monotonic() < session.tts_suppressed_until:
continue continue
display_text = self._current_tts_display_text(segment.text) print(f"[livekit-llm] raw={segment.text!r} final={segment.final}")
emotion, tts_text = self._parse_emotion_text(segment.text)
print(
f"[livekit-llm] parsed emotion={emotion!r} "
f"tts_text={tts_text!r} final={segment.final}"
)
if emotion and emotion != session.tts_emotion:
session.tts_emotion = emotion
asyncio.create_task(self._send_emotion(session, emotion))
display_text = self._current_tts_display_text(tts_text)
print(f"[livekit-llm] display_text={display_text!r} final={segment.final}")
if not display_text or display_text == session.tts_transcript_text: if not display_text or display_text == session.tts_transcript_text:
continue continue
session.tts_transcript_text = display_text session.tts_transcript_text = display_text
@ -785,7 +912,7 @@ class ESP32LiveKitBridge:
# print(f"[config] token_url={TOKEN_URL}") # print(f"[config] token_url={TOKEN_URL}")
# print(f"[config] room={session.room_name} identity={session.identity}") # print(f"[config] room={session.room_name} identity={session.identity}")
# print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}") # print(f"[config] livekit_connect_timeout={CONNECT_TIMEOUT_SECONDS}")
token = await fetch_token(session.room_name, session.identity) token = await fetch_token(session.room_name, session.identity, session.agent_name)
try: try:
await session.room.connect( await session.room.connect(
@ -830,8 +957,12 @@ class ESP32LiveKitBridge:
# print(f"等待 agent 加入: room={session.room_name}") # print(f"等待 agent 加入: room={session.room_name}")
try: try:
await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS) await asyncio.wait_for(session.agent_ready.wait(), timeout=AGENT_READY_TIMEOUT_SECONDS)
if session.closed:
return
# print(f"✅ agent 已就绪: room={session.room_name}") # print(f"✅ agent 已就绪: room={session.room_name}")
except asyncio.TimeoutError: except asyncio.TimeoutError:
if session.closed:
return
print(f"⚠️ agent 等待超时: room={session.room_name}") print(f"⚠️ agent 等待超时: room={session.room_name}")
async def start(self) -> None: async def start(self) -> None:
@ -839,6 +970,20 @@ class ESP32LiveKitBridge:
print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}") print(f"[config] livekit_ws_url={LIVEKIT_WS_URL}")
print(f"[config] token_url={TOKEN_URL}") print(f"[config] token_url={TOKEN_URL}")
print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}") print(f"[config] agent_dispatch_mode={AGENT_DISPATCH_MODE}")
print(
"[config] agents="
f"normal:{CHAT_MODE_AGENT_NAMES['normal']} "
f"beaver:{CHAT_MODE_AGENT_NAMES['beaver']} "
f"vision-normal:{CHAT_MODE_AGENT_NAMES['vision-normal']} "
f"vision-beaver:{CHAT_MODE_AGENT_NAMES['vision-beaver']} "
f"default_mode:{DEFAULT_AGENT_MODE}"
)
if EMOTION_TEST_SEQUENCE:
print(
"[config] emotion_test_sequence="
f"{','.join(EMOTION_TEST_SEQUENCE)} "
f"interval={EMOTION_TEST_INTERVAL_SECONDS}s"
)
async def close(self) -> None: async def close(self) -> None:
for session in list(self.device_sessions.values()): for session in list(self.device_sessions.values()):
@ -849,6 +994,7 @@ class ESP32LiveKitBridge:
return return
session.closed = True session.closed = True
session.websocket = None session.websocket = None
session.agent_ready.set()
session.tts_active = False session.tts_active = False
session.tts_stream_id += 1 session.tts_stream_id += 1
if session.tts_idle_task is not None: if session.tts_idle_task is not None:
@ -998,6 +1144,7 @@ class ESP32LiveKitBridge:
await self._close_session(existing_session) await self._close_session(existing_session)
self.device_sessions.pop(device_id, None) self.device_sessions.pop(device_id, None)
chat_mode, agent_mode, agent_name, vision_enabled = self._resolve_agent_selection(websocket.request.headers)
room_name, identity = self._build_session_names(device_id) room_name, identity = self._build_session_names(device_id)
session = DeviceSession( session = DeviceSession(
device_id=device_id, device_id=device_id,
@ -1005,6 +1152,10 @@ class ESP32LiveKitBridge:
protocol_version=protocol_version, protocol_version=protocol_version,
room_name=room_name, room_name=room_name,
identity=identity, identity=identity,
chat_mode=chat_mode,
agent_mode=agent_mode,
agent_name=agent_name,
vision_enabled=vision_enabled,
room=rtc.Room(), room=rtc.Room(),
mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1), mic_source=AudioSource(sample_rate=INPUT_SAMPLE_RATE, num_channels=1),
agent_ready=asyncio.Event(), agent_ready=asyncio.Event(),
@ -1013,6 +1164,11 @@ class ESP32LiveKitBridge:
print(f"ESP32 已连接: device={device_id}") print(f"ESP32 已连接: device={device_id}")
print(f"ESP32 协议版本: {session.protocol_version}") print(f"ESP32 协议版本: {session.protocol_version}")
print(
f"ESP32 mode: chat={session.chat_mode} "
f"agent={session.agent_mode}/{session.agent_name} "
f"vision={session.vision_enabled}"
)
session.tts_stream_id += 1 session.tts_stream_id += 1
opus_decoder = None opus_decoder = None
@ -1033,6 +1189,7 @@ class ESP32LiveKitBridge:
} }
await websocket.send(json.dumps(hello_msg)) await websocket.send(json.dumps(hello_msg))
print(f"已发送 server hello: device={device_id} room={session.room_name}") print(f"已发送 server hello: device={device_id} room={session.room_name}")
asyncio.create_task(self._run_emotion_test_sequence(session))
await self._connect_session_room(session) await self._connect_session_room(session)

View File

@ -119,6 +119,8 @@ bool WebsocketProtocol::OpenAudioChannel() {
websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str()); websocket_->SetHeader("Protocol-Version", std::to_string(version_).c_str());
websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str()); websocket_->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str()); websocket_->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
websocket_->SetHeader("Agent-Mode", Application::GetInstance().GetChatAgentModeName());
websocket_->SetHeader("Chat-Mode", Application::GetInstance().GetChatModeName());
websocket_->OnData([this](const char* data, size_t len, bool binary) { websocket_->OnData([this](const char* data, size_t len, bool binary) {
if (binary) { if (binary) {