initial commit

This commit is contained in:
0Xiao0
2026-05-07 15:13:15 +08:00
commit ac81d4a9eb
7 changed files with 781 additions and 0 deletions

171
custom_agent.py Normal file
View File

@ -0,0 +1,171 @@
import logging
import os
import aiohttp
from dotenv import load_dotenv
from livekit import rtc
from livekit.agents import (
Agent,
AgentServer,
AgentSession,
APIConnectOptions,
JobContext,
JobProcess,
LanguageCode,
MetricsCollectedEvent,
NOT_GIVEN,
NotGivenOr,
TurnHandlingOptions,
cli,
metrics,
room_io,
stt,
text_transforms,
utils,
)
from livekit.plugins import silero, openai
from livekit.plugins.turn_detector.multilingual import MultilingualModel
logger = logging.getLogger("custom-agent")
load_dotenv()
class SenseVoiceSTT(stt.STT):
    """Non-streaming STT adapter that posts buffered audio to a local
    SenseVoice ASR HTTP endpoint and returns a single final transcript.
    """

    def __init__(self, url: str):
        """
        Args:
            url: Full URL of the SenseVoice ASR endpoint (e.g. ``.../asr-blackbox``).
        """
        super().__init__(
            capabilities=stt.STTCapabilities(
                streaming=False, interim_results=False, diarization=False
            )
        )
        self._url = url

    @property
    def model(self) -> str:
        return "sensevoice"

    async def _recognize_impl(
        self,
        buffer: utils.AudioBuffer,
        *,
        language: NotGivenOr[str] = NOT_GIVEN,
        conn_options: APIConnectOptions,
    ) -> stt.SpeechEvent:
        """POST the combined audio buffer (as WAV) to the ASR server.

        Returns:
            A FINAL_TRANSCRIPT speech event; it carries no alternatives when
            the server produced an empty result.

        Raises:
            Exception: if the server responds with a non-200 status, or the
                request itself fails (logged, then re-raised).
        """
        audio_data = rtc.combine_audio_frames(buffer).to_wav_bytes()
        async with aiohttp.ClientSession() as session:
            data = aiohttp.FormData()
            data.add_field('audio', audio_data, filename='audio.wav', content_type='audio/wav')
            data.add_field('model_name', 'sensevoice')
            # Fall back to server-side auto-detection when the caller did not
            # specify a language.
            lang = language if language is not NOT_GIVEN else 'auto'
            data.add_field('language', lang)
            try:
                # Fix: pass an explicit ClientTimeout (a bare int here is a
                # deprecated aiohttp form); matches the style used in
                # tts_voxcpm.py.
                async with session.post(
                    self._url, data=data, timeout=aiohttp.ClientTimeout(total=30)
                ) as resp:
                    if resp.status != 200:
                        raise Exception(f"ASR server returned status {resp.status}")
                    result = await resp.json()
                    if not result.get("result"):
                        # Empty result: emit a final event with no alternatives.
                        return stt.SpeechEvent(type=stt.SpeechEventType.FINAL_TRANSCRIPT)
                    text = result["result"][0].get("clean_text", "")
                    logger.info(f"SenseVoice ASR Result: {text}")
                    # Language is reported as "zh" unconditionally; the server
                    # result does not include a detected-language field here.
                    return stt.SpeechEvent(
                        type=stt.SpeechEventType.FINAL_TRANSCRIPT,
                        alternatives=[stt.SpeechData(text=text, language=LanguageCode("zh"))],
                    )
            except Exception as e:
                logger.error(f"SenseVoice ASR error: {e}")
                raise
class CustomAgent(Agent):
    """Voice agent persona ("Kelly") wired to the local ASR/LLM/TTS pipeline."""

    # NOTE: the fragments are joined with no separator on purpose — this
    # reproduces the original implicit string concatenation byte-for-byte.
    _INSTRUCTIONS = (
        "Your name is Kelly, built by LiveKit. You are a helpful assistant."
        "Keep your responses concise and friendly."
        "You are interacting with the user via a local ASR and LLM pipeline."
    )

    def __init__(self) -> None:
        super().__init__(instructions=CustomAgent._INSTRUCTIONS)

    async def on_enter(self) -> None:
        # Kick off an initial greeting as soon as the agent joins the session.
        self.session.generate_reply(instructions="greet the user and introduce yourself")
# Module-level agent server; the RTC session entrypoint is registered on it below.
server = AgentServer()


def prewarm(proc: JobProcess) -> None:
    """Per-process warmup: load the Silero VAD model once so every job in
    this worker process can reuse it from ``proc.userdata["vad"]``."""
    # Load Silero VAD as requested
    proc.userdata["vad"] = silero.VAD.load()


server.setup_fnc = prewarm
@server.rtc_session(agent_name="my-agent")
async def entrypoint(ctx: JobContext) -> None:
    """Per-room job entrypoint.

    Builds an AgentSession wired to the local SenseVoice ASR, an
    OpenAI-compatible LLM endpoint and the VoxCPM TTS, then starts the
    agent in the connected room.
    """
    ctx.log_context_fields = {
        "room": ctx.room.name,
    }

    # Configuration for custom local endpoints.
    # These can be set in your .env file.
    ASR_URL = os.getenv("CUSTOM_ASR_URL", "http://10.6.80.21:5003/asr-blackbox")
    MINIMAX_BASE_URL = os.getenv("MINIMAX_LLM_BASE_URL", "https://oai.bwgdi.com/v1")
    MINIMAX_MODEL = os.getenv("MINIMAX_LLM_MODEL", "qwen-max")
    VOXCPM_URL = os.getenv("VOXCPM_TTS_URL", "http://localhost:5050/tts-blackbox")
    PROMPT_WAV = os.getenv("VOXCPM_PROMPT_WAV", "/assets/2food16k_2.wav")
    # Fix: read the LLM key from the environment (same variable test_minimax.py
    # uses) instead of hard-coding the credential inline; the old literal is
    # kept only as a fallback for backward compatibility.
    MINIMAX_API_KEY = os.getenv("MINIMAX_API_KEY", "sk-orez64WkG1NkfksB5j_hGA")

    # SenseVoice is non-streaming; StreamAdapter + VAD turn it into a
    # streaming-capable STT for the session pipeline.
    sensevoice_stt = SenseVoiceSTT(url=ASR_URL)
    stt_stream = stt.StreamAdapter(stt=sensevoice_stt, vad=ctx.proc.userdata["vad"])

    import httpx
    from openai import AsyncClient as OpenAIAsyncClient

    # SECURITY NOTE: SSL verification is disabled for the self-hosted gateway;
    # do not reuse this client against untrusted hosts.
    http_client = httpx.AsyncClient(verify=False)
    # Create the OpenAI AsyncClient with the custom HTTP client.
    openai_client = OpenAIAsyncClient(
        api_key=MINIMAX_API_KEY,
        base_url=MINIMAX_BASE_URL,
        http_client=http_client,
    )

    from tts_voxcpm import VoxCPMTTS

    session: AgentSession = AgentSession(
        # 1. Custom SenseVoice ASR (STT) with StreamAdapter
        stt=stt_stream,
        # 2. Minimax LLM - Using OpenAI plugin with local base_url
        llm=openai.LLM(
            model=MINIMAX_MODEL,
            client=openai_client,
        ),
        # 3. VoxCPM TTS - Custom implementation for blackbox API
        tts=VoxCPMTTS(
            url=VOXCPM_URL,
            prompt_wav_path=PROMPT_WAV,
        ),
        # 4. Silero VAD (loaded once per process in prewarm)
        vad=ctx.proc.userdata["vad"],
        turn_handling=TurnHandlingOptions(
            turn_detection=MultilingualModel(),
            interruption={
                "resume_false_interruption": True,
                "false_interruption_timeout": 1.0,
            },
        ),
        preemptive_generation=True,
        aec_warmup_duration=3.0,
        tts_text_transforms=[
            "filter_emoji",
            "filter_markdown",
        ],
    )

    @session.on("metrics_collected")
    def _on_metrics_collected(ev: MetricsCollectedEvent) -> None:
        metrics.log_metrics(ev.metrics)

    await session.start(
        agent=CustomAgent(),
        room=ctx.room,
    )


if __name__ == "__main__":
    cli.run_app(server)

188
test_agent.py Normal file
View File

@ -0,0 +1,188 @@
import asyncio
import requests
import logging
from pathlib import Path
import uuid
import wave
import numpy as np
from datetime import datetime
from livekit import rtc
from livekit.rtc import AudioSource, AudioFrame, LocalAudioTrack
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("test-agent")
TOKEN_URL = "http://localhost:8000/getToken"
WS_URL = "wss://esp32-vt80c4y6.livekit.cloud"
ROOM_NAME = "test-room20"
WAV_FILE = "2food.wav"
TEST_TIMEOUT = 30
class TestState:
    """Mutable flags shared between the room event callbacks and the test driver."""

    def __init__(self):
        # Both flags start False: flipped by the participant/track callbacks.
        self.agent_connected = self.tts_received = False
        # Number of audio tracks subscribed so far.
        self.tts_count = 0


# Single shared instance mutated by the room callbacks below.
test_state = TestState()
def get_token(agent_name="my-agent"):
    """Fetch a LiveKit access token from the local token server.

    Passing ``agent_name`` asks the server to dispatch that agent into the
    room alongside this participant.
    """
    query = {
        "room": ROOM_NAME,
        "identity": f"test-{uuid.uuid4().hex[:6]}",
        "agent_name": agent_name,
    }
    try:
        resp = requests.get(TOKEN_URL, params=query, timeout=5)
        resp.raise_for_status()
        return resp.json()["token"]
    except Exception as e:
        logger.error(f"❌ 获取token失败: {e}")
        raise
async def publish_wav(room, wav_path):
    """Publish a local WAV file into the room as a live audio track,
    pacing frames at 20 ms intervals to emulate a microphone."""
    wav_path = Path(wav_path)
    if not wav_path.exists():
        logger.error(f"❌ WAV文件不存在: {wav_path}")
        raise FileNotFoundError(f"文件不存在: {wav_path}")

    logger.info(f"📂 开始上传: {wav_path}")
    with wave.open(str(wav_path), "rb") as wf:
        sample_rate = wf.getframerate()
        num_channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        logger.info(f"📊 WAV信息: {sample_rate}Hz, {num_channels}ch, {sample_width*8}bit")

        # Create the source, wrap it in a local track and publish it.
        source = AudioSource(sample_rate, num_channels)
        track = LocalAudioTrack.create_audio_track("mic", source)
        await room.local_participant.publish_track(track)
        logger.info("📡 已发布音轨")

        frame_duration = 0.02  # 20 ms per frame
        samples_per_frame = int(sample_rate * frame_duration)
        while chunk := wf.readframes(samples_per_frame):
            # Only used to derive the sample count; the raw bytes are sent.
            pcm = np.frombuffer(chunk, dtype=np.int16)
            if len(pcm) == 0:
                continue
            frame = AudioFrame(
                data=chunk,
                sample_rate=sample_rate,
                num_channels=num_channels,
                samples_per_channel=len(pcm) // num_channels,
            )
            await source.capture_frame(frame)
            await asyncio.sleep(frame_duration)
    logger.info("✅ WAV推流完成")
async def test_agent():
    """End-to-end smoke test: join the room, wait for the agent, stream a WAV
    file as microphone input and check that a TTS audio track comes back.

    Returns:
        bool: True when the agent connected AND at least one TTS track was
        received; False on timeout or any error.
    """
    try:
        logger.info("🔑 正在获取token...")
        token = get_token()
        logger.info("✅ Token获取成功")
        room = rtc.Room()

        # Handlers update the shared test_state; they are registered before
        # room.connect() so no early events are missed.
        @room.on("participant_connected")
        def on_participant_connected(participant):
            logger.info(f"✅ 参与者加入: {participant.identity}")
            # Heuristic: the dispatched agent's identity contains "agent".
            if "agent" in participant.identity.lower():
                test_state.agent_connected = True
                logger.info("🎉 Agent已连接")

        @room.on("participant_disconnected")
        def on_participant_disconnected(participant):
            logger.info(f"❌ 参与者离开: {participant.identity}")

        @room.on("track_subscribed")
        def on_track_subscribed(track, publication, participant):
            # Any subscribed audio track is counted as agent TTS output.
            if track.kind == rtc.TrackKind.KIND_AUDIO:
                test_state.tts_count += 1
                logger.info(f"🎵 收到TTS音频! (第 {test_state.tts_count} 次)")
                test_state.tts_received = True

        logger.info(f"🔌 正在连接房间 {ROOM_NAME}...")
        await room.connect(WS_URL, token)
        logger.info("✅ 已连接到房间")
        logger.info(f"🆔 本地参与者ID: {room.local_participant.identity}")

        # Poll up to 10 seconds for the agent participant to join.
        logger.info("⏳ 等待Agent连接...")
        for i in range(10):
            if test_state.agent_connected:
                break
            await asyncio.sleep(1)
        if not test_state.agent_connected:
            logger.warning("⚠️ Agent未连接")
            return False

        logger.info("🎙️ 正在上传测试音频...")
        await publish_wav(room, WAV_FILE)

        # Poll up to TEST_TIMEOUT seconds for a TTS response track.
        logger.info("⏳ 等待Agent响应...")
        for i in range(TEST_TIMEOUT):
            if test_state.tts_received:
                logger.info("✅ 收到Agent TTS响应!")
                break
            if i % 5 == 0:
                logger.info(f" 等待中... ({i+1}/{TEST_TIMEOUT}秒)")
            await asyncio.sleep(1)
        # Grace period so trailing audio events are counted before disconnect.
        await asyncio.sleep(2)

        # Summary banner of the collected state.
        logger.info("\n" + "="*60)
        logger.info("✅ 测试结果")
        logger.info("="*60)
        logger.info(f"Agent连接: {'' if test_state.agent_connected else ''}")
        logger.info(f"收到TTS响应: {'' if test_state.tts_received else ''}")
        logger.info(f"TTS音频次数: {test_state.tts_count}")
        logger.info("="*60)
        await room.disconnect()
        logger.info("✅ 已断开连接\n")
        return test_state.agent_connected and test_state.tts_received
    except Exception as e:
        # Any failure (token, connect, publish) fails the whole test.
        logger.error(f"❌ 测试失败: {e}", exc_info=True)
        return False
async def main():
    """Run the smoke test and translate the outcome into an exit code
    (0 = success, 1 = failure)."""
    logger.info("🚀 开始测试custom_agent...\n")
    if not await test_agent():
        logger.error("❌ 测试失败")
        return 1
    logger.info("✅ 测试成功custom_agent 正常工作")
    logger.info("💡 提示: Agent内部的转录和响应日志只能在Agent自身看到")
    logger.info(" 或通过 agent-starter-react 这样的客户端交互查看")
    return 0


if __name__ == "__main__":
    exit_code = asyncio.run(main())
    exit(exit_code)

53
test_asr.py Normal file
View File

@ -0,0 +1,53 @@
import asyncio
import logging
import wave
from custom_agent import SenseVoiceSTT
from livekit import rtc
from livekit.agents import utils
# 设置日志级别以查看输出
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("test-asr")
async def test():
    """Connectivity check: run one local WAV file through SenseVoiceSTT."""
    # Path to a local audio file — replace with your own.
    audio_path = "/home/verachen/Music/voice/2food.wav"

    # Initialize the ASR client against the local SenseVoice endpoint.
    stt = SenseVoiceSTT(url="http://10.6.80.21:5003/asr-blackbox")
    print(f"Testing ASR connectivity with file: {audio_path}")
    try:
        # SenseVoiceSTT._recognize_impl calls
        # rtc.combine_audio_frames(buffer).to_wav_bytes(), so wrapping the
        # whole file in a single AudioFrame inside a list is sufficient.
        with wave.open(audio_path, 'rb') as wf:
            frame = rtc.AudioFrame(
                data=wf.readframes(wf.getnframes()),
                sample_rate=wf.getframerate(),
                num_channels=wf.getnchannels(),
                samples_per_channel=wf.getnframes()
            )
        result = await stt.recognize(buffer=[frame])
        if result.alternatives:
            print(f"\n--- ASR Result ---")
            print(f"Text: {result.alternatives[0].text}")
            print(f"------------------\n")
        else:
            print("ASR returned no text.")
    except FileNotFoundError:
        print(f"Error: Audio file not found at {audio_path}")
    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    asyncio.run(test())

130
test_livekit.py Normal file
View File

@ -0,0 +1,130 @@
import asyncio
import requests
from livekit import rtc
import wave
import numpy as np
from livekit.rtc import AudioSource, AudioFrame, LocalAudioTrack
TOKEN_URL = "http://localhost:8000/getToken"
WS_URL = "wss://esp32-vt80c4y6.livekit.cloud" # 你的 LiveKit Server 地址
ROOM_NAME = "test-room20"
import uuid
IDENTITY = f"uv-{uuid.uuid4().hex[:6]}"
# IDENTITY = "test-user0"
def get_token():
    """Request a room token from the local token server.

    Supplying ``agent_name`` is what triggers the server to dispatch the
    "my-agent" worker into the room.
    """
    query = {
        "room": ROOM_NAME,
        "identity": IDENTITY,
        "agent_name": "my-agent",  # key parameter: triggers agent dispatch
    }
    payload = requests.get(TOKEN_URL, params=query).json()
    return payload["token"]
async def main():
    """Connect to the room, send a greeting data packet, stream a local WAV
    file to the agent, then disconnect."""
    token = get_token()
    room = rtc.Room()

    # Register event handlers before connecting so no events are missed.
    @room.on("participant_connected")
    def on_participant_connected(participant):
        print(f"✅ 有人加入房间: {participant.identity}")

    @room.on("participant_disconnected")
    def on_participant_disconnected(participant):
        print(f"❌ 有人离开房间: {participant.identity}")

    print("🔌 正在连接房间...")
    await room.connect(WS_URL, token)
    print("✅ 已连接房间:", ROOM_NAME)
    print("当前房间成员:")
    for p in room.remote_participants.values():
        print(" -", p.identity)

    @room.on("data_received")
    def on_data_received(data, participant, kind, topic):
        try:
            msg = data.decode()
            print(f"📩 来自 {participant.identity}: {msg}")
        except:
            # Non-UTF-8 payloads are reported but not decoded.
            print("📩 收到二进制数据")

    @room.on("track_subscribed")
    def on_track_subscribed(track, publication, participant):
        print(f"🎧 订阅轨道: {participant.identity}")
        if track.kind == rtc.TrackKind.KIND_AUDIO:
            print("👉 TTS 音频来了")

    # Wait a moment to make sure the connection is stable.
    await asyncio.sleep(1)
    await room.local_participant.publish_data(
        b"hello",
        reliable=True,
        topic="chat"
    )
    # Stream the local wav file into the room.
    await publish_wav(room, "2food.wav")
    await room.disconnect()
async def publish_wav(room, wav_path):
    """Publish a local WAV file into the room as a live audio track.

    Frames are pushed in 20 ms chunks and paced with ``asyncio.sleep`` so the
    agent receives the audio in real time.
    """
    print("🎵 开始上传本地 wav:", wav_path)
    # Fix: open the wave file with a context manager so the handle is always
    # closed (the previous version leaked it); matches test_agent.py.
    with wave.open(wav_path, "rb") as wf:
        sample_rate = wf.getframerate()
        num_channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        print(f"📊 WAV信息: {sample_rate}Hz, {num_channels}ch, {sample_width*8}bit")

        # Create the audio source, wrap it in a local track and publish it.
        source = AudioSource(sample_rate, num_channels)
        track = LocalAudioTrack.create_audio_track("mic", source)
        await room.local_participant.publish_track(track)
        print("📡 已发布音轨")

        frame_duration = 0.02  # 20ms
        samples_per_frame = int(sample_rate * frame_duration)
        while True:
            data = wf.readframes(samples_per_frame)
            if not data:
                break
            # Only used to derive the sample count; the raw bytes are sent.
            audio = np.frombuffer(data, dtype=np.int16)
            if len(audio) == 0:
                continue
            samples_per_channel = len(audio) // num_channels
            frame = AudioFrame(
                data=data,  # key: pass the raw bytes
                sample_rate=sample_rate,
                num_channels=num_channels,
                samples_per_channel=samples_per_channel,
            )
            await source.capture_frame(frame)
            await asyncio.sleep(frame_duration)
    print("✅ wav 推流结束")


if __name__ == "__main__":
    asyncio.run(main())

71
test_minimax.py Normal file
View File

@ -0,0 +1,71 @@
import asyncio
import os
import logging
from dotenv import load_dotenv
from livekit.agents.llm import ChatContext
from livekit.plugins import openai
# Configure logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("test-minimax")
async def test_minimax():
    """Smoke-test the OpenAI-compatible Minimax endpoint with one streamed chat."""
    print("Loading .env...")
    load_dotenv()

    # Endpoint configuration; defaults mirror custom_agent.py, and the
    # hardcoded key is only a fallback when MINIMAX_API_KEY is not in .env.
    base_url = os.getenv("MINIMAX_LLM_BASE_URL", "https://oai.bwgdi.com/v1")
    model_name = os.getenv("MINIMAX_LLM_MODEL", "MiniMaxAI")
    api_key = os.getenv("MINIMAX_API_KEY", "sk-orez64WkG1NkfksB5j_hGA")

    import httpx
    from openai import AsyncClient as OpenAIAsyncClient

    print(f"Connecting to Minimax at {base_url} using model {model_name}")

    # OpenAI client backed by an HTTP client with SSL verification disabled
    # (self-hosted gateway).
    client = OpenAIAsyncClient(
        api_key=api_key,
        base_url=base_url,
        http_client=httpx.AsyncClient(verify=False),
    )
    llm = openai.LLM(model=model_name, client=client)

    print("Creating ChatContext...")
    ctx = ChatContext()
    ctx.add_message(
        role="user",
        content="Hello! Can you introduce yourself? Please reply in Chinese.",
    )

    print(f"\n--- Testing Streaming Chat ---")
    print(f"Request: {ctx.items[-1].content}")
    print("Response: ", end="", flush=True)
    try:
        print("\nCalling llm.chat()...")
        stream = llm.chat(chat_ctx=ctx)
        print("Iterating over stream...")
        # Print deltas as they arrive to observe streaming behavior.
        async for chunk in stream:
            if chunk.delta and chunk.delta.content:
                print(chunk.delta.content, end="", flush=True)
        print("\n--- Test Completed Successfully ---")
    except Exception as e:
        logger.error(f"\nTest failed with error: {e}")


if __name__ == "__main__":
    print("Starting...")
    try:
        # Hard 30-second cap so a hung endpoint cannot stall the script.
        asyncio.run(asyncio.wait_for(test_minimax(), timeout=30))
    except asyncio.TimeoutError:
        print("\nTest timed out after 30 seconds.")
    except Exception as e:
        print(f"\nAn error occurred: {e}")

50
test_voxcpm.py Normal file
View File

@ -0,0 +1,50 @@
import asyncio
import os
import logging
from tts_voxcpm import VoxCPMTTS
from livekit.agents import tts
logging.basicConfig(level=logging.INFO)
async def test_tts():
    """Smoke-test VoxCPMTTS: synthesize one sentence and save it as a WAV file."""
    # Use the URL from the user's curl command.
    url = "http://10.6.80.21:5002/tts-blackbox"
    prompt_wav = "/home/verachen/Music/voice/2food.wav"
    # Fix: the old if/else "fallback" assigned the exact same path and was dead
    # code; just warn when the file is missing (VoxCPMTTS itself skips the
    # prompt_wav form field in that case).
    if not os.path.exists(prompt_wav):
        print(f"Warning: prompt wav not found at {prompt_wav}")

    print(f"Testing VoxCPMTTS with URL: {url}")
    print(f"Using prompt wav: {prompt_wav}")
    vox_tts = VoxCPMTTS(
        url=url,
        prompt_wav_path=prompt_wav
    )

    text = "你好,这是一段测试文本"
    print(f"Synthesizing text: {text}")
    try:
        stream = vox_tts.synthesize(text)
        audio_frame = await stream.collect()
        print(f"Successfully synthesized audio!")
        # Fix: report sample count / duration correctly. 16-bit PCM means
        # 2 bytes per sample per channel (the old print mixed up the formula).
        num_samples = len(audio_frame.data) // (audio_frame.num_channels * 2)
        print(f"Samples: {num_samples}")
        print(f"Duration: {num_samples / audio_frame.sample_rate:.2f}s")
        # Save the result for a manual listening check; collect() returns the
        # combined frames, so to_wav_bytes() yields a valid WAV container.
        with open("test_output.wav", "wb") as f:
            f.write(audio_frame.to_wav_bytes())
        print("Saved output to test_output.wav")
    except Exception as e:
        print(f"TTS test failed: {e}")


if __name__ == "__main__":
    asyncio.run(test_tts())

118
tts_voxcpm.py Normal file
View File

@ -0,0 +1,118 @@
import aiohttp
import logging
import os
from livekit.agents import tts, utils, APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS
logger = logging.getLogger("voxcpm-tts")
class VoxCPMTTS(tts.TTS):
    """Non-streaming TTS backed by a VoxCPM "blackbox" HTTP endpoint.

    Each synthesis request posts the text plus a fixed set of generation
    options (and an optional voice-cloning prompt wav) as multipart form data.
    """

    def __init__(
        self,
        *,
        url: str,
        model_name: str = "voxcpmtts",
        prompt_text: str = "澳门有乜嘢好食嘅",
        prompt_wav_path: str = "/home/verachen/Music/voice/2food16k_2.wav",
        cfg_value: str = "2.0",
        inference_timesteps: str = "10",
        do_normalize: str = "true",
        denoise: str = "true",
        retry_badcase: str = "true",
        retry_badcase_max_times: str = "3",
        retry_badcase_ratio_threshold: str = "6.0",
        sample_rate: int = 16000,
    ):
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=False),
            sample_rate=sample_rate,
            num_channels=1,
        )
        self._url = url
        self._prompt_wav_path = prompt_wav_path
        # All option values are coerced to strings because the server consumes
        # them as multipart form fields.
        self._opts = dict(
            model_name=model_name,
            streaming="false",
            prompt_text=prompt_text,
            cfg_value=str(cfg_value),
            inference_timesteps=str(inference_timesteps),
            do_normalize=str(do_normalize),
            denoise=str(denoise),
            retry_badcase=str(retry_badcase),
            retry_badcase_max_times=str(retry_badcase_max_times),
            retry_badcase_ratio_threshold=str(retry_badcase_ratio_threshold),
        )

    @property
    def model(self) -> str:
        """Model name as advertised to the server."""
        return self._opts["model_name"]

    def synthesize(
        self,
        text: str,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> tts.ChunkedStream:
        """Create a chunked stream that performs one HTTP synthesis request."""
        return VoxCPMStream(
            self, text, self._url, self._opts, self._prompt_wav_path, conn_options=conn_options
        )
class VoxCPMStream(tts.ChunkedStream):
    """One-shot synthesis stream: posts the text (and optional prompt wav) to
    the VoxCPM server and pushes the returned WAV bytes into the AudioEmitter."""

    def __init__(
        self,
        tts: VoxCPMTTS,
        text: str,
        url: str,
        opts: dict,
        prompt_wav_path: str,
        conn_options: APIConnectOptions,
    ):
        """
        Args:
            tts: Owning VoxCPMTTS instance (provides sample rate / channels).
            text: Text to synthesize.
            url: Synthesis endpoint URL.
            opts: Form-field options to send with the request.
            prompt_wav_path: Voice-cloning prompt wav; skipped when missing.
            conn_options: Connection options forwarded to ChunkedStream.
        """
        super().__init__(tts=tts, input_text=text, conn_options=conn_options)
        self._url = url
        self._opts = opts
        self._prompt_wav_path = prompt_wav_path

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        # Initialize emitter early to avoid "AudioEmitter isn't started" error on failure
        output_emitter.initialize(
            request_id="",
            sample_rate=self._tts.sample_rate,
            num_channels=self._tts.num_channels,
            mime_type="audio/wav",
        )
        async with aiohttp.ClientSession() as session:
            data = aiohttp.FormData()
            data.add_field("text", self.input_text)
            for k, v in self._opts.items():
                data.add_field(k, v)
            # Open the prompt wav file if it exists. aiohttp streams the file
            # object, so it must stay open for the whole request; it is closed
            # in the finally block below.
            f = None
            if os.path.exists(self._prompt_wav_path):
                f = open(self._prompt_wav_path, "rb")
                data.add_field("prompt_wav", f, filename="prompt.wav", content_type="audio/wav")
            else:
                logger.warning(
                    f"Prompt wav file not found at {self._prompt_wav_path}, skipping prompt_wav field"
                )
            try:
                # Set a reasonable timeout for synthesis
                async with session.post(
                    self._url, data=data, timeout=aiohttp.ClientTimeout(total=60)
                ) as resp:
                    if resp.status != 200:
                        # NOTE(review): server errors are logged and swallowed,
                        # producing an empty (silent) synthesis downstream —
                        # confirm this best-effort behavior is intended.
                        err_text = await resp.text()
                        logger.error(f"VoxCPM TTS error: {resp.status} {err_text}")
                        return
                    # Read the entire audio data (since streaming=false)
                    audio_data = await resp.read()
                    output_emitter.push(audio_data)
                    output_emitter.flush()
            except Exception as e:
                # NOTE(review): request failures are also swallowed (no retry
                # via conn_options); verify against the ChunkedStream contract.
                logger.error(f"VoxCPM TTS request failed: {e}")
            finally:
                if f:
                    f.close()