feat: support different models

2026-05-11 11:22:01 +08:00
parent ac81d4a9eb
commit 409c7c9de0
6 changed files with 558 additions and 228 deletions
--- a/test_voxcpm.py
+++ b/test_voxcpm.py
@ -1,50 +1,66 @@
 import asyncio
-import os
 import logging
-from tts_voxcpm import VoxCPMTTS
-from livekit.agents import tts
+import os
+
+from tts import BlackboxTTS

 logging.basicConfig(level=logging.INFO)

+
 async def test_tts():
    # Use the URL from the user's curl command
    url = "http://10.6.80.21:5002/tts-blackbox"
-    
+
    # Check if we have a real wav file to test with
    # In the earlier find_by_name, we found tests/change-sophie.wav
-    prompt_wav = "/home/verachen/Music/voice/2food.wav" 
+    prompt_wav = "/home/verachen/Music/voice/2food.wav"
    if not os.path.exists(prompt_wav):
-         prompt_wav = "/home/verachen/Music/voice/2food.wav"  # fallback to the one in curl
+        prompt_wav = "/home/verachen/Music/voice/2food.wav"  # fallback to the one in curl

-    print(f"Testing VoxCPMTTS with URL: {url}")
+    print(f"Testing BlackboxTTS with URL: {url}")
    print(f"Using prompt wav: {prompt_wav}")
-    
-    vox_tts = VoxCPMTTS(
+
+    blackbox_tts = BlackboxTTS(
        url=url,
-        prompt_wav_path=prompt_wav
+        model_name="voxcpmtts",
+        prompt_wav_path=prompt_wav,
+        params={
+            "streaming": "false",
+            "prompt_text": "澳门有乜嘢好食嘅",
+            "cfg_value": "2.0",
+            "inference_timesteps": "10",
+            "do_normalize": "true",
+            "denoise": "true",
+            "retry_badcase": "true",
+            "retry_badcase_max_times": "3",
+            "retry_badcase_ratio_threshold": "6.0",
+        },
    )
-    
+
    text = "你好，这是一段测试文本"
    print(f"Synthesizing text: {text}")
-    
+
    try:
-        stream = vox_tts.synthesize(text)
+        stream = blackbox_tts.synthesize(text)
        audio_frame = await stream.collect()
-        
-        print(f"Successfully synthesized audio!")
-        print(f"Audio duration: {audio_frame.sample_rate * len(audio_frame.data) / (audio_frame.num_channels * 2)} samples?") 
+
+        print("Successfully synthesized audio!")
+        print(
+            f"Audio duration: {audio_frame.sample_rate * len(audio_frame.data) / (audio_frame.num_channels * 2)} samples?"
+        )
        # Actually AudioFrame has duration or samples
        print(f"Samples: {len(audio_frame.data) // 2}")
-        
+
        # Save to file for manual check if possible
        with open("test_output.wav", "wb") as f:
-            # This won't be a valid WAV yet if it's just raw PCM, 
+            # This won't be a valid WAV yet if it's just raw PCM,
            # but if collect() returns combined frames, we can use to_wav_bytes()
            f.write(audio_frame.to_wav_bytes())
        print("Saved output to test_output.wav")
-        
+
    except Exception as e:
        print(f"TTS test failed: {e}")

+
 if __name__ == "__main__":
    asyncio.run(test_tts())