From b18c5b40da36859314a1afea5fc2b17d94e06f7c Mon Sep 17 00:00:00 2001 From: 0Xiao0 <511201264@qq.com> Date: Thu, 14 May 2026 15:33:20 +0800 Subject: [PATCH] fix: tts parameters --- custom_agent.py | 64 +++++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/custom_agent.py b/custom_agent.py index 435d470..53c9b71 100644 --- a/custom_agent.py +++ b/custom_agent.py @@ -186,7 +186,7 @@ async def entrypoint(ctx: JobContext) -> None: url=TTS_URL, model_name=TTS_MODEL, params=_tts_params_from_env(TTS_MODEL), - prompt_wav_path=os.getenv("CUSTOM_TTS_PROMPT_WAV") or os.getenv("VOXCPM_PROMPT_WAV"), + prompt_wav_path=_tts_prompt_wav_from_env(TTS_MODEL), sample_rate=TTS_SAMPLE_RATE, num_channels=TTS_NUM_CHANNELS, ), @@ -240,48 +240,54 @@ def _tts_params_from_env(model_name: str) -> dict[str, str]: model_name = model_name.lower() if model_name == "voxcpmtts": - params.update( - { - "streaming": os.getenv("CUSTOM_TTS_STREAMING", "false"), - "prompt_text": os.getenv( - "CUSTOM_TTS_PROMPT_TEXT", - os.getenv("VOXCPM_PROMPT_TEXT", "澳门有乜嘢好食嘅"), - ), - "cfg_value": os.getenv("VOXCPM_CFG_VALUE", "2.0"), - "inference_timesteps": os.getenv("VOXCPM_INFERENCE_TIMESTEPS", "10"), - "do_normalize": os.getenv("VOXCPM_DO_NORMALIZE", "true"), - "denoise": os.getenv("VOXCPM_DENOISE", "true"), - "retry_badcase": os.getenv("VOXCPM_RETRY_BADCASE", "true"), - "retry_badcase_max_times": os.getenv("VOXCPM_RETRY_BADCASE_MAX_TIMES", "3"), - "retry_badcase_ratio_threshold": os.getenv( - "VOXCPM_RETRY_BADCASE_RATIO_THRESHOLD", "6.0" - ), - } + _set_if_present(params, "streaming", os.getenv("CUSTOM_TTS_STREAMING")) + _set_if_present( + params, + "prompt_text", + os.getenv("CUSTOM_TTS_PROMPT_TEXT") or os.getenv("VOXCPM_PROMPT_TEXT"), + ) + _set_if_present(params, "cfg_value", os.getenv("VOXCPM_CFG_VALUE")) + _set_if_present(params, "inference_timesteps", os.getenv("VOXCPM_INFERENCE_TIMESTEPS")) + _set_if_present(params, "do_normalize", os.getenv("VOXCPM_DO_NORMALIZE")) + _set_if_present(params, "denoise", os.getenv("VOXCPM_DENOISE")) + _set_if_present(params, "retry_badcase", os.getenv("VOXCPM_RETRY_BADCASE")) + _set_if_present( + params, + "retry_badcase_max_times", + os.getenv("VOXCPM_RETRY_BADCASE_MAX_TIMES"), + ) + _set_if_present( + params, + "retry_badcase_ratio_threshold", + os.getenv("VOXCPM_RETRY_BADCASE_RATIO_THRESHOLD"), ) elif model_name == "melotts": - params["speed"] = os.getenv("CUSTOM_TTS_SPEED", "1.0") + _set_if_present(params, "speed", os.getenv("CUSTOM_TTS_SPEED")) elif model_name == "cosyvoicetts": _set_if_present(params, "spk_id", os.getenv("CUSTOM_TTS_SPK_ID")) _set_if_present(params, "model", os.getenv("CUSTOM_TTS_MODE")) _set_if_present(params, "prompt_text", os.getenv("CUSTOM_TTS_PROMPT_TEXT")) _set_if_present(params, "instruct_text", os.getenv("CUSTOM_TTS_INSTRUCT_TEXT")) elif model_name == "sovitstts": - params.update( - { - "text_lang": os.getenv("CUSTOM_TTS_TEXT_LANG", "zh"), - "prompt_lang": os.getenv("CUSTOM_TTS_PROMPT_LANG", "zh"), - "text_split_method": os.getenv("CUSTOM_TTS_TEXT_SPLIT_METHOD", "cut0"), - "batch_size": os.getenv("CUSTOM_TTS_BATCH_SIZE", "1"), - "media_type": os.getenv("CUSTOM_TTS_MEDIA_TYPE", "wav"), - "streaming_mode": os.getenv("CUSTOM_TTS_STREAMING", "false"), - } - ) + _set_if_present(params, "text_lang", os.getenv("CUSTOM_TTS_TEXT_LANG")) + _set_if_present(params, "prompt_lang", os.getenv("CUSTOM_TTS_PROMPT_LANG")) + _set_if_present(params, "text_split_method", os.getenv("CUSTOM_TTS_TEXT_SPLIT_METHOD")) + _set_if_present(params, "batch_size", os.getenv("CUSTOM_TTS_BATCH_SIZE")) + _set_if_present(params, "media_type", os.getenv("CUSTOM_TTS_MEDIA_TYPE")) + _set_if_present(params, "streaming_mode", os.getenv("CUSTOM_TTS_STREAMING")) _set_if_present(params, "ref_audio_path", os.getenv("CUSTOM_TTS_REF_AUDIO_PATH")) _set_if_present(params, "prompt_text", os.getenv("CUSTOM_TTS_PROMPT_TEXT")) return params +def _tts_prompt_wav_from_env(model_name: str) -> str | None: + if model_name.lower() != "voxcpmtts": + return None + + return os.getenv("CUSTOM_TTS_PROMPT_WAV") or os.getenv("VOXCPM_PROMPT_WAV") or None + + def _set_if_present(params: dict[str, str], key: str, value: str | None) -> None: if value: params[key] = value