diff --git a/src/blackbox/tts.py b/src/blackbox/tts.py index cfd58b1..4fc90f4 100644 --- a/src/blackbox/tts.py +++ b/src/blackbox/tts.py @@ -15,7 +15,7 @@ from injector import singleton import sys,os sys.path.append('/home/gpu/Workspace/CosyVoice') from cosyvoice.cli.cosyvoice import CosyVoice -from cosyvoice.utils.file_utils import load_wav +from cosyvoice.utils.file_utils import load_wav, speed_change import soundfile import pyloudnorm as pyln @@ -68,6 +68,7 @@ class TTS(Blackbox): else: self.melo_url = melo_config.url logging.info('#### Initializing MeloTTS Service in ' + self.melo_device + ' mode...') + print('1.#### Initializing MeloTTS Service in ' + self.melo_device + ' mode...') @logging_time(logger=logger) def cosyvoice_model_init(self, cosyvoice_config: CosyVoiceConf) -> None: @@ -84,7 +85,8 @@ class TTS(Blackbox): else: self.cosyvoice_url = cosyvoice_config.url - logging.info('#### Initializing CosyVoiceTTS Service in cuda:' + self.cosyvoice_device + ' mode...') + logging.info('#### Initializing CosyVoiceTTS Service in ' + self.cosyvoice_device + ' mode...') + print('1.#### Initializing CosyVoiceTTS Service in ' + self.cosyvoice_device + ' mode...') @inject def __init__(self, melo_config: MeloConf, cosyvoice_config: CosyVoiceConf, settings: dict) -> None: @@ -108,9 +110,8 @@ class TTS(Blackbox): text = args[0] current_time = time.time() - if user_model_name == 'melotts': - if chroma_collection_id == 'kiki': + if chroma_collection_id == 'kiki' or chroma_collection_id is None: if self.melo_mode == 'local': audio = self.melotts.tts_to_file(text, self.speaker_ids[self.melo_speaker], speed=self.melo_speed) f = io.BytesIO() @@ -161,7 +162,7 @@ class TTS(Blackbox): return response.content elif user_model_name == 'cosyvoicetts': - if chroma_collection_id == 'kiki': + if chroma_collection_id == 'kiki' or chroma_collection_id is None: if self.cosyvoice_mode == 'local': set_all_random_seed(56056558) audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language) @@ -192,7 +193,29 @@ class TTS(Blackbox): } response = requests.post(self.cosyvoice_url, json=message) print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time)) - return response.content + return response.content + elif user_model_name == 'man': + if self.cosyvoice_mode == 'local': + set_all_random_seed(35616313) + audio = self.cosyvoicetts.inference_sft(text, '中文男') + try: + audio, sample_rate = speed_change(audio["tts_speech"], 22050, str(1.5)) + audio = audio.numpy().flatten() + except Exception as e: + print(f"Failed to change speed of audio: \n{e}") + f = io.BytesIO() + soundfile.write(f, audio, 22050, format='wav') + f.seek(0) + print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time)) + return f.read() + else: + message = { + "text": text + } + response = requests.post(self.cosyvoice_url, json=message) + print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time)) + return response.content + else: audio = self.tts_service.read(text) print("#### TTS Service consume : ", (time.time()-current_time))