From ff232f0f969155f027f3e470a67ca6c7b2bfd57a Mon Sep 17 00:00:00 2001 From: 0Xiao0 <511201264@qq.com> Date: Wed, 11 Sep 2024 13:58:26 +0800 Subject: [PATCH] add boss cosyvoicetts --- src/blackbox/tts.py | 135 +++++++++++++++++++++++++++++--------------- 1 file changed, 91 insertions(+), 44 deletions(-) diff --git a/src/blackbox/tts.py b/src/blackbox/tts.py index 0449612..cfd58b1 100644 --- a/src/blackbox/tts.py +++ b/src/blackbox/tts.py @@ -25,6 +25,17 @@ from ..log.logging_time import logging_time import logging logger = logging.getLogger(__name__) +import random +import torch +import numpy as np + +def set_all_random_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + @singleton class TTS(Blackbox): melo_mode: str @@ -92,60 +103,96 @@ class TTS(Blackbox): if settings is None: settings = {} user_model_name = settings.get("tts_model_name") + chroma_collection_id = settings.get("chroma_collection_id") print(f"tts_model_name: {user_model_name}") text = args[0] current_time = time.time() if user_model_name == 'melotts': - if self.melo_mode == 'local': - audio = self.melotts.tts_to_file(text, self.speaker_ids[self.melo_speaker], speed=self.melo_speed) - f = io.BytesIO() - soundfile.write(f, audio, 44100, format='wav') - f.seek(0) + if chroma_collection_id == 'kiki': + if self.melo_mode == 'local': + audio = self.melotts.tts_to_file(text, self.speaker_ids[self.melo_speaker], speed=self.melo_speed) + f = io.BytesIO() + soundfile.write(f, audio, 44100, format='wav') + f.seek(0) - # Read the audio data from the buffer - data, rate = soundfile.read(f, dtype='float32') - - # Peak normalization - peak_normalized_audio = pyln.normalize.peak(data, -1.0) - - # Integrated loudness normalization - meter = pyln.Meter(rate) - loudness = meter.integrated_loudness(peak_normalized_audio) - loudness_normalized_audio = pyln.normalize.loudness(peak_normalized_audio, loudness, -12.0) - - # Write the loudness normalized audio to an in-memory buffer - normalized_audio_buffer = io.BytesIO() - soundfile.write(normalized_audio_buffer, loudness_normalized_audio, rate, format='wav') - normalized_audio_buffer.seek(0) - - print("#### MeloTTS Service consume - local : ", (time.time() - current_time)) - return normalized_audio_buffer.read() + # Read the audio data from the buffer + data, rate = soundfile.read(f, dtype='float32') + + # Peak normalization + peak_normalized_audio = pyln.normalize.peak(data, -1.0) + + # Integrated loudness normalization + meter = pyln.Meter(rate) + loudness = meter.integrated_loudness(peak_normalized_audio) + loudness_normalized_audio = pyln.normalize.loudness(peak_normalized_audio, loudness, -12.0) + + # Write the loudness normalized audio to an in-memory buffer + normalized_audio_buffer = io.BytesIO() + soundfile.write(normalized_audio_buffer, loudness_normalized_audio, rate, format='wav') + normalized_audio_buffer.seek(0) + + print("#### MeloTTS Service consume - local : ", (time.time() - current_time)) + return normalized_audio_buffer.read() - else: - message = { - "text": text - } - response = requests.post(self.melo_url, json=message) - print("#### MeloTTS Service consume - docker : ", (time.time()-current_time)) - return response.content + else: + message = { + "text": text + } + response = requests.post(self.melo_url, json=message) + print("#### MeloTTS Service consume - docker : ", (time.time()-current_time)) + return response.content + elif chroma_collection_id == 'boss': + if self.cosyvoice_mode == 'local': + set_all_random_seed(35616313) + audio = self.cosyvoicetts.inference_sft(text, '中文男') + f = io.BytesIO() + soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav') + f.seek(0) + print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time)) + return f.read() + else: + message = { + "text": text + } + response = requests.post(self.cosyvoice_url, json=message) + print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time)) + return response.content elif user_model_name == 'cosyvoicetts': - if self.cosyvoice_mode == 'local': - audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language) - f = io.BytesIO() - soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav') - f.seek(0) - print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time)) - return f.read() - else: - message = { - "text": text - } - response = requests.post(self.cosyvoice_url, json=message) - print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time)) - return response.content + if chroma_collection_id == 'kiki': + if self.cosyvoice_mode == 'local': + set_all_random_seed(56056558) + audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language) + f = io.BytesIO() + soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav') + f.seek(0) + print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time)) + return f.read() + else: + message = { + "text": text + } + response = requests.post(self.cosyvoice_url, json=message) + print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time)) + return response.content + elif chroma_collection_id == 'boss': + if self.cosyvoice_mode == 'local': + set_all_random_seed(35616313) + audio = self.cosyvoicetts.inference_sft(text, '中文男') + f = io.BytesIO() + soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav') + f.seek(0) + print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time)) + return f.read() + else: + message = { + "text": text + } + response = requests.post(self.cosyvoice_url, json=message) + print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time)) + return response.content else: audio = self.tts_service.read(text) print("#### TTS Service consume : ", (time.time()-current_time))