add boss cosyvoicetts

This commit is contained in:
0Xiao0
2024-09-11 13:58:26 +08:00
parent 42b419f576
commit ff232f0f96

View File

@ -25,6 +25,17 @@ from ..log.logging_time import logging_time
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
import random
import torch
import numpy as np
def set_all_random_seed(seed):
    """Seed every random number generator used by this module.

    The same ``seed`` is applied, in order, to Python's ``random`` module,
    NumPy's global generator, PyTorch's default generator, and PyTorch's
    CUDA generators.

    Args:
        seed: Seed value passed unchanged to each seeding function.
    """
    # Order matches the original call sequence: stdlib, NumPy, torch, CUDA.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
@singleton @singleton
class TTS(Blackbox): class TTS(Blackbox):
melo_mode: str melo_mode: str
@ -92,60 +103,96 @@ class TTS(Blackbox):
if settings is None: if settings is None:
settings = {} settings = {}
user_model_name = settings.get("tts_model_name") user_model_name = settings.get("tts_model_name")
chroma_collection_id = settings.get("chroma_collection_id")
print(f"tts_model_name: {user_model_name}") print(f"tts_model_name: {user_model_name}")
text = args[0] text = args[0]
current_time = time.time() current_time = time.time()
if user_model_name == 'melotts': if user_model_name == 'melotts':
if self.melo_mode == 'local': if chroma_collection_id == 'kiki':
audio = self.melotts.tts_to_file(text, self.speaker_ids[self.melo_speaker], speed=self.melo_speed) if self.melo_mode == 'local':
f = io.BytesIO() audio = self.melotts.tts_to_file(text, self.speaker_ids[self.melo_speaker], speed=self.melo_speed)
soundfile.write(f, audio, 44100, format='wav') f = io.BytesIO()
f.seek(0) soundfile.write(f, audio, 44100, format='wav')
f.seek(0)
# Read the audio data from the buffer # Read the audio data from the buffer
data, rate = soundfile.read(f, dtype='float32') data, rate = soundfile.read(f, dtype='float32')
# Peak normalization # Peak normalization
peak_normalized_audio = pyln.normalize.peak(data, -1.0) peak_normalized_audio = pyln.normalize.peak(data, -1.0)
# Integrated loudness normalization # Integrated loudness normalization
meter = pyln.Meter(rate) meter = pyln.Meter(rate)
loudness = meter.integrated_loudness(peak_normalized_audio) loudness = meter.integrated_loudness(peak_normalized_audio)
loudness_normalized_audio = pyln.normalize.loudness(peak_normalized_audio, loudness, -12.0) loudness_normalized_audio = pyln.normalize.loudness(peak_normalized_audio, loudness, -12.0)
# Write the loudness normalized audio to an in-memory buffer # Write the loudness normalized audio to an in-memory buffer
normalized_audio_buffer = io.BytesIO() normalized_audio_buffer = io.BytesIO()
soundfile.write(normalized_audio_buffer, loudness_normalized_audio, rate, format='wav') soundfile.write(normalized_audio_buffer, loudness_normalized_audio, rate, format='wav')
normalized_audio_buffer.seek(0) normalized_audio_buffer.seek(0)
print("#### MeloTTS Service consume - local : ", (time.time() - current_time)) print("#### MeloTTS Service consume - local : ", (time.time() - current_time))
return normalized_audio_buffer.read() return normalized_audio_buffer.read()
else: else:
message = { message = {
"text": text "text": text
} }
response = requests.post(self.melo_url, json=message) response = requests.post(self.melo_url, json=message)
print("#### MeloTTS Service consume - docker : ", (time.time()-current_time)) print("#### MeloTTS Service consume - docker : ", (time.time()-current_time))
return response.content return response.content
elif chroma_collection_id == 'boss':
if self.cosyvoice_mode == 'local':
set_all_random_seed(35616313)
audio = self.cosyvoicetts.inference_sft(text, '中文男')
f = io.BytesIO()
soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
f.seek(0)
print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
return f.read()
else:
message = {
"text": text
}
response = requests.post(self.cosyvoice_url, json=message)
print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time))
return response.content
elif user_model_name == 'cosyvoicetts': elif user_model_name == 'cosyvoicetts':
if self.cosyvoice_mode == 'local': if chroma_collection_id == 'kiki':
audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language) if self.cosyvoice_mode == 'local':
f = io.BytesIO() set_all_random_seed(56056558)
soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav') audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language)
f.seek(0) f = io.BytesIO()
print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time)) soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
return f.read() f.seek(0)
else: print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
message = { return f.read()
"text": text else:
} message = {
response = requests.post(self.cosyvoice_url, json=message) "text": text
print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time)) }
return response.content response = requests.post(self.cosyvoice_url, json=message)
print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time))
return response.content
elif chroma_collection_id == 'boss':
if self.cosyvoice_mode == 'local':
set_all_random_seed(35616313)
audio = self.cosyvoicetts.inference_sft(text, '中文男')
f = io.BytesIO()
soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
f.seek(0)
print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
return f.read()
else:
message = {
"text": text
}
response = requests.post(self.cosyvoice_url, json=message)
print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time))
return response.content
else: else:
audio = self.tts_service.read(text) audio = self.tts_service.read(text)
print("#### TTS Service consume : ", (time.time()-current_time)) print("#### TTS Service consume : ", (time.time()-current_time))