mirror of
https://github.com/BoardWare-Genius/jarvis-models.git
synced 2025-12-13 16:53:24 +00:00
add boss cosyvoicetts
This commit is contained in:
@ -25,6 +25,17 @@ from ..log.logging_time import logging_time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
import random
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
def set_all_random_seed(seed):
    """Seed every random number generator used by this module.

    Seeds, in order, Python's ``random`` module, NumPy's legacy global
    generator, PyTorch's default generator, and PyTorch's CUDA generators,
    so that random draws made afterwards are repeatable for a given
    ``seed``.

    Args:
        seed: Integer seed, passed unchanged to each generator. NumPy's
            legacy seeding only accepts values in ``[0, 2**32 - 1]``, so
            the seed must fall in that range.

    Returns:
        None.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed the CUDA generators explicitly as well; per the PyTorch docs this
    # call is silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
|
||||
|
||||
|
||||
@singleton
|
||||
class TTS(Blackbox):
|
||||
melo_mode: str
|
||||
@ -92,60 +103,96 @@ class TTS(Blackbox):
|
||||
if settings is None:
|
||||
settings = {}
|
||||
user_model_name = settings.get("tts_model_name")
|
||||
chroma_collection_id = settings.get("chroma_collection_id")
|
||||
print(f"tts_model_name: {user_model_name}")
|
||||
|
||||
text = args[0]
|
||||
current_time = time.time()
|
||||
|
||||
if user_model_name == 'melotts':
|
||||
if self.melo_mode == 'local':
|
||||
audio = self.melotts.tts_to_file(text, self.speaker_ids[self.melo_speaker], speed=self.melo_speed)
|
||||
f = io.BytesIO()
|
||||
soundfile.write(f, audio, 44100, format='wav')
|
||||
f.seek(0)
|
||||
if chroma_collection_id == 'kiki':
|
||||
if self.melo_mode == 'local':
|
||||
audio = self.melotts.tts_to_file(text, self.speaker_ids[self.melo_speaker], speed=self.melo_speed)
|
||||
f = io.BytesIO()
|
||||
soundfile.write(f, audio, 44100, format='wav')
|
||||
f.seek(0)
|
||||
|
||||
# Read the audio data from the buffer
|
||||
data, rate = soundfile.read(f, dtype='float32')
|
||||
# Read the audio data from the buffer
|
||||
data, rate = soundfile.read(f, dtype='float32')
|
||||
|
||||
# Peak normalization
|
||||
peak_normalized_audio = pyln.normalize.peak(data, -1.0)
|
||||
# Peak normalization
|
||||
peak_normalized_audio = pyln.normalize.peak(data, -1.0)
|
||||
|
||||
# Integrated loudness normalization
|
||||
meter = pyln.Meter(rate)
|
||||
loudness = meter.integrated_loudness(peak_normalized_audio)
|
||||
loudness_normalized_audio = pyln.normalize.loudness(peak_normalized_audio, loudness, -12.0)
|
||||
# Integrated loudness normalization
|
||||
meter = pyln.Meter(rate)
|
||||
loudness = meter.integrated_loudness(peak_normalized_audio)
|
||||
loudness_normalized_audio = pyln.normalize.loudness(peak_normalized_audio, loudness, -12.0)
|
||||
|
||||
# Write the loudness normalized audio to an in-memory buffer
|
||||
normalized_audio_buffer = io.BytesIO()
|
||||
soundfile.write(normalized_audio_buffer, loudness_normalized_audio, rate, format='wav')
|
||||
normalized_audio_buffer.seek(0)
|
||||
# Write the loudness normalized audio to an in-memory buffer
|
||||
normalized_audio_buffer = io.BytesIO()
|
||||
soundfile.write(normalized_audio_buffer, loudness_normalized_audio, rate, format='wav')
|
||||
normalized_audio_buffer.seek(0)
|
||||
|
||||
print("#### MeloTTS Service consume - local : ", (time.time() - current_time))
|
||||
return normalized_audio_buffer.read()
|
||||
print("#### MeloTTS Service consume - local : ", (time.time() - current_time))
|
||||
return normalized_audio_buffer.read()
|
||||
|
||||
else:
|
||||
message = {
|
||||
"text": text
|
||||
}
|
||||
response = requests.post(self.melo_url, json=message)
|
||||
print("#### MeloTTS Service consume - docker : ", (time.time()-current_time))
|
||||
return response.content
|
||||
else:
|
||||
message = {
|
||||
"text": text
|
||||
}
|
||||
response = requests.post(self.melo_url, json=message)
|
||||
print("#### MeloTTS Service consume - docker : ", (time.time()-current_time))
|
||||
return response.content
|
||||
elif chroma_collection_id == 'boss':
|
||||
if self.cosyvoice_mode == 'local':
|
||||
set_all_random_seed(35616313)
|
||||
audio = self.cosyvoicetts.inference_sft(text, '中文男')
|
||||
f = io.BytesIO()
|
||||
soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
|
||||
f.seek(0)
|
||||
print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
|
||||
return f.read()
|
||||
else:
|
||||
message = {
|
||||
"text": text
|
||||
}
|
||||
response = requests.post(self.cosyvoice_url, json=message)
|
||||
print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time))
|
||||
return response.content
|
||||
|
||||
elif user_model_name == 'cosyvoicetts':
|
||||
if self.cosyvoice_mode == 'local':
|
||||
audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language)
|
||||
f = io.BytesIO()
|
||||
soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
|
||||
f.seek(0)
|
||||
print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
|
||||
return f.read()
|
||||
else:
|
||||
message = {
|
||||
"text": text
|
||||
}
|
||||
response = requests.post(self.cosyvoice_url, json=message)
|
||||
print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time))
|
||||
return response.content
|
||||
if chroma_collection_id == 'kiki':
|
||||
if self.cosyvoice_mode == 'local':
|
||||
set_all_random_seed(56056558)
|
||||
audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language)
|
||||
f = io.BytesIO()
|
||||
soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
|
||||
f.seek(0)
|
||||
print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
|
||||
return f.read()
|
||||
else:
|
||||
message = {
|
||||
"text": text
|
||||
}
|
||||
response = requests.post(self.cosyvoice_url, json=message)
|
||||
print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time))
|
||||
return response.content
|
||||
elif chroma_collection_id == 'boss':
|
||||
if self.cosyvoice_mode == 'local':
|
||||
set_all_random_seed(35616313)
|
||||
audio = self.cosyvoicetts.inference_sft(text, '中文男')
|
||||
f = io.BytesIO()
|
||||
soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
|
||||
f.seek(0)
|
||||
print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
|
||||
return f.read()
|
||||
else:
|
||||
message = {
|
||||
"text": text
|
||||
}
|
||||
response = requests.post(self.cosyvoice_url, json=message)
|
||||
print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time))
|
||||
return response.content
|
||||
else:
|
||||
audio = self.tts_service.read(text)
|
||||
print("#### TTS Service consume : ", (time.time()-current_time))
|
||||
|
||||
Reference in New Issue
Block a user