mirror of
https://github.com/BoardWare-Genius/jarvis-models.git
synced 2025-12-13 16:53:24 +00:00
add boss cosyvoicetts
This commit is contained in:
@ -25,6 +25,17 @@ from ..log.logging_time import logging_time
|
|||||||
import logging
|
import logging
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
import random
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
def set_all_random_seed(seed):
    """Seed Python, NumPy and PyTorch random number generators alike.

    The same ``seed`` is applied to the ``random`` module, NumPy's global
    generator, PyTorch's default generator and PyTorch's CUDA generators,
    so that code drawing from any of them starts from a known state.

    Args:
        seed: Integer seed passed unchanged to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds the generators of all CUDA devices, not only the current one.
    torch.cuda.manual_seed_all(seed)
|
||||||
|
|
||||||
|
|
||||||
@singleton
|
@singleton
|
||||||
class TTS(Blackbox):
|
class TTS(Blackbox):
|
||||||
melo_mode: str
|
melo_mode: str
|
||||||
@ -92,60 +103,96 @@ class TTS(Blackbox):
|
|||||||
if settings is None:
|
if settings is None:
|
||||||
settings = {}
|
settings = {}
|
||||||
user_model_name = settings.get("tts_model_name")
|
user_model_name = settings.get("tts_model_name")
|
||||||
|
chroma_collection_id = settings.get("chroma_collection_id")
|
||||||
print(f"tts_model_name: {user_model_name}")
|
print(f"tts_model_name: {user_model_name}")
|
||||||
|
|
||||||
text = args[0]
|
text = args[0]
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
|
|
||||||
if user_model_name == 'melotts':
|
if user_model_name == 'melotts':
|
||||||
if self.melo_mode == 'local':
|
if chroma_collection_id == 'kiki':
|
||||||
audio = self.melotts.tts_to_file(text, self.speaker_ids[self.melo_speaker], speed=self.melo_speed)
|
if self.melo_mode == 'local':
|
||||||
f = io.BytesIO()
|
audio = self.melotts.tts_to_file(text, self.speaker_ids[self.melo_speaker], speed=self.melo_speed)
|
||||||
soundfile.write(f, audio, 44100, format='wav')
|
f = io.BytesIO()
|
||||||
f.seek(0)
|
soundfile.write(f, audio, 44100, format='wav')
|
||||||
|
f.seek(0)
|
||||||
|
|
||||||
# Read the audio data from the buffer
|
# Read the audio data from the buffer
|
||||||
data, rate = soundfile.read(f, dtype='float32')
|
data, rate = soundfile.read(f, dtype='float32')
|
||||||
|
|
||||||
# Peak normalization
|
# Peak normalization
|
||||||
peak_normalized_audio = pyln.normalize.peak(data, -1.0)
|
peak_normalized_audio = pyln.normalize.peak(data, -1.0)
|
||||||
|
|
||||||
# Integrated loudness normalization
|
# Integrated loudness normalization
|
||||||
meter = pyln.Meter(rate)
|
meter = pyln.Meter(rate)
|
||||||
loudness = meter.integrated_loudness(peak_normalized_audio)
|
loudness = meter.integrated_loudness(peak_normalized_audio)
|
||||||
loudness_normalized_audio = pyln.normalize.loudness(peak_normalized_audio, loudness, -12.0)
|
loudness_normalized_audio = pyln.normalize.loudness(peak_normalized_audio, loudness, -12.0)
|
||||||
|
|
||||||
# Write the loudness normalized audio to an in-memory buffer
|
# Write the loudness normalized audio to an in-memory buffer
|
||||||
normalized_audio_buffer = io.BytesIO()
|
normalized_audio_buffer = io.BytesIO()
|
||||||
soundfile.write(normalized_audio_buffer, loudness_normalized_audio, rate, format='wav')
|
soundfile.write(normalized_audio_buffer, loudness_normalized_audio, rate, format='wav')
|
||||||
normalized_audio_buffer.seek(0)
|
normalized_audio_buffer.seek(0)
|
||||||
|
|
||||||
print("#### MeloTTS Service consume - local : ", (time.time() - current_time))
|
print("#### MeloTTS Service consume - local : ", (time.time() - current_time))
|
||||||
return normalized_audio_buffer.read()
|
return normalized_audio_buffer.read()
|
||||||
|
|
||||||
else:
|
else:
|
||||||
message = {
|
message = {
|
||||||
"text": text
|
"text": text
|
||||||
}
|
}
|
||||||
response = requests.post(self.melo_url, json=message)
|
response = requests.post(self.melo_url, json=message)
|
||||||
print("#### MeloTTS Service consume - docker : ", (time.time()-current_time))
|
print("#### MeloTTS Service consume - docker : ", (time.time()-current_time))
|
||||||
return response.content
|
return response.content
|
||||||
|
elif chroma_collection_id == 'boss':
|
||||||
|
if self.cosyvoice_mode == 'local':
|
||||||
|
set_all_random_seed(35616313)
|
||||||
|
audio = self.cosyvoicetts.inference_sft(text, '中文男')
|
||||||
|
f = io.BytesIO()
|
||||||
|
soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
|
||||||
|
f.seek(0)
|
||||||
|
print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
|
||||||
|
return f.read()
|
||||||
|
else:
|
||||||
|
message = {
|
||||||
|
"text": text
|
||||||
|
}
|
||||||
|
response = requests.post(self.cosyvoice_url, json=message)
|
||||||
|
print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time))
|
||||||
|
return response.content
|
||||||
|
|
||||||
elif user_model_name == 'cosyvoicetts':
|
elif user_model_name == 'cosyvoicetts':
|
||||||
if self.cosyvoice_mode == 'local':
|
if chroma_collection_id == 'kiki':
|
||||||
audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language)
|
if self.cosyvoice_mode == 'local':
|
||||||
f = io.BytesIO()
|
set_all_random_seed(56056558)
|
||||||
soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
|
audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language)
|
||||||
f.seek(0)
|
f = io.BytesIO()
|
||||||
print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
|
soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
|
||||||
return f.read()
|
f.seek(0)
|
||||||
else:
|
print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
|
||||||
message = {
|
return f.read()
|
||||||
"text": text
|
else:
|
||||||
}
|
message = {
|
||||||
response = requests.post(self.cosyvoice_url, json=message)
|
"text": text
|
||||||
print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time))
|
}
|
||||||
return response.content
|
response = requests.post(self.cosyvoice_url, json=message)
|
||||||
|
print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time))
|
||||||
|
return response.content
|
||||||
|
elif chroma_collection_id == 'boss':
|
||||||
|
if self.cosyvoice_mode == 'local':
|
||||||
|
set_all_random_seed(35616313)
|
||||||
|
audio = self.cosyvoicetts.inference_sft(text, '中文男')
|
||||||
|
f = io.BytesIO()
|
||||||
|
soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
|
||||||
|
f.seek(0)
|
||||||
|
print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
|
||||||
|
return f.read()
|
||||||
|
else:
|
||||||
|
message = {
|
||||||
|
"text": text
|
||||||
|
}
|
||||||
|
response = requests.post(self.cosyvoice_url, json=message)
|
||||||
|
print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-current_time))
|
||||||
|
return response.content
|
||||||
else:
|
else:
|
||||||
audio = self.tts_service.read(text)
|
audio = self.tts_service.read(text)
|
||||||
print("#### TTS Service consume : ", (time.time()-current_time))
|
print("#### TTS Service consume : ", (time.time()-current_time))
|
||||||
|
|||||||
Reference in New Issue
Block a user