add cosyvoicetts

This commit is contained in:
0Xiao0
2024-07-29 10:24:44 +08:00
parent 5bb80d396b
commit 0315955a38
7 changed files with 142 additions and 4 deletions

View File

@ -52,6 +52,11 @@ def melotts_loader():
from .melotts import MeloTTS from .melotts import MeloTTS
return Injector().get(MeloTTS) return Injector().get(MeloTTS)
@model_loader(lazy=blackboxConf.lazyloading)
def cosyvoicetts_loader():
    """Build the CosyVoiceTTS blackbox through a fresh ``Injector``.

    ``CosyVoiceTTS`` is imported inside the function body, so the
    ``cosyvoicetts`` module is only imported when this loader runs.
    """
    from .cosyvoicetts import CosyVoiceTTS

    container = Injector()
    return container.get(CosyVoiceTTS)
@model_loader(lazy=blackboxConf.lazyloading) @model_loader(lazy=blackboxConf.lazyloading)
def tts_loader(): def tts_loader():
from .tts import TTS from .tts import TTS

View File

@ -0,0 +1,93 @@
import io
import time
import requests
from fastapi import Request, Response, status
from fastapi.responses import JSONResponse
from injector import inject
from injector import singleton
from ..log.logging_time import logging_time
from ..configuration import CosyVoiceConf
from .blackbox import Blackbox
import soundfile
import pyloudnorm as pyln
import sys
sys.path.append('/home/gpu/Workspace/CosyVoice')
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio
import os
import logging
logger = logging.getLogger(__name__)
@singleton
class CosyVoiceTTS(Blackbox):
    """Text-to-speech blackbox backed by CosyVoice.

    When ``mode`` is ``'local'`` a CosyVoice model is loaded in-process and
    used for synthesis.  For any other mode the text is POSTed as JSON to
    ``url`` and the raw response body is returned.
    """

    mode: str
    url: str
    speed: int
    device: str
    language: str
    speaker: str

    @logging_time(logger=logger)
    def model_init(self, cosyvoice_config: CosyVoiceConf) -> None:
        """Copy the configuration onto the instance and prepare the backend.

        Args:
            cosyvoice_config: Source of ``speed``, ``device``, ``language``,
                ``speaker``, ``mode`` and ``url``.

        Side effects:
            Sets the ``CUDA_VISIBLE_DEVICES`` environment variable to the
            configured device, and in ``'local'`` mode loads the CosyVoice
            model from a fixed path on disk.
        """
        self.speed = cosyvoice_config.speed
        self.device = cosyvoice_config.device
        self.language = cosyvoice_config.language
        self.speaker = cosyvoice_config.speaker
        self.url = ''
        self.mode = cosyvoice_config.mode
        self.cosyvoicetts = None
        self.speaker_ids = None
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cosyvoice_config.device)
        if self.mode == 'local':
            self.cosyvoicetts = CosyVoice('/home/gpu/Workspace/Models/CosyVoice/pretrained_models/CosyVoice-300M')
        else:
            # Remote mode: no model is loaded, only the endpoint is recorded.
            self.url = cosyvoice_config.url
        # Use the module-level logger rather than the root logger so the
        # record carries this module's name.
        logger.info('#### Initializing CosyVoiceTTS Service in cuda:' + str(cosyvoice_config.device) + ' mode...')

    @inject
    def __init__(self, cosyvoice_config: CosyVoiceConf) -> None:
        """Initialise the service from the injected configuration."""
        self.model_init(cosyvoice_config)

    def __call__(self, *args, **kwargs):
        """Forward to :meth:`processing`."""
        return self.processing(*args, **kwargs)

    def valid(self, *args, **kwargs) -> bool:
        """Return True when the first positional argument is a ``str``.

        Returns False (instead of raising ``IndexError``) when no positional
        argument is supplied.
        """
        return bool(args) and isinstance(args[0], str)

    @logging_time(logger=logger)
    def processing(self, *args, **kwargs) -> io.BytesIO | bytes:
        """Synthesise speech for ``args[0]`` and return the audio bytes.

        In ``'local'`` mode the result of ``inference_sft`` is encoded as a
        WAV file at 22050 Hz in memory.  Otherwise the text is POSTed to
        ``self.url`` and the response content is returned as-is.
        """
        text = args[0]
        started_at = time.time()
        if self.mode == 'local':
            # NOTE(review): ``self.language`` is passed as the second argument
            # while ``self.speaker`` is never used -- confirm this is intended.
            audio = self.cosyvoicetts.inference_sft(text, self.language)
            buffer = io.BytesIO()
            soundfile.write(buffer, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
            buffer.seek(0)
            print("#### CosyVoiceTTS Service consume - local : ", (time.time() - started_at))
            return buffer.read()
        else:
            message = {
                "text": text
            }
            # NOTE(review): no timeout is passed, so this call waits for the
            # remote service for as long as it takes to answer.
            response = requests.post(self.url, json=message)
            print("#### CosyVoiceTTS Service consume - docker : ", (time.time()-started_at))
            return response.content

    async def fast_api_handler(self, request: Request) -> Response:
        """Handle an HTTP request whose JSON body carries a ``text`` field.

        Returns:
            A 400 JSON response when the body cannot be parsed or ``text`` is
            missing; otherwise an ``audio/wav`` attachment with the
            synthesised audio.
        """
        try:
            data = await request.json()
        except Exception:
            # Deliberately not a bare ``except``: task cancellation and other
            # BaseException signals must keep propagating.
            return JSONResponse(content={"error": "json parse error"}, status_code=status.HTTP_400_BAD_REQUEST)
        text = data.get("text")
        if text is None:
            return JSONResponse(content={"error": "text is required"}, status_code=status.HTTP_400_BAD_REQUEST)
        return Response(content=self.processing(text), media_type="audio/wav", headers={"Content-Disposition": "attachment; filename=audio.wav"})

View File

@ -13,6 +13,7 @@ from ..configuration import MeloConf
from .blackbox import Blackbox from .blackbox import Blackbox
import soundfile import soundfile
import pyloudnorm as pyln
from melo.api import TTS from melo.api import TTS
import logging import logging
@ -65,8 +66,29 @@ class MeloTTS(Blackbox):
f = io.BytesIO() f = io.BytesIO()
soundfile.write(f, audio, 44100, format='wav') soundfile.write(f, audio, 44100, format='wav')
f.seek(0) f.seek(0)
# print("#### MeloTTS Service consume - local : ", (time.time() - current_time))
# return f.read()
# Read the audio data from the buffer
data, rate = soundfile.read(f, dtype='float32')
# Peak normalization
peak_normalized_audio = pyln.normalize.peak(data, -1.0)
# Integrated loudness normalization
meter = pyln.Meter(rate)
loudness = meter.integrated_loudness(peak_normalized_audio)
loudness_normalized_audio = pyln.normalize.loudness(peak_normalized_audio, loudness, -12.0)
# Write the loudness normalized audio to an in-memory buffer
normalized_audio_buffer = io.BytesIO()
soundfile.write(normalized_audio_buffer, loudness_normalized_audio, rate, format='wav')
normalized_audio_buffer.seek(0)
print("#### MeloTTS Service consume - local : ", (time.time() - current_time)) print("#### MeloTTS Service consume - local : ", (time.time() - current_time))
return f.read() return normalized_audio_buffer.read()
else: else:
message = { message = {
"text": text "text": text

View File

@ -65,6 +65,23 @@ class MeloConf():
self.language = config.get("melotts.language") self.language = config.get("melotts.language")
self.speaker = config.get("melotts.speaker") self.speaker = config.get("melotts.speaker")
class CosyVoiceConf:
    """Settings for the CosyVoice TTS service, read from ``cosyvoicetts.*`` keys."""

    mode: str
    url: str
    speed: int
    device: str
    language: str
    speaker: str

    @inject
    def __init__(self, config: Configuration) -> None:
        """Copy each ``cosyvoicetts.<name>`` value from *config* onto the instance."""
        lookup = config.get
        self.mode = lookup("cosyvoicetts.mode")
        self.url = lookup("cosyvoicetts.url")
        self.speed = lookup("cosyvoicetts.speed")
        self.device = lookup("cosyvoicetts.device")
        self.language = lookup("cosyvoicetts.language")
        self.speaker = lookup("cosyvoicetts.speaker")
# 'CRITICAL': CRITICAL, # 'CRITICAL': CRITICAL,
# 'FATAL': FATAL, # 'FATAL': FATAL,
# 'ERROR': ERROR, # 'ERROR': ERROR,

View File

@ -5,7 +5,7 @@ from transformers import BertTokenizer
import numpy as np import numpy as np
dirabspath = __file__.split("\\")[1:-1] dirabspath = __file__.split("\\")[1:-1]
dirabspath= "C://" + "/".join(dirabspath) dirabspath= "/home/gpu/Workspace/jarvis-models/src/sentiment_engine" + "/".join(dirabspath)
default_path = dirabspath + "/models/paimon_sentiment.onnx" default_path = dirabspath + "/models/paimon_sentiment.onnx"

View File

@ -19,7 +19,7 @@ import logging
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
dirbaspath = __file__.split("\\")[1:-1] dirbaspath = __file__.split("\\")[1:-1]
dirbaspath= "C://" + "/".join(dirbaspath) dirbaspath= "/home/gpu/Workspace/jarvis-models/src/tts" + "/".join(dirbaspath)
config = { config = {
'paimon': { 'paimon': {
'cfg': dirbaspath + '/models/paimon6k.json', 'cfg': dirbaspath + '/models/paimon6k.json',

View File

@ -93,3 +93,4 @@ components:
- chroma_upsert - chroma_upsert
- melotts - melotts
- vlms - vlms
- cosyvoicetts