# jarvis-models/src/blackbox/cosyvoicetts.py

import io
import time

import requests
from fastapi import Request, Response, status
from fastapi.responses import JSONResponse
from injector import inject
from injector import singleton

from ..log.logging_time import logging_time

from ..configuration import CosyVoiceConf
from .blackbox import Blackbox

import soundfile
import pyloudnorm as pyln
import sys
sys.path.append('/home/gpu/Workspace/CosyVoice')
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio

import os
import logging
logger = logging.getLogger(__name__)

@singleton
class CosyVoiceTTS(Blackbox):
    """Text-to-speech blackbox backed by CosyVoice.

    When ``mode`` is ``'local'`` a CosyVoice model is loaded in-process and
    used for synthesis. For any other mode the text is POSTed as JSON to
    ``url`` and the body of the HTTP response is returned as the audio.
    """

    mode: str
    url: str
    speed: int
    device: str
    language: str
    speaker: str

    @logging_time(logger=logger)
    def model_init(self, cosyvoice_config: CosyVoiceConf) -> None:
        """Copy settings from the configuration and prepare the backend.

        Args:
            cosyvoice_config: Supplies ``speed``, ``device``, ``language``,
                ``speaker`` and ``mode``; ``url`` is read only when the mode
                is not ``'local'``.
        """
        self.speed = cosyvoice_config.speed
        self.device = cosyvoice_config.device
        self.language = cosyvoice_config.language
        self.speaker = cosyvoice_config.speaker
        self.mode = cosyvoice_config.mode
        self.url = ''
        self.cosyvoicetts = None
        self.speaker_ids = None
        # Exported before the model is constructed below.
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cosyvoice_config.device)
        if self.mode == 'local':
            self.cosyvoicetts = CosyVoice('/home/gpu/Workspace/Models/CosyVoice/pretrained_models/CosyVoice-300M')
        else:
            self.url = cosyvoice_config.url
        # Use the module logger (not the root logger) like the rest of the file.
        logger.info('#### Initializing CosyVoiceTTS Service in cuda:%s mode...', cosyvoice_config.device)

    @inject
    def __init__(self, cosyvoice_config: CosyVoiceConf) -> None:
        """Build the service from the injected CosyVoice configuration."""
        self.model_init(cosyvoice_config)

    def __call__(self, *args, **kwargs):
        """Forward all arguments to :meth:`processing` and return its result."""
        return self.processing(*args, **kwargs)

    def valid(self, *args, **kwargs) -> bool:
        """Return True when the first positional argument is a ``str``.

        Returns False (instead of raising ``IndexError``) when called without
        positional arguments.
        """
        return bool(args) and isinstance(args[0], str)

    @logging_time(logger=logger)
    def processing(self, *args, **kwargs) -> io.BytesIO | bytes:
        """Synthesize speech for the text given as first positional argument.

        Args:
            *args: ``args[0]`` is the text to synthesize.
            **kwargs: Accepted but not used.

        Returns:
            The audio as ``bytes``: a WAV file in local mode, the raw HTTP
            response body in remote mode.

        Raises:
            requests.HTTPError: In remote mode, when the service answers with
                a 4xx/5xx status.
        """
        text = args[0]
        started_at = time.time()
        if self.mode == 'local':
            # NOTE(review): self.language is passed as the second argument of
            # inference_sft while self.speaker is never used - confirm intent.
            result = self.cosyvoicetts.inference_sft(text, self.language)
            samples = result['tts_speech'].cpu().numpy().squeeze(0)
            with io.BytesIO() as wav_buffer:
                soundfile.write(wav_buffer, samples, 22050, format='wav')
                wav_bytes = wav_buffer.getvalue()
            print("#### CosyVoiceTTS Service consume - local : ", (time.time() - started_at))
            return wav_bytes

        response = requests.post(self.url, json={"text": text})
        print("#### CosyVoiceTTS Service consume - docker : ", (time.time() - started_at))
        # Do not hand an error page back to the caller as if it were audio.
        response.raise_for_status()
        return response.content

    async def fast_api_handler(self, request: Request) -> Response:
        """Handle an HTTP request whose JSON body carries a ``"text"`` field.

        Returns:
            A 400 JSON error when the body cannot be parsed as JSON, is not a
            JSON object, or has no ``"text"`` value; otherwise an
            ``audio/wav`` attachment containing the result of
            :meth:`processing`.
        """
        try:
            data = await request.json()
        except Exception:
            # `Exception` rather than a bare `except` so task cancellation and
            # interpreter-exit signals are not swallowed.
            return JSONResponse(content={"error": "json parse error"}, status_code=status.HTTP_400_BAD_REQUEST)
        # Valid JSON that is not an object (list, number, ...) has no "text".
        text = data.get("text") if isinstance(data, dict) else None
        if text is None:
            return JSONResponse(content={"error": "text is required"}, status_code=status.HTTP_400_BAD_REQUEST)
        return Response(
            content=self.processing(text),
            media_type="audio/wav",
            headers={"Content-Disposition": "attachment; filename=audio.wav"},
        )