diff --git a/src/blackbox/tts.py b/src/blackbox/tts.py index 7446190..3228daf 100644 --- a/src/blackbox/tts.py +++ b/src/blackbox/tts.py @@ -17,7 +17,7 @@ import sys,os sys.path.append('/Workspace/CosyVoice') sys.path.append('/Workspace/CosyVoice/third_party/Matcha-TTS') from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 -# from cosyvoice.utils.file_utils import load_wav, speed_change +from cosyvoice.utils.file_utils import load_wav#, speed_change import soundfile as sf import pyloudnorm as pyln @@ -33,6 +33,7 @@ import numpy as np from pydub import AudioSegment import subprocess +import re def set_all_random_seed(seed): random.seed(seed) @@ -107,12 +108,13 @@ class TTS(Blackbox): self.cosyvoice_url = '' self.cosyvoice_mode = cosyvoice_config.mode self.cosyvoicetts = None + self.prompt_speech_16k = None # os.environ['CUDA_VISIBLE_DEVICES'] = str(cosyvoice_config.device) if self.cosyvoice_mode == 'local': # self.cosyvoicetts = CosyVoice('/Workspace/Models/CosyVoice/pretrained_models/CosyVoice-300M') - self.cosyvoicetts = CosyVoice('/model/Voice/CosyVoice/pretrained_models/CosyVoice-300M') - # self.cosyvoicetts = CosyVoice2('/model/Voice/CosyVoice/pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False) - + # self.cosyvoicetts = CosyVoice('/model/Voice/CosyVoice/pretrained_models/CosyVoice-300M') + self.cosyvoicetts = CosyVoice2('/model/Voice/CosyVoice/pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False) + self.prompt_speech_16k = load_wav('/Workspace/jarvis-models/Ricky-Wong-3-Mins.wav_0006003840_0006134080.wav', 16000) else: self.cosyvoice_url = cosyvoice_config.url @@ -158,7 +160,22 @@ class TTS(Blackbox): def __call__(self, *args, **kwargs): return self.processing(*args, **kwargs) + + def filter_invalid_chars(self,text): + """过滤无效字符(包括字节流)""" + invalid_keywords = ["data:", "\n", "\r", "\t", " "] + + if isinstance(text, bytes): + text = text.decode('utf-8', errors='ignore') + + for keyword in invalid_keywords: + text = text.replace(keyword, "") + + # 移除所有英文字母和符号(保留中文、标点等) + text = re.sub(r'[a-zA-Z]', '', text) + return text.strip() + @logging_time(logger=logger) def processing(self, *args, settings: dict) -> io.BytesIO: @@ -233,13 +250,45 @@ class TTS(Blackbox): if self.cosyvoice_mode == 'local': set_all_random_seed(56056558) print("*"*90) - audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language, stream=True) + # audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language, stream=True) + audio = self.cosyvoicetts.inference_instruct2(text, '用粤语说这句话', self.prompt_speech_16k, stream=False) + # for i, j in enumerate(audio): + # f = io.BytesIO() + # sf.write(f, j['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav') + # f.seek(0) + # print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time)) + # return f.read() + # 打印 audio 的长度和内容结构 + # print(f"Total audio segments: {len(audio)}") + # print(f"Audio data structure: {audio}") + + # 创建一个空的列表来存储所有音频段的 NumPy 数组 + all_audio_data = [] + + # 遍历每一段音频并将它们存储到 all_audio_data 列表 for i, j in enumerate(audio): - f = io.BytesIO() - sf.write(f, j['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav') - f.seek(0) + # print(f"Processing segment {i + 1}...") + + # 打印每段音频的信息,确保其正确 + # print(f"Segment {i + 1} shape: {j['tts_speech'].shape}") + + # 直接将音频数据转换成 NumPy 数组 + audio_data = j['tts_speech'].cpu().numpy() + + # 将每个段的音频数据添加到 all_audio_data 列表 + all_audio_data.append(audio_data[0]) # 取音频的第一个通道(假设为单声道) + + # 将所有音频段的 NumPy 数组合并成一个完整的音频数组 + combined_audio_data = np.concatenate(all_audio_data, axis=0) + + # 将合并后的音频数据写入到 BytesIO 中 + f = io.BytesIO() + sf.write(f, combined_audio_data, 22050, format='wav') # 22050 为采样率,可能需要根据实际情况调整 + f.seek(0) + + # 返回合并后的音频 print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time)) - return f.read() + return f.read() # 返回最终合并后的音频数据 else: message = { "text": text @@ -266,6 +315,7 @@ class TTS(Blackbox): return response.content elif user_model_name == 'sovitstts': + # text = self.filter_invalid_chars(text) if chroma_collection_id == 'kiki' or chroma_collection_id is None: if self.sovits_mode == 'local': set_all_random_seed(56056558) @@ -288,7 +338,7 @@ class TTS(Blackbox): "media_type": self.sovits_media_type, "streaming_mode": self.sovits_streaming_mode } - if user_stream: + if user_stream == True or str(user_stream).lower() == "true": response = requests.get(self.sovits_url, params=message, stream=True) print("#### SoVITS Service consume - docker : ", (time.time()-current_time)) return response @@ -360,8 +410,10 @@ class TTS(Blackbox): return JSONResponse(content={"error": "text is required"}, status_code=status.HTTP_400_BAD_REQUEST) by = self.processing(text, settings=setting) # return Response(content=by, media_type="audio/wav", headers={"Content-Disposition": "attachment; filename=audio.wav"}) + print(f"tts user_stream: {type(user_stream)}") - if user_stream: + if user_stream == True or str(user_stream).lower() == "true": + print(f"tts user_stream22: {user_stream}") if by.status_code == 200: print("*"*90) def audio_stream(): @@ -405,6 +457,7 @@ class TTS(Blackbox): else: wav_filename = os.path.join(self.audio_dir, 'audio.wav') + print("8"*90) with open(wav_filename, 'wb') as f: f.write(by) diff --git a/src/blackbox/vlms.py b/src/blackbox/vlms.py index b2716e0..1a65fbf 100644 --- a/src/blackbox/vlms.py +++ b/src/blackbox/vlms.py @@ -13,9 +13,13 @@ import requests import base64 import copy import ast + +import json + import random from time import time + import io from PIL import Image from lmdeploy.serve.openai.api_client import APIClient @@ -94,7 +98,6 @@ class VLMS(Blackbox): response: a string history: a list """ - if settings: for k in settings: if k not in self.settings: @@ -315,14 +318,19 @@ class VLMS(Blackbox): ## TODO: add support for multiple images and support image in form-data format json_request = True try: - content_type = request.headers['content-type'] + content_type = request.headers.get('content-type', '') + print(content_type) if content_type == 'application/json': data = await request.json() - else: + elif 'multipart/form-data' in content_type: data = await request.form() - json_request = False + json_request = False + else: + body = await request.body() + data = json.loads(body.decode("utf-8")) + except Exception as e: - return JSONResponse(content={"error": "json parse error"}, status_code=status.HTTP_400_BAD_REQUEST) + return JSONResponse(content={"error": "json parse error"}, status_code=status.HTTP_400_BAD_REQUEST) model_name = data.get("model_name") prompt = data.get("prompt") @@ -337,8 +345,8 @@ class VLMS(Blackbox): else: return JSONResponse(content={"error": "context format error, should be in format of list or Openai_format"}, status_code=status.HTTP_400_BAD_REQUEST) - if json_request: - img_data = data.get("img_data") + if json_request or 'multipart/form-data' not in content_type: + img_data = data.get("img_data") else: img_data = await data.get("img_data").read() if settings: settings = ast.literal_eval(settings)