Merge pull request #28 from BoardWare-Genius/veraGDI

Vera gdi
This commit is contained in:
headbigsile
2025-03-19 16:08:21 +08:00
committed by GitHub
2 changed files with 79 additions and 18 deletions

View File

@@ -17,7 +17,7 @@ import sys,os
sys.path.append('/Workspace/CosyVoice') sys.path.append('/Workspace/CosyVoice')
sys.path.append('/Workspace/CosyVoice/third_party/Matcha-TTS') sys.path.append('/Workspace/CosyVoice/third_party/Matcha-TTS')
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
# from cosyvoice.utils.file_utils import load_wav, speed_change from cosyvoice.utils.file_utils import load_wav#, speed_change
import soundfile as sf import soundfile as sf
import pyloudnorm as pyln import pyloudnorm as pyln
@@ -33,6 +33,7 @@ import numpy as np
from pydub import AudioSegment from pydub import AudioSegment
import subprocess import subprocess
import re
def set_all_random_seed(seed): def set_all_random_seed(seed):
random.seed(seed) random.seed(seed)
@@ -107,12 +108,13 @@ class TTS(Blackbox):
self.cosyvoice_url = '' self.cosyvoice_url = ''
self.cosyvoice_mode = cosyvoice_config.mode self.cosyvoice_mode = cosyvoice_config.mode
self.cosyvoicetts = None self.cosyvoicetts = None
self.prompt_speech_16k = None
# os.environ['CUDA_VISIBLE_DEVICES'] = str(cosyvoice_config.device) # os.environ['CUDA_VISIBLE_DEVICES'] = str(cosyvoice_config.device)
if self.cosyvoice_mode == 'local': if self.cosyvoice_mode == 'local':
# self.cosyvoicetts = CosyVoice('/Workspace/Models/CosyVoice/pretrained_models/CosyVoice-300M') # self.cosyvoicetts = CosyVoice('/Workspace/Models/CosyVoice/pretrained_models/CosyVoice-300M')
self.cosyvoicetts = CosyVoice('/model/Voice/CosyVoice/pretrained_models/CosyVoice-300M') # self.cosyvoicetts = CosyVoice('/model/Voice/CosyVoice/pretrained_models/CosyVoice-300M')
# self.cosyvoicetts = CosyVoice2('/model/Voice/CosyVoice/pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False) self.cosyvoicetts = CosyVoice2('/model/Voice/CosyVoice/pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False)
self.prompt_speech_16k = load_wav('/Workspace/jarvis-models/Ricky-Wong-3-Mins.wav_0006003840_0006134080.wav', 16000)
else: else:
self.cosyvoice_url = cosyvoice_config.url self.cosyvoice_url = cosyvoice_config.url
@@ -159,6 +161,21 @@ class TTS(Blackbox):
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
return self.processing(*args, **kwargs) return self.processing(*args, **kwargs)
def filter_invalid_chars(self, text):
    """Remove characters that must not reach the TTS engine.

    Accepts ``str`` or ``bytes``; bytes are decoded as UTF-8 with
    undecodable sequences ignored.  Strips streaming ``data:`` markers,
    whitespace (newline, CR, tab, space) and all ASCII letters, so CJK
    characters, digits and punctuation are preserved.

    Args:
        text: input text as ``str`` or UTF-8-encoded ``bytes``.

    Returns:
        The cleaned string with surrounding whitespace stripped.
    """
    if isinstance(text, bytes):
        # Upstream may hand us raw byte chunks; ignore broken sequences.
        text = text.decode('utf-8', errors='ignore')
    # Drop SSE-style "data:" markers first, then remove all whitespace
    # characters in one C-level pass (equivalent to the chained
    # single-character replace() calls, just in a single traversal).
    text = text.replace("data:", "")
    text = text.translate(str.maketrans('', '', '\n\r\t '))
    # Remove ASCII letters only — digits, CJK text and punctuation
    # (including non-letter ASCII symbols) are intentionally kept.
    text = re.sub(r'[a-zA-Z]', '', text)
    return text.strip()
@logging_time(logger=logger) @logging_time(logger=logger)
def processing(self, *args, settings: dict) -> io.BytesIO: def processing(self, *args, settings: dict) -> io.BytesIO:
@@ -233,13 +250,45 @@ class TTS(Blackbox):
if self.cosyvoice_mode == 'local': if self.cosyvoice_mode == 'local':
set_all_random_seed(56056558) set_all_random_seed(56056558)
print("*"*90) print("*"*90)
audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language, stream=True) # audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language, stream=True)
audio = self.cosyvoicetts.inference_instruct2(text, '用粤语说这句话', self.prompt_speech_16k, stream=False)
# for i, j in enumerate(audio):
# f = io.BytesIO()
# sf.write(f, j['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
# f.seek(0)
# print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
# return f.read()
# 打印 audio 的长度和内容结构
# print(f"Total audio segments: {len(audio)}")
# print(f"Audio data structure: {audio}")
# 创建一个空的列表来存储所有音频段的 NumPy 数组
all_audio_data = []
# 遍历每一段音频并将它们存储到 all_audio_data 列表
for i, j in enumerate(audio): for i, j in enumerate(audio):
f = io.BytesIO() # print(f"Processing segment {i + 1}...")
sf.write(f, j['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
f.seek(0) # 打印每段音频的信息,确保其正确
# print(f"Segment {i + 1} shape: {j['tts_speech'].shape}")
# 直接将音频数据转换成 NumPy 数组
audio_data = j['tts_speech'].cpu().numpy()
# 将每个段的音频数据添加到 all_audio_data 列表
all_audio_data.append(audio_data[0]) # 取音频的第一个通道(假设为单声道)
# 将所有音频段的 NumPy 数组合并成一个完整的音频数组
combined_audio_data = np.concatenate(all_audio_data, axis=0)
# 将合并后的音频数据写入到 BytesIO 中
f = io.BytesIO()
sf.write(f, combined_audio_data, 22050, format='wav') # 22050 为采样率,可能需要根据实际情况调整
f.seek(0)
# 返回合并后的音频
print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time)) print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
return f.read() return f.read() # 返回最终合并后的音频数据
else: else:
message = { message = {
"text": text "text": text
@@ -266,6 +315,7 @@ class TTS(Blackbox):
return response.content return response.content
elif user_model_name == 'sovitstts': elif user_model_name == 'sovitstts':
# text = self.filter_invalid_chars(text)
if chroma_collection_id == 'kiki' or chroma_collection_id is None: if chroma_collection_id == 'kiki' or chroma_collection_id is None:
if self.sovits_mode == 'local': if self.sovits_mode == 'local':
set_all_random_seed(56056558) set_all_random_seed(56056558)
@@ -288,7 +338,7 @@ class TTS(Blackbox):
"media_type": self.sovits_media_type, "media_type": self.sovits_media_type,
"streaming_mode": self.sovits_streaming_mode "streaming_mode": self.sovits_streaming_mode
} }
if user_stream: if user_stream == True or str(user_stream).lower() == "true":
response = requests.get(self.sovits_url, params=message, stream=True) response = requests.get(self.sovits_url, params=message, stream=True)
print("#### SoVITS Service consume - docker : ", (time.time()-current_time)) print("#### SoVITS Service consume - docker : ", (time.time()-current_time))
return response return response
@@ -360,8 +410,10 @@ class TTS(Blackbox):
return JSONResponse(content={"error": "text is required"}, status_code=status.HTTP_400_BAD_REQUEST) return JSONResponse(content={"error": "text is required"}, status_code=status.HTTP_400_BAD_REQUEST)
by = self.processing(text, settings=setting) by = self.processing(text, settings=setting)
# return Response(content=by, media_type="audio/wav", headers={"Content-Disposition": "attachment; filename=audio.wav"}) # return Response(content=by, media_type="audio/wav", headers={"Content-Disposition": "attachment; filename=audio.wav"})
print(f"tts user_stream: {type(user_stream)}")
if user_stream: if user_stream == True or str(user_stream).lower() == "true":
print(f"tts user_stream22: {user_stream}")
if by.status_code == 200: if by.status_code == 200:
print("*"*90) print("*"*90)
def audio_stream(): def audio_stream():
@@ -405,6 +457,7 @@ class TTS(Blackbox):
else: else:
wav_filename = os.path.join(self.audio_dir, 'audio.wav') wav_filename = os.path.join(self.audio_dir, 'audio.wav')
print("8"*90)
with open(wav_filename, 'wb') as f: with open(wav_filename, 'wb') as f:
f.write(by) f.write(by)

View File

@@ -13,9 +13,13 @@ import requests
import base64 import base64
import copy import copy
import ast import ast
import json
import random import random
from time import time from time import time
import io import io
from PIL import Image from PIL import Image
from lmdeploy.serve.openai.api_client import APIClient from lmdeploy.serve.openai.api_client import APIClient
@@ -94,7 +98,6 @@ class VLMS(Blackbox):
response: a string response: a string
history: a list history: a list
""" """
if settings: if settings:
for k in settings: for k in settings:
if k not in self.settings: if k not in self.settings:
@@ -315,12 +318,17 @@ class VLMS(Blackbox):
## TODO: add support for multiple images and support image in form-data format ## TODO: add support for multiple images and support image in form-data format
json_request = True json_request = True
try: try:
content_type = request.headers['content-type'] content_type = request.headers.get('content-type', '')
print(content_type)
if content_type == 'application/json': if content_type == 'application/json':
data = await request.json() data = await request.json()
else: elif 'multipart/form-data' in content_type:
data = await request.form() data = await request.form()
json_request = False json_request = False
else:
body = await request.body()
data = json.loads(body.decode("utf-8"))
except Exception as e: except Exception as e:
return JSONResponse(content={"error": "json parse error"}, status_code=status.HTTP_400_BAD_REQUEST) return JSONResponse(content={"error": "json parse error"}, status_code=status.HTTP_400_BAD_REQUEST)
@@ -337,7 +345,7 @@ class VLMS(Blackbox):
else: else:
return JSONResponse(content={"error": "context format error, should be in format of list or Openai_format"}, status_code=status.HTTP_400_BAD_REQUEST) return JSONResponse(content={"error": "context format error, should be in format of list or Openai_format"}, status_code=status.HTTP_400_BAD_REQUEST)
if json_request: if json_request or 'multipart/form-data' not in content_type:
img_data = data.get("img_data") img_data = data.get("img_data")
else: else:
img_data = await data.get("img_data").read() img_data = await data.get("img_data").read()