diff --git a/= b/=
new file mode 100644
index 0000000..e69de29
diff --git a/main.py b/main.py
index 4f70de2..de29765 100644
--- a/main.py
+++ b/main.py
@@ -4,6 +4,9 @@
 from injector import Injector,inject
 from src.log.handler import LogHandler
 from src.configuration import EnvConf, LogConf, singleton
+import argparse
+
+
 @singleton
 class Main():
@@ -14,7 +17,7 @@ class Main():
     def run(self):
         logger = logging.getLogger(__name__)
         logger.info("jarvis-models start", extra={"version": "0.0.1"})
-        uvicorn.run("server:app", host="0.0.0.0", port=8000, log_level="info")
+        uvicorn.run("server:app", host="0.0.0.0", port=8000, log_level="info", reload=True)
 
 if __name__ == "__main__":
     injector = Injector()
diff --git a/sample/chroma_client1.py b/sample/chroma_client1.py
index f4bbdb1..133613c 100644
--- a/sample/chroma_client1.py
+++ b/sample/chroma_client1.py
@@ -70,7 +70,7 @@ def get_all_files(folder_path):
 
 # Load and split the documents
-loader = TextLoader("/home/administrator/Workspace/jarvis-models/sample/RAG_zh.txt")
+loader = TextLoader("/home/gpu/Workspace/jarvis-models/sample/RAG_zh.txt")
 
 documents = loader.load()
@@ -84,11 +84,11 @@
 ids = ["20240521_store"+str(i) for i in range(len(docs))]
 
 # Load the embedding model and the Chroma server
-embedding_model = SentenceTransformerEmbeddings(model_name='/home/administrator/Workspace/Models/BAAI/bge-large-zh-v1.5', model_kwargs={"device": "cuda"})
-client = chromadb.HttpClient(host='172.16.5.8', port=7000)
+embedding_model = SentenceTransformerEmbeddings(model_name='/home/gpu/Workspace/Models/BAAI/bge-large-zh-v1.5', model_kwargs={"device": "cuda"})
+client = chromadb.HttpClient(host='10.6.81.119', port=7000)
 id = "g2e"
-client.delete_collection(id)
+#client.delete_collection(id)
 collection_number = client.get_or_create_collection(id).count()
 print("collection_number",collection_number)
 start_time2 = time.time()
@@ -106,8 +106,8 @@ print("collection_number",collection_number)
 # # Chroma retrieval
 # from chromadb.utils import embedding_functions
-# embedding_model = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="/home/administrator/Workspace/Models/BAAI/bge-large-zh-v1.5", device = "cuda")
-# client = chromadb.HttpClient(host='172.16.5.8', port=7000)
+# embedding_model = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="/home/gpu/Workspace/Models/BAAI/bge-large-zh-v1.5", device = "cuda")
+# client = chromadb.HttpClient(host='10.6.81.119', port=7000)
 # collection = client.get_collection("g2e", embedding_function=embedding_model)
 # print(collection.count())
@@ -152,7 +152,7 @@ print("collection_number",collection_number)
 # 'Content-Type': 'application/json',
 # 'Authorization': "Bearer " + key
 # }
-# url = "http://172.16.5.8:23333/v1/chat/completions"
+# url = "http://10.6.81.119:23333/v1/chat/completions"
 # fastchat_response = requests.post(url, json=chat_inputs, headers=header)
 # # print(fastchat_response.json())
diff --git a/sample/chroma_client_en.py b/sample/chroma_client_en.py
index ab8a136..ee60034 100644
--- a/sample/chroma_client_en.py
+++ b/sample/chroma_client_en.py
@@ -70,7 +70,7 @@ def get_all_files(folder_path):
 
 # Load and split the documents
-loader = TextLoader("/home/administrator/Workspace/jarvis-models/sample/RAG_en.txt")
+loader = TextLoader("/home/gpu/Workspace/jarvis-models/sample/RAG_en.txt")
 
 documents = loader.load()
@@ -84,8 +84,8 @@ ids = ["20240521_store"+str(i) for i in range(len(docs))]
 
 # Load the embedding model and the Chroma server
-embedding_model = SentenceTransformerEmbeddings(model_name='/home/administrator/Workspace/Models/BAAI/bge-large-en-v1.5', model_kwargs={"device": "cuda"})
-client = chromadb.HttpClient(host='172.16.5.8', port=7000)
+embedding_model = SentenceTransformerEmbeddings(model_name='/home/gpu/Workspace/Models/BAAI/bge-small-en-v1.5', model_kwargs={"device": "cuda"})
+client = chromadb.HttpClient(host='10.6.81.119', port=7000)
 id = "g2e_english"
 client.delete_collection(id)
@@ -106,8 +106,8 @@ print("collection_number",collection_number)
 # # Chroma retrieval
 # from chromadb.utils import embedding_functions
-# embedding_model = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="/home/administrator/Workspace/Models/BAAI/bge-large-zh-v1.5", device = "cuda")
-# client = chromadb.HttpClient(host='172.16.5.8', port=7000)
+# embedding_model = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="/home/gpu/Workspace/Models/BAAI/bge-large-zh-v1.5", device = "cuda")
+# client = chromadb.HttpClient(host='10.6.81.119', port=7000)
 # collection = client.get_collection("g2e", embedding_function=embedding_model)
 # print(collection.count())
@@ -152,7 +152,7 @@ print("collection_number",collection_number)
 # 'Content-Type': 'application/json',
 # 'Authorization': "Bearer " + key
 # }
-# url = "http://172.16.5.8:23333/v1/chat/completions"
+# url = "http://10.6.81.119:23333/v1/chat/completions"
 # fastchat_response = requests.post(url, json=chat_inputs, headers=header)
 # # print(fastchat_response.json())
diff --git a/sample/chroma_client_query.py b/sample/chroma_client_query.py
index 9034c65..a3935f3 100644
--- a/sample/chroma_client_query.py
+++ b/sample/chroma_client_query.py
@@ -66,7 +66,7 @@ def get_all_files(folder_path):
 
 # Load and split the documents
-# loader = TextLoader("/home/administrator/Workspace/jarvis-models/sample/RAG_zh.txt")
+# loader = TextLoader("/home/gpu/Workspace/jarvis-models/sample/RAG_zh.txt")
 
 # documents = loader.load()
@@ -80,8 +80,8 @@ def get_all_files(folder_path):
 
 # # Load the embedding model and the Chroma server
-# embedding_model = SentenceTransformerEmbeddings(model_name='/home/administrator/Workspace/Models/BAAI/bge-large-zh-v1.5', model_kwargs={"device": "cuda"})
-# client = chromadb.HttpClient(host='172.16.5.8', port=7000)
+# embedding_model = SentenceTransformerEmbeddings(model_name='/home/gpu/Workspace/Models/BAAI/bge-large-zh-v1.5', model_kwargs={"device": "cuda"})
+# client = chromadb.HttpClient(host='10.6.81.119', port=7000)
 # id = "g2e"
 # client.delete_collection(id)
@@ -102,8 +102,8 @@ def get_all_files(folder_path):
 
 # Chroma retrieval
 from chromadb.utils import embedding_functions
-embedding_model = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="/home/administrator/Workspace/Models/BAAI/bge-large-zh-v1.5", device = "cuda")
-client = chromadb.HttpClient(host='172.16.5.8', port=7000)
+embedding_model = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="/home/gpu/Workspace/Models/BAAI/bge-large-zh-v1.5", device = "cuda")
+client = chromadb.HttpClient(host='10.6.81.119', port=7000)
 collection = client.get_collection("g2e", embedding_function=embedding_model)
 print(collection.count())
@@ -148,7 +148,7 @@ print("time: ", time.time() - start_time)
 # 'Content-Type': 'application/json',
 # 'Authorization': "Bearer " + key
 # }
-# url = "http://172.16.5.8:23333/v1/chat/completions"
+# url = "http://10.6.81.119:23333/v1/chat/completions"
 # fastchat_response = requests.post(url, json=chat_inputs, headers=header)
 # # print(fastchat_response.json())
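Note on the three sample scripts above: they upsert into and query the "g2e" collections on the Chroma server at 10.6.81.119:7000. For reference, a minimal retrieval sketch against that collection; it assumes the server is reachable and the local BGE model path exists:

    import chromadb
    from chromadb.utils import embedding_functions

    # Same endpoint and embedding model the sample scripts use; adjust if yours differ.
    embedding_model = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="/home/gpu/Workspace/Models/BAAI/bge-large-zh-v1.5", device="cuda")
    client = chromadb.HttpClient(host="10.6.81.119", port=7000)
    collection = client.get_collection("g2e", embedding_function=embedding_model)

    # Retrieve the three chunks closest to the query text.
    hits = collection.query(query_texts=["What is RAG?"], n_results=3)
    print(hits["documents"][0])
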
diff --git a/src/blackbox/asr.py b/src/blackbox/asr.py
index 5338ec2..6c10c74 100644
--- a/src/blackbox/asr.py
+++ b/src/blackbox/asr.py
@@ -6,26 +6,109 @@ from fastapi.responses import JSONResponse
 
 from ..asr.rapid_paraformer.utils import read_yaml
 from ..asr.rapid_paraformer import RapidParaformer
+
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
 from .blackbox import Blackbox
 from injector import singleton, inject
+import tempfile
+
+import json
+import os
+from ..configuration import SenseVoiceConf
+
+from ..log.logging_time import logging_time
+import logging
+logger = logging.getLogger(__name__)
+
 @singleton
 class ASR(Blackbox):
+    mode: str
+    url: str
+    speed: int
+    device: str
+    language: str
+    speaker: str
+
+    @logging_time(logger=logger)
+    def model_init(self, sensevoice_config: SenseVoiceConf) -> None:
+
+        config = read_yaml(".env.yaml")
+        self.paraformer = RapidParaformer(config)
+
+        model_dir = "/home/gpu/Workspace/Models/SenseVoice/SenseVoiceSmall"
+
+        self.speed = sensevoice_config.speed
+        self.device = sensevoice_config.device
+        self.language = sensevoice_config.language
+        self.speaker = sensevoice_config.speaker
+        self.url = ''
+        self.mode = sensevoice_config.mode
+        self.asr = None
+        self.speaker_ids = None
+        # os.environ['CUDA_VISIBLE_DEVICES'] = str(sensevoice_config.device)
+        if self.mode == 'local':
+            self.asr = AutoModel(
+                model=model_dir,
+                trust_remote_code=True,
+                remote_code="/home/gpu/Workspace/SenseVoice/model.py",
+                vad_model="fsmn-vad",
+                vad_kwargs={"max_single_segment_time": 30000},
+                device=self.device,
+            )
+        else:
+            self.url = sensevoice_config.url
+        logger.info('#### Initializing SenseVoiceASR Service in cuda:' + str(sensevoice_config.device) + ' mode...')
 
     @inject
-    def __init__(self,path = ".env.yaml") -> None:
-        config = read_yaml(path)
-        self.paraformer = RapidParaformer(config)
-
+    def __init__(self, sensevoice_config: SenseVoiceConf, settings: dict) -> None:
+        self.model_init(sensevoice_config)
+
     def __call__(self, *args, **kwargs):
         return self.processing(*args, **kwargs)
 
-    async def processing(self, *args, **kwargs):
+    async def processing(self, *args, settings: dict):
+
+        print("\nChat Settings: ", settings)
+        if settings is None:
+            settings = {}
+        user_model_name = settings.get("asr_model_name")
+        print(f"asr_model_name: {user_model_name}")
         data = args[0]
-        results = self.paraformer([BytesIO(data)])
-        if len(results) == 0:
-            return None
-        return results[0]
+
+        if user_model_name == 'sensevoice':
+            # Create a temporary file for the uploaded audio
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
+                temp_audio_file.write(data)
+                temp_audio_path = temp_audio_file.name
+            res = self.asr.generate(
+                input=temp_audio_path,
+                cache={},
+                language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+                use_itn=True,
+                batch_size_s=60,
+                merge_vad=True,
+                merge_length_s=15,
+            )
+            # results = self.paraformer([BytesIO(data)])
+            results = rich_transcription_postprocess(res[0]["text"])
+            os.remove(temp_audio_path)
+            if len(results) == 0:
+                return None
+            return results
+
+        elif user_model_name == 'funasr':
+            # FunASR branch is not implemented yet; fall back to the Paraformer pipeline
+            results = self.paraformer([BytesIO(data)])
+            if len(results) == 0:
+                return None
+            return results[0]
+
+        else:
+            results = self.paraformer([BytesIO(data)])
+            if len(results) == 0:
+                return None
+            return results[0]
 
     def valid(self, data: any) -> bool:
         if isinstance(data, bytes):
@@ -34,12 +117,20 @@ class ASR(Blackbox):
     async def fast_api_handler(self, request: Request) -> Response:
         data = (await request.form()).get("audio")
+        setting: dict = (await request.form()).get("settings")
+
+        if isinstance(setting, str):
+            try:
+                setting = json.loads(setting)  # try to parse the JSON string into a dict
+            except json.JSONDecodeError:
+                return JSONResponse(content={"error": "Invalid settings format"}, status_code=status.HTTP_400_BAD_REQUEST)
+
         if data is None:
-            self.logger.warn("asr bag request","type", "fast_api_handler", "api", "asr")
+            # self.logger.warn("asr bad request", "type", "fast_api_handler", "api", "asr")
             return JSONResponse(content={"error": "data is required"}, status_code=status.HTTP_400_BAD_REQUEST)
         d = await data.read()
         try:
-            txt = await self.processing(d)
+            txt = await self.processing(d, settings=setting)
         except ValueError as e:
             return JSONResponse(content={"error": str(e)}, status_code=status.HTTP_400_BAD_REQUEST)
         return JSONResponse(content={"text": txt}, status_code=status.HTTP_200_OK)
\ No newline at end of file
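For reference, a client-side sketch of the new settings contract introduced in asr.py. The /asr route path is an assumption here, since the route registration is not part of this diff:

    import json
    import requests

    with open("test_data/voice/nihao.wav", "rb") as f:
        resp = requests.post(
            "http://localhost:8000/asr",  # assumed mount point for ASR.fast_api_handler
            files={"audio": f},
            # "settings" arrives as a form string and is json.loads()-ed by the handler;
            # "asr_model_name" selects the SenseVoice branch in ASR.processing.
            data={"settings": json.dumps({"asr_model_name": "sensevoice"})},
        )
    print(resp.json())  # {"text": "..."} on success
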
diff --git a/src/blackbox/asr_bak.py b/src/blackbox/asr_bak.py
new file mode 100644
index 0000000..9f6f614
--- /dev/null
+++ b/src/blackbox/asr_bak.py
@@ -0,0 +1,45 @@
+from io import BytesIO
+from typing import Any, Coroutine
+
+from fastapi import Request, Response, status
+from fastapi.responses import JSONResponse
+
+from ..asr.rapid_paraformer.utils import read_yaml
+from ..asr.rapid_paraformer import RapidParaformer
+from .blackbox import Blackbox
+from injector import singleton, inject
+
+@singleton
+class ASR(Blackbox):
+
+    @inject
+    def __init__(self, path=".env.yaml") -> None:
+        config = read_yaml(path)
+        self.paraformer = RapidParaformer(config)
+
+    def __call__(self, *args, **kwargs):
+        return self.processing(*args, **kwargs)
+
+    async def processing(self, *args, **kwargs):
+        data = args[0]
+        results = self.paraformer([BytesIO(data)])
+        if len(results) == 0:
+            return None
+        return results[0]
+
+    def valid(self, data: any) -> bool:
+        if isinstance(data, bytes):
+            return True
+        return False
+
+    async def fast_api_handler(self, request: Request) -> Response:
+        data = (await request.form()).get("audio")
+        if data is None:
+            # self.logger.warn("asr bad request", "type", "fast_api_handler", "api", "asr")
+            return JSONResponse(content={"error": "data is required"}, status_code=status.HTTP_400_BAD_REQUEST)
+        d = await data.read()
+        try:
+            txt = await self.processing(d)
+        except ValueError as e:
+            return JSONResponse(content={"error": str(e)}, status_code=status.HTTP_400_BAD_REQUEST)
+        return JSONResponse(content={"text": txt}, status_code=status.HTTP_200_OK)
\ No newline at end of file
diff --git a/src/blackbox/asrsensevoice.py b/src/blackbox/asrsensevoice.py
new file mode 100644
index 0000000..d3d0e78
--- /dev/null
+++ b/src/blackbox/asrsensevoice.py
@@ -0,0 +1,103 @@
+from io import BytesIO
+from typing import Any, Coroutine
+
+from fastapi import Request, Response, status
+from fastapi.responses import JSONResponse
+
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+from .blackbox import Blackbox
+from injector import singleton, inject
+
+import tempfile
+
+import os
+from ..configuration import SenseVoiceConf
+
+from ..log.logging_time import logging_time
+import logging
+logger = logging.getLogger(__name__)
+
+@singleton
+class ASR(Blackbox):
+    mode: str
+    url: str
+    speed: int
+    device: str
+    language: str
+    speaker: str
+
+    @logging_time(logger=logger)
+    def model_init(self, sensevoice_config: SenseVoiceConf) -> None:
+
+        model_dir = "/home/gpu/Workspace/Models/SenseVoice/SenseVoiceSmall"
+
+        self.speed = sensevoice_config.speed
+        self.device = sensevoice_config.device
+        self.language = sensevoice_config.language
+        self.speaker = sensevoice_config.speaker
+        self.url = ''
+        self.mode = sensevoice_config.mode
+        self.asr = None
+        self.speaker_ids = None
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(sensevoice_config.device)
+        if self.mode == 'local':
+            self.asr = AutoModel(
+                model=model_dir,
+                trust_remote_code=True,
+                remote_code="/home/gpu/Workspace/SenseVoice/model.py",
+                vad_model="fsmn-vad",
+                vad_kwargs={"max_single_segment_time": 30000},
+                device="cuda:0",
+            )
+        else:
+            self.url = sensevoice_config.url
+        logger.info('#### Initializing SenseVoiceASR Service in cuda:' + str(sensevoice_config.device) + ' mode...')
+
+    @inject
+    def __init__(self, sensevoice_config: SenseVoiceConf) -> None:
+        self.model_init(sensevoice_config)
+
+    def __call__(self, *args, **kwargs):
+        return self.processing(*args, **kwargs)
+
+    async def processing(self, *args, **kwargs):
+        data = args[0]
+        # Create a temporary file for the uploaded audio
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
+            temp_audio_file.write(data)
+            temp_audio_path = temp_audio_file.name
+        res = self.asr.generate(
+            input=temp_audio_path,
+            cache={},
+            language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+            use_itn=True,
+            batch_size_s=60,
+            merge_vad=True,
+            merge_length_s=15,
+        )
+        # results = self.paraformer([BytesIO(data)])
+        results = rich_transcription_postprocess(res[0]["text"])
+        os.remove(temp_audio_path)
+        if len(results) == 0:
+            return None
+        return results
+
+    def valid(self, data: any) -> bool:
+        if isinstance(data, bytes):
+            return True
+        return False
+
+    async def fast_api_handler(self, request: Request) -> Response:
+        data = (await request.form()).get("audio")
+        if data is None:
+            # self.logger.warn("asr bad request", "type", "fast_api_handler", "api", "asr")
+            return JSONResponse(content={"error": "data is required"}, status_code=status.HTTP_400_BAD_REQUEST)
+        d = await data.read()
+        try:
+            txt = await self.processing(d)
+        except ValueError as e:
+            return JSONResponse(content={"error": str(e)}, status_code=status.HTTP_400_BAD_REQUEST)
+        return JSONResponse(content={"text": txt}, status_code=status.HTTP_200_OK)
\ No newline at end of file
diff --git a/src/blackbox/audio_chat.py b/src/blackbox/audio_chat.py
index 1108b44..3f34e1d 100644
--- a/src/blackbox/audio_chat.py
+++ b/src/blackbox/audio_chat.py
@@ -3,7 +3,7 @@ from fastapi import Request, Response,status
 from fastapi.responses import JSONResponse
 from injector import inject, singleton
 
-from .asr import ASR
+from .asrsensevoice import ASR
 from .tesou import Tesou
 from .tts import TTS
diff --git a/src/blackbox/blackbox_factory.py b/src/blackbox/blackbox_factory.py
index 8565eb1..8544416 100644
--- a/src/blackbox/blackbox_factory.py
+++ b/src/blackbox/blackbox_factory.py
@@ -47,15 +47,15 @@ def vlms_loader():
     from .vlms import VLMS
     return Injector().get(VLMS)
 
-@model_loader(lazy=blackboxConf.lazyloading)
-def melotts_loader():
-    from .melotts import MeloTTS
-    return Injector().get(MeloTTS)
+# @model_loader(lazy=blackboxConf.lazyloading)
+# def melotts_loader():
+#     from .melotts import MeloTTS
+#     return Injector().get(MeloTTS)
 
-@model_loader(lazy=blackboxConf.lazyloading)
-def cosyvoicetts_loader():
-    from .cosyvoicetts import CosyVoiceTTS
-    return Injector().get(CosyVoiceTTS)
+# @model_loader(lazy=blackboxConf.lazyloading)
+# def cosyvoicetts_loader():
+#     from .cosyvoicetts import CosyVoiceTTS
+#     return Injector().get(CosyVoiceTTS)
 
 @model_loader(lazy=blackboxConf.lazyloading)
 def tts_loader():
@@ -97,11 +97,6 @@ def chat_llama_loader():
     from .chat_llama import ChatLLaMA
     return Injector().get(ChatLLaMA)
 
-@model_loader(lazy=blackboxConf.lazyloading)
-def cosyvoicetts_loader():
-    from .cosyvoicetts import CosyVoiceTTS
-    return Injector().get(CosyVoiceTTS)
-
 @singleton
 class BlackboxFactory:
     models = {}
@@ -119,11 +114,11 @@ class BlackboxFactory:
         self.models["chroma_query"] = chroma_query_loader
         self.models["chroma_upsert"] = chroma_upsert_loader
         self.models["chroma_chat"] = chroma_chat_loader
-        self.models["melotts"] = melotts_loader
+        # self.models["melotts"] = melotts_loader
         self.models["vlms"] = vlms_loader
         self.models["chat"] = chat_loader
         self.models["chat_llama"] = chat_llama_loader
-        self.models["cosyvoicetts"] = cosyvoicetts_loader
+        # self.models["cosyvoicetts"] = cosyvoicetts_loader
 
     def __call__(self, *args, **kwargs):
         return self.processing(*args, **kwargs)
diff --git a/src/blackbox/chat.py b/src/blackbox/chat.py
index f0a2715..878fd48 100644
--- a/src/blackbox/chat.py
+++ b/src/blackbox/chat.py
@@ -92,7 +92,7 @@ class Chat(Blackbox):
         #user_presence_penalty = 0.8
 
         if user_model_url is None or user_model_url.isspace() or user_model_url == "":
-            user_model_url = "http://172.16.5.8:23333/v1/chat/completions"
+            user_model_url = "http://10.6.81.119:23333/v1/chat/completions"
 
         if user_model_key is None or user_model_key.isspace() or user_model_key == "":
             user_model_key = "YOUR_API_KEY"
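The loaders above all go through model_loader(lazy=blackboxConf.lazyloading), whose definition is not shown in this diff. An illustrative sketch of that lazy-loading pattern, with hypothetical names rather than the project's actual implementation:

    def model_loader(lazy: bool = False):
        """Decorator: optionally defer the heavy import + Injector.get() in a loader."""
        def wrap(loader):
            if not lazy:
                instance = loader()          # eager: build the model at import time
                return lambda: instance
            cache = {}
            def lazy_loader():
                if "model" not in cache:     # first lookup triggers the import and build
                    cache["model"] = loader()
                return cache["model"]
            return lazy_loader
        return wrap
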
diff --git a/src/blackbox/chroma_query.py b/src/blackbox/chroma_query.py
index e85cabb..58e20ac 100755
--- a/src/blackbox/chroma_query.py
+++ b/src/blackbox/chroma_query.py
@@ -20,9 +20,9 @@ class ChromaQuery(Blackbox):
     def __init__(self, *args, **kwargs) -> None:
         # config = read_yaml(args[0])
         # load chromadb and embedding model
-        self.embedding_model_1 = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="/home/administrator/Workspace/Models/BAAI/bge-large-zh-v1.5", device = "cuda")
-        self.embedding_model_2 = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="/home/administrator/Workspace/Models/BAAI/bge-large-en-v1.5", device = "cuda")
-        self.client_1 = chromadb.HttpClient(host='172.16.5.8', port=7000)
+        self.embedding_model_1 = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="/home/gpu/Workspace/Models/BAAI/bge-large-zh-v1.5", device="cuda:1")
+        self.embedding_model_2 = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="/home/gpu/Workspace/Models/BAAI/bge-small-en-v1.5", device="cuda:1")
+        self.client_1 = chromadb.HttpClient(host='10.6.81.119', port=7000)
         # self.client_2 = chromadb.HttpClient(host='10.6.82.192', port=8000)
 
     def __call__(self, *args, **kwargs):
@@ -51,10 +51,10 @@ class ChromaQuery(Blackbox):
             return JSONResponse(content={"error": "question is required"}, status_code=status.HTTP_400_BAD_REQUEST)
 
         if chroma_embedding_model is None or chroma_embedding_model.isspace() or chroma_embedding_model == "":
-            chroma_embedding_model = "/home/administrator/Workspace/Models/BAAI/bge-large-zh-v1.5"
+            chroma_embedding_model = "/home/gpu/Workspace/Models/BAAI/bge-large-zh-v1.5"
 
         if chroma_host is None or chroma_host.isspace() or chroma_host == "":
-            chroma_host = "172.16.5.8"
+            chroma_host = "10.6.81.119"
 
         if chroma_port is None or chroma_port.isspace() or chroma_port == "":
             chroma_port = "7000"
@@ -66,17 +66,17 @@ class ChromaQuery(Blackbox):
             chroma_n_results = 3
 
         # load client and embedding model from init
-        if re.search(r"172.16.5.8", chroma_host) and re.search(r"7000", chroma_port):
+        if re.search(r"10\.6\.81\.119", chroma_host) and re.search(r"7000", chroma_port):
             client = self.client_1
         else:
             client = chromadb.HttpClient(host=chroma_host, port=chroma_port)
 
-        if re.search(r"/home/administrator/Workspace/Models/BAAI/bge-large-zh-v1.5", chroma_embedding_model):
+        if re.search(r"/home/gpu/Workspace/Models/BAAI/bge-large-zh-v1.5", chroma_embedding_model):
             embedding_model = self.embedding_model_1
-        elif re.search(r"/home/administrator/Workspace/Models/BAAI/bge-large-en-v1.5", chroma_embedding_model):
+        elif re.search(r"/home/gpu/Workspace/Models/BAAI/bge-small-en-v1.5", chroma_embedding_model):
             embedding_model = self.embedding_model_2
         else:
-            embedding_model = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=chroma_embedding_model, device = "cuda")
+            embedding_model = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=chroma_embedding_model, device="cuda:1")
 
         # load collection
         collection = client.get_collection(chroma_collection_id, embedding_function=embedding_model)
diff --git a/src/blackbox/g2e.py b/src/blackbox/g2e.py
index 416ab72..d192a78 100755
--- a/src/blackbox/g2e.py
+++ b/src/blackbox/g2e.py
@@ -23,7 +23,7 @@ class G2E(Blackbox):
         if context == None:
             context = []
         #url = 'http://120.196.116.194:48890/v1'
-        url = 'http://120.196.116.194:48892/v1'
+        url = 'http://10.6.81.119:23333/v1'
 
         background_prompt = '''KOMBUKIKI is a tea beverage. Target audience - Age: 20-35; Gender: female; Location: tier-1 and tier-2 cities; Occupation: affluent middle class, urban white-collar workers; Income: mid-to-high, with real spending power; Interests: health-conscious, keeps an exercise habit
@@ -73,11 +73,11 @@ class G2E(Blackbox):
         response = client.chat.completions.create(
             model=model_name,
             messages=messages,
-            temperature=0.8,
-            top_p=0.8,
-            frequency_penalty=0.5,
-            presence_penalty=0.8,
-            stop=100
+            temperature=0.8,
+            top_p=0.8,
+            frequency_penalty=0.5,
+            presence_penalty=0.8,
+            stop="100"
         )
 
         fastchat_content = response.choices[0].message.content
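The sampling parameters in the hunk above stay numeric because OpenAI-compatible clients reject strings there, and stop expects a stop sequence (string or list of strings) rather than a token count. A minimal sketch against the same endpoint; the model id is a placeholder:

    from openai import OpenAI

    client = OpenAI(base_url="http://10.6.81.119:23333/v1", api_key="YOUR_API_KEY")
    response = client.chat.completions.create(
        model="internlm2-chat-7b",   # placeholder; query the server for the real id
        messages=[{"role": "user", "content": "Introduce KOMBUKIKI in one sentence."}],
        temperature=0.8,             # floats, not "0.8"
        top_p=0.8,
        frequency_penalty=0.5,
        presence_penalty=0.8,
        stop=["100"],                # stop sequences are strings
    )
    print(response.choices[0].message.content)
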
diff --git a/src/blackbox/tts.py b/src/blackbox/tts.py
index f05c082..0449612 100644
--- a/src/blackbox/tts.py
+++ b/src/blackbox/tts.py
@@ -2,27 +2,154 @@ import io
 import time
 from ntpath import join
 
+import requests
 from fastapi import Request, Response, status
 from fastapi.responses import JSONResponse
 from .blackbox import Blackbox
 from ..tts.tts_service import TTService
+from ..configuration import MeloConf
+from ..configuration import CosyVoiceConf
+from injector import inject
 from injector import singleton
 
+import sys, os
+sys.path.append('/home/gpu/Workspace/CosyVoice')
+from cosyvoice.cli.cosyvoice import CosyVoice
+from cosyvoice.utils.file_utils import load_wav
+
+import soundfile
+import pyloudnorm as pyln
+from melo.api import TTS as MELOTTS
+
+from ..log.logging_time import logging_time
+import logging
+logger = logging.getLogger(__name__)
+
 @singleton
 class TTS(Blackbox):
+    melo_mode: str
+    melo_url: str
+    melo_speed: int
+    melo_device: str
+    melo_language: str
+    melo_speaker: str
+
+    cosyvoice_mode: str
+    cosyvoice_url: str
+    cosyvoice_speed: int
+    cosyvoice_device: str
+    cosyvoice_language: str
+    cosyvoice_speaker: str
+
+    @logging_time(logger=logger)
+    def melo_model_init(self, melo_config: MeloConf) -> None:
+        self.melo_speed = melo_config.speed
+        self.melo_device = melo_config.device
+        self.melo_language = melo_config.language
+        self.melo_speaker = melo_config.speaker
+        self.melo_url = ''
+        self.melo_mode = melo_config.mode
+        self.melotts = None
+        self.speaker_ids = None
+        if self.melo_mode == 'local':
+            self.melotts = MELOTTS(language=self.melo_language, device=self.melo_device)
+            self.speaker_ids = self.melotts.hps.data.spk2id
+        else:
+            self.melo_url = melo_config.url
+        logger.info('#### Initializing MeloTTS Service in ' + self.melo_device + ' mode...')
+
+    @logging_time(logger=logger)
+    def cosyvoice_model_init(self, cosyvoice_config: CosyVoiceConf) -> None:
+        self.cosyvoice_speed = cosyvoice_config.speed
+        self.cosyvoice_device = cosyvoice_config.device
+        self.cosyvoice_language = cosyvoice_config.language
+        self.cosyvoice_speaker = cosyvoice_config.speaker
+        self.cosyvoice_url = ''
+        self.cosyvoice_mode = cosyvoice_config.mode
+        self.cosyvoicetts = None
+        # os.environ['CUDA_VISIBLE_DEVICES'] = str(cosyvoice_config.device)
+        if self.cosyvoice_mode == 'local':
+            self.cosyvoicetts = CosyVoice('/home/gpu/Workspace/Models/CosyVoice/pretrained_models/CosyVoice-300M')
+        else:
+            self.cosyvoice_url = cosyvoice_config.url
+        logger.info('#### Initializing CosyVoiceTTS Service in cuda:' + str(self.cosyvoice_device) + ' mode...')
+
+    @inject
+    def __init__(self, melo_config: MeloConf, cosyvoice_config: CosyVoiceConf, settings: dict) -> None:
+        self.tts_service = TTService("yunfeineo")
+        self.melo_model_init(melo_config)
+        self.cosyvoice_model_init(cosyvoice_config)
 
-    def __init__(self, *args, **kwargs) -> None:
-        self.tts_service = TTService("catmaid")
     def __call__(self, *args, **kwargs):
         return self.processing(*args, **kwargs)
 
-    def processing(self, *args, **kwargs) -> io.BytesIO:
+    @logging_time(logger=logger)
+    def processing(self, *args, settings: dict) -> bytes:
+
+        print("\nChat Settings: ", settings)
+        if settings is None:
+            settings = {}
+        user_model_name = settings.get("tts_model_name")
+        print(f"tts_model_name: {user_model_name}")
+
         text = args[0]
         current_time = time.time()
-        audio = self.tts_service.read(text)
-        print("#### TTS Service consume : ", (time.time()-current_time))
-        return audio
+
+        if user_model_name == 'melotts':
+            if self.melo_mode == 'local':
+                audio = self.melotts.tts_to_file(text, self.speaker_ids[self.melo_speaker], speed=self.melo_speed)
+                f = io.BytesIO()
+                soundfile.write(f, audio, 44100, format='wav')
+                f.seek(0)
+
+                # Read the audio data from the buffer
+                data, rate = soundfile.read(f, dtype='float32')
+
+                # Peak normalization
+                peak_normalized_audio = pyln.normalize.peak(data, -1.0)
+
+                # Integrated loudness normalization
+                meter = pyln.Meter(rate)
+                loudness = meter.integrated_loudness(peak_normalized_audio)
+                loudness_normalized_audio = pyln.normalize.loudness(peak_normalized_audio, loudness, -12.0)
+
+                # Write the loudness normalized audio to an in-memory buffer
+                normalized_audio_buffer = io.BytesIO()
+                soundfile.write(normalized_audio_buffer, loudness_normalized_audio, rate, format='wav')
+                normalized_audio_buffer.seek(0)
+
+                print("#### MeloTTS Service consume - local : ", (time.time() - current_time))
+                return normalized_audio_buffer.read()
+
+            else:
+                message = {
+                    "text": text
+                }
+                response = requests.post(self.melo_url, json=message)
+                print("#### MeloTTS Service consume - docker : ", (time.time() - current_time))
+                return response.content
+
+        elif user_model_name == 'cosyvoicetts':
+            if self.cosyvoice_mode == 'local':
+                audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language)
+                f = io.BytesIO()
+                soundfile.write(f, audio['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')
+                f.seek(0)
+                print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
+                return f.read()
+            else:
+                message = {
+                    "text": text
+                }
+                response = requests.post(self.cosyvoice_url, json=message)
+                print("#### CosyVoiceTTS Service consume - docker : ", (time.time() - current_time))
+                return response.content
+        else:
+            audio = self.tts_service.read(text)
+            print("#### TTS Service consume : ", (time.time() - current_time))
+            return audio.read()
 
     def valid(self, *args, **kwargs) -> bool:
         text = args[0]
@@ -31,10 +158,12 @@ class TTS(Blackbox):
     async def fast_api_handler(self, request: Request) -> Response:
         try:
             data = await request.json()
+            print(f"data: {data}")
         except:
             return JSONResponse(content={"error": "json parse error"}, status_code=status.HTTP_400_BAD_REQUEST)
         text = data.get("text")
+        setting = data.get("settings")
         if text is None:
             return JSONResponse(content={"error": "text is required"}, status_code=status.HTTP_400_BAD_REQUEST)
-        by = self.processing(text)
-        return Response(content=by.read(), media_type="audio/wav", headers={"Content-Disposition": "attachment; filename=audio.wav"})
\ No newline at end of file
+        by = self.processing(text, settings=setting)
+        return Response(content=by, media_type="audio/wav", headers={"Content-Disposition": "attachment; filename=audio.wav"})
\ No newline at end of file
diff --git a/src/blackbox/tts_bak.py b/src/blackbox/tts_bak.py
new file mode 100644
index 0000000..583e525
--- /dev/null
+++ b/src/blackbox/tts_bak.py
@@ -0,0 +1,40 @@
+import io
+import time
+from ntpath import join
+
+from fastapi import Request, Response, status
+from fastapi.responses import JSONResponse
+from .blackbox import Blackbox
+from ..tts.tts_service import TTService
+from injector import singleton
+
+@singleton
+class TTS(Blackbox):
+
+    def __init__(self, *args, **kwargs) -> None:
+        self.tts_service = TTService("yunfeineo")
+
+    def __call__(self, *args, **kwargs):
+        return self.processing(*args, **kwargs)
+
+    def processing(self, *args, **kwargs) -> io.BytesIO:
+        text = args[0]
+        current_time = time.time()
+        audio = self.tts_service.read(text)
+        print("#### TTS Service consume : ", (time.time()-current_time))
+        return audio
+
+    def valid(self, *args, **kwargs) -> bool:
+        text = args[0]
+        return isinstance(text, str)
+
+    async def fast_api_handler(self, request: Request) -> Response:
+        try:
+            data = await request.json()
+        except:
+            return JSONResponse(content={"error": "json parse error"}, status_code=status.HTTP_400_BAD_REQUEST)
+        text = data.get("text")
+        if text is None:
+            return JSONResponse(content={"error": "text is required"}, status_code=status.HTTP_400_BAD_REQUEST)
+        by = self.processing(text)
+        return Response(content=by.read(), media_type="audio/wav", headers={"Content-Disposition": "attachment; filename=audio.wav"})
\ No newline at end of file
diff --git a/src/blackbox/vlms.py b/src/blackbox/vlms.py
index cd02ce8..2083df4 100644
--- a/src/blackbox/vlms.py
+++ b/src/blackbox/vlms.py
@@ -1,11 +1,19 @@
 from fastapi import Request, Response, status
 from fastapi.responses import JSONResponse
-from .blackbox import Blackbox
+from injector import singleton, inject
 from typing import Optional
 
+from .blackbox import Blackbox
+from ..log.logging_time import logging_time
+from .chroma_query import ChromaQuery
+from ..configuration import VLMConf
+
 import requests
 import base64
+import io
+from PIL import Image
+from lmdeploy.serve.openai.api_client import APIClient
 
 def is_base64(value) -> bool:
     try:
@@ -14,9 +22,16 @@ def is_base64(value) -> bool:
     except Exception:
         return False
 
-
+@singleton
 class VLMS(Blackbox):
 
+    @inject
+    def __init__(self, vlm_config: VLMConf):
+        # The Chroma database was initially set up for RAG for the vision model.
+        # It could be extended to a history store.
+        # self.chroma_query = chroma_query
+        self.url = vlm_config.url
+
     def __call__(self, *args, **kwargs):
         return self.processing(*args, **kwargs)
 
@@ -26,6 +41,7 @@ class VLMS(Blackbox):
 
     def processing(self, prompt, images, model_name: Optional[str] = None) -> str:
 
+        # Currently only Qwen-VL-style models are routed here
         if model_name == "Qwen-VL-Chat":
             model_name = "infer-qwen-vl"
         elif model_name == "llava-llama-3-8b-v1_1-transformers":
@@ -33,19 +49,61 @@ class VLMS(Blackbox):
         else:
             model_name = "infer-qwen-vl"
 
-        url = 'http://120.196.116.194:48894/' + model_name + '/'
+
+        ## AutoLoad Model
+        # url = 'http://10.6.80.87:8000/' + model_name + '/'
 
         if is_base64(images):
             images_data = images
         else:
+
+            # print("{}Type of image data in form {}".format('#'*20, type(images)))
+            # print("{}Type of image data in form {}".format('#'*20, type(images.file)))
+            # byte_stream = io.BytesIO(images.read())
+            # print("{}Type of image data in form {}".format('#'*20, type(byte_stream)))
+            # roiImg = Image.open(byte_stream)
+            # print("{}Successful {}".format('#'*20, type(roiImg)))
+            # return str(type(byte_stream))
+            # images_data = base64.b64encode(byte_stream)
+
             with open(images, "rb") as img_file:
-                images_data = str(base64.b64encode(img_file.read()), 'utf-8')
+                # images_data = str(base64.b64encode(img_file.read()), 'utf-8')
+                # decode() keeps the payload a str so it can be embedded in the data URL below
+                images_data = base64.b64encode(img_file.read()).decode('utf-8')
 
-        data_input = {'model': model_name, 'prompt': prompt, 'img_data': images_data}
+        # data_input = {'model': model_name, 'prompt': prompt, 'img_data': images_data}
 
-        data = requests.post(url, json=data_input)
+        # data = requests.post(url, json=data_input)
+        # 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg'
 
-        return data.text
+        ## LMDeploy
+        api_client = APIClient(self.url)
+        model_name = api_client.available_models[0]
+        messages = [{
+            'role': 'user',
+            'content': [{
+                'type': 'text',
+                'text': prompt,
+            }, {
+                'type': 'image_url',
+                'image_url': {
+                    'url': f"data:image/jpeg;base64,{images_data}",
+                    # './val_data/image_5.jpg',
+                },
+            }]
+        }]
+
+        responses = ''
+        for i, item in enumerate(api_client.chat_completions_v1(model=model_name,
+                                                                messages=messages)):  # ,stream = True
+            # print(item["choices"][0]["message"]['content'])
+            responses += item["choices"][0]["message"]['content']
+
+        return responses
+
+        # return data.text
 
     async def fast_api_handler(self, request: Request) -> Response:
         try:
@@ -63,5 +121,6 @@ class VLMS(Blackbox):
         if model_name is None or model_name.isspace():
             model_name = "Qwen-VL-Chat"
 
-        jsonresp = str(JSONResponse(content={"response": self.processing(prompt, img_data, model_name)}).body, "utf-8")
-        return JSONResponse(content={"response": jsonresp}, status_code=status.HTTP_200_OK)
\ No newline at end of file
+        # jsonresp = str(JSONResponse(content={"response": self.processing(prompt, img_data, model_name)}).body, "utf-8")
+
+        return JSONResponse(content={"response": self.processing(prompt, img_data, model_name)}, status_code=status.HTTP_200_OK)
\ No newline at end of file
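Why the .decode('utf-8') fix above matters: base64.b64encode returns bytes, so interpolating it into the data URL would embed a literal b'...' prefix and corrupt the payload. A standalone sketch of the LMDeploy flow used in processing(); the URL and image path are illustrative:

    import base64
    from lmdeploy.serve.openai.api_client import APIClient

    with open("tiger.jpeg", "rb") as img:
        b64 = base64.b64encode(img.read()).decode("utf-8")  # str, not bytes

    api_client = APIClient("http://10.6.81.119:23333")      # vlms.url from the config
    model_name = api_client.available_models[0]
    messages = [{"role": "user", "content": [
        {"type": "text", "text": "Describe this image."},
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
    ]}]
    text = ""
    for item in api_client.chat_completions_v1(model=model_name, messages=messages):
        text += item["choices"][0]["message"]["content"]
    print(text)
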
diff --git a/src/configuration.py b/src/configuration.py
index 96d57aa..651cb95 100644
--- a/src/configuration.py
+++ b/src/configuration.py
@@ -81,6 +81,23 @@ class CosyVoiceConf():
         self.device = config.get("cosyvoicetts.device")
         self.language = config.get("cosyvoicetts.language")
         self.speaker = config.get("cosyvoicetts.speaker")
+
+class SenseVoiceConf():
+    mode: str
+    url: str
+    speed: int
+    device: str
+    language: str
+    speaker: str
+
+    @inject
+    def __init__(self, config: Configuration) -> None:
+        self.mode = config.get("sensevoiceasr.mode")
+        self.url = config.get("sensevoiceasr.url")
+        self.speed = config.get("sensevoiceasr.speed")
+        self.device = config.get("sensevoiceasr.device")
+        self.language = config.get("sensevoiceasr.language")
+        self.speaker = config.get("sensevoiceasr.speaker")
 
 # 'CRITICAL': CRITICAL,
 # 'FATAL': FATAL,
@@ -110,6 +127,7 @@ class LogConf():
         self.filename = config.get("log.filename")
         self.time_format = config.get("log.time_format", default=DEFAULT_TIME_FORMAT)
 
+
 @singleton
 class EnvConf():
     version: str
@@ -128,4 +146,11 @@ class BlackboxConf():
 
     @inject
     def __init__(self, config: Configuration) -> None:
-        self.lazyloading = bool(config.get("blackbox.lazyloading", default=False))
\ No newline at end of file
+        self.lazyloading = bool(config.get("blackbox.lazyloading", default=False))
+
+@singleton
+class VLMConf():
+
+    @inject
+    def __init__(self, config: Configuration) -> None:
+        self.url = config.get("vlms.url")
diff --git a/src/tts/tts_service.py b/src/tts/tts_service.py
index 41ea663..f768f1a 100644
--- a/src/tts/tts_service.py
+++ b/src/tts/tts_service.py
@@ -21,6 +21,60 @@ logging.basicConfig(level=logging.INFO)
 dirbaspath = __file__.split("\\")[1:-1]
 dirbaspath = "/home/gpu/Workspace/jarvis-models/src/tts" + "/".join(dirbaspath)
 config = {
+    'ayaka': {
+        'cfg': dirbaspath + '/models/ayaka.json',
+        'model': dirbaspath + '/models/ayaka_167k.pth',
+        'char': 'character_ayaka',
+        'speed': 1
+    },
+    'catmix': {
+        'cfg': dirbaspath + '/models/catmix.json',
+        'model': dirbaspath + '/models/catmix_107k.pth',
+        'char': 'character_catmix',
+        'speed': 1.1
+    },
+    'noelle': {
+        'cfg': dirbaspath + '/models/noelle.json',
+        'model': dirbaspath + '/models/noelle_337k.pth',
+        'char': 'character_noelle',
+        'speed': 1.1
+    },
+    'miko': {
+        'cfg': dirbaspath + '/models/miko.json',
+        'model': dirbaspath + '/models/miko_139k.pth',
+        'char': 'character_miko',
+        'speed': 1.1
+    },
+    'nahida': {
+        'cfg': dirbaspath + '/models/nahida.json',
+        'model': dirbaspath + '/models/nahida_129k.pth',
+        'char': 'character_nahida',
+        'speed': 1.1
+    },
+    'ningguang': {
+        'cfg': dirbaspath + '/models/ningguang.json',
+        'model': dirbaspath + '/models/ningguang_179k.pth',
+        'char': 'character_ningguang',
+        'speed': 1.1
+    },
+    'yoimiya': {
+        'cfg': dirbaspath + '/models/yoimiya.json',
+        'model': dirbaspath + '/models/yoimiya_102k.pth',
+        'char': 'character_yoimiya',
+        'speed': 1.1
+    },
+    'yunfeineo': {
+        'cfg': dirbaspath + '/models/yunfeineo.json',
+        'model': dirbaspath + '/models/yunfeineo_25k.pth',
+        'char': 'character_yunfeineo',
+        'speed': 1.1
+    },
+    'zhongli': {
+        'cfg': dirbaspath + '/models/zhongli.json',
+        'model': dirbaspath + '/models/zhongli_44k.pth',
+        'char': 'character_zhongli',
+        'speed': 1.1
+    },
     'paimon': {
         'cfg': dirbaspath + '/models/paimon6k.json',
         'model': dirbaspath + '/models/paimon6k_390k.pth',
@@ -28,7 +82,7 @@ config = {
         'speed': 1
     },
     'yunfei': {
-        'cfg': dirbaspath + '/tts/models/yunfeimix2.json',
+        'cfg': dirbaspath + '/models/yunfeimix2.json',
         'model': dirbaspath + '/models/yunfeimix2_53k.pth',
         'char': 'character_yunfei',
         'speed': 1.1
diff --git a/test_data/voice/2food.wav b/test_data/voice/2food.wav
new file mode 100644
index 0000000..5e3b84c
Binary files /dev/null and b/test_data/voice/2food.wav differ
diff --git a/test_data/voice/2forget.wav b/test_data/voice/2forget.wav
new file mode 100644
index 0000000..a65cc4f
Binary files /dev/null and b/test_data/voice/2forget.wav differ
diff --git a/test_data/voice/2nihao.wav b/test_data/voice/2nihao.wav
new file mode 100644
index 0000000..43f9ea2
Binary files /dev/null and b/test_data/voice/2nihao.wav differ
diff --git a/test_data/voice/2play.wav b/test_data/voice/2play.wav
new file mode 100644
index 0000000..2b4f836
Binary files /dev/null and b/test_data/voice/2play.wav differ
diff --git a/test_data/voice/2weather.wav b/test_data/voice/2weather.wav
new file mode 100644
index 0000000..d048238
Binary files /dev/null and b/test_data/voice/2weather.wav differ
diff --git a/test_data/voice/food.wav b/test_data/voice/food.wav
new file mode 100644
index 0000000..25a82ed
Binary files /dev/null and b/test_data/voice/food.wav differ
diff --git a/test_data/voice/forget.wav b/test_data/voice/forget.wav
new file mode 100644
index 0000000..73f5888
Binary files /dev/null and b/test_data/voice/forget.wav differ
diff --git a/test_data/voice/hello.wav b/test_data/voice/hello.wav
new file mode 100644
index 0000000..ed13884
Binary files /dev/null and b/test_data/voice/hello.wav differ
diff --git a/test_data/voice/nihao.wav b/test_data/voice/nihao.wav
new file mode 100644
index 0000000..f671987
Binary files /dev/null and b/test_data/voice/nihao.wav differ
diff --git a/test_data/voice/play.wav b/test_data/voice/play.wav
new file mode 100644
index 0000000..661ca76
Binary files /dev/null and b/test_data/voice/play.wav differ
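The new test_data/voice/*.wav fixtures can drive a quick smoke test of the reworked ASR and TTS endpoints. As with the earlier ASR sketch, both route paths are assumptions, since route registration is not part of this diff:

    import json
    import requests

    BASE = "http://localhost:8000"  # server started by main.py

    # TTS: "tts_model_name" picks melotts / cosyvoicetts / the default TTService voice.
    resp = requests.post(f"{BASE}/tts",  # assumed mount point for TTS.fast_api_handler
                         json={"text": "你好", "settings": {"tts_model_name": "melotts"}})
    with open("out.wav", "wb") as f:
        f.write(resp.content)

    # ASR: round-trip one of the newly added fixtures.
    with open("test_data/voice/2weather.wav", "rb") as f:
        resp = requests.post(f"{BASE}/asr", files={"audio": f},
                             data={"settings": json.dumps({"asr_model_name": "sensevoice"})})
    print(resp.json())
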