diff --git a/src/blackbox/tts.py b/src/blackbox/tts.py
index 7446190..3228daf 100644
--- a/src/blackbox/tts.py
+++ b/src/blackbox/tts.py
@@ -17,7 +17,7 @@ import sys,os
 sys.path.append('/Workspace/CosyVoice')
 sys.path.append('/Workspace/CosyVoice/third_party/Matcha-TTS')
 from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
-# from cosyvoice.utils.file_utils import load_wav, speed_change
+from cosyvoice.utils.file_utils import load_wav#, speed_change
 
 import soundfile as sf
 import pyloudnorm as pyln
@@ -33,6 +33,7 @@ import numpy as np
 
 from pydub import AudioSegment
 import subprocess
+import re
 
 def set_all_random_seed(seed):
     random.seed(seed)
@@ -107,12 +108,13 @@ class TTS(Blackbox):
         self.cosyvoice_url = ''
         self.cosyvoice_mode = cosyvoice_config.mode
         self.cosyvoicetts = None
+        self.prompt_speech_16k = None
         # os.environ['CUDA_VISIBLE_DEVICES'] = str(cosyvoice_config.device)
         if self.cosyvoice_mode == 'local':
             # self.cosyvoicetts = CosyVoice('/Workspace/Models/CosyVoice/pretrained_models/CosyVoice-300M')
-            self.cosyvoicetts = CosyVoice('/model/Voice/CosyVoice/pretrained_models/CosyVoice-300M')
-            # self.cosyvoicetts = CosyVoice2('/model/Voice/CosyVoice/pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False)
-
+            # self.cosyvoicetts = CosyVoice('/model/Voice/CosyVoice/pretrained_models/CosyVoice-300M')
+            self.cosyvoicetts = CosyVoice2('/model/Voice/CosyVoice/pretrained_models/CosyVoice2-0.5B', load_jit=True, load_onnx=False, load_trt=False)
+            self.prompt_speech_16k = load_wav('/Workspace/jarvis-models/Ricky-Wong-3-Mins.wav_0006003840_0006134080.wav', 16000)
             
         else:
             self.cosyvoice_url = cosyvoice_config.url
@@ -158,7 +160,22 @@ class TTS(Blackbox):
 
     def __call__(self, *args, **kwargs):
         return self.processing(*args, **kwargs)
+    
+    def filter_invalid_chars(self,text):
+        """Drop 'data:' markers, newline/CR/tab/space characters and ASCII letters from text; bytes input is first decoded as UTF-8."""
+        invalid_keywords = ["data:", "\n", "\r", "\t", " "]
+        
+        if isinstance(text, bytes):
+            text = text.decode('utf-8', errors='ignore')  # undecodable byte sequences are silently dropped
+        
+        for keyword in invalid_keywords:
+            text = text.replace(keyword, "")
+            
+        # Remove ASCII letters only; digits, punctuation and non-Latin text (e.g. Chinese) are left in place.
+        text = re.sub(r'[a-zA-Z]', '', text)
 
+        return text.strip()
+    
     @logging_time(logger=logger)
     def processing(self, *args, settings: dict) -> io.BytesIO:
 
@@ -233,13 +250,45 @@ class TTS(Blackbox):
                 if self.cosyvoice_mode == 'local':
                     set_all_random_seed(56056558)
                     print("*"*90)
-                    audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language, stream=True)
+                    # audio = self.cosyvoicetts.inference_sft(text, self.cosyvoice_language, stream=True)
+                    audio = self.cosyvoicetts.inference_instruct2(text, '用粤语说这句话', self.prompt_speech_16k, stream=False)
+                    # for i, j in enumerate(audio):
+                    #     f = io.BytesIO()
+                    #     sf.write(f, j['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')   
+                    #     f.seek(0)
+                    # print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
+                    # return f.read()
+                    # 打印 audio 的长度和内容结构
+                    # print(f"Total audio segments: {len(audio)}")
+                    # print(f"Audio data structure: {audio}")
+                    
+                    # 创建一个空的列表来存储所有音频段的 NumPy 数组
+                    all_audio_data = []
+                    
+                    # 遍历每一段音频并将它们存储到 all_audio_data 列表
                     for i, j in enumerate(audio):
-                        f = io.BytesIO()
-                        sf.write(f, j['tts_speech'].cpu().numpy().squeeze(0), 22050, format='wav')   
-                        f.seek(0)
+                        # print(f"Processing segment {i + 1}...")
+                        
+                        # 打印每段音频的信息，确保其正确
+                        # print(f"Segment {i + 1} shape: {j['tts_speech'].shape}")
+                        
+                        # 直接将音频数据转换成 NumPy 数组
+                        audio_data = j['tts_speech'].cpu().numpy()
+                        
+                        # 将每个段的音频数据添加到 all_audio_data 列表
+                        all_audio_data.append(audio_data[0])  # 取音频的第一个通道（假设为单声道）
+                    
+                    # 将所有音频段的 NumPy 数组合并成一个完整的音频数组
+                    combined_audio_data = np.concatenate(all_audio_data, axis=0)
+                    
+                    # 将合并后的音频数据写入到 BytesIO 中
+                    f = io.BytesIO()
+                    sf.write(f, combined_audio_data, 22050, format='wav')  # 22050 为采样率，可能需要根据实际情况调整
+                    f.seek(0)
+                    
+                    # 返回合并后的音频
                     print("#### CosyVoiceTTS Service consume - local : ", (time.time() - current_time))
-                    return f.read()
+                    return f.read()  # 返回最终合并后的音频数据
                 else:
                     message = {
                         "text": text
@@ -266,6 +315,7 @@ class TTS(Blackbox):
                     return response.content 
 
         elif user_model_name == 'sovitstts':
+            # text = self.filter_invalid_chars(text)
             if chroma_collection_id == 'kiki' or chroma_collection_id is None:
                 if self.sovits_mode == 'local':
                     set_all_random_seed(56056558)                   
@@ -288,7 +338,7 @@ class TTS(Blackbox):
                         "media_type": self.sovits_media_type,
                         "streaming_mode": self.sovits_streaming_mode
                     }
-                    if user_stream:
+                    if user_stream == True or str(user_stream).lower() == "true":
                         response = requests.get(self.sovits_url, params=message, stream=True)
                         print("#### SoVITS Service consume - docker : ", (time.time()-current_time))
                         return response
@@ -360,8 +410,10 @@ class TTS(Blackbox):
             return JSONResponse(content={"error": "text is required"}, status_code=status.HTTP_400_BAD_REQUEST)
         by = self.processing(text, settings=setting)
         # return Response(content=by, media_type="audio/wav", headers={"Content-Disposition": "attachment; filename=audio.wav"})
+        print(f"tts user_stream: {type(user_stream)}")
 
-        if user_stream:
+        if user_stream == True or str(user_stream).lower() == "true":
+            print(f"tts user_stream22: {user_stream}")
             if by.status_code == 200:
                 print("*"*90)
                 def audio_stream():
@@ -405,6 +457,7 @@ class TTS(Blackbox):
 
         else:
             wav_filename = os.path.join(self.audio_dir, 'audio.wav')
+            print("8"*90)
             with open(wav_filename, 'wb') as f:
                 f.write(by)
 
diff --git a/src/blackbox/vlms.py b/src/blackbox/vlms.py
index b2716e0..1a65fbf 100644
--- a/src/blackbox/vlms.py
+++ b/src/blackbox/vlms.py
@@ -13,9 +13,13 @@ import requests
 import base64
 import copy
 import ast
+
+import json
+
 import random
 from time import time
 
+
 import io 
 from PIL import Image
 from lmdeploy.serve.openai.api_client import APIClient
@@ -94,7 +98,6 @@ class VLMS(Blackbox):
             response: a string 
             history: a list
         """
-
         if settings:
             for k in settings:
                 if k not in self.settings:
@@ -315,14 +318,19 @@ class VLMS(Blackbox):
         ## TODO: add support for multiple images and support image in form-data format
         json_request = True
         try:
-            content_type = request.headers['content-type']
+            content_type = request.headers.get('content-type', '')
+            print(content_type)
             if content_type == 'application/json':
                 data = await request.json()
-            else:
+            elif 'multipart/form-data' in content_type:
                 data = await request.form()
-                json_request = False
+                json_request = False             
+            else:
+                body = await request.body()
+                data = json.loads(body.decode("utf-8"))
+
         except Exception as e:
-            return JSONResponse(content={"error": "json parse error"}, status_code=status.HTTP_400_BAD_REQUEST)
+            return JSONResponse(content={"error": "json parse error"}, status_code=status.HTTP_400_BAD_REQUEST)    
         
         model_name = data.get("model_name")
         prompt = data.get("prompt")
@@ -337,8 +345,8 @@ class VLMS(Blackbox):
         else:
             return JSONResponse(content={"error": "context format error, should be in format of list or Openai_format"}, status_code=status.HTTP_400_BAD_REQUEST)
         
-        if json_request:
-            img_data = data.get("img_data")    
+        if json_request or 'multipart/form-data' not in content_type:
+            img_data = data.get("img_data")
         else:
             img_data = await data.get("img_data").read()
             if settings: settings = ast.literal_eval(settings)