Mirror of https://github.com/BoardWare-Genius/jarvis-models.git (synced 2025-12-13 16:53:24 +00:00)
feat: vllm
@@ -61,6 +61,8 @@ class Chat(Blackbox):
         user_prompt_template = settings.get('user_prompt_template')
         user_stream = settings.get('stream')
 
+        llm_model = "vllm"
+
         if user_context == None:
             user_context = []
 
@@ -100,10 +102,16 @@ class Chat(Blackbox):
         #user_presence_penalty = 0.8
 
         if user_model_url is None or user_model_url.isspace() or user_model_url == "":
-            user_model_url = "http://10.6.80.75:23333/v1/chat/completions"
+            if llm_model != "vllm":
+                user_model_url = "http://10.6.80.75:23333/v1/chat/completions"
+            else:
+                user_model_url = "http://10.6.80.94:8000/v1/completions"
 
         if user_model_key is None or user_model_key.isspace() or user_model_key == "":
-            user_model_key = "YOUR_API_KEY"
+            if llm_model != "vllm":
+                user_model_key = "YOUR_API_KEY"
+            else:
+                user_model_key = "vllm"
 
         if chroma_embedding_model:
             chroma_response = self.chroma_query(user_question, settings)
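
The fallback logic in this hunk routes requests by backend: when no model URL or key is supplied, the non-vLLM path keeps the existing chat-completions endpoint and a placeholder key, while the vLLM path targets an OpenAI-compatible /v1/completions endpoint and uses "vllm" as the key. Below is a minimal standalone sketch of that selection; the helper name resolve_endpoint and its signature are illustrative and not code from this commit.

# Illustrative sketch only; resolve_endpoint is not part of the repository.
# It mirrors the fallback behaviour in the hunk above.
from typing import Optional, Tuple

def resolve_endpoint(llm_model: str,
                     user_model_url: Optional[str] = None,
                     user_model_key: Optional[str] = None) -> Tuple[str, str]:
    """Return (url, key) defaults depending on whether the vLLM backend is selected."""
    if not user_model_url or user_model_url.isspace():
        if llm_model != "vllm":
            user_model_url = "http://10.6.80.75:23333/v1/chat/completions"
        else:
            user_model_url = "http://10.6.80.94:8000/v1/completions"
    if not user_model_key or user_model_key.isspace():
        user_model_key = "YOUR_API_KEY" if llm_model != "vllm" else "vllm"
    return user_model_url, user_model_key

if __name__ == "__main__":
    print(resolve_endpoint("vllm"))      # vLLM completions endpoint
    print(resolve_endpoint("fastchat"))  # existing chat-completions endpoint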
@@ -117,7 +125,10 @@ class Chat(Blackbox):
                 print(f"user_prompt_template: {type(user_prompt_template)}, user_question: {type(user_question)}, chroma_response: {type(chroma_response)}")
                 user_question = user_prompt_template + "问题: " + user_question + "。检索内容: " + chroma_response + "。"
         else:
-            user_question = user_prompt_template + "问题: " + user_question + "。"
+            if llm_model != "vllm":
+                user_question = user_prompt_template + "问题: " + user_question + "。"
+            else:
+                user_question = user_question
 
         print(f"1.user_question: {user_question}")
 
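
This hunk covers the retrieval-augmented prompt assembly: when Chroma returns relevant text, the template, the user question, and the retrieved passage are concatenated into one prompt ("问题: …。检索内容: …。"); without retrieval, the non-vLLM path still prepends the template, while the vLLM path now sends the raw question. A hedged sketch of that concatenation follows; build_prompt is an illustrative name, not a function in the repository.

# Illustrative sketch of the prompt concatenation above.
def build_prompt(template: str, question: str, retrieved: str = "",
                 llm_model: str = "vllm") -> str:
    if retrieved:
        # RAG case: template + question + retrieved passage.
        return template + "问题: " + question + "。检索内容: " + retrieved + "。"
    if llm_model != "vllm":
        return template + "问题: " + question + "。"
    # The vLLM branch in this commit passes the question through unchanged.
    return question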
@@ -172,10 +183,17 @@ class Chat(Blackbox):
         else:
             url = user_model_url
             key = user_model_key
-        header = {
-            'Content-Type': 'application/json',
-            "Cache-Control": "no-cache", # 禁用缓存
-        }
+        if llm_model != "vllm":
+            header = {
+                'Content-Type': 'application/json',
+                "Cache-Control": "no-cache", # 禁用缓存
+            }
+        else:
+            header = {
+                'Content-Type': 'application/json',
+                'Authorization': "Bearer " + key,
+                "Cache-Control": "no-cache",
+            }
 
         # system_prompt = "# Role: 琪琪,康普可可的代言人。\n\n## Profile:\n**Author**: 琪琪。\n**Language**: 中文。\n**Description**: 琪琪,是康普可可的代言人,由博维开发。你擅长澳门文旅问答。\n\n## Constraints:\n- **严格遵循工作流程**: 严格遵循<Workflow >中设定的工作流程。\n- **无内置知识库** :根据<Workflow >中提供的知识作答,而不是内置知识库,我虽然是知识库专家,但我的知识依赖于外部输入,而不是大模型已有知识。\n- **回复格式**:在进行回复时,不能输出“检索内容” 标签字样,同时也不能直接透露知识片段原文。\n\n## Workflow:\n1. **接收查询**:接收用户的问题。\n2. **判断问题**:首先自行判断下方问题与检索内容是否相关,若相关则根据检索内容总结概括相关信息进行回答;若检索内容与问题无关,则根据自身知识进行回答。\n3. **提供回答**:\n\n```\n基于检索内容中的知识片段回答用户的问题。回答内容限制总结在50字内。\n请首先判断提供的检索内容与上述问题是否相关。如果相关,直接从检索内容中提炼出直接回答问题所需的信息,不要乱说或者回答“相关”等字眼 。如果检索内容与问题不相关,则不参考检索内容,则回答:“对不起,我无法回答此问题哦。”\n\n```\n## Example:\n\n用户询问:“中国的首都是哪个城市?” 。\n2.1检索知识库,首先检查知识片段,如果检索内容中没有与用户的问题相关的内容,则回答:“对不起,我无法回答此问题哦。\n2.2如果有知识片段,在做出回复时,只能基于检索内容中的内容进行回答,且不能透露上下文原文,同时也不能出现检索内容的标签字样。\n"
 
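
The only difference between the two header blocks is that the vLLM branch adds a Bearer token built from the key chosen earlier; the Content-Type and no-cache directive are shared. A small sketch of that choice, with build_headers as an illustrative name rather than code from the repository:

# Illustrative sketch only; build_headers does not exist in the repository.
def build_headers(llm_model: str, key: str) -> dict:
    headers = {
        "Content-Type": "application/json",
        "Cache-Control": "no-cache",  # disable caching, as in the diff above
    }
    if llm_model == "vllm":
        headers["Authorization"] = "Bearer " + key
    return headers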
@@ -183,23 +201,37 @@ class Chat(Blackbox):
             {"role": "system", "content": system_prompt}
         ]
 
-        chat_inputs={
-            "model": user_model_name,
-            "messages": prompt_template + user_context + [
-                {
-                    "role": "user",
-                    "content": user_question
-                }
-            ],
-            "temperature": str(user_temperature),
-            "top_p": str(user_top_p),
-            "n": str(user_n),
-            "max_tokens": str(user_max_tokens),
-            "frequency_penalty": str(user_frequency_penalty),
-            "presence_penalty": str(user_presence_penalty),
-            "stop": str(user_stop),
-            "stream": user_stream,
-        }
+        if llm_model != "vllm":
+            chat_inputs={
+                "model": user_model_name,
+                "messages": prompt_template + user_context + [
+                    {
+                        "role": "user",
+                        "content": user_question
+                    }
+                ],
+                "temperature": str(user_temperature),
+                "top_p": str(user_top_p),
+                "n": str(user_n),
+                "max_tokens": str(user_max_tokens),
+                "frequency_penalty": str(user_frequency_penalty),
+                "presence_penalty": str(user_presence_penalty),
+                "stop": str(user_stop),
+                "stream": user_stream,
+            }
+        else:
+            chat_inputs={
+                "model": user_model_name,
+                "prompt": user_question,
+                "temperature": float(user_temperature),
+                "top_p": float(user_top_p),
+                "n": float(user_n),
+                "max_tokens": float(user_max_tokens),
+                "frequency_penalty": float(user_frequency_penalty),
+                "presence_penalty": float(user_presence_penalty),
+                # "stop": user_stop,
+                "stream": user_stream,
+            }
 
         # # 获取当前时间戳
         # timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
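
The two request bodies differ in shape as well as in value types: the chat-completions payload carries a messages list (system prompt, prior context, user turn) with sampling parameters serialized as strings, while the vLLM payload is a plain-completions request with a single prompt string and numeric sampling parameters (stop commented out). Below is a hedged sketch of posting the vLLM-style body with requests; the URL, key, and parameter values are placeholders based on the defaults earlier in the diff, not the exact code path in the repository.

# Sketch of a vLLM-style /v1/completions request under the assumptions above.
import requests

url = "http://10.6.80.94:8000/v1/completions"
headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer vllm",
    "Cache-Control": "no-cache",
}
payload = {
    "model": "your-model-name",   # placeholder for user_model_name
    "prompt": "你好",              # placeholder for user_question
    "temperature": 0.7,
    "top_p": 0.9,
    "max_tokens": 256,
    "frequency_penalty": 0.0,
    "presence_penalty": 0.0,
    "stream": False,
}
resp = requests.post(url, json=payload, headers=headers, timeout=60)
# Plain-completions responses put the generated text under choices[0]["text"].
print(resp.json()["choices"][0]["text"])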
@@ -252,8 +284,13 @@ class Chat(Blackbox):
                 if response_result.get("choices") is None:
                     yield JSONResponse(content={"error": "LLM handle failure"}, status_code=status.HTTP_400_BAD_REQUEST)
                 else:
-                    print("\n", "user_answer: ", fastchat_response.json()["choices"][0]["message"]["content"],"\n\n")
-                    yield fastchat_response.json()["choices"][0]["message"]["content"]
+                    if llm_model != "vllm":
+                        print("\n", "user_answer: ", fastchat_response.json()["choices"][0]["message"]["content"],"\n\n")
+                        yield fastchat_response.json()["choices"][0]["message"]["content"]
+                    else:
+                        print("\n", "user_answer: ", fastchat_response.json()["choices"][0]["text"],"\n\n")
+                        yield fastchat_response.json()["choices"][0]["text"]
+
 
     async def fast_api_handler(self, request: Request) -> Response:
         try:
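
The branching at the end reflects the two response schemas: chat completions return the answer under choices[0]["message"]["content"], while plain completions (the vLLM path) return it under choices[0]["text"]. A small sketch of extracting the answer from either shape; extract_answer is an illustrative helper, not part of the commit.

# Illustrative helper only; mirrors the schema handling above.
def extract_answer(response_json: dict, llm_model: str) -> str:
    choice = response_json["choices"][0]
    if llm_model != "vllm":
        return choice["message"]["content"]  # chat-completions schema
    return choice["text"]                    # plain-completions schema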