from fastapi import Request, Response, status
from fastapi.responses import JSONResponse
from .blackbox import Blackbox
import requests
import json
from injector import singleton


@singleton
class Fastchat(Blackbox):
    def __call__(self, *args, **kwargs):
        return self.processing(*args, **kwargs)

    def valid(self, *args, **kwargs) -> bool:
        data = args[0]
        return isinstance(data, list)

    # Available model_name values: Qwen1.5-14B-Chat, internlm2-chat-20b
    def processing(self, model_name, prompt, template, context: list,
                   temperature, top_p, top_k, n, max_tokens) -> str:
        if context is None:
            context = []
        url = 'http://120.196.116.194:48892/v1/chat/completions'
        # context may be an empty list or the user's conversation history, e.g.:
        # context = [
        #     {
        #         "role": "user",
        #         "content": "Core idea of intelligent agents"
        #     },
        #     {
        #         "role": "assistant",
        #         "content": "The core idea of an intelligent agent is to cast AI as a problem solver: it simulates human decision-making with algorithms, perceiving its environment, learning, planning, and executing actions to accomplish a specific task or goal. Its aim is efficient problem solving through self-adaptation and optimization."
        #     },
        # ]
        prompt_template = [
            {"role": "system", "content": template},
        ]
        fastchat_inputs = {
            "model": model_name,
            "messages": prompt_template + context + [
                {"role": "user", "content": prompt}
            ],
            "temperature": temperature,
            "top_p": top_p,
            "top_k": top_k,
            "n": n,
            "max_tokens": max_tokens,
            "stream": False,
        }
        # Full request schema accepted by the endpoint:
        # {
        #     "model": "string",
        #     "messages": "string",
        #     "temperature": 0.7,     # between 0 and 2; higher values (e.g. 0.8) make the output more random, lower values (e.g. 0.2) more focused and deterministic
        #     "top_p": 1,             # nucleus sampling: restricts candidates to the most probable tokens whose cumulative probability reaches top_p
        #     "top_k": -1,            # top_k = 3 means sampling from only the three most probable tokens; -1 disables the filter
        #     "n": 1,                 # how many chat completion choices to generate for each input message
        #     "max_tokens": 1024,     # the maximum number of tokens to generate in the chat completion
        #     "stop": ["string"],
        #     "stream": False,
        #     "presence_penalty": 0,  # between -2.0 and 2.0; positive values penalize tokens that already appear in the text so far, increasing the model's likelihood to talk about new topics
        #     "frequency_penalty": 0, # between -2.0 and 2.0; positive values penalize tokens by their frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim
        #     "user": "string"
        # }
        # A timeout keeps the call from hanging indefinitely if the model server stalls.
        fastchat_response = requests.post(url, json=fastchat_inputs, timeout=60)
        assistant_message = fastchat_response.json()["choices"][0]["message"]
        fastchat_content = assistant_message["content"]
        return fastchat_content

    async def fast_api_handler(self, request: Request) -> Response:
        try:
            data = await request.json()
        except json.JSONDecodeError:
            return JSONResponse(content={"error": "json parse error"},
                                status_code=status.HTTP_400_BAD_REQUEST)

        user_model_name = data.get("model_name")
        user_context = data.get("context")
        user_question = data.get("question")
        user_template = data.get("template")
        user_temperature = data.get("temperature")
        user_top_p = data.get("top_p")
        user_top_k = data.get("top_k")
        user_n = data.get("n")
        user_max_tokens = data.get("max_tokens")

        if user_question is None:
            return JSONResponse(content={"error": "question is required"},
                                status_code=status.HTTP_400_BAD_REQUEST)
        if user_model_name is None or not user_model_name.strip():
            user_model_name = "Qwen1.5-14B-Chat"
        if user_template is None or user_template.isspace():
            # user_template sets the LLM's tone, e.g. template = "Speak like a clown.";
            # it may be an empty string or a user-defined persona.
            user_template = ""
        if user_temperature is None or user_temperature == "":
            user_temperature = 0.7
        if user_top_p is None or user_top_p == "":
            user_top_p = 1
        if user_top_k is None or user_top_k == "":
            user_top_k = -1
        if user_n is None or user_n == "":
            user_n = 1
        if user_max_tokens is None or user_max_tokens == "":
            user_max_tokens = 1024

        answer = self.processing(user_model_name, user_question, user_template,
                                 user_context, user_temperature, user_top_p,
                                 user_top_k, user_n, user_max_tokens)
        return JSONResponse(content={"response": answer},
                            status_code=status.HTTP_200_OK)