jarvis-models/src/blackbox/fastchat.py

from typing import Any, Coroutine
from fastapi import Request, Response, status
from fastapi.responses import JSONResponse
from .blackbox import Blackbox
import requests
import json
from injector import singleton


@singleton
class Fastchat(Blackbox):
    def __call__(self, *args, **kwargs):
        return self.processing(*args, **kwargs)

    def valid(self, *args, **kwargs) -> bool:
        data = args[0]
        return isinstance(data, list)

    # Available model_name values: Qwen1.5-14B-Chat, internlm2-chat-20b
    def processing(self, model_name, prompt, template, context: list, temperature, top_p, top_k, n, max_tokens) -> str:
        if context is None:
            context = []
        url = 'http://120.196.116.194:48892/v1/chat/completions'
        # context may be an empty list or the user's conversation history, e.g.:
        # context = [
        #     {
        #         "role": "user",
        #         "content": "The core idea of intelligent agents"
        #     },
        #     {
        #         "role": "assistant",
        #         "content": "The core idea of an intelligent agent is to apply AI in the role of a problem solver: it algorithmically simulates human decision-making, perceiving its environment, learning, planning, and executing actions to accomplish a specific task or goal. Its aim is to solve problems efficiently through self-adaptation and optimization."
        #     },
        # ]
        prompt_template = [
            {"role": "system", "content": template},
        ]
        fastchat_inputs = {
            "model": model_name,
            "messages": prompt_template + context + [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "temperature": temperature,
            "top_p": top_p,
            "top_k": top_k,
            "n": n,
            "max_tokens": max_tokens,
            "stream": False,
        }
        # Full request schema accepted by the endpoint, for reference:
        # {
        #     "model": "string",
        #     "messages": "string",
        #     "temperature": 0.7,      # Between 0 and 2. Higher values like 0.8 make the output more random, while lower values like 0.2 make it more focused and deterministic.
        #     "top_p": 1,              # Nucleus sampling: the next token is drawn only from the highest-probability subset of candidates whose cumulative probability reaches top_p.
        #     "top_k": -1,             # Restricts sampling to the k most likely tokens; e.g. top_k = 3 keeps only the top three. -1 leaves it unrestricted.
        #     "n": 1,                  # How many chat completion choices to generate for each input message.
        #     "max_tokens": 1024,      # The maximum number of tokens to generate in the chat completion.
        #     "stop": [
        #         "string"
        #     ],
        #     "stream": False,
        #     "presence_penalty": 0,   # Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
        #     "frequency_penalty": 0,  # Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
        #     "user": "string"
        # }
        fastchat_response = requests.post(url, json=fastchat_inputs)
        # user_message = fastchat_inputs["messages"]
        # context.append(user_message)
        assistant_message = fastchat_response.json()["choices"][0]["message"]
        # context.append(assistant_message)
        fastchat_content = assistant_message["content"]
        return fastchat_content
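
    # Minimal usage sketch for processing (hypothetical values; assumes the
    # FastChat endpoint above is reachable):
    #
    #     blackbox = Fastchat()
    #     answer = blackbox.processing("Qwen1.5-14B-Chat", "What is an agent?",
    #                                  "", [], 0.7, 1, -1, 1, 1024)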

    async def fast_api_handler(self, request: Request) -> Response:
        try:
            data = await request.json()
        except Exception:
            return JSONResponse(content={"error": "json parse error"}, status_code=status.HTTP_400_BAD_REQUEST)
        user_model_name = data.get("model_name")
        user_context = data.get("context")
        user_question = data.get("question")
        user_template = data.get("template")
        user_temperature = data.get("temperature")
        user_top_p = data.get("top_p")
        user_top_k = data.get("top_k")
        user_n = data.get("n")
        user_max_tokens = data.get("max_tokens")
        if user_question is None:
            return JSONResponse(content={"error": "question is required"}, status_code=status.HTTP_400_BAD_REQUEST)
        if user_model_name is None or user_model_name == "" or user_model_name.isspace():
            user_model_name = "Qwen1.5-14B-Chat"
        if user_template is None or user_template.isspace():
            # user_template defines the LLM's tone, e.g. template = "Speak in the tone of a clown.";
            # it may be an empty string or a user-defined tone.
            user_template = ""
        if user_temperature is None or user_temperature == "":
            user_temperature = 0.7
        if user_top_p is None or user_top_p == "":
            user_top_p = 1
        if user_top_k is None or user_top_k == "":
            user_top_k = -1
        if user_n is None or user_n == "":
            user_n = 1
        if user_max_tokens is None or user_max_tokens == "":
            user_max_tokens = 1024
        return JSONResponse(
            content={"response": self.processing(user_model_name, user_question, user_template, user_context,
                                                 user_temperature, user_top_p, user_top_k, user_n, user_max_tokens)},
            status_code=status.HTTP_200_OK,
        )
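
A minimal client-side sketch of how this handler could be exercised, assuming the application mounts fast_api_handler on a POST route; the "/fastchat" path and port below are illustrative assumptions, not taken from this file. Only "question" is required; every other field falls back to the defaults shown above.

import requests

# Hypothetical route and port; the real mount point is configured outside this file.
resp = requests.post(
    "http://localhost:8000/fastchat",
    json={
        "model_name": "Qwen1.5-14B-Chat",   # optional; this is also the server-side default
        "question": "What is the core idea of an intelligent agent?",
        "template": "Answer concisely.",    # optional system prompt / tone
        "context": [],                      # optional prior chat history
        "temperature": 0.7,
    },
)
print(resp.json()["response"])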