feat: vllm

verachen
2025-01-20 17:48:06 +08:00
parent 30412af156
commit c2d6fca633


@@ -61,6 +61,8 @@ class Chat(Blackbox):
         user_prompt_template = settings.get('user_prompt_template')
         user_stream = settings.get('stream')
+        llm_model = "vllm"
         if user_context == None:
             user_context = []
@@ -100,10 +102,16 @@ class Chat(Blackbox):
         #user_presence_penalty = 0.8
         if user_model_url is None or user_model_url.isspace() or user_model_url == "":
-            user_model_url = "http://10.6.80.75:23333/v1/chat/completions"
+            if llm_model != "vllm":
+                user_model_url = "http://10.6.80.75:23333/v1/chat/completions"
+            else:
+                user_model_url = "http://10.6.80.94:8000/v1/completions"
         if user_model_key is None or user_model_key.isspace() or user_model_key == "":
-            user_model_key = "YOUR_API_KEY"
+            if llm_model != "vllm":
+                user_model_key = "YOUR_API_KEY"
+            else:
+                user_model_key = "vllm"
         if chroma_embedding_model:
             chroma_response = self.chroma_query(user_question, settings)
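The hunk above hard-codes a default endpoint and key per backend. As a minimal standalone sketch of that fallback (the `resolve_model_endpoint` helper is illustrative and not part of the commit; the hosts are the deployment-specific defaults taken from the diff):

```python
from typing import Optional, Tuple

def resolve_model_endpoint(llm_model: str,
                           user_model_url: Optional[str],
                           user_model_key: Optional[str]) -> Tuple[str, str]:
    """Fall back to the per-backend defaults used in this commit."""
    if user_model_url is None or user_model_url.isspace() or user_model_url == "":
        if llm_model != "vllm":
            # FastChat-style OpenAI chat endpoint (deployment-specific host)
            user_model_url = "http://10.6.80.75:23333/v1/chat/completions"
        else:
            # vLLM OpenAI-compatible completions endpoint (deployment-specific host)
            user_model_url = "http://10.6.80.94:8000/v1/completions"
    if user_model_key is None or user_model_key.isspace() or user_model_key == "":
        # FastChat path keeps the placeholder key; vLLM path uses "vllm"
        user_model_key = "YOUR_API_KEY" if llm_model != "vllm" else "vllm"
    return user_model_url, user_model_key
```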
@@ -117,7 +125,10 @@ class Chat(Blackbox):
                 print(f"user_prompt_template: {type(user_prompt_template)}, user_question: {type(user_question)}, chroma_response: {type(chroma_response)}")
                 user_question = user_prompt_template + "问题: " + user_question + "。检索内容: " + chroma_response + ""
             else:
-                user_question = user_prompt_template + "问题: " + user_question + ""
+                if llm_model != "vllm":
+                    user_question = user_prompt_template + "问题: " + user_question + ""
+                else:
+                    user_question = user_question
         print(f"1.user_question: {user_question}")
@@ -172,10 +183,17 @@ class Chat(Blackbox):
         else:
             url = user_model_url
             key = user_model_key
-        header = {
-            'Content-Type': 'application/json',
-            "Cache-Control": "no-cache", # 禁用缓存
-        }
+        if llm_model != "vllm":
+            header = {
+                'Content-Type': 'application/json',
+                "Cache-Control": "no-cache", # 禁用缓存
+            }
+        else:
+            header = {
+                'Content-Type': 'application/json',
+                'Authorization': "Bearer " + key,
+                "Cache-Control": "no-cache",
+            }
         # system_prompt = "# Role: 琪琪,康普可可的代言人。\n\n## Profile:\n**Author**: 琪琪。\n**Language**: 中文。\n**Description**: 琪琪,是康普可可的代言人,由博维开发。你擅长澳门文旅问答。\n\n## Constraints:\n- **严格遵循工作流程** 严格遵循<Workflow >中设定的工作流程。\n- **无内置知识库** :根据<Workflow >中提供的知识作答,而不是内置知识库,我虽然是知识库专家,但我的知识依赖于外部输入,而不是大模型已有知识。\n- **回复格式**:在进行回复时,不能输出“检索内容” 标签字样,同时也不能直接透露知识片段原文。\n\n## Workflow:\n1. **接收查询**:接收用户的问题。\n2. **判断问题**:首先自行判断下方问题与检索内容是否相关,若相关则根据检索内容总结概括相关信息进行回答;若检索内容与问题无关,则根据自身知识进行回答。\n3. **提供回答**\n\n```\n基于检索内容中的知识片段回答用户的问题。回答内容限制总结在50字内。\n请首先判断提供的检索内容与上述问题是否相关。如果相关直接从检索内容中提炼出直接回答问题所需的信息,不要乱说或者回答“相关”等字眼 。如果检索内容与问题不相关,则不参考检索内容,则回答:“对不起,我无法回答此问题哦。”\n\n```\n## Example:\n\n用户询问“中国的首都是哪个城市” 。\n2.1检索知识库,首先检查知识片段,如果检索内容中没有与用户的问题相关的内容,则回答:“对不起,我无法回答此问题哦。\n2.2如果有知识片段,在做出回复时,只能基于检索内容中的内容进行回答,且不能透露上下文原文,同时也不能出现检索内容的标签字样。\n"
@@ -183,23 +201,37 @@ class Chat(Blackbox):
             {"role": "system", "content": system_prompt}
         ]
-        chat_inputs={
-            "model": user_model_name,
-            "messages": prompt_template + user_context + [
-                {
-                    "role": "user",
-                    "content": user_question
-                }
-            ],
-            "temperature": str(user_temperature),
-            "top_p": str(user_top_p),
-            "n": str(user_n),
-            "max_tokens": str(user_max_tokens),
-            "frequency_penalty": str(user_frequency_penalty),
-            "presence_penalty": str(user_presence_penalty),
-            "stop": str(user_stop),
-            "stream": user_stream,
-        }
+        if llm_model != "vllm":
+            chat_inputs={
+                "model": user_model_name,
+                "messages": prompt_template + user_context + [
+                    {
+                        "role": "user",
+                        "content": user_question
+                    }
+                ],
+                "temperature": str(user_temperature),
+                "top_p": str(user_top_p),
+                "n": str(user_n),
+                "max_tokens": str(user_max_tokens),
+                "frequency_penalty": str(user_frequency_penalty),
+                "presence_penalty": str(user_presence_penalty),
+                "stop": str(user_stop),
+                "stream": user_stream,
+            }
+        else:
+            chat_inputs={
+                "model": user_model_name,
+                "prompt": user_question,
+                "temperature": float(user_temperature),
+                "top_p": float(user_top_p),
+                "n": float(user_n),
+                "max_tokens": float(user_max_tokens),
+                "frequency_penalty": float(user_frequency_penalty),
+                "presence_penalty": float(user_presence_penalty),
+                # "stop": user_stop,
+                "stream": user_stream,
+            }
         # # 获取当前时间戳
         # timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
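The vLLM payload switches from a chat `messages` list to a plain `prompt` string and from stringified numbers to real numbers. One caveat: `n` and `max_tokens` are integer fields in the OpenAI-style schema, so `int()` is arguably a safer cast than the `float()` used above. A hypothetical builder for the vLLM branch:

```python
def build_vllm_inputs(user_model_name: str, user_question: str,
                      user_temperature, user_top_p, user_n, user_max_tokens,
                      user_frequency_penalty, user_presence_penalty,
                      user_stream: bool) -> dict:
    """Body for POST /v1/completions on the vLLM server (illustrative)."""
    return {
        "model": user_model_name,
        "prompt": user_question,          # plain string, not a messages list
        "temperature": float(user_temperature),
        "top_p": float(user_top_p),
        "n": int(user_n),                 # integer field in the schema
        "max_tokens": int(user_max_tokens),
        "frequency_penalty": float(user_frequency_penalty),
        "presence_penalty": float(user_presence_penalty),
        "stream": user_stream,
    }
```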
@@ -252,8 +284,13 @@ class Chat(Blackbox):
                 if response_result.get("choices") is None:
                     yield JSONResponse(content={"error": "LLM handle failure"}, status_code=status.HTTP_400_BAD_REQUEST)
                 else:
-                    print("\n", "user_answer: ", fastchat_response.json()["choices"][0]["message"]["content"],"\n\n")
-                    yield fastchat_response.json()["choices"][0]["message"]["content"]
+                    if llm_model != "vllm":
+                        print("\n", "user_answer: ", fastchat_response.json()["choices"][0]["message"]["content"],"\n\n")
+                        yield fastchat_response.json()["choices"][0]["message"]["content"]
+                    else:
+                        print("\n", "user_answer: ", fastchat_response.json()["choices"][0]["text"],"\n\n")
+                        yield fastchat_response.json()["choices"][0]["text"]

     async def fast_api_handler(self, request: Request) -> Response:
         try:
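Finally, the response shapes differ between the two endpoints: chat completions return `choices[0]["message"]["content"]`, while the completions endpoint returns `choices[0]["text"]`. A compact sketch of the parsing added above (the helper name is illustrative, not part of the commit):

```python
def extract_answer(llm_model: str, response_json: dict) -> str:
    """Pull the generated text out of either response shape (illustrative)."""
    choice = response_json["choices"][0]
    if llm_model != "vllm":
        # /v1/chat/completions shape
        return choice["message"]["content"]
    # /v1/completions shape
    return choice["text"]
```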