diff --git a/vllm/vllm-serve/templates/services.yaml b/vllm/vllm-serve/templates/services.yaml
index 54f2623..2c5c947 100644
--- a/vllm/vllm-serve/templates/services.yaml
+++ b/vllm/vllm-serve/templates/services.yaml
@@ -30,6 +30,6 @@ spec:
   ports:
     - protocol: TCP
       port: {{ .Values.svc.port | default 8080 }}
-      targetPort: {{ .Values.svc.port | default 8080 }}
-      nodePort: {{ .Values.svc.nodePort | default 30080 }}
+      targetPort: {{ .Values.svc.targetPort | default 8080 }}
+      # nodePort: {{ .Values.svc.nodePort | default 30080 }}
diff --git a/vllm/vllm-serve/templates/single.yaml b/vllm/vllm-serve/templates/single.yaml
index bcb13bc..1e2c7a4 100644
--- a/vllm/vllm-serve/templates/single.yaml
+++ b/vllm/vllm-serve/templates/single.yaml
@@ -56,11 +56,22 @@ spec:
             - name: RAY_DEDUP_LOGS
               value: "0"
           command:
-            - sh
-            - -c
-            - "MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_PATH='{{ .Values.model.localMountPath }}/'$MODEL_NAME;
-              echo 'Using single node ------------------------------------------';
-              python3 -m vllm.entrypoints.openai.api_server --port 8080 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --trust_remote_code"
+            - /bin/sh
+            - -c
+          args:
+            - |
+              {{- if .Values.command }}
+              {{ .Values.command }}
+              {{- else }}
+              MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}')
+              MODEL_PATH='{{ .Values.model.localMountPath }}/'$MODEL_NAME
+              python3 -m vllm.entrypoints.openai.api_server \
+                --port 8080 \
+                --model "$MODEL_PATH" \
+                --tensor-parallel-size {{ .Values.resources.gpuLimit }} \
+                --pipeline_parallel_size {{ .Values.workerSize }} \
+                --trust_remote_code
+              {{- end }}
           resources:
             limits:
               nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
diff --git a/vllm/vllm-serve/values.schema.json b/vllm/vllm-serve/values.schema.json
new file mode
100644 index 0000000..c6f78de --- /dev/null +++ b/vllm/vllm-serve/values.schema.json @@ -0,0 +1,341 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "vllm-serve Helm Chart Values", + "description": "Schema for vllm-serve Helm chart values", + "type": "object", + "properties": { + "model": { + "type": "object", + "description": "模型配置", + "properties": { + "huggingfaceName": { + "type": "string", + "description": "HuggingFace 模型名称", + "default": "Qwen/Qwen2.5-0.5B-Instruct", + "enum": [ + "swiss-ai/Apertus-8B-2509", + "swiss-ai/Apertus-70B-Instruct-2509", + "BAAI/Aquila-7B", + "BAAI/AquilaChat-7B", + "arcee-ai/AFM-4.5B-Base", + "Snowflake/snowflake-arctic-base", + "Snowflake/snowflake-arctic-instruct", + "baichuan-inc/Baichuan2-13B-Chat", + "baichuan-inc/Baichuan-7B", + "inclusionAI/Ling-lite-1.5", + "inclusionAI/Ling-plus", + "inclusionAI/Ling-mini-2.0", + "ibm-ai-platform/Bamba-9B-fp8", + "ibm-ai-platform/Bamba-9B", + "bigscience/bloom", + "bigscience/bloomz", + "zai-org/chatglm2-6b", + "zai-org/chatglm3-6b", + "CohereLabs/c4ai-command-r-v01", + "CohereLabs/c4ai-command-r7b-12-2024", + "CohereLabs/c4ai-command-a-03-2025", + "CohereLabs/command-a-reasoning-08-2025", + "databricks/dbrx-base", + "databricks/dbrx-instruct", + "nvidia/Llama-3_3-Nemotron-Super-49B-v1", + "deepseek-ai/deepseek-llm-67b-base", + "deepseek-ai/deepseek-llm-7b-chat", + "deepseek-ai/DeepSeek-V2", + "deepseek-ai/DeepSeek-V2-Chat", + "deepseek-ai/DeepSeek-V3", + "deepseek-ai/DeepSeek-R1", + "deepseek-ai/DeepSeek-V3.1", + "rednote-hilab/dots.llm1.base", + "rednote-hilab/dots.llm1.inst", + "rednote-hilab/dots.ocr", + "baidu/ERNIE-4.5-0.3B-PT", + "baidu/ERNIE-4.5-21B-A3B-PT", + "baidu/ERNIE-4.5-300B-A47B-PT", + "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", + "LGAI-EXAONE/EXAONE-4.0-32B", + "mgleize/fairseq2-dummy-Llama-3.2-1B", + "tiiuae/falcon-7b", + "tiiuae/falcon-40b", + "tiiuae/falcon-rw-7b", + "tiiuae/falcon-mamba-7b", + "tiiuae/falcon-mamba-7b-instruct", + 
"tiiuae/Falcon-H1-34B-Base", + "tiiuae/Falcon-H1-34B-Instruct", + "allenai/FlexOlmo-7x7B-1T", + "allenai/FlexOlmo-7x7B-1T-RT", + "google/gemma-2b", + "google/gemma-1.1-2b-it", + "google/gemma-2-9b", + "google/gemma-2-27b", + "google/gemma-3-1b-it", + "google/gemma-3n-E2B-it", + "google/gemma-3n-E4B-it", + "zai-org/glm-4-9b-chat-hf", + "zai-org/GLM-4-32B-0414", + "zai-org/GLM-4.5", + "gpt2", + "gpt2-xl", + "bigcode/starcoder", + "bigcode/gpt_bigcode-santacoder", + "WizardLM/WizardCoder-15B-V1.0", + "EleutherAI/gpt-j-6b", + "nomic-ai/gpt4all-j", + "EleutherAI/gpt-neox-20b", + "EleutherAI/pythia-12b", + "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5", + "databricks/dolly-v2-12b", + "stabilityai/stablelm-tuned-alpha-7b", + "openai/gpt-oss-120b", + "openai/gpt-oss-20b", + "ibm-granite/granite-3.0-2b-base", + "ibm-granite/granite-3.1-8b-instruct", + "ibm/PowerLM-3b", + "ibm-granite/granite-3.0-1b-a400m-base", + "ibm-granite/granite-3.0-3b-a800m-instruct", + "ibm/PowerMoE-3b", + "ibm-granite/granite-4.0-tiny-preview", + "parasail-ai/GritLM-7B-vllm", + "hpcai-tech/grok-1", + "tencent/Hunyuan-7B-Instruct", + "tencent/Hunyuan-A13B-Instruct", + "tencent/Hunyuan-A13B-Pretrain", + "tencent/Hunyuan-A13B-Instruct-FP8", + "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", + "internlm/internlm-7b", + "internlm/internlm-chat-7b", + "internlm/internlm2-7b", + "internlm/internlm2-chat-7b", + "internlm/internlm3-8b-instruct", + "inceptionai/jais-13b", + "inceptionai/jais-13b-chat", + "inceptionai/jais-30b-v3", + "inceptionai/jais-30b-chat-v3", + "ai21labs/AI21-Jamba-1.5-Large", + "ai21labs/AI21-Jamba-1.5-Mini", + "ai21labs/Jamba-v0.1", + "LiquidAI/LFM2-1.2B", + "LiquidAI/LFM2-700M", + "LiquidAI/LFM2-350M", + "LiquidAI/LFM2-8B-A1B-preview", + "meta-llama/Meta-Llama-3.1-405B-Instruct", + "meta-llama/Meta-Llama-3.1-70B", + "meta-llama/Meta-Llama-3-70B-Instruct", + "meta-llama/Llama-2-70b-hf", + "01-ai/Yi-34B", + "state-spaces/mamba-130m-hf", + "state-spaces/mamba-790m-hf", + 
"state-spaces/mamba-2.8b-hf", + "mistralai/Mamba-Codestral-7B-v0.1", + "XiaomiMiMo/MiMo-7B-RL", + "openbmb/MiniCPM-2B-sft-bf16", + "openbmb/MiniCPM-2B-dpo-bf16", + "openbmb/MiniCPM-S-1B-sft", + "openbmb/MiniCPM3-4B", + "MiniMaxAI/MiniMax-M2", + "mistralai/Mistral-7B-v0.1", + "mistralai/Mistral-7B-Instruct-v0.1", + "mistralai/Mixtral-8x7B-v0.1", + "mistralai/Mixtral-8x7B-Instruct-v0.1", + "mistral-community/Mixtral-8x22B-v0.1", + "mosaicml/mpt-7b", + "mosaicml/mpt-7b-storywriter", + "mosaicml/mpt-30b", + "nvidia/Minitron-8B-Base", + "mgoin/Nemotron-4-340B-Base-hf-FP8", + "nvidia/Nemotron-H-8B-Base-8K", + "nvidia/Nemotron-H-47B-Base-8K", + "nvidia/Nemotron-H-56B-Base-8K", + "allenai/OLMo-1B-hf", + "allenai/OLMo-7B-hf", + "allenai/OLMo-2-0425-1B", + "allenai/OLMoE-1B-7B-0924", + "allenai/OLMoE-1B-7B-0924-Instruct", + "facebook/opt-66b", + "facebook/opt-iml-max-30b", + "OrionStarAI/Orion-14B-Base", + "OrionStarAI/Orion-14B-Chat", + "microsoft/phi-1_5", + "microsoft/phi-2", + "microsoft/Phi-4-mini-instruct", + "microsoft/Phi-4", + "microsoft/Phi-3-mini-4k-instruct", + "microsoft/Phi-3-mini-128k-instruct", + "microsoft/Phi-3-medium-128k-instruct", + "microsoft/Phi-3.5-MoE-instruct", + "adept/persimmon-8b-base", + "adept/persimmon-8b-chat", + "pfnet/plamo-2-1b", + "pfnet/plamo-2-8b", + "Qwen/Qwen-7B", + "Qwen/Qwen-7B-Chat", + "Qwen/QwQ-32B-Preview", + "Qwen/Qwen2-7B-Instruct", + "Qwen/Qwen2-7B", + "Qwen/Qwen2.5-0.5B-Instruct", + "Qwen/Qwen1.5-MoE-A2.7B", + "Qwen/Qwen1.5-MoE-A2.7B-Chat", + "Qwen/Qwen3-8B", + "Qwen/Qwen3-30B-A3B", + "Qwen/Qwen3-Next-80B-A3B-Instruct", + "ByteDance-Seed/Seed-OSS-36B-Instruct", + "stabilityai/stablelm-3b-4e1t", + "stabilityai/stablelm-base-alpha-7b-v2", + "bigcode/starcoder2-3b", + "bigcode/starcoder2-7b", + "bigcode/starcoder2-15b", + "upstage/solar-pro-preview-instruct", + "Tele-AI/TeleChat2-3B", + "Tele-AI/TeleChat2-7B", + "Tele-AI/TeleChat2-35B", + "CofeAI/FLM-2-52B-Instruct-2407", + "CofeAI/Tele-FLM", + "xverse/XVERSE-7B-Chat", + 
"xverse/XVERSE-13B-Chat", + "xverse/XVERSE-65B-Chat", + "MiniMaxAI/MiniMax-M1-40k", + "MiniMaxAI/MiniMax-M1-80k", + "MiniMaxAI/MiniMax-Text-01", + "Zyphra/Zamba2-7B-instruct", + "Zyphra/Zamba2-2.7B-instruct", + "Zyphra/Zamba2-1.2B-instruct", + "meituan-longcat/LongCat-Flash-Chat", + "meituan-longcat/LongCat-Flash-Chat-FP8", + "rhymes-ai/Aria", + "CohereForAI/aya-vision-8b", + "CohereForAI/aya-vision-32b", + "Open-Bee/Bee-8B-RL", + "Open-Bee/Bee-8B-SFT", + "Salesforce/blip2-opt-2.7b", + "Salesforce/blip2-opt-6.7b", + "facebook/chameleon-7b", + "CohereLabs/command-a-vision-07-2025", + "deepseek-ai/deepseek-vl2-tiny", + "deepseek-ai/deepseek-vl2-small", + "deepseek-ai/deepseek-vl2", + "deepseek-ai/DeepSeek-OCR", + "baidu/ERNIE-4.5-VL-28B-A3B-PT", + "baidu/ERNIE-4.5-VL-424B-A47B-PT", + "adept/fuyu-8b", + "google/gemma-3-4b-it", + "google/gemma-3-27b-it", + "zai-org/glm-4v-9b", + "zai-org/cogagent-9b-20241220", + "zai-org/GLM-4.1V-9B-Thinking", + "zai-org/GLM-4.5V", + "ibm-granite/granite-speech-3.3-8b", + "h2oai/h2ovl-mississippi-800m", + "h2oai/h2ovl-mississippi-2b", + "HuggingFaceM4/Idefics3-8B-Llama3", + "internlm/Intern-S1", + "internlm/Intern-S1-mini", + "OpenGVLab/InternVL3_5-14B", + "OpenGVLab/InternVL3-9B", + "OpenGVLab/InternVideo2_5_Chat_8B", + "OpenGVLab/InternVL2_5-4B", + "OpenGVLab/Mono-InternVL-2B", + "OpenGVLab/InternVL2-4B", + "OpenGVLab/InternVL3-1B-hf", + "Kwai-Keye/Keye-VL-8B-Preview", + "Kwai-Keye/Keye-VL-1_5-8B", + "moonshotai/Kimi-VL-A3B-Instruct", + "moonshotai/Kimi-VL-A3B-Thinking", + "lightonai/LightOnOCR-1B", + "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct", + "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", + "llava-hf/llava-1.5-7b-hf", + "TIGER-Lab/Mantis-8B-siglip-llama3", + "mistral-community/pixtral-12b", + "llava-hf/llava-v1.6-mistral-7b-hf", + "llava-hf/llava-v1.6-vicuna-7b-hf", + "llava-hf/LLaVA-NeXT-Video-7B-hf", + 
"llava-hf/llava-onevision-qwen2-7b-ov-hf", + "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + "mispeech/midashenglm-7b", + "openbmb/MiniCPM-o-2_6", + "openbmb/MiniCPM-V-2", + "openbmb/MiniCPM-Llama3-V-2_5", + "openbmb/MiniCPM-V-2_6", + "openbmb/MiniCPM-V-4", + "openbmb/MiniCPM-V-4_5", + "MiniMaxAI/MiniMax-VL-01", + "mistralai/Mistral-Small-3.1-24B-Instruct-2503", + "allenai/Molmo-7B-D-0924", + "allenai/Molmo-7B-O-0924", + "nvidia/NVLM-D-72B", + "AIDC-AI/Ovis2-1B", + "AIDC-AI/Ovis1.6-Llama3.2-3B", + "AIDC-AI/Ovis2.5-9B", + "google/paligemma-3b-pt-224", + "google/paligemma-3b-mix-224", + "google/paligemma2-3b-ft-docci-448", + "microsoft/Phi-3-vision-128k-instruct", + "microsoft/Phi-3.5-vision-instruct", + "microsoft/Phi-4-multimodal-instruct", + "mistralai/Pixtral-12B-2409", + "Qwen/Qwen-VL", + "Qwen/Qwen-VL-Chat", + "Qwen/Qwen2-Audio-7B-Instruct", + "Qwen/QVQ-72B-Preview", + "Qwen/Qwen2-VL-7B-Instruct", + "Qwen/Qwen2-VL-72B-Instruct", + "Qwen/Qwen2.5-VL-3B-Instruct", + "Qwen/Qwen2.5-VL-72B-Instruct", + "Qwen/Qwen2.5-Omni-3B", + "Qwen/Qwen2.5-Omni-7B", + "Qwen/Qwen3-VL-4B-Instruct", + "Qwen/Qwen3-VL-30B-A3B-Instruct", + "Qwen/Qwen3-Omni-30B-A3B-Instruct", + "Qwen/Qwen3-Omni-30B-A3B-Thinking", + "YannQi/R-4B", + "Skywork/Skywork-R1V-38B", + "HuggingFaceTB/SmolVLM2-2.2B-Instruct", + "stepfun-ai/step3", + "omni-research/Tarsier-7b", + "omni-research/Tarsier-34b", + "omni-research/Tarsier2-Recap-7b", + "omni-research/Tarsier2-7b-0115" + ] + } + }, + "required": ["huggingfaceName"] + }, + "resources": { + "type": "object", + "description": "资源配置", + "properties": { + "gpuLimit": { + "type": "integer", + "description": "GPU 限制", + "default": 1, + "minimum": 1 + }, + "cpuRequest": { + "type": "integer", + "description": "CPU 请求", + "default": 12, + "minimum": 1 + }, + "memoryLimit": { + "type": "string", + "description": "内存限制", + "default": "16Gi", + "pattern": "^[0-9]+(\\.[0-9]+)?(Mi|Gi|Ti)$" + }, + "shmSize": { + "type": "string", + "description": "共享内存大小", + "default": "20Gi", + "pattern":
"^[0-9]+(\\.[0-9]+)?(Mi|Gi|Ti)$" + } + } + }, + "workerSize": { + "type": "integer", + "description": "Worker 数量", + "default": 1, + "minimum": 1 + } + } +} diff --git a/vllm/vllm-serve/values.yaml b/vllm/vllm-serve/values.yaml index 49e85ae..d4255f4 100644 --- a/vllm/vllm-serve/values.yaml +++ b/vllm/vllm-serve/values.yaml @@ -31,6 +31,7 @@ model: image: "docker.io/vllm/vllm-openai:latest" # 包含 huggingface-cli 的镜像 # 功能选择 +command: "" resources: gpuLimit: 1 @@ -58,15 +59,15 @@ lmdeploy: # NFS PV/PVC 配置 nfs: - server: "10.6.80.11" - path: "/volume1/Dataset/PVStore/lab-data-model-pvc-c0beeab1-6dd5-4c6a-bd2c-6ce9e114c25e/Weight" + server: "172.19.207.21" + path: "/volume1/datasets/storage/models/Weight" storageClass: "local-path" pvSize: "500Gi" pvcSize: "50Gi" # LeaderWorkerSet 配置 replicaCount: 1 -workerSize: 2 +workerSize: 1 nodeSelector: {}