chore(repo): init helm-charts repo with two charts and automated package/push

Ivan087 committed 2025-11-17 16:50:28 +08:00
commit e8451c0675
28 changed files with 1905 additions and 0 deletions

vllm-serve/.helmignore (new file, 23 lines)

@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/

vllm-serve/Chart.yaml (new file, 8 lines)

@@ -0,0 +1,8 @@
annotations:
helm.sh/resource-policy: keep
apiVersion: v2
appVersion: 1.16.0
description: A Helm chart for deploying vLLM with NFS storage
name: vllm-serve
type: application
version: 0.2.0
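
For reference, the automated package/push mentioned in the commit title boils down to the following Helm commands (a minimal sketch; the OCI registry URL is a placeholder, as the CI workflow itself is not shown in this excerpt):

helm package vllm-serve                                                 # produces vllm-serve-0.2.0.tgz per Chart.yaml
helm push vllm-serve-0.2.0.tgz oci://registry.example.com/helm-charts   # placeholder registry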


@@ -0,0 +1,16 @@
1. Get the application URL by running these commands:
{{- if contains "NodePort" .Values.svc.type }}
export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ .Release.Name }}-svc)
export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
echo http://$NODE_IP:$NODE_PORT
{{- else if contains "LoadBalancer" .Values.svc.type }}
NOTE: It may take a few minutes for the LoadBalancer IP to be available.
You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ .Release.Name }}-svc'
export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ .Release.Name }}-svc --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
echo http://$SERVICE_IP:{{ .Values.svc.port }}
{{- else if contains "ClusterIP" .Values.svc.type }}
export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
echo "Visit http://127.0.0.1:8080 to use your application"
kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT
{{- end }}
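
Helm renders these notes after each install/upgrade based on svc.type. A quick way to exercise the ClusterIP branch (the release name demo is illustrative):

helm install demo ./vllm-serve --set svc.type=ClusterIP
# Helm then prints the POD_NAME/CONTAINER_PORT exports and the
# port-forward command from the ClusterIP branch above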


@@ -0,0 +1,62 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "vllm-serve.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "vllm-serve.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "vllm-serve.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "vllm-serve.labels" -}}
helm.sh/chart: {{ include "vllm-serve.chart" . }}
{{ include "vllm-serve.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "vllm-serve.selectorLabels" -}}
app.kubernetes.io/name: {{ include "vllm-serve.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "vllm-serve.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "vllm-serve.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
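
These helpers can be previewed without a cluster via helm template. Assuming a release named demo, they render as follows (a sketch):

helm template demo ./vllm-serve
# vllm-serve.name     -> vllm-serve
# vllm-serve.fullname -> demo-vllm-serve (or just the release name if it already contains "vllm-serve")
# vllm-serve.chart    -> vllm-serve-0.2.0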


@@ -0,0 +1,193 @@
{{- if gt (int .Values.workerSize) 1 }}
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: {{ .Release.Name }}
spec:
replicas: {{ .Values.replicaCount }}
leaderWorkerTemplate:
size: {{ .Values.workerSize }}
restartPolicy: RecreateGroupOnPodRestart
leaderTemplate:
metadata:
labels:
role: leader
spec:
initContainers:
# Model download runs as the first initContainer
- name: download-model
image: {{ .Values.model.download.image }}
imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }}
env:
- name: HF_ENDPOINT
value: https://hf-mirror.com
- name: HUGGING_FACE_HUB_TOKEN
value: {{ .Values.model.huggingfaceToken }}
command:
- sh
- -c
- |
MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
DEST_DIR="{{ .Values.model.localMountPath }}/Weight/$MODEL_NAME"
# DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
# Check whether the model already exists; download it if not
echo "DEST_DIR=$DEST_DIR"
if [ ! -f "$DEST_DIR/config.json" ]; then
ls -l {{ .Values.model.localMountPath }}
echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
# install the download tooling first, non-interactively ('apt upgrade' is unnecessary and would prompt)
apt-get update && apt-get install -y aria2 wget
wget https://hf-mirror.com/hfd/hfd.sh
chmod a+x hfd.sh
./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
# huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
else
echo "Model already exists at $DEST_DIR"
fi
volumeMounts:
- name: weight-volume
mountPath: {{ .Values.model.localMountPath }}
containers:
- name: vllm-leader
image: {{ .Values.vllm.image }}
imagePullPolicy: IfNotPresent
securityContext:
capabilities:
add: [ "IPC_LOCK" ]
env:
# - name: HUGGING_FACE_HUB_TOKEN
# value: {{ .Values.vllm.huggingfaceToken }}
- name: GLOO_SOCKET_IFNAME
value: eth0
- name: NCCL_SOCKET_IFNAME
value: eth0
- name: NCCL_IB_DISABLE
value: "0"
- name: NCCL_DEBUG
value: INFO
- name: NCCL_IB_HCA
value: mlx5_0:1
- name: NCCL_IB_GID_INDEX
value: "0" # 或 "7",根据你的网络配置而定
- name: RAY_DEDUP_LOGS
value: "0"
command:
- sh
- -c
{{- if .Values.command }}
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); {{ .Values.command }}"
{{- else }}
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_PATH='{{ .Values.model.localMountPath }}/Weight/'$MODEL_NAME;
python3 -m vllm.entrypoints.openai.api_server --port 8000 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --trust_remote_code"
{{- end }}
resources:
limits:
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
memory: {{ .Values.resources.memoryLimit }}
ephemeral-storage: 10Gi
rdma/rdma_shared_device_a: 10
requests:
ephemeral-storage: 10Gi
cpu: {{ .Values.resources.cpuRequest }}
ports:
- containerPort: 8000
name: http
readinessProbe:
tcpSocket:
#httpGet:
#path: /health
port: 8000
initialDelaySeconds: 120
periodSeconds: 20
timeoutSeconds: 5
volumeMounts:
- mountPath: /dev/shm
name: dshm
- name: weight-volume
mountPath: {{ .Values.model.localMountPath }}
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: {{ .Values.resources.shmSize }}
- name: weight-volume
persistentVolumeClaim:
claimName: {{ .Release.Name }}-pvc-model
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 10 }}
{{- end }}
workerTemplate:
spec:
containers:
- name: vllm-worker
image: {{ .Values.vllm.image }}
imagePullPolicy: IfNotPresent
securityContext:
capabilities:
add: [ "IPC_LOCK" ]
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
resources:
limits:
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
memory: {{ .Values.resources.memoryLimit }}
ephemeral-storage: 10Gi
rdma/rdma_shared_device_a: 10
requests:
ephemeral-storage: 10Gi
cpu: {{ .Values.resources.cpuRequest }}
env:
# - name: HUGGING_FACE_HUB_TOKEN
# value: {{ .Values.vllm.huggingfaceToken }}
- name: GLOO_SOCKET_IFNAME
value: eth0
- name: NCCL_SOCKET_IFNAME
value: eth0
- name: NCCL_IB_DISABLE
value: "0"
- name: NCCL_DEBUG
value: INFO
- name: NCCL_IB_HCA
value: mlx5_0:1
- name: NCCL_IB_GID_INDEX
value: "0" # 或 "7",根据你的网络配置而定
- name: RAY_DEDUP_LOGS
value: "0"
volumeMounts:
- mountPath: /dev/shm
name: dshm
- name: weight-volume
mountPath: {{ .Values.model.localMountPath }}
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: {{ .Values.resources.shmSize }}
- name: weight-volume
persistentVolumeClaim:
claimName: {{ .Release.Name }}-pvc-model
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- end }}
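
The LeaderWorkerSet branch above renders only when workerSize > 1 and assumes the lws controller and its CRDs are already installed in the cluster. A hedged two-node example (release name and GPU count are illustrative):

helm install demo ./vllm-serve \
  --set workerSize=2 \
  --set resources.gpuLimit=8
# per the leader command: tensor parallel = 8 GPUs per pod, pipeline parallel = 2 across the group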


@@ -0,0 +1,40 @@
#apiVersion: v1
#kind: PersistentVolume
#metadata:
# name: {{ .Release.Name }}-pv-model
#spec:
# storageClassName: weight # {{ .Values.nfs.storageClass | default "local-path" }}
# capacity:
# storage: {{ .Values.nfs.pvSize }}
# accessModes:
# - ReadWriteMany
# persistentVolumeReclaimPolicy: Retain
# # nfs:
# # path: {{ .Values.nfs.path }}
# # server: {{ .Values.nfs.server }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ .Release.Name }}-pvc-model
# annotations:
spec:
accessModes:
- ReadWriteMany
resources:
requests:
storage: {{ .Values.nfs.pvcSize }}
# volumeName: {{ .Release.Name }}-pv-model
storageClassName: weight
#apiVersion: v1
#kind: PersistentVolumeClaim
#metadata:
# name: {{ .Release.Name }}-pvc-model
#spec:
# accessModes:
# - ReadWriteMany
# resources:
# requests:
# storage: 20Gi
# storageClassName: nas-dataset
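
Note that the PVC hardcodes storageClassName: weight, so that StorageClass must exist before installing the chart. A pre-flight check (sketch, release name demo):

kubectl get storageclass weight    # must already exist
kubectl get pvc demo-pvc-model     # should report Bound shortly after install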


@@ -0,0 +1,35 @@
#apiVersion: v1
#kind: Service
#metadata:
# name: infer-leader-loadbalancer
#spec:
# type: LoadBalancer
# selector:
# leaderworkerset.sigs.k8s.io/name: infer
# role: leader
# ports:
# - protocol: TCP
# port: 8080
# targetPort: 8080
#
---
apiVersion: v1
kind: Service
metadata:
name: {{ .Release.Name }}-svc
spec:
type: {{ .Values.svc.type | default "LoadBalancer" }}
{{- if gt (int .Values.workerSize) 1 }}
selector:
leaderworkerset.sigs.k8s.io/name: {{ .Release.Name }}
role: leader
{{- else }}
selector:
app: {{ .Release.Name }}
{{- end }}
ports:
- protocol: TCP
port: {{ .Values.svc.port | default 80 }}
targetPort: http # {{ .Values.svc.targetPort | default 8080 }}
# nodePort: {{ .Values.svc.nodePort | default 30080 }}
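
Once the service has an external address, the vLLM OpenAI-compatible API behind it can be probed. A sketch assuming the default LoadBalancer type, port 80, and a release named demo:

SERVICE_IP=$(kubectl get svc demo-svc -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
curl http://$SERVICE_IP/v1/models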


@@ -0,0 +1,127 @@
{{- if eq (int .Values.workerSize) 1 }}
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ .Release.Name }}
spec:
replicas: {{ .Values.replicaCount }}
selector:
matchLabels:
app: {{ .Release.Name }}
template:
metadata:
labels:
app: {{ .Release.Name }}
spec:
initContainers:
# Model download runs as the first initContainer
- name: download-model
image: {{ .Values.model.download.image }}
imagePullPolicy: IfNotPresent
env:
- name: HF_ENDPOINT
value: https://hf-mirror.com
- name: HUGGING_FACE_HUB_TOKEN
value: {{ .Values.model.huggingfaceToken }}
command:
- sh
- -c
- |
MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
DEST_DIR="{{ .Values.model.localMountPath }}/Weight/$MODEL_NAME"
# DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
# Check whether the model already exists; download it if not
echo "DEST_DIR=$DEST_DIR"
if [ ! -f "$DEST_DIR/config.json" ]; then
ls -l {{ .Values.model.localMountPath }}
echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
# install the download tooling first, non-interactively ('apt upgrade' is unnecessary and would prompt)
apt-get update && apt-get install -y aria2 wget
wget https://hf-mirror.com/hfd/hfd.sh
chmod a+x hfd.sh
./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
# huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
else
echo "Model already exists at $DEST_DIR"
fi
volumeMounts:
- name: weight-volume
mountPath: {{ .Values.model.localMountPath }}
containers:
- name: vllm-pod
image: {{ .Values.vllm.image }}
imagePullPolicy: IfNotPresent
env:
- name: HUGGING_FACE_HUB_TOKEN
value: {{ .Values.model.huggingfaceToken }}
- name: RAY_DEDUP_LOGS
value: "0"
command:
- sh
- -c
{{- if .Values.command }}
- {{ .Values.command | quote }}
{{- else }}
- |
MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}');
MODEL_PATH='{{ .Values.model.localMountPath }}/Weight/'$MODEL_NAME;
python3 -m vllm.entrypoints.openai.api_server \
--port 8000 \
--model $MODEL_PATH \
--tensor-parallel-size {{ .Values.resources.gpuLimit }} \
--pipeline_parallel_size {{ .Values.workerSize }} \
--trust_remote_code
{{- end }}
# - "
# MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}');
# MODEL_PATH='{{ .Values.model.localMountPath }}/Weight/'$MODEL_NAME;
# python3 -m vllm.entrypoints.openai.api_server --port 8080 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --trust_remote_code"
resources:
limits:
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
memory: {{ .Values.resources.memoryLimit }}
ephemeral-storage: 10Gi
requests:
ephemeral-storage: 10Gi
cpu: {{ .Values.resources.cpuRequest }}
ports:
- containerPort: 8000
name: http
readinessProbe:
#tcpSocket:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 120
periodSeconds: 20
timeoutSeconds: 5
volumeMounts:
- mountPath: /dev/shm
name: dshm
- name: weight-volume
mountPath: {{ .Values.model.localMountPath }}
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: {{ .Values.resources.shmSize }}
- name: weight-volume
persistentVolumeClaim:
claimName: {{ .Release.Name }}-pvc-model
# - name: weight-volume
# nfs:
# path: "/volume1/Dataset/PVStore/lab-data-model-pvc-c0beeab1-6dd5-4c6a-bd2c-6ce9e114c25e/Weight"
# server: "10.6.80.11"
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
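
After the /health readiness probe passes, the server answers OpenAI-style requests. A sketch against the single-node deployment (release name demo; the model field mirrors the $MODEL_PATH the server was started with):

kubectl port-forward deploy/demo 8000:8000 &
curl http://127.0.0.1:8000/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "/Model/Weight/Qwen2.5-0.5B-Instruct", "prompt": "Hello", "max_tokens": 16}'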


@@ -0,0 +1,346 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "vllm-serve Helm Chart Values",
"description": "Schema for vllm-serve Helm chart values",
"type": "object",
"properties": {
"model": {
"type": "object",
"description": "模型配置",
"properties": {
"huggingfaceName": {
"type": "string",
"description": "HuggingFace 模型名称",
"default": "Qwen/Qwen2.5-0.5B-Instruct",
"enum": [
"swiss-ai/Apertus-8B-2509",
"swiss-ai/Apertus-70B-Instruct-2509",
"BAAI/Aquila-7B",
"BAAI/AquilaChat-7B",
"arcee-ai/AFM-4.5B-Base",
"Snowflake/snowflake-arctic-base",
"Snowflake/snowflake-arctic-instruct",
"baichuan-inc/Baichuan2-13B-Chat",
"baichuan-inc/Baichuan-7B",
"inclusionAI/Ling-lite-1.5",
"inclusionAI/Ling-plus",
"inclusionAI/Ling-mini-2.0",
"ibm-ai-platform/Bamba-9B-fp8",
"ibm-ai-platform/Bamba-9B",
"bigscience/bloom",
"bigscience/bloomz",
"zai-org/chatglm2-6b",
"zai-org/chatglm3-6b",
"CohereLabs/c4ai-command-r-v01",
"CohereLabs/c4ai-command-r7b-12-2024",
"CohereLabs/c4ai-command-a-03-2025",
"CohereLabs/command-a-reasoning-08-2025",
"databricks/dbrx-base",
"databricks/dbrx-instruct",
"nvidia/Llama-3_3-Nemotron-Super-49B-v1",
"deepseek-ai/deepseek-llm-67b-base",
"deepseek-ai/deepseek-llm-7b-chat",
"deepseek-ai/DeepSeek-V2",
"deepseek-ai/DeepSeek-V2-Chat",
"deepseek-ai/DeepSeek-V3",
"deepseek-ai/DeepSeek-R1",
"deepseek-ai/DeepSeek-V3.1",
"rednote-hilab/dots.llm1.base",
"rednote-hilab/dots.llm1.inst",
"rednote-hilab/dots.ocr",
"baidu/ERNIE-4.5-0.3B-PT",
"baidu/ERNIE-4.5-21B-A3B-PT",
"baidu/ERNIE-4.5-300B-A47B-PT",
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
"LGAI-EXAONE/EXAONE-4.0-32B",
"mgleize/fairseq2-dummy-Llama-3.2-1B",
"tiiuae/falcon-7b",
"tiiuae/falcon-40b",
"tiiuae/falcon-rw-7b",
"tiiuae/falcon-mamba-7b",
"tiiuae/falcon-mamba-7b-instruct",
"tiiuae/Falcon-H1-34B-Base",
"tiiuae/Falcon-H1-34B-Instruct",
"allenai/FlexOlmo-7x7B-1T",
"allenai/FlexOlmo-7x7B-1T-RT",
"google/gemma-2b",
"google/gemma-1.1-2b-it",
"google/gemma-2-9b",
"google/gemma-2-27b",
"google/gemma-3-1b-it",
"google/gemma-3n-E2B-it",
"google/gemma-3n-E4B-it",
"zai-org/glm-4-9b-chat-hf",
"zai-org/GLM-4-32B-0414",
"zai-org/GLM-4.5",
"gpt2",
"gpt2-xl",
"bigcode/starcoder",
"bigcode/gpt_bigcode-santacoder",
"WizardLM/WizardCoder-15B-V1.0",
"EleutherAI/gpt-j-6b",
"nomic-ai/gpt4all-j",
"EleutherAI/gpt-neox-20b",
"EleutherAI/pythia-12b",
"OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5",
"databricks/dolly-v2-12b",
"stabilityai/stablelm-tuned-alpha-7b",
"openai/gpt-oss-120b",
"openai/gpt-oss-20b",
"ibm-granite/granite-3.0-2b-base",
"ibm-granite/granite-3.1-8b-instruct",
"ibm/PowerLM-3b",
"ibm-granite/granite-3.0-1b-a400m-base",
"ibm-granite/granite-3.0-3b-a800m-instruct",
"ibm/PowerMoE-3b",
"ibm-granite/granite-4.0-tiny-preview",
"parasail-ai/GritLM-7B-vllm",
"hpcai-tech/grok-1",
"tencent/Hunyuan-7B-Instruct",
"tencent/Hunyuan-A13B-Instruct",
"tencent/Hunyuan-A13B-Pretrain",
"tencent/Hunyuan-A13B-Instruct-FP8",
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
"internlm/internlm-7b",
"internlm/internlm-chat-7b",
"internlm/internlm2-7b",
"internlm/internlm2-chat-7b",
"internlm/internlm3-8b-instruct",
"inceptionai/jais-13b",
"inceptionai/jais-13b-chat",
"inceptionai/jais-30b-v3",
"inceptionai/jais-30b-chat-v3",
"ai21labs/AI21-Jamba-1.5-Large",
"ai21labs/AI21-Jamba-1.5-Mini",
"ai21labs/Jamba-v0.1",
"LiquidAI/LFM2-1.2B",
"LiquidAI/LFM2-700M",
"LiquidAI/LFM2-350M",
"LiquidAI/LFM2-8B-A1B-preview",
"meta-llama/Meta-Llama-3.1-405B-Instruct",
"meta-llama/Meta-Llama-3.1-70B",
"meta-llama/Meta-Llama-3-70B-Instruct",
"meta-llama/Llama-2-70b-hf",
"01-ai/Yi-34B",
"state-spaces/mamba-130m-hf",
"state-spaces/mamba-790m-hf",
"state-spaces/mamba-2.8b-hf",
"mistralai/Mamba-Codestral-7B-v0.1",
"XiaomiMiMo/MiMo-7B-RL",
"openbmb/MiniCPM-2B-sft-bf16",
"openbmb/MiniCPM-2B-dpo-bf16",
"openbmb/MiniCPM-S-1B-sft",
"openbmb/MiniCPM3-4B",
"MiniMaxAI/MiniMax-M2",
"mistralai/Mistral-7B-v0.1",
"mistralai/Mistral-7B-Instruct-v0.1",
"mistralai/Mixtral-8x7B-v0.1",
"mistralai/Mixtral-8x7B-Instruct-v0.1",
"mistral-community/Mixtral-8x22B-v0.1",
"mosaicml/mpt-7b",
"mosaicml/mpt-7b-storywriter",
"mosaicml/mpt-30b",
"nvidia/Minitron-8B-Base",
"mgoin/Nemotron-4-340B-Base-hf-FP8",
"nvidia/Nemotron-H-8B-Base-8K",
"nvidia/Nemotron-H-47B-Base-8K",
"nvidia/Nemotron-H-56B-Base-8K",
"allenai/OLMo-1B-hf",
"allenai/OLMo-7B-hf",
"allenai/OLMo-2-0425-1B",
"allenai/OLMoE-1B-7B-0924",
"allenai/OLMoE-1B-7B-0924-Instruct",
"facebook/opt-66b",
"facebook/opt-iml-max-30b",
"OrionStarAI/Orion-14B-Base",
"OrionStarAI/Orion-14B-Chat",
"microsoft/phi-1_5",
"microsoft/phi-2",
"microsoft/Phi-4-mini-instruct",
"microsoft/Phi-4",
"microsoft/Phi-3-mini-4k-instruct",
"microsoft/Phi-3-mini-128k-instruct",
"microsoft/Phi-3-medium-128k-instruct",
"microsoft/Phi-3.5-MoE-instruct",
"adept/persimmon-8b-base",
"adept/persimmon-8b-chat",
"pfnet/plamo-2-1b",
"pfnet/plamo-2-8b",
"Qwen/Qwen-7B",
"Qwen/Qwen-7B-Chat",
"Qwen/QwQ-32B-Preview",
"Qwen/Qwen2-7B-Instruct",
"Qwen/Qwen2-7B",
"Qwen/Qwen2.5-0.5B-Instruct",
"Qwen/Qwen1.5-MoE-A2.7B",
"Qwen/Qwen1.5-MoE-A2.7B-Chat",
"Qwen/Qwen3-8B",
"Qwen/Qwen3-30B-A3B",
"Qwen/Qwen3-Next-80B-A3B-Instruct",
"ByteDance-Seed/Seed-OSS-36B-Instruct",
"stabilityai/stablelm-3b-4e1t",
"stabilityai/stablelm-base-alpha-7b-v2",
"bigcode/starcoder2-3b",
"bigcode/starcoder2-7b",
"bigcode/starcoder2-15b",
"upstage/solar-pro-preview-instruct",
"Tele-AI/TeleChat2-3B",
"Tele-AI/TeleChat2-7B",
"Tele-AI/TeleChat2-35B",
"CofeAI/FLM-2-52B-Instruct-2407",
"CofeAI/Tele-FLM",
"xverse/XVERSE-7B-Chat",
"xverse/XVERSE-13B-Chat",
"xverse/XVERSE-65B-Chat",
"MiniMaxAI/MiniMax-M1-40k",
"MiniMaxAI/MiniMax-M1-80k",
"MiniMaxAI/MiniMax-Text-01",
"Zyphra/Zamba2-7B-instruct",
"Zyphra/Zamba2-2.7B-instruct",
"Zyphra/Zamba2-1.2B-instruct",
"meituan-longcat/LongCat-Flash-Chat",
"meituan-longcat/LongCat-Flash-Chat-FP8",
"rhymes-ai/Aria",
"CohereForAI/aya-vision-8b",
"CohereForAI/aya-vision-32b",
"Open-Bee/Bee-8B-RL",
"Open-Bee/Bee-8B-SFT",
"Salesforce/blip2-opt-2.7b",
"Salesforce/blip2-opt-6.7b",
"facebook/chameleon-7b",
"CohereLabs/command-a-vision-07-2025",
"deepseek-ai/deepseek-vl2-tiny",
"deepseek-ai/deepseek-vl2-small",
"deepseek-ai/deepseek-vl2",
"deepseek-ai/DeepSeek-OCR",
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
"baidu/ERNIE-4.5-VL-424B-A47B-PT",
"adept/fuyu-8b",
"google/gemma-3-4b-it",
"google/gemma-3-27b-it",
"zai-org/glm-4v-9b",
"zai-org/cogagent-9b-20241220",
"zai-org/GLM-4.1V-9B-Thinking",
"zai-org/GLM-4.5V",
"ibm-granite/granite-speech-3.3-8b",
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
"HuggingFaceM4/Idefics3-8B-Llama3",
"internlm/Intern-S1",
"internlm/Intern-S1-mini",
"OpenGVLab/InternVL3_5-14B",
"OpenGVLab/InternVL3-9B",
"OpenGVLab/InternVideo2_5_Chat_8B",
"OpenGVLab/InternVL2_5-4B",
"OpenGVLab/Mono-InternVL-2B",
"OpenGVLab/InternVL2-4B",
"OpenGVLab/InternVL3-1B-hf",
"Kwai-Keye/Keye-VL-8B-Preview",
"Kwai-Keye/Keye-VL-1_5-8B",
"moonshotai/Kimi-VL-A3B-Instruct",
"moonshotai/Kimi-VL-A3B-Thinking",
"lightonai/LightOnOCR-1B",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct",
"nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
"llava-hf/llava-1.5-7b-hf",
"TIGER-Lab/Mantis-8B-siglip-llama3",
"mistral-community/pixtral-12b",
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/llava-v1.6-vicuna-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
"llava-hf/llava-onevision-qwen2-7b-ov-hf",
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"mispeech/midashenglm-7b",
"openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2",
"openbmb/MiniCPM-Llama3-V-2_5",
"openbmb/MiniCPM-V-2_6",
"openbmb/MiniCPM-V-4",
"openbmb/MiniCPM-V-4_5",
"MiniMaxAI/MiniMax-VL-01",
"mistralai/Mistral-Small-3.1-24B-Instruct-2503",
"allenai/Molmo-7B-D-0924",
"allenai/Molmo-7B-O-0924",
"nvidia/NVLM-D-72B",
"AIDC-AI/Ovis2-1B",
"AIDC-AI/Ovis1.6-Llama3.2-3B",
"AIDC-AI/Ovis2.5-9B",
"google/paligemma-3b-pt-224",
"google/paligemma-3b-mix-224",
"google/paligemma2-3b-ft-docci-448",
"microsoft/Phi-3-vision-128k-instruct",
"microsoft/Phi-3.5-vision-instruct",
"microsoft/Phi-4-multimodal-instruct",
"mistralai/Pixtral-12B-2409",
"Qwen/Qwen-VL",
"Qwen/Qwen-VL-Chat",
"Qwen/Qwen2-Audio-7B-Instruct",
"Qwen/QVQ-72B-Preview",
"Qwen/Qwen2-VL-7B-Instruct",
"Qwen/Qwen2-VL-72B-Instruct",
"Qwen/Qwen2.5-VL-3B-Instruct",
"Qwen/Qwen2.5-VL-72B-Instruct",
"Qwen/Qwen2.5-Omni-3B",
"Qwen/Qwen2.5-Omni-7B",
"Qwen/Qwen3-VL-4B-Instruct",
"Qwen/Qwen3-VL-30B-A3B-Instruct",
"Qwen/Qwen3-Omni-30B-A3B-Instruct",
"Qwen/Qwen3-Omni-30B-A3B-Thinking",
"YannQi/R-4B",
"Skywork/Skywork-R1V-38B",
"SmolVLM2-2.2B-Instruct",
"stepfun-ai/step3",
"omni-search/Tarsier-7b",
"omni-search/Tarsier-34b",
"omni-research/Tarsier2-Recap-7b",
"omni-research/Tarsier2-7b-0115"
]
}
},
"required": ["huggingfaceName"]
},
"resources": {
"type": "object",
"description": "资源配置",
"properties": {
"gpuLimit": {
"type": "integer",
"description": "GPU 限制",
"default": 1,
"minimum": 1
},
"cpuRequest": {
"type": "integer",
"description": "CPU 请求",
"default": 12,
"minimum": 1
},
"memoryLimit": {
"type": "string",
"description": "内存限制",
"default": "16Gi",
"pattern": "^[0-9]+(\\.[0-9]+)?(Mi|Gi|Ti)$"
},
"shmSize": {
"type": "string",
"description": "共享内存大小",
"default": "20Gi",
"pattern": "^[0-9]+(\\.[0-9]+)?(Mi|Gi|Ti)$"
}
}
},
"workerSize": {
"type": "integer",
"description": "Worker 数量",
"default": 1,
"minimum": 1
},
"command": {
"type": "string",
"description": "自定义命令,模型路径路为 /Model/Weight/Qwen3-0.6B, LoRA 路径为 /Model/LoRA/Qwen3-0.6B (可选) \n e.g. vllm serve --model /Model/Weight/Qwen3-0.6B ",
"default": ""
}
}
}
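
Helm validates user-supplied values against this schema on lint, template, install, and upgrade, so an out-of-enum model name is rejected before anything reaches the cluster. A sketch:

helm lint vllm-serve --set model.huggingfaceName=not-a-real/model
# Error: values don't meet the specifications of the schema(s) in the following chart(s): vllm-serve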

vllm-serve/values.yaml (new file, 76 lines)

@ -0,0 +1,76 @@
# Default values for vllm-serve.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
# Secrets for pulling an image from a private registry. More information: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/
imagePullSecrets: []
imagePullPolicy: IfNotPresent
# This is to override the chart name.
nameOverride: ""
fullnameOverride: ""
# This section builds out the service account. More information: https://kubernetes.io/docs/concepts/security/service-accounts/
serviceAccount:
# Specifies whether a service account should be created
create: true
# Automatically mount a ServiceAccount's API credentials?
automount: true
# Annotations to add to the service account
annotations: {}
# The name of the service account to use.
# If not set and create is true, a name is generated using the fullname template
name: ""
# Model configuration
model:
huggingfaceName: "Qwen/Qwen2.5-0.5B-Instruct" # the only value the user needs to set
localMountPath: "/Model" # fixed PVC mount path
huggingfaceToken: "<your-hf-token>"
download: # automatic model download settings
image: "docker.io/vllm/vllm-openai:latest" # image that ships huggingface-cli
# Feature selection
resources:
gpuLimit: 1
cpuRequest: 12
memoryLimit: "16Gi"
shmSize: "20Gi"
svc:
type: LoadBalancer
port: 80
targetPort: 8000
# nodePort: 30080
# vLLM application configuration
vllm:
image: "docker.io/vllm/vllm-openai:latest"
command: ""
llama:
image: "docker.io/library/one-click:v1"
# lmdeploy application configuration
lmdeploy:
image: "docker.io/openmmlab/lmdeploy:latest-cu12"
# NFS PV/PVC configuration
nfs:
server: "10.6.80.11"
path: "/volume1/Dataset/PVStore/lab-data-model-pvc-c0beeab1-6dd5-4c6a-bd2c-6ce9e114c25e/Weight"
# storageClass: "local-path"
pvSize: "500Gi"
pvcSize: "50Gi"
# LeaderWorkerSet configuration
replicaCount: 1
workerSize: 1
nodeSelector: {}
tolerations: []
affinity: {}
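
Putting the defaults together, a minimal single-GPU install might look like this (a sketch; the token, model, and release name are placeholders):

cat > my-values.yaml <<'EOF'
model:
  huggingfaceName: "Qwen/Qwen2.5-0.5B-Instruct"
  huggingfaceToken: "hf_xxx"
resources:
  gpuLimit: 1
EOF
helm install demo ./vllm-serve -f my-values.yaml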