Compare commits
10 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 89bc94a6a9 | |
| | 97c5d559e3 | |
| | 51163f0442 | |
| | 69da2b8c5b | |
| | 4e9a086547 | |
| | 5aebfc5cdf | |
| | 210c967135 | |
| | 9e08afdcb2 | |
| | fccbb10208 | |
| | ad350ee5ad | |
@@ -1,4 +1,4 @@
# List each Helm chart directory to package and push (one per line)
# vllm-serve
vllm-serve
# code-server-chart
open-webui
# open-webui
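For context, here is a minimal sketch of how a chart list like the one above could drive a package-and-push step in CI. The file name `charts.txt`, the `./dist` output directory, and the OCI registry URL are assumptions for illustration only; they are not part of this change.

```bash
#!/usr/bin/env bash
# Hypothetical CI step: package and push every uncommented chart directory
# listed one-per-line in charts.txt (assumed file name).
set -euo pipefail

REGISTRY="oci://registry.example.com/helm-charts"   # assumed registry URL
mkdir -p dist

grep -vE '^[[:space:]]*(#|$)' charts.txt | while read -r chart_dir; do
  helm package "$chart_dir" --destination ./dist
  chart_name=$(basename "$chart_dir")
  helm push ./dist/"${chart_name}"-*.tgz "$REGISTRY"
done
```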
@@ -5,4 +5,4 @@ appVersion: 1.16.0
description: A Helm chart for deploying vLLM with NFS storage
name: vllm-serve
type: application
version: 0.2.0
version: 0.3.0
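The chart `version` bump to 0.3.0 changes the packaged artifact name; assuming the chart lives at `./vllm-serve`, repackaging would produce `vllm-serve-0.3.0.tgz`:

```bash
# Helm names the archive from Chart.yaml's name/version fields.
helm lint ./vllm-serve
helm package ./vllm-serve   # -> vllm-serve-0.3.0.tgz (chart path assumed)
```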
@@ -16,7 +16,7 @@ spec:
initContainers:
# Model download runs as the first initContainer
- name: download-model
image: {{ .Values.model.download.image }}
image: alpine:latest
imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }}
env:
- name: HF_ENDPOINT
@@ -31,26 +31,38 @@ spec:
DEST_DIR="{{ .Values.model.localMountPath }}/Weight/$MODEL_NAME"
# DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
# Check whether the model already exists; download it if not
echo "DEST_DIR= $DEST_DIR"
if [ ! -f "$DEST_DIR/config.json" ]; then
ls -l {{ .Values.model.localMountPath }}
echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
wget https://hf-mirror.com/hfd/hfd.sh
chmod a+x hfd.sh
apt update && apt upgrade
apt install aria2 -y
./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
# huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
else
echo "Model already exists at $DEST_DIR"
# echo "DEST_DIR= $DEST_DIR"
# if [ ! -f "$DEST_DIR/config.json" ]; then
# ls -l {{ .Values.model.localMountPath }}
# echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
# wget https://hf-mirror.com/hfd/hfd.sh
# chmod a+x hfd.sh
# apt update && apt upgrade
# apt install aria2 -y
# ./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
# # huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
# else
# echo "Model already exists at $DEST_DIR"
# fi
SUCCESS_FLAG="${DEST_DIR}/.success_flag"
if [ -f "$SUCCESS_FLAG" ]; then
echo "✅ Success flag found. Skipping download."
exit 0
fi
echo "⬇️ Starting download..."
apk add --no-cache bash aria2 wget ca-certificates curl
wget https://hf-mirror.com/hfd/hfd.sh -O hfd.sh && chmod +x hfd.sh
./hfd.sh {{ .Values.model.huggingfaceName }} --tool aria2c -x 8 --local-dir "$DEST_DIR"

touch "$SUCCESS_FLAG"
echo "🎉 Done."
volumeMounts:
- name: weight-volume
mountPath: {{ .Values.model.localMountPath }}
containers:
- name: vllm-leader
image: {{ .Values.vllm.image }}
imagePullPolicy: IfNotPresent
imagePullPolicy: {{ .Values.imagePullPolicy }}
securityContext:
capabilities:
add: [ "IPC_LOCK" ]
@@ -58,45 +70,81 @@ spec:
# - name: HUGGING_FACE_HUB_TOKEN
# value: {{ .Values.vllm.huggingfaceToken }}
- name: GLOO_SOCKET_IFNAME
value: eth0
value: {{ .Values.rdma.interface | default "eth0" | quote }}
- name: NCCL_SOCKET_IFNAME
value: eth0
- name: NCCL_IB_DISABLE
value: {{ .Values.rdma.interface | default "eth0" | quote }}
- name: RAY_DEDUP_LOGS
value: "0"
- name: NCCL_DEBUG
value: INFO
- name: NCCL_IB_HCA
value: mlx5_0:1
- name: NCCL_IB_GID_INDEX
value: "0" # or "7", depending on your network configuration
- name: RAY_DEDUP_LOGS

# Conditional RDMA configuration
{{- if .Values.rdma.enabled }}
- name: NCCL_IB_DISABLE
value: "0"
- name: NCCL_IB_HCA
value: {{ .Values.rdma.hca | default "^mlx5" | quote }}
- name: NCCL_IB_GID_INDEX
value: {{ .Values.rdma.gidIndex | default "0" | quote }} # or "7", depending on your network configuration
{{- else }}
# If RDMA is not enabled, explicitly disable IB so NCCL does not fail while probing
- name: NCCL_IB_DISABLE
value: "1"
{{- end }}
command:
- sh
- -c
{{- if .Values.command }}
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); {{ .Values.command }}"
{{- else }}
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
- |
# 1. Automatic RDMA detection logic (runs before whatever command follows)
# =======================================================
# echo "🔍 [Init] Detecting RDMA devices..."
# if [ -d "/sys/class/infiniband" ] && [ "$(ls -A /sys/class/infiniband)" ]; then
# echo "✅ [Init] RDMA devices found. Enabling NCCL IB."
# export NCCL_IB_DISABLE=0
# # If no HCA is set in the environment, default to the ^mlx5 prefix match
# export NCCL_IB_HCA=${NCCL_IB_HCA:-"^mlx5"}
# else
# echo "⚠️ [Init] No RDMA devices found. Falling back to TCP."
# export NCCL_IB_DISABLE=1
# export NCCL_NET_GDR_LEVEL=0
# fi
# echo "🚀 [Init] RDMA setup complete. NCCL_IB_DISABLE=$NCCL_IB_DISABLE"

{{- if .Values.command }}
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); {{ .Values.command }} --distributed-executor-backend ray
{{- else }}
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_PATH='{{ .Values.model.localMountPath }}/Weight/'$MODEL_NAME;
python3 -m vllm.entrypoints.openai.api_server --port 8000 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --trust_remote_code"
{{- end }}
python3 -m vllm.entrypoints.openai.api_server --port 8000 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --distributed-executor-backend ray --trust_remote_code
{{- end }}
resources:
limits:
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
memory: {{ .Values.resources.memoryLimit }}
ephemeral-storage: 10Gi
rdma/rdma_shared_device_a: 10
{{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
{{- end }}
{{- if .Values.rdma.enabled }}
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
{{- end }}
requests:
ephemeral-storage: 10Gi
cpu: {{ .Values.resources.cpuRequest }}
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
{{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
{{- end }}
{{- if .Values.rdma.enabled }}
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
{{- end }}
ports:
- containerPort: 8000
name: http
readinessProbe:
tcpSocket:
#httpGet:
#path: /health
# tcpSocket:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 120
periodSeconds: 20
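The new `rdma.enabled` switch gates both the NCCL InfiniBand env vars and the RDMA device resources in the leader spec. A quick way to sanity-check both branches is to render the chart locally and inspect what lands in the manifests; the release name `vllm` and chart path `./vllm-serve` are assumptions:

```bash
# RDMA disabled (default): NCCL_IB_DISABLE should render as "1" and no rdma/ resource.
helm template vllm ./vllm-serve | grep -A1 -E 'NCCL_IB|rdma/'

# RDMA enabled: IB env vars and the configured rdma resource should appear.
helm template vllm ./vllm-serve \
  --set rdma.enabled=true \
  --set rdma.interface=ib0 \
  --set rdma.hca='^mlx5' \
  --set rdma.gidIndex=3 | grep -A1 -E 'NCCL_IB|rdma/'
```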
@@ -138,33 +186,63 @@ spec:
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
- |
# 1. Automatic RDMA detection logic
# =======================================================
# echo "🔍 [Init] Detecting RDMA devices..."
# if [ -d "/sys/class/infiniband" ] && [ "$(ls -A /sys/class/infiniband)" ]; then
# echo "✅ [Init] RDMA devices found. Enabling NCCL IB."
# export NCCL_IB_DISABLE=0
# export NCCL_IB_HCA=${NCCL_IB_HCA:-"^mlx5"}
# else
# echo "⚠️ [Init] No RDMA devices found. Falling back to TCP."
# export NCCL_IB_DISABLE=1
# export NCCL_NET_GDR_LEVEL=0
# fi
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)
resources:
limits:
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
memory: {{ .Values.resources.memoryLimit }}
ephemeral-storage: 10Gi
rdma/rdma_shared_device_a: 10
{{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
{{- end }}
{{- if .Values.rdma.enabled }}
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
{{- end }}
requests:
ephemeral-storage: 10Gi
cpu: {{ .Values.resources.cpuRequest }}
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
{{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
{{- end }}
{{- if .Values.rdma.enabled }}
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
{{- end }}
env:
# - name: HUGGING_FACE_HUB_TOKEN
# value: {{ .Values.vllm.huggingfaceToken }}
- name: GLOO_SOCKET_IFNAME
value: eth0
value: {{ .Values.rdma.interface | default "eth0" | quote }}
- name: NCCL_SOCKET_IFNAME
value: eth0
- name: NCCL_IB_DISABLE
value: "0"
value: {{ .Values.rdma.interface | default "eth0" | quote }}
- name: NCCL_DEBUG
value: INFO
- name: NCCL_IB_HCA
value: mlx5_0:1
- name: NCCL_IB_GID_INDEX
value: "0" # or "7", depending on your network configuration
- name: RAY_DEDUP_LOGS
value: "0"

{{- if .Values.rdma.enabled }}
- name: NCCL_IB_DISABLE
value: "0"
- name: NCCL_IB_HCA
value: {{ .Values.rdma.hca | default "^mlx5" | quote }}
- name: NCCL_IB_GID_INDEX
value: {{ .Values.rdma.gidIndex | default "0" | quote }}
{{- else }}
- name: NCCL_IB_DISABLE
value: "1"
{{- end }}

volumeMounts:
- mountPath: /dev/shm
name: dshm
@@ -190,4 +268,4 @@ spec:
tolerations:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- end }}
{{- end }}
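Once the LeaderWorkerSet rolls out, the download init container and the leader's new `/health` readiness probe give two quick checkpoints. A rough verification sequence follows; the pod/label names and namespace are assumptions based on the usual LWS naming, not guaranteed by this chart:

```bash
# Watch leader and worker pods come up (label and name assumed).
kubectl get pods -l leaderworkerset.sigs.k8s.io/name=vllm -w

# Follow the model download in the leader's init container (pod name assumed).
kubectl logs vllm-0 -c download-model -f

# Hit the endpoint the readinessProbe now uses.
kubectl port-forward pod/vllm-0 8000:8000 &
curl -s http://localhost:8000/health
```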
@@ -16,7 +16,7 @@ spec:
initContainers:
# Model download runs as the first initContainer
- name: download-model
image: {{ .Values.model.download.image }}
image: alpine:latest
imagePullPolicy: IfNotPresent
env:
- name: HF_ENDPOINT
@@ -24,33 +24,46 @@ spec:
- name: HUGGING_FACE_HUB_TOKEN
value: {{ .Values.model.huggingfaceToken }}
command:
- sh
- /bin/sh
- -c
- |
set -e
MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
DEST_DIR="{{ .Values.model.localMountPath }}/Weight/$MODEL_NAME"
SUCCESS_FLAG="${DEST_DIR}/.success_flag"
# DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
# Check whether the model already exists; download it if not
echo "DEST_DIR= $DEST_DIR"
if [ ! -f "$DEST_DIR/config.json" ]; then
ls -l {{ .Values.model.localMountPath }}
echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
wget https://hf-mirror.com/hfd/hfd.sh
chmod a+x hfd.sh
apt update && apt upgrade
apt install aria2 -y
./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
# huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
else
echo "Model already exists at $DEST_DIR"
# echo "DEST_DIR= $DEST_DIR"
# if [ ! -f "$DEST_DIR/config.json" ]; then
# ls -l {{ .Values.model.localMountPath }}
# echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
# wget https://hf-mirror.com/hfd/hfd.sh
# chmod a+x hfd.sh
# apt update && apt upgrade
# apt install aria2 -y
# ./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
# # huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
# else
# echo "Model already exists at $DEST_DIR"
# fi
if [ -f "$SUCCESS_FLAG" ]; then
echo "✅ Success flag found. Skipping download."
exit 0
fi
echo "⬇️ Starting download..."
apk add --no-cache bash aria2 wget ca-certificates curl
wget https://hf-mirror.com/hfd/hfd.sh -O hfd.sh && chmod +x hfd.sh
./hfd.sh {{ .Values.model.huggingfaceName }} --tool aria2c -x 8 --local-dir "$DEST_DIR"

touch "$SUCCESS_FLAG"
echo "🎉 Done."
volumeMounts:
- name: weight-volume
mountPath: {{ .Values.model.localMountPath }}
containers:
- name: vllm-pod
image: {{ .Values.vllm.image }}
imagePullPolicy: IfNotPresent
imagePullPolicy: {{ .Values.imagePullPolicy }}
env:
- name: HUGGING_FACE_HUB_TOKEN
value: {{ .Values.vllm.huggingfaceToken }}
@@ -79,11 +92,18 @@ spec:
resources:
limits:
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
{{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
{{- end }}
memory: {{ .Values.resources.memoryLimit }}
ephemeral-storage: 10Gi
cpu: {{ .Values.resources.cpuRequest }}
requests:
ephemeral-storage: 10Gi
cpu: {{ .Values.resources.cpuRequest }}
{{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
{{- end }}
ports:
- containerPort: 8000
name: http
@@ -124,4 +144,4 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
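Both the multi-node and single-pod variants ultimately start `vllm.entrypoints.openai.api_server` on port 8000, so the Service (port 80 per values.yaml) speaks the OpenAI-compatible API. A smoke test might look like the following; the service address and the served model id (which echoes the local `--model` path under `/Model/Weight/...`) are assumptions:

```bash
# List the models the server is serving (the id mirrors the --model path).
curl -s http://<service-ip>/v1/models

# Minimal chat completion against the served model (model id assumed).
curl -s http://<service-ip>/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "/Model/Weight/Qwen2.5-0.5B-Instruct",
        "messages": [{"role": "user", "content": "Say hello"}],
        "max_tokens": 32
      }'
```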
@@ -440,7 +440,142 @@
"tencent/DRIVE-RL",
"tencent/DRIVE-SFT",
"Qwen/Qwen2.5-14B",
"BAAI/bge-base-zh-v1.5"
"BAAI/bge-base-zh-v1.5",
"01-ai/Yi-1.5-34B",
"01-ai/Yi-1.5-6B",
"01-ai/Yi-1.5-9B",
"01-ai/Yi-34B-Chat",
"ByteDance/Dolphin-1.5",
"ByteDance/Ouro-1.4B",
"ByteDance/Ouro-1.4B-Thinking",
"ByteDance/Ouro-2.6B",
"ByteDance/Ouro-2.6B-Thinking",
"OpenGVLab/InternVL3-38B",
"OpenGVLab/InternVL3-78B",
"OpenGVLab/InternVL3_5-1B",
"OpenGVLab/InternVL3_5-30B-A3B-Instruct",
"OpenGVLab/InternVL3_5-38B",
"OpenGVLab/InternVL3_5-8B",
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
"OpenGVLab/Mini-InternVL2-2B-DA-Medical",
"OpenGVLab/SDLM-32B-D4",
"Qwen/CodeQwen1.5-7B-Chat",
"Qwen/Qwen1.5-1.8B",
"Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
"Qwen/Qwen2-0.5B",
"Qwen/Qwen2-Math-72B-Instruct",
"Qwen/Qwen2.5-0.5B",
"Qwen/Qwen2.5-1.5B",
"Qwen/Qwen2.5-1.5B-Instruct",
"Qwen/Qwen2.5-1.5B-Instruct-GGUF",
"Qwen/Qwen2.5-14B-Instruct",
"Qwen/Qwen2.5-14B-Instruct-1M",
"Qwen/Qwen2.5-32B-Instruct",
"Qwen/Qwen2.5-32B-Instruct-AWQ",
"Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4",
"Qwen/Qwen2.5-3B-Instruct",
"Qwen/Qwen2.5-3B-Instruct-GGUF",
"Qwen/Qwen2.5-72B-Instruct",
"Qwen/Qwen2.5-7B",
"Qwen/Qwen2.5-7B-Instruct",
"Qwen/Qwen2.5-7B-Instruct-1M",
"Qwen/Qwen2.5-7B-Instruct-GGUF",
"Qwen/Qwen2.5-Coder-0.5B",
"Qwen/Qwen2.5-Coder-1.5B",
"Qwen/Qwen2.5-Coder-1.5B-Instruct",
"Qwen/Qwen2.5-Coder-14B-Instruct",
"Qwen/Qwen2.5-Coder-32B-Instruct",
"Qwen/Qwen2.5-Coder-3B-Instruct-GGUF",
"Qwen/Qwen2.5-Coder-7B-Instruct",
"Qwen/Qwen2.5-Coder-7B-Instruct-GGUF",
"Qwen/Qwen2.5-Math-72B",
"Qwen/Qwen2.5-Math-7B",
"Qwen/Qwen2.5-VL-32B-Instruct",
"Qwen/Qwen2.5-VL-72B-Instruct-AWQ",
"Qwen/Qwen2.5-VL-7B-Instruct",
"Qwen/Qwen3-0.6B",
"Qwen/Qwen3-0.6B-Base",
"Qwen/Qwen3-1.7B",
"Qwen/Qwen3-1.7B-FP8",
"Qwen/Qwen3-14B-FP8",
"Qwen/Qwen3-14B-GGUF",
"Qwen/Qwen3-30B-A3B-Base",
"Qwen/Qwen3-30B-A3B-Thinking-2507-FP8",
"Qwen/Qwen3-Next-80B-A3B-Thinking-FP8",
"Qwen/Qwen3-VL-4B-Instruct-FP8",
"baichuan-inc/Baichuan-M2-32B-GPTQ-Int4",
"baidu/ERNIE-4.5-0.3B-Base-PT",
"baidu/ERNIE-4.5-21B-A3B-Base-PT",
"baidu/ERNIE-4.5-VL-28B-A3B-Base-PT",
"baidu/ERNIE-4.5-VL-28B-A3B-Thinking",
"baidu/ERNIE-4.5-VL-424B-A47B-Base-PT",
"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"deepseek-ai/DeepSeek-V3.1-Terminus",
"deepseek-ai/DeepSeek-V3.2-Exp",
"deepseek-ai/deepseek-coder-1.3b-base",
"deepseek-ai/deepseek-coder-1.3b-instruct",
"deepseek-ai/deepseek-coder-6.7b-instruct",
"google/codegemma-1.1-2b-GGUF",
"google/gemma-3-27b-pt",
"google/paligemma2-28b-pt-896",
"google/reformer-crime-and-punishment",
"google/reformer-enwik8",
"google/t5-11b-ssm-nq",
"google/t5-3b-ssm-nq",
"google/t5-large-ssm-nq",
"google/t5-small-ssm-nq",
"google/t5-xl-ssm-nq",
"google/t5-xxl-ssm-nq",
"google/t5gemma-ml-ml-ul2-it",
"internlm/Spatial-SSRL-7B",
"llava-hf/llava-1.5-13b-hf",
"llava-hf/llava-v1.6-34b-hf",
"meta-llama/Llama-3.1-405B",
"meta-llama/Llama-3.1-405B-Instruct",
"meta-llama/Llama-3.1-70B",
"meta-llama/Llama-3.1-70B-Instruct",
"meta-llama/Llama-3.1-8B",
"meta-llama/Llama-3.1-8B-Instruct",
"meta-llama/Llama-3.2-11B-Vision-Instruct",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-3B",
"meta-llama/Llama-3.2-3B-Instruct",
"meta-llama/Llama-3.2-90B-Vision",
"meta-llama/Llama-3.2-90B-Vision-Instruct",
"meta-llama/Llama-3.3-70B-Instruct",
"microsoft/MediPhi-Instruct",
"microsoft/Phi-3-medium-4k-instruct-onnx-cpu",
"microsoft/Phi-3.5-mini-instruct",
"microsoft/bitnet-b1.58-2B-4T",
"microsoft/bitnet-b1.58-2B-4T-gguf",
"microsoft/kosmos-2.5",
"microsoft/kosmos-2.5-chat",
"microsoft/llava-med-v1.5-mistral-7b",
"mistralai/Mistral-7B-Instruct-v0.2",
"moonshotai/Kimi-Dev-72B",
"moonshotai/Kimi-K2-Base",
"moonshotai/Kimi-K2-Instruct",
"moonshotai/Kimi-K2-Instruct-0905",
"moonshotai/Kimi-K2-Thinking",
"moonshotai/Kimi-Linear-48B-A3B-Base",
"moonshotai/Kimi-Linear-48B-A3B-Instruct",
"moonshotai/Moonlight-16B-A3B",
"openbmb/MiniCPM4.1-8B",
"tencent/DeepSeek-V3.1-Terminus-W4AFP8",
"tencent/Hunyuan-0.5B-Pretrain",
"zai-org/GLM-4-9B-0414",
"zai-org/GLM-4.1V-9B-Base",
"zai-org/GLM-4.5-Air",
"zai-org/GLM-4.5V-FP8",
"zai-org/GLM-4.6",
"zai-org/GLM-4.6-FP8",
"zai-org/GLM-Z1-32B-0414",
"zai-org/GLM-Z1-9B-0414",
"zai-org/Glyph",
"zai-org/UI2Code_N",
"zai-org/WebVIA-Agent",
"zai-org/codegeex4-all-9b"
]
}
},
@@ -458,6 +593,12 @@
"default": 1,
"minimum": 1
},
"gpuMem": {
"type": "integer",
"description": "GPU memory limit in MB; 0 means exclusive use of the whole GPU",
"default": 0,
"minimum": 0
},
"cpuRequest": {
"type": "integer",
"description": "CPU request",
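Because Helm validates supplied values against values.schema.json at install/upgrade/template time, the new `gpuMem` entry gives early feedback on bad input: a negative value violates `"minimum": 0`, while `0` keeps the GPU exclusive and the templates skip the `nvidia.com/gpumem` resource entirely. Chart path and release name below are assumptions:

```bash
# Should be rejected by the schema because gpuMem has "minimum": 0.
helm template vllm ./vllm-serve --set resources.gpuMem=-1

# Valid: 0 means exclusive GPU, so no gpumem resource is rendered.
helm template vllm ./vllm-serve --set resources.gpuMem=0 | grep gpumem \
  || echo "no gpumem resource rendered"
```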
@@ -27,17 +27,25 @@ model:
huggingfaceName: "Qwen/Qwen2.5-0.5B-Instruct" # The only value the user needs to set
localMountPath: "/Model" # Fixed PVC mount path
huggingfaceToken: "<your-hf-token>"
download: # Enable automatic download
image: "docker.io/vllm/vllm-openai:latest" # Image that includes huggingface-cli

# Feature selection

resources:
gpuLimit: 1
gpuMem: 0
cpuRequest: 12
memoryLimit: "16Gi"
shmSize: "20Gi"

# RDMA configuration
rdma:
enabled: false # Toggle: disabled by default to avoid errors on nodes without RDMA
interface: eth0 # NIC used for NCCL/GLOO traffic (with RDMA this may be ib0 or bond0)
resourceName: "rdma/rdma_shared_device_a" # RDMA resource name (depends on your k8s device plugin)
resourceCount: 5 # Number of RDMA devices required per Pod
hca: "mlx5_0:1" # Specific HCA device (or use ^mlx5 for prefix matching)
gidIndex: "0" # RoCEv2 usually requires specifying a GID

svc:
type: LoadBalancer
port: 80
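Putting the new values together, a typical install overrides the model, resource, and RDMA settings on the command line; the release name, namespace, and chart location are assumptions:

```bash
# Example install with a larger model, a per-pod GPU memory cap, and RDMA enabled.
helm install vllm ./vllm-serve \
  --namespace vllm --create-namespace \
  --set model.huggingfaceName="Qwen/Qwen2.5-7B-Instruct" \
  --set resources.gpuLimit=1 \
  --set resources.gpuMem=24000 \
  --set rdma.enabled=true \
  --set rdma.interface=ib0 \
  --set rdma.resourceCount=1
```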