Compare commits

3 Commits
vllm ... main

Author SHA1 Message Date
89bc94a6a9 feat: support Hami on k3s and k8s
2025-12-05 17:09:01 +08:00
97c5d559e3 fix: debug lws without IB, and set --distributed-executor-backend ray as default
2025-12-04 09:47:43 +08:00
51163f0442 Merge pull request 'fix: fix bugs for initContainer in LWS' (#3) from vllm into main
Reviewed-on: #3
2025-12-02 08:39:07 +00:00
5 changed files with 118 additions and 32 deletions


@@ -5,4 +5,4 @@ appVersion: 1.16.0
description: A Helm chart for deploying vLLM with NFS storage
name: vllm-serve
type: application
version: 0.2.1
version: 0.3.0


@@ -16,7 +16,7 @@ spec:
initContainers:
# Model download as the first initContainer
- name: download-model
image: {{ .Values.model.download.image }}
image: alpine:latest
imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }}
env:
- name: HF_ENDPOINT
@@ -62,7 +62,7 @@ spec:
containers:
- name: vllm-leader
image: {{ .Values.vllm.image }}
imagePullPolicy: IfNotPresent
imagePullPolicy: {{ .Values.imagePullPolicy }}
securityContext:
capabilities:
add: [ "IPC_LOCK" ]
@@ -70,38 +70,74 @@ spec:
# - name: HUGGING_FACE_HUB_TOKEN
# value: {{ .Values.vllm.huggingfaceToken }}
- name: GLOO_SOCKET_IFNAME
value: eth0
value: {{ .Values.rdma.interface | default "eth0" | quote }}
- name: NCCL_SOCKET_IFNAME
value: eth0
- name: NCCL_IB_DISABLE
value: {{ .Values.rdma.interface | default "eth0" | quote }}
- name: RAY_DEDUP_LOGS
value: "0"
- name: NCCL_DEBUG
value: INFO
- name: NCCL_IB_HCA
value: mlx5_0:1
- name: NCCL_IB_GID_INDEX
value: "0" # 或 "7",根据你的网络配置而定
- name: RAY_DEDUP_LOGS
# Conditional RDMA configuration
{{- if .Values.rdma.enabled }}
- name: NCCL_IB_DISABLE
value: "0"
- name: NCCL_IB_HCA
value: {{ .Values.rdma.hca | default "^mlx5" | quote }}
- name: NCCL_IB_GID_INDEX
value: {{ .Values.rdma.gidIndex | default "0" | quote }} # or "7", depending on your network configuration
{{- else }}
# If RDMA is not enabled, explicitly disable IB so NCCL does not error out while probing for it
- name: NCCL_IB_DISABLE
value: "1"
{{- end }}
command:
- sh
- -c
{{- if .Values.command }}
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); {{ .Values.command }}"
{{- else }}
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
- |
# 1. Automatic RDMA detection logic (runs first, regardless of the command below)
# =======================================================
# echo "🔍 [Init] Detecting RDMA devices..."
# if [ -d "/sys/class/infiniband" ] && [ "$(ls -A /sys/class/infiniband)" ]; then
# echo "✅ [Init] RDMA devices found. Enabling NCCL IB."
# export NCCL_IB_DISABLE=0
# # If no HCA is set via the environment, default to the ^mlx5 prefix match
# export NCCL_IB_HCA=${NCCL_IB_HCA:-"^mlx5"}
# else
# echo "⚠️ [Init] No RDMA devices found. Falling back to TCP."
# export NCCL_IB_DISABLE=1
# export NCCL_NET_GDR_LEVEL=0
# fi
# echo "🚀 [Init] RDMA setup complete. NCCL_IB_DISABLE=$NCCL_IB_DISABLE"
{{- if .Values.command }}
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); {{ .Values.command }} --distributed-executor-backend ray
{{- else }}
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_PATH='{{ .Values.model.localMountPath }}/Weight/'$MODEL_NAME;
python3 -m vllm.entrypoints.openai.api_server --port 8000 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --trust_remote_code"
{{- end }}
python3 -m vllm.entrypoints.openai.api_server --port 8000 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --distributed-executor-backend ray --trust_remote_code
{{- end }}
resources:
limits:
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
memory: {{ .Values.resources.memoryLimit }}
ephemeral-storage: 10Gi
rdma/rdma_shared_device_a: 10
{{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
{{- end }}
{{- if .Values.rdma.enabled }}
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
{{- end }}
requests:
ephemeral-storage: 10Gi
cpu: {{ .Values.resources.cpuRequest }}
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
{{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
{{- end }}
{{- if .Values.rdma.enabled }}
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
{{- end }}
ports:
- containerPort: 8000
name: http
@@ -150,33 +186,63 @@ spec:
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
- |
# 1. Automatic RDMA detection logic
# =======================================================
# echo "🔍 [Init] Detecting RDMA devices..."
# if [ -d "/sys/class/infiniband" ] && [ "$(ls -A /sys/class/infiniband)" ]; then
# echo "✅ [Init] RDMA devices found. Enabling NCCL IB."
# export NCCL_IB_DISABLE=0
# export NCCL_IB_HCA=${NCCL_IB_HCA:-"^mlx5"}
# else
# echo "⚠️ [Init] No RDMA devices found. Falling back to TCP."
# export NCCL_IB_DISABLE=1
# export NCCL_NET_GDR_LEVEL=0
# fi
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)
resources:
limits:
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
memory: {{ .Values.resources.memoryLimit }}
ephemeral-storage: 10Gi
rdma/rdma_shared_device_a: 10
{{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
{{- end }}
{{- if .Values.rdma.enabled }}
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
{{- end }}
requests:
ephemeral-storage: 10Gi
cpu: {{ .Values.resources.cpuRequest }}
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
{{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
{{- end }}
{{- if .Values.rdma.enabled }}
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
{{- end }}
env:
# - name: HUGGING_FACE_HUB_TOKEN
# value: {{ .Values.vllm.huggingfaceToken }}
- name: GLOO_SOCKET_IFNAME
value: eth0
value: {{ .Values.rdma.interface | default "eth0" | quote }}
- name: NCCL_SOCKET_IFNAME
value: eth0
- name: NCCL_IB_DISABLE
value: "0"
value: {{ .Values.rdma.interface | default "eth0" | quote }}
- name: NCCL_DEBUG
value: INFO
- name: NCCL_IB_HCA
value: mlx5_0:1
- name: NCCL_IB_GID_INDEX
value: "0" # 或 "7",根据你的网络配置而定
- name: RAY_DEDUP_LOGS
value: "0"
{{- if .Values.rdma.enabled }}
- name: NCCL_IB_DISABLE
value: "0"
- name: NCCL_IB_HCA
value: {{ .Values.rdma.hca | default "^mlx5" | quote }}
- name: NCCL_IB_GID_INDEX
value: {{ .Values.rdma.gidIndex | default "0" | quote }}
{{- else }}
- name: NCCL_IB_DISABLE
value: "1"
{{- end }}
volumeMounts:
- mountPath: /dev/shm
name: dshm
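Note that the RDMA auto-detection preamble is left commented out in both the leader and worker start commands above; whether InfiniBand is used is instead decided at template time by the .Values.rdma.enabled switch. As a rough, illustrative sketch (not the rendered template), re-enabling the probe in the worker command would look roughly like this, with the ^mlx5 prefix taken from the commented block:

command:
  - sh
  - -c
  - |
    # Probe /sys/class/infiniband; enable NCCL over IB only if an RDMA device is present,
    # otherwise force plain TCP so NCCL does not fail while probing for IB.
    if [ -d /sys/class/infiniband ] && [ "$(ls -A /sys/class/infiniband)" ]; then
      export NCCL_IB_DISABLE=0
      export NCCL_IB_HCA=${NCCL_IB_HCA:-"^mlx5"}
    else
      export NCCL_IB_DISABLE=1
      export NCCL_NET_GDR_LEVEL=0
    fi
    bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)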


@@ -92,12 +92,18 @@ spec:
resources:
limits:
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
{{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
{{- end }}
memory: {{ .Values.resources.memoryLimit }}
ephemeral-storage: 10Gi
cpu: {{ .Values.resources.cpuRequest }}
requests:
ephemeral-storage: 10Gi
cpu: {{ .Values.resources.cpuRequest }}
{{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
{{- end }}
ports:
- containerPort: 8000
name: http
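The gpuMem value feeds the HAMi vGPU resource added above: when it is greater than 0, an nvidia.com/gpumem entry (memory in MB) is rendered next to the whole-GPU count; when it is 0, the card is used exclusively and the entry is omitted. Assuming the chart defaults plus gpuMem: 8000 (an illustrative value), the limits would render roughly as:

resources:
  limits:
    nvidia.com/gpu: "1"
    nvidia.com/gpumem: 8000      # HAMi vGPU memory limit in MB; omitted when gpuMem is 0
    memory: 16Gi
    ephemeral-storage: 10Gi
    cpu: 12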


@@ -593,6 +593,12 @@
"default": 1,
"minimum": 1
},
"gpuMem": {
"type": "integer",
"description": "GPU 显存限制单位MB, 0表示独占卡",
"default": 0,
"minimum": 0
},
"cpuRequest": {
"type": "integer",
"description": "CPU 请求",


@@ -27,17 +27,25 @@ model:
huggingfaceName: "Qwen/Qwen2.5-0.5B-Instruct" # the only field the user needs to set
localMountPath: "/Model" # fixed PVC mount path
huggingfaceToken: "<your-hf-token>"
download: # enables automatic download
image: "docker.io/vllm/vllm-openai:latest" # image that ships huggingface-cli
# Feature selection
resources:
gpuLimit: 1
gpuMem: 0
cpuRequest: 12
memoryLimit: "16Gi"
shmSize: "20Gi"
# RDMA configuration
rdma:
enabled: false # switch: off by default to avoid errors on nodes without RDMA
interface: eth0 # NIC used for NCCL/GLOO traffic (may be ib0 or bond0 when RDMA is present)
resourceName: "rdma/rdma_shared_device_a" # RDMA resource name (depends on your k8s device plugin)
resourceCount: 5 # number of RDMA devices per Pod
hca: "mlx5_0:1" # specific HCA device (or use ^mlx5 for prefix matching)
gidIndex: "0" # RoCEv2 usually requires an explicit GID
svc:
type: LoadBalancer
port: 80
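On a cluster that actually has RDMA hardware, the defaults above would typically be overridden at install time. A hedged example override follows; the resource name, HCA prefix, and interface depend on the cluster's RDMA device plugin and fabric, so treat these values as placeholders:

# rdma-values.yaml -- example override for an InfiniBand/RoCE cluster
rdma:
  enabled: true
  interface: ib0                              # NIC carrying NCCL/GLOO traffic
  resourceName: "rdma/rdma_shared_device_a"   # as exposed by the RDMA device plugin
  resourceCount: 1
  hca: "^mlx5"                                # prefix match instead of a single port
  gidIndex: "0"                               # a different index (e.g. "7") may be needed for RoCEv2

This could then be applied with something like helm upgrade --install vllm-serve ./vllm-serve -f rdma-values.yaml.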