helm-charts/vllm-serve/templates/lws.yaml
{{- if gt (int .Values.workerSize) 1 }}
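# Multi-node vLLM serving via LeaderWorkerSet; this template renders only when
# .Values.workerSize > 1. A minimal values.yaml sketch covering the keys this
# template consumes (the concrete values are illustrative, not chart defaults):
#
#   replicaCount: 1
#   workerSize: 2                 # pods per group: 1 leader + (N-1) workers
#   imagePullPolicy: IfNotPresent
#   vllm:
#     image: vllm/vllm-openai:latest
#   model:
#     huggingfaceName: Qwen/Qwen2.5-7B-Instruct
#     huggingfaceToken: ""
#     localMountPath: /models
#     download:
#       image: alpine:3.19        # must be apk-based; the script installs aria2/wget
#   rdma:
#     enabled: false
#     interface: eth0
#   resources:
#     gpuLimit: 8
#     cpuRequest: "8"
#     memoryLimit: 64Gi
#     shmSize: 16Gi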
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: {{ .Release.Name }}
spec:
replicas: {{ .Values.replicaCount }}
leaderWorkerTemplate:
size: {{ .Values.workerSize }}
restartPolicy: RecreateGroupOnPodRestart
leaderTemplate:
metadata:
labels:
role: leader
spec:
initContainers:
# Model download runs as the first initContainer
- name: download-model
image: {{ .Values.model.download.image }}
imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }}
env:
- name: HF_ENDPOINT
value: https://hf-mirror.com
- name: HUGGING_FACE_HUB_TOKEN
value: {{ .Values.model.huggingfaceToken | quote }}
command:
- sh
- -c
- |
MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
DEST_DIR="{{ .Values.model.localMountPath }}/Weight/$MODEL_NAME"
# Skip the download when a previous run already completed successfully.
SUCCESS_FLAG="${DEST_DIR}/.success_flag"
if [ -f "$SUCCESS_FLAG" ]; then
echo "✅ Success flag found. Skipping download."
exit 0
fi
echo "⬇️ Starting download..."
set -e  # fail the init container (and skip the success flag) if any step errors
apk add --no-cache bash aria2 wget ca-certificates curl
wget https://hf-mirror.com/hfd/hfd.sh -O hfd.sh && chmod +x hfd.sh
./hfd.sh {{ .Values.model.huggingfaceName }} --tool aria2c -x 8 --local-dir "$DEST_DIR"
touch "$SUCCESS_FLAG"
echo "🎉 Done."
volumeMounts:
- name: weight-volume
mountPath: {{ .Values.model.localMountPath }}
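# The initContainer and the vLLM containers mount the same PVC-backed
# weight-volume, so the download above effectively runs once per volume: the
# .success_flag check short-circuits every later pod (re)start in the group.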
containers:
- name: vllm-leader
image: {{ .Values.vllm.image }}
imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }}
securityContext:
capabilities:
add: [ "IPC_LOCK" ]
env:
- name: GLOO_SOCKET_IFNAME
value: {{ .Values.rdma.interface | default "eth0" | quote }}
- name: NCCL_SOCKET_IFNAME
value: {{ .Values.rdma.interface | default "eth0" | quote }}
- name: RAY_DEDUP_LOGS
value: "0"
- name: NCCL_DEBUG
value: INFO
# Conditional RDMA configuration
{{- if .Values.rdma.enabled }}
- name: NCCL_IB_DISABLE
value: "0"
- name: NCCL_IB_HCA
value: {{ .Values.rdma.hca | default "^mlx5" | quote }}
- name: NCCL_IB_GID_INDEX
value: {{ .Values.rdma.gidIndex | default "0" | quote }} # or "7", depending on your network fabric
{{- else }}
# If RDMA is disabled, explicitly disable IB so NCCL does not try to probe it and error out
- name: NCCL_IB_DISABLE
value: "1"
{{- end }}
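# Sketch of an rdma values block for an IB/RoCE-capable cluster. The
# resourceName depends on which RDMA device plugin the cluster runs, so the
# name below is a hypothetical example, not something this chart provides:
#
#   rdma:
#     enabled: true
#     interface: eth0           # NIC for GLOO_SOCKET_IFNAME / NCCL_SOCKET_IFNAME
#     hca: "^mlx5"              # NCCL_IB_HCA match expression
#     gidIndex: "3"             # RoCE v2 fabrics often need a non-zero GID index
#     resourceName: rdma/hca    # hypothetical device-plugin resource name
#     resourceCount: 1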
command:
- sh
- -c
- |
# RDMA/IB behaviour is driven entirely by .Values.rdma via the env block above;
# no runtime device detection is performed here.
{{- if .Values.command }}
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); {{ .Values.command }} --distributed-executor-backend ray
{{- else }}
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_PATH='{{ .Values.model.localMountPath }}/Weight/'$MODEL_NAME;
python3 -m vllm.entrypoints.openai.api_server --port 8000 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline-parallel-size {{ .Values.workerSize }} --distributed-executor-backend ray --trust-remote-code
{{- end }}
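# For orientation, with the default branch above and e.g. workerSize=2,
# gpuLimit=8, localMountPath=/models, the leader command expands roughly to
# (illustrative, not literal chart output):
#
#   bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
#   python3 -m vllm.entrypoints.openai.api_server --port 8000 \
#     --model /models/Weight/Qwen2.5-7B-Instruct --tensor-parallel-size 8 \
#     --pipeline-parallel-size 2 --distributed-executor-backend ray --trust-remote-code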
resources:
limits:
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
memory: {{ .Values.resources.memoryLimit }}
ephemeral-storage: 10Gi
{{- if .Values.rdma.enabled }}
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
{{- end }}
requests:
ephemeral-storage: 10Gi
cpu: {{ .Values.resources.cpuRequest }}
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
{{- if .Values.rdma.enabled }}
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
{{- end }}
ports:
- containerPort: 8000
name: http
readinessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 120
periodSeconds: 20
timeoutSeconds: 5
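# vLLM's OpenAI-compatible server serves /health (used by the probe above);
# once it is ready you can smoke-test from inside the cluster, e.g.:
#
#   curl http://<leader-pod-ip>:8000/v1/models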
volumeMounts:
- mountPath: /dev/shm
name: dshm
- name: weight-volume
mountPath: {{ .Values.model.localMountPath }}
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: {{ .Values.resources.shmSize }}
- name: weight-volume
persistentVolumeClaim:
claimName: {{ .Release.Name }}-pvc-model
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 10 }}
{{- end }}
workerTemplate:
spec:
containers:
- name: vllm-worker
image: {{ .Values.vllm.image }}
imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }}
securityContext:
capabilities:
add: [ "IPC_LOCK" ]
command:
- sh
- -c
- |
# As on the leader, RDMA/IB settings come from .Values.rdma via the env block below.
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)
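# LWS_LEADER_ADDRESS, like LWS_GROUP_SIZE on the leader, is injected by the
# LeaderWorkerSet controller and expanded by Kubernetes $(VAR) substitution,
# so each worker joins the leader's Ray cluster without extra service discovery.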
resources:
limits:
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
memory: {{ .Values.resources.memoryLimit }}
ephemeral-storage: 10Gi
{{- if .Values.rdma.enabled }}
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
{{- end }}
requests:
ephemeral-storage: 10Gi
cpu: {{ .Values.resources.cpuRequest }}
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
{{- if .Values.rdma.enabled }}
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
{{- end }}
env:
- name: GLOO_SOCKET_IFNAME
value: {{ .Values.rdma.interface | default "eth0" | quote }}
- name: NCCL_SOCKET_IFNAME
value: {{ .Values.rdma.interface | default "eth0" | quote }}
- name: NCCL_DEBUG
value: INFO
- name: RAY_DEDUP_LOGS
value: "0"
{{- if .Values.rdma.enabled }}
- name: NCCL_IB_DISABLE
value: "0"
- name: NCCL_IB_HCA
value: {{ .Values.rdma.hca | default "^mlx5" | quote }}
- name: NCCL_IB_GID_INDEX
value: {{ .Values.rdma.gidIndex | default "0" | quote }}
{{- else }}
- name: NCCL_IB_DISABLE
value: "1"
{{- end }}
volumeMounts:
- mountPath: /dev/shm
name: dshm
- name: weight-volume
mountPath: {{ .Values.model.localMountPath }}
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: {{ .Values.resources.shmSize }}
- name: weight-volume
persistentVolumeClaim:
claimName: {{ .Release.Name }}-pvc-model
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- end }}
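# Example install (sketch; the release name and overrides are illustrative):
#
#   helm install qwen ./helm-charts/vllm-serve \
#     --set workerSize=2 \
#     --set resources.gpuLimit=8 \
#     --set model.huggingfaceName=Qwen/Qwen2.5-7B-Instruct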