{{- if gt (int .Values.workerSize) 1 }}
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: {{ .Release.Name }}
spec:
  replicas: {{ .Values.replicaCount }}
  leaderWorkerTemplate:
    size: {{ .Values.workerSize }}
    restartPolicy: RecreateGroupOnPodRestart
    leaderTemplate:
      metadata:
        labels:
          role: leader
      spec:
        initContainers:
          # Model download runs as the first initContainer
          - name: download-model
            image: alpine:latest
            imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }}
            env:
              - name: HF_ENDPOINT
                value: https://hf-mirror.com
              - name: HUGGING_FACE_HUB_TOKEN
                value: {{ .Values.model.huggingfaceToken | quote }}
            command:
              - sh
              - -c
              - |
                MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
                DEST_DIR="{{ .Values.model.localMountPath }}/Weight/$MODEL_NAME"
                # DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
                # Earlier approach: check whether the model exists and download it if not
                # echo "DEST_DIR= $DEST_DIR"
                # if [ ! -f "$DEST_DIR/config.json" ]; then
                #   ls -l {{ .Values.model.localMountPath }}
                #   echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
                #   wget https://hf-mirror.com/hfd/hfd.sh
                #   chmod a+x hfd.sh
                #   apt update && apt upgrade
                #   apt install aria2 -y
                #   ./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                #   # huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                # else
                #   echo "Model already exists at $DEST_DIR"
                # fi

                # Skip the download if a previous run already completed successfully
                SUCCESS_FLAG="${DEST_DIR}/.success_flag"
                if [ -f "$SUCCESS_FLAG" ]; then
                  echo "✅ Success flag found. Skipping download."
                  exit 0
                fi

                echo "⬇️ Starting download..."
                apk add --no-cache bash aria2 wget ca-certificates curl
                wget https://hf-mirror.com/hfd/hfd.sh -O hfd.sh && chmod +x hfd.sh
                ./hfd.sh {{ .Values.model.huggingfaceName }} --tool aria2c -x 8 --local-dir "$DEST_DIR"
                touch "$SUCCESS_FLAG"
                echo "🎉 Done."
            volumeMounts:
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        containers:
          - name: vllm-leader
            image: {{ .Values.vllm.image }}
            imagePullPolicy: {{ .Values.imagePullPolicy }}
            securityContext:
              capabilities:
                add: [ "IPC_LOCK" ]
            env:
              # - name: HUGGING_FACE_HUB_TOKEN
              #   value: {{ .Values.vllm.huggingfaceToken }}
              - name: GLOO_SOCKET_IFNAME
                value: {{ .Values.rdma.interface | default "eth0" | quote }}
              - name: NCCL_SOCKET_IFNAME
                value: {{ .Values.rdma.interface | default "eth0" | quote }}
              - name: RAY_DEDUP_LOGS
                value: "0"
              - name: NCCL_DEBUG
                value: INFO
              # Conditional RDMA configuration
              {{- if .Values.rdma.enabled }}
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_IB_HCA
                value: {{ .Values.rdma.hca | default "^mlx5" | quote }}
              - name: NCCL_IB_GID_INDEX
                value: {{ .Values.rdma.gidIndex | default "0" | quote }}  # or "7", depending on your network setup
              {{- else }}
              # If RDMA is not enabled, explicitly disable IB so NCCL does not fail while probing for it
              - name: NCCL_IB_DISABLE
                value: "1"
              {{- end }}
            command:
              - sh
              - -c
              - |
                # 1. Automatic RDMA detection (runs before whatever command follows)
                # =======================================================
                # echo "🔍 [Init] Detecting RDMA devices..."
                # if [ -d "/sys/class/infiniband" ] && [ "$(ls -A /sys/class/infiniband)" ]; then
                #   echo "✅ [Init] RDMA devices found. Enabling NCCL IB."
                #   export NCCL_IB_DISABLE=0
                #   # If no HCA is set via the environment, default to matching ^mlx5
                #   export NCCL_IB_HCA=${NCCL_IB_HCA:-"^mlx5"}
                # else
                #   echo "⚠️ [Init] No RDMA devices found. Falling back to TCP."
                #   export NCCL_IB_DISABLE=1
                #   export NCCL_NET_GDR_LEVEL=0
                # fi
                # echo "🚀 [Init] RDMA setup complete. NCCL_IB_DISABLE=$NCCL_IB_DISABLE"
                {{- if .Values.command }}
                bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
                {{ .Values.command }} --distributed-executor-backend ray
                {{- else }}
                bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
                MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}');
                MODEL_PATH='{{ .Values.model.localMountPath }}/Weight/'$MODEL_NAME;
                python3 -m vllm.entrypoints.openai.api_server \
                  --port 8000 \
                  --model $MODEL_PATH \
                  --tensor-parallel-size {{ .Values.resources.gpuLimit }} \
                  --pipeline_parallel_size {{ .Values.workerSize }} \
                  --distributed-executor-backend ray \
                  --trust_remote_code
                {{- end }}
            resources:
              limits:
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                memory: {{ .Values.resources.memoryLimit }}
                ephemeral-storage: 10Gi
                {{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
                nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
                {{- end }}
                {{- if .Values.rdma.enabled }}
                {{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
                {{- end }}
              requests:
                ephemeral-storage: 10Gi
                cpu: {{ .Values.resources.cpuRequest }}
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                {{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
                nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
                {{- end }}
                {{- if .Values.rdma.enabled }}
                {{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
                {{- end }}
            ports:
              - containerPort: 8000
                name: http
            readinessProbe:
              # tcpSocket:
              httpGet:
                path: /health
                port: 8000
              initialDelaySeconds: 120
              periodSeconds: 20
              timeoutSeconds: 5
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: {{ .Values.resources.shmSize }}
          - name: weight-volume
            persistentVolumeClaim:
              claimName: {{ .Release.Name }}-pvc-model
        {{- with .Values.nodeSelector }}
        nodeSelector:
          {{- toYaml . | nindent 10 }}
        {{- end }}
        {{- with .Values.affinity }}
        affinity:
          {{- toYaml . | nindent 10 }}
        {{- end }}
        {{- with .Values.tolerations }}
        tolerations:
          {{- toYaml . | nindent 10 }}
        {{- end }}
    workerTemplate:
      spec:
        containers:
          - name: vllm-worker
            image: {{ .Values.vllm.image }}
            imagePullPolicy: IfNotPresent
            securityContext:
              capabilities:
                add: [ "IPC_LOCK" ]
            command:
              - sh
              - -c
              - |
                # 1. Automatic RDMA detection
                # =======================================================
                # echo "🔍 [Init] Detecting RDMA devices..."
                # if [ -d "/sys/class/infiniband" ] && [ "$(ls -A /sys/class/infiniband)" ]; then
                #   echo "✅ [Init] RDMA devices found. Enabling NCCL IB."
                #   export NCCL_IB_DISABLE=0
                #   export NCCL_IB_HCA=${NCCL_IB_HCA:-"^mlx5"}
                # else
                #   echo "⚠️ [Init] No RDMA devices found. Falling back to TCP."
                #   export NCCL_IB_DISABLE=1
                #   export NCCL_NET_GDR_LEVEL=0
                # fi
                bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)
            resources:
              limits:
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                memory: {{ .Values.resources.memoryLimit }}
                ephemeral-storage: 10Gi
                {{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
                nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
                {{- end }}
                {{- if .Values.rdma.enabled }}
                {{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
                {{- end }}
              requests:
                ephemeral-storage: 10Gi
                cpu: {{ .Values.resources.cpuRequest }}
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                {{- if and .Values.resources.gpuMem (gt (int .Values.resources.gpuMem) 0) }}
                nvidia.com/gpumem: {{ .Values.resources.gpuMem }}
                {{- end }}
                {{- if .Values.rdma.enabled }}
                {{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
                {{- end }}
            env:
              - name: GLOO_SOCKET_IFNAME
                value: {{ .Values.rdma.interface | default "eth0" | quote }}
              - name: NCCL_SOCKET_IFNAME
                value: {{ .Values.rdma.interface | default "eth0" | quote }}
              - name: NCCL_DEBUG
                value: INFO
              - name: RAY_DEDUP_LOGS
                value: "0"
              {{- if .Values.rdma.enabled }}
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_IB_HCA
                value: {{ .Values.rdma.hca | default "^mlx5" | quote }}
              - name: NCCL_IB_GID_INDEX
                value: {{ .Values.rdma.gidIndex | default "0" | quote }}
              {{- else }}
              - name: NCCL_IB_DISABLE
                value: "1"
              {{- end }}
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: {{ .Values.resources.shmSize }}
          - name: weight-volume
            persistentVolumeClaim:
              claimName: {{ .Release.Name }}-pvc-model
        {{- with .Values.nodeSelector }}
        nodeSelector:
          {{- toYaml . | nindent 10 }}
        {{- end }}
        {{- with .Values.affinity }}
        affinity:
          {{- toYaml . | nindent 10 }}
        {{- end }}
        {{- with .Values.tolerations }}
        tolerations:
          {{- toYaml . | nindent 10 }}
        {{- end }}
{{- end }}
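
# ---------------------------------------------------------------------------
# Illustrative values only: a minimal sketch of a values.yaml that satisfies
# every .Values reference used by this template. The concrete numbers, the
# model id, the image tag, and the RDMA resource name are assumptions; adjust
# them to your cluster before use.
#
# replicaCount: 1
# workerSize: 2                      # total pods per group (1 leader + 1 worker)
# imagePullPolicy: IfNotPresent
# command: ""                        # leave empty to use the built-in vLLM command above
# vllm:
#   image: vllm/vllm-openai:latest   # assumed image tag
# model:
#   huggingfaceName: Qwen/Qwen2.5-7B-Instruct   # example model id
#   huggingfaceToken: ""
#   localMountPath: /data/models
# resources:
#   gpuLimit: 8
#   gpuMem: 0                        # 0 disables the nvidia.com/gpumem request
#   cpuRequest: "16"
#   memoryLimit: 128Gi
#   shmSize: 16Gi
# rdma:
#   enabled: false
#   interface: eth0
#   hca: "^mlx5"
#   gidIndex: "0"
#   resourceName: rdma/hca           # device-plugin resource name, cluster specific
#   resourceCount: 1
# nodeSelector: {}
# affinity: {}
# tolerations: []
# ---------------------------------------------------------------------------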