From 97c5d559e3191255886e85aab4358bd533a49cf4 Mon Sep 17 00:00:00 2001
From: Ivan087
Date: Thu, 4 Dec 2025 09:47:43 +0800
Subject: [PATCH] fix: debug lws without IB, and set
 --distributed-executor-backend ray as default

---
 vllm-serve/templates/lws.yaml | 110 +++++++++++++++++++++++++---------
 vllm-serve/values.yaml        |  11 +++-
 2 files changed, 91 insertions(+), 30 deletions(-)

diff --git a/vllm-serve/templates/lws.yaml b/vllm-serve/templates/lws.yaml
index bb15c84..1f2a087 100644
--- a/vllm-serve/templates/lws.yaml
+++ b/vllm-serve/templates/lws.yaml
@@ -62,7 +62,7 @@ spec:
       containers:
         - name: vllm-leader
           image: {{ .Values.vllm.image }}
-          imagePullPolicy: IfNotPresent
+          imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }}
           securityContext:
             capabilities:
               add: [ "IPC_LOCK" ]
@@ -70,38 +70,68 @@ spec:
           env:
             # - name: HUGGING_FACE_HUB_TOKEN
             #   value: {{ .Values.vllm.huggingfaceToken }}
             - name: GLOO_SOCKET_IFNAME
-              value: eth0
+              value: {{ .Values.rdma.interface | default "eth0" | quote }}
             - name: NCCL_SOCKET_IFNAME
-              value: eth0
-            - name: NCCL_IB_DISABLE
+              value: {{ .Values.rdma.interface | default "eth0" | quote }}
+            - name: RAY_DEDUP_LOGS
               value: "0"
             - name: NCCL_DEBUG
               value: INFO
-            - name: NCCL_IB_HCA
-              value: mlx5_0:1
-            - name: NCCL_IB_GID_INDEX
-              value: "0" # or "7", depending on your network setup
-            - name: RAY_DEDUP_LOGS
+
+            # Conditional RDMA configuration
+            {{- if .Values.rdma.enabled }}
+            - name: NCCL_IB_DISABLE
               value: "0"
+            - name: NCCL_IB_HCA
+              value: {{ .Values.rdma.hca | default "^mlx5" | quote }}
+            - name: NCCL_IB_GID_INDEX
+              value: {{ .Values.rdma.gidIndex | default "0" | quote }} # or "7", depending on your network setup
+            {{- else }}
+            # If RDMA is not enabled, disable IB explicitly so NCCL does not error out while probing for it
+            - name: NCCL_IB_DISABLE
+              value: "1"
+            {{- end }}
           command:
             - sh
             - -c
-            {{- if .Values.command }}
-            - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); {{ .Values.command }}"
-            {{- else }}
-            - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
+            - |
+              # 1. Automatic RDMA detection logic (runs before whichever command follows)
+              # =======================================================
+              # echo "🔍 [Init] Detecting RDMA devices..."
+              # if [ -d "/sys/class/infiniband" ] && [ "$(ls -A /sys/class/infiniband)" ]; then
+              #   echo "✅ [Init] RDMA devices found. Enabling NCCL IB."
+              #   export NCCL_IB_DISABLE=0
+              #   # If no HCA is specified via the environment, default to the ^mlx5 prefix match
+              #   export NCCL_IB_HCA=${NCCL_IB_HCA:-"^mlx5"}
+              # else
+              #   echo "⚠️ [Init] No RDMA devices found. Falling back to TCP."
+              #   export NCCL_IB_DISABLE=1
+              #   export NCCL_NET_GDR_LEVEL=0
+              # fi
+              # echo "🚀 [Init] RDMA setup complete. NCCL_IB_DISABLE=$NCCL_IB_DISABLE"
+
+              {{- if .Values.command }}
+              bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); {{ .Values.command }} --distributed-executor-backend ray
+              {{- else }}
+              bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
               MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}');
               MODEL_PATH='{{ .Values.model.localMountPath }}/Weight/'$MODEL_NAME;
-              python3 -m vllm.entrypoints.openai.api_server --port 8000 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --trust_remote_code"
-            {{- end }}
+              python3 -m vllm.entrypoints.openai.api_server --port 8000 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --distributed-executor-backend ray --trust_remote_code
+            {{- end }}
           resources:
             limits:
               nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
               memory: {{ .Values.resources.memoryLimit }}
               ephemeral-storage: 10Gi
-              rdma/rdma_shared_device_a: 10
+              {{- if .Values.rdma.enabled }}
+              {{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
+              {{- end }}
             requests:
               ephemeral-storage: 10Gi
               cpu: {{ .Values.resources.cpuRequest }}
+              nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
+              {{- if .Values.rdma.enabled }}
+              {{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
+              {{- end }}
           ports:
             - containerPort: 8000
               name: http
@@ -150,33 +180,57 @@ spec:
           command:
            - sh
            - -c
-            - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
+            - |
+              # 1. Automatic RDMA detection logic
+              # =======================================================
+              # echo "🔍 [Init] Detecting RDMA devices..."
+              # if [ -d "/sys/class/infiniband" ] && [ "$(ls -A /sys/class/infiniband)" ]; then
+              #   echo "✅ [Init] RDMA devices found. Enabling NCCL IB."
+              #   export NCCL_IB_DISABLE=0
+              #   export NCCL_IB_HCA=${NCCL_IB_HCA:-"^mlx5"}
+              # else
+              #   echo "⚠️ [Init] No RDMA devices found. Falling back to TCP."
+              #   export NCCL_IB_DISABLE=1
+              #   export NCCL_NET_GDR_LEVEL=0
+              # fi
+              bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)
           resources:
             limits:
               nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
               memory: {{ .Values.resources.memoryLimit }}
               ephemeral-storage: 10Gi
-              rdma/rdma_shared_device_a: 10
+              {{- if .Values.rdma.enabled }}
+              {{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
+              {{- end }}
             requests:
               ephemeral-storage: 10Gi
               cpu: {{ .Values.resources.cpuRequest }}
+              nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
+              {{- if .Values.rdma.enabled }}
+              {{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
+              {{- end }}
           env:
-            # - name: HUGGING_FACE_HUB_TOKEN
-            #   value: {{ .Values.vllm.huggingfaceToken }}
             - name: GLOO_SOCKET_IFNAME
-              value: eth0
+              value: {{ .Values.rdma.interface | default "eth0" | quote }}
             - name: NCCL_SOCKET_IFNAME
-              value: eth0
-            - name: NCCL_IB_DISABLE
-              value: "0"
+              value: {{ .Values.rdma.interface | default "eth0" | quote }}
             - name: NCCL_DEBUG
               value: INFO
-            - name: NCCL_IB_HCA
-              value: mlx5_0:1
-            - name: NCCL_IB_GID_INDEX
-              value: "0" # or "7", depending on your network setup
             - name: RAY_DEDUP_LOGS
               value: "0"
+
+            {{- if .Values.rdma.enabled }}
+            - name: NCCL_IB_DISABLE
+              value: "0"
+            - name: NCCL_IB_HCA
+              value: {{ .Values.rdma.hca | default "^mlx5" | quote }}
+            - name: NCCL_IB_GID_INDEX
+              value: {{ .Values.rdma.gidIndex | default "0" | quote }}
+            {{- else }}
+            - name: NCCL_IB_DISABLE
+              value: "1"
+            {{- end }}
+
           volumeMounts:
             - mountPath: /dev/shm
               name: dshm
diff --git a/vllm-serve/values.yaml b/vllm-serve/values.yaml
index 2d5a6a0..266a1fe 100644
--- a/vllm-serve/values.yaml
+++ b/vllm-serve/values.yaml
@@ -27,8 +27,6 @@ model:
   huggingfaceName: "Qwen/Qwen2.5-0.5B-Instruct" # the only field a user needs to set
   localMountPath: "/Model" # fixed PVC mount path
   huggingfaceToken: ""
-  download: # enable automatic model download
-    image: "docker.io/vllm/vllm-openai:latest" # image that ships with huggingface-cli

 # feature selection

@@ -38,6 +36,15 @@ resources:
   memoryLimit: "16Gi"
   shmSize: "20Gi"

+# RDMA configuration
+rdma:
+  enabled: false # off by default, to avoid errors on nodes without RDMA
+  interface: eth0 # NIC used for NCCL/GLOO traffic (with RDMA this is often ib0 or bond0)
+  resourceName: "rdma/rdma_shared_device_a" # RDMA resource name (depends on your k8s device plugin)
+  resourceCount: 5 # number of RDMA devices requested per Pod
+  hca: "mlx5_0:1" # specific HCA device (or use ^mlx5 to match by prefix)
+  gidIndex: "0" # RoCEv2 usually needs an explicit GID index
+
 svc:
   type: LoadBalancer
   port: 80
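
Usage sketch (not part of the diff): with rdma.enabled left at false, the chart renders TCP-only NCCL settings and installs cleanly on nodes without IB. On a RoCE/IB-capable cluster, an override like the one below should work. The keys are the rdma.* values this patch adds, but the interface name, resource name, and GID index are cluster-specific assumptions that depend on your RDMA device plugin and fabric, so verify them before use:

    # rdma-values.yaml (hypothetical override file)
    rdma:
      enabled: true
      interface: bond0          # NIC that actually carries RoCE traffic
      resourceName: "rdma/rdma_shared_device_a"
      resourceCount: 1
      hca: "^mlx5"              # prefix match instead of pinning mlx5_0:1
      gidIndex: "3"             # RoCEv2 GIDs often sit at index 3; check show_gids

    helm upgrade --install vllm ./vllm-serve -f rdma-values.yaml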
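Verification sketch (the pod name vllm-0 is a placeholder; LWS leader pods are usually named after the release with an ordinal): the chart already sets NCCL_DEBUG=INFO, so once the Ray cluster is up, NCCL's init logs report which transport it chose. A "NET/IB" line means InfiniBand/RoCE is in use, while "NET/Socket" means it fell back to TCP:

    # Did NCCL pick IB or plain sockets?
    kubectl logs vllm-0 | grep -E "NCCL INFO NET/(IB|Socket)"

    # Same probe the commented-out init logic performs inside the container
    kubectl exec vllm-0 -- ls /sys/class/infiniband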