fix: debug lws without IB, and set --distributed-executor-backend ray as default
All checks were successful
Publish Helm Charts / helm-publish (push) Successful in 8s
All checks were successful
Publish Helm Charts / helm-publish (push) Successful in 8s
This commit is contained in:
@ -62,7 +62,7 @@ spec:
|
|||||||
containers:
|
containers:
|
||||||
- name: vllm-leader
|
- name: vllm-leader
|
||||||
image: {{ .Values.vllm.image }}
|
image: {{ .Values.vllm.image }}
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: {{ .Values.imagePullPolicy }}
|
||||||
securityContext:
|
securityContext:
|
||||||
capabilities:
|
capabilities:
|
||||||
add: [ "IPC_LOCK" ]
|
add: [ "IPC_LOCK" ]
|
||||||
@ -70,38 +70,68 @@ spec:
|
|||||||
# - name: HUGGING_FACE_HUB_TOKEN
|
# - name: HUGGING_FACE_HUB_TOKEN
|
||||||
# value: {{ .Values.vllm.huggingfaceToken }}
|
# value: {{ .Values.vllm.huggingfaceToken }}
|
||||||
- name: GLOO_SOCKET_IFNAME
|
- name: GLOO_SOCKET_IFNAME
|
||||||
value: eth0
|
value: {{ .Values.rdma.interface | default "eth0" | quote }}
|
||||||
- name: NCCL_SOCKET_IFNAME
|
- name: NCCL_SOCKET_IFNAME
|
||||||
value: eth0
|
value: {{ .Values.rdma.interface | default "eth0" | quote }}
|
||||||
- name: NCCL_IB_DISABLE
|
- name: RAY_DEDUP_LOGS
|
||||||
value: "0"
|
value: "0"
|
||||||
- name: NCCL_DEBUG
|
- name: NCCL_DEBUG
|
||||||
value: INFO
|
value: INFO
|
||||||
- name: NCCL_IB_HCA
|
|
||||||
value: mlx5_0:1
|
# RDMA 条件配置
|
||||||
- name: NCCL_IB_GID_INDEX
|
{{- if .Values.rdma.enabled }}
|
||||||
value: "0" # 或 "7",根据你的网络配置而定
|
- name: NCCL_IB_DISABLE
|
||||||
- name: RAY_DEDUP_LOGS
|
|
||||||
value: "0"
|
value: "0"
|
||||||
|
- name: NCCL_IB_HCA
|
||||||
|
value: {{ .Values.rdma.hca | default "^mlx5" | quote }}
|
||||||
|
- name: NCCL_IB_GID_INDEX
|
||||||
|
value: {{ .Values.rdma.gidIndex | default "0" | quote }} # 或 "7",根据你的网络配置而定
|
||||||
|
{{- else }}
|
||||||
|
# 如果未开启 RDMA,显式禁用 IB,防止 NCCL 尝试探测报错
|
||||||
|
- name: NCCL_IB_DISABLE
|
||||||
|
value: "1"
|
||||||
|
{{- end }}
|
||||||
command:
|
command:
|
||||||
- sh
|
- sh
|
||||||
- -c
|
- -c
|
||||||
|
- |
|
||||||
|
# 1. 自动 RDMA 探测逻辑 (无论下方跑什么命令,先执行这段)
|
||||||
|
# =======================================================
|
||||||
|
# echo "🔍 [Init] Detecting RDMA devices..."
|
||||||
|
# if [ -d "/sys/class/infiniband" ] && [ "$(ls -A /sys/class/infiniband)" ]; then
|
||||||
|
# echo "✅ [Init] RDMA devices found. Enabling NCCL IB."
|
||||||
|
# export NCCL_IB_DISABLE=0
|
||||||
|
# # 如果环境变量没指定 HCA,默认使用 ^mlx5 匹配
|
||||||
|
# export NCCL_IB_HCA=${NCCL_IB_HCA:-"^mlx5"}
|
||||||
|
# else
|
||||||
|
# echo "⚠️ [Init] No RDMA devices found. Falling back to TCP."
|
||||||
|
# export NCCL_IB_DISABLE=1
|
||||||
|
# export NCCL_NET_GDR_LEVEL=0
|
||||||
|
# fi
|
||||||
|
# echo "🚀 [Init] RDMA setup complete. NCCL_IB_DISABLE=$NCCL_IB_DISABLE"
|
||||||
|
|
||||||
{{- if .Values.command }}
|
{{- if .Values.command }}
|
||||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); {{ .Values.command }}"
|
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); {{ .Values.command }} --distributed-executor-backend ray
|
||||||
{{- else }}
|
{{- else }}
|
||||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
|
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
|
||||||
MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_PATH='{{ .Values.model.localMountPath }}/Weight/'$MODEL_NAME;
|
MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_PATH='{{ .Values.model.localMountPath }}/Weight/'$MODEL_NAME;
|
||||||
python3 -m vllm.entrypoints.openai.api_server --port 8000 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --trust_remote_code"
|
python3 -m vllm.entrypoints.openai.api_server --port 8000 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --distributed-executor-backend ray --trust_remote_code
|
||||||
{{- end }}
|
{{- end }}
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
|
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
|
||||||
memory: {{ .Values.resources.memoryLimit }}
|
memory: {{ .Values.resources.memoryLimit }}
|
||||||
ephemeral-storage: 10Gi
|
ephemeral-storage: 10Gi
|
||||||
rdma/rdma_shared_device_a: 10
|
{{- if .Values.rdma.enabled }}
|
||||||
|
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
|
||||||
|
{{- end }}
|
||||||
requests:
|
requests:
|
||||||
ephemeral-storage: 10Gi
|
ephemeral-storage: 10Gi
|
||||||
cpu: {{ .Values.resources.cpuRequest }}
|
cpu: {{ .Values.resources.cpuRequest }}
|
||||||
|
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
|
||||||
|
{{- if .Values.rdma.enabled }}
|
||||||
|
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
|
||||||
|
{{- end }}
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 8000
|
- containerPort: 8000
|
||||||
name: http
|
name: http
|
||||||
@ -150,33 +180,57 @@ spec:
|
|||||||
command:
|
command:
|
||||||
- sh
|
- sh
|
||||||
- -c
|
- -c
|
||||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
|
- |
|
||||||
|
# 1. 自动 RDMA 探测逻辑
|
||||||
|
# =======================================================
|
||||||
|
# echo "🔍 [Init] Detecting RDMA devices..."
|
||||||
|
# if [ -d "/sys/class/infiniband" ] && [ "$(ls -A /sys/class/infiniband)" ]; then
|
||||||
|
# echo "✅ [Init] RDMA devices found. Enabling NCCL IB."
|
||||||
|
# export NCCL_IB_DISABLE=0
|
||||||
|
# export NCCL_IB_HCA=${NCCL_IB_HCA:-"^mlx5"}
|
||||||
|
# else
|
||||||
|
# echo "⚠️ [Init] No RDMA devices found. Falling back to TCP."
|
||||||
|
# export NCCL_IB_DISABLE=1
|
||||||
|
# export NCCL_NET_GDR_LEVEL=0
|
||||||
|
# fi
|
||||||
|
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
|
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
|
||||||
memory: {{ .Values.resources.memoryLimit }}
|
memory: {{ .Values.resources.memoryLimit }}
|
||||||
ephemeral-storage: 10Gi
|
ephemeral-storage: 10Gi
|
||||||
rdma/rdma_shared_device_a: 10
|
{{- if .Values.rdma.enabled }}
|
||||||
|
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
|
||||||
|
{{- end }}
|
||||||
requests:
|
requests:
|
||||||
ephemeral-storage: 10Gi
|
ephemeral-storage: 10Gi
|
||||||
cpu: {{ .Values.resources.cpuRequest }}
|
cpu: {{ .Values.resources.cpuRequest }}
|
||||||
|
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
|
||||||
|
{{- if .Values.rdma.enabled }}
|
||||||
|
{{ .Values.rdma.resourceName }}: {{ .Values.rdma.resourceCount | default 1 }}
|
||||||
|
{{- end }}
|
||||||
env:
|
env:
|
||||||
# - name: HUGGING_FACE_HUB_TOKEN
|
|
||||||
# value: {{ .Values.vllm.huggingfaceToken }}
|
|
||||||
- name: GLOO_SOCKET_IFNAME
|
- name: GLOO_SOCKET_IFNAME
|
||||||
value: eth0
|
value: {{ .Values.rdma.interface | default "eth0" | quote }}
|
||||||
- name: NCCL_SOCKET_IFNAME
|
- name: NCCL_SOCKET_IFNAME
|
||||||
value: eth0
|
value: {{ .Values.rdma.interface | default "eth0" | quote }}
|
||||||
- name: NCCL_IB_DISABLE
|
|
||||||
value: "0"
|
|
||||||
- name: NCCL_DEBUG
|
- name: NCCL_DEBUG
|
||||||
value: INFO
|
value: INFO
|
||||||
- name: NCCL_IB_HCA
|
|
||||||
value: mlx5_0:1
|
|
||||||
- name: NCCL_IB_GID_INDEX
|
|
||||||
value: "0" # 或 "7",根据你的网络配置而定
|
|
||||||
- name: RAY_DEDUP_LOGS
|
- name: RAY_DEDUP_LOGS
|
||||||
value: "0"
|
value: "0"
|
||||||
|
|
||||||
|
{{- if .Values.rdma.enabled }}
|
||||||
|
- name: NCCL_IB_DISABLE
|
||||||
|
value: "0"
|
||||||
|
- name: NCCL_IB_HCA
|
||||||
|
value: {{ .Values.rdma.hca | default "^mlx5" | quote }}
|
||||||
|
- name: NCCL_IB_GID_INDEX
|
||||||
|
value: {{ .Values.rdma.gidIndex | default "0" | quote }}
|
||||||
|
{{- else }}
|
||||||
|
- name: NCCL_IB_DISABLE
|
||||||
|
value: "1"
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- mountPath: /dev/shm
|
- mountPath: /dev/shm
|
||||||
name: dshm
|
name: dshm
|
||||||
|
|||||||
@ -27,8 +27,6 @@ model:
|
|||||||
huggingfaceName: "Qwen/Qwen2.5-0.5B-Instruct" # 用户只需输入这个
|
huggingfaceName: "Qwen/Qwen2.5-0.5B-Instruct" # 用户只需输入这个
|
||||||
localMountPath: "/Model" # PVC 固定挂载路径
|
localMountPath: "/Model" # PVC 固定挂载路径
|
||||||
huggingfaceToken: "<your-hf-token>"
|
huggingfaceToken: "<your-hf-token>"
|
||||||
download: # 启用自动下载
|
|
||||||
image: "docker.io/vllm/vllm-openai:latest" # 包含 huggingface-cli 的镜像
|
|
||||||
|
|
||||||
# 功能选择
|
# 功能选择
|
||||||
|
|
||||||
@ -38,6 +36,15 @@ resources:
|
|||||||
memoryLimit: "16Gi"
|
memoryLimit: "16Gi"
|
||||||
shmSize: "20Gi"
|
shmSize: "20Gi"
|
||||||
|
|
||||||
|
# RDMA 配置部分
|
||||||
|
rdma:
|
||||||
|
enabled: false # 开关:默认关闭,防止在无 RDMA 节点报错
|
||||||
|
interface: eth0 # NCCL/GLOO 通信使用的网卡名称 (有 RDMA 时可能是 ib0 或 bond0)
|
||||||
|
resourceName: "rdma/rdma_shared_device_a" # RDMA 资源名称 (取决于你的 k8s 插件)
|
||||||
|
resourceCount: 5 # 每个 Pod 需要的 RDMA 设备数量
|
||||||
|
hca: "mlx5_0:1" # 指定的 HCA 设备 (或者使用 ^mlx5 进行前缀匹配)
|
||||||
|
gidIndex: "0" # RoCEv2 通常需要指定 GID
|
||||||
|
|
||||||
svc:
|
svc:
|
||||||
type: LoadBalancer
|
type: LoadBalancer
|
||||||
port: 80
|
port: 80
|
||||||
|
|||||||
Reference in New Issue
Block a user