fix: debug lws without IB, and set --distributed-executor-backend ray as default
All checks were successful
Publish Helm Charts / helm-publish (push) Successful in 8s
All checks were successful
Publish Helm Charts / helm-publish (push) Successful in 8s
This commit is contained in:
@@ -27,8 +27,6 @@ model:
|
||||
huggingfaceName: "Qwen/Qwen2.5-0.5B-Instruct" # 用户只需输入这个
|
||||
localMountPath: "/Model" # PVC 固定挂载路径
|
||||
huggingfaceToken: "<your-hf-token>"
|
||||
download: # 启用自动下载
|
||||
image: "docker.io/vllm/vllm-openai:latest" # 包含 huggingface-cli 的镜像
|
||||
|
||||
# 功能选择
|
||||
|
||||
@@ -38,6 +36,15 @@ resources:
|
||||
memoryLimit: "16Gi"
|
||||
shmSize: "20Gi"
|
||||
|
||||
# RDMA 配置部分
|
||||
rdma:
|
||||
enabled: false # 开关:默认关闭,防止在无 RDMA 节点报错
|
||||
interface: eth0 # NCCL/GLOO 通信使用的网卡名称 (有 RDMA 时可能是 ib0 或 bond0)
|
||||
resourceName: "rdma/rdma_shared_device_a" # RDMA 资源名称 (取决于你的 k8s 插件)
|
||||
resourceCount: 5 # 每个 Pod 需要的 RDMA 设备数量
|
||||
hca: "mlx5_0:1" # 指定的 HCA 设备 (或者使用 ^mlx5 进行前缀匹配)
|
||||
gidIndex: "0" # RoCEv2 通常需要指定 GID
|
||||
|
||||
svc:
|
||||
type: LoadBalancer
|
||||
port: 80
|
||||
|
||||
Reference in New Issue
Block a user