{{- if eq (int .Values.workerSize) 1 }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ .Release.Name }}
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app: {{ .Release.Name }}
  template:
    metadata:
      labels:
        app: {{ .Release.Name }}
    spec:
      initContainers:
        # Model download runs as the first initContainer
        - name: download-model
          image: {{ .Values.model.download.image }}
          imagePullPolicy: IfNotPresent
          env:
            - name: HF_ENDPOINT
              value: https://hf-mirror.com
            - name: HUGGING_FACE_HUB_TOKEN
              value: {{ .Values.model.huggingfaceToken }}
          command:
            - sh
            - -c
            - |
              MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
              DEST_DIR="{{ .Values.model.localMountPath }}/$MODEL_NAME"
              # DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
              # Check whether the model already exists; download it if not
              echo "DEST_DIR=$DEST_DIR"
              ls "$DEST_DIR"
              ls -l {{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}
              if [ ! -f "$DEST_DIR/config.json" ]; then
                ls -l {{ .Values.model.localMountPath }}
                echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
                wget https://hf-mirror.com/hfd/hfd.sh
                chmod a+x hfd.sh
                apt-get update && apt-get install -y aria2
                ./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                # huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
              else
                echo "Model already exists at $DEST_DIR"
              fi
          volumeMounts:
            - name: weight-volume
              mountPath: {{ .Values.model.localMountPath }}
      containers:
        - name: vllm-leader
          image: {{ .Values.vllm.image }}
          imagePullPolicy: IfNotPresent
          #securityContext:
          #  capabilities:
          #    add: [ "IPC_LOCK" ]
          env:
            - name: HUGGING_FACE_HUB_TOKEN
              value: {{ .Values.vllm.huggingfaceToken }}
            #- name: GLOO_SOCKET_IFNAME
            #  value: eth0
            #- name: NCCL_SOCKET_IFNAME
            #  value: eth0
            #- name: NCCL_IB_DISABLE
            #  value: "0"
            #- name: NCCL_DEBUG
            #  value: INFO
            #- name: NCCL_IB_HCA
            #  value: mlx5_0:1
            #- name: NCCL_IB_GID_INDEX
            #  value: "0"  # or "7", depending on your network configuration
            - name: RAY_DEDUP_LOGS
              value: "0"
          command:
            - sh
            - -c
            - |
              MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}')
              MODEL_PATH='{{ .Values.model.localMountPath }}/'$MODEL_NAME
              echo 'Using single node ------------------------------------------'
              python3 -m vllm.entrypoints.openai.api_server \
                --port 8080 \
                --model $MODEL_PATH \
                --tensor-parallel-size {{ .Values.vllm.gpuLimit }} \
                --pipeline-parallel-size {{ .Values.workerSize }}
          resources:
            limits:
              nvidia.com/gpu: "{{ .Values.vllm.gpuLimit }}"
              memory: {{ .Values.vllm.memoryLimit }}
              ephemeral-storage: 10Gi
              #rdma/rdma_shared_device_a: 10
            requests:
              ephemeral-storage: 10Gi
              cpu: {{ .Values.vllm.cpuRequest }}
          ports:
            - containerPort: 8080
              name: http
          readinessProbe:
            #tcpSocket:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 120
            periodSeconds: 20
            timeoutSeconds: 5
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - name: weight-volume
              mountPath: {{ .Values.model.localMountPath }}
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: {{ .Values.vllm.shmSize }}
        - name: weight-volume
          persistentVolumeClaim:
            claimName: nfs-pvc-model
{{- end }}
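{{- /*
A minimal values.yaml sketch for this template. The keys below are the ones the
template above references; every value shown is only an illustrative assumption,
not the chart's actual defaults.

workerSize: 1
replicaCount: 1
model:
  huggingfaceName: Qwen/Qwen2.5-7B-Instruct   # example model id (assumption)
  huggingfaceToken: ""
  localMountPath: /models
  download:
    image: python:3.10-slim                   # example downloader image (assumption)
vllm:
  image: vllm/vllm-openai:latest              # example vLLM image (assumption)
  huggingfaceToken: ""
  gpuLimit: 1
  memoryLimit: 64Gi
  cpuRequest: "8"
  shmSize: 16Gi
*/}}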