ocdp_chart/vllm/vllm-serve/templates/single.yaml
{{- if eq (int .Values.workerSize) 1 }}
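# Rendered only for single-node serving (workerSize == 1); larger worker
# counts are presumably handled by a separate multi-node template.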
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ .Release.Name }}
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app: {{ .Release.Name }}
  template:
    metadata:
      labels:
        app: {{ .Release.Name }}
    spec:
      initContainers:
      # Model download runs as the first initContainer
      - name: download-model
        image: {{ .Values.model.download.image }}
        imagePullPolicy: IfNotPresent
        env:
        - name: HF_ENDPOINT
          value: "https://hf-mirror.com"
        - name: HUGGING_FACE_HUB_TOKEN
          value: {{ .Values.model.huggingfaceToken | quote }}
        command:
        - sh
        - -c
        - |
          MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
          DEST_DIR="{{ .Values.model.localMountPath }}/$MODEL_NAME"
          # Check whether the model already exists; download it if not
          echo "DEST_DIR=$DEST_DIR"
          if [ ! -f "$DEST_DIR/config.json" ]; then
            ls -l {{ .Values.model.localMountPath }}
            echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
            wget https://hf-mirror.com/hfd/hfd.sh
            chmod a+x hfd.sh
            apt-get update && apt-get install -y aria2
            ./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
            # Alternative: huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
          else
            echo "Model already exists at $DEST_DIR"
          fi
        volumeMounts:
        - name: weight-volume
          mountPath: {{ .Values.model.localMountPath }}
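      # Main vLLM container: serves an OpenAI-compatible HTTP API on port 8080
      # once the initContainer has populated the weight volume.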
      containers:
      - name: vllm-pod
        image: {{ .Values.vllm.image }}
        imagePullPolicy: IfNotPresent
        env:
        - name: HUGGING_FACE_HUB_TOKEN
          value: {{ .Values.vllm.huggingfaceToken | quote }}
        - name: RAY_DEDUP_LOGS
          value: "0"
        command:
        - sh
        - -c
        - |
          MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
          MODEL_PATH="{{ .Values.model.localMountPath }}/$MODEL_NAME"
          echo "Using single node ------------------------------------------"
          python3 -m vllm.entrypoints.openai.api_server \
            --port 8080 \
            --model "$MODEL_PATH" \
            --tensor-parallel-size {{ .Values.resources.gpuLimit }} \
            --pipeline-parallel-size {{ .Values.workerSize }} \
            --trust-remote-code
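        # workerSize == 1 here, so pipeline parallelism is 1 and all parallelism
        # comes from tensor parallelism across this pod's gpuLimit GPUs.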
        resources:
          limits:
            nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
            memory: {{ .Values.resources.memoryLimit }}
            ephemeral-storage: 10Gi
          requests:
            ephemeral-storage: 10Gi
            cpu: {{ .Values.resources.cpuRequest }}
        ports:
        - containerPort: 8080
          name: http
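        # Readiness gates traffic on vLLM's /health endpoint; the 120s initial
        # delay leaves time for model weights to load before the first probe.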
        readinessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 120
          periodSeconds: 20
          timeoutSeconds: 5
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
        - name: weight-volume
          mountPath: {{ .Values.model.localMountPath }}
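      # dshm backs /dev/shm with memory (the container runtime default, typically
      # 64Mi, is too small for vLLM); weight-volume keeps downloaded models in a
      # PVC so they survive pod restarts.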
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
          sizeLimit: {{ .Values.resources.shmSize }}
      - name: weight-volume
        persistentVolumeClaim:
          claimName: {{ .Release.Name }}-pvc-model
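      # Optional scheduling constraints, rendered verbatim from values.yaml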
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
{{- end }}
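
# For reference, a minimal values.yaml sketch covering every value this template
# reads. The concrete values below are illustrative assumptions, not defaults
# shipped with the chart:
#
#   workerSize: 1
#   replicaCount: 1
#   model:
#     download:
#       image: python:3.10-slim        # assumption: any image with sh, wget, and apt-get
#     huggingfaceName: Qwen/Qwen2-7B-Instruct   # assumption: example model id
#     huggingfaceToken: ""
#     localMountPath: /models
#   vllm:
#     image: vllm/vllm-openai:latest   # assumption: upstream vLLM server image
#     huggingfaceToken: ""
#   resources:
#     gpuLimit: 1
#     memoryLimit: 64Gi
#     cpuRequest: "8"
#     shmSize: 16Gi
#   nodeSelector: {}
#   affinity: {}
#   tolerations: []
#
# A quick render check (hypothetical release name):
#   helm template vllm-demo ./vllm-serve --set workerSize=1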