compatible with two clusters

This commit is contained in:
mangomqy
2025-11-12 07:26:18 +00:00
parent a158e24d3f
commit 7ad1018f25
7 changed files with 411 additions and 35 deletions

View File

@ -28,7 +28,7 @@ spec:
- -c
- |
# Directory name for the model: the last path component of the configured huggingfaceName.
MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
DEST_DIR="{{ .Values.model.localMountPath }}/$MODEL_NAME"
# Weights are placed under the Weight/ subdirectory of the local mount path.
DEST_DIR="{{ .Values.model.localMountPath }}/Weight/$MODEL_NAME"
# DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
# Check whether the model already exists; download it if it does not.
echo "DEST_DIR= $DEST_DIR"
@ -37,6 +37,7 @@ spec:
echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
# Fetch the hfd.sh download helper from hf-mirror.com and make it executable.
wget https://hf-mirror.com/hfd/hfd.sh
chmod a+x hfd.sh
# Install aria2 before running hfd.sh. apt-get with -y never waits for
# confirmation; a bare "apt upgrade" would stop at its prompt and abort when
# no terminal is attached, and a full upgrade is not needed for one package.
apt-get update && apt-get install -y aria2
./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
# huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
@ -58,9 +59,25 @@ spec:
# Launch vllm.entrypoints.openai.api_server on port 8080. The model path is
# built from localMountPath plus the basename of huggingfaceName; the
# tensor-parallel size comes from resources.gpuLimit and the pipeline-parallel
# size from workerSize.
command:
- sh
- -c
- "MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_PATH='{{ .Values.model.localMountPath }}/'$MODEL_NAME;
echo 'Using single node ------------------------------------------';
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --trust_remote_code"
# NOTE(review): the block below is disabled only at the YAML level. Helm still
# evaluates the template actions on these '#' lines when rendering; switch to
# Helm template comments if it must be fully inert.
#args:
# {{- if .Values.command }}
# - {{ .Values.command | quote }}
# {{- else }}
# - |
# MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}');
# MODEL_PATH='{{ .Values.model.localMountPath }}/Weight/'$MODEL_NAME;
# # Note: the whole command block is no longer wrapped in quotes when referencing variables here
# python3 -m vllm.entrypoints.openai.api_server \
# --port 8080 \
# --model $MODEL_PATH \
# --tensor-parallel-size {{ .Values.resources.gpuLimit }} \
# --pipeline_parallel_size {{ .Values.workerSize }} \
# --trust_remote_code
# {{- end }}
- "
MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}');
MODEL_PATH='{{ .Values.model.localMountPath }}/Weight/'$MODEL_NAME;
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --trust_remote_code"
resources:
limits:
# Limit for the nvidia.com/gpu resource, taken from .Values.resources.gpuLimit
# (quoted, so it renders as a YAML string).
nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
@ -92,7 +109,11 @@ spec:
sizeLimit: {{ .Values.resources.shmSize }}
# Volume holding the model weights, backed by a PersistentVolumeClaim whose
# name is derived from the Helm release name.
- name: weight-volume
persistentVolumeClaim:
claimName: {{ .Release.Name }}-pvc-model
claimName: {{ .Release.Name }}-pvc-model
# Disabled alternative: define weight-volume as a direct NFS mount
# (hard-coded server address and export path) instead of the PVC above.
# - name: weight-volume
# nfs:
# path: "/volume1/Dataset/PVStore/lab-data-model-pvc-c0beeab1-6dd5-4c6a-bd2c-6ce9e114c25e/Weight"
# server: "10.6.80.11"
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}