first backup of charts
This commit is contained in:
166
webchat/vllm-app/templates/lws.yaml
Normal file
166
webchat/vllm-app/templates/lws.yaml
Normal file
@ -0,0 +1,166 @@
|
||||
{{- /*
LeaderWorkerSet for multi-node vLLM serving.

Rendered only when .Values.app is "vllm" and .Values.workerSize is greater
than 1. Each replica is a group of `workerSize` pods:
  - the leader pod first runs an init container that downloads the model
    into the shared PVC when it is not already there, then starts the Ray
    head and the OpenAI-compatible vLLM API server on port 8080;
  - every worker pod joins the leader's Ray cluster.

Values read by this template:
  app, replicaCount, workerSize,
  model.download.image, model.huggingfaceToken, model.huggingfaceName,
  model.localMountPath, vllm.image,
  resources.gpuLimit, resources.memoryLimit, resources.cpuRequest,
  resources.shmSize
*/ -}}
{{- if and (gt (int .Values.workerSize) 1) (eq .Values.app "vllm") }}
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: infer
spec:
  replicas: {{ .Values.replicaCount }}
  leaderWorkerTemplate:
    size: {{ .Values.workerSize }}
    restartPolicy: RecreateGroupOnPodRestart
    leaderTemplate:
      metadata:
        labels:
          role: leader
      spec:
        initContainers:
          # Model download runs as the first init container.
          - name: download-model
            image: {{ .Values.model.download.image }}
            imagePullPolicy: IfNotPresent
            env:
              - name: HF_ENDPOINT
                value: "https://hf-mirror.com"
              # NOTE(review): the token is rendered in plain text into the
              # manifest; consider sourcing it from a Secret instead.
              # Quoted with an empty default so an unset token renders as ""
              # rather than null.
              - name: HUGGING_FACE_HUB_TOKEN
                value: {{ .Values.model.huggingfaceToken | default "" | quote }}
            command:
              - sh
              - -c
              - |
                # Abort on the first failing step or unset variable.
                set -eu
                MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
                DEST_DIR="{{ .Values.model.localMountPath }}/$MODEL_NAME"
                echo "DEST_DIR= $DEST_DIR"
                # Diagnostic listings only: these paths may not exist yet, so
                # a failure here must not abort the script.
                ls "$DEST_DIR" || true
                ls -l "{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}" || true
                # Download only when the model is not already on the volume.
                if [ ! -f "$DEST_DIR/config.json" ]; then
                  ls -l "{{ .Values.model.localMountPath }}"
                  echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
                  wget -O hfd.sh https://hf-mirror.com/hfd/hfd.sh
                  chmod a+x hfd.sh
                  # Refresh the package index first; installing without it
                  # fails on images that ship without package lists.
                  apt-get update
                  apt-get install -y aria2
                  ./hfd.sh "{{ .Values.model.huggingfaceName }}" --local-dir "$DEST_DIR"
                else
                  echo "Model already exists at $DEST_DIR"
                fi
            volumeMounts:
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        containers:
          - name: vllm-leader
            image: {{ .Values.vllm.image }}
            imagePullPolicy: IfNotPresent
            securityContext:
              capabilities:
                add: ["IPC_LOCK"]
            env:
              # HUGGING_FACE_HUB_TOKEN is intentionally not set here: the
              # server is started with --model pointing at the mounted volume.
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_DEBUG
                value: INFO
              - name: NCCL_IB_HCA
                value: "mlx5_0:1"
              - name: NCCL_IB_GID_INDEX
                value: "0"  # or "7", depending on your network configuration
              - name: RAY_DEDUP_LOGS
                value: "0"
            command:
              - sh
              - -c
              # $(LWS_GROUP_SIZE) uses Kubernetes $(VAR) syntax and is
              # substituted before the shell runs; $(basename ...) is not a
              # valid variable reference, so it reaches the shell unchanged.
              - |
                bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE)
                MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}')
                MODEL_PATH='{{ .Values.model.localMountPath }}/'$MODEL_NAME
                python3 -m vllm.entrypoints.openai.api_server \
                  --port 8080 \
                  --model "$MODEL_PATH" \
                  --tensor-parallel-size {{ .Values.resources.gpuLimit }} \
                  --pipeline_parallel_size {{ .Values.workerSize }}
            resources:
              limits:
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                memory: {{ .Values.resources.memoryLimit | quote }}
                ephemeral-storage: 10Gi
                rdma/rdma_shared_device_a: 10
              requests:
                ephemeral-storage: 10Gi
                cpu: {{ .Values.resources.cpuRequest | quote }}
            ports:
              - containerPort: 8080
                name: http
            # TCP check on the API port; an HTTP GET on /health was the
            # previously commented-out alternative.
            readinessProbe:
              tcpSocket:
                port: 8080
              initialDelaySeconds: 120
              periodSeconds: 20
              timeoutSeconds: 5
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        volumes:
          # Memory-backed /dev/shm.
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: {{ .Values.resources.shmSize | quote }}
          # PVC holding the model files, named "<app>-pvc-model".
          - name: weight-volume
            persistentVolumeClaim:
              claimName: {{ .Values.app }}-pvc-model
    workerTemplate:
      spec:
        containers:
          - name: vllm-worker
            image: {{ .Values.vllm.image }}
            imagePullPolicy: IfNotPresent
            securityContext:
              capabilities:
                add: ["IPC_LOCK"]
            command:
              - sh
              - -c
              # $(LWS_LEADER_ADDRESS) uses Kubernetes $(VAR) syntax and is
              # substituted before the shell runs.
              - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
            resources:
              limits:
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                memory: {{ .Values.resources.memoryLimit | quote }}
                ephemeral-storage: 10Gi
                rdma/rdma_shared_device_a: 10
              requests:
                ephemeral-storage: 10Gi
                cpu: {{ .Values.resources.cpuRequest | quote }}
            env:
              # HUGGING_FACE_HUB_TOKEN is intentionally not set on workers.
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_DEBUG
                value: INFO
              - name: NCCL_IB_HCA
                value: "mlx5_0:1"
              - name: NCCL_IB_GID_INDEX
                value: "0"  # or "7", depending on your network configuration
              - name: RAY_DEDUP_LOGS
                value: "0"
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: {{ .Values.resources.shmSize | quote }}
          - name: weight-volume
            persistentVolumeClaim:
              claimName: {{ .Values.app }}-pvc-model
{{- end }}
|
||||
Reference in New Issue
Block a user