first backup of charts
vllm/metadata.yaml (Normal file, 53 lines)
@@ -0,0 +1,53 @@
application_name: &application_name vllm

distributed:
  method: helm
  release_name: *application_name
  chart: vllm-app
  sets:
    app: vllm
    model:
      huggingfaceName: "Qwen/Qwen2.5-0.5B-Instruct"
    resources:
      gpuLimit: 1
      cpuRequest: 8
      memoryLimit: "16Gi"
      shmSize: "15Gi"
    workerSize: 2
    nodeSelector: {}
  svc:
    svc_type: NodePort
    protocol: http
    hostname: 10.6.14.123
    port: 30080
    url: ~
    paths:
      docs_path: /docs
      redoc_path: /redoc
  pod:
    name: infer-0

monolithic:
  method: helm
  release_name: *application_name
  chart: vllm-app
  sets:
    app: vllm
    model:
      huggingfaceName: "Qwen/Qwen2.5-0.5B-Instruct"
    resources:
      gpuLimit: 1
      cpuRequest: 8
      memoryLimit: "16Gi"
      shmSize: "15Gi"
    workerSize: 1
    nodeSelector: {}
  svc:
    svc_type: NodePort
    protocol: http
    hostname: 10.6.14.123
    port: 30080
    url: ~
  pod:
    name: vllm
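For orientation, a minimal sketch of what the `distributed` entry above amounts to if applied by hand, assuming the deployer that consumes metadata.yaml simply maps each `sets` key onto a Helm `--set` flag (that deployer is not part of this commit):

    # Hypothetical manual equivalent of the `distributed` entry
    helm install vllm ./vllm-app \
      --set app=vllm \
      --set model.huggingfaceName="Qwen/Qwen2.5-0.5B-Instruct" \
      --set resources.gpuLimit=1 \
      --set resources.cpuRequest=8 \
      --set resources.memoryLimit=16Gi \
      --set resources.shmSize=15Gi \
      --set workerSize=2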
vllm/vllm-app/.helmignore (Normal file, 23 lines)
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
vllm/vllm-app/Chart.yaml (Normal file, 25 lines)
@@ -0,0 +1,25 @@
apiVersion: v2
name: vllm-app
description: A Helm chart for deploying vLLM with NFS storage
annotations:
  "helm.sh/resource-policy": keep  # prevent resources from being deleted accidentally
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.16.0"
vllm/vllm-app/templates/llama.yaml (Normal file, 165 lines)
@@ -0,0 +1,165 @@
{{- if and (gt (int .Values.workerSize) 1) (eq .Values.app "llama") }}
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: infer
spec:
  replicas: {{ .Values.replicaCount }}
  leaderWorkerTemplate:
    size: {{ .Values.workerSize }}
    restartPolicy: RecreateGroupOnPodRestart
    leaderTemplate:
      metadata:
        labels:
          role: leader
      spec:
        initContainers:
          # Model download runs as the first initContainer
          - name: download-model
            image: {{ .Values.model.download.image }}
            imagePullPolicy: IfNotPresent
            env:
              - name: HF_ENDPOINT
                value: https://hf-mirror.com
              - name: HUGGING_FACE_HUB_TOKEN
                value: {{ .Values.model.huggingfaceToken }}
            command:
              - sh
              - -c
              - |
                MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
                DEST_DIR="{{ .Values.model.localMountPath }}/$MODEL_NAME"
                # DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
                # Check whether the model already exists; download it only if missing
                echo "DEST_DIR= $DEST_DIR"
                ls $DEST_DIR
                ls -l {{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}
                if [ ! -f "$DEST_DIR/config.json" ]; then
                  ls -l {{ .Values.model.localMountPath }}
                  echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
                  wget https://hf-mirror.com/hfd/hfd.sh
                  chmod a+x hfd.sh
                  apt-get update && apt-get install -y aria2
                  ./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                  # huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                else
                  echo "Model already exists at $DEST_DIR"
                fi
            volumeMounts:
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        containers:
          - name: llama-leader
            image: {{ .Values.llama.image }}
            imagePullPolicy: IfNotPresent
            securityContext:
              capabilities:
                add: [ "IPC_LOCK" ]
            env:
              # - name: HUGGING_FACE_HUB_TOKEN
              #   value: {{ .Values.vllm.huggingfaceToken }}
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_DEBUG
                value: INFO
              - name: NCCL_IB_HCA
                value: mlx5_0:1
              - name: NCCL_IB_GID_INDEX
                value: "0"  # or "7", depending on your network configuration
              - name: RAY_DEDUP_LOGS
                value: "0"
              - name: USE_RAY
                value: "1"
              # - name: LMDEPLOY_EXECUTOR_BACKEND
              #   value: "ray"
            command:
              - sh
              - -c
              - "bash {{ .Values.model.localMountPath }}/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
                MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_NAME_OR_PATH='{{ .Values.model.localMountPath }}/'$MODEL_NAME;
                llamafactory-cli webchat {{ .Values.model.localMountPath }}/lws-config/qwen2_5_3B.yaml"
            resources:
              limits:
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                memory: {{ .Values.resources.memoryLimit }}
                ephemeral-storage: 10Gi
                rdma/rdma_shared_device_a: 10
              requests:
                ephemeral-storage: 10Gi
                cpu: {{ .Values.resources.cpuRequest }}
            ports:
              - containerPort: 7860
                name: http
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: {{ .Values.resources.shmSize }}
          - name: weight-volume
            persistentVolumeClaim:
              claimName: {{ .Values.app }}-pvc-model
    workerTemplate:
      spec:
        containers:
          - name: llama-worker
            image: {{ .Values.llama.image }}
            imagePullPolicy: IfNotPresent
            securityContext:
              capabilities:
                add: [ "IPC_LOCK" ]
            command:
              - sh
              - -c
              - "echo $(LWS_LEADER_ADDRESS);
                bash {{ .Values.model.localMountPath }}/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
            resources:
              limits:
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                memory: {{ .Values.resources.memoryLimit }}
                ephemeral-storage: 10Gi
                rdma/rdma_shared_device_a: 10
              requests:
                ephemeral-storage: 10Gi
                cpu: {{ .Values.resources.cpuRequest }}
            env:
              # - name: HUGGING_FACE_HUB_TOKEN
              #   value: {{ .Values.vllm.huggingfaceToken }}
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_DEBUG
                value: INFO
              - name: NCCL_IB_HCA
                value: mlx5_0:1
              - name: NCCL_IB_GID_INDEX
                value: "0"  # or "7", depending on your network configuration
              - name: RAY_DEDUP_LOGS
                value: "0"
              # - name: LMDEPLOY_EXECUTOR_BACKEND
              #   value: "ray"
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: {{ .Values.resources.shmSize }}
          - name: weight-volume
            persistentVolumeClaim:
              claimName: {{ .Values.app }}-pvc-model
{{- end }}
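A quick sanity check once this renders and deploys, assuming the LeaderWorkerSet controller applies its standard `leaderworkerset.sigs.k8s.io/name` pod label (the same label the Service selector in this chart relies on):

    # List the leader and worker pods of the "infer" group
    kubectl get pods -l leaderworkerset.sigs.k8s.io/name=infer -o wide
    # Tail the leader (metadata.yaml expects it to be named infer-0)
    kubectl logs -f infer-0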
vllm/vllm-app/templates/lmdeploy_lws.yaml (Normal file, 170 lines)
@@ -0,0 +1,170 @@
{{- if and (gt (int .Values.workerSize) 1) (eq .Values.app "lmdeploy") }}
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: infer
spec:
  replicas: {{ .Values.replicaCount }}
  leaderWorkerTemplate:
    size: {{ .Values.workerSize }}
    restartPolicy: RecreateGroupOnPodRestart
    leaderTemplate:
      metadata:
        labels:
          role: leader
      spec:
        initContainers:
          # Model download runs as the first initContainer
          - name: download-model
            image: {{ .Values.model.download.image }}
            imagePullPolicy: IfNotPresent
            env:
              - name: HF_ENDPOINT
                value: https://hf-mirror.com
              - name: HUGGING_FACE_HUB_TOKEN
                value: {{ .Values.model.huggingfaceToken }}
            command:
              - sh
              - -c
              - |
                MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
                DEST_DIR="{{ .Values.model.localMountPath }}/$MODEL_NAME"
                # DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
                # Check whether the model already exists; download it only if missing
                echo "DEST_DIR= $DEST_DIR"
                ls $DEST_DIR
                ls -l {{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}
                if [ ! -f "$DEST_DIR/config.json" ]; then
                  ls -l {{ .Values.model.localMountPath }}
                  echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
                  wget https://hf-mirror.com/hfd/hfd.sh
                  chmod a+x hfd.sh
                  apt-get update && apt-get install -y aria2
                  ./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                  # huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                else
                  echo "Model already exists at $DEST_DIR"
                fi
            volumeMounts:
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        containers:
          - name: lmdeploy-leader
            image: {{ .Values.lmdeploy.image }}
            imagePullPolicy: IfNotPresent
            securityContext:
              capabilities:
                add: [ "IPC_LOCK" ]
            env:
              # - name: HUGGING_FACE_HUB_TOKEN
              #   value: {{ .Values.vllm.huggingfaceToken }}
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_DEBUG
                value: INFO
              - name: NCCL_IB_HCA
                value: mlx5_0:1
              - name: NCCL_IB_GID_INDEX
                value: "0"  # or "7", depending on your network configuration
              - name: RAY_DEDUP_LOGS
                value: "0"
              - name: LMDEPLOY_EXECUTOR_BACKEND
                value: "ray"
            command:
              - sh
              - -c
              - "bash {{ .Values.model.localMountPath }}/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
                MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_PATH='{{ .Values.model.localMountPath }}/'$MODEL_NAME;
                lmdeploy serve api_server $MODEL_PATH --backend pytorch --tp $(({{ .Values.resources.gpuLimit }} * {{ .Values.workerSize }})) --server-port 8080 --cache-max-entry-count 0.9"
            resources:
              limits:
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                memory: {{ .Values.resources.memoryLimit }}
                ephemeral-storage: 10Gi
                rdma/rdma_shared_device_a: 10
              requests:
                ephemeral-storage: 10Gi
                cpu: {{ .Values.resources.cpuRequest }}
            ports:
              - containerPort: 8080
                name: http
            readinessProbe:
              tcpSocket:
                # httpGet:
                #   path: /health
                port: 8080
              initialDelaySeconds: 120
              periodSeconds: 20
              timeoutSeconds: 5
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: {{ .Values.resources.shmSize }}
          - name: weight-volume
            persistentVolumeClaim:
              claimName: nfs-pvc-model
    workerTemplate:
      spec:
        containers:
          - name: lmdeploy-worker
            image: {{ .Values.lmdeploy.image }}
            imagePullPolicy: IfNotPresent
            securityContext:
              capabilities:
                add: [ "IPC_LOCK" ]
            command:
              - sh
              - -c
              - "bash {{ .Values.model.localMountPath }}/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
            resources:
              limits:
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                memory: {{ .Values.resources.memoryLimit }}
                ephemeral-storage: 10Gi
                rdma/rdma_shared_device_a: 10
              requests:
                ephemeral-storage: 10Gi
                cpu: {{ .Values.resources.cpuRequest }}
            env:
              # - name: HUGGING_FACE_HUB_TOKEN
              #   value: {{ .Values.lmdeploy.huggingfaceToken }}
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_DEBUG
                value: INFO
              - name: NCCL_IB_HCA
                value: mlx5_0:1
              - name: NCCL_IB_GID_INDEX
                value: "0"  # or "7", depending on your network configuration
              - name: RAY_DEDUP_LOGS
                value: "0"
              - name: LMDEPLOY_EXECUTOR_BACKEND
                value: "ray"
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: {{ .Values.resources.shmSize }}
          - name: weight-volume
            persistentVolumeClaim:
              claimName: nfs-pvc-model
{{- end }}
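`lmdeploy serve api_server` exposes an OpenAI-compatible API on its server port, which the NodePort Service in this chart publishes on 30080. A hedged smoke test using the host address from metadata.yaml (the served model name on your deployment may differ from the guess below):

    curl http://10.6.14.123:30080/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "Qwen2.5-0.5B-Instruct", "messages": [{"role": "user", "content": "hello"}]}'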
vllm/vllm-app/templates/lws.yaml (Normal file, 166 lines)
@@ -0,0 +1,166 @@
{{- if and (gt (int .Values.workerSize) 1) (eq .Values.app "vllm") }}
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: infer
spec:
  replicas: {{ .Values.replicaCount }}
  leaderWorkerTemplate:
    size: {{ .Values.workerSize }}
    restartPolicy: RecreateGroupOnPodRestart
    leaderTemplate:
      metadata:
        labels:
          role: leader
      spec:
        initContainers:
          # Model download runs as the first initContainer
          - name: download-model
            image: {{ .Values.model.download.image }}
            imagePullPolicy: IfNotPresent
            env:
              - name: HF_ENDPOINT
                value: https://hf-mirror.com
              - name: HUGGING_FACE_HUB_TOKEN
                value: {{ .Values.model.huggingfaceToken }}
            command:
              - sh
              - -c
              - |
                MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
                DEST_DIR="{{ .Values.model.localMountPath }}/$MODEL_NAME"
                # DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
                # Check whether the model already exists; download it only if missing
                echo "DEST_DIR= $DEST_DIR"
                ls $DEST_DIR
                ls -l {{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}
                if [ ! -f "$DEST_DIR/config.json" ]; then
                  ls -l {{ .Values.model.localMountPath }}
                  echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
                  wget https://hf-mirror.com/hfd/hfd.sh
                  chmod a+x hfd.sh
                  apt-get update && apt-get install -y aria2
                  ./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                  # huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                else
                  echo "Model already exists at $DEST_DIR"
                fi
            volumeMounts:
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        containers:
          - name: vllm-leader
            image: {{ .Values.vllm.image }}
            imagePullPolicy: IfNotPresent
            securityContext:
              capabilities:
                add: [ "IPC_LOCK" ]
            env:
              # - name: HUGGING_FACE_HUB_TOKEN
              #   value: {{ .Values.vllm.huggingfaceToken }}
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_DEBUG
                value: INFO
              - name: NCCL_IB_HCA
                value: mlx5_0:1
              - name: NCCL_IB_GID_INDEX
                value: "0"  # or "7", depending on your network configuration
              - name: RAY_DEDUP_LOGS
                value: "0"
            command:
              - sh
              - -c
              - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
                MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_PATH='{{ .Values.model.localMountPath }}/'$MODEL_NAME;
                python3 -m vllm.entrypoints.openai.api_server --port 8080 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --trust_remote_code"
            resources:
              limits:
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                memory: {{ .Values.resources.memoryLimit }}
                ephemeral-storage: 10Gi
                rdma/rdma_shared_device_a: 10
              requests:
                ephemeral-storage: 10Gi
                cpu: {{ .Values.resources.cpuRequest }}
            ports:
              - containerPort: 8080
                name: http
            readinessProbe:
              tcpSocket:
                # httpGet:
                #   path: /health
                port: 8080
              initialDelaySeconds: 120
              periodSeconds: 20
              timeoutSeconds: 5
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: {{ .Values.resources.shmSize }}
          - name: weight-volume
            persistentVolumeClaim:
              claimName: {{ .Values.app }}-pvc-model
    workerTemplate:
      spec:
        containers:
          - name: vllm-worker
            image: {{ .Values.vllm.image }}
            imagePullPolicy: IfNotPresent
            securityContext:
              capabilities:
                add: [ "IPC_LOCK" ]
            command:
              - sh
              - -c
              - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
            resources:
              limits:
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                memory: {{ .Values.resources.memoryLimit }}
                ephemeral-storage: 10Gi
                rdma/rdma_shared_device_a: 10
              requests:
                ephemeral-storage: 10Gi
                cpu: {{ .Values.resources.cpuRequest }}
            env:
              # - name: HUGGING_FACE_HUB_TOKEN
              #   value: {{ .Values.vllm.huggingfaceToken }}
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_DEBUG
                value: INFO
              - name: NCCL_IB_HCA
                value: mlx5_0:1
              - name: NCCL_IB_GID_INDEX
                value: "0"  # or "7", depending on your network configuration
              - name: RAY_DEDUP_LOGS
                value: "0"
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: {{ .Values.resources.shmSize }}
          - name: weight-volume
            persistentVolumeClaim:
              claimName: {{ .Values.app }}-pvc-model
{{- end }}
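Note the parallelism split in the leader command: tensor parallelism spans the GPUs inside one pod and pipeline parallelism spans the pods of the group, so one replica of the group consumes gpuLimit × workerSize GPUs. With the defaults in this chart's values.yaml:

    # GPUs per LeaderWorkerSet replica
    # --tensor-parallel-size   = resources.gpuLimit = 1   (GPUs per pod)
    # --pipeline_parallel_size = workerSize         = 2   (pods per group)
    # total = 1 * 2 = 2 GPUs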
vllm/vllm-app/templates/model-download-job.yaml (Normal file, 44 lines)
@@ -0,0 +1,44 @@
{{- if .Values.model.download.enabled }}
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ .Release.Name }}-download-model
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade  # run before install/upgrade
    "helm.sh/hook-weight": "-10"  # run ahead of other hooks
    "helm.sh/hook-delete-policy": hook-succeeded
spec:
  template:
    spec:
      restartPolicy: OnFailure
      containers:
        - name: downloader
          image: {{ .Values.model.download.image }}
          env:
            - name: HF_ENDPOINT
              value: https://hf-mirror.com
            - name: HUGGING_FACE_HUB_TOKEN
              value: {{ .Values.model.huggingfaceToken }}
          command:
            - sh
            - -c
            - |
              DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
              if [ -d "$DEST_DIR" ]; then
                echo "Model already exists at $DEST_DIR"
              else
                echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
                wget https://hf-mirror.com/hfd/hfd.sh
                chmod a+x hfd.sh
                apt-get update && apt-get install -y aria2
                ./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                # huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
              fi
          volumeMounts:
            - name: model-storage
              mountPath: {{ .Values.model.localMountPath }}
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: nfs-pvc-model  # reuse the existing PVC
{{- end }}
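The Job is gated on `model.download.enabled`, which defaults to false in values.yaml, so by default the initContainers above do the downloading at pod start. Enabling the hook pre-populates the volume before the release installs:

    # Pre-download the weights via the pre-install/pre-upgrade hook
    helm install vllm ./vllm-app --set model.download.enabled=true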
vllm/vllm-app/templates/nfs-pv.yaml (Normal file, 14 lines)
@@ -0,0 +1,14 @@
apiVersion: v1
kind: PersistentVolume
metadata:
  name: {{ .Values.app }}-pv-model
spec:
  storageClassName: {{ .Values.nfs.storageClass | default "local-path" }}
  capacity:
    storage: {{ .Values.nfs.pvSize }}
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  nfs:
    path: {{ .Values.nfs.path }}
    server: {{ .Values.nfs.server }}
vllm/vllm-app/templates/nfs-pvc.yaml (Normal file, 12 lines)
@@ -0,0 +1,12 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ .Values.app }}-pvc-model
  annotations:
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: {{ .Values.nfs.pvcSize }}
  volumeName: {{ .Values.app }}-pv-model
vllm/vllm-app/templates/services.yaml (Normal file, 39 lines)
@@ -0,0 +1,39 @@
#apiVersion: v1
#kind: Service
#metadata:
#  name: infer-leader-loadbalancer
#spec:
#  type: LoadBalancer
#  selector:
#    leaderworkerset.sigs.k8s.io/name: infer
#    role: leader
#  ports:
#    - protocol: TCP
#      port: 8080
#      targetPort: 8080
#
---
apiVersion: v1
kind: Service
metadata:
  name: {{ .Values.app }}-leader-nodeport
spec:
  type: NodePort
  {{- if gt (int .Values.workerSize) 1 }}
  selector:
    leaderworkerset.sigs.k8s.io/name: infer
    role: leader
  {{- else }}
  selector:
    app: vllm-app
  {{- end }}
  ports:
    - protocol: TCP
      port: 8080
      {{- if eq .Values.app "llama" }}
      targetPort: 7860
      {{- else }}
      targetPort: 8080
      {{- end }}
      nodePort: 30080
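The Service pins nodePort 30080 (matching `svc.port` in metadata.yaml) and forwards to 7860 for the llama web UI or 8080 otherwise. Both vLLM and lmdeploy answer the OpenAI-style model listing, so a hedged reachability check against the host from metadata.yaml is:

    curl http://10.6.14.123:30080/v1/models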
vllm/vllm-app/templates/single.yaml (Normal file, 114 lines)
@@ -0,0 +1,114 @@
{{- if eq (int .Values.workerSize) 1 }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app: vllm-app
  template:
    metadata:
      labels:
        app: vllm-app
    spec:
      initContainers:
        # Model download runs as the first initContainer
        - name: download-model
          image: {{ .Values.model.download.image }}
          imagePullPolicy: IfNotPresent
          env:
            - name: HF_ENDPOINT
              value: https://hf-mirror.com
            - name: HUGGING_FACE_HUB_TOKEN
              value: {{ .Values.model.huggingfaceToken }}
          command:
            - sh
            - -c
            - |
              MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
              DEST_DIR="{{ .Values.model.localMountPath }}/$MODEL_NAME"
              # DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
              # Check whether the model already exists; download it only if missing
              echo "DEST_DIR= $DEST_DIR"
              ls $DEST_DIR
              ls -l {{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}
              if [ ! -f "$DEST_DIR/config.json" ]; then
                ls -l {{ .Values.model.localMountPath }}
                echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
                wget https://hf-mirror.com/hfd/hfd.sh
                chmod a+x hfd.sh
                apt-get update && apt-get install -y aria2
                ./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                # huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
              else
                echo "Model already exists at $DEST_DIR"
              fi
          volumeMounts:
            - name: weight-volume
              mountPath: {{ .Values.model.localMountPath }}
      containers:
        - name: vllm-leader
          image: {{ .Values.vllm.image }}
          imagePullPolicy: IfNotPresent
          # securityContext:
          #   capabilities:
          #     add: [ "IPC_LOCK" ]
          env:
            - name: HUGGING_FACE_HUB_TOKEN
              value: {{ .Values.model.huggingfaceToken }}
            # - name: GLOO_SOCKET_IFNAME
            #   value: eth0
            # - name: NCCL_SOCKET_IFNAME
            #   value: eth0
            # - name: NCCL_IB_DISABLE
            #   value: "0"
            # - name: NCCL_DEBUG
            #   value: INFO
            # - name: NCCL_IB_HCA
            #   value: mlx5_0:1
            # - name: NCCL_IB_GID_INDEX
            #   value: "0"  # or "7", depending on your network configuration
            - name: RAY_DEDUP_LOGS
              value: "0"
          command:
            - sh
            - -c
            - "MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_PATH='{{ .Values.model.localMountPath }}/'$MODEL_NAME;
              echo 'Using single node ------------------------------------------';
              python3 -m vllm.entrypoints.openai.api_server --port 8080 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --trust_remote_code"
          resources:
            limits:
              nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
              memory: {{ .Values.resources.memoryLimit }}
              ephemeral-storage: 10Gi
              # rdma/rdma_shared_device_a: 10
            requests:
              ephemeral-storage: 10Gi
              cpu: {{ .Values.resources.cpuRequest }}
          ports:
            - containerPort: 8080
              name: http
          readinessProbe:
            # tcpSocket:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 120
            periodSeconds: 20
            timeoutSeconds: 5
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - name: weight-volume
              mountPath: {{ .Values.model.localMountPath }}
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: {{ .Values.resources.shmSize }}
        - name: weight-volume
          persistentVolumeClaim:
            claimName: {{ .Values.app }}-pvc-model
{{- end }}
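The single-node readiness probe uses vLLM's /health endpoint; the same endpoint is reachable from outside through the NodePort, which is handy when the probe keeps a pod out of rotation:

    curl -i http://10.6.14.123:30080/health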
vllm/vllm-app/values.yaml (Normal file, 58 lines)
@@ -0,0 +1,58 @@
# Default values for vllm-app.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# Model configuration
model:
  huggingfaceName: "Qwen/Qwen2.5-0.5B-Instruct"  # the only value users need to set
  localMountPath: "/Model"  # fixed mount path of the PVC
  huggingfaceToken: "<your-hf-token>"
  download:
    enabled: false  # enable automatic download
    image: "docker.io/vllm/vllm-openai:latest"  # image that ships huggingface-cli

# Application selection
app: "vllm"

resources:
  gpuLimit: 1
  cpuRequest: 12
  memoryLimit: "16Gi"
  shmSize: "20Gi"

# vLLM application configuration
vllm:
  image: "docker.io/vllm/vllm-openai:latest"
  # gpuLimit: 2
  # cpuRequest: 12
  # memoryLimit: "12Gi"
  # shmSize: "15Gi"

llama:
  image: "docker.io/library/one-click:v1"

# lmdeploy application configuration
lmdeploy:
  image: "docker.io/openmmlab/lmdeploy:latest-cu12"
  # gpuLimit: 2
  # cpuRequest: 12
  # memoryLimit: "12Gi"
  # shmSize: "15Gi"

# NFS PV/PVC configuration
nfs:
  server: "10.6.80.11"
  path: "/volume1/Dataset/PVStore/lab-data-model-pvc-c0beeab1-6dd5-4c6a-bd2c-6ce9e114c25e/Weight"
  storageClass: "local-path"
  pvSize: "500Gi"
  pvcSize: "50Gi"

# LeaderWorkerSet configuration
replicaCount: 1
workerSize: 2

nodeSelector: {}

tolerations: []

affinity: {}
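These defaults are what metadata.yaml overrides per deployment mode, and they can be overridden directly as well. A sketch of a single-node install serving a different model (the 7B name here is only an illustrative choice):

    helm install vllm ./vllm-app \
      --set workerSize=1 \
      --set model.huggingfaceName="Qwen/Qwen2.5-7B-Instruct"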
vllm/vllm-serve/.helmignore (Normal file, 23 lines)
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
vllm/vllm-serve/Chart.yaml (Normal file, 25 lines)
@@ -0,0 +1,25 @@
apiVersion: v2
name: vllm-serve
description: A Helm chart for deploying vLLM with NFS storage
annotations:
  "helm.sh/resource-policy": keep  # prevent resources from being deleted accidentally
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.16.0"
vllm/vllm-serve/templates/NOTES.txt (Normal file, 16 lines)
@@ -0,0 +1,16 @@
1. Get the application URL by running these commands:
{{- if contains "NodePort" .Values.svc.type }}
  export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "vllm-serve.fullname" . }})
  export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
  echo http://$NODE_IP:$NODE_PORT
{{- else if contains "LoadBalancer" .Values.svc.type }}
     NOTE: It may take a few minutes for the LoadBalancer IP to be available.
           You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "vllm-serve.fullname" . }}'
  export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "vllm-serve.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
  echo http://$SERVICE_IP:{{ .Values.svc.port }}
{{- else if contains "ClusterIP" .Values.svc.type }}
  export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "vllm-serve.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
  export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
  echo "Visit http://127.0.0.1:8080 to use your application"
  kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT
{{- end }}
vllm/vllm-serve/templates/_helpers.tpl (Normal file, 62 lines)
@@ -0,0 +1,62 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "vllm-serve.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "vllm-serve.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "vllm-serve.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels
*/}}
{{- define "vllm-serve.labels" -}}
helm.sh/chart: {{ include "vllm-serve.chart" . }}
{{ include "vllm-serve.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Selector labels
*/}}
{{- define "vllm-serve.selectorLabels" -}}
app.kubernetes.io/name: {{ include "vllm-serve.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Create the name of the service account to use
*/}}
{{- define "vllm-serve.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "vllm-serve.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
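These are the stock helpers generated by `helm create`; of them only `vllm-serve.fullname` is referenced so far (from NOTES.txt). To confirm everything renders before touching the cluster:

    # Render templates locally and lint the chart
    helm template my-vllm ./vllm-serve | head -n 40
    helm lint ./vllm-serve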
vllm/vllm-serve/templates/lws.yaml (Normal file, 188 lines)
@@ -0,0 +1,188 @@
{{- if gt (int .Values.workerSize) 1 }}
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: {{ .Release.Name }}
spec:
  replicas: {{ .Values.replicaCount }}
  leaderWorkerTemplate:
    size: {{ .Values.workerSize }}
    restartPolicy: RecreateGroupOnPodRestart
    leaderTemplate:
      metadata:
        labels:
          role: leader
      spec:
        initContainers:
          # Model download runs as the first initContainer
          - name: download-model
            image: {{ .Values.model.download.image }}
            imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }}
            env:
              - name: HF_ENDPOINT
                value: https://hf-mirror.com
              - name: HUGGING_FACE_HUB_TOKEN
                value: {{ .Values.model.huggingfaceToken }}
            command:
              - sh
              - -c
              - |
                MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
                DEST_DIR="{{ .Values.model.localMountPath }}/$MODEL_NAME"
                # DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
                # Check whether the model already exists; download it only if missing
                echo "DEST_DIR= $DEST_DIR"
                if [ ! -f "$DEST_DIR/config.json" ]; then
                  ls -l {{ .Values.model.localMountPath }}
                  echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
                  wget https://hf-mirror.com/hfd/hfd.sh
                  chmod a+x hfd.sh
                  apt-get update && apt-get install -y aria2
                  ./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                  # huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                else
                  echo "Model already exists at $DEST_DIR"
                fi
            volumeMounts:
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        containers:
          - name: vllm-leader
            image: {{ .Values.vllm.image }}
            imagePullPolicy: IfNotPresent
            securityContext:
              capabilities:
                add: [ "IPC_LOCK" ]
            env:
              # - name: HUGGING_FACE_HUB_TOKEN
              #   value: {{ .Values.vllm.huggingfaceToken }}
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_DEBUG
                value: INFO
              - name: NCCL_IB_HCA
                value: mlx5_0:1
              - name: NCCL_IB_GID_INDEX
                value: "0"  # or "7", depending on your network configuration
              - name: RAY_DEDUP_LOGS
                value: "0"
            command:
              - sh
              - -c
              - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
                MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_PATH='{{ .Values.model.localMountPath }}/'$MODEL_NAME;
                python3 -m vllm.entrypoints.openai.api_server --port 8080 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --trust_remote_code"
            resources:
              limits:
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                memory: {{ .Values.resources.memoryLimit }}
                ephemeral-storage: 10Gi
                rdma/rdma_shared_device_a: 10
              requests:
                ephemeral-storage: 10Gi
                cpu: {{ .Values.resources.cpuRequest }}
            ports:
              - containerPort: 8080
                name: http
            readinessProbe:
              tcpSocket:
                # httpGet:
                #   path: /health
                port: 8080
              initialDelaySeconds: 120
              periodSeconds: 20
              timeoutSeconds: 5
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: {{ .Values.resources.shmSize }}
          - name: weight-volume
            persistentVolumeClaim:
              claimName: {{ .Release.Name }}-pvc-model
        {{- with .Values.nodeSelector }}
        nodeSelector:
          {{- toYaml . | nindent 10 }}
        {{- end }}
        {{- with .Values.affinity }}
        affinity:
          {{- toYaml . | nindent 10 }}
        {{- end }}
        {{- with .Values.tolerations }}
        tolerations:
          {{- toYaml . | nindent 10 }}
        {{- end }}
    workerTemplate:
      spec:
        containers:
          - name: vllm-worker
            image: {{ .Values.vllm.image }}
            imagePullPolicy: IfNotPresent
            securityContext:
              capabilities:
                add: [ "IPC_LOCK" ]
            command:
              - sh
              - -c
              - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
            resources:
              limits:
                nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
                memory: {{ .Values.resources.memoryLimit }}
                ephemeral-storage: 10Gi
                rdma/rdma_shared_device_a: 10
              requests:
                ephemeral-storage: 10Gi
                cpu: {{ .Values.resources.cpuRequest }}
            env:
              # - name: HUGGING_FACE_HUB_TOKEN
              #   value: {{ .Values.vllm.huggingfaceToken }}
              - name: GLOO_SOCKET_IFNAME
                value: eth0
              - name: NCCL_SOCKET_IFNAME
                value: eth0
              - name: NCCL_IB_DISABLE
                value: "0"
              - name: NCCL_DEBUG
                value: INFO
              - name: NCCL_IB_HCA
                value: mlx5_0:1
              - name: NCCL_IB_GID_INDEX
                value: "0"  # or "7", depending on your network configuration
              - name: RAY_DEDUP_LOGS
                value: "0"
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - name: weight-volume
                mountPath: {{ .Values.model.localMountPath }}
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: {{ .Values.resources.shmSize }}
          - name: weight-volume
            persistentVolumeClaim:
              claimName: {{ .Release.Name }}-pvc-model
        {{- with .Values.nodeSelector }}
        nodeSelector:
          {{- toYaml . | nindent 10 }}
        {{- end }}
        {{- with .Values.affinity }}
        affinity:
          {{- toYaml . | nindent 10 }}
        {{- end }}
        {{- with .Values.tolerations }}
        tolerations:
          {{- toYaml . | nindent 10 }}
        {{- end }}
{{- end }}
vllm/vllm-serve/templates/nfs-pvc.yaml (Normal file, 28 lines)
@@ -0,0 +1,28 @@
apiVersion: v1
kind: PersistentVolume
metadata:
  name: {{ .Release.Name }}-pv-model
spec:
  storageClassName: {{ .Values.nfs.storageClass | default "local-path" }}
  capacity:
    storage: {{ .Values.nfs.pvSize }}
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  nfs:
    path: {{ .Values.nfs.path }}
    server: {{ .Values.nfs.server }}
---

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ .Release.Name }}-pvc-model
  annotations:
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: {{ .Values.nfs.pvcSize }}
  volumeName: {{ .Release.Name }}-pv-model
vllm/vllm-serve/templates/services.yaml (Normal file, 35 lines)
@@ -0,0 +1,35 @@
#apiVersion: v1
#kind: Service
#metadata:
#  name: infer-leader-loadbalancer
#spec:
#  type: LoadBalancer
#  selector:
#    leaderworkerset.sigs.k8s.io/name: infer
#    role: leader
#  ports:
#    - protocol: TCP
#      port: 8080
#      targetPort: 8080
#
---
apiVersion: v1
kind: Service
metadata:
  name: {{ .Release.Name }}-svc
spec:
  type: {{ .Values.svc.type | default "NodePort" }}
  {{- if gt (int .Values.workerSize) 1 }}
  selector:
    leaderworkerset.sigs.k8s.io/name: {{ .Release.Name }}
    role: leader
  {{- else }}
  selector:
    app: {{ .Release.Name }}
  {{- end }}
  ports:
    - protocol: TCP
      port: {{ .Values.svc.port | default 8080 }}
      targetPort: {{ .Values.svc.targetPort | default 8080 }}
      nodePort: {{ .Values.svc.nodePort | default 30080 }}
vllm/vllm-serve/templates/single.yaml (Normal file, 108 lines)
@@ -0,0 +1,108 @@
{{- if eq (int .Values.workerSize) 1 }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ .Release.Name }}
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app: {{ .Release.Name }}
  template:
    metadata:
      labels:
        app: {{ .Release.Name }}
    spec:
      initContainers:
        # Model download runs as the first initContainer
        - name: download-model
          image: {{ .Values.model.download.image }}
          imagePullPolicy: IfNotPresent
          env:
            - name: HF_ENDPOINT
              value: https://hf-mirror.com
            - name: HUGGING_FACE_HUB_TOKEN
              value: {{ .Values.model.huggingfaceToken }}
          command:
            - sh
            - -c
            - |
              MODEL_NAME=$(basename "{{ .Values.model.huggingfaceName }}")
              DEST_DIR="{{ .Values.model.localMountPath }}/$MODEL_NAME"
              # DEST_DIR="{{ .Values.model.localMountPath }}/{{ .Values.model.huggingfaceName }}"
              # Check whether the model already exists; download it only if missing
              echo "DEST_DIR= $DEST_DIR"
              if [ ! -f "$DEST_DIR/config.json" ]; then
                ls -l {{ .Values.model.localMountPath }}
                echo "Downloading model {{ .Values.model.huggingfaceName }} to $DEST_DIR"
                wget https://hf-mirror.com/hfd/hfd.sh
                chmod a+x hfd.sh
                apt-get update && apt-get install -y aria2
                ./hfd.sh {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
                # huggingface-cli download {{ .Values.model.huggingfaceName }} --local-dir "$DEST_DIR"
              else
                echo "Model already exists at $DEST_DIR"
              fi
          volumeMounts:
            - name: weight-volume
              mountPath: {{ .Values.model.localMountPath }}
      containers:
        - name: vllm-pod
          image: {{ .Values.vllm.image }}
          imagePullPolicy: IfNotPresent
          env:
            - name: HUGGING_FACE_HUB_TOKEN
              value: {{ .Values.model.huggingfaceToken }}
            - name: RAY_DEDUP_LOGS
              value: "0"
          command:
            - sh
            - -c
            - "MODEL_NAME=$(basename '{{ .Values.model.huggingfaceName }}'); MODEL_PATH='{{ .Values.model.localMountPath }}/'$MODEL_NAME;
              echo 'Using single node ------------------------------------------';
              python3 -m vllm.entrypoints.openai.api_server --port 8080 --model $MODEL_PATH --tensor-parallel-size {{ .Values.resources.gpuLimit }} --pipeline_parallel_size {{ .Values.workerSize }} --trust_remote_code"
          resources:
            limits:
              nvidia.com/gpu: "{{ .Values.resources.gpuLimit }}"
              memory: {{ .Values.resources.memoryLimit }}
              ephemeral-storage: 10Gi
            requests:
              ephemeral-storage: 10Gi
              cpu: {{ .Values.resources.cpuRequest }}
          ports:
            - containerPort: 8080
              name: http
          readinessProbe:
            # tcpSocket:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 120
            periodSeconds: 20
            timeoutSeconds: 5
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - name: weight-volume
              mountPath: {{ .Values.model.localMountPath }}
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: {{ .Values.resources.shmSize }}
        - name: weight-volume
          persistentVolumeClaim:
            claimName: {{ .Release.Name }}-pvc-model
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
{{- end }}
vllm/vllm-serve/values.yaml (Normal file, 75 lines)
@@ -0,0 +1,75 @@
# Default values for vllm-serve.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# This is for the secrets for pulling an image from a private repository. More information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/
imagePullSecrets: []
imagePullPolicy: IfNotPresent
# This is to override the chart name.
nameOverride: ""
fullnameOverride: ""

# This section builds out the service account. More information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/
serviceAccount:
  # Specifies whether a service account should be created
  create: true
  # Automatically mount a ServiceAccount's API credentials?
  automount: true
  # Annotations to add to the service account
  annotations: {}
  # The name of the service account to use.
  # If not set and create is true, a name is generated using the fullname template
  name: ""

# Model configuration
model:
  huggingfaceName: "Qwen/Qwen2.5-0.5B-Instruct"  # the only value users need to set
  localMountPath: "/Model"  # fixed mount path of the PVC
  huggingfaceToken: "<your-hf-token>"
  download:  # automatic download
    image: "docker.io/vllm/vllm-openai:latest"  # image that ships huggingface-cli

# Application selection

resources:
  gpuLimit: 1
  cpuRequest: 12
  memoryLimit: "16Gi"
  shmSize: "20Gi"

svc:
  type: NodePort
  port: 80
  targetPort: 8080
  nodePort: 30080

# vLLM application configuration
vllm:
  image: "docker.io/vllm/vllm-openai:latest"

llama:
  image: "docker.io/library/one-click:v1"

# lmdeploy application configuration
lmdeploy:
  image: "docker.io/openmmlab/lmdeploy:latest-cu12"

# NFS PV/PVC configuration
nfs:
  server: "10.6.80.11"
  path: "/volume1/Dataset/PVStore/lab-data-model-pvc-c0beeab1-6dd5-4c6a-bd2c-6ce9e114c25e/Weight"
  storageClass: "local-path"
  pvSize: "500Gi"
  pvcSize: "50Gi"

# LeaderWorkerSet configuration
replicaCount: 1
workerSize: 2

nodeSelector: {}

tolerations: []

affinity: {}
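Unlike vllm-app, this chart keys its resources off the release name, so several releases can coexist. A sketch:

    helm install qwen ./vllm-serve --set workerSize=2
    kubectl get svc qwen-svc
    kubectl get pvc qwen-pvc-model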