first commit
This commit is contained in:
8
apps/vllm-server/README.md
Normal file
8
apps/vllm-server/README.md
Normal file
@ -0,0 +1,8 @@
|
||||
# vllm-server

OpenAI-compatible model serving with vLLM.

The base is CPU-safe YAML: it requests no GPU resources. Add
`components/gpu-nvidia` in environments that provide NVIDIA GPUs, and let the
instance overlay patch the model name, resources, and cache size.
|
||||
|
||||
58
apps/vllm-server/base/deployment.yaml
Normal file
58
apps/vllm-server/base/deployment.yaml
Normal file
@ -0,0 +1,58 @@
|
||||
---
# Base Deployment for the vLLM OpenAI-compatible server.
# Only CPU and memory are requested here; nothing in this manifest asks for a GPU.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
  labels:
    app.kubernetes.io/name: vllm-server
    app.kubernetes.io/component: model-server
    app.kubernetes.io/part-of: ocdp-workload
  annotations: {}
spec:
  replicas: 1
  selector:
    # Must stay a subset of the pod template labels below.
    matchLabels:
      app.kubernetes.io/name: vllm-server
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vllm-server
        app.kubernetes.io/component: model-server
        app.kubernetes.io/part-of: ocdp-workload
    spec:
      containers:
        - name: vllm
          # NOTE(review): a mutable `latest` tag together with IfNotPresent
          # means each node keeps whatever image it already cached; consider
          # pinning a release tag from the overlay.
          image: vllm/vllm-openai:latest
          imagePullPolicy: IfNotPresent
          args:
            - --host
            - 0.0.0.0
            - --port
            # Quoted so the argument stays a string; matches containerPort.
            - '8000'
            - --model
            - Qwen/Qwen2.5-7B-Instruct
            - --served-model-name
            - default
          env:
            # Read from key `hfToken` of Secret `vllm-secrets`; the reference
            # is marked optional.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: vllm-secrets
                  key: hfToken
                  optional: true
          ports:
            - name: http
              containerPort: 8000
          readinessProbe:
            httpGet:
              path: /health
              # Refers to the named container port above.
              port: http
            initialDelaySeconds: 20
            periodSeconds: 10
          resources:
            requests:
              cpu: '2'
              memory: 12Gi
            limits:
              cpu: '4'
              memory: 24Gi
|
||||
|
||||
6
apps/vllm-server/base/kustomization.yaml
Normal file
6
apps/vllm-server/base/kustomization.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
---
# Kustomize base: bundles the Deployment and the Service.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - deployment.yaml
  - service.yaml
|
||||
|
||||
16
apps/vllm-server/base/service.yaml
Normal file
16
apps/vllm-server/base/service.yaml
Normal file
@ -0,0 +1,16 @@
|
||||
---
# Service in front of the vllm-server pods.
# No `spec.type` is set in this manifest.
apiVersion: v1
kind: Service
metadata:
  name: vllm-server
  labels:
    app.kubernetes.io/name: vllm-server
    app.kubernetes.io/component: model-server
    app.kubernetes.io/part-of: ocdp-workload
spec:
  # Selects pods by the name label only.
  selector:
    app.kubernetes.io/name: vllm-server
  ports:
    - name: http
      port: 8000
      # Resolved against the container port named `http`.
      targetPort: http
|
||||
|
||||
7
apps/vllm-server/components/gpu-nvidia/gpu-patch.yaml
Normal file
7
apps/vllm-server/components/gpu-nvidia/gpu-patch.yaml
Normal file
@ -0,0 +1,7 @@
|
||||
# JSON 6902 patch: run the pod under the `nvidia` runtime class and add a
# limit of one `nvidia.com/gpu` to the first container.
- op: add
  path: /spec/template/spec/runtimeClassName
  value: nvidia
# `~1` is the JSON Pointer escape for `/`, so this key is `nvidia.com/gpu`.
- op: add
  path: /spec/template/spec/containers/0/resources/limits/nvidia.com~1gpu
  value: 1
|
||||
|
||||
10
apps/vllm-server/components/gpu-nvidia/kustomization.yaml
Normal file
10
apps/vllm-server/components/gpu-nvidia/kustomization.yaml
Normal file
@ -0,0 +1,10 @@
|
||||
---
# Kustomize component: applies gpu-patch.yaml to the vllm-server Deployment.
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
patches:
  - path: gpu-patch.yaml
    target:
      group: apps
      version: v1
      kind: Deployment
      name: vllm-server
|
||||
|
||||
21
apps/vllm-server/components/ingress/ingress.yaml
Normal file
21
apps/vllm-server/components/ingress/ingress.yaml
Normal file
@ -0,0 +1,21 @@
|
||||
---
# Ingress sending every path under `/` on the configured host to the
# vllm-server Service port named `http`.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: vllm-server
  labels:
    app.kubernetes.io/name: vllm-server
    app.kubernetes.io/component: ingress
    app.kubernetes.io/part-of: ocdp-workload
spec:
  rules:
    # NOTE(review): looks like a placeholder host meant to be patched per
    # environment - confirm.
    - host: vllm.example.local
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: vllm-server
                port:
                  name: http
|
||||
|
||||
5
apps/vllm-server/components/ingress/kustomization.yaml
Normal file
5
apps/vllm-server/components/ingress/kustomization.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
---
# Kustomize component: adds the Ingress resource.
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
resources:
  - ingress.yaml
|
||||
|
||||
@ -0,0 +1,17 @@
|
||||
# JSON 6902 patch: back the Hugging Face cache with the `vllm-cache` PVC.

# Point HF_HOME at a directory inside the mounted volume. `/-` appends to the
# container's existing env list.
- op: add
  path: /spec/template/spec/containers/0/env/-
  value:
    name: HF_HOME
    value: /cache/huggingface
# Sets the whole volumeMounts list on the first container.
- op: add
  path: /spec/template/spec/containers/0/volumeMounts
  value:
    - name: model-cache
      mountPath: /cache
# Sets the whole pod volumes list.
- op: add
  path: /spec/template/spec/volumes
  value:
    - name: model-cache
      persistentVolumeClaim:
        claimName: vllm-cache
# The claim is ReadWriteOnce. Under the default RollingUpdate strategy the
# replacement pod starts while the old pod still holds the volume, and a
# replacement scheduled to another node cannot attach it, so the rollout
# stalls. Recreate stops the old pod first. The whole strategy object is set
# so that no leftover rollingUpdate settings remain alongside Recreate.
- op: add
  path: /spec/strategy
  value:
    type: Recreate
|
||||
|
||||
12
apps/vllm-server/components/pvc-cache/kustomization.yaml
Normal file
12
apps/vllm-server/components/pvc-cache/kustomization.yaml
Normal file
@ -0,0 +1,12 @@
|
||||
---
# Kustomize component: adds the cache PVC and patches the vllm-server
# Deployment with deployment-cache-patch.yaml.
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
resources:
  - pvc.yaml
patches:
  - path: deployment-cache-patch.yaml
    target:
      group: apps
      version: v1
      kind: Deployment
      name: vllm-server
|
||||
|
||||
15
apps/vllm-server/components/pvc-cache/pvc.yaml
Normal file
15
apps/vllm-server/components/pvc-cache/pvc.yaml
Normal file
@ -0,0 +1,15 @@
|
||||
---
# PersistentVolumeClaim named `vllm-cache`.
# No storageClassName is set in this manifest.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vllm-cache
  labels:
    app.kubernetes.io/name: vllm-server
    app.kubernetes.io/component: model-cache
    app.kubernetes.io/part-of: ocdp-workload
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi
|
||||
|
||||
@ -0,0 +1,8 @@
|
||||
---
# Kustomize component: applies service-loadbalancer-patch.yaml to the
# vllm-server Service.
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
patches:
  - path: service-loadbalancer-patch.yaml
    target:
      kind: Service
      name: vllm-server
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
# JSON 6902 patch: set the Service type to LoadBalancer.
- op: add
  path: /spec/type
  value: LoadBalancer
|
||||
|
||||
@ -0,0 +1,8 @@
|
||||
---
# Kustomize component: applies service-nodeport-patch.yaml to the
# vllm-server Service.
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
patches:
  - path: service-nodeport-patch.yaml
    target:
      kind: Service
      name: vllm-server
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
# JSON 6902 patch: set the Service type to NodePort.
- op: add
  path: /spec/type
  value: NodePort
|
||||
|
||||
Reference in New Issue
Block a user