first commit

This commit is contained in:
2026-05-28 07:21:15 +00:00
commit 6465520041
57 changed files with 942 additions and 0 deletions

View File

@ -0,0 +1,8 @@
# vllm-server
OpenAI-compatible model serving with vLLM.
The base manifests request no GPU resources, so they can be applied on
CPU-only clusters. Add the `components/gpu-nvidia` component in environments
that provide NVIDIA GPUs, and let the instance overlay patch the model name,
resources, and cache size.

View File

@ -0,0 +1,58 @@
---
# Base Deployment for the vLLM OpenAI-compatible model server.
# Runs a single replica and requests only CPU and memory; no GPU resource or
# volume is declared in this manifest.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
  labels:
    app.kubernetes.io/name: vllm-server
    app.kubernetes.io/component: model-server
    app.kubernetes.io/part-of: ocdp-workload
  annotations: {}
spec:
  replicas: 1
  # Pods are selected by the name label only; the pod template below carries
  # the same label plus the component/part-of labels.
  selector:
    matchLabels:
      app.kubernetes.io/name: vllm-server
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vllm-server
        app.kubernetes.io/component: model-server
        app.kubernetes.io/part-of: ocdp-workload
    spec:
      containers:
        - name: vllm
          # NOTE(review): `latest` is a mutable tag, and with IfNotPresent each
          # node keeps whichever copy it pulled first. Consider pinning a
          # release tag or digest in the overlay.
          image: vllm/vllm-openai:latest
          imagePullPolicy: IfNotPresent
          # Flag/value pairs passed to the image entrypoint.
          args:
            # Listen on all interfaces, on the port exposed below.
            - --host
            - 0.0.0.0
            - --port
            - "8000"
            # Model to load, and the name under which it is served.
            - --model
            - Qwen/Qwen2.5-7B-Instruct
            - --served-model-name
            - default
          ports:
            - name: http
              containerPort: 8000
          env:
            # Read from key `hfToken` of Secret `vllm-secrets`; `optional: true`
            # lets the pod start when the Secret or key does not exist.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: vllm-secrets
                  key: hfToken
                  optional: true
          resources:
            requests:
              cpu: "2"
              memory: 12Gi
            limits:
              cpu: "4"
              memory: 24Gi
          # The pod is marked ready once GET /health on the `http` port succeeds.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 20
            periodSeconds: 10

View File

@ -0,0 +1,6 @@
---
# Base kustomization: bundles the Deployment and Service manifests that sit
# next to this file.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - deployment.yaml
  - service.yaml

View File

@ -0,0 +1,16 @@
---
# Service in front of the vllm-server pods.
# No `type` is set here, so the Kubernetes default (ClusterIP) applies.
apiVersion: v1
kind: Service
metadata:
  name: vllm-server
  labels:
    app.kubernetes.io/name: vllm-server
    app.kubernetes.io/component: model-server
    app.kubernetes.io/part-of: ocdp-workload
spec:
  # Routes to any pod carrying the name label.
  selector:
    app.kubernetes.io/name: vllm-server
  ports:
    # Service port 8000 forwards to the container port named `http`.
    - name: http
      port: 8000
      targetPort: http

View File

@ -0,0 +1,7 @@
# JSON 6902 patch operations for a Deployment's pod template.

# Run the pod under the `nvidia` RuntimeClass.
- op: add
  path: /spec/template/spec/runtimeClassName
  value: nvidia

# Add a limit of one `nvidia.com/gpu` to the first container.
# `~1` is the JSON Pointer escape for `/` inside a path segment.
- op: add
  path: /spec/template/spec/containers/0/resources/limits/nvidia.com~1gpu
  value: 1

View File

@ -0,0 +1,10 @@
---
# Kustomize Component: applies gpu-patch.yaml to the apps/v1 Deployment named
# vllm-server.
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
patches:
  - target:
      group: apps
      version: v1
      kind: Deployment
      name: vllm-server
    path: gpu-patch.yaml

View File

@ -0,0 +1,21 @@
---
# Ingress that sends all HTTP traffic for vllm.example.local to the
# vllm-server Service.
# NOTE(review): no `ingressClassName` is set, so this relies on the cluster
# having a default IngressClass. Confirm, or set it in the overlay.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: vllm-server
  labels:
    app.kubernetes.io/name: vllm-server
    app.kubernetes.io/component: ingress
    app.kubernetes.io/part-of: ocdp-workload
spec:
  rules:
    - host: vllm.example.local
      http:
        paths:
          # Prefix match on `/` covers every request path.
          - pathType: Prefix
            path: /
            backend:
              service:
                name: vllm-server
                # Refers to the Service port by name rather than by number.
                port:
                  name: http

View File

@ -0,0 +1,5 @@
---
# Kustomize Component: adds the Ingress manifest to whatever kustomization
# includes this component.
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
resources:
  - ingress.yaml

View File

@ -0,0 +1,17 @@
# JSON 6902 patch operations that attach a persistent model cache to a
# Deployment's first container.

# The mounted claim (`vllm-cache`) is requested as ReadWriteOnce. With the
# default RollingUpdate strategy the replacement pod may be scheduled on a
# different node while the old pod still holds the volume, which stalls the
# rollout. Recreate terminates the old pod before the new one starts.
- op: add
  path: /spec/strategy
  value:
    type: Recreate

# Point the Hugging Face home directory at the mounted cache volume.
# Appends to the container's existing `env` list (`-` means end of array).
- op: add
  path: /spec/template/spec/containers/0/env/-
  value:
    name: HF_HOME
    value: /cache/huggingface

# Mount the cache volume at /cache. This sets the whole `volumeMounts` list,
# so it replaces any list already present on the container.
- op: add
  path: /spec/template/spec/containers/0/volumeMounts
  value:
    - name: model-cache
      mountPath: /cache

# Back the volume with the `vllm-cache` PersistentVolumeClaim. This sets the
# whole pod-level `volumes` list, replacing any list already present.
- op: add
  path: /spec/template/spec/volumes
  value:
    - name: model-cache
      persistentVolumeClaim:
        claimName: vllm-cache

View File

@ -0,0 +1,12 @@
---
# Kustomize Component: adds the cache PersistentVolumeClaim and applies
# deployment-cache-patch.yaml to the apps/v1 Deployment named vllm-server.
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
resources:
  - pvc.yaml
patches:
  - target:
      group: apps
      version: v1
      kind: Deployment
      name: vllm-server
    path: deployment-cache-patch.yaml

View File

@ -0,0 +1,15 @@
---
# Claim for the model cache volume: 100Gi, ReadWriteOnce.
# NOTE(review): no `storageClassName` is set, so the cluster's default
# StorageClass is presumably used. Confirm per environment.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vllm-cache
  labels:
    app.kubernetes.io/name: vllm-server
    app.kubernetes.io/component: model-cache
    app.kubernetes.io/part-of: ocdp-workload
spec:
  resources:
    requests:
      storage: 100Gi
  accessModes:
    - ReadWriteOnce

View File

@ -0,0 +1,8 @@
---
# Kustomize Component: applies service-loadbalancer-patch.yaml to the Service
# named vllm-server.
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
patches:
  - target:
      kind: Service
      name: vllm-server
    path: service-loadbalancer-patch.yaml

View File

@ -0,0 +1,4 @@
# JSON 6902 patch: set the Service type to LoadBalancer.
- op: add
  path: /spec/type
  value: LoadBalancer

View File

@ -0,0 +1,8 @@
---
# Kustomize Component: applies service-nodeport-patch.yaml to the Service
# named vllm-server.
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
patches:
  - target:
      kind: Service
      name: vllm-server
    path: service-nodeport-patch.yaml

View File

@ -0,0 +1,4 @@
# JSON 6902 patch: set the Service type to NodePort.
- op: add
  path: /spec/type
  value: NodePort