# File-viewer listing header (not part of the manifest): 59 lines, 1.4 KiB, YAML.
---
# Deployment that runs a single vLLM OpenAI-compatible server pod serving
# Qwen/Qwen2.5-7B-Instruct under the model name "default" on port 8000.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
  labels:
    app.kubernetes.io/name: vllm-server
    app.kubernetes.io/component: model-server
    app.kubernetes.io/part-of: ocdp-workload
  annotations: {}
spec:
  replicas: 1
  # The selector must match the pod template labels below.
  selector:
    matchLabels:
      app.kubernetes.io/name: vllm-server
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vllm-server
        app.kubernetes.io/component: model-server
        app.kubernetes.io/part-of: ocdp-workload
    spec:
      containers:
        - name: vllm
          # NOTE(review): a mutable `latest` tag combined with IfNotPresent
          # means a node that already cached the image keeps using that copy,
          # so different nodes may run different builds. Consider pinning a
          # specific tag or digest.
          image: vllm/vllm-openai:latest
          imagePullPolicy: IfNotPresent
          # Flags and their values are separate list items; the port is quoted
          # so it is passed as a string argument.
          args:
            - --host
            - 0.0.0.0
            - --port
            - "8000"
            - --model
            - Qwen/Qwen2.5-7B-Instruct
            - --served-model-name
            - default
          env:
            # Read from the vllm-secrets Secret; `optional: true` lets the
            # container start even when the Secret or key does not exist.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: vllm-secrets
                  key: hfToken
                  optional: true
          ports:
            - name: http
              containerPort: 8000
          # Readiness is checked via GET /health on the named `http` port,
          # first after 20s and then every 10s.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 20
            periodSeconds: 10
          # NOTE(review): only CPU and memory are declared; no accelerator
          # resource (e.g. a GPU extended resource) is requested. Confirm
          # whether this workload is expected to be scheduled onto GPU nodes.
          resources:
            requests:
              cpu: "2"
              memory: 12Gi
            limits:
              cpu: "4"
              memory: 24Gi