---
# Deployment that runs a single replica of the vllm/vllm-openai image,
# started with the flags listed under `args` and exposing port 8000.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
  labels:
    app.kubernetes.io/name: vllm-server
    app.kubernetes.io/component: model-server
    app.kubernetes.io/part-of: ocdp-workload
  annotations: {}
spec:
  replicas: 1
  # The selector must keep matching the pod template's name label below.
  selector:
    matchLabels:
      app.kubernetes.io/name: vllm-server
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vllm-server
        app.kubernetes.io/component: model-server
        app.kubernetes.io/part-of: ocdp-workload
    spec:
      containers:
        - name: vllm
          # NOTE(review): `latest` combined with IfNotPresent means each node
          # keeps whichever build it pulled first; consider pinning a version
          # tag so every replica and every rollout uses the same image.
          image: vllm/vllm-openai:latest
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 8000
          env:
            # Read from key `hfToken` of Secret `vllm-secrets`. Marked
            # optional, so the container still starts when the Secret or the
            # key is absent (HF_TOKEN is then simply unset).
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: vllm-secrets
                  key: hfToken
                  optional: true
          # Flag/value pairs handed to the image's entrypoint; order matters.
          # The --port value must stay in step with containerPort above.
          args:
            - --host
            - '0.0.0.0'
            - --port
            - '8000'
            - --model
            - Qwen/Qwen2.5-7B-Instruct
            - --served-model-name
            - default
          # NOTE(review): only CPU and memory are declared here; no accelerator
          # resource (e.g. nvidia.com/gpu) is requested. Confirm whether that
          # is intentional or injected elsewhere (overlay, runtime default).
          resources:
            requests:
              cpu: '2'
              memory: 12Gi
            limits:
              cpu: '4'
              memory: 24Gi
          # The pod is marked Ready once GET /health on the `http` port
          # succeeds; the first check runs after 20s, then every 10s.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 20
            periodSeconds: 10