# ocdp-go/test/vllm_k3s_deploy_smoke.py

#!/usr/bin/env python3
# Covers a real k3s deployment smoke path for vllm-serve: admin-created ordinary
# user with integer GPU memory quota, tenant namespace/ResourceQuota creation,
# Harbor chart deployment with the requested vLLM image, diagnostics fetch, and
# cleanup of the instance and test user.

import json
import os
import subprocess
import sys
import tempfile
import time
import uuid
from dataclasses import dataclass
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.parse import quote, urljoin
from urllib.request import Request, urlopen


# --- Runtime configuration; every value can be overridden via the environment ---

# API root, normalised to have no trailing slash. BASE_URL then carries exactly
# one, so relative paths can be resolved against it with urljoin().
RAW_BASE_URL = os.getenv("BASE_URL", "http://localhost:18081/api/v1").rstrip("/")
BASE_URL = f"{RAW_BASE_URL}/"

# Admin credentials fall back to the BOOTSTRAP_ADMIN_* variables; the password
# has no built-in default.
ADMIN_USER = os.getenv("ADMIN_USER", os.getenv("BOOTSTRAP_ADMIN_USER", "admin"))
ADMIN_PASS = os.getenv("ADMIN_PASS", os.getenv("BOOTSTRAP_ADMIN_PASS", ""))

# Names used to look up the target cluster and registry in the API listings.
TARGET_CLUSTER_NAME = os.getenv("TARGET_CLUSTER_NAME", "k3s")
TARGET_REGISTRY_NAME = os.getenv("TARGET_REGISTRY_NAME", "harbor-bwgdi")

# Chart coordinates inside the registry, plus the image and model to serve.
CHART_REPOSITORY = os.getenv("VLLM_CHART_REPOSITORY", "charts/vllm-serve")
CHART_TAG = os.getenv("VLLM_CHART_TAG", "0.6.0")
VLLM_IMAGE = os.getenv("VLLM_IMAGE", "harbor.bwgdi.com/library/vllm-openai:v0.17.1")
MODEL_NAME = os.getenv("VLLM_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")

# Kept as a string: it is compared against stringified API values and
# interpolated into the values YAML. Presumably megabytes, per the name.
GPU_MEM_MB = os.getenv("GPU_MEM_MB", "10000")


@dataclass
class Response:
    """Outcome of a single HTTP call, covering success and HTTP-error statuses alike."""

    # HTTP status code of the reply.
    status: int
    # Response headers copied into a plain dict.
    headers: dict[str, str]
    # Response body decoded as UTF-8 text (undecodable bytes are replaced).
    body: str
    # Body parsed as JSON, or None when the body is empty or not valid JSON.
    json: Any


def parse_json(body: str) -> Any:
    """Decode *body* as JSON, returning None for empty or malformed input."""
    if body:
        try:
            return json.loads(body)
        except json.JSONDecodeError:
            # Non-JSON bodies (HTML error pages, YAML, ...) are not an error here.
            pass
    return None


def request(method: str, path: str, token: str | None = None, payload: Any = None, timeout: int = 30) -> Response:
    """Send one HTTP request and return the reply, whatever its status code.

    Args:
        method: HTTP verb to use.
        path: Either an absolute URL (anything starting with "http") or a path
            that is resolved against BASE_URL.
        token: Optional bearer token placed in the Authorization header.
        payload: Optional JSON-serialisable body, sent as application/json.
        timeout: Socket timeout in seconds handed to urlopen().

    Returns:
        A Response. HTTP error statuses (4xx/5xx) are returned rather than
        raised so that callers can assert on them.

    Raises:
        AssertionError: If the server cannot be reached, or the connection
            fails or times out while the response is awaited or read.
    """
    url = path if path.startswith("http") else urljoin(BASE_URL, path.lstrip("/"))
    data = None
    headers = {"Accept": "application/json"}
    if payload is not None:
        data = json.dumps(payload).encode("utf-8")
        headers["Content-Type"] = "application/json"
    if token:
        headers["Authorization"] = f"Bearer {token}"
    req = Request(url, data=data, headers=headers, method=method)
    try:
        with urlopen(req, timeout=timeout) as res:
            body = res.read().decode("utf-8", errors="replace")
            return Response(res.status, dict(res.headers), body, parse_json(body))
    except HTTPError as exc:
        # HTTPError doubles as a response object: surface it as a normal Response.
        body = exc.read().decode("utf-8", errors="replace")
        return Response(exc.code, dict(exc.headers), body, parse_json(body))
    except URLError as exc:
        raise AssertionError(f"Cannot reach BASE_URL={BASE_URL}: {exc}") from exc
    except OSError as exc:
        # urllib only wraps failures from sending the request in URLError.
        # Timeouts and resets while waiting for or reading the response arrive
        # as plain OSError/TimeoutError and would otherwise escape as a raw
        # traceback instead of the script's regular failure path.
        raise AssertionError(f"{method} {url} failed: {exc}") from exc


def assert_status(resp: Response, expected: set[int], context: str) -> None:
    """Raise AssertionError unless the response status is one of *expected*.

    The message names *context*, the accepted codes in sorted order, the
    actual code and the first 800 characters of the body.
    """
    if resp.status in expected:
        return
    raise AssertionError(f"{context}: expected HTTP {sorted(expected)}, got {resp.status}. Body: {resp.body[:800]}")


def login(username: str, password: str) -> str:
    """Authenticate against /auth/login and return the access token as a string.

    Raises AssertionError when the endpoint does not answer with HTTP 200.
    """
    credentials = {"username": username, "password": password}
    resp = request("POST", "/auth/login", payload=credentials)
    assert_status(resp, {200}, f"login {username}")
    access_token = resp.json["accessToken"]
    return str(access_token)


def list_items(path: str, token: str, context: str) -> list[dict[str, Any]]:
    """GET *path* and return the dict entries of the list it contains.

    The payload may be a bare JSON list, or an object wrapping the list under
    one of the keys "items", "clusters", "registries" or "instances" (checked
    in that order). Non-dict entries are dropped.

    Raises:
        AssertionError: On a non-200 status or when no list can be found.
    """
    resp = request("GET", path, token)
    assert_status(resp, {200}, context)

    parsed = resp.json
    entries = None
    if isinstance(parsed, list):
        entries = parsed
    elif isinstance(parsed, dict):
        for key in ("items", "clusters", "registries", "instances"):
            candidate = parsed.get(key)
            if isinstance(candidate, list):
                entries = candidate
                break

    if entries is None:
        raise AssertionError(f"{context}: expected list response. Body: {resp.body[:800]}")
    return [entry for entry in entries if isinstance(entry, dict)]


def find_by_name(items: list[dict[str, Any]], name: str, context: str) -> dict[str, Any]:
    """Return the first entry whose "name" equals *name*.

    Raises:
        AssertionError: If nothing matches; the message lists every name seen.
    """
    match = next((entry for entry in items if entry.get("name") == name), None)
    if match is None:
        raise AssertionError(f"{context}: could not find {name!r}. Available: {[item.get('name') for item in items]}")
    return match


def issue_kubeconfig(token: str, workspace_id: str, cluster_id: str) -> str:
    """Request a kubeconfig for a workspace on a given cluster.

    Asks for a 7200-second TTL and returns the "kubeconfig" field of the
    reply as a string. Raises AssertionError on a non-200 status.
    """
    body = {"clusterId": cluster_id, "ttlSeconds": 7200}
    resp = request("POST", f"/workspaces/{workspace_id}/kubeconfig", token, body)
    assert_status(resp, {200}, "issue tenant kubeconfig")
    kubeconfig = resp.json["kubeconfig"]
    return str(kubeconfig)


def issue_current_kubeconfig(token: str) -> str:
    """Fetch the caller's kubeconfig from /workspaces/credentials/kubeconfig.

    The raw response body is returned as-is after a cheap sanity check that
    it mentions both "server:" and "token:".

    Raises:
        AssertionError: On a non-200 status or when either marker is missing.
    """
    resp = request("GET", "/workspaces/credentials/kubeconfig", token)
    assert_status(resp, {200}, "issue current default-cluster kubeconfig")
    text = resp.body
    has_all_markers = all(marker in text for marker in ("server:", "token:"))
    if not has_all_markers:
        raise AssertionError(f"current kubeconfig response does not look like kubeconfig YAML: {resp.body[:300]}")
    return text


def kubectl_json(kubeconfig: str, args: list[str]) -> Any:
    """Run ``kubectl <args> -o json`` with a throw-away kubeconfig file.

    Args:
        kubeconfig: Kubeconfig text; written to a temporary file that is
            removed again before returning.
        args: kubectl arguments, without ``--kubeconfig`` and ``-o json``.

    Returns:
        The decoded JSON document kubectl printed on stdout.

    Raises:
        AssertionError: If kubectl cannot be started, exceeds the 60 second
            timeout, exits non-zero, or prints something that is not JSON.
    """
    handle = tempfile.NamedTemporaryFile("w", delete=False)
    described = f"kubectl {' '.join(args)}"
    try:
        # Written inside the try so the file is removed even if the write fails.
        with handle:
            handle.write(kubeconfig)
        try:
            proc = subprocess.run(
                ["kubectl", "--kubeconfig", handle.name, *args, "-o", "json"],
                text=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=60,
                check=False,
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            # Missing binary or a hung command: report it through the same
            # AssertionError channel as every other failure in this script.
            raise AssertionError(f"{described} could not be run: {exc}") from exc
        if proc.returncode != 0:
            raise AssertionError(f"kubectl {' '.join(args)} failed: {proc.stderr.strip()}")
        try:
            return json.loads(proc.stdout)
        except json.JSONDecodeError as exc:
            raise AssertionError(f"{described} returned invalid JSON: {proc.stdout[:300]}") from exc
    finally:
        os.unlink(handle.name)


def values_yaml() -> str:
    """Render the chart values YAML sent as "valuesYaml" with the deployment.

    The image, model name and GPU memory come from the module-level
    configuration; every other value is fixed.
    """
    lines = (
        "vllm:",
        f'  image: "{VLLM_IMAGE}"',
        "model:",
        f'  huggingfaceName: "{MODEL_NAME}"',
        "resources:",
        "  gpuLimit: 1",
        f"  gpuMem: {GPU_MEM_MB}",
        "  cpuRequest: 4",
        '  memoryLimit: "12Gi"',
        '  shmSize: "4Gi"',
        "replicaCount: 1",
        "workerSize: 1",
        "initContainers:",
        "  enabled: false",
    )
    # The document ends with a single trailing newline.
    return "\n".join(lines) + "\n"


def _quantity_spellings(raw: str) -> set[str]:
    """Return the spellings Kubernetes may report for a plain integer quantity.

    Kubernetes serialises quantities in canonical form: no fractional digits
    and the largest decimal suffix that loses no precision, so "10000" is
    reported as "10k". The result holds *raw* itself plus that canonical form.
    Input that is not a plain ASCII integer is accepted only verbatim.
    """
    spellings = {raw}
    if not (raw.isascii() and raw.isdigit()):
        return spellings
    suffixes = ("", "k", "M", "G", "T", "P", "E")
    mantissa = int(raw)
    exponent = 0
    while mantissa and mantissa % 1000 == 0 and exponent < len(suffixes) - 1:
        mantissa //= 1000
        exponent += 1
    spellings.add(f"{mantissa}{suffixes[exponent]}")
    return spellings


def _best_effort_delete(label: str, path: str, token: str, timeout: int = 30) -> None:
    """Issue a cleanup DELETE and report the outcome without propagating transport failures."""
    try:
        resp = request("DELETE", path, token, timeout=timeout)
    except (AssertionError, OSError) as exc:
        print(f"cleanup {label} failed: {exc}", file=sys.stderr)
    else:
        print(f"cleanup {label} http={resp.status}")


def main() -> int:
    """Run the vLLM-on-k3s deployment smoke test end to end.

    Steps: log in as admin, locate the target cluster and registry, confirm
    the chart tag is visible, create a throw-away user with a GPU memory
    quota, check the tenant ResourceQuota through kubectl, deploy the chart as
    that user, poll until it is deployed (up to 60 polls, 10 s apart), fetch
    diagnostics and verify the service and deployment exist. The instance and
    the user are deleted afterwards whether or not the run succeeded.

    Returns:
        0 when every check passed.

    Raises:
        AssertionError: On the first failed expectation.
    """
    if not ADMIN_PASS:
        raise AssertionError("ADMIN_PASS or BOOTSTRAP_ADMIN_PASS is required")
    suffix = uuid.uuid4().hex[:6]
    username = f"vllm-k3s-{suffix}"
    password = "VllmK3s123!"
    namespace = f"ocdp-u-vllm-{suffix}"
    release = f"ocdp-vllm-k3s-{suffix}"
    admin_token = login(ADMIN_USER, ADMIN_PASS)
    # Filled in as resources get created so the cleanup knows what to delete.
    user_id = ""
    instance_id = ""
    cluster_id = ""
    try:
        clusters = list_items("/clusters", admin_token, "list clusters")
        cluster = find_by_name(clusters, TARGET_CLUSTER_NAME, "select target cluster")
        cluster_id = str(cluster["id"])
        registries = list_items("/registries", admin_token, "list registries")
        registry = find_by_name(registries, TARGET_REGISTRY_NAME, "select target registry")
        registry_id = str(registry["id"])

        # The repository contains "/", so it is fully percent-encoded to stay
        # a single path segment.
        artifacts = request(
            "GET",
            f"/registries/{registry_id}/repositories/{quote(CHART_REPOSITORY, safe='')}/artifacts?media_type=chart",
            admin_token,
        )
        assert_status(artifacts, {200}, "verify vllm chart artifacts")
        if CHART_TAG not in artifacts.body:
            raise AssertionError(f"{CHART_REPOSITORY}:{CHART_TAG} was not visible in Harbor artifacts")

        created = request(
            "POST",
            "/users",
            admin_token,
            {
                "username": username,
                "password": password,
                "role": "user",
                "namespace": namespace,
                "defaultClusterId": cluster_id,
                "quotaCpu": "6",
                "quotaMemory": "16Gi",
                "quotaGpu": "1",
                "quotaGpuMemory": GPU_MEM_MB,
                "isActive": True,
                "mustChangePassword": False,
            },
        )
        assert_status(created, {201}, "create vllm smoke user")
        user_id = str(created.json["id"])
        if str(created.json.get("quotaGpuMemory")) != GPU_MEM_MB:
            raise AssertionError(f"quotaGpuMemory should stay integer {GPU_MEM_MB}, got {created.json.get('quotaGpuMemory')}")

        user_token = login(username, password)
        workspaces = list_items("/workspaces", user_token, "user lists own workspace")
        if not workspaces or "id" not in workspaces[0]:
            raise AssertionError(f"user lists own workspace: expected a workspace with an id, got {workspaces[:1]}")
        kubeconfig = issue_current_kubeconfig(user_token)
        quota = kubectl_json(kubeconfig, ["get", "resourcequota", "tenant-quota", "-n", namespace])
        hard = quota.get("status", {}).get("hard") or quota.get("spec", {}).get("hard") or {}
        gpumem_hard = str(hard.get("requests.nvidia.com/gpumem", ""))
        # Accept the configured value or its canonical Kubernetes spelling,
        # derived from GPU_MEM_MB instead of assuming the 10000 default.
        accepted_gpumem = _quantity_spellings(GPU_MEM_MB)
        if gpumem_hard not in accepted_gpumem:
            raise AssertionError(
                f"ResourceQuota gpumem should be one of {sorted(accepted_gpumem)} "
                f"(configured value or its Kubernetes canonical form), got {gpumem_hard!r}"
            )
        print(f"quota gpumem={gpumem_hard}")

        payload = {
            "name": release,
            "namespace": namespace,
            "registryId": registry_id,
            "repository": CHART_REPOSITORY,
            "tag": CHART_TAG,
            "description": f"smoke deploy {MODEL_NAME}",
            "valuesYaml": values_yaml(),
        }
        created_instance = request("POST", f"/clusters/{cluster_id}/instances", user_token, payload, timeout=1200)
        assert_status(created_instance, {201}, "create vllm instance")
        instance_id = str(created_instance.json["id"])
        print(f"instance={instance_id} release={release} cluster={TARGET_CLUSTER_NAME} namespace={namespace}")

        current = created_instance
        for attempt in range(1, 61):
            current = request("GET", f"/clusters/{cluster_id}/instances/{instance_id}", user_token)
            assert_status(current, {200}, "poll vllm instance")
            status = str(current.json.get("status"))
            print(f"poll={attempt} status={status}")
            if status == "deployed":
                break
            if status == "failed":
                raise AssertionError(f"vLLM instance failed: {current.body[:1200]}")
            time.sleep(10)
        else:
            # Loop ran out of attempts without ever seeing "deployed".
            raise AssertionError(f"vLLM instance did not reach deployed. Last: {current.body[:1200]}")

        diagnostics = request("GET", f"/clusters/{cluster_id}/instances/{instance_id}/diagnostics?tailLines=80", user_token, timeout=60)
        assert_status(diagnostics, {200}, "fetch diagnostics")
        report = diagnostics.json if isinstance(diagnostics.json, dict) else {}
        pods = report.get("pods") or []
        services = report.get("services") or []
        logs = report.get("logs") or []
        print(f"diagnostics pods={len(pods)} services={len(services)} logs={len(logs)}")

        live_services = kubectl_json(kubeconfig, ["get", "svc", "-n", namespace])
        service_names = {item.get("metadata", {}).get("name") for item in live_services.get("items", [])}
        if f"{release}-svc" not in service_names:
            raise AssertionError(f"expected service {release}-svc in tenant namespace {namespace}, got {sorted(service_names)}")
        live_deployments = kubectl_json(kubeconfig, ["get", "deployments", "-n", namespace])
        deployment_names = {item.get("metadata", {}).get("name") for item in live_deployments.get("items", [])}
        if release not in deployment_names:
            raise AssertionError(f"expected deployment {release} in tenant namespace {namespace}, got {sorted(deployment_names)}")
        print(f"tenant namespace resources service={release}-svc deployment={release}")

        return 0
    finally:
        # Each cleanup step is independent and best-effort: a failing instance
        # delete must neither skip the user delete nor replace the error that
        # made the test fail in the first place.
        if instance_id and cluster_id:
            _best_effort_delete("instance", f"/clusters/{cluster_id}/instances/{instance_id}", admin_token, timeout=300)
        if user_id:
            _best_effort_delete("user", f"/users/{user_id}", admin_token)


if __name__ == "__main__":
    # Script entry point: exit with main()'s return value, or with 1 after
    # printing a one-line FAIL message when an expectation did not hold.
    try:
        exit_code = main()
    except AssertionError as exc:
        print(f"FAIL: {exc}", file=sys.stderr)
        exit_code = 1
    raise SystemExit(exit_code)