fix: scale replicas in response, K8s metrics client, quota precheck, auth tests

- Add GetMetrics method to MetricsClient interface and implement cluster metrics API
- Add QuotaPrecheck service for validating resource quotas before deployment
- Add auth DTO with role/permission models and auth handler tests
- Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics
- Update workspace handler with GetWorkspace endpoint and shared-user list
- Fix monitoring handler to use correct service method name
- Add tail_lines fallback in instance handler for snake_case query params
- Update nginx config for SSE log streaming support (no buffering)
- Add comprehensive test coverage: auth_service_test, auth_handler_test,
  auth_dto_test, metrics_client_test, quota_precheck_test
- Update error messages for quota validation and instance operations
- ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit
- InstanceCard: correctly disable scale-minus when replicas <= 0
- SidebarLayout: add hover transition for sidebar items
- Update todo.md and lessons.md with latest fixes
This commit is contained in:
Ivan087
2026-05-20 16:56:29 +08:00
parent 8f90cf0f0d
commit 33ddaf97db
59 changed files with 4805 additions and 457 deletions

View File

@ -1,7 +1,8 @@
#!/usr/bin/env python3
# Covers InstanceCard action layout: creates a harmless failed metadata instance
# with an invalid chart before Helm runs, opens the Instances page, verifies the
# Delete button remains inside the card and viewport, clicks it, and cleans up.
# Covers InstanceCard action layout. It prefers a harmless failed metadata
# instance when the API preserves one; if chart validation rejects before DB
# persistence, it falls back to mocking only the instance list/delete API so the
# visual overflow assertion remains independent from deployment behavior.
import json
import os
@ -102,6 +103,7 @@ def main() -> int:
release = f"ocdp-ui-overflow-{suffix}"
namespace = f"ocdp-ui-overflow-{suffix}"
instance_id = ""
synthetic = False
try:
create = request(
"POST",
@ -125,16 +127,44 @@ def main() -> int:
break
time.sleep(0.5)
if not instance_id:
raise AssertionError("test instance was not visible after failed chart download")
synthetic = True
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page(viewport={"width": 920, "height": 760})
page.on("dialog", lambda dialog: dialog.accept())
login_ui(page)
if synthetic:
synthetic_instance = {
"id": f"synthetic-{suffix}",
"name": release,
"namespace": namespace,
"clusterId": cluster_id,
"registryId": registry_id,
"repository": "charts/nonexistent",
"chart": "nonexistent",
"version": "0.0.0",
"status": "failed",
"ownerUsername": ADMIN_USER,
"values": {},
"createdAt": "2026-05-15T00:00:00Z",
"updatedAt": "2026-05-15T00:00:00Z",
}
def fulfill_instances(route):
if route.request.method == "GET":
route.fulfill(status=200, content_type="application/json", body=json.dumps({"instances": [synthetic_instance], "total": 1}))
return
if route.request.method == "DELETE":
route.fulfill(status=204, body="")
return
route.continue_()
page.route("**/api/v1/clusters/*/instances", fulfill_instances)
page.route("**/api/v1/clusters/*/instances/*", fulfill_instances)
page.get_by_role("button", name="Instances", exact=True).click()
page.wait_for_load_state("networkidle")
heading = page.get_by_role("heading", name=release, exact=True)
heading = page.get_by_role("heading", name=release, exact=True).first
expect(heading).to_be_visible(timeout=15000)
card = heading.locator("xpath=ancestor::div[contains(@class, 'group')][1]")
delete_button = card.get_by_role("button", name="Delete", exact=True)

View File

@ -0,0 +1,259 @@
#!/usr/bin/env python3
# Covers unresolved API regressions: compatibility tags/metrics/stats/kubeconfig
# endpoints, values/valuesYaml conflict handling, ordinary-user namespace 403,
# and quota precheck rejection before an instance is persisted.
import json
import os
import sys
import uuid
from dataclasses import dataclass
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.parse import quote, urljoin
from urllib.request import Request, urlopen
# Target deployment and fixture selection. Every value can be overridden
# through the environment; the literals below are only the fallbacks.
RAW_BASE_URL = os.getenv("BASE_URL", "http://localhost:18081/api/v1").rstrip("/")
# request() resolves relative paths against this value with urljoin(), so it
# is normalised to end in exactly one slash.
BASE_URL = f"{RAW_BASE_URL}/"
ADMIN_USER = os.getenv("ADMIN_USER", os.getenv("BOOTSTRAP_ADMIN_USER", "admin"))
ADMIN_PASS = os.getenv("ADMIN_PASS", os.getenv("BOOTSTRAP_ADMIN_PASS", ""))
TARGET_CLUSTER_NAME = os.getenv("TARGET_CLUSTER_NAME", "k3s")
TARGET_REGISTRY_NAME = os.getenv("TARGET_REGISTRY_NAME", "harbor-bwgdi")
# Chart coordinates used by the tags, conflict and forbidden-namespace checks.
NGINX_REPOSITORY = os.getenv("NGINX_CHART_REPOSITORY", "charts/nginx")
NGINX_TAG = os.getenv("NGINX_CHART_TAG", "22.1.1")
# Chart coordinates used by the quota precheck request.
VLLM_REPOSITORY = os.getenv("VLLM_CHART_REPOSITORY", "charts/vllm-serve")
VLLM_TAG = os.getenv("VLLM_CHART_TAG", "0.6.0")
# Kept as a string: it is sent as "quotaGpuMemory" and interpolated into YAML.
GPU_MEM_MB = os.getenv("GPU_MEM_MB", "10000")
@dataclass
class Response:
    """Snapshot of one HTTP exchange as returned by ``request``."""

    # HTTP status code (taken from the response or from the HTTPError).
    status: int
    # Response headers copied into a plain dict.
    headers: dict[str, str]
    # Body decoded as UTF-8 with undecodable bytes replaced.
    body: str
    # Result of parse_json(body): the decoded JSON value, or None when the
    # body is empty or not valid JSON.
    json: Any
def parse_json(body: str) -> Any:
    """Decode *body* as JSON.

    Returns the decoded value, or ``None`` when *body* is empty or is not
    valid JSON. Never raises for malformed input.
    """
    if not body:
        return None
    try:
        decoded = json.loads(body)
    except json.JSONDecodeError:
        return None
    return decoded
def request(method: str, path: str, token: str | None = None, payload: Any = None, timeout: int = 60) -> Response:
    """Send one HTTP request and return its outcome as a ``Response``.

    Args:
        method: HTTP verb.
        path: Absolute URL (anything starting with "http") or a path that is
            resolved against ``BASE_URL``.
        token: Bearer token; no Authorization header is sent when falsy.
        payload: Value serialised as the JSON request body; ``None`` sends
            no body.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        A ``Response`` for both success and HTTP error statuses.

    Raises:
        AssertionError: when the server cannot be reached (``URLError``).
    """
    header_map = {"Accept": "application/json"}
    encoded = None
    if payload is not None:
        encoded = json.dumps(payload).encode("utf-8")
        header_map["Content-Type"] = "application/json"
    if token:
        header_map["Authorization"] = f"Bearer {token}"

    if path.startswith("http"):
        url = path
    else:
        url = urljoin(BASE_URL, path.lstrip("/"))

    outgoing = Request(url, data=encoded, headers=header_map, method=method)
    try:
        with urlopen(outgoing, timeout=timeout) as res:
            status, raw_headers, raw = res.status, res.headers, res.read()
    except HTTPError as exc:
        # Error statuses are data for the caller to assert on, not failures.
        status, raw_headers, raw = exc.code, exc.headers, exc.read()
    except URLError as exc:
        raise AssertionError(f"Cannot reach {url}: {exc}") from exc

    text = raw.decode("utf-8", errors="replace")
    return Response(status, dict(raw_headers), text, parse_json(text))
def assert_status(resp: Response, expected: set[int], context: str) -> None:
    """Raise ``AssertionError`` unless ``resp.status`` is one of *expected*.

    The message names *context*, the accepted codes in sorted order, the
    actual status and the first 800 characters of the body.
    """
    if resp.status in expected:
        return
    raise AssertionError(f"{context}: expected HTTP {sorted(expected)}, got {resp.status}. Body: {resp.body[:800]}")
def login(username: str, password: str) -> str:
    """Log in through ``/auth/login`` and return the access token as a string.

    Raises:
        AssertionError: when the status is not 200 or the JSON body has no
            truthy ``accessToken``.
    """
    credentials = {"username": username, "password": password}
    resp = request("POST", "/auth/login", payload=credentials)
    assert_status(resp, {200}, f"login {username}")
    payload = resp.json
    if isinstance(payload, dict) and payload.get("accessToken"):
        return str(payload["accessToken"])
    raise AssertionError(f"login {username}: missing accessToken")
def list_items(path: str, token: str, context: str) -> list[dict[str, Any]]:
    """GET *path* and return the dict entries of the listing it contains.

    The listing may be a bare JSON array or an object that wraps the array
    under "items", "clusters", "registries" or "instances" (checked in that
    order; the first list-valued key wins). Non-dict entries are dropped.

    Raises:
        AssertionError: when the status is not 200 or no list is found.
    """
    resp = request("GET", path, token)
    assert_status(resp, {200}, context)

    listing = resp.json
    if isinstance(listing, dict):
        wrapped = (listing.get(key) for key in ("items", "clusters", "registries", "instances"))
        listing = next((value for value in wrapped if isinstance(value, list)), None)
    if isinstance(listing, list):
        return [entry for entry in listing if isinstance(entry, dict)]
    raise AssertionError(f"{context}: expected list response, got {resp.body[:800]}")
def find_by_name(items: list[dict[str, Any]], name: str, context: str) -> dict[str, Any]:
    """Return the first entry of *items* whose "name" equals *name*.

    Raises:
        AssertionError: when nothing matches; the message lists the names
            that were available.
    """
    match = next((entry for entry in items if entry.get("name") == name), None)
    if match is not None:
        return match
    available = [entry.get("name") for entry in items]
    raise AssertionError(f"{context}: could not find {name!r}. Available: {available}")
def encoded_repo(repo: str) -> str:
    """Percent-encode *repo* for use as a single URL path segment.

    ``safe=""`` makes "/" encode as well, so "charts/nginx" stays one segment.
    """
    path_segment = quote(repo, safe="")
    return path_segment
def create_test_user(admin_token: str, cluster_id: str, suffix: str, quota_gpu: str = "1") -> tuple[str, str, str]:
    """Create a throwaway ordinary user through ``POST /users``.

    Args:
        admin_token: Bearer token used to authorise the creation.
        cluster_id: Sent as the user's ``defaultClusterId``.
        suffix: Appended to the username and namespace to keep them unique.
        quota_gpu: Value sent as ``quotaGpu``.

    Returns:
        ``(user_id, username, password)`` for the new user.

    Raises:
        AssertionError: when the status is not 201 or the response body does
            not carry a usable ``id``.
    """
    username = f"api-bugs-{suffix}"
    password = "ApiBugs123!"
    namespace = f"ocdp-u-api-bugs-{suffix}"
    created = request(
        "POST",
        "/users",
        admin_token,
        {
            "username": username,
            "password": password,
            "role": "user",
            "namespace": namespace,
            "defaultClusterId": cluster_id,
            "quotaCpu": "2",
            "quotaMemory": "8Gi",
            "quotaGpu": quota_gpu,
            "quotaGpuMemory": GPU_MEM_MB,
            "isActive": True,
            "mustChangePassword": False,
        },
    )
    assert_status(created, {201}, "create API contract test user")
    # Validate the shape before indexing, as login() does. Without this a 201
    # with a non-JSON or id-less body raises TypeError/KeyError, which the
    # script's AssertionError handler does not turn into a FAIL line.
    user_id = created.json.get("id") if isinstance(created.json, dict) else None
    if user_id is None or user_id == "":
        raise AssertionError(f"create API contract test user: missing id in response: {created.body[:500]}")
    return str(user_id), username, password
def instance_names(cluster_id: str, token: str) -> set[str]:
    """Return the names of the instances listed for *cluster_id*.

    A response body that is not a JSON object yields an empty set.

    Raises:
        AssertionError: when the listing request does not return 200.
    """
    resp = request("GET", f"/clusters/{cluster_id}/instances", token)
    assert_status(resp, {200}, "list instances")
    payload = resp.json
    if not isinstance(payload, dict):
        return set()
    return {str(entry.get("name")) for entry in payload.get("instances", []) if isinstance(entry, dict)}
def main() -> int:
    """Run the API contract regression checks against a live deployment.

    Checks, in order: the registry tags, monitoring metrics and cluster stats
    endpoints; the tenant kubeconfig endpoint; rejection of a request that
    sends both ``values`` and ``valuesYaml``; a 403 for an ordinary user
    deploying into the "default" namespace; and quota precheck rejection for
    a user created with ``quotaGpu`` "0".

    Returns:
        0 when every check passes.

    Raises:
        AssertionError: on the first failed expectation, or when ADMIN_PASS
            is empty.
    """
    if not ADMIN_PASS:
        raise AssertionError("ADMIN_PASS or BOOTSTRAP_ADMIN_PASS is required")
    # Short random suffix keeps user, namespace and instance names unique per run.
    suffix = uuid.uuid4().hex[:6]
    admin_token = login(ADMIN_USER, ADMIN_PASS)
    # Stay empty until the user exists, so cleanup only deletes what was created.
    user_id = ""
    quota_user_id = ""
    try:
        # Resolve the configured cluster and registry names to their ids.
        clusters = list_items("/clusters", admin_token, "list clusters")
        cluster = find_by_name(clusters, TARGET_CLUSTER_NAME, "select target cluster")
        cluster_id = str(cluster["id"])
        registries = list_items("/registries", admin_token, "list registries")
        registry = find_by_name(registries, TARGET_REGISTRY_NAME, "select target registry")
        registry_id = str(registry["id"])
        # Compatibility endpoints: tags, metrics and stats must all answer 200.
        tags = request("GET", f"/registries/{registry_id}/repositories/{encoded_repo(NGINX_REPOSITORY)}/tags?media_type=chart", admin_token)
        assert_status(tags, {200}, "registry repository tags alias")
        # Substring check on the raw body, so the response layout does not matter.
        if NGINX_TAG not in tags.body:
            raise AssertionError(f"tags alias did not include expected {NGINX_REPOSITORY}:{NGINX_TAG}")
        metrics = request("GET", f"/monitoring/clusters/{cluster_id}/metrics", admin_token)
        assert_status(metrics, {200}, "monitoring metrics alias")
        stats = request("GET", f"/clusters/{cluster_id}/stats", admin_token)
        assert_status(stats, {200}, "cluster stats alias")
        # The remaining checks run as an ordinary (non-admin) user.
        user_id, username, password = create_test_user(admin_token, cluster_id, suffix)
        user_token = login(username, password)
        kubeconfig = request("GET", f"/clusters/{cluster_id}/kubeconfig", user_token)
        assert_status(kubeconfig, {200}, "cluster kubeconfig compatibility endpoint")
        # The kubeconfig must be token based ...
        if "apiVersion: v1" not in kubeconfig.body or "kind: Config" not in kubeconfig.body or "token:" not in kubeconfig.body:
            raise AssertionError(f"kubeconfig endpoint did not return tenant token kubeconfig: {kubeconfig.body[:500]}")
        # ... and must not expose client certificate or key material.
        forbidden_fields = ("client-key-data:", "client-certificate-data:")
        leaked = [field for field in forbidden_fields if field in kubeconfig.body]
        if leaked:
            raise AssertionError(f"kubeconfig endpoint leaked stored cert/key fields: {leaked}")
        # Sending both "values" and "valuesYaml" must be rejected with a 400
        # whose body mentions the conflict.
        conflict = request(
            "POST",
            f"/clusters/{cluster_id}/instances",
            user_token,
            {
                "name": f"values-conflict-{suffix}",
                "namespace": f"ocdp-u-api-bugs-{suffix}",
                "registryId": registry_id,
                "repository": NGINX_REPOSITORY,
                "tag": NGINX_TAG,
                "values": {"replicaCount": 1},
                "valuesYaml": "replicaCount: 2\n",
            },
        )
        assert_status(conflict, {400}, "values/valuesYaml conflict")
        if "conflict" not in conflict.body.lower():
            raise AssertionError(f"values conflict response should explain conflict, got {conflict.body[:500]}")
        # Snapshot the visible instances so the 403 below can be shown to have
        # created nothing.
        before = instance_names(cluster_id, user_token)
        forbidden_name = f"namespace-forbidden-{suffix}"
        namespace_forbidden = request(
            "POST",
            f"/clusters/{cluster_id}/instances",
            user_token,
            {
                "name": forbidden_name,
                "namespace": "default",
                "registryId": registry_id,
                "repository": NGINX_REPOSITORY,
                "tag": NGINX_TAG,
                "valuesYaml": "replicaCount: 1\n",
            },
        )
        assert_status(namespace_forbidden, {403}, "ordinary user forbidden namespace")
        after = instance_names(cluster_id, user_token)
        if forbidden_name in after or before != after:
            raise AssertionError("forbidden namespace request must not create an instance")
        # Second user with a GPU quota of "0"; the request below asks for one GPU.
        quota_user_id, quota_username, quota_password = create_test_user(admin_token, cluster_id, f"quota-{suffix}", quota_gpu="0")
        quota_token = login(quota_username, quota_password)
        quota_name = f"quota-precheck-{suffix}"
        # NOTE(review): the YAML literal below has no nested indentation in
        # this view, which may be an artifact of how the file was captured.
        # Confirm against the real file that gpuLimit/gpuMem/cpuRequest/
        # memoryLimit are nested under "resources" and "enabled" under
        # "initContainers".
        quota_resp = request(
            "POST",
            f"/clusters/{cluster_id}/instances",
            quota_token,
            {
                "name": quota_name,
                # Matches the namespace create_test_user derives from f"quota-{suffix}".
                "namespace": f"ocdp-u-api-bugs-quota-{suffix}",
                "registryId": registry_id,
                "repository": VLLM_REPOSITORY,
                "tag": VLLM_TAG,
                "valuesYaml": f"""resources:
gpuLimit: 1
gpuMem: {GPU_MEM_MB}
cpuRequest: 1
memoryLimit: "4Gi"
replicaCount: 1
workerSize: 1
initContainers:
enabled: false
""",
            },
            # Far longer than the 60 s default used by the other calls.
            timeout=600,
        )
        assert_status(quota_resp, {403, 422}, "quota precheck rejects over-quota deployment")
        if quota_name in instance_names(cluster_id, quota_token):
            raise AssertionError("quota precheck must reject before persisting an instance")
        print("PASS: unresolved API contract")
        return 0
    finally:
        # Best-effort cleanup: a failed delete is reported on stderr but does
        # not change the outcome. 404 is accepted as already gone.
        if user_id:
            cleanup = request("DELETE", f"/users/{user_id}", admin_token)
            if cleanup.status not in {204, 404}:
                print(f"WARN: cleanup user {user_id} returned HTTP {cleanup.status}: {cleanup.body[:300]}", file=sys.stderr)
        if quota_user_id:
            cleanup = request("DELETE", f"/users/{quota_user_id}", admin_token)
            if cleanup.status not in {204, 404}:
                print(f"WARN: cleanup quota user {quota_user_id} returned HTTP {cleanup.status}: {cleanup.body[:300]}", file=sys.stderr)
if __name__ == "__main__":
    # Exit with main()'s return code; a failed expectation becomes a single
    # FAIL line on stderr and exit code 1.
    try:
        exit_code = main()
    except AssertionError as failure:
        print(f"FAIL: {failure}", file=sys.stderr)
        exit_code = 1
    raise SystemExit(exit_code)

View File

@ -0,0 +1,150 @@
#!/usr/bin/env python3
# Covers unresolved security/gateway regressions: uniform login failures,
# per-IP+username login rate limiting with Retry-After, backend CORS allowlist,
# gateway /health JSON, Nginx version hiding, and security response headers.
import json
import os
import re
import sys
import uuid
from dataclasses import dataclass
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import Request, urlopen
# Targets under test; every value can be overridden through the environment.
RAW_BASE_URL = os.getenv("BASE_URL", "http://localhost:18081/api/v1").rstrip("/")
# request() resolves relative paths against this value with urljoin(), so it
# is normalised to end in exactly one slash.
BASE_URL = f"{RAW_BASE_URL}/"
# Base of the gateway that serves /health and /healthz, without a trailing slash.
GATEWAY_URL = os.getenv("GATEWAY_URL", "http://localhost:18080").rstrip("/")
ADMIN_USER = os.getenv("ADMIN_USER", os.getenv("BOOTSTRAP_ADMIN_USER", "admin"))
@dataclass
class Response:
    """Snapshot of one HTTP exchange as returned by ``request``."""

    # HTTP status code (taken from the response or from the HTTPError).
    status: int
    # Response headers copied into a plain dict; use header() for
    # case-insensitive lookup.
    headers: dict[str, str]
    # Body decoded as UTF-8 with undecodable bytes replaced.
    body: str
    # Result of parse_json(body): the decoded JSON value, or None when the
    # body is empty or not valid JSON.
    json: Any
def parse_json(body: str) -> Any:
    """Decode *body* as JSON.

    Returns the decoded value, or ``None`` when *body* is empty or is not
    valid JSON. Never raises for malformed input.
    """
    if not body:
        return None
    try:
        decoded = json.loads(body)
    except json.JSONDecodeError:
        return None
    return decoded
def request(method: str, url: str, payload: Any = None, headers: dict[str, str] | None = None) -> Response:
    """Send one HTTP request (20 s timeout) and return it as a ``Response``.

    Args:
        method: HTTP verb.
        url: Absolute URL (anything starting with "http") or a path that is
            resolved against ``BASE_URL``.
        payload: Value serialised as the JSON request body; ``None`` sends
            no body.
        headers: Extra request headers. The mapping is copied, not mutated.
            "Accept: application/json" is added unless an "Accept" key is
            already present.

    Returns:
        A ``Response`` for both success and HTTP error statuses.

    Raises:
        AssertionError: when the server cannot be reached (``URLError``).
    """
    req_headers = {**(headers or {})}
    if "Accept" not in req_headers:
        req_headers["Accept"] = "application/json"
    encoded = None
    if payload is not None:
        encoded = json.dumps(payload).encode("utf-8")
        req_headers["Content-Type"] = "application/json"

    if url.startswith("http"):
        target = url
    else:
        target = urljoin(BASE_URL, url.lstrip("/"))

    outgoing = Request(target, data=encoded, headers=req_headers, method=method)
    try:
        with urlopen(outgoing, timeout=20) as res:
            status, raw_headers, raw = res.status, res.headers, res.read()
    except HTTPError as exc:
        # Error statuses (401, 429, ...) are what this script asserts on.
        status, raw_headers, raw = exc.code, exc.headers, exc.read()
    except URLError as exc:
        raise AssertionError(f"Cannot reach {target}: {exc}") from exc

    text = raw.decode("utf-8", errors="replace")
    return Response(status, dict(raw_headers), text, parse_json(text))
def header(resp: Response, name: str) -> str:
    """Return the value of response header *name*, matched case-insensitively.

    Returns the first match in ``resp.headers`` iteration order, or "" when
    the header is absent.
    """
    wanted = name.lower()
    return next((value for key, value in resp.headers.items() if key.lower() == wanted), "")
def assert_status(resp: Response, expected: set[int], context: str) -> None:
    """Raise ``AssertionError`` unless ``resp.status`` is one of *expected*.

    The message names *context*, the accepted codes in sorted order, the
    actual status and the first 500 characters of the body.
    """
    if resp.status in expected:
        return
    raise AssertionError(f"{context}: expected HTTP {sorted(expected)}, got {resp.status}. Body: {resp.body[:500]}")
def main() -> int:
    """Run the security/gateway regression checks against a live deployment.

    Checks, in order: failed logins look the same for existing and unknown
    users; the sixth failed login for one username from one address gets 429
    with Retry-After; the CORS allow-origin header is echoed only for the
    allowed origin; the gateway ``/health`` returns ``{"status": "ok"}``
    without a versioned nginx Server header; and ``/health`` and ``/healthz``
    both carry the security headers.

    Returns:
        0 when every check passes.

    Raises:
        AssertionError: on the first failed expectation.
    """

    def failed_login(username: str, password: str, client_ip: str, origin: str | None = None) -> Response:
        # Each scenario uses its own X-Forwarded-For address, presumably so
        # that one scenario's failures do not count against another's limit.
        extra = {"X-Forwarded-For": client_ip}
        if origin is not None:
            extra = {"Origin": origin, **extra}
        return request("POST", "/auth/login", {"username": username, "password": password}, extra)

    # Uniform failures: a wrong password and an unknown user must be indistinguishable.
    fake_user = f"no-such-user-{uuid.uuid4().hex[:8]}"
    existing_failure = failed_login(ADMIN_USER, f"wrong-{uuid.uuid4().hex}", "203.0.113.10")
    missing_failure = failed_login(fake_user, "wrong-password", "203.0.113.11")
    assert_status(existing_failure, {401}, "existing-user login failure")
    assert_status(missing_failure, {401}, "missing-user login failure")
    if existing_failure.json != missing_failure.json:
        raise AssertionError(f"login failures must be uniform, got {existing_failure.body!r} vs {missing_failure.body!r}")

    # Rate limiting: six attempts are made and only the last response is inspected.
    limited_user = f"rate-limit-{uuid.uuid4().hex[:8]}"
    last_attempt = None
    for _ in range(6):
        last_attempt = failed_login(limited_user, "bad-password", "203.0.113.12")
    assert last_attempt is not None
    assert_status(last_attempt, {429}, "login rate limit")
    if not header(last_attempt, "Retry-After"):
        raise AssertionError("login rate limit response must include Retry-After")

    # CORS allowlist: the known origin is echoed back, an unknown one is not.
    allowed_origin = "http://localhost:18080"
    unknown_origin = "https://evil.example"
    allowed = failed_login(f"cors-allowed-{uuid.uuid4().hex[:8]}", "bad-password", "203.0.113.13", allowed_origin)
    assert_status(allowed, {401}, "allowed CORS login response")
    if header(allowed, "Access-Control-Allow-Origin") != allowed_origin:
        raise AssertionError(f"allowed origin was not echoed: {allowed.headers}")
    unknown = failed_login(f"cors-unknown-{uuid.uuid4().hex[:8]}", "bad-password", "203.0.113.14", unknown_origin)
    assert_status(unknown, {401}, "unknown CORS login response")
    if header(unknown, "Access-Control-Allow-Origin"):
        raise AssertionError(f"unknown origin must not be allowed: {unknown.headers}")

    # Gateway health endpoints.
    security_headers = ("X-Frame-Options", "X-Content-Type-Options", "Referrer-Policy", "Content-Security-Policy")
    health = request("GET", f"{GATEWAY_URL}/health")
    assert_status(health, {200}, "gateway /health")
    if health.json != {"status": "ok"}:
        raise AssertionError(f"/health must return JSON status ok, got {health.body[:300]!r}")
    server = header(health, "Server")
    # "nginx" alone is acceptable; "nginx/<digit>" reveals the exact version.
    if re.search(r"nginx/\d", server, re.IGNORECASE):
        raise AssertionError(f"Nginx precise version leaked in Server header: {server}")
    missing = next((name for name in security_headers if not header(health, name)), None)
    if missing is not None:
        raise AssertionError(f"missing security header {missing} on /health")

    healthz = request("GET", f"{GATEWAY_URL}/healthz")
    assert_status(healthz, {200}, "gateway /healthz")
    missing = next((name for name in security_headers if not header(healthz, name)), None)
    if missing is not None:
        raise AssertionError(f"missing security header {missing} on /healthz")

    print("PASS: unresolved security/gateway contract")
    return 0
if __name__ == "__main__":
    # Exit with main()'s return code; a failed expectation becomes a single
    # FAIL line on stderr and exit code 1.
    try:
        exit_code = main()
    except AssertionError as failure:
        print(f"FAIL: {failure}", file=sys.stderr)
        exit_code = 1
    raise SystemExit(exit_code)

View File

@ -0,0 +1,63 @@
#!/usr/bin/env python3
# Covers admin User Management layout: quota fields and action buttons remain
# visible inside the viewport on desktop and tablet widths.
import os
from playwright.sync_api import expect, sync_playwright
# Frontend under test; override with FRONTEND_URL.
FRONTEND_URL = os.environ.get("FRONTEND_URL", "http://localhost:18080")
# Admin credentials. ADMIN_USER / ADMIN_PASS win when set; otherwise the
# BOOTSTRAP_ADMIN_* variables are used.
ADMIN_USER = os.environ.get("ADMIN_USER", os.environ.get("BOOTSTRAP_ADMIN_USER", "admin"))
ADMIN_PASS = os.environ.get("ADMIN_PASS", os.environ.get("BOOTSTRAP_ADMIN_PASS", ""))
if not ADMIN_PASS:
    # Fail with one readable line instead of a bare KeyError traceback.
    # SystemExit with a string prints it to stderr and exits with status 1.
    raise SystemExit("FAIL: ADMIN_PASS or BOOTSTRAP_ADMIN_PASS is required")
def login(page):
    """Open the frontend and sign in as the admin user when a login form shows.

    Returns straight after navigation when the page has no password input.
    Otherwise fills the first non-password input with ADMIN_USER and the
    first password input with ADMIN_PASS, clicks the last button containing
    "Login", and waits for a "**/home" URL and network idle.
    """
    page.goto(FRONTEND_URL, wait_until="networkidle")
    password_inputs = page.locator("input[type='password']")
    if not password_inputs.count():
        return
    username_input = page.locator("input:not([type='password'])").first
    username_input.fill(ADMIN_USER)
    password_inputs.first.fill(ADMIN_PASS)
    login_button = page.get_by_role("button").filter(has_text="Login").last
    login_button.click()
    page.wait_for_url("**/home", timeout=15000)
    page.wait_for_load_state("networkidle")
def assert_no_action_overflow(page):
    """Assert the User Management page fits the viewport horizontally.

    Waits for the "User Management" heading, a "GPU Mem" label and a
    "Limits" button to be visible. Then checks that the document does not
    scroll sideways (2 px tolerance) and that up to eight buttons per action
    label sit within the viewport width (1 px tolerance per edge). Buttons
    without a bounding box are skipped.
    """
    expect(page.get_by_role("heading", name="User Management")).to_be_visible(timeout=15000)
    expect(page.get_by_text("GPU Mem").first).to_be_visible(timeout=15000)

    has_scroll = page.evaluate("document.documentElement.scrollWidth > document.documentElement.clientWidth + 2")
    assert not has_scroll, "User Management page has horizontal document overflow"

    limits_buttons = page.locator("button").filter(has_text="Limits")
    expect(limits_buttons.first).to_be_visible(timeout=15000)

    viewport = page.viewport_size or {"width": 0, "height": 0}
    for label in ("To User", "To Admin", "Limits", "Disable", "Enable", "Delete"):
        candidates = page.get_by_role("button", name=label, exact=True)
        sample_size = min(candidates.count(), 8)
        for position in range(sample_size):
            box = candidates.nth(position).bounding_box()
            if box:
                assert box["x"] >= -1, f"{label} button overflows left viewport edge"
                assert box["x"] + box["width"] <= viewport["width"] + 1, f"{label} button overflows right viewport edge"
# Desktop and tablet viewports to check, widest first.
viewport_sizes = (
    (1440, 900),
    (1280, 900),
    (1024, 800),
    (900, 760),
    (768, 900),
)
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=True)
    for width, height in viewport_sizes:
        # A fresh page per size, so every run starts from the frontend root.
        page = browser.new_page(viewport={"width": width, "height": height})
        login(page)
        page.get_by_role("button", name="Users", exact=True).click()
        page.wait_for_load_state("networkidle")
        # Short fixed pause after network idle before measuring.
        page.wait_for_timeout(500)
        assert_no_action_overflow(page)
        page.close()
    browser.close()
print("PASS: user management layout")