Files
ocdp-go/frontend/src/features/monitoring/clusters/components/ClusterMonitorCard.tsx
Ivan087 33ddaf97db fix: scale replicas in response, K8s metrics client, quota precheck, auth tests
- Add GetMetrics method to MetricsClient interface and implement cluster metrics API
- Add QuotaPrecheck service for validating resource quotas before deployment
- Add auth DTO with role/permission models and auth handler tests
- Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics
- Update workspace handler with GetWorkspace endpoint and shared-user list
- Fix monitoring handler to use correct service method name
- Add tail_lines fallback in instance handler for snake_case query params
- Update nginx config for SSE log streaming support (no buffering)
- Add comprehensive test coverage: auth_service_test, auth_handler_test,
  auth_dto_test, metrics_client_test, quota_precheck_test
- Update error messages for quota validation and instance operations
- ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit
- InstanceCard: correctly disable scale-minus when replicas <= 0
- SidebarLayout: add hover transition for sidebar items
- Update todo.md and lessons.md with latest fixes
2026-05-20 16:56:29 +08:00

338 lines
16 KiB
TypeScript

/**
* Cluster Monitor Card Component
* 显示单个集群的监控信息
*/
import React, { useState } from "react";
import { Activity, CheckCircle, AlertTriangle, XCircle, HelpCircle, Clock, Cpu, Database, Server as ServerIcon, ChevronDown, ChevronUp, TrendingUp, Users } from "lucide-react";
import { Card, Badge } from "@/shared/components";
import type { ClusterMetrics } from "@/core/types";
import { NodeMetricCard } from "./NodeMetricCard";
/**
 * Props accepted by the ClusterMonitorCard component.
 */
interface ClusterMonitorCardProps {
/** Metrics object for the cluster that the card renders. */
cluster: ClusterMetrics;
}
/**
 * Formats a usage percentage as a CSS width for a progress bar.
 *
 * The value is clamped to the 0–100 range; non-finite input becomes "0%".
 * An out-of-range or NaN width is rejected by the browser, which would leave
 * the bar at its default (full) width instead of empty.
 */
const toBarWidth = (percent: number): string => {
  const clamped = Number.isFinite(percent) ? Math.min(Math.max(percent, 0), 100) : 0;
  return `${clamped}%`;
};

/**
 * Card that shows the monitoring data of a single cluster: status, headline
 * counters, CPU / memory / GPU usage bars, an optional per-user resource
 * table, and a collapsible list of node cards.
 *
 * When the payload carries no cluster-wide totals (no total CPU, total memory
 * or node count) the card switches its labels to the "requests / self-scoped
 * allocation" wording instead of "used / total".
 *
 * @param cluster Metrics object for the cluster to render. Missing fields
 *   fall back to neutral defaults ("N/A", 0, "unknown").
 */
export const ClusterMonitorCard: React.FC<ClusterMonitorCardProps> = ({ cluster }) => {
  const [showNodes, setShowNodes] = useState(false);

  const status = cluster.status ?? "unknown";
  const uptime = cluster.uptime ?? "N/A";
  const nodeCount = cluster.nodeCount ?? 0;
  const podCount = cluster.podCount ?? 0;
  const totalGpu = cluster.totalGpu ?? 0;
  const usedGpu = cluster.usedGpu ?? 0;
  // The backend field name for these values varies, so several aliases are probed in order.
  const allocatedGpu = firstNumber(cluster.gpuAllocated, cluster.allocatedGpu, cluster.gpuAllocation, usedGpu);
  const usedGpuMemory = firstDisplayValue(
    cluster.allocatedGpuMemoryMb,
    cluster.allocatedGpuMemoryMB,
    cluster.gpuMemoryRequestsMb,
    cluster.usedGpuMemory,
    cluster.gpuMemoryUsed,
    cluster.usedGpuMem,
  );
  const totalGpuMemory = firstDisplayValue(cluster.totalGpuMemory, cluster.totalGpuMem);
  const cpuUsage = cluster.cpuUsage ?? 0;
  const memoryUsage = cluster.memoryUsage ?? 0;
  const gpuUsage = cluster.gpuUsage ?? 0;
  const usedCpu = cluster.usedCpu ?? "N/A";
  const totalCpu = cluster.totalCpu ?? "N/A";
  const usedMemory = cluster.usedMemory ?? "N/A";
  const totalMemory = cluster.totalMemory ?? "N/A";
  const cpuRequestText = firstDisplayValue(cluster.cpuRequests, usedCpu);
  const memoryRequestText = firstDisplayValue(cluster.memoryRequests, usedMemory);
  const hasClusterTotals = Boolean(cluster.totalCpu || cluster.totalMemory || cluster.nodeCount);

  // Normalised to plain numbers so the JSX below can use `value > 0 && …`
  // without a numeric 0 on the left of `&&` being rendered as the text "0".
  const maxNodeCpuUsage = cluster.maxNodeCpuUsage ?? 0;
  const maxNodeMemUsage = cluster.maxNodeMemUsage ?? 0;
  const maxNodeGpu = cluster.maxNodeGpu ?? 0;
  const maxNodeGpuUsage = cluster.maxNodeGpuUsage ?? 0;

  // An unparsable timestamp falls back to "N/A" rather than showing "Invalid Date".
  const lastCheckedDate = cluster.lastCheck ? new Date(cluster.lastCheck) : null;
  const lastCheckedText =
    lastCheckedDate && !Number.isNaN(lastCheckedDate.getTime()) ? lastCheckedDate.toLocaleString() : "N/A";

  const userResourceRows = getUserResourceRows(cluster);
  const nodes = cluster.nodes ?? [];
  const hasNodes = nodes.length > 0;

  // NOTE(review): "unknown" (also the fallback for a missing status) is shown
  // with the Warning presentation; the default branch is only reached for
  // unrecognised status strings — confirm this is intended.
  const getStatusBadge = () => {
    switch (status) {
      case "healthy":
        return <Badge variant="success">Healthy</Badge>;
      case "warning":
      case "unknown":
        return <Badge variant="warning">Warning</Badge>;
      case "error":
      case "unhealthy":
        return <Badge variant="danger">Error</Badge>;
      default:
        return <Badge variant="gray">Unknown</Badge>;
    }
  };

  // Mirrors getStatusBadge: same status grouping, icon instead of badge.
  const getStatusIcon = () => {
    switch (status) {
      case "healthy":
        return <CheckCircle className="w-5 h-5 text-green-400" />;
      case "warning":
      case "unknown":
        return <AlertTriangle className="w-5 h-5 text-yellow-400" />;
      case "error":
      case "unhealthy":
        return <XCircle className="w-5 h-5 text-red-400" />;
      default:
        return <HelpCircle className="w-5 h-5 text-slate-500" />;
    }
  };

  return (
    <Card className="p-5">
      <div className="flex items-start justify-between">
        <div className="flex items-start gap-4 flex-1">
          {/* Status Icon */}
          <div className="p-3 bg-white rounded-lg">
            {getStatusIcon()}
          </div>
          {/* Cluster Info */}
          <div className="flex-1 min-w-0">
            <div className="flex items-center gap-3 mb-2">
              <h3 className="text-lg font-semibold text-slate-900 truncate">{cluster.clusterName || "Unnamed Cluster"}</h3>
              {getStatusBadge()}
            </div>
            {/* Metrics Grid */}
            <div className="grid grid-cols-2 gap-4 mb-3 md:grid-cols-3 xl:grid-cols-5">
              <div>
                <p className="text-xs text-slate-500">Uptime</p>
                <p className="text-sm text-slate-700 font-mono mt-1">{uptime}</p>
              </div>
              <div>
                <p className="text-xs text-slate-500">{hasClusterTotals ? "Nodes" : "Visible Nodes"}</p>
                <div className="flex items-center gap-1 mt-1">
                  <ServerIcon className="w-3 h-3 text-blue-400" />
                  <p className="text-sm text-slate-700 font-mono">{nodeCount}</p>
                </div>
              </div>
              <div>
                <p className="text-xs text-slate-500">Pods</p>
                <p className="text-sm text-slate-700 font-mono mt-1">{podCount}</p>
              </div>
              <div>
                <p className="text-xs text-slate-500">GPU</p>
                <p className="text-sm text-slate-700 font-mono mt-1">
                  {hasClusterTotals ? `${usedGpu}/${totalGpu || "N/A"}` : `${allocatedGpu} allocated`}
                </p>
              </div>
              <div>
                <p className="text-xs text-slate-500">GPU Mem</p>
                <p className="text-sm text-slate-700 font-mono mt-1">
                  {usedGpuMemory || "N/A"}{totalGpuMemory ? ` / ${totalGpuMemory}` : ""}
                </p>
              </div>
            </div>
            {/* Resource Usage */}
            <div className="grid grid-cols-1 sm:grid-cols-3 gap-3 mt-3 p-3 bg-slate-50 rounded-lg">
              <div>
                <div className="flex items-center gap-2 mb-1">
                  <Cpu className="w-3 h-3 text-blue-400" />
                  <p className="text-xs text-slate-500">{hasClusterTotals ? "CPU (Cluster Total)" : "CPU Requests"}</p>
                </div>
                <p className="text-sm text-slate-700 font-mono">
                  {hasClusterTotals ? `${usedCpu} / ${totalCpu}` : cpuRequestText || "0 cores"}
                </p>
                <div className="mt-1 h-1.5 bg-slate-100 rounded-full overflow-hidden">
                  <div
                    className="h-full bg-blue-500 rounded-full transition-all"
                    style={{ width: toBarWidth(cpuUsage) }}
                  />
                </div>
                <p className="text-xs text-slate-500 mt-1">{hasClusterTotals ? `${cpuUsage.toFixed(1)}%` : "self-scoped allocation"}</p>
                {cluster.maxNodeCpu ? (
                  <div className="mt-1.5 pt-1.5 border-t border-slate-200">
                    <div className="flex items-center gap-1">
                      <TrendingUp className="w-3 h-3 text-blue-400/60" />
                      <p className="text-xs text-slate-500">Max per node</p>
                    </div>
                    <p className="text-xs text-slate-500 font-mono">{cluster.maxNodeCpu}</p>
                    {maxNodeCpuUsage > 0 && (
                      <p className="text-xs text-slate-500">Peak: {maxNodeCpuUsage.toFixed(1)}%</p>
                    )}
                  </div>
                ) : null}
              </div>
              <div>
                <div className="flex items-center gap-2 mb-1">
                  <Database className="w-3 h-3 text-green-400" />
                  <p className="text-xs text-slate-500">{hasClusterTotals ? "Memory (Cluster Total)" : "Memory Requests"}</p>
                </div>
                <p className="text-sm text-slate-700 font-mono">
                  {hasClusterTotals ? `${usedMemory} / ${totalMemory}` : memoryRequestText || "0 B"}
                </p>
                <div className="mt-1 h-1.5 bg-slate-100 rounded-full overflow-hidden">
                  <div
                    className="h-full bg-green-500 rounded-full transition-all"
                    style={{ width: toBarWidth(memoryUsage) }}
                  />
                </div>
                <p className="text-xs text-slate-500 mt-1">{hasClusterTotals ? `${memoryUsage.toFixed(1)}%` : "self-scoped allocation"}</p>
                {cluster.maxNodeMemory ? (
                  <div className="mt-1.5 pt-1.5 border-t border-slate-200">
                    <div className="flex items-center gap-1">
                      <TrendingUp className="w-3 h-3 text-green-400/60" />
                      <p className="text-xs text-slate-500">Max per node</p>
                    </div>
                    <p className="text-xs text-slate-500 font-mono">{cluster.maxNodeMemory}</p>
                    {maxNodeMemUsage > 0 && (
                      <p className="text-xs text-slate-500">Peak: {maxNodeMemUsage.toFixed(1)}%</p>
                    )}
                  </div>
                ) : null}
              </div>
              {(totalGpu > 0 || allocatedGpu > 0) && (
                <div>
                  <div className="flex items-center gap-2 mb-1">
                    <Activity className="w-3 h-3 text-purple-400" />
                    <p className="text-xs text-slate-500">GPU Allocation</p>
                  </div>
                  <p className="text-sm text-slate-700 font-mono">{allocatedGpu} / {totalGpu || "N/A"}</p>
                  <div className="mt-1 h-1.5 bg-slate-100 rounded-full overflow-hidden">
                    <div
                      className="h-full bg-purple-500 rounded-full transition-all"
                      style={{ width: toBarWidth(gpuUsage) }}
                    />
                  </div>
                  <p className="text-xs text-slate-500 mt-1">{hasClusterTotals ? `${gpuUsage.toFixed(1)}%` : "self-scoped allocation"}</p>
                  {maxNodeGpu > 0 && (
                    <div className="mt-1.5 pt-1.5 border-t border-slate-200">
                      <div className="flex items-center gap-1">
                        <TrendingUp className="w-3 h-3 text-purple-400/60" />
                        <p className="text-xs text-slate-500">Max per node</p>
                      </div>
                      <p className="text-xs text-slate-500 font-mono">{maxNodeGpu} GPUs</p>
                      {maxNodeGpuUsage > 0 && (
                        <p className="text-xs text-slate-500">Peak: {maxNodeGpuUsage.toFixed(1)}%</p>
                      )}
                    </div>
                  )}
                </div>
              )}
              {(usedGpuMemory || totalGpuMemory) && (
                <div>
                  <div className="flex items-center gap-2 mb-1">
                    <Database className="w-3 h-3 text-fuchsia-500" />
                    <p className="text-xs text-slate-500">GPU Mem</p>
                  </div>
                  <p className="text-sm text-slate-700 font-mono">
                    {usedGpuMemory || "0"}{totalGpuMemory ? ` / ${totalGpuMemory}` : ""}
                  </p>
                  <p className="text-xs text-slate-500 mt-1">requests.nvidia.com/gpumem</p>
                </div>
              )}
            </div>
            {userResourceRows.length > 0 && (
              <div className="mt-3 overflow-hidden rounded-lg border border-slate-200">
                <div className="flex items-center gap-2 border-b border-slate-200 bg-slate-50 px-3 py-2">
                  <Users className="h-4 w-4 text-slate-500" />
                  <h4 className="text-sm font-semibold text-slate-900">User Resources</h4>
                </div>
                <div className="overflow-x-auto">
                  <table className="min-w-[720px] w-full text-left text-xs">
                    <thead className="bg-white text-slate-500">
                      <tr>
                        <th className="px-3 py-2 font-medium">User</th>
                        <th className="px-3 py-2 font-medium">Namespace</th>
                        <th className="px-3 py-2 font-medium">CPU</th>
                        <th className="px-3 py-2 font-medium">Memory</th>
                        <th className="px-3 py-2 font-medium">GPU</th>
                        <th className="px-3 py-2 font-medium">GPU Mem</th>
                        <th className="px-3 py-2 font-medium">Pods</th>
                      </tr>
                    </thead>
                    <tbody className="divide-y divide-slate-100">
                      {userResourceRows.map((row, index) => (
                        // The index suffix keeps keys unique when several rows share one user.
                        <tr key={`${row.userId || row.username || row.userName || "user"}-${index}`} className="bg-white">
                          <td className="max-w-[180px] truncate px-3 py-2 font-medium text-slate-800">
                            {row.username || row.userName || shortId(row.userId) || "-"}
                          </td>
                          <td className="max-w-[180px] truncate px-3 py-2 font-mono text-slate-600">{row.namespace || "-"}</td>
                          <td className="px-3 py-2 font-mono text-slate-700">{firstDisplayValue(row.cpuRequests, row.usedCpu, row.cpuUsed, row.cpuRequest, row.cpuLimits, row.cpuLimit) || "-"}</td>
                          <td className="px-3 py-2 font-mono text-slate-700">{firstDisplayValue(row.memoryRequests, row.usedMemory, row.memoryUsed, row.memoryRequest, row.memoryLimits, row.memoryLimit) || "-"}</td>
                          {/* firstNumber already falls back to 0, so no extra default is needed. */}
                          <td className="px-3 py-2 font-mono text-slate-700">{firstNumber(row.gpuRequests, row.gpuAllocated, row.gpuAllocation, row.usedGpu, row.gpuUsed)}</td>
                          <td className="px-3 py-2 font-mono text-slate-700">
                            {firstDisplayValue(row.gpuMemoryRequestsMb, row.gpuMemoryAllocated, row.gpuMemAllocated, row.usedGpuMemory, row.gpuMemoryUsed, row.gpuMemUsed) || "0"}
                          </td>
                          <td className="px-3 py-2 font-mono text-slate-700">{row.podCount ?? "-"}</td>
                        </tr>
                      ))}
                    </tbody>
                  </table>
                </div>
              </div>
            )}
            <div className="mt-3 flex items-center gap-2 text-xs text-slate-500">
              <Clock className="w-3 h-3" />
              <span>Last checked: {lastCheckedText}</span>
            </div>
          </div>
        </div>
        {/* Actions */}
        <div className="flex gap-2">
          {hasNodes && (
            <button
              type="button"
              aria-expanded={showNodes}
              onClick={() => setShowNodes((visible) => !visible)}
              className="px-3 py-1.5 text-sm text-blue-400 hover:text-blue-300 hover:bg-blue-400/10 rounded-lg transition flex items-center gap-2"
            >
              {showNodes ? (
                <>
                  <ChevronUp className="w-4 h-4" />
                  Hide Nodes
                </>
              ) : (
                <>
                  <ChevronDown className="w-4 h-4" />
                  Show Nodes ({nodes.length})
                </>
              )}
            </button>
          )}
        </div>
      </div>
      {/* Nodes List */}
      {showNodes && hasNodes && (
        <div className="mt-4 pt-4 border-t border-slate-200">
          <h4 className="text-sm font-semibold text-slate-900 mb-3 flex items-center gap-2">
            <ServerIcon className="w-4 h-4 text-blue-400" />
            Cluster Nodes ({nodes.length})
          </h4>
          <div className="grid grid-cols-1 lg:grid-cols-2 gap-3">
            {nodes.map((node) => (
              <NodeMetricCard key={node.nodeName} node={node} />
            ))}
          </div>
        </div>
      )}
    </Card>
  );
};
/**
 * Picks the first argument that is a finite number.
 *
 * `undefined`, `null`, `NaN` and ±`Infinity` are skipped; a literal `0` counts
 * as a valid hit.
 *
 * @param values Candidate values, checked left to right.
 * @returns The first finite number, or `0` when none of the candidates qualifies.
 */
const firstNumber = (...values: Array<number | undefined | null>): number => {
  const match = values.find(
    (candidate): candidate is number => typeof candidate === "number" && Number.isFinite(candidate),
  );
  return match ?? 0;
};
/**
 * Picks the first argument that can be shown to the user and returns it as text.
 *
 * A finite number is stringified; a string is accepted only if it is non-empty
 * after trimming, and is returned trimmed. Everything else (`undefined`, `null`,
 * `NaN`, ±`Infinity`, blank strings) is skipped.
 *
 * @param values Candidate values, checked left to right.
 * @returns The display text of the first usable candidate, or `""` when there is none.
 */
const firstDisplayValue = (...values: Array<string | number | undefined | null>): string => {
  for (const candidate of values) {
    switch (typeof candidate) {
      case "number":
        if (Number.isFinite(candidate)) {
          return String(candidate);
        }
        break;
      case "string": {
        const trimmed = candidate.trim();
        if (trimmed) {
          return trimmed;
        }
        break;
      }
      default:
        break;
    }
  }
  return "";
};
/**
 * Returns the per-user resource rows of a cluster.
 *
 * Several property names are probed in a fixed order and the first truthy one
 * wins; an empty array is used when none of them is set. Note that an empty
 * array in an earlier property is truthy and therefore ends the search.
 *
 * @param cluster Cluster metrics object to read the rows from.
 * @returns The first truthy row list found, or `[]`.
 */
const getUserResourceRows = (cluster: ClusterMetrics) => {
  const { resourceUsageByUser, userResources, userResourceUsage, resourcesByUser, userResourceRows } = cluster;
  return resourceUsageByUser || userResources || userResourceUsage || resourcesByUser || userResourceRows || [];
};
/**
 * Shortens an identifier for display.
 *
 * The input is trimmed first. Identifiers of up to 12 characters are returned
 * unchanged; longer ones are reduced to their first 8 and last 4 characters
 * joined by "...".
 *
 * @param value Identifier to shorten; may be omitted.
 * @returns The shortened identifier, or `""` for a missing or blank input.
 */
const shortId = (value?: string): string => {
  const trimmed = (value ?? "").trim();
  return trimmed.length > 12 ? `${trimmed.slice(0, 8)}...${trimmed.slice(-4)}` : trimmed;
};