- Add GetMetrics method to MetricsClient interface and implement cluster metrics API - Add QuotaPrecheck service for validating resource quotas before deployment - Add auth DTO with role/permission models and auth handler tests - Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics - Update workspace handler with GetWorkspace endpoint and shared-user list - Fix monitoring handler to use correct service method name - Add tail_lines fallback in instance handler for snake_case query params - Update nginx config for SSE log streaming support (no buffering) - Add comprehensive test coverage: auth_service_test, auth_handler_test, auth_dto_test, metrics_client_test, quota_precheck_test - Update error messages for quota validation and instance operations - ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit - InstanceCard: correctly disable scale-minus when replicas <= 0 - SidebarLayout: add hover transition for sidebar items - Update todo.md and lessons.md with latest fixes
338 lines
16 KiB
TypeScript
338 lines
16 KiB
TypeScript
/**
 * Cluster Monitor Card Component
 * Displays monitoring information for a single cluster.
 */
|
|
import React, { useState } from "react";
|
|
import { Activity, CheckCircle, AlertTriangle, XCircle, HelpCircle, Clock, Cpu, Database, Server as ServerIcon, ChevronDown, ChevronUp, TrendingUp, Users } from "lucide-react";
|
|
import { Card, Badge } from "@/shared/components";
|
|
import type { ClusterMetrics } from "@/core/types";
|
|
import { NodeMetricCard } from "./NodeMetricCard";
|
|
|
|
/** Props accepted by {@link ClusterMonitorCard}. */
interface ClusterMonitorCardProps {
  /** Metrics snapshot of the cluster rendered by the card. */
  cluster: ClusterMetrics;
}
|
|
|
|
/**
 * Converts a usage percentage into a value that is always a valid CSS width:
 * NaN becomes 0 and everything else is clamped into the 0–100 range.
 */
const clampUsagePercent = (value: number): number =>
  Number.isNaN(value) ? 0 : Math.min(Math.max(value, 0), 100);

/**
 * Card that renders the monitoring summary of a single cluster: status,
 * headline counters, CPU / memory / GPU usage bars, an optional per-user
 * resource table and a collapsible list of node cards.
 *
 * Every metric is read defensively; missing fields fall back to `0`, `"N/A"`
 * or `"unknown"` so the card still renders for partial payloads.
 *
 * @param cluster Metrics snapshot of the cluster to display.
 */
export const ClusterMonitorCard: React.FC<ClusterMonitorCardProps> = ({ cluster }) => {
  const [showNodes, setShowNodes] = useState(false);

  const status = cluster.status ?? "unknown";
  const uptime = cluster.uptime ?? "N/A";
  const nodeCount = cluster.nodeCount ?? 0;
  const podCount = cluster.podCount ?? 0;
  const totalGpu = cluster.totalGpu ?? 0;
  const usedGpu = cluster.usedGpu ?? 0;
  // Several alternative field names are probed; the first finite number wins.
  const allocatedGpu = firstNumber(cluster.gpuAllocated, cluster.allocatedGpu, cluster.gpuAllocation, usedGpu);
  const usedGpuMemory = firstDisplayValue(
    cluster.allocatedGpuMemoryMb,
    cluster.allocatedGpuMemoryMB,
    cluster.gpuMemoryRequestsMb,
    cluster.usedGpuMemory,
    cluster.gpuMemoryUsed,
    cluster.usedGpuMem,
  );
  const totalGpuMemory = firstDisplayValue(cluster.totalGpuMemory, cluster.totalGpuMem);
  const cpuUsage = cluster.cpuUsage ?? 0;
  const memoryUsage = cluster.memoryUsage ?? 0;
  const gpuUsage = cluster.gpuUsage ?? 0;
  const usedCpu = cluster.usedCpu ?? "N/A";
  const totalCpu = cluster.totalCpu ?? "N/A";
  const usedMemory = cluster.usedMemory ?? "N/A";
  const totalMemory = cluster.totalMemory ?? "N/A";
  const cpuRequestText = firstDisplayValue(cluster.cpuRequests, usedCpu);
  const memoryRequestText = firstDisplayValue(cluster.memoryRequests, usedMemory);
  // When no cluster-wide totals are present the card switches to the
  // "requests / self-scoped allocation" wording instead of used/total figures.
  const hasClusterTotals = Boolean(cluster.totalCpu || cluster.totalMemory || cluster.nodeCount);

  // Guard against unparseable timestamps so "Invalid Date" is never shown.
  const lastCheckedDate = cluster.lastCheck ? new Date(cluster.lastCheck) : null;
  const lastCheckedText =
    lastCheckedDate && !Number.isNaN(lastCheckedDate.getTime()) ? lastCheckedDate.toLocaleString() : "N/A";

  const userResourceRows = getUserResourceRows(cluster);

  // NOTE(review): a missing status defaults to "unknown", which is grouped with
  // "warning" below; the gray fallback is only reached for unrecognized status
  // strings. Confirm this is the intended presentation.
  const getStatusBadge = () => {
    switch (status) {
      case "healthy":
        return <Badge variant="success">Healthy</Badge>;
      case "warning":
      case "unknown":
        return <Badge variant="warning">Warning</Badge>;
      case "error":
      case "unhealthy":
        return <Badge variant="danger">Error</Badge>;
      default:
        return <Badge variant="gray">Unknown</Badge>;
    }
  };

  const getStatusIcon = () => {
    switch (status) {
      case "healthy":
        return <CheckCircle className="w-5 h-5 text-green-400" />;
      case "warning":
      case "unknown":
        return <AlertTriangle className="w-5 h-5 text-yellow-400" />;
      case "error":
      case "unhealthy":
        return <XCircle className="w-5 h-5 text-red-400" />;
      default:
        return <HelpCircle className="w-5 h-5 text-slate-500" />;
    }
  };

  return (
    <Card className="p-5">
      <div className="flex items-start justify-between">
        <div className="flex items-start gap-4 flex-1">
          {/* Status Icon */}
          <div className="p-3 bg-white rounded-lg">{getStatusIcon()}</div>

          {/* Cluster Info */}
          <div className="flex-1 min-w-0">
            <div className="flex items-center gap-3 mb-2">
              <h3 className="text-lg font-semibold text-slate-900 truncate">{cluster.clusterName || "Unnamed Cluster"}</h3>
              {getStatusBadge()}
            </div>

            {/* Metrics Grid */}
            <div className="grid grid-cols-2 gap-4 mb-3 md:grid-cols-3 xl:grid-cols-5">
              <div>
                <p className="text-xs text-slate-500">Uptime</p>
                <p className="text-sm text-slate-700 font-mono mt-1">{uptime}</p>
              </div>
              <div>
                <p className="text-xs text-slate-500">{hasClusterTotals ? "Nodes" : "Visible Nodes"}</p>
                <div className="flex items-center gap-1 mt-1">
                  <ServerIcon className="w-3 h-3 text-blue-400" />
                  <p className="text-sm text-slate-700 font-mono">{nodeCount}</p>
                </div>
              </div>
              <div>
                <p className="text-xs text-slate-500">Pods</p>
                <p className="text-sm text-slate-700 font-mono mt-1">{podCount}</p>
              </div>
              <div>
                <p className="text-xs text-slate-500">GPU</p>
                <p className="text-sm text-slate-700 font-mono mt-1">
                  {hasClusterTotals ? `${usedGpu}/${totalGpu || "N/A"}` : `${allocatedGpu} allocated`}
                </p>
              </div>
              <div>
                <p className="text-xs text-slate-500">GPU Mem</p>
                <p className="text-sm text-slate-700 font-mono mt-1">
                  {usedGpuMemory || "N/A"}{totalGpuMemory ? ` / ${totalGpuMemory}` : ""}
                </p>
              </div>
            </div>

            {/* Resource Usage */}
            <div className="grid grid-cols-1 sm:grid-cols-3 gap-3 mt-3 p-3 bg-slate-50 rounded-lg">
              <div>
                <div className="flex items-center gap-2 mb-1">
                  <Cpu className="w-3 h-3 text-blue-400" />
                  <p className="text-xs text-slate-500">{hasClusterTotals ? "CPU (Cluster Total)" : "CPU Requests"}</p>
                </div>
                <p className="text-sm text-slate-700 font-mono">
                  {hasClusterTotals ? `${usedCpu} / ${totalCpu}` : cpuRequestText || "0 cores"}
                </p>
                <div className="mt-1 h-1.5 bg-slate-100 rounded-full overflow-hidden">
                  <div
                    className="h-full bg-blue-500 rounded-full transition-all"
                    style={{ width: `${clampUsagePercent(cpuUsage)}%` }}
                  />
                </div>
                <p className="text-xs text-slate-500 mt-1">{hasClusterTotals ? `${cpuUsage.toFixed(1)}%` : "self-scoped allocation"}</p>
                {/* Ternaries (not `&&`) so a falsy number is never rendered as a stray "0". */}
                {cluster.maxNodeCpu ? (
                  <div className="mt-1.5 pt-1.5 border-t border-slate-200">
                    <div className="flex items-center gap-1">
                      <TrendingUp className="w-3 h-3 text-blue-400/60" />
                      <p className="text-xs text-slate-500">Max per node</p>
                    </div>
                    <p className="text-xs text-slate-500 font-mono">{cluster.maxNodeCpu}</p>
                    {cluster.maxNodeCpuUsage != null && cluster.maxNodeCpuUsage > 0 ? (
                      <p className="text-xs text-slate-500">Peak: {cluster.maxNodeCpuUsage.toFixed(1)}%</p>
                    ) : null}
                  </div>
                ) : null}
              </div>

              <div>
                <div className="flex items-center gap-2 mb-1">
                  <Database className="w-3 h-3 text-green-400" />
                  <p className="text-xs text-slate-500">{hasClusterTotals ? "Memory (Cluster Total)" : "Memory Requests"}</p>
                </div>
                <p className="text-sm text-slate-700 font-mono">
                  {hasClusterTotals ? `${usedMemory} / ${totalMemory}` : memoryRequestText || "0 B"}
                </p>
                <div className="mt-1 h-1.5 bg-slate-100 rounded-full overflow-hidden">
                  <div
                    className="h-full bg-green-500 rounded-full transition-all"
                    style={{ width: `${clampUsagePercent(memoryUsage)}%` }}
                  />
                </div>
                <p className="text-xs text-slate-500 mt-1">{hasClusterTotals ? `${memoryUsage.toFixed(1)}%` : "self-scoped allocation"}</p>
                {cluster.maxNodeMemory ? (
                  <div className="mt-1.5 pt-1.5 border-t border-slate-200">
                    <div className="flex items-center gap-1">
                      <TrendingUp className="w-3 h-3 text-green-400/60" />
                      <p className="text-xs text-slate-500">Max per node</p>
                    </div>
                    <p className="text-xs text-slate-500 font-mono">{cluster.maxNodeMemory}</p>
                    {cluster.maxNodeMemUsage != null && cluster.maxNodeMemUsage > 0 ? (
                      <p className="text-xs text-slate-500">Peak: {cluster.maxNodeMemUsage.toFixed(1)}%</p>
                    ) : null}
                  </div>
                ) : null}
              </div>

              {(totalGpu > 0 || allocatedGpu > 0) && (
                <div>
                  <div className="flex items-center gap-2 mb-1">
                    <Activity className="w-3 h-3 text-purple-400" />
                    <p className="text-xs text-slate-500">GPU Allocation</p>
                  </div>
                  <p className="text-sm text-slate-700 font-mono">{allocatedGpu} / {totalGpu || "N/A"}</p>
                  <div className="mt-1 h-1.5 bg-slate-100 rounded-full overflow-hidden">
                    <div
                      className="h-full bg-purple-500 rounded-full transition-all"
                      style={{ width: `${clampUsagePercent(gpuUsage)}%` }}
                    />
                  </div>
                  <p className="text-xs text-slate-500 mt-1">{hasClusterTotals ? `${gpuUsage.toFixed(1)}%` : "self-scoped allocation"}</p>
                  {cluster.maxNodeGpu != null && cluster.maxNodeGpu > 0 ? (
                    <div className="mt-1.5 pt-1.5 border-t border-slate-200">
                      <div className="flex items-center gap-1">
                        <TrendingUp className="w-3 h-3 text-purple-400/60" />
                        <p className="text-xs text-slate-500">Max per node</p>
                      </div>
                      <p className="text-xs text-slate-500 font-mono">{cluster.maxNodeGpu} GPUs</p>
                      {cluster.maxNodeGpuUsage != null && cluster.maxNodeGpuUsage > 0 ? (
                        <p className="text-xs text-slate-500">Peak: {cluster.maxNodeGpuUsage.toFixed(1)}%</p>
                      ) : null}
                    </div>
                  ) : null}
                </div>
              )}

              {(usedGpuMemory || totalGpuMemory) && (
                <div>
                  <div className="flex items-center gap-2 mb-1">
                    <Database className="w-3 h-3 text-fuchsia-500" />
                    <p className="text-xs text-slate-500">GPU Mem</p>
                  </div>
                  <p className="text-sm text-slate-700 font-mono">
                    {usedGpuMemory || "0"}{totalGpuMemory ? ` / ${totalGpuMemory}` : ""}
                  </p>
                  <p className="text-xs text-slate-500 mt-1">requests.nvidia.com/gpumem</p>
                </div>
              )}
            </div>

            {userResourceRows.length > 0 && (
              <div className="mt-3 overflow-hidden rounded-lg border border-slate-200">
                <div className="flex items-center gap-2 border-b border-slate-200 bg-slate-50 px-3 py-2">
                  <Users className="h-4 w-4 text-slate-500" />
                  <h4 className="text-sm font-semibold text-slate-900">User Resources</h4>
                </div>
                <div className="overflow-x-auto">
                  <table className="min-w-[720px] w-full text-left text-xs">
                    <thead className="bg-white text-slate-500">
                      <tr>
                        <th className="px-3 py-2 font-medium">User</th>
                        <th className="px-3 py-2 font-medium">Namespace</th>
                        <th className="px-3 py-2 font-medium">CPU</th>
                        <th className="px-3 py-2 font-medium">Memory</th>
                        <th className="px-3 py-2 font-medium">GPU</th>
                        <th className="px-3 py-2 font-medium">GPU Mem</th>
                        <th className="px-3 py-2 font-medium">Pods</th>
                      </tr>
                    </thead>
                    <tbody className="divide-y divide-slate-100">
                      {userResourceRows.map((row, index) => (
                        // The index is appended because the identifying fields may be missing or repeated.
                        <tr key={`${row.userId || row.username || row.userName || "user"}-${index}`} className="bg-white">
                          <td className="max-w-[180px] truncate px-3 py-2 font-medium text-slate-800">
                            {row.username || row.userName || shortId(row.userId) || "-"}
                          </td>
                          <td className="max-w-[180px] truncate px-3 py-2 font-mono text-slate-600">{row.namespace || "-"}</td>
                          <td className="px-3 py-2 font-mono text-slate-700">
                            {firstDisplayValue(row.cpuRequests, row.usedCpu, row.cpuUsed, row.cpuRequest, row.cpuLimits, row.cpuLimit) || "-"}
                          </td>
                          <td className="px-3 py-2 font-mono text-slate-700">
                            {firstDisplayValue(row.memoryRequests, row.usedMemory, row.memoryUsed, row.memoryRequest, row.memoryLimits, row.memoryLimit) || "-"}
                          </td>
                          <td className="px-3 py-2 font-mono text-slate-700">
                            {firstNumber(row.gpuRequests, row.gpuAllocated, row.gpuAllocation, row.usedGpu, row.gpuUsed)}
                          </td>
                          <td className="px-3 py-2 font-mono text-slate-700">
                            {firstDisplayValue(row.gpuMemoryRequestsMb, row.gpuMemoryAllocated, row.gpuMemAllocated, row.usedGpuMemory, row.gpuMemoryUsed, row.gpuMemUsed) || "0"}
                          </td>
                          <td className="px-3 py-2 font-mono text-slate-700">{row.podCount ?? "-"}</td>
                        </tr>
                      ))}
                    </tbody>
                  </table>
                </div>
              </div>
            )}

            <div className="mt-3 flex items-center gap-2 text-xs text-slate-500">
              <Clock className="w-3 h-3" />
              <span>Last checked: {lastCheckedText}</span>
            </div>
          </div>
        </div>

        {/* Actions */}
        <div className="flex gap-2">
          {cluster.nodes && cluster.nodes.length > 0 && (
            <button
              type="button"
              aria-expanded={showNodes}
              onClick={() => setShowNodes((visible) => !visible)}
              className="px-3 py-1.5 text-sm text-blue-400 hover:text-blue-300 hover:bg-blue-400/10 rounded-lg transition flex items-center gap-2"
            >
              {showNodes ? (
                <>
                  <ChevronUp className="w-4 h-4" />
                  Hide Nodes
                </>
              ) : (
                <>
                  <ChevronDown className="w-4 h-4" />
                  Show Nodes ({cluster.nodes.length})
                </>
              )}
            </button>
          )}
        </div>
      </div>

      {/* Nodes List */}
      {showNodes && cluster.nodes && cluster.nodes.length > 0 && (
        <div className="mt-4 pt-4 border-t border-slate-200">
          <h4 className="text-sm font-semibold text-slate-900 mb-3 flex items-center gap-2">
            <ServerIcon className="w-4 h-4 text-blue-400" />
            Cluster Nodes ({cluster.nodes.length})
          </h4>
          <div className="grid grid-cols-1 lg:grid-cols-2 gap-3">
            {cluster.nodes.map((node) => (
              <NodeMetricCard key={node.nodeName} node={node} />
            ))}
          </div>
        </div>
      )}
    </Card>
  );
};
|
|
|
|
/**
 * Returns the first argument that is a finite number.
 *
 * `undefined`, `null`, `NaN` and `±Infinity` are skipped. A literal `0` is a
 * valid match and is returned as-is.
 *
 * @param values Candidate values, checked in order.
 * @returns The first finite number, or `0` when none of the candidates qualifies.
 */
const firstNumber = (...values: Array<number | undefined | null>): number => {
  for (const value of values) {
    if (typeof value === "number" && Number.isFinite(value)) {
      return value;
    }
  }
  return 0;
};
|
|
|
|
/**
 * Returns the first argument that can be displayed, as a string.
 *
 * A candidate qualifies when it is a finite number (converted with `String`)
 * or a string that is non-empty after trimming (returned trimmed).
 *
 * @param values Candidate values, checked in order.
 * @returns The display text of the first qualifying candidate, or `""` when
 *   there is none.
 */
const firstDisplayValue = (...values: Array<string | number | undefined | null>): string => {
  for (const value of values) {
    if (typeof value === "number" && Number.isFinite(value)) {
      return String(value);
    }
    if (typeof value === "string" && value.trim()) {
      return value.trim();
    }
  }
  return "";
};
|
|
|
|
/**
 * Picks the per-user resource rows from whichever of the alternative field
 * names on the metrics object holds them.
 *
 * The fields are probed in a fixed order and the first one that is a
 * non-empty array wins. An empty array must not win: `[]` is truthy, so a
 * plain `||` chain would stop at an empty field and hide rows stored under a
 * later name. Non-array values are ignored.
 *
 * @param cluster Metrics object to read the rows from.
 * @returns The first non-empty row array, or an empty array when there is none.
 */
const getUserResourceRows = (cluster: ClusterMetrics) => {
  const candidates = [
    cluster.resourceUsageByUser,
    cluster.userResources,
    cluster.userResourceUsage,
    cluster.resourcesByUser,
    cluster.userResourceRows,
  ];
  return candidates.find((rows) => Array.isArray(rows) && rows.length > 0) ?? [];
};
|
|
|
|
/**
 * Shortens an identifier for display.
 *
 * The value is trimmed first. Identifiers of up to 12 characters are returned
 * unchanged; longer ones keep their first 8 and last 4 characters joined by
 * `...`.
 *
 * @param value Identifier to shorten; may be omitted.
 * @returns The shortened identifier, or `""` when the value is missing or blank.
 */
const shortId = (value?: string): string => {
  const id = value?.trim();
  if (!id) return "";
  if (id.length <= 12) return id;
  return `${id.slice(0, 8)}...${id.slice(-4)}`;
};
|