fix: scale replicas in response, K8s metrics client, quota precheck, auth tests
- Add GetMetrics method to MetricsClient interface and implement cluster metrics API
- Add QuotaPrecheck service for validating resource quotas before deployment
- Add auth DTO with role/permission models and auth handler tests
- Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics
- Update workspace handler with GetWorkspace endpoint and shared-user list
- Fix monitoring handler to use correct service method name
- Add tail_lines fallback in instance handler for snake_case query params
- Update nginx config for SSE log streaming support (no buffering)
- Add comprehensive test coverage: auth_service_test, auth_handler_test, auth_dto_test, metrics_client_test, quota_precheck_test
- Update error messages for quota validation and instance operations
- ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit
- InstanceCard: correctly disable scale-minus when replicas <= 0
- SidebarLayout: add hover transition for sidebar items
- Update todo.md and lessons.md with latest fixes
This commit is contained in:
@ -3,7 +3,7 @@
|
||||
* 显示单个集群的监控信息
|
||||
*/
|
||||
import React, { useState } from "react";
|
||||
import { Activity, CheckCircle, AlertTriangle, XCircle, HelpCircle, Clock, Cpu, Database, Server as ServerIcon, ChevronDown, ChevronUp, TrendingUp } from "lucide-react";
|
||||
import { Activity, CheckCircle, AlertTriangle, XCircle, HelpCircle, Clock, Cpu, Database, Server as ServerIcon, ChevronDown, ChevronUp, TrendingUp, Users } from "lucide-react";
|
||||
import { Card, Badge } from "@/shared/components";
|
||||
import type { ClusterMetrics } from "@/core/types";
|
||||
import { NodeMetricCard } from "./NodeMetricCard";
|
||||
@ -20,6 +20,9 @@ export const ClusterMonitorCard: React.FC<ClusterMonitorCardProps> = ({ cluster
|
||||
const podCount = cluster.podCount ?? 0;
|
||||
const totalGpu = cluster.totalGpu ?? 0;
|
||||
const usedGpu = cluster.usedGpu ?? 0;
|
||||
const allocatedGpu = firstNumber(cluster.gpuAllocated, cluster.allocatedGpu, cluster.gpuAllocation, usedGpu);
|
||||
const usedGpuMemory = firstDisplayValue(cluster.allocatedGpuMemoryMb, cluster.allocatedGpuMemoryMB, cluster.gpuMemoryRequestsMb, cluster.usedGpuMemory, cluster.gpuMemoryUsed, cluster.usedGpuMem);
|
||||
const totalGpuMemory = firstDisplayValue(cluster.totalGpuMemory, cluster.totalGpuMem);
|
||||
const cpuUsage = cluster.cpuUsage ?? 0;
|
||||
const memoryUsage = cluster.memoryUsage ?? 0;
|
||||
const gpuUsage = cluster.gpuUsage ?? 0;
|
||||
@ -27,7 +30,11 @@ export const ClusterMonitorCard: React.FC<ClusterMonitorCardProps> = ({ cluster
|
||||
const totalCpu = cluster.totalCpu ?? "N/A";
|
||||
const usedMemory = cluster.usedMemory ?? "N/A";
|
||||
const totalMemory = cluster.totalMemory ?? "N/A";
|
||||
const cpuRequestText = firstDisplayValue(cluster.cpuRequests, usedCpu);
|
||||
const memoryRequestText = firstDisplayValue(cluster.memoryRequests, usedMemory);
|
||||
const hasClusterTotals = Boolean(cluster.totalCpu || cluster.totalMemory || cluster.nodeCount);
|
||||
const lastCheckedText = cluster.lastCheck ? new Date(cluster.lastCheck).toLocaleString() : "N/A";
|
||||
const userResourceRows = getUserResourceRows(cluster);
|
||||
|
||||
const getStatusBadge = () => {
|
||||
switch (status) {
|
||||
@ -76,13 +83,13 @@ export const ClusterMonitorCard: React.FC<ClusterMonitorCardProps> = ({ cluster
|
||||
</div>
|
||||
|
||||
{/* Metrics Grid */}
|
||||
<div className="grid grid-cols-2 sm:grid-cols-4 gap-4 mb-3">
|
||||
<div className="grid grid-cols-2 gap-4 mb-3 md:grid-cols-3 xl:grid-cols-5">
|
||||
<div>
|
||||
<p className="text-xs text-slate-500">Uptime</p>
|
||||
<p className="text-sm text-slate-700 font-mono mt-1">{uptime}</p>
|
||||
</div>
|
||||
<div>
|
||||
<p className="text-xs text-slate-500">Nodes</p>
|
||||
<p className="text-xs text-slate-500">{hasClusterTotals ? "Nodes" : "Visible Nodes"}</p>
|
||||
<div className="flex items-center gap-1 mt-1">
|
||||
<ServerIcon className="w-3 h-3 text-blue-400" />
|
||||
<p className="text-sm text-slate-700 font-mono">{nodeCount}</p>
|
||||
@ -95,7 +102,13 @@ export const ClusterMonitorCard: React.FC<ClusterMonitorCardProps> = ({ cluster
|
||||
<div>
|
||||
<p className="text-xs text-slate-500">GPU</p>
|
||||
<p className="text-sm text-slate-700 font-mono mt-1">
|
||||
{usedGpu}/{totalGpu || "N/A"}
|
||||
{hasClusterTotals ? `${usedGpu}/${totalGpu || "N/A"}` : `${allocatedGpu} allocated`}
|
||||
</p>
|
||||
</div>
|
||||
<div>
|
||||
<p className="text-xs text-slate-500">GPU Mem</p>
|
||||
<p className="text-sm text-slate-700 font-mono mt-1">
|
||||
{usedGpuMemory || "N/A"}{totalGpuMemory ? ` / ${totalGpuMemory}` : ""}
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
@ -105,16 +118,18 @@ export const ClusterMonitorCard: React.FC<ClusterMonitorCardProps> = ({ cluster
|
||||
<div>
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
<Cpu className="w-3 h-3 text-blue-400" />
|
||||
<p className="text-xs text-slate-500">CPU (Cluster Total)</p>
|
||||
<p className="text-xs text-slate-500">{hasClusterTotals ? "CPU (Cluster Total)" : "CPU Requests"}</p>
|
||||
</div>
|
||||
<p className="text-sm text-slate-700 font-mono">{usedCpu} / {totalCpu}</p>
|
||||
<p className="text-sm text-slate-700 font-mono">
|
||||
{hasClusterTotals ? `${usedCpu} / ${totalCpu}` : cpuRequestText || "0 cores"}
|
||||
</p>
|
||||
<div className="mt-1 h-1.5 bg-slate-100 rounded-full overflow-hidden">
|
||||
<div
|
||||
className="h-full bg-blue-500 rounded-full transition-all"
|
||||
style={{ width: `${Math.min(cpuUsage, 100)}%` }}
|
||||
/>
|
||||
</div>
|
||||
<p className="text-xs text-slate-500 mt-1">{cpuUsage.toFixed(1)}%</p>
|
||||
<p className="text-xs text-slate-500 mt-1">{hasClusterTotals ? `${cpuUsage.toFixed(1)}%` : "self-scoped allocation"}</p>
|
||||
{cluster.maxNodeCpu && (
|
||||
<div className="mt-1.5 pt-1.5 border-t border-slate-200">
|
||||
<div className="flex items-center gap-1">
|
||||
@ -132,16 +147,18 @@ export const ClusterMonitorCard: React.FC<ClusterMonitorCardProps> = ({ cluster
|
||||
<div>
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
<Database className="w-3 h-3 text-green-400" />
|
||||
<p className="text-xs text-slate-500">Memory (Cluster Total)</p>
|
||||
<p className="text-xs text-slate-500">{hasClusterTotals ? "Memory (Cluster Total)" : "Memory Requests"}</p>
|
||||
</div>
|
||||
<p className="text-sm text-slate-700 font-mono">{usedMemory} / {totalMemory}</p>
|
||||
<p className="text-sm text-slate-700 font-mono">
|
||||
{hasClusterTotals ? `${usedMemory} / ${totalMemory}` : memoryRequestText || "0 B"}
|
||||
</p>
|
||||
<div className="mt-1 h-1.5 bg-slate-100 rounded-full overflow-hidden">
|
||||
<div
|
||||
className="h-full bg-green-500 rounded-full transition-all"
|
||||
style={{ width: `${Math.min(memoryUsage, 100)}%` }}
|
||||
/>
|
||||
</div>
|
||||
<p className="text-xs text-slate-500 mt-1">{memoryUsage.toFixed(1)}%</p>
|
||||
<p className="text-xs text-slate-500 mt-1">{hasClusterTotals ? `${memoryUsage.toFixed(1)}%` : "self-scoped allocation"}</p>
|
||||
{cluster.maxNodeMemory && (
|
||||
<div className="mt-1.5 pt-1.5 border-t border-slate-200">
|
||||
<div className="flex items-center gap-1">
|
||||
@ -156,20 +173,20 @@ export const ClusterMonitorCard: React.FC<ClusterMonitorCardProps> = ({ cluster
|
||||
)}
|
||||
</div>
|
||||
|
||||
{totalGpu > 0 && (
|
||||
{(totalGpu > 0 || allocatedGpu > 0) && (
|
||||
<div>
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
<Activity className="w-3 h-3 text-purple-400" />
|
||||
<p className="text-xs text-slate-500">GPU (Cluster Total)</p>
|
||||
<p className="text-xs text-slate-500">GPU Allocation</p>
|
||||
</div>
|
||||
<p className="text-sm text-slate-700 font-mono">{usedGpu} / {totalGpu}</p>
|
||||
<p className="text-sm text-slate-700 font-mono">{allocatedGpu} / {totalGpu || "N/A"}</p>
|
||||
<div className="mt-1 h-1.5 bg-slate-100 rounded-full overflow-hidden">
|
||||
<div
|
||||
className="h-full bg-purple-500 rounded-full transition-all"
|
||||
style={{ width: `${Math.min(gpuUsage, 100)}%` }}
|
||||
/>
|
||||
</div>
|
||||
<p className="text-xs text-slate-500 mt-1">{gpuUsage.toFixed(1)}%</p>
|
||||
<p className="text-xs text-slate-500 mt-1">{hasClusterTotals ? `${gpuUsage.toFixed(1)}%` : "self-scoped allocation"}</p>
|
||||
{cluster.maxNodeGpu && cluster.maxNodeGpu > 0 && (
|
||||
<div className="mt-1.5 pt-1.5 border-t border-slate-200">
|
||||
<div className="flex items-center gap-1">
|
||||
@ -184,8 +201,62 @@ export const ClusterMonitorCard: React.FC<ClusterMonitorCardProps> = ({ cluster
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{(usedGpuMemory || totalGpuMemory) && (
|
||||
<div>
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
<Database className="w-3 h-3 text-fuchsia-500" />
|
||||
<p className="text-xs text-slate-500">GPU Mem</p>
|
||||
</div>
|
||||
<p className="text-sm text-slate-700 font-mono">
|
||||
{usedGpuMemory || "0"}{totalGpuMemory ? ` / ${totalGpuMemory}` : ""}
|
||||
</p>
|
||||
<p className="text-xs text-slate-500 mt-1">requests.nvidia.com/gpumem</p>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{userResourceRows.length > 0 && (
|
||||
<div className="mt-3 overflow-hidden rounded-lg border border-slate-200">
|
||||
<div className="flex items-center gap-2 border-b border-slate-200 bg-slate-50 px-3 py-2">
|
||||
<Users className="h-4 w-4 text-slate-500" />
|
||||
<h4 className="text-sm font-semibold text-slate-900">User Resources</h4>
|
||||
</div>
|
||||
<div className="overflow-x-auto">
|
||||
<table className="min-w-[720px] w-full text-left text-xs">
|
||||
<thead className="bg-white text-slate-500">
|
||||
<tr>
|
||||
<th className="px-3 py-2 font-medium">User</th>
|
||||
<th className="px-3 py-2 font-medium">Namespace</th>
|
||||
<th className="px-3 py-2 font-medium">CPU</th>
|
||||
<th className="px-3 py-2 font-medium">Memory</th>
|
||||
<th className="px-3 py-2 font-medium">GPU</th>
|
||||
<th className="px-3 py-2 font-medium">GPU Mem</th>
|
||||
<th className="px-3 py-2 font-medium">Pods</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody className="divide-y divide-slate-100">
|
||||
{userResourceRows.map((row, index) => (
|
||||
<tr key={`${row.userId || row.username || row.userName || "user"}-${index}`} className="bg-white">
|
||||
<td className="max-w-[180px] truncate px-3 py-2 font-medium text-slate-800">
|
||||
{row.username || row.userName || shortId(row.userId) || "-"}
|
||||
</td>
|
||||
<td className="max-w-[180px] truncate px-3 py-2 font-mono text-slate-600">{row.namespace || "-"}</td>
|
||||
<td className="px-3 py-2 font-mono text-slate-700">{firstDisplayValue(row.cpuRequests, row.usedCpu, row.cpuUsed, row.cpuRequest, row.cpuLimits, row.cpuLimit) || "-"}</td>
|
||||
<td className="px-3 py-2 font-mono text-slate-700">{firstDisplayValue(row.memoryRequests, row.usedMemory, row.memoryUsed, row.memoryRequest, row.memoryLimits, row.memoryLimit) || "-"}</td>
|
||||
<td className="px-3 py-2 font-mono text-slate-700">{firstNumber(row.gpuRequests, row.gpuAllocated, row.gpuAllocation, row.usedGpu, row.gpuUsed) ?? 0}</td>
|
||||
<td className="px-3 py-2 font-mono text-slate-700">
|
||||
{firstDisplayValue(row.gpuMemoryRequestsMb, row.gpuMemoryAllocated, row.gpuMemAllocated, row.usedGpuMemory, row.gpuMemoryUsed, row.gpuMemUsed) || "0"}
|
||||
</td>
|
||||
<td className="px-3 py-2 font-mono text-slate-700">{row.podCount ?? "-"}</td>
|
||||
</tr>
|
||||
))}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
<div className="mt-3 flex items-center gap-2 text-xs text-slate-500">
|
||||
<Clock className="w-3 h-3" />
|
||||
<span>Last checked: {lastCheckedText}</span>
|
||||
@ -233,3 +304,34 @@ export const ClusterMonitorCard: React.FC<ClusterMonitorCardProps> = ({ cluster
|
||||
</Card>
|
||||
);
|
||||
};
|
||||
|
||||
/**
 * Returns the first argument that is a finite number.
 *
 * `undefined`, `null`, `NaN` and `±Infinity` are skipped. A literal `0` is a
 * valid hit and is returned as-is.
 *
 * @param values - Candidate values, checked in the order given.
 * @returns The first finite number, or `0` when no candidate qualifies.
 */
const firstNumber = (...values: Array<number | undefined | null>): number => {
  for (const value of values) {
    if (typeof value === "number" && Number.isFinite(value)) {
      return value;
    }
  }
  return 0;
};
|
||||
|
||||
/**
 * Returns the first argument that can be shown to the user, as a string.
 *
 * A candidate qualifies when it is a finite number (stringified with
 * `String`) or a string that is non-empty after trimming (returned trimmed).
 * `undefined`, `null`, `NaN`, `±Infinity` and blank strings are skipped.
 *
 * @param values - Candidate values, checked in the order given.
 * @returns The display text of the first qualifying candidate, or `""` when
 *   none qualifies, so callers can fall back with `||`.
 */
const firstDisplayValue = (...values: Array<string | number | undefined | null>): string => {
  for (const value of values) {
    if (typeof value === "number" && Number.isFinite(value)) {
      return String(value);
    }
    if (typeof value === "string") {
      const trimmed = value.trim();
      if (trimmed) {
        return trimmed;
      }
    }
  }
  return "";
};
|
||||
|
||||
/**
 * Picks the per-user resource rows from a cluster metrics payload.
 *
 * The rows may arrive under any of several field names; the fields are
 * checked in a fixed order and the first one holding a non-empty array wins.
 * An empty array no longer shadows a populated later field, and values that
 * are not arrays are ignored so callers can safely use `.length` and `.map`.
 *
 * NOTE(review): the alternative field names presumably mirror different
 * backend payload versions — confirm against the API contract.
 *
 * @param cluster - Metrics object for one cluster.
 * @returns The first non-empty rows array, or an empty array when none exists.
 */
const getUserResourceRows = (cluster: ClusterMetrics) => {
  const candidates = [
    cluster.resourceUsageByUser,
    cluster.userResources,
    cluster.userResourceUsage,
    cluster.resourcesByUser,
    cluster.userResourceRows,
  ];
  return candidates.find((rows) => Array.isArray(rows) && rows.length > 0) ?? [];
};
|
||||
|
||||
/**
 * Abbreviates a long identifier for display.
 *
 * The value is trimmed first. Identifiers of 12 characters or fewer are
 * returned unchanged; longer ones become the first 8 characters, `...`, and
 * the last 4 characters.
 *
 * @param value - Identifier to shorten; may be missing, `null` or blank.
 * @returns The shortened identifier, or `""` when there is nothing to show.
 */
const shortId = (value?: string | null): string => {
  const id = value?.trim();
  if (!id) return "";
  if (id.length <= 12) return id;
  return `${id.slice(0, 8)}...${id.slice(-4)}`;
};
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
* 监控集群状态和健康信息
|
||||
*/
|
||||
import React, { useState, useEffect } from "react";
|
||||
import { Activity, Server, RefreshCw } from "lucide-react";
|
||||
import { Activity, Database, Server, RefreshCw } from "lucide-react";
|
||||
import { PageHeader, StatsCard, Button, LoadingState, ErrorState, EmptyState } from "@/shared";
|
||||
import { useToast } from "@/shared";
|
||||
import { ClusterErrors, SuccessMessages, formatApiError } from "@/shared/utils";
|
||||
@ -107,6 +107,12 @@ const MonitoringClustersPage: React.FC = () => {
|
||||
const healthyCount = clusters.filter(c => c.status === "healthy").length;
|
||||
const warningCount = clusters.filter(c => c.status === "warning" || c.status === "unknown").length;
|
||||
const errorCount = clusters.filter(c => c.status === "error" || c.status === "unhealthy").length;
|
||||
const allocatedGpu = clusters.reduce(
|
||||
(sum, cluster) => sum + firstNumber(cluster.gpuAllocated, cluster.allocatedGpu, cluster.gpuAllocation, cluster.usedGpu),
|
||||
0
|
||||
);
|
||||
const totalGpu = clusters.reduce((sum, cluster) => sum + (cluster.totalGpu ?? 0), 0);
|
||||
const gpuMemoryText = summarizeGpuMemory(clusters);
|
||||
|
||||
return (
|
||||
<div className="space-y-6">
|
||||
@ -127,7 +133,7 @@ const MonitoringClustersPage: React.FC = () => {
|
||||
</PageHeader>
|
||||
|
||||
{/* Summary Stats */}
|
||||
<div className="grid grid-cols-1 sm:grid-cols-2 lg:grid-cols-4 gap-4">
|
||||
<div className="grid grid-cols-1 sm:grid-cols-2 xl:grid-cols-6 gap-4">
|
||||
<StatsCard
|
||||
title="Total Clusters"
|
||||
value={clusters.length}
|
||||
@ -152,6 +158,18 @@ const MonitoringClustersPage: React.FC = () => {
|
||||
icon={Activity}
|
||||
variant="red"
|
||||
/>
|
||||
<StatsCard
|
||||
title="GPU Allocation"
|
||||
value={`${allocatedGpu}/${totalGpu || "N/A"}`}
|
||||
icon={Activity}
|
||||
variant="purple"
|
||||
/>
|
||||
<StatsCard
|
||||
title="GPU Mem"
|
||||
value={gpuMemoryText}
|
||||
icon={Database}
|
||||
variant="orange"
|
||||
/>
|
||||
</div>
|
||||
|
||||
{/* Auto-refresh Info */}
|
||||
@ -173,3 +191,40 @@ const MonitoringClustersPage: React.FC = () => {
|
||||
};
|
||||
|
||||
export default MonitoringClustersPage;
|
||||
|
||||
/**
 * Returns the first argument that is a finite number.
 *
 * Used when summing per-cluster figures that may be reported under several
 * field names: `undefined`, `null`, `NaN` and `±Infinity` are skipped, while
 * a literal `0` counts as a valid value.
 *
 * @param values - Candidate values, checked in the order given.
 * @returns The first finite number, or `0` when no candidate qualifies.
 */
const firstNumber = (...values: Array<number | undefined | null>): number => {
  for (const value of values) {
    if (typeof value === "number" && Number.isFinite(value)) {
      return value;
    }
  }
  return 0;
};
|
||||
|
||||
/**
 * Builds the text shown in the "GPU Mem" summary card.
 *
 * For every cluster the allocated and total GPU memory are each read from the
 * first populated field among several alternative names. Used and total are
 * paired per cluster, so figures from different clusters are never combined
 * into one "used / total" string.
 *
 * Result by number of clusters that report any GPU memory figure:
 * - none: `"N/A"`
 * - exactly one: `"<used> / <total>"` (used defaults to `"0"` when only the
 *   total is known), or just `"<used>"` when no total is known
 * - more than one: `"<count> clusters"`
 *
 * @param clusters - Metrics for all clusters on the page.
 * @returns Human-readable GPU memory summary.
 */
const summarizeGpuMemory = (clusters: ClusterMetrics[]): string => {
  const reporting = clusters
    .map((cluster) => ({
      used: firstText(
        cluster.allocatedGpuMemoryMb,
        cluster.allocatedGpuMemoryMB,
        cluster.gpuMemoryRequestsMb,
        cluster.usedGpuMemory,
        cluster.gpuMemoryUsed,
        cluster.usedGpuMem
      ),
      total: firstText(cluster.totalGpuMemory, cluster.totalGpuMem),
    }))
    // Keep only clusters that expose at least one GPU memory figure.
    .filter(({ used, total }) => used !== "" || total !== "");

  const [only, ...others] = reporting;
  if (!only) {
    return "N/A";
  }
  if (others.length > 0) {
    // Values are free-form strings, so they cannot be summed reliably.
    return `${reporting.length} clusters`;
  }
  return only.total ? `${only.used || "0"} / ${only.total}` : only.used;
};
|
||||
|
||||
/**
 * Returns the first argument that can be shown to the user, as a string.
 *
 * A candidate qualifies when it is a finite number (stringified with
 * `String`) or a string that is non-empty after trimming (returned trimmed).
 * `undefined`, `null`, `NaN`, `±Infinity` and blank strings are skipped.
 *
 * @param values - Candidate values, checked in the order given.
 * @returns The text of the first qualifying candidate, or `""` when none
 *   qualifies.
 */
const firstText = (...values: Array<string | number | undefined | null>): string => {
  for (const value of values) {
    if (typeof value === "number" && Number.isFinite(value)) {
      return String(value);
    }
    if (typeof value === "string") {
      const trimmed = value.trim();
      if (trimmed) {
        return trimmed;
      }
    }
  }
  return "";
};
|
||||
|
||||
Reference in New Issue
Block a user