fix: scale replicas in response, K8s metrics client, quota precheck, auth tests

- Add GetMetrics method to MetricsClient interface and implement cluster metrics API
- Add QuotaPrecheck service for validating resource quotas before deployment
- Add auth DTO with role/permission models and auth handler tests
- Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics
- Update workspace handler with GetWorkspace endpoint and shared-user list
- Fix monitoring handler to use correct service method name
- Add tail_lines fallback in instance handler for snake_case query params
- Update nginx config for SSE log streaming support (no buffering)
- Add comprehensive test coverage: auth_service_test, auth_handler_test, auth_dto_test, metrics_client_test, quota_precheck_test
- Update error messages for quota validation and instance operations
- ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit
- InstanceCard: correctly disable scale-minus when replicas <= 0
- SidebarLayout: add hover transition for sidebar items
- Update todo.md and lessons.md with latest fixes
2026-05-20 16:56:29 +08:00
parent 8f90cf0f0d
commit 33ddaf97db
59 changed files with 4805 additions and 457 deletions
--- a/backend/internal/adapter/input/http/dto/monitoring_dto.go
+++ b/backend/internal/adapter/input/http/dto/monitoring_dto.go
@@ -8,29 +8,56 @@ import (

 // ClusterMetricsResponse 集群监控响应
 type ClusterMetricsResponse struct {
-	ClusterID       string                `json:"clusterId"`
-	ClusterName     string                `json:"clusterName"`
-	Status          string                `json:"status"`
-	Uptime          string                `json:"uptime"`
-	NodeCount       int                   `json:"nodeCount"`
-	PodCount        int                   `json:"podCount"`
-	LastCheck       time.Time             `json:"lastCheck"`
-	TotalCPU        string                `json:"totalCpu"`
-	TotalMemory     string                `json:"totalMemory"`
-	TotalGPU        int                   `json:"totalGpu"`
-	UsedCPU         string                `json:"usedCpu"`
-	UsedMemory      string                `json:"usedMemory"`
-	UsedGPU         int                   `json:"usedGpu"`
-	CPUUsage        float64               `json:"cpuUsage"`
-	MemoryUsage     float64               `json:"memoryUsage"`
-	GPUUsage        float64               `json:"gpuUsage"`
-	MaxNodeCPU      string                `json:"maxNodeCpu"`
-	MaxNodeMemory   string                `json:"maxNodeMemory"`
-	MaxNodeGPU      int                   `json:"maxNodeGpu"`
-	MaxNodeCPUUsage float64               `json:"maxNodeCpuUsage"`
-	MaxNodeMemUsage float64               `json:"maxNodeMemUsage"`
-	MaxNodeGPUUsage float64               `json:"maxNodeGpuUsage"`
-	Nodes           []NodeMetricsResponse `json:"nodes,omitempty"`
+	ClusterID            string                      `json:"clusterId"`
+	ClusterName          string                      `json:"clusterName"`
+	Status               string                      `json:"status"`
+	Uptime               string                      `json:"uptime"`
+	NodeCount            int                         `json:"nodeCount"`
+	PodCount             int                         `json:"podCount"`
+	LastCheck            time.Time                   `json:"lastCheck"`
+	TotalCPU             string                      `json:"totalCpu"`
+	TotalMemory          string                      `json:"totalMemory"`
+	TotalGPU             int                         `json:"totalGpu"`
+	UsedCPU              string                      `json:"usedCpu"`
+	UsedMemory           string                      `json:"usedMemory"`
+	UsedGPU              int                         `json:"usedGpu"`
+	CPUUsage             float64                     `json:"cpuUsage"`
+	MemoryUsage          float64                     `json:"memoryUsage"`
+	GPUUsage             float64                     `json:"gpuUsage"`
+	CPURequests          string                      `json:"cpuRequests,omitempty"`
+	CPULimits            string                      `json:"cpuLimits,omitempty"`
+	MemoryRequests       string                      `json:"memoryRequests,omitempty"`
+	MemoryLimits         string                      `json:"memoryLimits,omitempty"`
+	GPURequests          int64                       `json:"gpuRequests,omitempty"`
+	GPULimits            int64                       `json:"gpuLimits,omitempty"`
+	GPUMemoryRequestsMB  int64                       `json:"gpuMemoryRequestsMb,omitempty"`
+	GPUMemoryLimitsMB    int64                       `json:"gpuMemoryLimitsMb,omitempty"`
+	AllocatedGPU         int64                       `json:"allocatedGpu,omitempty"`
+	AllocatedGPUMemoryMB int64                       `json:"allocatedGpuMemoryMb,omitempty"`
+	ResourceUsageByUser  []UserResourceUsageResponse `json:"resourceUsageByUser,omitempty"`
+	MaxNodeCPU           string                      `json:"maxNodeCpu"`
+	MaxNodeMemory        string                      `json:"maxNodeMemory"`
+	MaxNodeGPU           int                         `json:"maxNodeGpu"`
+	MaxNodeCPUUsage      float64                     `json:"maxNodeCpuUsage"`
+	MaxNodeMemUsage      float64                     `json:"maxNodeMemUsage"`
+	MaxNodeGPUUsage      float64                     `json:"maxNodeGpuUsage"`
+	Nodes                []NodeMetricsResponse       `json:"nodes,omitempty"`
+}
+
+type UserResourceUsageResponse struct {
+	UserID              string `json:"userId"`
+	Username            string `json:"username"`
+	WorkspaceID         string `json:"workspaceId"`
+	InstanceCount       int    `json:"instanceCount"`
+	PodCount            int    `json:"podCount"`
+	CPURequests         string `json:"cpuRequests"`
+	CPULimits           string `json:"cpuLimits"`
+	MemoryRequests      string `json:"memoryRequests"`
+	MemoryLimits        string `json:"memoryLimits"`
+	GPURequests         int64  `json:"gpuRequests"`
+	GPULimits           int64  `json:"gpuLimits"`
+	GPUMemoryRequestsMB int64  `json:"gpuMemoryRequestsMb"`
+	GPUMemoryLimitsMB   int64  `json:"gpuMemoryLimitsMb"`
 }

 // NodeMetricsResponse 节点监控响应
@@ -72,28 +99,59 @@ type MonitoringSummaryResponse struct {
 // ToClusterMetricsResponse 转换为响应
 func ToClusterMetricsResponse(m *entity.ClusterMetrics) *ClusterMetricsResponse {
 	resp := &ClusterMetricsResponse{
-		ClusterID:       m.ClusterID,
-		ClusterName:     m.ClusterName,
-		Status:          m.Status,
-		Uptime:          m.Uptime,
-		NodeCount:       m.NodeCount,
-		PodCount:        m.PodCount,
-		LastCheck:       m.LastCheck,
-		TotalCPU:        m.TotalCPU,
-		TotalMemory:     m.TotalMemory,
-		TotalGPU:        m.TotalGPU,
-		UsedCPU:         m.UsedCPU,
-		UsedMemory:      m.UsedMemory,
-		UsedGPU:         m.UsedGPU,
-		CPUUsage:        m.CPUUsage,
-		MemoryUsage:     m.MemoryUsage,
-		GPUUsage:        m.GPUUsage,
-		MaxNodeCPU:      m.MaxNodeCPU,
-		MaxNodeMemory:   m.MaxNodeMemory,
-		MaxNodeGPU:      m.MaxNodeGPU,
-		MaxNodeCPUUsage: m.MaxNodeCPUUsage,
-		MaxNodeMemUsage: m.MaxNodeMemUsage,
-		MaxNodeGPUUsage: m.MaxNodeGPUUsage,
+		ClusterID:            m.ClusterID,
+		ClusterName:          m.ClusterName,
+		Status:               m.Status,
+		Uptime:               m.Uptime,
+		NodeCount:            m.NodeCount,
+		PodCount:             m.PodCount,
+		LastCheck:            m.LastCheck,
+		TotalCPU:             m.TotalCPU,
+		TotalMemory:          m.TotalMemory,
+		TotalGPU:             m.TotalGPU,
+		UsedCPU:              m.UsedCPU,
+		UsedMemory:           m.UsedMemory,
+		UsedGPU:              m.UsedGPU,
+		CPUUsage:             m.CPUUsage,
+		MemoryUsage:          m.MemoryUsage,
+		GPUUsage:             m.GPUUsage,
+		CPURequests:          m.CPURequests,
+		CPULimits:            m.CPULimits,
+		MemoryRequests:       m.MemoryRequests,
+		MemoryLimits:         m.MemoryLimits,
+		GPURequests:          m.GPURequests,
+		GPULimits:            m.GPULimits,
+		GPUMemoryRequestsMB:  m.GPUMemoryRequestsMB,
+		GPUMemoryLimitsMB:    m.GPUMemoryLimitsMB,
+		AllocatedGPU:         m.AllocatedGPU,
+		AllocatedGPUMemoryMB: m.AllocatedGPUMemoryMB,
+		MaxNodeCPU:           m.MaxNodeCPU,
+		MaxNodeMemory:        m.MaxNodeMemory,
+		MaxNodeGPU:           m.MaxNodeGPU,
+		MaxNodeCPUUsage:      m.MaxNodeCPUUsage,
+		MaxNodeMemUsage:      m.MaxNodeMemUsage,
+		MaxNodeGPUUsage:      m.MaxNodeGPUUsage,
+	}
+
+	if len(m.ResourceUsageByUser) > 0 {
+		resp.ResourceUsageByUser = make([]UserResourceUsageResponse, len(m.ResourceUsageByUser))
+		for i, usage := range m.ResourceUsageByUser {
+			resp.ResourceUsageByUser[i] = UserResourceUsageResponse{
+				UserID:              usage.UserID,
+				Username:            usage.Username,
+				WorkspaceID:         usage.WorkspaceID,
+				InstanceCount:       usage.InstanceCount,
+				PodCount:            usage.PodCount,
+				CPURequests:         usage.CPURequests,
+				CPULimits:           usage.CPULimits,
+				MemoryRequests:      usage.MemoryRequests,
+				MemoryLimits:        usage.MemoryLimits,
+				GPURequests:         usage.GPURequests,
+				GPULimits:           usage.GPULimits,
+				GPUMemoryRequestsMB: usage.GPUMemoryRequestsMB,
+				GPUMemoryLimitsMB:   usage.GPUMemoryLimitsMB,
+			}
+		}
 	}

 	if len(m.Nodes) > 0 {