package service import ( "context" "fmt" "sort" "github.com/ocdp/cluster-service/internal/domain/entity" "github.com/ocdp/cluster-service/internal/domain/repository" "github.com/ocdp/cluster-service/internal/pkg/authz" ) // MonitoringService 监控服务 type MonitoringService struct { clusterRepo repository.ClusterRepository metricsClient repository.MetricsClient instanceRepo repository.InstanceRepository userRepo repository.UserRepository } // NewMonitoringService 创建监控服务 func NewMonitoringService( clusterRepo repository.ClusterRepository, metricsClient repository.MetricsClient, instanceRepo repository.InstanceRepository, userRepo repository.UserRepository, ) *MonitoringService { return &MonitoringService{ clusterRepo: clusterRepo, metricsClient: metricsClient, instanceRepo: instanceRepo, userRepo: userRepo, } } // GetClusterMonitoring 获取单个集群的监控信息 func (s *MonitoringService) GetClusterMonitoring(ctx context.Context, clusterID string) (*entity.ClusterMetrics, error) { principal, err := authz.RequirePrincipal(ctx) if err != nil { return nil, entity.ErrUnauthorized } cluster, err := s.clusterRepo.GetByID(ctx, clusterID) if err != nil { return nil, entity.ErrClusterNotFound } if !authz.CanReadResource(principal, cluster.WorkspaceID, cluster.OwnerID, cluster.Visibility) { return nil, entity.ErrClusterNotFound } metrics, err := s.metricsClient.GetClusterMetrics(ctx, clusterID) if err != nil { return nil, fmt.Errorf("failed to get cluster metrics: %w", err) } s.enrichResourceUsage(ctx, principal, metrics) s.scopeTenantMetrics(principal, metrics) return metrics, nil } // ListClusterMonitoring 获取所有集群的监控信息 func (s *MonitoringService) ListClusterMonitoring(ctx context.Context) ([]*entity.ClusterMetrics, error) { principal, err := authz.RequirePrincipal(ctx) if err != nil { return nil, entity.ErrUnauthorized } // 获取所有集群 clusters, err := s.clusterRepo.List(ctx) if err != nil { return nil, fmt.Errorf("failed to list clusters: %w", err) } // 获取每个集群的监控数据 result := make([]*entity.ClusterMetrics, 0, len(clusters)) for _, cluster := range clusters { if !authz.CanReadResource(principal, cluster.WorkspaceID, cluster.OwnerID, cluster.Visibility) { continue } metrics, err := s.metricsClient.GetClusterMetrics(ctx, cluster.ID) if err != nil { // 如果某个集群获取失败,记录错误但继续 fmt.Printf("Warning: failed to get metrics for cluster %s: %v\n", cluster.ID, err) // 返回基本信息 metrics = &entity.ClusterMetrics{ ClusterID: cluster.ID, ClusterName: cluster.Name, Status: "unknown", } } s.enrichResourceUsage(ctx, principal, metrics) s.scopeTenantMetrics(principal, metrics) result = append(result, metrics) } return result, nil } func (s *MonitoringService) enrichResourceUsage(ctx context.Context, principal *authz.Principal, metrics *entity.ClusterMetrics) { if metrics == nil || s.instanceRepo == nil || s.metricsClient == nil { s.addVisibleUserRows(ctx, principal, metrics) return } instances, err := s.instanceRepo.ListByCluster(ctx, metrics.ClusterID) if err != nil { fmt.Printf("Warning: failed to list instances for cluster %s resource usage: %v\n", metrics.ClusterID, err) s.addVisibleUserRows(ctx, principal, metrics) return } allocations, err := s.metricsClient.GetPodResourceAllocations(ctx, metrics.ClusterID) if err != nil { fmt.Printf("Warning: failed to list pod resource allocations for cluster %s: %v\n", metrics.ClusterID, err) s.addVisibleUserRows(ctx, principal, metrics) return } visibleInstances := make(map[string]*entity.Instance) for _, instance := range instances { if instance == nil || !canReadMonitoringInstance(principal, instance) { continue } key := monitoringInstanceKey(instance.Namespace, instance.Name) visibleInstances[key] = instance } type usageAccumulator struct { userID string username string workspaceID string allocation entity.ResourceAllocation podCount int instances map[string]struct{} } byUser := make(map[string]*usageAccumulator) total := entity.ResourceAllocation{} for _, pod := range allocations { if pod == nil { continue } instance := visibleInstances[monitoringInstanceKey(pod.Namespace, pod.InstanceName)] if instance == nil { continue } total = addResourceAllocation(total, pod.Allocation) username := instance.OwnerUsername if username == "" { username = s.usernameForOwner(ctx, instance.OwnerID, principal) } acc := byUser[instance.OwnerID] if acc == nil { acc = &usageAccumulator{ userID: instance.OwnerID, username: username, workspaceID: instance.WorkspaceID, instances: map[string]struct{}{}, } byUser[instance.OwnerID] = acc } if acc.username == "" { acc.username = username } acc.allocation = addResourceAllocation(acc.allocation, pod.Allocation) acc.podCount++ acc.instances[instance.ID] = struct{}{} } metrics.CPURequests = formatCPUAllocation(total.CPURequestsMilli) metrics.CPULimits = formatCPUAllocation(total.CPULimitsMilli) metrics.MemoryRequests = formatMemoryAllocation(total.MemoryRequestsBytes) metrics.MemoryLimits = formatMemoryAllocation(total.MemoryLimitsBytes) metrics.GPURequests = total.GPURequests metrics.GPULimits = total.GPULimits metrics.GPUMemoryRequestsMB = total.GPUMemoryRequestsMB metrics.GPUMemoryLimitsMB = total.GPUMemoryLimitsMB metrics.AllocatedGPU = total.GPURequests metrics.AllocatedGPUMemoryMB = total.GPUMemoryRequestsMB userIDs := make([]string, 0, len(byUser)) for userID := range byUser { userIDs = append(userIDs, userID) } sort.Slice(userIDs, func(i, j int) bool { left := byUser[userIDs[i]] right := byUser[userIDs[j]] if left.username == right.username { return left.userID < right.userID } return left.username < right.username }) usage := make([]entity.UserResourceUsage, 0, len(userIDs)) for _, userID := range userIDs { acc := byUser[userID] usage = append(usage, entity.UserResourceUsage{ UserID: acc.userID, Username: acc.username, WorkspaceID: acc.workspaceID, InstanceCount: len(acc.instances), PodCount: acc.podCount, CPURequests: formatCPUAllocation(acc.allocation.CPURequestsMilli), CPULimits: formatCPUAllocation(acc.allocation.CPULimitsMilli), MemoryRequests: formatMemoryAllocation(acc.allocation.MemoryRequestsBytes), MemoryLimits: formatMemoryAllocation(acc.allocation.MemoryLimitsBytes), GPURequests: acc.allocation.GPURequests, GPULimits: acc.allocation.GPULimits, GPUMemoryRequestsMB: acc.allocation.GPUMemoryRequestsMB, GPUMemoryLimitsMB: acc.allocation.GPUMemoryLimitsMB, }) } metrics.ResourceUsageByUser = usage s.addVisibleUserRows(ctx, principal, metrics) } func (s *MonitoringService) addVisibleUserRows(ctx context.Context, principal *authz.Principal, metrics *entity.ClusterMetrics) { if principal == nil || metrics == nil { return } existing := make(map[string]struct{}, len(metrics.ResourceUsageByUser)) for _, row := range metrics.ResourceUsageByUser { if row.UserID != "" { existing[row.UserID] = struct{}{} } } appendEmpty := func(userID, username, workspaceID string) { if userID == "" { return } if _, ok := existing[userID]; ok { return } metrics.ResourceUsageByUser = append(metrics.ResourceUsageByUser, entity.UserResourceUsage{ UserID: userID, Username: username, WorkspaceID: workspaceID, InstanceCount: 0, PodCount: 0, CPURequests: "0 cores", CPULimits: "0 cores", MemoryRequests: "0 B", MemoryLimits: "0 B", }) existing[userID] = struct{}{} } if !principal.IsAdmin() { appendEmpty(principal.UserID, principal.Username, principal.WorkspaceID) return } if s.userRepo == nil { return } users, err := s.userRepo.List(ctx) if err != nil { fmt.Printf("Warning: failed to list users for monitoring rows: %v\n", err) return } for _, user := range users { if user == nil || user.Role != authz.RoleUser || !user.IsActive { continue } appendEmpty(user.ID, user.Username, user.WorkspaceID) } sort.Slice(metrics.ResourceUsageByUser, func(i, j int) bool { left := metrics.ResourceUsageByUser[i] right := metrics.ResourceUsageByUser[j] if left.Username == right.Username { return left.UserID < right.UserID } return left.Username < right.Username }) } func (s *MonitoringService) scopeTenantMetrics(principal *authz.Principal, metrics *entity.ClusterMetrics) { if principal == nil || principal.IsAdmin() || metrics == nil { return } var total entity.ResourceAllocation podCount := 0 instanceCount := 0 for _, usage := range metrics.ResourceUsageByUser { if usage.UserID != principal.UserID { continue } podCount += usage.PodCount instanceCount += usage.InstanceCount total.GPURequests += usage.GPURequests total.GPULimits += usage.GPULimits total.GPUMemoryRequestsMB += usage.GPUMemoryRequestsMB total.GPUMemoryLimitsMB += usage.GPUMemoryLimitsMB } metrics.NodeCount = 0 metrics.Nodes = nil metrics.PodCount = podCount metrics.TotalCPU = "" metrics.TotalMemory = "" metrics.TotalGPU = 0 metrics.UsedCPU = metrics.CPURequests metrics.UsedMemory = metrics.MemoryRequests metrics.UsedGPU = int(total.GPURequests) metrics.CPUUsage = 0 metrics.MemoryUsage = 0 metrics.GPUUsage = 0 metrics.MaxNodeCPU = "" metrics.MaxNodeMemory = "" metrics.MaxNodeGPU = 0 metrics.MaxNodeCPUUsage = 0 metrics.MaxNodeMemUsage = 0 metrics.MaxNodeGPUUsage = 0 metrics.ResourceUsageByUser = filterSelfUsage(principal.UserID, metrics.ResourceUsageByUser) if instanceCount == 0 { metrics.CPURequests = "" metrics.CPULimits = "" metrics.MemoryRequests = "" metrics.MemoryLimits = "" metrics.GPURequests = 0 metrics.GPULimits = 0 metrics.GPUMemoryRequestsMB = 0 metrics.GPUMemoryLimitsMB = 0 metrics.AllocatedGPU = 0 metrics.AllocatedGPUMemoryMB = 0 } } func filterSelfUsage(userID string, usage []entity.UserResourceUsage) []entity.UserResourceUsage { filtered := make([]entity.UserResourceUsage, 0, len(usage)) for _, row := range usage { if row.UserID == userID { filtered = append(filtered, row) } } return filtered } func canReadMonitoringInstance(principal *authz.Principal, instance *entity.Instance) bool { if principal == nil || instance == nil { return false } if principal.IsAdmin() { return true } return instance.WorkspaceID == principal.WorkspaceID && instance.OwnerID == principal.UserID } func (s *MonitoringService) usernameForOwner(ctx context.Context, ownerID string, principal *authz.Principal) string { if ownerID == "" { return "" } if principal != nil && ownerID == principal.UserID { return principal.Username } if s.userRepo == nil { return "" } user, err := s.userRepo.GetByID(ctx, ownerID) if err != nil || user == nil { return "" } return user.Username } func monitoringInstanceKey(namespace, name string) string { return namespace + "/" + name } func addResourceAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation { return entity.ResourceAllocation{ CPURequestsMilli: left.CPURequestsMilli + right.CPURequestsMilli, CPULimitsMilli: left.CPULimitsMilli + right.CPULimitsMilli, MemoryRequestsBytes: left.MemoryRequestsBytes + right.MemoryRequestsBytes, MemoryLimitsBytes: left.MemoryLimitsBytes + right.MemoryLimitsBytes, GPURequests: left.GPURequests + right.GPURequests, GPULimits: left.GPULimits + right.GPULimits, GPUMemoryRequestsMB: left.GPUMemoryRequestsMB + right.GPUMemoryRequestsMB, GPUMemoryLimitsMB: left.GPUMemoryLimitsMB + right.GPUMemoryLimitsMB, } } func formatCPUAllocation(milli int64) string { return fmt.Sprintf("%.2f cores", float64(milli)/1000.0) } func formatMemoryAllocation(bytes int64) string { const unit = 1024 if bytes < unit { return fmt.Sprintf("%d B", bytes) } div, exp := int64(unit), 0 for n := bytes / unit; n >= unit; n /= unit { div *= unit exp++ } return fmt.Sprintf("%.1f %ciB", float64(bytes)/float64(div), "KMGTPE"[exp]) } // GetMonitoringSummary 获取监控汇总信息 func (s *MonitoringService) GetMonitoringSummary(ctx context.Context) (*entity.MonitoringSummary, error) { // 获取所有集群监控数据 monitoringList, err := s.ListClusterMonitoring(ctx) if err != nil { return nil, fmt.Errorf("failed to list monitoring: %w", err) } // 统计汇总 summary := &entity.MonitoringSummary{ TotalClusters: len(monitoringList), } for _, m := range monitoringList { switch m.Status { case "healthy": summary.HealthyClusters++ case "warning": summary.WarningClusters++ case "error": summary.ErrorClusters++ } summary.TotalNodes += m.NodeCount summary.TotalPods += m.PodCount } return summary, nil } // GetNodeMetrics 获取集群的节点指标 func (s *MonitoringService) GetNodeMetrics(ctx context.Context, clusterID string) ([]*entity.NodeMetrics, error) { principal, err := authz.RequirePrincipal(ctx) if err != nil { return nil, entity.ErrUnauthorized } cluster, err := s.clusterRepo.GetByID(ctx, clusterID) if err != nil { return nil, entity.ErrClusterNotFound } if !authz.CanReadResource(principal, cluster.WorkspaceID, cluster.OwnerID, cluster.Visibility) { return nil, entity.ErrClusterNotFound } if !principal.IsAdmin() { return nil, entity.ErrForbidden } nodes, err := s.metricsClient.GetNodeMetrics(ctx, clusterID) if err != nil { return nil, fmt.Errorf("failed to get node metrics: %w", err) } return nodes, nil }