- Add GetMetrics method to MetricsClient interface and implement cluster metrics API - Add QuotaPrecheck service for validating resource quotas before deployment - Add auth DTO with role/permission models and auth handler tests - Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics - Update workspace handler with GetWorkspace endpoint and shared-user list - Fix monitoring handler to use correct service method name - Add tail_lines fallback in instance handler for snake_case query params - Update nginx config for SSE log streaming support (no buffering) - Add comprehensive test coverage: auth_service_test, auth_handler_test, auth_dto_test, metrics_client_test, quota_precheck_test - Update error messages for quota validation and instance operations - ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit - InstanceCard: correctly disable scale-minus when replicas <= 0 - SidebarLayout: add hover transition for sidebar items - Update todo.md and lessons.md with latest fixes
442 lines
14 KiB
Go
442 lines
14 KiB
Go
package service
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sort"
|
|
|
|
"github.com/ocdp/cluster-service/internal/domain/entity"
|
|
"github.com/ocdp/cluster-service/internal/domain/repository"
|
|
"github.com/ocdp/cluster-service/internal/pkg/authz"
|
|
)
|
|
|
|
// MonitoringService 监控服务
|
|
type MonitoringService struct {
|
|
clusterRepo repository.ClusterRepository
|
|
metricsClient repository.MetricsClient
|
|
instanceRepo repository.InstanceRepository
|
|
userRepo repository.UserRepository
|
|
}
|
|
|
|
// NewMonitoringService 创建监控服务
|
|
func NewMonitoringService(
|
|
clusterRepo repository.ClusterRepository,
|
|
metricsClient repository.MetricsClient,
|
|
instanceRepo repository.InstanceRepository,
|
|
userRepo repository.UserRepository,
|
|
) *MonitoringService {
|
|
return &MonitoringService{
|
|
clusterRepo: clusterRepo,
|
|
metricsClient: metricsClient,
|
|
instanceRepo: instanceRepo,
|
|
userRepo: userRepo,
|
|
}
|
|
}
|
|
|
|
// GetClusterMonitoring 获取单个集群的监控信息
|
|
func (s *MonitoringService) GetClusterMonitoring(ctx context.Context, clusterID string) (*entity.ClusterMetrics, error) {
|
|
principal, err := authz.RequirePrincipal(ctx)
|
|
if err != nil {
|
|
return nil, entity.ErrUnauthorized
|
|
}
|
|
cluster, err := s.clusterRepo.GetByID(ctx, clusterID)
|
|
if err != nil {
|
|
return nil, entity.ErrClusterNotFound
|
|
}
|
|
if !authz.CanReadResource(principal, cluster.WorkspaceID, cluster.OwnerID, cluster.Visibility) {
|
|
return nil, entity.ErrClusterNotFound
|
|
}
|
|
metrics, err := s.metricsClient.GetClusterMetrics(ctx, clusterID)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get cluster metrics: %w", err)
|
|
}
|
|
s.enrichResourceUsage(ctx, principal, metrics)
|
|
s.scopeTenantMetrics(principal, metrics)
|
|
return metrics, nil
|
|
}
|
|
|
|
// ListClusterMonitoring 获取所有集群的监控信息
|
|
func (s *MonitoringService) ListClusterMonitoring(ctx context.Context) ([]*entity.ClusterMetrics, error) {
|
|
principal, err := authz.RequirePrincipal(ctx)
|
|
if err != nil {
|
|
return nil, entity.ErrUnauthorized
|
|
}
|
|
// 获取所有集群
|
|
clusters, err := s.clusterRepo.List(ctx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to list clusters: %w", err)
|
|
}
|
|
|
|
// 获取每个集群的监控数据
|
|
result := make([]*entity.ClusterMetrics, 0, len(clusters))
|
|
for _, cluster := range clusters {
|
|
if !authz.CanReadResource(principal, cluster.WorkspaceID, cluster.OwnerID, cluster.Visibility) {
|
|
continue
|
|
}
|
|
metrics, err := s.metricsClient.GetClusterMetrics(ctx, cluster.ID)
|
|
if err != nil {
|
|
// 如果某个集群获取失败,记录错误但继续
|
|
fmt.Printf("Warning: failed to get metrics for cluster %s: %v\n", cluster.ID, err)
|
|
// 返回基本信息
|
|
metrics = &entity.ClusterMetrics{
|
|
ClusterID: cluster.ID,
|
|
ClusterName: cluster.Name,
|
|
Status: "unknown",
|
|
}
|
|
}
|
|
s.enrichResourceUsage(ctx, principal, metrics)
|
|
s.scopeTenantMetrics(principal, metrics)
|
|
result = append(result, metrics)
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
func (s *MonitoringService) enrichResourceUsage(ctx context.Context, principal *authz.Principal, metrics *entity.ClusterMetrics) {
|
|
if metrics == nil || s.instanceRepo == nil || s.metricsClient == nil {
|
|
s.addVisibleUserRows(ctx, principal, metrics)
|
|
return
|
|
}
|
|
instances, err := s.instanceRepo.ListByCluster(ctx, metrics.ClusterID)
|
|
if err != nil {
|
|
fmt.Printf("Warning: failed to list instances for cluster %s resource usage: %v\n", metrics.ClusterID, err)
|
|
s.addVisibleUserRows(ctx, principal, metrics)
|
|
return
|
|
}
|
|
allocations, err := s.metricsClient.GetPodResourceAllocations(ctx, metrics.ClusterID)
|
|
if err != nil {
|
|
fmt.Printf("Warning: failed to list pod resource allocations for cluster %s: %v\n", metrics.ClusterID, err)
|
|
s.addVisibleUserRows(ctx, principal, metrics)
|
|
return
|
|
}
|
|
|
|
visibleInstances := make(map[string]*entity.Instance)
|
|
for _, instance := range instances {
|
|
if instance == nil || !canReadMonitoringInstance(principal, instance) {
|
|
continue
|
|
}
|
|
key := monitoringInstanceKey(instance.Namespace, instance.Name)
|
|
visibleInstances[key] = instance
|
|
}
|
|
|
|
type usageAccumulator struct {
|
|
userID string
|
|
username string
|
|
workspaceID string
|
|
allocation entity.ResourceAllocation
|
|
podCount int
|
|
instances map[string]struct{}
|
|
}
|
|
byUser := make(map[string]*usageAccumulator)
|
|
total := entity.ResourceAllocation{}
|
|
|
|
for _, pod := range allocations {
|
|
if pod == nil {
|
|
continue
|
|
}
|
|
instance := visibleInstances[monitoringInstanceKey(pod.Namespace, pod.InstanceName)]
|
|
if instance == nil {
|
|
continue
|
|
}
|
|
total = addResourceAllocation(total, pod.Allocation)
|
|
username := instance.OwnerUsername
|
|
if username == "" {
|
|
username = s.usernameForOwner(ctx, instance.OwnerID, principal)
|
|
}
|
|
acc := byUser[instance.OwnerID]
|
|
if acc == nil {
|
|
acc = &usageAccumulator{
|
|
userID: instance.OwnerID,
|
|
username: username,
|
|
workspaceID: instance.WorkspaceID,
|
|
instances: map[string]struct{}{},
|
|
}
|
|
byUser[instance.OwnerID] = acc
|
|
}
|
|
if acc.username == "" {
|
|
acc.username = username
|
|
}
|
|
acc.allocation = addResourceAllocation(acc.allocation, pod.Allocation)
|
|
acc.podCount++
|
|
acc.instances[instance.ID] = struct{}{}
|
|
}
|
|
|
|
metrics.CPURequests = formatCPUAllocation(total.CPURequestsMilli)
|
|
metrics.CPULimits = formatCPUAllocation(total.CPULimitsMilli)
|
|
metrics.MemoryRequests = formatMemoryAllocation(total.MemoryRequestsBytes)
|
|
metrics.MemoryLimits = formatMemoryAllocation(total.MemoryLimitsBytes)
|
|
metrics.GPURequests = total.GPURequests
|
|
metrics.GPULimits = total.GPULimits
|
|
metrics.GPUMemoryRequestsMB = total.GPUMemoryRequestsMB
|
|
metrics.GPUMemoryLimitsMB = total.GPUMemoryLimitsMB
|
|
metrics.AllocatedGPU = total.GPURequests
|
|
metrics.AllocatedGPUMemoryMB = total.GPUMemoryRequestsMB
|
|
|
|
userIDs := make([]string, 0, len(byUser))
|
|
for userID := range byUser {
|
|
userIDs = append(userIDs, userID)
|
|
}
|
|
sort.Slice(userIDs, func(i, j int) bool {
|
|
left := byUser[userIDs[i]]
|
|
right := byUser[userIDs[j]]
|
|
if left.username == right.username {
|
|
return left.userID < right.userID
|
|
}
|
|
return left.username < right.username
|
|
})
|
|
|
|
usage := make([]entity.UserResourceUsage, 0, len(userIDs))
|
|
for _, userID := range userIDs {
|
|
acc := byUser[userID]
|
|
usage = append(usage, entity.UserResourceUsage{
|
|
UserID: acc.userID,
|
|
Username: acc.username,
|
|
WorkspaceID: acc.workspaceID,
|
|
InstanceCount: len(acc.instances),
|
|
PodCount: acc.podCount,
|
|
CPURequests: formatCPUAllocation(acc.allocation.CPURequestsMilli),
|
|
CPULimits: formatCPUAllocation(acc.allocation.CPULimitsMilli),
|
|
MemoryRequests: formatMemoryAllocation(acc.allocation.MemoryRequestsBytes),
|
|
MemoryLimits: formatMemoryAllocation(acc.allocation.MemoryLimitsBytes),
|
|
GPURequests: acc.allocation.GPURequests,
|
|
GPULimits: acc.allocation.GPULimits,
|
|
GPUMemoryRequestsMB: acc.allocation.GPUMemoryRequestsMB,
|
|
GPUMemoryLimitsMB: acc.allocation.GPUMemoryLimitsMB,
|
|
})
|
|
}
|
|
metrics.ResourceUsageByUser = usage
|
|
s.addVisibleUserRows(ctx, principal, metrics)
|
|
}
|
|
|
|
func (s *MonitoringService) addVisibleUserRows(ctx context.Context, principal *authz.Principal, metrics *entity.ClusterMetrics) {
|
|
if principal == nil || metrics == nil {
|
|
return
|
|
}
|
|
existing := make(map[string]struct{}, len(metrics.ResourceUsageByUser))
|
|
for _, row := range metrics.ResourceUsageByUser {
|
|
if row.UserID != "" {
|
|
existing[row.UserID] = struct{}{}
|
|
}
|
|
}
|
|
appendEmpty := func(userID, username, workspaceID string) {
|
|
if userID == "" {
|
|
return
|
|
}
|
|
if _, ok := existing[userID]; ok {
|
|
return
|
|
}
|
|
metrics.ResourceUsageByUser = append(metrics.ResourceUsageByUser, entity.UserResourceUsage{
|
|
UserID: userID,
|
|
Username: username,
|
|
WorkspaceID: workspaceID,
|
|
InstanceCount: 0,
|
|
PodCount: 0,
|
|
CPURequests: "0 cores",
|
|
CPULimits: "0 cores",
|
|
MemoryRequests: "0 B",
|
|
MemoryLimits: "0 B",
|
|
})
|
|
existing[userID] = struct{}{}
|
|
}
|
|
if !principal.IsAdmin() {
|
|
appendEmpty(principal.UserID, principal.Username, principal.WorkspaceID)
|
|
return
|
|
}
|
|
if s.userRepo == nil {
|
|
return
|
|
}
|
|
users, err := s.userRepo.List(ctx)
|
|
if err != nil {
|
|
fmt.Printf("Warning: failed to list users for monitoring rows: %v\n", err)
|
|
return
|
|
}
|
|
for _, user := range users {
|
|
if user == nil || user.Role != authz.RoleUser || !user.IsActive {
|
|
continue
|
|
}
|
|
appendEmpty(user.ID, user.Username, user.WorkspaceID)
|
|
}
|
|
sort.Slice(metrics.ResourceUsageByUser, func(i, j int) bool {
|
|
left := metrics.ResourceUsageByUser[i]
|
|
right := metrics.ResourceUsageByUser[j]
|
|
if left.Username == right.Username {
|
|
return left.UserID < right.UserID
|
|
}
|
|
return left.Username < right.Username
|
|
})
|
|
}
|
|
|
|
func (s *MonitoringService) scopeTenantMetrics(principal *authz.Principal, metrics *entity.ClusterMetrics) {
|
|
if principal == nil || principal.IsAdmin() || metrics == nil {
|
|
return
|
|
}
|
|
var total entity.ResourceAllocation
|
|
podCount := 0
|
|
instanceCount := 0
|
|
for _, usage := range metrics.ResourceUsageByUser {
|
|
if usage.UserID != principal.UserID {
|
|
continue
|
|
}
|
|
podCount += usage.PodCount
|
|
instanceCount += usage.InstanceCount
|
|
total.GPURequests += usage.GPURequests
|
|
total.GPULimits += usage.GPULimits
|
|
total.GPUMemoryRequestsMB += usage.GPUMemoryRequestsMB
|
|
total.GPUMemoryLimitsMB += usage.GPUMemoryLimitsMB
|
|
}
|
|
metrics.NodeCount = 0
|
|
metrics.Nodes = nil
|
|
metrics.PodCount = podCount
|
|
metrics.TotalCPU = ""
|
|
metrics.TotalMemory = ""
|
|
metrics.TotalGPU = 0
|
|
metrics.UsedCPU = metrics.CPURequests
|
|
metrics.UsedMemory = metrics.MemoryRequests
|
|
metrics.UsedGPU = int(total.GPURequests)
|
|
metrics.CPUUsage = 0
|
|
metrics.MemoryUsage = 0
|
|
metrics.GPUUsage = 0
|
|
metrics.MaxNodeCPU = ""
|
|
metrics.MaxNodeMemory = ""
|
|
metrics.MaxNodeGPU = 0
|
|
metrics.MaxNodeCPUUsage = 0
|
|
metrics.MaxNodeMemUsage = 0
|
|
metrics.MaxNodeGPUUsage = 0
|
|
metrics.ResourceUsageByUser = filterSelfUsage(principal.UserID, metrics.ResourceUsageByUser)
|
|
if instanceCount == 0 {
|
|
metrics.CPURequests = ""
|
|
metrics.CPULimits = ""
|
|
metrics.MemoryRequests = ""
|
|
metrics.MemoryLimits = ""
|
|
metrics.GPURequests = 0
|
|
metrics.GPULimits = 0
|
|
metrics.GPUMemoryRequestsMB = 0
|
|
metrics.GPUMemoryLimitsMB = 0
|
|
metrics.AllocatedGPU = 0
|
|
metrics.AllocatedGPUMemoryMB = 0
|
|
}
|
|
}
|
|
|
|
func filterSelfUsage(userID string, usage []entity.UserResourceUsage) []entity.UserResourceUsage {
|
|
filtered := make([]entity.UserResourceUsage, 0, len(usage))
|
|
for _, row := range usage {
|
|
if row.UserID == userID {
|
|
filtered = append(filtered, row)
|
|
}
|
|
}
|
|
return filtered
|
|
}
|
|
|
|
func canReadMonitoringInstance(principal *authz.Principal, instance *entity.Instance) bool {
|
|
if principal == nil || instance == nil {
|
|
return false
|
|
}
|
|
if principal.IsAdmin() {
|
|
return true
|
|
}
|
|
return instance.WorkspaceID == principal.WorkspaceID && instance.OwnerID == principal.UserID
|
|
}
|
|
|
|
func (s *MonitoringService) usernameForOwner(ctx context.Context, ownerID string, principal *authz.Principal) string {
|
|
if ownerID == "" {
|
|
return ""
|
|
}
|
|
if principal != nil && ownerID == principal.UserID {
|
|
return principal.Username
|
|
}
|
|
if s.userRepo == nil {
|
|
return ""
|
|
}
|
|
user, err := s.userRepo.GetByID(ctx, ownerID)
|
|
if err != nil || user == nil {
|
|
return ""
|
|
}
|
|
return user.Username
|
|
}
|
|
|
|
// monitoringInstanceKey builds the "namespace/name" lookup key used to match
// pods to instances.
func monitoringInstanceKey(namespace, name string) string {
	const separator = "/"
	return namespace + separator + name
}
|
|
|
|
func addResourceAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
|
|
return entity.ResourceAllocation{
|
|
CPURequestsMilli: left.CPURequestsMilli + right.CPURequestsMilli,
|
|
CPULimitsMilli: left.CPULimitsMilli + right.CPULimitsMilli,
|
|
MemoryRequestsBytes: left.MemoryRequestsBytes + right.MemoryRequestsBytes,
|
|
MemoryLimitsBytes: left.MemoryLimitsBytes + right.MemoryLimitsBytes,
|
|
GPURequests: left.GPURequests + right.GPURequests,
|
|
GPULimits: left.GPULimits + right.GPULimits,
|
|
GPUMemoryRequestsMB: left.GPUMemoryRequestsMB + right.GPUMemoryRequestsMB,
|
|
GPUMemoryLimitsMB: left.GPUMemoryLimitsMB + right.GPUMemoryLimitsMB,
|
|
}
|
|
}
|
|
|
|
// formatCPUAllocation renders a CPU amount given in millicores as cores with
// two decimals, e.g. 1500 -> "1.50 cores".
func formatCPUAllocation(milli int64) string {
	cores := float64(milli) / 1000.0
	return fmt.Sprintf("%.2f cores", cores)
}
|
|
|
|
// formatMemoryAllocation renders a byte count in binary units. Values below
// 1024 are printed as plain bytes ("512 B"); larger values are scaled to the
// largest fitting unit and printed with one decimal ("1.5 KiB", "2.0 GiB").
func formatMemoryAllocation(bytes int64) string {
	const unit = 1024
	if bytes < unit {
		return fmt.Sprintf("%d B", bytes)
	}

	// Grow the divisor until the scaled value drops below one unit; suffix
	// tracks which binary prefix the divisor corresponds to.
	scale, suffix := int64(unit), 0
	for bytes/scale >= unit {
		scale *= unit
		suffix++
	}

	scaled := float64(bytes) / float64(scale)
	return fmt.Sprintf("%.1f %ciB", scaled, "KMGTPE"[suffix])
}
|
|
|
|
// GetMonitoringSummary 获取监控汇总信息
|
|
func (s *MonitoringService) GetMonitoringSummary(ctx context.Context) (*entity.MonitoringSummary, error) {
|
|
// 获取所有集群监控数据
|
|
monitoringList, err := s.ListClusterMonitoring(ctx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to list monitoring: %w", err)
|
|
}
|
|
|
|
// 统计汇总
|
|
summary := &entity.MonitoringSummary{
|
|
TotalClusters: len(monitoringList),
|
|
}
|
|
|
|
for _, m := range monitoringList {
|
|
switch m.Status {
|
|
case "healthy":
|
|
summary.HealthyClusters++
|
|
case "warning":
|
|
summary.WarningClusters++
|
|
case "error":
|
|
summary.ErrorClusters++
|
|
}
|
|
summary.TotalNodes += m.NodeCount
|
|
summary.TotalPods += m.PodCount
|
|
}
|
|
|
|
return summary, nil
|
|
}
|
|
|
|
// GetNodeMetrics 获取集群的节点指标
|
|
func (s *MonitoringService) GetNodeMetrics(ctx context.Context, clusterID string) ([]*entity.NodeMetrics, error) {
|
|
principal, err := authz.RequirePrincipal(ctx)
|
|
if err != nil {
|
|
return nil, entity.ErrUnauthorized
|
|
}
|
|
cluster, err := s.clusterRepo.GetByID(ctx, clusterID)
|
|
if err != nil {
|
|
return nil, entity.ErrClusterNotFound
|
|
}
|
|
if !authz.CanReadResource(principal, cluster.WorkspaceID, cluster.OwnerID, cluster.Visibility) {
|
|
return nil, entity.ErrClusterNotFound
|
|
}
|
|
if !principal.IsAdmin() {
|
|
return nil, entity.ErrForbidden
|
|
}
|
|
nodes, err := s.metricsClient.GetNodeMetrics(ctx, clusterID)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get node metrics: %w", err)
|
|
}
|
|
return nodes, nil
|
|
}
|