fix: scale replicas in response, K8s metrics client, quota precheck, auth tests

- Add GetMetrics method to MetricsClient interface and implement cluster metrics API
- Add QuotaPrecheck service for validating resource quotas before deployment
- Add auth DTO with role/permission models and auth handler tests
- Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics
- Update workspace handler with GetWorkspace endpoint and shared-user list
- Fix monitoring handler to use correct service method name
- Add tail_lines fallback in instance handler for snake_case query params
- Update nginx config for SSE log streaming support (no buffering)
- Add comprehensive test coverage: auth_service_test, auth_handler_test,
  auth_dto_test, metrics_client_test, quota_precheck_test
- Update error messages for quota validation and instance operations
- ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit
- InstanceCard: correctly disable scale-minus when replicas <= 0
- SidebarLayout: add hover transition for sidebar items
- Update todo.md and lessons.md with latest fixes
This commit is contained in:
Ivan087
2026-05-20 16:56:29 +08:00
parent 8f90cf0f0d
commit 33ddaf97db
59 changed files with 4805 additions and 457 deletions

View File

@ -3,6 +3,7 @@ package service
import (
"context"
"fmt"
"sort"
"github.com/ocdp/cluster-service/internal/domain/entity"
"github.com/ocdp/cluster-service/internal/domain/repository"
@ -13,16 +14,22 @@ import (
type MonitoringService struct {
clusterRepo repository.ClusterRepository
metricsClient repository.MetricsClient
instanceRepo repository.InstanceRepository
userRepo repository.UserRepository
}
// NewMonitoringService 创建监控服务
func NewMonitoringService(
clusterRepo repository.ClusterRepository,
metricsClient repository.MetricsClient,
instanceRepo repository.InstanceRepository,
userRepo repository.UserRepository,
) *MonitoringService {
return &MonitoringService{
clusterRepo: clusterRepo,
metricsClient: metricsClient,
instanceRepo: instanceRepo,
userRepo: userRepo,
}
}
@ -43,6 +50,8 @@ func (s *MonitoringService) GetClusterMonitoring(ctx context.Context, clusterID
if err != nil {
return nil, fmt.Errorf("failed to get cluster metrics: %w", err)
}
s.enrichResourceUsage(ctx, principal, metrics)
s.scopeTenantMetrics(principal, metrics)
return metrics, nil
}
@ -75,12 +84,310 @@ func (s *MonitoringService) ListClusterMonitoring(ctx context.Context) ([]*entit
Status: "unknown",
}
}
s.enrichResourceUsage(ctx, principal, metrics)
s.scopeTenantMetrics(principal, metrics)
result = append(result, metrics)
}
return result, nil
}
func (s *MonitoringService) enrichResourceUsage(ctx context.Context, principal *authz.Principal, metrics *entity.ClusterMetrics) {
if metrics == nil || s.instanceRepo == nil || s.metricsClient == nil {
s.addVisibleUserRows(ctx, principal, metrics)
return
}
instances, err := s.instanceRepo.ListByCluster(ctx, metrics.ClusterID)
if err != nil {
fmt.Printf("Warning: failed to list instances for cluster %s resource usage: %v\n", metrics.ClusterID, err)
s.addVisibleUserRows(ctx, principal, metrics)
return
}
allocations, err := s.metricsClient.GetPodResourceAllocations(ctx, metrics.ClusterID)
if err != nil {
fmt.Printf("Warning: failed to list pod resource allocations for cluster %s: %v\n", metrics.ClusterID, err)
s.addVisibleUserRows(ctx, principal, metrics)
return
}
visibleInstances := make(map[string]*entity.Instance)
for _, instance := range instances {
if instance == nil || !canReadMonitoringInstance(principal, instance) {
continue
}
key := monitoringInstanceKey(instance.Namespace, instance.Name)
visibleInstances[key] = instance
}
type usageAccumulator struct {
userID string
username string
workspaceID string
allocation entity.ResourceAllocation
podCount int
instances map[string]struct{}
}
byUser := make(map[string]*usageAccumulator)
total := entity.ResourceAllocation{}
for _, pod := range allocations {
if pod == nil {
continue
}
instance := visibleInstances[monitoringInstanceKey(pod.Namespace, pod.InstanceName)]
if instance == nil {
continue
}
total = addResourceAllocation(total, pod.Allocation)
username := instance.OwnerUsername
if username == "" {
username = s.usernameForOwner(ctx, instance.OwnerID, principal)
}
acc := byUser[instance.OwnerID]
if acc == nil {
acc = &usageAccumulator{
userID: instance.OwnerID,
username: username,
workspaceID: instance.WorkspaceID,
instances: map[string]struct{}{},
}
byUser[instance.OwnerID] = acc
}
if acc.username == "" {
acc.username = username
}
acc.allocation = addResourceAllocation(acc.allocation, pod.Allocation)
acc.podCount++
acc.instances[instance.ID] = struct{}{}
}
metrics.CPURequests = formatCPUAllocation(total.CPURequestsMilli)
metrics.CPULimits = formatCPUAllocation(total.CPULimitsMilli)
metrics.MemoryRequests = formatMemoryAllocation(total.MemoryRequestsBytes)
metrics.MemoryLimits = formatMemoryAllocation(total.MemoryLimitsBytes)
metrics.GPURequests = total.GPURequests
metrics.GPULimits = total.GPULimits
metrics.GPUMemoryRequestsMB = total.GPUMemoryRequestsMB
metrics.GPUMemoryLimitsMB = total.GPUMemoryLimitsMB
metrics.AllocatedGPU = total.GPURequests
metrics.AllocatedGPUMemoryMB = total.GPUMemoryRequestsMB
userIDs := make([]string, 0, len(byUser))
for userID := range byUser {
userIDs = append(userIDs, userID)
}
sort.Slice(userIDs, func(i, j int) bool {
left := byUser[userIDs[i]]
right := byUser[userIDs[j]]
if left.username == right.username {
return left.userID < right.userID
}
return left.username < right.username
})
usage := make([]entity.UserResourceUsage, 0, len(userIDs))
for _, userID := range userIDs {
acc := byUser[userID]
usage = append(usage, entity.UserResourceUsage{
UserID: acc.userID,
Username: acc.username,
WorkspaceID: acc.workspaceID,
InstanceCount: len(acc.instances),
PodCount: acc.podCount,
CPURequests: formatCPUAllocation(acc.allocation.CPURequestsMilli),
CPULimits: formatCPUAllocation(acc.allocation.CPULimitsMilli),
MemoryRequests: formatMemoryAllocation(acc.allocation.MemoryRequestsBytes),
MemoryLimits: formatMemoryAllocation(acc.allocation.MemoryLimitsBytes),
GPURequests: acc.allocation.GPURequests,
GPULimits: acc.allocation.GPULimits,
GPUMemoryRequestsMB: acc.allocation.GPUMemoryRequestsMB,
GPUMemoryLimitsMB: acc.allocation.GPUMemoryLimitsMB,
})
}
metrics.ResourceUsageByUser = usage
s.addVisibleUserRows(ctx, principal, metrics)
}
func (s *MonitoringService) addVisibleUserRows(ctx context.Context, principal *authz.Principal, metrics *entity.ClusterMetrics) {
if principal == nil || metrics == nil {
return
}
existing := make(map[string]struct{}, len(metrics.ResourceUsageByUser))
for _, row := range metrics.ResourceUsageByUser {
if row.UserID != "" {
existing[row.UserID] = struct{}{}
}
}
appendEmpty := func(userID, username, workspaceID string) {
if userID == "" {
return
}
if _, ok := existing[userID]; ok {
return
}
metrics.ResourceUsageByUser = append(metrics.ResourceUsageByUser, entity.UserResourceUsage{
UserID: userID,
Username: username,
WorkspaceID: workspaceID,
InstanceCount: 0,
PodCount: 0,
CPURequests: "0 cores",
CPULimits: "0 cores",
MemoryRequests: "0 B",
MemoryLimits: "0 B",
})
existing[userID] = struct{}{}
}
if !principal.IsAdmin() {
appendEmpty(principal.UserID, principal.Username, principal.WorkspaceID)
return
}
if s.userRepo == nil {
return
}
users, err := s.userRepo.List(ctx)
if err != nil {
fmt.Printf("Warning: failed to list users for monitoring rows: %v\n", err)
return
}
for _, user := range users {
if user == nil || user.Role != authz.RoleUser || !user.IsActive {
continue
}
appendEmpty(user.ID, user.Username, user.WorkspaceID)
}
sort.Slice(metrics.ResourceUsageByUser, func(i, j int) bool {
left := metrics.ResourceUsageByUser[i]
right := metrics.ResourceUsageByUser[j]
if left.Username == right.Username {
return left.UserID < right.UserID
}
return left.Username < right.Username
})
}
func (s *MonitoringService) scopeTenantMetrics(principal *authz.Principal, metrics *entity.ClusterMetrics) {
if principal == nil || principal.IsAdmin() || metrics == nil {
return
}
var total entity.ResourceAllocation
podCount := 0
instanceCount := 0
for _, usage := range metrics.ResourceUsageByUser {
if usage.UserID != principal.UserID {
continue
}
podCount += usage.PodCount
instanceCount += usage.InstanceCount
total.GPURequests += usage.GPURequests
total.GPULimits += usage.GPULimits
total.GPUMemoryRequestsMB += usage.GPUMemoryRequestsMB
total.GPUMemoryLimitsMB += usage.GPUMemoryLimitsMB
}
metrics.NodeCount = 0
metrics.Nodes = nil
metrics.PodCount = podCount
metrics.TotalCPU = ""
metrics.TotalMemory = ""
metrics.TotalGPU = 0
metrics.UsedCPU = metrics.CPURequests
metrics.UsedMemory = metrics.MemoryRequests
metrics.UsedGPU = int(total.GPURequests)
metrics.CPUUsage = 0
metrics.MemoryUsage = 0
metrics.GPUUsage = 0
metrics.MaxNodeCPU = ""
metrics.MaxNodeMemory = ""
metrics.MaxNodeGPU = 0
metrics.MaxNodeCPUUsage = 0
metrics.MaxNodeMemUsage = 0
metrics.MaxNodeGPUUsage = 0
metrics.ResourceUsageByUser = filterSelfUsage(principal.UserID, metrics.ResourceUsageByUser)
if instanceCount == 0 {
metrics.CPURequests = ""
metrics.CPULimits = ""
metrics.MemoryRequests = ""
metrics.MemoryLimits = ""
metrics.GPURequests = 0
metrics.GPULimits = 0
metrics.GPUMemoryRequestsMB = 0
metrics.GPUMemoryLimitsMB = 0
metrics.AllocatedGPU = 0
metrics.AllocatedGPUMemoryMB = 0
}
}
func filterSelfUsage(userID string, usage []entity.UserResourceUsage) []entity.UserResourceUsage {
filtered := make([]entity.UserResourceUsage, 0, len(usage))
for _, row := range usage {
if row.UserID == userID {
filtered = append(filtered, row)
}
}
return filtered
}
func canReadMonitoringInstance(principal *authz.Principal, instance *entity.Instance) bool {
if principal == nil || instance == nil {
return false
}
if principal.IsAdmin() {
return true
}
return instance.WorkspaceID == principal.WorkspaceID && instance.OwnerID == principal.UserID
}
func (s *MonitoringService) usernameForOwner(ctx context.Context, ownerID string, principal *authz.Principal) string {
if ownerID == "" {
return ""
}
if principal != nil && ownerID == principal.UserID {
return principal.Username
}
if s.userRepo == nil {
return ""
}
user, err := s.userRepo.GetByID(ctx, ownerID)
if err != nil || user == nil {
return ""
}
return user.Username
}
func monitoringInstanceKey(namespace, name string) string {
return namespace + "/" + name
}
func addResourceAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
return entity.ResourceAllocation{
CPURequestsMilli: left.CPURequestsMilli + right.CPURequestsMilli,
CPULimitsMilli: left.CPULimitsMilli + right.CPULimitsMilli,
MemoryRequestsBytes: left.MemoryRequestsBytes + right.MemoryRequestsBytes,
MemoryLimitsBytes: left.MemoryLimitsBytes + right.MemoryLimitsBytes,
GPURequests: left.GPURequests + right.GPURequests,
GPULimits: left.GPULimits + right.GPULimits,
GPUMemoryRequestsMB: left.GPUMemoryRequestsMB + right.GPUMemoryRequestsMB,
GPUMemoryLimitsMB: left.GPUMemoryLimitsMB + right.GPUMemoryLimitsMB,
}
}
func formatCPUAllocation(milli int64) string {
return fmt.Sprintf("%.2f cores", float64(milli)/1000.0)
}
func formatMemoryAllocation(bytes int64) string {
const unit = 1024
if bytes < unit {
return fmt.Sprintf("%d B", bytes)
}
div, exp := int64(unit), 0
for n := bytes / unit; n >= unit; n /= unit {
div *= unit
exp++
}
return fmt.Sprintf("%.1f %ciB", float64(bytes)/float64(div), "KMGTPE"[exp])
}
// GetMonitoringSummary 获取监控汇总信息
func (s *MonitoringService) GetMonitoringSummary(ctx context.Context) (*entity.MonitoringSummary, error) {
// 获取所有集群监控数据
@ -123,6 +430,9 @@ func (s *MonitoringService) GetNodeMetrics(ctx context.Context, clusterID string
if !authz.CanReadResource(principal, cluster.WorkspaceID, cluster.OwnerID, cluster.Visibility) {
return nil, entity.ErrClusterNotFound
}
if !principal.IsAdmin() {
return nil, entity.ErrForbidden
}
nodes, err := s.metricsClient.GetNodeMetrics(ctx, clusterID)
if err != nil {
return nil, fmt.Errorf("failed to get node metrics: %w", err)