fix: scale replicas in response, K8s metrics client, quota precheck, auth tests
- Add GetMetrics method to MetricsClient interface and implement cluster metrics API - Add QuotaPrecheck service for validating resource quotas before deployment - Add auth DTO with role/permission models and auth handler tests - Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics - Update workspace handler with GetWorkspace endpoint and shared-user list - Fix monitoring handler to use correct service method name - Add tail_lines fallback in instance handler for snake_case query params - Update nginx config for SSE log streaming support (no buffering) - Add comprehensive test coverage: auth_service_test, auth_handler_test, auth_dto_test, metrics_client_test, quota_precheck_test - Update error messages for quota validation and instance operations - ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit - InstanceCard: correctly disable scale-minus when replicas <= 0 - SidebarLayout: add hover transition for sidebar items - Update todo.md and lessons.md with latest fixes
This commit is contained in:
@ -2,6 +2,7 @@ package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@ -18,6 +19,10 @@ import (
|
||||
type AuthService struct {
|
||||
userRepo repository.UserRepository
|
||||
workspaceRepo repository.WorkspaceRepository
|
||||
instanceRepo repository.InstanceRepository
|
||||
clusterRepo repository.ClusterRepository
|
||||
bindingRepo repository.WorkspaceClusterBindingRepository
|
||||
tenantClient repository.TenantKubeClient
|
||||
passwordHasher PasswordHasher
|
||||
tokenGenerator TokenGenerator
|
||||
}
|
||||
@ -53,6 +58,18 @@ func NewAuthService(
|
||||
}
|
||||
}
|
||||
|
||||
func (s *AuthService) SetUserLifecycleCleanup(
|
||||
instanceRepo repository.InstanceRepository,
|
||||
clusterRepo repository.ClusterRepository,
|
||||
bindingRepo repository.WorkspaceClusterBindingRepository,
|
||||
tenantClient repository.TenantKubeClient,
|
||||
) {
|
||||
s.instanceRepo = instanceRepo
|
||||
s.clusterRepo = clusterRepo
|
||||
s.bindingRepo = bindingRepo
|
||||
s.tenantClient = tenantClient
|
||||
}
|
||||
|
||||
// Register registers a new user. The business entry point may only be called by an admin; the initial admin is created by the bootstrap seeder.
|
||||
type UserWorkspaceOptions struct {
|
||||
Namespace string
|
||||
@ -87,6 +104,9 @@ func (s *AuthService) Register(ctx context.Context, username, password, role, wo
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if normalizeUserRole(role) == authz.RoleUser {
|
||||
normalizedOpts = defaultUserQuotaOptions(normalizedOpts)
|
||||
}
|
||||
|
||||
// 默认生成占位邮箱,避免数据库约束失败
|
||||
email := username + "@local.ocdp"
|
||||
@ -96,7 +116,7 @@ func (s *AuthService) Register(ctx context.Context, username, password, role, wo
|
||||
user.ID = uuid.New().String()
|
||||
user.Role = normalizeUserRole(role)
|
||||
user.WorkspaceID = workspaceID
|
||||
if user.Role == authz.RoleUser && (user.WorkspaceID == "" || user.WorkspaceID == entity.DefaultWorkspaceID) {
|
||||
if user.Role == authz.RoleUser {
|
||||
workspace, err := s.createUserWorkspace(ctx, username, principal.UserID, normalizedOpts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -131,10 +151,7 @@ func (s *AuthService) createUserWorkspace(ctx context.Context, username, created
|
||||
if s.workspaceRepo == nil {
|
||||
return nil, entity.ErrWorkspaceNotFound
|
||||
}
|
||||
name := strings.TrimPrefix(entity.NamespaceForUser(username), "ocdp-u-")
|
||||
workspace := entity.NewWorkspace(name, createdBy)
|
||||
workspace.ID = uuid.New().String()
|
||||
workspace.DefaultClusterID = strings.TrimSpace(opts.DefaultClusterID)
|
||||
name := userWorkspaceName(username)
|
||||
namespace := strings.TrimSpace(opts.Namespace)
|
||||
if namespace == "" {
|
||||
namespace = entity.NamespaceForUser(username)
|
||||
@ -143,6 +160,32 @@ func (s *AuthService) createUserWorkspace(ctx context.Context, username, created
|
||||
if len(validation.IsDNS1123Label(namespace)) > 0 {
|
||||
return nil, entity.ErrInvalidNamespace
|
||||
}
|
||||
}
|
||||
if existing, err := s.workspaceRepo.GetByName(ctx, name); err == nil && existing != nil {
|
||||
if namespace != "" && existing.K8sNamespace != namespace {
|
||||
if err := s.ensureNamespaceAvailable(ctx, namespace, existing.ID); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
applyWorkspaceOptions(existing, opts)
|
||||
if namespace != "" {
|
||||
existing.K8sNamespace = namespace
|
||||
existing.K8sSAName = entity.ServiceAccountForNamespace(namespace)
|
||||
}
|
||||
if err := s.workspaceRepo.Update(ctx, existing); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return existing, nil
|
||||
} else if err != nil && !errors.Is(err, entity.ErrWorkspaceNotFound) {
|
||||
return nil, err
|
||||
}
|
||||
if err := s.ensureNamespaceAvailable(ctx, namespace, ""); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
workspace := entity.NewWorkspace(name, createdBy)
|
||||
workspace.ID = uuid.New().String()
|
||||
workspace.DefaultClusterID = strings.TrimSpace(opts.DefaultClusterID)
|
||||
if namespace != "" {
|
||||
workspace.K8sNamespace = namespace
|
||||
workspace.K8sSAName = entity.ServiceAccountForNamespace(namespace)
|
||||
}
|
||||
@ -151,11 +194,45 @@ func (s *AuthService) createUserWorkspace(ctx context.Context, username, created
|
||||
workspace.QuotaGPU = strings.TrimSpace(opts.QuotaGPU)
|
||||
workspace.QuotaGPUMem = strings.TrimSpace(opts.QuotaGPUMem)
|
||||
if err := s.workspaceRepo.Create(ctx, workspace); err != nil {
|
||||
if errors.Is(err, entity.ErrWorkspaceExists) {
|
||||
existing, getErr := s.workspaceRepo.GetByName(ctx, name)
|
||||
if getErr != nil {
|
||||
return nil, err
|
||||
}
|
||||
if existing.K8sNamespace != namespace {
|
||||
return nil, entity.ErrWorkspaceNamespaceConflict
|
||||
}
|
||||
return existing, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
return workspace, nil
|
||||
}
|
||||
|
||||
func userWorkspaceName(username string) string {
|
||||
return strings.TrimPrefix(entity.NamespaceForUser(username), "ocdp-u-")
|
||||
}
|
||||
|
||||
func (s *AuthService) ensureNamespaceAvailable(ctx context.Context, namespace, allowedWorkspaceID string) error {
|
||||
if s.workspaceRepo == nil || strings.TrimSpace(namespace) == "" {
|
||||
return nil
|
||||
}
|
||||
workspaces, err := s.workspaceRepo.List(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, workspace := range workspaces {
|
||||
if workspace == nil || workspace.K8sNamespace != namespace {
|
||||
continue
|
||||
}
|
||||
if allowedWorkspaceID != "" && workspace.ID == allowedWorkspaceID {
|
||||
continue
|
||||
}
|
||||
return entity.ErrWorkspaceNamespaceConflict
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func normalizeQuotaOptions(opts UserWorkspaceOptions) (UserWorkspaceOptions, error) {
|
||||
opts.Namespace = strings.TrimSpace(opts.Namespace)
|
||||
opts.DefaultClusterID = strings.TrimSpace(opts.DefaultClusterID)
|
||||
@ -181,6 +258,16 @@ func normalizeQuotaOptions(opts UserWorkspaceOptions) (UserWorkspaceOptions, err
|
||||
return opts, nil
|
||||
}
|
||||
|
||||
func defaultUserQuotaOptions(opts UserWorkspaceOptions) UserWorkspaceOptions {
|
||||
if strings.TrimSpace(opts.QuotaGPU) == "" {
|
||||
opts.QuotaGPU = "0"
|
||||
}
|
||||
if strings.TrimSpace(opts.QuotaGPUMem) == "" {
|
||||
opts.QuotaGPUMem = "0"
|
||||
}
|
||||
return opts
|
||||
}
|
||||
|
||||
func (s *AuthService) ListUsers(ctx context.Context) ([]*entity.User, error) {
|
||||
principal, err := authz.RequirePrincipal(ctx)
|
||||
if err != nil {
|
||||
@ -204,25 +291,35 @@ func (s *AuthService) UpdateUser(ctx context.Context, userID, role, workspaceID
|
||||
if err != nil {
|
||||
return nil, entity.ErrUserNotFound
|
||||
}
|
||||
previousRole := user.Role
|
||||
if role != "" {
|
||||
user.Role = normalizeUserRole(role)
|
||||
}
|
||||
if workspaceID != "" {
|
||||
if workspaceID != "" && user.Role != authz.RoleUser {
|
||||
user.WorkspaceID = workspaceID
|
||||
}
|
||||
workspaceHandled := false
|
||||
if user.Role == authz.RoleAdmin {
|
||||
user.WorkspaceID = entity.DefaultWorkspaceID
|
||||
}
|
||||
if user.Role == authz.RoleUser && (user.WorkspaceID == "" || user.WorkspaceID == entity.DefaultWorkspaceID) {
|
||||
if user.Role == authz.RoleUser && (role != "" || workspaceID != "" || hasWorkspaceUpdates(opts)) {
|
||||
normalizedOpts, err := normalizeQuotaOptions(opts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
workspace, err := s.createUserWorkspace(ctx, user.Username, principal.UserID, normalizedOpts)
|
||||
normalizedOpts = defaultUserQuotaOptions(normalizedOpts)
|
||||
currentWorkspace, _ := s.currentUserWorkspace(ctx, user)
|
||||
if currentWorkspace != nil && shouldCreatePrivateWorkspace(user, previousRole, currentWorkspace) {
|
||||
if normalizedOpts.Namespace == "" || normalizedOpts.Namespace == currentWorkspace.K8sNamespace {
|
||||
normalizedOpts.Namespace = ""
|
||||
}
|
||||
}
|
||||
workspace, err := s.ensureUserWorkspaceForUpdate(ctx, user, previousRole, currentWorkspace, opts, normalizedOpts, principal.UserID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
user.WorkspaceID = workspace.ID
|
||||
workspaceHandled = true
|
||||
}
|
||||
if isActive != nil {
|
||||
if user.ID == principal.UserID && !*isActive {
|
||||
@ -233,7 +330,7 @@ func (s *AuthService) UpdateUser(ctx context.Context, userID, role, workspaceID
|
||||
if mustChangePassword != nil {
|
||||
user.MustChangePassword = *mustChangePassword
|
||||
}
|
||||
if user.Role != authz.RoleAdmin && hasWorkspaceUpdates(opts) {
|
||||
if user.Role != authz.RoleAdmin && !workspaceHandled && hasWorkspaceUpdates(opts) {
|
||||
normalizedOpts, err := normalizeQuotaOptions(opts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -242,10 +339,13 @@ func (s *AuthService) UpdateUser(ctx context.Context, userID, role, workspaceID
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyWorkspaceOptions(workspace, normalizedOpts)
|
||||
applyWorkspaceOptionsForUpdate(workspace, opts, normalizedOpts)
|
||||
if err := s.workspaceRepo.Update(ctx, workspace); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := s.syncWorkspaceBindings(ctx, workspace); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
user.RevokedAfter = time.Now()
|
||||
user.UpdatedAt = time.Now()
|
||||
@ -289,6 +389,115 @@ func applyWorkspaceOptions(workspace *entity.Workspace, opts UserWorkspaceOption
|
||||
}
|
||||
}
|
||||
|
||||
func (s *AuthService) currentUserWorkspace(ctx context.Context, user *entity.User) (*entity.Workspace, error) {
|
||||
if s.workspaceRepo == nil || user == nil || user.WorkspaceID == "" {
|
||||
return nil, entity.ErrWorkspaceNotFound
|
||||
}
|
||||
return s.workspaceRepo.GetByID(ctx, user.WorkspaceID)
|
||||
}
|
||||
|
||||
func shouldCreatePrivateWorkspace(user *entity.User, previousRole string, current *entity.Workspace) bool {
|
||||
if user == nil {
|
||||
return true
|
||||
}
|
||||
if previousRole == authz.RoleAdmin || user.WorkspaceID == "" || user.WorkspaceID == entity.DefaultWorkspaceID {
|
||||
return true
|
||||
}
|
||||
if current == nil {
|
||||
return true
|
||||
}
|
||||
return current.Name != userWorkspaceName(user.Username)
|
||||
}
|
||||
|
||||
func (s *AuthService) ensureUserWorkspaceForUpdate(ctx context.Context, user *entity.User, previousRole string, current *entity.Workspace, rawOpts, normalizedOpts UserWorkspaceOptions, createdBy string) (*entity.Workspace, error) {
|
||||
if s.workspaceRepo == nil {
|
||||
return nil, entity.ErrWorkspaceNotFound
|
||||
}
|
||||
if shouldCreatePrivateWorkspace(user, previousRole, current) {
|
||||
return s.createUserWorkspace(ctx, user.Username, createdBy, normalizedOpts)
|
||||
}
|
||||
if rawNamespace := strings.TrimSpace(rawOpts.Namespace); rawNamespace != "" && rawNamespace != current.K8sNamespace {
|
||||
if err := s.ensureNamespaceAvailable(ctx, rawNamespace, current.ID); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
applyWorkspaceOptionsForUpdate(current, rawOpts, normalizedOpts)
|
||||
if err := s.workspaceRepo.Update(ctx, current); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := s.syncWorkspaceBindings(ctx, current); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return current, nil
|
||||
}
|
||||
|
||||
func applyWorkspaceOptionsForUpdate(workspace *entity.Workspace, rawOpts, normalizedOpts UserWorkspaceOptions) {
|
||||
if namespace := strings.TrimSpace(rawOpts.Namespace); namespace != "" {
|
||||
workspace.K8sNamespace = namespace
|
||||
workspace.K8sSAName = entity.ServiceAccountForNamespace(namespace)
|
||||
}
|
||||
if strings.TrimSpace(rawOpts.DefaultClusterID) != "" {
|
||||
workspace.DefaultClusterID = normalizedOpts.DefaultClusterID
|
||||
}
|
||||
if strings.TrimSpace(rawOpts.QuotaCPU) != "" {
|
||||
workspace.QuotaCPU = normalizedOpts.QuotaCPU
|
||||
}
|
||||
if strings.TrimSpace(rawOpts.QuotaMemory) != "" {
|
||||
workspace.QuotaMemory = normalizedOpts.QuotaMemory
|
||||
}
|
||||
if strings.TrimSpace(rawOpts.QuotaGPU) != "" {
|
||||
workspace.QuotaGPU = normalizedOpts.QuotaGPU
|
||||
}
|
||||
if strings.TrimSpace(rawOpts.QuotaGPUMem) != "" {
|
||||
workspace.QuotaGPUMem = normalizedOpts.QuotaGPUMem
|
||||
}
|
||||
}
|
||||
|
||||
func (s *AuthService) syncWorkspaceBindings(ctx context.Context, workspace *entity.Workspace) error {
|
||||
if workspace == nil || s.bindingRepo == nil {
|
||||
return nil
|
||||
}
|
||||
bindings, err := s.bindingRepo.ListByWorkspace(ctx, workspace.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, binding := range bindings {
|
||||
if binding == nil {
|
||||
continue
|
||||
}
|
||||
binding.QuotaCPU = strings.TrimSpace(workspace.QuotaCPU)
|
||||
binding.QuotaMemory = strings.TrimSpace(workspace.QuotaMemory)
|
||||
binding.QuotaGPU = strings.TrimSpace(workspace.QuotaGPU)
|
||||
if binding.QuotaGPU == "" {
|
||||
binding.QuotaGPU = "0"
|
||||
}
|
||||
binding.QuotaGPUMem = strings.TrimSpace(workspace.QuotaGPUMem)
|
||||
if binding.QuotaGPUMem == "" {
|
||||
binding.QuotaGPUMem = "0"
|
||||
}
|
||||
binding.UpdatedAt = time.Now()
|
||||
if s.tenantClient != nil && s.clusterRepo != nil {
|
||||
cluster, err := s.clusterRepo.GetByID(ctx, binding.ClusterID)
|
||||
if err != nil {
|
||||
if errors.Is(err, entity.ErrClusterNotFound) {
|
||||
continue
|
||||
}
|
||||
return err
|
||||
}
|
||||
tenantBinding := entity.NewTenantBinding(binding.Namespace)
|
||||
tenantBinding.ServiceAccountName = binding.ServiceAccount
|
||||
tenantBinding.ResourceQuotaHard = bindingQuotaHard(binding)
|
||||
if err := s.tenantClient.EnsureTenant(ctx, cluster, tenantBinding); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := s.bindingRepo.Upsert(ctx, binding); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *AuthService) DeleteUser(ctx context.Context, userID string) error {
|
||||
principal, err := authz.RequirePrincipal(ctx)
|
||||
if err != nil {
|
||||
@ -300,9 +509,117 @@ func (s *AuthService) DeleteUser(ctx context.Context, userID string) error {
|
||||
if userID == principal.UserID {
|
||||
return entity.ErrForbidden
|
||||
}
|
||||
user, err := s.userRepo.GetByID(ctx, userID)
|
||||
if err != nil {
|
||||
return entity.ErrUserNotFound
|
||||
}
|
||||
if err := s.ensureUserHasNoInstances(ctx, user); err != nil {
|
||||
return err
|
||||
}
|
||||
if s.isExclusiveUserWorkspace(ctx, user) {
|
||||
if err := s.cleanupUserWorkspace(ctx, user.WorkspaceID); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return s.userRepo.Delete(ctx, userID)
|
||||
}
|
||||
|
||||
func (s *AuthService) ensureUserHasNoInstances(ctx context.Context, user *entity.User) error {
|
||||
if s.instanceRepo == nil || user == nil {
|
||||
return nil
|
||||
}
|
||||
instances, err := s.instanceRepo.List(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, instance := range instances {
|
||||
if instance == nil {
|
||||
continue
|
||||
}
|
||||
if instance.OwnerID == user.ID {
|
||||
return entity.ErrUserHasInstances
|
||||
}
|
||||
if user.WorkspaceID != "" && user.WorkspaceID != entity.DefaultWorkspaceID && instance.WorkspaceID == user.WorkspaceID {
|
||||
return entity.ErrUserHasInstances
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *AuthService) isExclusiveUserWorkspace(ctx context.Context, user *entity.User) bool {
|
||||
if user == nil || user.Role == authz.RoleAdmin || user.WorkspaceID == "" || user.WorkspaceID == entity.DefaultWorkspaceID {
|
||||
return false
|
||||
}
|
||||
users, err := s.userRepo.List(ctx)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
for _, other := range users {
|
||||
if other == nil || other.ID == user.ID {
|
||||
continue
|
||||
}
|
||||
if other.WorkspaceID == user.WorkspaceID {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (s *AuthService) cleanupUserWorkspace(ctx context.Context, workspaceID string) error {
|
||||
if s.workspaceRepo == nil || s.bindingRepo == nil {
|
||||
return nil
|
||||
}
|
||||
workspace, err := s.workspaceRepo.GetByID(ctx, workspaceID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if isProtectedWorkspaceNamespace(workspace.K8sNamespace) {
|
||||
return entity.ErrProtectedNamespace
|
||||
}
|
||||
bindings, err := s.bindingRepo.ListByWorkspace(ctx, workspace.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, binding := range bindings {
|
||||
if binding == nil {
|
||||
continue
|
||||
}
|
||||
if isProtectedWorkspaceNamespace(binding.Namespace) {
|
||||
return entity.ErrProtectedNamespace
|
||||
}
|
||||
if s.tenantClient != nil && s.clusterRepo != nil {
|
||||
cluster, err := s.clusterRepo.GetByID(ctx, binding.ClusterID)
|
||||
if err != nil && !errors.Is(err, entity.ErrClusterNotFound) {
|
||||
return err
|
||||
}
|
||||
if err == nil {
|
||||
tenantBinding := entity.NewTenantBinding(binding.Namespace)
|
||||
tenantBinding.ServiceAccountName = binding.ServiceAccount
|
||||
tenantBinding.ResourceQuotaHard = resourceQuotaHard(workspace)
|
||||
if err := s.tenantClient.DeleteTenant(ctx, cluster, tenantBinding); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := s.bindingRepo.Delete(ctx, binding.WorkspaceID, binding.ClusterID); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := s.workspaceRepo.Delete(ctx, workspace.ID); err != nil && !errors.Is(err, entity.ErrWorkspaceNotFound) {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// isProtectedWorkspaceNamespace reports whether namespace, after trimming
// surrounding whitespace, is blank or one of the built-in Kubernetes
// namespaces that must never be deleted as part of workspace cleanup.
func isProtectedWorkspaceNamespace(namespace string) bool {
	trimmed := strings.TrimSpace(namespace)
	if trimmed == "" {
		return true
	}
	for _, reserved := range [...]string{"default", "kube-system", "kube-public", "kube-node-lease"} {
		if trimmed == reserved {
			return true
		}
	}
	return false
}
|
||||
|
||||
func normalizeUserRole(role string) string {
|
||||
if role == authz.RoleAdmin {
|
||||
return authz.RoleAdmin
|
||||
|
||||
322
backend/internal/domain/service/auth_service_test.go
Normal file
322
backend/internal/domain/service/auth_service_test.go
Normal file
@ -0,0 +1,322 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/ocdp/cluster-service/internal/adapter/output/persistence/mock"
|
||||
"github.com/ocdp/cluster-service/internal/domain/entity"
|
||||
"github.com/ocdp/cluster-service/internal/domain/repository"
|
||||
"github.com/ocdp/cluster-service/internal/pkg/authz"
|
||||
jwtpkg "github.com/ocdp/cluster-service/internal/pkg/jwt"
|
||||
)
|
||||
|
||||
func TestAuthServiceUpdateUserDowngradeReusesUsernameWorkspace(t *testing.T) {
|
||||
ctx := adminContext()
|
||||
userRepo := mock.NewUserRepositoryMock()
|
||||
workspaceRepo := mock.NewWorkspaceRepositoryMock()
|
||||
svc := NewAuthService(userRepo, workspaceRepo, testPasswordHasher{}, testTokenGenerator{})
|
||||
|
||||
target := testUser("user-1", "alice", authz.RoleAdmin, entity.DefaultWorkspaceID)
|
||||
if err := userRepo.Create(ctx, target); err != nil {
|
||||
t.Fatalf("seed user: %v", err)
|
||||
}
|
||||
workspace := entity.NewWorkspace(userWorkspaceName("alice"), "admin")
|
||||
workspace.ID = "workspace-alice"
|
||||
workspace.K8sNamespace = entity.NamespaceForUser("alice")
|
||||
workspace.K8sSAName = entity.ServiceAccountForNamespace(workspace.K8sNamespace)
|
||||
if err := workspaceRepo.Create(ctx, workspace); err != nil {
|
||||
t.Fatalf("seed workspace: %v", err)
|
||||
}
|
||||
|
||||
updated, err := svc.UpdateUser(ctx, target.ID, authz.RoleUser, "", UserWorkspaceOptions{DefaultClusterID: "cluster-1"}, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("UpdateUser returned error: %v", err)
|
||||
}
|
||||
|
||||
if updated.Role != authz.RoleUser {
|
||||
t.Fatalf("expected user role, got %q", updated.Role)
|
||||
}
|
||||
if updated.WorkspaceID != workspace.ID {
|
||||
t.Fatalf("expected reused workspace %q, got %q", workspace.ID, updated.WorkspaceID)
|
||||
}
|
||||
reused, err := workspaceRepo.GetByID(ctx, workspace.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("get reused workspace: %v", err)
|
||||
}
|
||||
if reused.DefaultClusterID != "cluster-1" {
|
||||
t.Fatalf("expected updated default cluster, got %q", reused.DefaultClusterID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuthServiceRegisterUserAlwaysCreatesPrivateWorkspaceWithZeroDefaultQuotas(t *testing.T) {
|
||||
ctx := adminContext()
|
||||
userRepo := mock.NewUserRepositoryMock()
|
||||
workspaceRepo := mock.NewWorkspaceRepositoryMock()
|
||||
svc := NewAuthService(userRepo, workspaceRepo, testPasswordHasher{}, testTokenGenerator{})
|
||||
|
||||
user, err := svc.Register(ctx, "alice", "password", authz.RoleUser, "shared-workspace", UserWorkspaceOptions{}, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("Register returned error: %v", err)
|
||||
}
|
||||
if user.WorkspaceID == "shared-workspace" || user.WorkspaceID == entity.DefaultWorkspaceID {
|
||||
t.Fatalf("expected private user workspace, got %q", user.WorkspaceID)
|
||||
}
|
||||
workspace, err := workspaceRepo.GetByID(ctx, user.WorkspaceID)
|
||||
if err != nil {
|
||||
t.Fatalf("get user workspace: %v", err)
|
||||
}
|
||||
if workspace.K8sNamespace != entity.NamespaceForUser("alice") {
|
||||
t.Fatalf("expected user namespace %q, got %q", entity.NamespaceForUser("alice"), workspace.K8sNamespace)
|
||||
}
|
||||
if workspace.QuotaCPU != "" || workspace.QuotaMemory != "" || workspace.QuotaGPU != "0" || workspace.QuotaGPUMem != "0" {
|
||||
t.Fatalf("expected omitted CPU/memory to stay unlimited and GPU/gpumem to default zero, got cpu=%q memory=%q gpu=%q gpumem=%q", workspace.QuotaCPU, workspace.QuotaMemory, workspace.QuotaGPU, workspace.QuotaGPUMem)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuthServiceUpdateUserDowngradeRejectsNamespaceConflict(t *testing.T) {
|
||||
ctx := adminContext()
|
||||
userRepo := mock.NewUserRepositoryMock()
|
||||
workspaceRepo := mock.NewWorkspaceRepositoryMock()
|
||||
svc := NewAuthService(userRepo, workspaceRepo, testPasswordHasher{}, testTokenGenerator{})
|
||||
|
||||
target := testUser("user-1", "alice", authz.RoleAdmin, entity.DefaultWorkspaceID)
|
||||
if err := userRepo.Create(ctx, target); err != nil {
|
||||
t.Fatalf("seed user: %v", err)
|
||||
}
|
||||
conflicting := entity.NewWorkspace("someone-else", "admin")
|
||||
conflicting.ID = "workspace-other"
|
||||
conflicting.K8sNamespace = entity.NamespaceForUser("alice")
|
||||
conflicting.K8sSAName = entity.ServiceAccountForNamespace(conflicting.K8sNamespace)
|
||||
if err := workspaceRepo.Create(ctx, conflicting); err != nil {
|
||||
t.Fatalf("seed conflicting workspace: %v", err)
|
||||
}
|
||||
|
||||
_, err := svc.UpdateUser(ctx, target.ID, authz.RoleUser, "", UserWorkspaceOptions{}, nil, nil)
|
||||
if !errors.Is(err, entity.ErrWorkspaceNamespaceConflict) {
|
||||
t.Fatalf("expected namespace conflict, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuthServiceDeleteUserRejectsUserWithInstances(t *testing.T) {
|
||||
ctx := adminContext()
|
||||
userRepo := mock.NewUserRepositoryMock()
|
||||
workspaceRepo := mock.NewWorkspaceRepositoryMock()
|
||||
instanceRepo := mock.NewInstanceRepositoryMock()
|
||||
svc := NewAuthService(userRepo, workspaceRepo, testPasswordHasher{}, testTokenGenerator{})
|
||||
svc.SetUserLifecycleCleanup(instanceRepo, nil, nil, nil)
|
||||
|
||||
user := testUser("user-1", "alice", authz.RoleUser, "workspace-alice")
|
||||
if err := userRepo.Create(ctx, user); err != nil {
|
||||
t.Fatalf("seed user: %v", err)
|
||||
}
|
||||
instance := entity.NewInstance("cluster-1", "app", "ocdp-u-alice", "registry-1", "repo", "chart", "1.0.0")
|
||||
instance.ID = "instance-1"
|
||||
instance.OwnerID = user.ID
|
||||
instance.WorkspaceID = user.WorkspaceID
|
||||
if err := instanceRepo.Create(ctx, instance); err != nil {
|
||||
t.Fatalf("seed instance: %v", err)
|
||||
}
|
||||
|
||||
err := svc.DeleteUser(ctx, user.ID)
|
||||
if !errors.Is(err, entity.ErrUserHasInstances) {
|
||||
t.Fatalf("expected user instance conflict, got %v", err)
|
||||
}
|
||||
if _, err := userRepo.GetByID(ctx, user.ID); err != nil {
|
||||
t.Fatalf("user should not be deleted: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuthServiceDeleteUserRejectsWorkspaceInstanceEvenWithDifferentOwner(t *testing.T) {
|
||||
ctx := adminContext()
|
||||
userRepo := mock.NewUserRepositoryMock()
|
||||
workspaceRepo := mock.NewWorkspaceRepositoryMock()
|
||||
instanceRepo := mock.NewInstanceRepositoryMock()
|
||||
svc := NewAuthService(userRepo, workspaceRepo, testPasswordHasher{}, testTokenGenerator{})
|
||||
svc.SetUserLifecycleCleanup(instanceRepo, nil, nil, nil)
|
||||
|
||||
user := testUser("user-1", "alice", authz.RoleUser, "workspace-alice")
|
||||
if err := userRepo.Create(ctx, user); err != nil {
|
||||
t.Fatalf("seed user: %v", err)
|
||||
}
|
||||
instance := entity.NewInstance("cluster-1", "shared-workspace-app", "ocdp-u-alice", "registry-1", "repo", "chart", "1.0.0")
|
||||
instance.ID = "instance-1"
|
||||
instance.OwnerID = "other-user"
|
||||
instance.WorkspaceID = user.WorkspaceID
|
||||
if err := instanceRepo.Create(ctx, instance); err != nil {
|
||||
t.Fatalf("seed workspace instance: %v", err)
|
||||
}
|
||||
|
||||
err := svc.DeleteUser(ctx, user.ID)
|
||||
if !errors.Is(err, entity.ErrUserHasInstances) {
|
||||
t.Fatalf("expected workspace instance conflict, got %v", err)
|
||||
}
|
||||
if _, err := userRepo.GetByID(ctx, user.ID); err != nil {
|
||||
t.Fatalf("user should not be deleted: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuthServiceDeleteUserCleansExclusiveWorkspaceBindings(t *testing.T) {
|
||||
ctx := adminContext()
|
||||
userRepo := mock.NewUserRepositoryMock()
|
||||
workspaceRepo := mock.NewWorkspaceRepositoryMock()
|
||||
instanceRepo := mock.NewInstanceRepositoryMock()
|
||||
bindingRepo := mock.NewWorkspaceClusterBindingRepositoryMock()
|
||||
clusterRepo := &testClusterRepo{clusters: map[string]*entity.Cluster{
|
||||
"cluster-1": {ID: "cluster-1", Name: "cluster-1", Host: "https://cluster.invalid", Token: "token"},
|
||||
}}
|
||||
tenantClient := &recordingTenantClient{}
|
||||
svc := NewAuthService(userRepo, workspaceRepo, testPasswordHasher{}, testTokenGenerator{})
|
||||
svc.SetUserLifecycleCleanup(instanceRepo, clusterRepo, bindingRepo, tenantClient)
|
||||
|
||||
workspace := entity.NewWorkspace(userWorkspaceName("alice"), "admin")
|
||||
workspace.ID = "workspace-alice"
|
||||
workspace.K8sNamespace = entity.NamespaceForUser("alice")
|
||||
workspace.K8sSAName = entity.ServiceAccountForNamespace(workspace.K8sNamespace)
|
||||
if err := workspaceRepo.Create(ctx, workspace); err != nil {
|
||||
t.Fatalf("seed workspace: %v", err)
|
||||
}
|
||||
user := testUser("user-1", "alice", authz.RoleUser, workspace.ID)
|
||||
if err := userRepo.Create(ctx, user); err != nil {
|
||||
t.Fatalf("seed user: %v", err)
|
||||
}
|
||||
if err := bindingRepo.Upsert(ctx, &entity.WorkspaceClusterBinding{
|
||||
ID: "binding-1",
|
||||
WorkspaceID: workspace.ID,
|
||||
ClusterID: "cluster-1",
|
||||
Namespace: workspace.K8sNamespace,
|
||||
ServiceAccount: workspace.K8sSAName,
|
||||
Status: "active",
|
||||
}); err != nil {
|
||||
t.Fatalf("seed binding: %v", err)
|
||||
}
|
||||
|
||||
if err := svc.DeleteUser(ctx, user.ID); err != nil {
|
||||
t.Fatalf("DeleteUser returned error: %v", err)
|
||||
}
|
||||
if _, err := userRepo.GetByID(ctx, user.ID); !errors.Is(err, entity.ErrUserNotFound) {
|
||||
t.Fatalf("expected user deleted, got %v", err)
|
||||
}
|
||||
if bindings, err := bindingRepo.ListByWorkspace(ctx, workspace.ID); err != nil || len(bindings) != 0 {
|
||||
t.Fatalf("expected bindings cleaned, got len=%d err=%v", len(bindings), err)
|
||||
}
|
||||
if len(tenantClient.deleted) != 1 || tenantClient.deleted[0] != workspace.K8sNamespace {
|
||||
t.Fatalf("expected tenant namespace cleanup, got %#v", tenantClient.deleted)
|
||||
}
|
||||
if _, err := workspaceRepo.GetByID(ctx, workspace.ID); !errors.Is(err, entity.ErrWorkspaceNotFound) {
|
||||
t.Fatalf("expected exclusive workspace deleted, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func adminContext() context.Context {
|
||||
return authz.WithPrincipal(context.Background(), &authz.Principal{
|
||||
UserID: "admin-1",
|
||||
Username: "admin",
|
||||
Role: authz.RoleAdmin,
|
||||
WorkspaceID: entity.DefaultWorkspaceID,
|
||||
})
|
||||
}
|
||||
|
||||
func testUser(id, username, role, workspaceID string) *entity.User {
|
||||
user := entity.NewUser(username, "hash", username+"@local.ocdp")
|
||||
user.ID = id
|
||||
user.Role = role
|
||||
user.WorkspaceID = workspaceID
|
||||
return user
|
||||
}
|
||||
|
||||
// testPasswordHasher is a stateless hasher stub for tests.
type testPasswordHasher struct{}

// Hash returns the password prefixed with "hash:" and never fails.
func (testPasswordHasher) Hash(password string) (string, error) {
	hashed := "hash:" + password
	return hashed, nil
}

// Verify accepts every password/hash pair.
func (testPasswordHasher) Verify(_, _ string) error {
	return nil
}
|
||||
|
||||
type testTokenGenerator struct{}
|
||||
|
||||
func (testTokenGenerator) Generate(userID, username, role, workspaceID string) (string, string, error) {
|
||||
return "access", "refresh", nil
|
||||
}
|
||||
func (testTokenGenerator) Verify(token string) (string, string, error) { return "", "", nil }
|
||||
func (testTokenGenerator) VerifyWithIssuedAt(token string) (string, string, int64, error) {
|
||||
return "", "", 0, nil
|
||||
}
|
||||
func (testTokenGenerator) VerifyAccess(token string) (*jwtpkg.Claims, error) { return nil, nil }
|
||||
func (testTokenGenerator) VerifyRefresh(token string) (*jwtpkg.Claims, error) { return nil, nil }
|
||||
func (testTokenGenerator) Refresh(refreshToken string) (string, error) { return "access", nil }
|
||||
|
||||
type testClusterRepo struct {
|
||||
clusters map[string]*entity.Cluster
|
||||
}
|
||||
|
||||
func (r *testClusterRepo) Create(ctx context.Context, cluster *entity.Cluster) error {
|
||||
if cluster.ID == "" {
|
||||
cluster.ID = uuid.New().String()
|
||||
}
|
||||
copy := *cluster
|
||||
r.clusters[cluster.ID] = ©
|
||||
return nil
|
||||
}
|
||||
func (r *testClusterRepo) GetByID(ctx context.Context, id string) (*entity.Cluster, error) {
|
||||
cluster, ok := r.clusters[id]
|
||||
if !ok {
|
||||
return nil, entity.ErrClusterNotFound
|
||||
}
|
||||
copy := *cluster
|
||||
return ©, nil
|
||||
}
|
||||
func (r *testClusterRepo) GetByName(ctx context.Context, name string) (*entity.Cluster, error) {
|
||||
for _, cluster := range r.clusters {
|
||||
if cluster.Name == name {
|
||||
copy := *cluster
|
||||
return ©, nil
|
||||
}
|
||||
}
|
||||
return nil, entity.ErrClusterNotFound
|
||||
}
|
||||
func (r *testClusterRepo) Update(ctx context.Context, cluster *entity.Cluster) error {
|
||||
copy := *cluster
|
||||
r.clusters[cluster.ID] = ©
|
||||
return nil
|
||||
}
|
||||
func (r *testClusterRepo) Delete(ctx context.Context, id string) error {
|
||||
delete(r.clusters, id)
|
||||
return nil
|
||||
}
|
||||
func (r *testClusterRepo) List(ctx context.Context) ([]*entity.Cluster, error) {
|
||||
result := make([]*entity.Cluster, 0, len(r.clusters))
|
||||
for _, cluster := range r.clusters {
|
||||
copy := *cluster
|
||||
result = append(result, ©)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
type recordingTenantClient struct {
|
||||
deleted []string
|
||||
usage *repository.ResourceQuotaUsage
|
||||
}
|
||||
|
||||
func (c *recordingTenantClient) EnsureTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
|
||||
return nil
|
||||
}
|
||||
func (c *recordingTenantClient) IssueKubeconfig(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding, ttl time.Duration) (*entity.TenantKubeconfig, error) {
|
||||
return nil, nil
|
||||
}
|
||||
func (c *recordingTenantClient) GetResourceQuotaUsage(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) (*repository.ResourceQuotaUsage, error) {
|
||||
if c.usage != nil {
|
||||
return c.usage, nil
|
||||
}
|
||||
return &repository.ResourceQuotaUsage{}, nil
|
||||
}
|
||||
func (c *recordingTenantClient) SuspendTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
|
||||
return nil
|
||||
}
|
||||
func (c *recordingTenantClient) DeleteTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
|
||||
if err := binding.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
c.deleted = append(c.deleted, binding.Namespace)
|
||||
return nil
|
||||
}
|
||||
@ -6,6 +6,7 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
@ -34,6 +35,7 @@ type InstanceService struct {
|
||||
entryClient repository.InstanceEntryClient
|
||||
diagClient repository.InstanceDiagnosticsClient
|
||||
workspaceRepo repository.WorkspaceRepository
|
||||
userRepo repository.UserRepository
|
||||
tenantClient repository.TenantKubeClient
|
||||
scaleClient ScaleClient
|
||||
}
|
||||
@ -76,6 +78,10 @@ func (s *InstanceService) SetTenantProvisioning(workspaceRepo repository.Workspa
|
||||
s.tenantClient = tenantClient
|
||||
}
|
||||
|
||||
func (s *InstanceService) SetUserRepository(userRepo repository.UserRepository) {
|
||||
s.userRepo = userRepo
|
||||
}
|
||||
|
||||
const chartCacheDir = "/tmp/charts"
|
||||
|
||||
func (s *InstanceService) chartArchivePath(instance *entity.Instance) string {
|
||||
@ -131,15 +137,21 @@ func (s *InstanceService) CreateInstance(ctx context.Context, instance *entity.I
|
||||
return err
|
||||
}
|
||||
enforceNamespaceValues(instance)
|
||||
if err := s.ensureTenantForInstance(ctx, principal, cluster, instance); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Check whether the instance already exists
|
||||
existingInstance, _ := s.instanceRepo.GetByClusterAndName(ctx, instance.ClusterID, instance.Name)
|
||||
if existingInstance != nil {
|
||||
return entity.ErrInstanceExists
|
||||
}
|
||||
if err := s.downloadChart(ctx, registry, instance); err != nil {
|
||||
return err
|
||||
}
|
||||
binding, err := s.ensureTenantForInstance(ctx, principal, cluster, instance)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := s.precheckInstanceQuota(ctx, principal, cluster, binding, instance, nil); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
instance.BeginOperation(entity.OperationInstall, "Preparing installation")
|
||||
|
||||
@ -148,13 +160,6 @@ func (s *InstanceService) CreateInstance(ctx context.Context, instance *entity.I
|
||||
return err
|
||||
}
|
||||
|
||||
// Download the chart artifact for Helm to use
|
||||
if err := s.downloadChart(ctx, registry, instance); err != nil {
|
||||
instance.MarkFailure("Failed to download chart", err)
|
||||
_ = s.instanceRepo.Update(ctx, instance)
|
||||
return err
|
||||
}
|
||||
|
||||
// Run the Helm install asynchronously and monitor its status
|
||||
go s.executeAndSyncInstall(context.Background(), instance.ID, cluster, registry, instance)
|
||||
|
||||
@ -175,6 +180,7 @@ func (s *InstanceService) GetInstance(ctx context.Context, id string) (*entity.I
|
||||
if !s.canReadInstance(principal, instance) {
|
||||
return nil, entity.ErrInstanceNotFound
|
||||
}
|
||||
s.enrichOwnerUsernames(ctx, []*entity.Instance{instance})
|
||||
return instance, nil
|
||||
}
|
||||
|
||||
@ -219,8 +225,22 @@ func (s *InstanceService) UpdateInstance(ctx context.Context, instance *entity.I
|
||||
if !s.canWriteInstance(principal, existingInstance) {
|
||||
return entity.ErrForbidden
|
||||
}
|
||||
instance.ClusterID = existingInstance.ClusterID
|
||||
instance.WorkspaceID = existingInstance.WorkspaceID
|
||||
instance.OwnerID = existingInstance.OwnerID
|
||||
instance.Name = existingInstance.Name
|
||||
if instance.RegistryID == "" {
|
||||
instance.RegistryID = existingInstance.RegistryID
|
||||
}
|
||||
if instance.Repository == "" {
|
||||
instance.Repository = existingInstance.Repository
|
||||
}
|
||||
if instance.Chart == "" {
|
||||
instance.Chart = existingInstance.Chart
|
||||
}
|
||||
if instance.Version == "" {
|
||||
instance.Version = existingInstance.Version
|
||||
}
|
||||
|
||||
// Fetch the cluster information
|
||||
cluster, err := s.clusterRepo.GetByID(ctx, existingInstance.ClusterID)
|
||||
@ -236,15 +256,21 @@ func (s *InstanceService) UpdateInstance(ctx context.Context, instance *entity.I
|
||||
|
||||
instance.Namespace = existingInstance.Namespace
|
||||
enforceNamespaceValues(instance)
|
||||
instance.BeginOperation(entity.OperationUpgrade, "Pending upgrade")
|
||||
if err := s.instanceRepo.Update(ctx, instance); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Download the required chart
|
||||
if err := s.downloadChart(ctx, registry, instance); err != nil {
|
||||
instance.MarkFailure("Failed to download chart", err)
|
||||
_ = s.instanceRepo.Update(ctx, instance)
|
||||
return err
|
||||
}
|
||||
binding, err := s.ensureTenantForInstance(ctx, principal, cluster, instance)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := s.precheckInstanceQuota(ctx, principal, cluster, binding, instance, existingInstance); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
instance.BeginOperation(entity.OperationUpgrade, "Pending upgrade")
|
||||
if err := s.instanceRepo.Update(ctx, instance); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@ -364,9 +390,32 @@ func (s *InstanceService) ListInstancesByCluster(ctx context.Context, clusterID
|
||||
visible = append(visible, instance)
|
||||
}
|
||||
}
|
||||
s.enrichOwnerUsernames(ctx, visible)
|
||||
return visible, nil
|
||||
}
|
||||
|
||||
func (s *InstanceService) enrichOwnerUsernames(ctx context.Context, instances []*entity.Instance) {
|
||||
if s.userRepo == nil || len(instances) == 0 {
|
||||
return
|
||||
}
|
||||
usernames := make(map[string]string)
|
||||
for _, instance := range instances {
|
||||
if instance == nil || instance.OwnerID == "" {
|
||||
continue
|
||||
}
|
||||
if username, ok := usernames[instance.OwnerID]; ok {
|
||||
instance.OwnerUsername = username
|
||||
continue
|
||||
}
|
||||
user, err := s.userRepo.GetByID(ctx, instance.OwnerID)
|
||||
if err != nil || user == nil {
|
||||
continue
|
||||
}
|
||||
usernames[instance.OwnerID] = user.Username
|
||||
instance.OwnerUsername = user.Username
|
||||
}
|
||||
}
|
||||
|
||||
// ListInstanceEntries lists the entry points (Service / Ingress) associated with an instance.
|
||||
func (s *InstanceService) ListInstanceEntries(ctx context.Context, clusterID, instanceID string) ([]*entity.InstanceEntry, error) {
|
||||
instance, err := s.GetInstance(ctx, instanceID)
|
||||
@ -442,27 +491,57 @@ func (s *InstanceService) ScaleInstance(ctx context.Context, clusterID, instance
|
||||
if !s.canWriteInstance(principal, instance) {
|
||||
return nil, entity.ErrForbidden
|
||||
}
|
||||
if instance.ClusterID != clusterID {
|
||||
return nil, entity.ErrInstanceNotFound
|
||||
}
|
||||
cluster, err := s.clusterRepo.GetByID(ctx, clusterID)
|
||||
if err != nil {
|
||||
return nil, entity.ErrClusterNotFound
|
||||
}
|
||||
|
||||
current := cloneInstanceForQuota(instance)
|
||||
currentValues, err := s.helmClient.GetValues(ctx, cluster, instance.Name, instance.Namespace)
|
||||
if err == nil && currentValues != nil {
|
||||
current.SetValues(currentValues)
|
||||
}
|
||||
target := cloneInstanceForQuota(instance)
|
||||
targetValues := copyValues(current.Values)
|
||||
if targetValues == nil {
|
||||
targetValues = copyValues(instance.Values)
|
||||
}
|
||||
if targetValues == nil {
|
||||
targetValues = map[string]interface{}{}
|
||||
}
|
||||
targetValues["replicaCount"] = replicas
|
||||
target.SetValues(targetValues)
|
||||
registry, err := s.registryRepo.GetByID(ctx, instance.RegistryID)
|
||||
if err != nil {
|
||||
return nil, entity.ErrRegistryNotFound
|
||||
}
|
||||
if err := s.downloadChart(ctx, registry, target); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
binding, err := s.ensureTenantForInstance(ctx, principal, cluster, target)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := s.precheckInstanceQuota(ctx, principal, cluster, binding, target, current); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Scale via K8s API directly (like kubectl scale deploy --replicas=N)
|
||||
if s.scaleClient != nil {
|
||||
if err := s.scaleClient.ScaleDeployment(ctx, cluster, instance.Namespace, instance.Name, int32(replicas)); err != nil {
|
||||
return nil, fmt.Errorf("failed to scale deployment: %w", err)
|
||||
}
|
||||
instance.SetValues(targetValues)
|
||||
instance.Replicas = replicas
|
||||
if err := s.instanceRepo.Update(ctx, instance); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
// Fallback: Helm upgrade with replicaCount
|
||||
vals, err := s.helmClient.GetValues(ctx, cluster, instance.Name, instance.Namespace)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get current values: %w", err)
|
||||
}
|
||||
if vals == nil {
|
||||
vals = make(map[string]interface{})
|
||||
}
|
||||
vals["replicaCount"] = replicas
|
||||
instance.SetValues(vals)
|
||||
instance.SetValues(targetValues)
|
||||
instance.BeginOperation(entity.OperationUpgrade, fmt.Sprintf("Scaling to %d replicas", replicas))
|
||||
if err := s.instanceRepo.Update(ctx, instance); err != nil {
|
||||
return nil, err
|
||||
@ -516,6 +595,9 @@ func (s *InstanceService) GetInstanceValuesDiff(ctx context.Context, clusterID,
|
||||
if !s.canReadInstance(principal, instance) {
|
||||
return nil, entity.ErrInstanceNotFound
|
||||
}
|
||||
if instance.ClusterID != clusterID {
|
||||
return nil, entity.ErrInstanceNotFound
|
||||
}
|
||||
cluster, err := s.clusterRepo.GetByID(ctx, clusterID)
|
||||
if err != nil {
|
||||
return nil, entity.ErrClusterNotFound
|
||||
@ -528,6 +610,18 @@ func (s *InstanceService) GetInstanceValuesDiff(ctx context.Context, clusterID,
|
||||
|
||||
// Get default values from the chart archive
|
||||
chartPath := s.chartArchivePath(instance)
|
||||
if _, statErr := os.Stat(chartPath); statErr != nil {
|
||||
if !errors.Is(statErr, os.ErrNotExist) {
|
||||
return nil, fmt.Errorf("failed to inspect chart defaults: %w", statErr)
|
||||
}
|
||||
registry, err := s.registryRepo.GetByID(ctx, instance.RegistryID)
|
||||
if err != nil {
|
||||
return nil, entity.ErrRegistryNotFound
|
||||
}
|
||||
if err := s.downloadChart(ctx, registry, instance); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
defaults, err := s.helmClient.GetChartDefaultValues(chartPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read chart defaults: %w", err)
|
||||
@ -593,9 +687,6 @@ func (s *InstanceService) applyNamespacePolicy(ctx context.Context, principal *a
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if isReservedNamespace(instance.Namespace) {
|
||||
return entity.ErrInvalidNamespace
|
||||
}
|
||||
if cluster.Visibility != authz.VisibilityPrivate || cluster.OwnerID != principal.UserID {
|
||||
namespace := principal.Namespace
|
||||
if namespace == "" {
|
||||
@ -606,9 +697,15 @@ func (s *InstanceService) applyNamespacePolicy(ctx context.Context, principal *a
|
||||
namespace = binding.Namespace
|
||||
}
|
||||
}
|
||||
if instance.Namespace != "" && instance.Namespace != namespace {
|
||||
return entity.ErrForbidden
|
||||
}
|
||||
instance.Namespace = namespace
|
||||
return nil
|
||||
}
|
||||
if isReservedNamespace(instance.Namespace) {
|
||||
return entity.ErrInvalidNamespace
|
||||
}
|
||||
if instance.Namespace == "" {
|
||||
if cluster.DefaultNamespace != "" {
|
||||
instance.Namespace = cluster.DefaultNamespace
|
||||
@ -621,8 +718,62 @@ func (s *InstanceService) applyNamespacePolicy(ctx context.Context, principal *a
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *InstanceService) ensureTenantForInstance(ctx context.Context, principal *authz.Principal, cluster *entity.Cluster, instance *entity.Instance) error {
|
||||
func (s *InstanceService) ensureTenantForInstance(ctx context.Context, principal *authz.Principal, cluster *entity.Cluster, instance *entity.Instance) (*entity.WorkspaceClusterBinding, error) {
|
||||
if principal.IsAdmin() || s.workspaceRepo == nil || s.tenantClient == nil {
|
||||
return nil, nil
|
||||
}
|
||||
workspace, err := s.workspaceRepo.GetByID(ctx, principal.WorkspaceID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if workspace.Status == entity.WorkspaceSuspended {
|
||||
return nil, entity.ErrWorkspaceSuspended
|
||||
}
|
||||
binding := &entity.WorkspaceClusterBinding{
|
||||
ID: uuid.New().String(),
|
||||
WorkspaceID: workspace.ID,
|
||||
ClusterID: cluster.ID,
|
||||
Namespace: instance.Namespace,
|
||||
ServiceAccount: workspace.K8sSAName,
|
||||
QuotaCPU: strings.TrimSpace(workspace.QuotaCPU),
|
||||
QuotaMemory: strings.TrimSpace(workspace.QuotaMemory),
|
||||
QuotaGPU: zeroIfEmptyQuota(workspace.QuotaGPU),
|
||||
QuotaGPUMem: zeroIfEmptyQuota(workspace.QuotaGPUMem),
|
||||
Status: "active",
|
||||
CreatedAt: time.Now(),
|
||||
UpdatedAt: time.Now(),
|
||||
}
|
||||
if s.bindingRepo != nil {
|
||||
if existing, err := s.bindingRepo.Get(ctx, workspace.ID, cluster.ID); err == nil && existing != nil {
|
||||
binding.ID = existing.ID
|
||||
binding.CreatedAt = existing.CreatedAt
|
||||
if existing.Namespace != "" {
|
||||
binding.Namespace = existing.Namespace
|
||||
instance.Namespace = existing.Namespace
|
||||
enforceNamespaceValues(instance)
|
||||
}
|
||||
if existing.ServiceAccount != "" {
|
||||
binding.ServiceAccount = existing.ServiceAccount
|
||||
}
|
||||
if existing.Status != "" {
|
||||
binding.Status = existing.Status
|
||||
}
|
||||
}
|
||||
}
|
||||
tenantBinding := tenantBindingFromWorkspaceClusterBinding(binding)
|
||||
if err := s.tenantClient.EnsureTenant(ctx, cluster, tenantBinding); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if s.bindingRepo != nil {
|
||||
if err := s.bindingRepo.Upsert(ctx, binding); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return binding, nil
|
||||
}
|
||||
|
||||
func (s *InstanceService) precheckInstanceQuota(ctx context.Context, principal *authz.Principal, cluster *entity.Cluster, binding *entity.WorkspaceClusterBinding, target, current *entity.Instance) error {
|
||||
if principal.IsAdmin() || s.workspaceRepo == nil || s.helmClient == nil {
|
||||
return nil
|
||||
}
|
||||
workspace, err := s.workspaceRepo.GetByID(ctx, principal.WorkspaceID)
|
||||
@ -632,29 +783,45 @@ func (s *InstanceService) ensureTenantForInstance(ctx context.Context, principal
|
||||
if workspace.Status == entity.WorkspaceSuspended {
|
||||
return entity.ErrWorkspaceSuspended
|
||||
}
|
||||
binding := entity.NewTenantBinding(instance.Namespace)
|
||||
binding.ServiceAccountName = workspace.K8sSAName
|
||||
binding.ResourceQuotaHard = instanceResourceQuotaHard(workspace)
|
||||
if err := s.tenantClient.EnsureTenant(ctx, cluster, binding); err != nil {
|
||||
return err
|
||||
if binding == nil {
|
||||
binding = &entity.WorkspaceClusterBinding{
|
||||
WorkspaceID: principal.WorkspaceID,
|
||||
ClusterID: cluster.ID,
|
||||
Namespace: target.Namespace,
|
||||
QuotaCPU: strings.TrimSpace(workspace.QuotaCPU),
|
||||
QuotaMemory: strings.TrimSpace(workspace.QuotaMemory),
|
||||
QuotaGPU: zeroIfEmptyQuota(workspace.QuotaGPU),
|
||||
QuotaGPUMem: zeroIfEmptyQuota(workspace.QuotaGPUMem),
|
||||
}
|
||||
}
|
||||
if s.bindingRepo != nil {
|
||||
_ = s.bindingRepo.Upsert(ctx, &entity.WorkspaceClusterBinding{
|
||||
ID: uuid.New().String(),
|
||||
WorkspaceID: workspace.ID,
|
||||
ClusterID: cluster.ID,
|
||||
Namespace: instance.Namespace,
|
||||
ServiceAccount: workspace.K8sSAName,
|
||||
QuotaCPU: workspace.QuotaCPU,
|
||||
QuotaMemory: workspace.QuotaMemory,
|
||||
QuotaGPU: workspace.QuotaGPU,
|
||||
QuotaGPUMem: workspace.QuotaGPUMem,
|
||||
Status: "active",
|
||||
CreatedAt: time.Now(),
|
||||
UpdatedAt: time.Now(),
|
||||
})
|
||||
var usage *repository.ResourceQuotaUsage
|
||||
if s.tenantClient != nil {
|
||||
tenantBinding := tenantBindingFromWorkspaceClusterBinding(binding)
|
||||
quotaUsage, err := s.tenantClient.GetResourceQuotaUsage(ctx, cluster, tenantBinding)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
usage = quotaUsage
|
||||
}
|
||||
return nil
|
||||
result, err := NewQuotaPrecheckService(s.helmClient).EstimateAndCompareBinding(ctx, cluster, binding, usage, target, current)
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
if errors.Is(err, ErrQuotaExceeded) && result != nil {
|
||||
return fmt.Errorf("%w: %s", ErrQuotaExceeded, formatQuotaExceeded(result.Exceeded))
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func formatQuotaExceeded(exceeded []QuotaExceededResource) string {
|
||||
if len(exceeded) == 0 {
|
||||
return "requested resources exceed workspace quota"
|
||||
}
|
||||
parts := make([]string, 0, len(exceeded))
|
||||
for _, item := range exceeded {
|
||||
parts = append(parts, fmt.Sprintf("%s required=%s quota=%s", item.Name, item.Required, item.Hard))
|
||||
}
|
||||
return strings.Join(parts, "; ")
|
||||
}
|
||||
|
||||
func instanceResourceQuotaHard(workspace *entity.Workspace) corev1.ResourceList {
|
||||
@ -687,6 +854,46 @@ func instanceResourceQuotaHard(workspace *entity.Workspace) corev1.ResourceList
|
||||
return hard
|
||||
}
|
||||
|
||||
func tenantBindingFromWorkspaceClusterBinding(binding *entity.WorkspaceClusterBinding) entity.TenantBinding {
|
||||
namespace := ""
|
||||
if binding != nil {
|
||||
namespace = binding.Namespace
|
||||
}
|
||||
tenantBinding := entity.NewTenantBinding(namespace)
|
||||
if binding != nil {
|
||||
tenantBinding.ServiceAccountName = binding.ServiceAccount
|
||||
tenantBinding.ResourceQuotaHard = bindingQuotaHard(binding)
|
||||
}
|
||||
return tenantBinding
|
||||
}
|
||||
|
||||
// zeroIfEmptyQuota normalises a quota string: surrounding whitespace is
// trimmed, and an empty or whitespace-only value becomes "0".
func zeroIfEmptyQuota(value string) string {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return "0"
	}
	return trimmed
}
|
||||
|
||||
func cloneInstanceForQuota(instance *entity.Instance) *entity.Instance {
|
||||
if instance == nil {
|
||||
return nil
|
||||
}
|
||||
cloned := *instance
|
||||
cloned.SetValues(copyValues(instance.Values))
|
||||
return &cloned
|
||||
}
|
||||
|
||||
// copyValues returns a shallow copy of values: top-level keys are copied into
// a new map, while nested maps and slices are still shared with the original.
// A nil map yields nil.
func copyValues(values map[string]interface{}) map[string]interface{} {
	if values == nil {
		return nil
	}
	copied := make(map[string]interface{}, len(values))
	for key, value := range values {
		copied[key] = value
	}
	return copied
}
|
||||
|
||||
func isReservedNamespace(namespace string) bool {
|
||||
switch namespace {
|
||||
case "default", "kube-system", "kube-public", "kube-node-lease":
|
||||
|
||||
@ -10,6 +10,7 @@ import (
|
||||
"github.com/ocdp/cluster-service/internal/domain/entity"
|
||||
"github.com/ocdp/cluster-service/internal/domain/repository"
|
||||
"github.com/ocdp/cluster-service/internal/pkg/authz"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
)
|
||||
|
||||
func TestDeleteInstanceIgnoresMissingRelease(t *testing.T) {
|
||||
@ -85,6 +86,210 @@ func TestEnforceNamespaceValuesOverridesChartNamespaceKnobs(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyNamespacePolicyRejectsMismatchedTenantNamespace(t *testing.T) {
|
||||
principal := &authz.Principal{
|
||||
UserID: "user-1",
|
||||
Username: "alice",
|
||||
Role: authz.RoleUser,
|
||||
WorkspaceID: "workspace-1",
|
||||
WorkspaceName: "alice",
|
||||
Namespace: "ocdp-u-alice",
|
||||
}
|
||||
cluster := &entity.Cluster{
|
||||
ID: "cluster-1",
|
||||
OwnerID: "admin",
|
||||
Visibility: authz.VisibilityWorkspaceShared,
|
||||
}
|
||||
instance := &entity.Instance{Namespace: "other-namespace"}
|
||||
svc := NewInstanceService(nil, nil, nil, nil, nil, nil)
|
||||
|
||||
if err := svc.applyNamespacePolicy(context.Background(), principal, cluster, instance); !errors.Is(err, entity.ErrForbidden) {
|
||||
t.Fatalf("expected ErrForbidden for mismatched tenant namespace, got %v", err)
|
||||
}
|
||||
if instance.Namespace != "other-namespace" {
|
||||
t.Fatalf("expected namespace to remain unchanged on rejection, got %q", instance.Namespace)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyNamespacePolicyAllowsTenantNamespace(t *testing.T) {
|
||||
principal := &authz.Principal{
|
||||
UserID: "user-1",
|
||||
Username: "alice",
|
||||
Role: authz.RoleUser,
|
||||
WorkspaceID: "workspace-1",
|
||||
WorkspaceName: "alice",
|
||||
Namespace: "ocdp-u-alice",
|
||||
}
|
||||
cluster := &entity.Cluster{
|
||||
ID: "cluster-1",
|
||||
OwnerID: "admin",
|
||||
Visibility: authz.VisibilityWorkspaceShared,
|
||||
}
|
||||
instance := &entity.Instance{Namespace: "ocdp-u-alice"}
|
||||
svc := NewInstanceService(nil, nil, nil, nil, nil, nil)
|
||||
|
||||
if err := svc.applyNamespacePolicy(context.Background(), principal, cluster, instance); err != nil {
|
||||
t.Fatalf("expected matching tenant namespace to be allowed, got %v", err)
|
||||
}
|
||||
if instance.Namespace != "ocdp-u-alice" {
|
||||
t.Fatalf("expected namespace to remain the allowed tenant namespace, got %q", instance.Namespace)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichReplicasSetsLiveReplicaCount(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
cluster := &entity.Cluster{ID: "cluster-1", Name: "cluster"}
|
||||
svc := NewInstanceService(nil, &stubClusterRepo{cluster: cluster}, nil, nil, nil, nil)
|
||||
svc.SetScaleClient(&stubScaleClient{replicas: 3})
|
||||
|
||||
instances := []*entity.Instance{{
|
||||
ID: "inst-1",
|
||||
ClusterID: "cluster-1",
|
||||
Name: "demo",
|
||||
Namespace: "ocdp-u-alice",
|
||||
Replicas: 1,
|
||||
}}
|
||||
|
||||
enriched := svc.EnrichReplicas(ctx, "cluster-1", instances)
|
||||
if enriched[0].Replicas != 3 {
|
||||
t.Fatalf("expected live replicas to overwrite stored count, got %d", enriched[0].Replicas)
|
||||
}
|
||||
}
|
||||
|
||||
func TestListInstancesByClusterHydratesOwnerUsername(t *testing.T) {
|
||||
ctx := authz.WithPrincipal(context.Background(), &authz.Principal{
|
||||
UserID: "admin-1",
|
||||
Username: "admin",
|
||||
Role: authz.RoleAdmin,
|
||||
WorkspaceID: "workspace-admin",
|
||||
})
|
||||
instanceRepo := persistencemock.NewInstanceRepositoryMock()
|
||||
userRepo := persistencemock.NewUserRepositoryMock()
|
||||
if err := userRepo.Create(ctx, &entity.User{ID: "user-1", Username: "alice", PasswordHash: "hash", Role: "user", WorkspaceID: "workspace-1"}); err != nil {
|
||||
t.Fatalf("failed to seed user: %v", err)
|
||||
}
|
||||
instance := &entity.Instance{
|
||||
ID: "inst-1",
|
||||
WorkspaceID: "workspace-1",
|
||||
OwnerID: "user-1",
|
||||
ClusterID: "cluster-1",
|
||||
Name: "demo",
|
||||
Namespace: "ocdp-u-alice",
|
||||
}
|
||||
if err := instanceRepo.Create(ctx, instance); err != nil {
|
||||
t.Fatalf("failed to seed instance: %v", err)
|
||||
}
|
||||
svc := NewInstanceService(
|
||||
instanceRepo,
|
||||
&stubClusterRepo{cluster: &entity.Cluster{ID: "cluster-1", Name: "cluster"}},
|
||||
nil,
|
||||
nil,
|
||||
nil,
|
||||
nil,
|
||||
)
|
||||
svc.SetUserRepository(userRepo)
|
||||
|
||||
instances, err := svc.ListInstancesByCluster(ctx, "cluster-1")
|
||||
if err != nil {
|
||||
t.Fatalf("ListInstancesByCluster returned error: %v", err)
|
||||
}
|
||||
if len(instances) != 1 {
|
||||
t.Fatalf("expected 1 instance, got %d", len(instances))
|
||||
}
|
||||
if instances[0].OwnerUsername != "alice" {
|
||||
t.Fatalf("expected owner username alice, got %q", instances[0].OwnerUsername)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateInstanceRejectsGPUWhenWorkspaceQuotaEmptyBeforeCreate(t *testing.T) {
|
||||
ctx := authz.WithPrincipal(context.Background(), &authz.Principal{
|
||||
UserID: "user-ivanwu",
|
||||
Username: "ivanwu",
|
||||
Role: authz.RoleUser,
|
||||
WorkspaceID: "workspace-ivanwu",
|
||||
WorkspaceName: "ivanwu",
|
||||
Namespace: "ocdp-u-ivanwu",
|
||||
})
|
||||
instanceRepo := persistencemock.NewInstanceRepositoryMock()
|
||||
workspaceRepo := persistencemock.NewWorkspaceRepositoryMock()
|
||||
bindingRepo := persistencemock.NewWorkspaceClusterBindingRepositoryMock()
|
||||
workspace := entity.NewWorkspace("ivanwu", "admin")
|
||||
workspace.ID = "workspace-ivanwu"
|
||||
workspace.K8sNamespace = "ocdp-u-ivanwu"
|
||||
workspace.K8sSAName = entity.ServiceAccountForNamespace(workspace.K8sNamespace)
|
||||
workspace.QuotaCPU = "8"
|
||||
workspace.QuotaMemory = "32Gi"
|
||||
workspace.QuotaGPU = ""
|
||||
workspace.QuotaGPUMem = ""
|
||||
if err := workspaceRepo.Create(ctx, workspace); err != nil {
|
||||
t.Fatalf("seed workspace: %v", err)
|
||||
}
|
||||
|
||||
cluster := &entity.Cluster{
|
||||
ID: "k3s",
|
||||
Name: "k3s",
|
||||
Host: "https://k3s.invalid",
|
||||
Token: "token",
|
||||
OwnerID: "admin",
|
||||
Visibility: authz.VisibilityGlobalShared,
|
||||
}
|
||||
registry := &entity.Registry{
|
||||
ID: "registry-1",
|
||||
Name: "harbor",
|
||||
URL: "https://harbor.invalid",
|
||||
OwnerID: "admin",
|
||||
Visibility: authz.VisibilityGlobalShared,
|
||||
}
|
||||
helm := &stubHelmClient{
|
||||
estimate: &repository.ResourceEstimate{
|
||||
Requests: repository.ResourceVector{
|
||||
CPU: resource.MustParse("2"),
|
||||
Memory: resource.MustParse("8Gi"),
|
||||
GPU: 1,
|
||||
GPUMemoryMB: 10000,
|
||||
},
|
||||
},
|
||||
}
|
||||
oci := &stubOCIClient{}
|
||||
svc := NewInstanceService(
|
||||
instanceRepo,
|
||||
&stubClusterRepo{cluster: cluster},
|
||||
&stubRegistryRepo{registry: registry},
|
||||
helm,
|
||||
oci,
|
||||
nil,
|
||||
bindingRepo,
|
||||
)
|
||||
svc.SetTenantProvisioning(workspaceRepo, &recordingTenantClient{usage: &repository.ResourceQuotaUsage{}})
|
||||
|
||||
instance := entity.NewInstance("k3s", "vllm-qwen", "ocdp-u-ivanwu", registry.ID, "library/vllm-serve", "vllm-serve", "0.1.0")
|
||||
instance.SetValues(map[string]interface{}{
|
||||
"image": map[string]interface{}{
|
||||
"repository": "harbor.bwgdi.com/library/vllm-openai",
|
||||
"tag": "v0.17.1",
|
||||
},
|
||||
"model": "Qwen/Qwen2.5-0.5B",
|
||||
})
|
||||
|
||||
err := svc.CreateInstance(ctx, instance)
|
||||
if !errors.Is(err, ErrQuotaExceeded) {
|
||||
t.Fatalf("expected GPU quota rejection, got %v", err)
|
||||
}
|
||||
instances, listErr := instanceRepo.List(ctx)
|
||||
if listErr != nil {
|
||||
t.Fatalf("list instances: %v", listErr)
|
||||
}
|
||||
if len(instances) != 0 {
|
||||
t.Fatalf("expected quota rejection before instance DB create, got %#v", instances)
|
||||
}
|
||||
if helm.installCalls != 0 {
|
||||
t.Fatalf("expected Helm install not to be called, got %d calls", helm.installCalls)
|
||||
}
|
||||
if oci.pullCalls != 1 {
|
||||
t.Fatalf("expected chart pull for quota rendering, got %d pulls", oci.pullCalls)
|
||||
}
|
||||
}
|
||||
|
||||
func waitForInstanceDeleted(t *testing.T, ctx context.Context, repo repository.InstanceRepository, id string) {
|
||||
t.Helper()
|
||||
|
||||
@ -133,13 +338,19 @@ func (*stubClusterRepo) List(ctx context.Context) ([]*entity.Cluster, error) { r
|
||||
|
||||
type stubHelmClient struct {
|
||||
uninstallErr error
|
||||
estimate *repository.ResourceEstimate
|
||||
values map[string]interface{}
|
||||
installCalls int
|
||||
upgradeCalls int
|
||||
}
|
||||
|
||||
func (*stubHelmClient) Install(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance) error {
|
||||
func (s *stubHelmClient) Install(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance) error {
|
||||
s.installCalls++
|
||||
return nil
|
||||
}
|
||||
|
||||
func (*stubHelmClient) Upgrade(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance) error {
|
||||
func (s *stubHelmClient) Upgrade(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance) error {
|
||||
s.upgradeCalls++
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -163,13 +374,116 @@ func (*stubHelmClient) List(ctx context.Context, cluster *entity.Cluster, namesp
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (*stubHelmClient) GetValues(ctx context.Context, cluster *entity.Cluster, releaseName, namespace string) (map[string]interface{}, error) {
|
||||
return nil, nil
|
||||
func (s *stubHelmClient) GetValues(ctx context.Context, cluster *entity.Cluster, releaseName, namespace string) (map[string]interface{}, error) {
|
||||
return s.values, nil
|
||||
}
|
||||
|
||||
func (*stubHelmClient) GetChartDefaultValues(chartPath string) (map[string]interface{}, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (s *stubHelmClient) EstimateInstanceResources(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance) (*repository.ResourceEstimate, error) {
|
||||
if s.estimate != nil {
|
||||
return s.estimate, nil
|
||||
}
|
||||
return &repository.ResourceEstimate{}, nil
|
||||
}
|
||||
|
||||
type stubRegistryRepo struct {
|
||||
registry *entity.Registry
|
||||
}
|
||||
|
||||
func (s *stubRegistryRepo) Create(ctx context.Context, registry *entity.Registry) error {
|
||||
s.registry = registry
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *stubRegistryRepo) GetByID(ctx context.Context, id string) (*entity.Registry, error) {
|
||||
if s.registry != nil && s.registry.ID == id {
|
||||
return s.registry, nil
|
||||
}
|
||||
return nil, entity.ErrRegistryNotFound
|
||||
}
|
||||
|
||||
func (s *stubRegistryRepo) GetByName(ctx context.Context, name string) (*entity.Registry, error) {
|
||||
if s.registry != nil && s.registry.Name == name {
|
||||
return s.registry, nil
|
||||
}
|
||||
return nil, entity.ErrRegistryNotFound
|
||||
}
|
||||
|
||||
func (s *stubRegistryRepo) Update(ctx context.Context, registry *entity.Registry) error {
|
||||
s.registry = registry
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *stubRegistryRepo) Delete(ctx context.Context, id string) error {
|
||||
if s.registry != nil && s.registry.ID == id {
|
||||
s.registry = nil
|
||||
return nil
|
||||
}
|
||||
return entity.ErrRegistryNotFound
|
||||
}
|
||||
|
||||
func (s *stubRegistryRepo) List(ctx context.Context) ([]*entity.Registry, error) {
|
||||
if s.registry == nil {
|
||||
return nil, nil
|
||||
}
|
||||
return []*entity.Registry{s.registry}, nil
|
||||
}
|
||||
|
||||
type stubOCIClient struct {
|
||||
pullCalls int
|
||||
}
|
||||
|
||||
func (*stubOCIClient) ListRepositories(ctx context.Context, registry *entity.Registry, artifactType string) ([]string, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (*stubOCIClient) ListArtifacts(ctx context.Context, registry *entity.Registry, repositoryName, mediaTypeFilter string) ([]*entity.Artifact, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (*stubOCIClient) GetArtifact(ctx context.Context, registry *entity.Registry, repositoryName, reference string) (*entity.Artifact, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (*stubOCIClient) GetValuesSchema(ctx context.Context, registry *entity.Registry, repositoryName, reference string) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (*stubOCIClient) GetValuesYAML(ctx context.Context, registry *entity.Registry, repositoryName, reference string) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (s *stubOCIClient) PullArtifact(ctx context.Context, registry *entity.Registry, repositoryName, reference, destPath string) error {
|
||||
s.pullCalls++
|
||||
return nil
|
||||
}
|
||||
|
||||
func (*stubOCIClient) PushArtifact(ctx context.Context, registry *entity.Registry, repositoryName, tag, sourcePath string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (*stubOCIClient) CheckHealth(ctx context.Context, registry *entity.Registry) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
type stubScaleClient struct {
|
||||
replicas int32
|
||||
}
|
||||
|
||||
func (s *stubScaleClient) GetDeploymentReplicas(ctx context.Context, cluster *entity.Cluster, namespace, releaseName string) (int32, error) {
|
||||
return s.replicas, nil
|
||||
}
|
||||
|
||||
func (s *stubScaleClient) ScaleDeployment(ctx context.Context, cluster *entity.Cluster, namespace, releaseName string, replicas int32) error {
|
||||
s.replicas = replicas
|
||||
return nil
|
||||
}
|
||||
|
||||
var _ repository.ClusterRepository = (*stubClusterRepo)(nil)
|
||||
var _ repository.RegistryRepository = (*stubRegistryRepo)(nil)
|
||||
var _ repository.HelmClient = (*stubHelmClient)(nil)
|
||||
var _ repository.OCIClient = (*stubOCIClient)(nil)
|
||||
var _ ScaleClient = (*stubScaleClient)(nil)
|
||||
|
||||
@ -3,6 +3,7 @@ package service
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"github.com/ocdp/cluster-service/internal/domain/entity"
|
||||
"github.com/ocdp/cluster-service/internal/domain/repository"
|
||||
@ -13,16 +14,22 @@ import (
|
||||
type MonitoringService struct {
|
||||
clusterRepo repository.ClusterRepository
|
||||
metricsClient repository.MetricsClient
|
||||
instanceRepo repository.InstanceRepository
|
||||
userRepo repository.UserRepository
|
||||
}
|
||||
|
||||
// NewMonitoringService 创建监控服务
|
||||
func NewMonitoringService(
|
||||
clusterRepo repository.ClusterRepository,
|
||||
metricsClient repository.MetricsClient,
|
||||
instanceRepo repository.InstanceRepository,
|
||||
userRepo repository.UserRepository,
|
||||
) *MonitoringService {
|
||||
return &MonitoringService{
|
||||
clusterRepo: clusterRepo,
|
||||
metricsClient: metricsClient,
|
||||
instanceRepo: instanceRepo,
|
||||
userRepo: userRepo,
|
||||
}
|
||||
}
|
||||
|
||||
@ -43,6 +50,8 @@ func (s *MonitoringService) GetClusterMonitoring(ctx context.Context, clusterID
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get cluster metrics: %w", err)
|
||||
}
|
||||
s.enrichResourceUsage(ctx, principal, metrics)
|
||||
s.scopeTenantMetrics(principal, metrics)
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
@ -75,12 +84,310 @@ func (s *MonitoringService) ListClusterMonitoring(ctx context.Context) ([]*entit
|
||||
Status: "unknown",
|
||||
}
|
||||
}
|
||||
s.enrichResourceUsage(ctx, principal, metrics)
|
||||
s.scopeTenantMetrics(principal, metrics)
|
||||
result = append(result, metrics)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (s *MonitoringService) enrichResourceUsage(ctx context.Context, principal *authz.Principal, metrics *entity.ClusterMetrics) {
|
||||
if metrics == nil || s.instanceRepo == nil || s.metricsClient == nil {
|
||||
s.addVisibleUserRows(ctx, principal, metrics)
|
||||
return
|
||||
}
|
||||
instances, err := s.instanceRepo.ListByCluster(ctx, metrics.ClusterID)
|
||||
if err != nil {
|
||||
fmt.Printf("Warning: failed to list instances for cluster %s resource usage: %v\n", metrics.ClusterID, err)
|
||||
s.addVisibleUserRows(ctx, principal, metrics)
|
||||
return
|
||||
}
|
||||
allocations, err := s.metricsClient.GetPodResourceAllocations(ctx, metrics.ClusterID)
|
||||
if err != nil {
|
||||
fmt.Printf("Warning: failed to list pod resource allocations for cluster %s: %v\n", metrics.ClusterID, err)
|
||||
s.addVisibleUserRows(ctx, principal, metrics)
|
||||
return
|
||||
}
|
||||
|
||||
visibleInstances := make(map[string]*entity.Instance)
|
||||
for _, instance := range instances {
|
||||
if instance == nil || !canReadMonitoringInstance(principal, instance) {
|
||||
continue
|
||||
}
|
||||
key := monitoringInstanceKey(instance.Namespace, instance.Name)
|
||||
visibleInstances[key] = instance
|
||||
}
|
||||
|
||||
type usageAccumulator struct {
|
||||
userID string
|
||||
username string
|
||||
workspaceID string
|
||||
allocation entity.ResourceAllocation
|
||||
podCount int
|
||||
instances map[string]struct{}
|
||||
}
|
||||
byUser := make(map[string]*usageAccumulator)
|
||||
total := entity.ResourceAllocation{}
|
||||
|
||||
for _, pod := range allocations {
|
||||
if pod == nil {
|
||||
continue
|
||||
}
|
||||
instance := visibleInstances[monitoringInstanceKey(pod.Namespace, pod.InstanceName)]
|
||||
if instance == nil {
|
||||
continue
|
||||
}
|
||||
total = addResourceAllocation(total, pod.Allocation)
|
||||
username := instance.OwnerUsername
|
||||
if username == "" {
|
||||
username = s.usernameForOwner(ctx, instance.OwnerID, principal)
|
||||
}
|
||||
acc := byUser[instance.OwnerID]
|
||||
if acc == nil {
|
||||
acc = &usageAccumulator{
|
||||
userID: instance.OwnerID,
|
||||
username: username,
|
||||
workspaceID: instance.WorkspaceID,
|
||||
instances: map[string]struct{}{},
|
||||
}
|
||||
byUser[instance.OwnerID] = acc
|
||||
}
|
||||
if acc.username == "" {
|
||||
acc.username = username
|
||||
}
|
||||
acc.allocation = addResourceAllocation(acc.allocation, pod.Allocation)
|
||||
acc.podCount++
|
||||
acc.instances[instance.ID] = struct{}{}
|
||||
}
|
||||
|
||||
metrics.CPURequests = formatCPUAllocation(total.CPURequestsMilli)
|
||||
metrics.CPULimits = formatCPUAllocation(total.CPULimitsMilli)
|
||||
metrics.MemoryRequests = formatMemoryAllocation(total.MemoryRequestsBytes)
|
||||
metrics.MemoryLimits = formatMemoryAllocation(total.MemoryLimitsBytes)
|
||||
metrics.GPURequests = total.GPURequests
|
||||
metrics.GPULimits = total.GPULimits
|
||||
metrics.GPUMemoryRequestsMB = total.GPUMemoryRequestsMB
|
||||
metrics.GPUMemoryLimitsMB = total.GPUMemoryLimitsMB
|
||||
metrics.AllocatedGPU = total.GPURequests
|
||||
metrics.AllocatedGPUMemoryMB = total.GPUMemoryRequestsMB
|
||||
|
||||
userIDs := make([]string, 0, len(byUser))
|
||||
for userID := range byUser {
|
||||
userIDs = append(userIDs, userID)
|
||||
}
|
||||
sort.Slice(userIDs, func(i, j int) bool {
|
||||
left := byUser[userIDs[i]]
|
||||
right := byUser[userIDs[j]]
|
||||
if left.username == right.username {
|
||||
return left.userID < right.userID
|
||||
}
|
||||
return left.username < right.username
|
||||
})
|
||||
|
||||
usage := make([]entity.UserResourceUsage, 0, len(userIDs))
|
||||
for _, userID := range userIDs {
|
||||
acc := byUser[userID]
|
||||
usage = append(usage, entity.UserResourceUsage{
|
||||
UserID: acc.userID,
|
||||
Username: acc.username,
|
||||
WorkspaceID: acc.workspaceID,
|
||||
InstanceCount: len(acc.instances),
|
||||
PodCount: acc.podCount,
|
||||
CPURequests: formatCPUAllocation(acc.allocation.CPURequestsMilli),
|
||||
CPULimits: formatCPUAllocation(acc.allocation.CPULimitsMilli),
|
||||
MemoryRequests: formatMemoryAllocation(acc.allocation.MemoryRequestsBytes),
|
||||
MemoryLimits: formatMemoryAllocation(acc.allocation.MemoryLimitsBytes),
|
||||
GPURequests: acc.allocation.GPURequests,
|
||||
GPULimits: acc.allocation.GPULimits,
|
||||
GPUMemoryRequestsMB: acc.allocation.GPUMemoryRequestsMB,
|
||||
GPUMemoryLimitsMB: acc.allocation.GPUMemoryLimitsMB,
|
||||
})
|
||||
}
|
||||
metrics.ResourceUsageByUser = usage
|
||||
s.addVisibleUserRows(ctx, principal, metrics)
|
||||
}
|
||||
|
||||
func (s *MonitoringService) addVisibleUserRows(ctx context.Context, principal *authz.Principal, metrics *entity.ClusterMetrics) {
|
||||
if principal == nil || metrics == nil {
|
||||
return
|
||||
}
|
||||
existing := make(map[string]struct{}, len(metrics.ResourceUsageByUser))
|
||||
for _, row := range metrics.ResourceUsageByUser {
|
||||
if row.UserID != "" {
|
||||
existing[row.UserID] = struct{}{}
|
||||
}
|
||||
}
|
||||
appendEmpty := func(userID, username, workspaceID string) {
|
||||
if userID == "" {
|
||||
return
|
||||
}
|
||||
if _, ok := existing[userID]; ok {
|
||||
return
|
||||
}
|
||||
metrics.ResourceUsageByUser = append(metrics.ResourceUsageByUser, entity.UserResourceUsage{
|
||||
UserID: userID,
|
||||
Username: username,
|
||||
WorkspaceID: workspaceID,
|
||||
InstanceCount: 0,
|
||||
PodCount: 0,
|
||||
CPURequests: "0 cores",
|
||||
CPULimits: "0 cores",
|
||||
MemoryRequests: "0 B",
|
||||
MemoryLimits: "0 B",
|
||||
})
|
||||
existing[userID] = struct{}{}
|
||||
}
|
||||
if !principal.IsAdmin() {
|
||||
appendEmpty(principal.UserID, principal.Username, principal.WorkspaceID)
|
||||
return
|
||||
}
|
||||
if s.userRepo == nil {
|
||||
return
|
||||
}
|
||||
users, err := s.userRepo.List(ctx)
|
||||
if err != nil {
|
||||
fmt.Printf("Warning: failed to list users for monitoring rows: %v\n", err)
|
||||
return
|
||||
}
|
||||
for _, user := range users {
|
||||
if user == nil || user.Role != authz.RoleUser || !user.IsActive {
|
||||
continue
|
||||
}
|
||||
appendEmpty(user.ID, user.Username, user.WorkspaceID)
|
||||
}
|
||||
sort.Slice(metrics.ResourceUsageByUser, func(i, j int) bool {
|
||||
left := metrics.ResourceUsageByUser[i]
|
||||
right := metrics.ResourceUsageByUser[j]
|
||||
if left.Username == right.Username {
|
||||
return left.UserID < right.UserID
|
||||
}
|
||||
return left.Username < right.Username
|
||||
})
|
||||
}
|
||||
|
||||
func (s *MonitoringService) scopeTenantMetrics(principal *authz.Principal, metrics *entity.ClusterMetrics) {
|
||||
if principal == nil || principal.IsAdmin() || metrics == nil {
|
||||
return
|
||||
}
|
||||
var total entity.ResourceAllocation
|
||||
podCount := 0
|
||||
instanceCount := 0
|
||||
for _, usage := range metrics.ResourceUsageByUser {
|
||||
if usage.UserID != principal.UserID {
|
||||
continue
|
||||
}
|
||||
podCount += usage.PodCount
|
||||
instanceCount += usage.InstanceCount
|
||||
total.GPURequests += usage.GPURequests
|
||||
total.GPULimits += usage.GPULimits
|
||||
total.GPUMemoryRequestsMB += usage.GPUMemoryRequestsMB
|
||||
total.GPUMemoryLimitsMB += usage.GPUMemoryLimitsMB
|
||||
}
|
||||
metrics.NodeCount = 0
|
||||
metrics.Nodes = nil
|
||||
metrics.PodCount = podCount
|
||||
metrics.TotalCPU = ""
|
||||
metrics.TotalMemory = ""
|
||||
metrics.TotalGPU = 0
|
||||
metrics.UsedCPU = metrics.CPURequests
|
||||
metrics.UsedMemory = metrics.MemoryRequests
|
||||
metrics.UsedGPU = int(total.GPURequests)
|
||||
metrics.CPUUsage = 0
|
||||
metrics.MemoryUsage = 0
|
||||
metrics.GPUUsage = 0
|
||||
metrics.MaxNodeCPU = ""
|
||||
metrics.MaxNodeMemory = ""
|
||||
metrics.MaxNodeGPU = 0
|
||||
metrics.MaxNodeCPUUsage = 0
|
||||
metrics.MaxNodeMemUsage = 0
|
||||
metrics.MaxNodeGPUUsage = 0
|
||||
metrics.ResourceUsageByUser = filterSelfUsage(principal.UserID, metrics.ResourceUsageByUser)
|
||||
if instanceCount == 0 {
|
||||
metrics.CPURequests = ""
|
||||
metrics.CPULimits = ""
|
||||
metrics.MemoryRequests = ""
|
||||
metrics.MemoryLimits = ""
|
||||
metrics.GPURequests = 0
|
||||
metrics.GPULimits = 0
|
||||
metrics.GPUMemoryRequestsMB = 0
|
||||
metrics.GPUMemoryLimitsMB = 0
|
||||
metrics.AllocatedGPU = 0
|
||||
metrics.AllocatedGPUMemoryMB = 0
|
||||
}
|
||||
}
|
||||
|
||||
func filterSelfUsage(userID string, usage []entity.UserResourceUsage) []entity.UserResourceUsage {
|
||||
filtered := make([]entity.UserResourceUsage, 0, len(usage))
|
||||
for _, row := range usage {
|
||||
if row.UserID == userID {
|
||||
filtered = append(filtered, row)
|
||||
}
|
||||
}
|
||||
return filtered
|
||||
}
|
||||
|
||||
func canReadMonitoringInstance(principal *authz.Principal, instance *entity.Instance) bool {
|
||||
if principal == nil || instance == nil {
|
||||
return false
|
||||
}
|
||||
if principal.IsAdmin() {
|
||||
return true
|
||||
}
|
||||
return instance.WorkspaceID == principal.WorkspaceID && instance.OwnerID == principal.UserID
|
||||
}
|
||||
|
||||
func (s *MonitoringService) usernameForOwner(ctx context.Context, ownerID string, principal *authz.Principal) string {
|
||||
if ownerID == "" {
|
||||
return ""
|
||||
}
|
||||
if principal != nil && ownerID == principal.UserID {
|
||||
return principal.Username
|
||||
}
|
||||
if s.userRepo == nil {
|
||||
return ""
|
||||
}
|
||||
user, err := s.userRepo.GetByID(ctx, ownerID)
|
||||
if err != nil || user == nil {
|
||||
return ""
|
||||
}
|
||||
return user.Username
|
||||
}
|
||||
|
||||
// monitoringInstanceKey builds the "namespace/name" lookup key used to match
// pods to instances.
func monitoringInstanceKey(namespace, name string) string {
	return fmt.Sprintf("%s/%s", namespace, name)
}
|
||||
|
||||
func addResourceAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
|
||||
return entity.ResourceAllocation{
|
||||
CPURequestsMilli: left.CPURequestsMilli + right.CPURequestsMilli,
|
||||
CPULimitsMilli: left.CPULimitsMilli + right.CPULimitsMilli,
|
||||
MemoryRequestsBytes: left.MemoryRequestsBytes + right.MemoryRequestsBytes,
|
||||
MemoryLimitsBytes: left.MemoryLimitsBytes + right.MemoryLimitsBytes,
|
||||
GPURequests: left.GPURequests + right.GPURequests,
|
||||
GPULimits: left.GPULimits + right.GPULimits,
|
||||
GPUMemoryRequestsMB: left.GPUMemoryRequestsMB + right.GPUMemoryRequestsMB,
|
||||
GPUMemoryLimitsMB: left.GPUMemoryLimitsMB + right.GPUMemoryLimitsMB,
|
||||
}
|
||||
}
|
||||
|
||||
// formatCPUAllocation renders a millicore amount as cores with two decimals,
// e.g. 500 becomes "0.50 cores".
func formatCPUAllocation(milli int64) string {
	cores := float64(milli) / 1000.0
	return fmt.Sprintf("%.2f cores", cores)
}
|
||||
|
||||
// formatMemoryAllocation renders a byte count in binary units. Values below
// 1024 are printed as "<n> B"; larger values are scaled by powers of 1024 and
// printed with one decimal and a KiB..EiB suffix, e.g. 1536 becomes "1.5 KiB".
func formatMemoryAllocation(bytes int64) string {
	const (
		unit     = 1024
		prefixes = "KMGTPE"
	)
	if bytes < unit {
		return fmt.Sprintf("%d B", bytes)
	}

	// Find the largest power of 1024 that does not exceed bytes.
	scale := int64(unit)
	idx := 0
	remaining := bytes / unit
	for remaining >= unit {
		remaining /= unit
		scale *= unit
		idx++
	}
	scaled := float64(bytes) / float64(scale)
	return fmt.Sprintf("%.1f %ciB", scaled, prefixes[idx])
}
|
||||
|
||||
// GetMonitoringSummary 获取监控汇总信息
|
||||
func (s *MonitoringService) GetMonitoringSummary(ctx context.Context) (*entity.MonitoringSummary, error) {
|
||||
// 获取所有集群监控数据
|
||||
@ -123,6 +430,9 @@ func (s *MonitoringService) GetNodeMetrics(ctx context.Context, clusterID string
|
||||
if !authz.CanReadResource(principal, cluster.WorkspaceID, cluster.OwnerID, cluster.Visibility) {
|
||||
return nil, entity.ErrClusterNotFound
|
||||
}
|
||||
if !principal.IsAdmin() {
|
||||
return nil, entity.ErrForbidden
|
||||
}
|
||||
nodes, err := s.metricsClient.GetNodeMetrics(ctx, clusterID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get node metrics: %w", err)
|
||||
|
||||
228
backend/internal/domain/service/monitoring_service_test.go
Normal file
228
backend/internal/domain/service/monitoring_service_test.go
Normal file
@ -0,0 +1,228 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
persistencemock "github.com/ocdp/cluster-service/internal/adapter/output/persistence/mock"
|
||||
"github.com/ocdp/cluster-service/internal/domain/entity"
|
||||
"github.com/ocdp/cluster-service/internal/pkg/authz"
|
||||
)
|
||||
|
||||
func TestListClusterMonitoringAggregatesResourceUsageForAdmin(t *testing.T) {
|
||||
ctx := authz.WithPrincipal(context.Background(), &authz.Principal{
|
||||
UserID: "admin-1",
|
||||
Username: "admin",
|
||||
Role: authz.RoleAdmin,
|
||||
WorkspaceID: "workspace-admin",
|
||||
})
|
||||
instanceRepo, userRepo := seedMonitoringOwners(t, ctx)
|
||||
svc := NewMonitoringService(
|
||||
&monitoringClusterRepo{clusters: []*entity.Cluster{{ID: "cluster-1", Name: "cluster", Visibility: authz.VisibilityGlobalShared}}},
|
||||
&stubMetricsClient{allocations: monitoringAllocations()},
|
||||
instanceRepo,
|
||||
userRepo,
|
||||
)
|
||||
|
||||
metrics, err := svc.ListClusterMonitoring(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("ListClusterMonitoring returned error: %v", err)
|
||||
}
|
||||
if len(metrics) != 1 {
|
||||
t.Fatalf("expected 1 cluster metric, got %d", len(metrics))
|
||||
}
|
||||
got := metrics[0]
|
||||
if got.AllocatedGPU != 3 || got.AllocatedGPUMemoryMB != 30000 {
|
||||
t.Fatalf("expected total GPU/gpumem allocation 3/30000, got %d/%d", got.AllocatedGPU, got.AllocatedGPUMemoryMB)
|
||||
}
|
||||
if len(got.ResourceUsageByUser) != 2 {
|
||||
t.Fatalf("expected 2 user usage rows, got %d: %#v", len(got.ResourceUsageByUser), got.ResourceUsageByUser)
|
||||
}
|
||||
if got.ResourceUsageByUser[0].Username != "alice" || got.ResourceUsageByUser[0].GPURequests != 1 {
|
||||
t.Fatalf("expected alice GPU request row first, got %#v", got.ResourceUsageByUser[0])
|
||||
}
|
||||
if got.ResourceUsageByUser[1].Username != "bob" || got.ResourceUsageByUser[1].GPURequests != 2 {
|
||||
t.Fatalf("expected bob GPU request row second, got %#v", got.ResourceUsageByUser[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestListClusterMonitoringFiltersResourceUsageForOrdinaryUser(t *testing.T) {
|
||||
ctx := authz.WithPrincipal(context.Background(), &authz.Principal{
|
||||
UserID: "user-1",
|
||||
Username: "alice",
|
||||
Role: authz.RoleUser,
|
||||
WorkspaceID: "workspace-1",
|
||||
})
|
||||
instanceRepo, userRepo := seedMonitoringOwners(t, ctx)
|
||||
svc := NewMonitoringService(
|
||||
&monitoringClusterRepo{clusters: []*entity.Cluster{{ID: "cluster-1", Name: "cluster", Visibility: authz.VisibilityGlobalShared}}},
|
||||
&stubMetricsClient{allocations: monitoringAllocations()},
|
||||
instanceRepo,
|
||||
userRepo,
|
||||
)
|
||||
|
||||
metrics, err := svc.ListClusterMonitoring(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("ListClusterMonitoring returned error: %v", err)
|
||||
}
|
||||
got := metrics[0]
|
||||
if got.AllocatedGPU != 1 || got.AllocatedGPUMemoryMB != 10000 {
|
||||
t.Fatalf("expected ordinary user allocation to be scoped to alice, got %d/%d", got.AllocatedGPU, got.AllocatedGPUMemoryMB)
|
||||
}
|
||||
if len(got.ResourceUsageByUser) != 1 {
|
||||
t.Fatalf("expected only alice usage row, got %d: %#v", len(got.ResourceUsageByUser), got.ResourceUsageByUser)
|
||||
}
|
||||
if got.ResourceUsageByUser[0].UserID != "user-1" || got.ResourceUsageByUser[0].Username != "alice" {
|
||||
t.Fatalf("expected alice usage row, got %#v", got.ResourceUsageByUser[0])
|
||||
}
|
||||
if got.NodeCount != 0 || len(got.Nodes) != 0 || got.TotalCPU != "" || got.TotalMemory != "" {
|
||||
t.Fatalf("expected ordinary user cluster-wide metrics to be sanitized, got nodes=%d/%d totalCPU=%q totalMemory=%q", got.NodeCount, len(got.Nodes), got.TotalCPU, got.TotalMemory)
|
||||
}
|
||||
if got.PodCount != 1 {
|
||||
t.Fatalf("expected ordinary user pod count to be self scoped, got %d", got.PodCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetNodeMetricsForbiddenForOrdinaryUser(t *testing.T) {
|
||||
ctx := authz.WithPrincipal(context.Background(), &authz.Principal{
|
||||
UserID: "user-1",
|
||||
Username: "alice",
|
||||
Role: authz.RoleUser,
|
||||
WorkspaceID: "workspace-1",
|
||||
})
|
||||
svc := NewMonitoringService(
|
||||
&monitoringClusterRepo{clusters: []*entity.Cluster{{ID: "cluster-1", Name: "cluster", Visibility: authz.VisibilityGlobalShared}}},
|
||||
&stubMetricsClient{allocations: monitoringAllocations()},
|
||||
nil,
|
||||
nil,
|
||||
)
|
||||
|
||||
_, err := svc.GetNodeMetrics(ctx, "cluster-1")
|
||||
if err != entity.ErrForbidden {
|
||||
t.Fatalf("expected ordinary user node metrics to be forbidden, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func seedMonitoringOwners(t *testing.T, ctx context.Context) (*persistencemock.InstanceRepositoryMock, *persistencemock.UserRepositoryMock) {
|
||||
t.Helper()
|
||||
instanceRepo := persistencemock.NewInstanceRepositoryMock().(*persistencemock.InstanceRepositoryMock)
|
||||
userRepo := persistencemock.NewUserRepositoryMock().(*persistencemock.UserRepositoryMock)
|
||||
for _, user := range []*entity.User{
|
||||
{ID: "user-1", Username: "alice", PasswordHash: "hash", Role: "user", WorkspaceID: "workspace-1"},
|
||||
{ID: "user-2", Username: "bob", PasswordHash: "hash", Role: "user", WorkspaceID: "workspace-2"},
|
||||
} {
|
||||
if err := userRepo.Create(ctx, user); err != nil {
|
||||
t.Fatalf("failed to seed user %s: %v", user.ID, err)
|
||||
}
|
||||
}
|
||||
for _, instance := range []*entity.Instance{
|
||||
{ID: "inst-1", ClusterID: "cluster-1", Name: "alice-app", Namespace: "ocdp-u-alice", WorkspaceID: "workspace-1", OwnerID: "user-1"},
|
||||
{ID: "inst-2", ClusterID: "cluster-1", Name: "bob-app", Namespace: "ocdp-u-bob", WorkspaceID: "workspace-2", OwnerID: "user-2"},
|
||||
} {
|
||||
if err := instanceRepo.Create(ctx, instance); err != nil {
|
||||
t.Fatalf("failed to seed instance %s: %v", instance.ID, err)
|
||||
}
|
||||
}
|
||||
return instanceRepo, userRepo
|
||||
}
|
||||
|
||||
func monitoringAllocations() []*entity.PodResourceAllocation {
|
||||
return []*entity.PodResourceAllocation{
|
||||
{
|
||||
ClusterID: "cluster-1",
|
||||
Namespace: "ocdp-u-alice",
|
||||
PodName: "alice-app-0",
|
||||
InstanceName: "alice-app",
|
||||
Allocation: entity.ResourceAllocation{
|
||||
CPURequestsMilli: 500,
|
||||
CPULimitsMilli: 1000,
|
||||
MemoryRequestsBytes: 1024 * 1024 * 1024,
|
||||
MemoryLimitsBytes: 2 * 1024 * 1024 * 1024,
|
||||
GPURequests: 1,
|
||||
GPULimits: 1,
|
||||
GPUMemoryRequestsMB: 10000,
|
||||
GPUMemoryLimitsMB: 10000,
|
||||
},
|
||||
},
|
||||
{
|
||||
ClusterID: "cluster-1",
|
||||
Namespace: "ocdp-u-bob",
|
||||
PodName: "bob-app-0",
|
||||
InstanceName: "bob-app",
|
||||
Allocation: entity.ResourceAllocation{
|
||||
CPURequestsMilli: 2000,
|
||||
CPULimitsMilli: 4000,
|
||||
MemoryRequestsBytes: 4 * 1024 * 1024 * 1024,
|
||||
MemoryLimitsBytes: 8 * 1024 * 1024 * 1024,
|
||||
GPURequests: 2,
|
||||
GPULimits: 2,
|
||||
GPUMemoryRequestsMB: 20000,
|
||||
GPUMemoryLimitsMB: 20000,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
type monitoringClusterRepo struct {
|
||||
clusters []*entity.Cluster
|
||||
}
|
||||
|
||||
func (r *monitoringClusterRepo) Create(ctx context.Context, cluster *entity.Cluster) error {
|
||||
r.clusters = append(r.clusters, cluster)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *monitoringClusterRepo) GetByID(ctx context.Context, id string) (*entity.Cluster, error) {
|
||||
for _, cluster := range r.clusters {
|
||||
if cluster.ID == id {
|
||||
return cluster, nil
|
||||
}
|
||||
}
|
||||
return nil, entity.ErrClusterNotFound
|
||||
}
|
||||
|
||||
func (r *monitoringClusterRepo) GetByName(ctx context.Context, name string) (*entity.Cluster, error) {
|
||||
for _, cluster := range r.clusters {
|
||||
if cluster.Name == name {
|
||||
return cluster, nil
|
||||
}
|
||||
}
|
||||
return nil, entity.ErrClusterNotFound
|
||||
}
|
||||
|
||||
func (r *monitoringClusterRepo) Update(ctx context.Context, cluster *entity.Cluster) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *monitoringClusterRepo) Delete(ctx context.Context, id string) error { return nil }
|
||||
|
||||
func (r *monitoringClusterRepo) List(ctx context.Context) ([]*entity.Cluster, error) {
|
||||
return r.clusters, nil
|
||||
}
|
||||
|
||||
type stubMetricsClient struct {
|
||||
allocations []*entity.PodResourceAllocation
|
||||
}
|
||||
|
||||
func (c *stubMetricsClient) GetClusterMetrics(ctx context.Context, clusterID string) (*entity.ClusterMetrics, error) {
|
||||
return &entity.ClusterMetrics{
|
||||
ClusterID: clusterID,
|
||||
ClusterName: "cluster",
|
||||
Status: "healthy",
|
||||
NodeCount: 3,
|
||||
PodCount: 99,
|
||||
TotalCPU: "48 cores",
|
||||
TotalMemory: "256Gi",
|
||||
Nodes: []entity.NodeMetrics{{NodeName: "node-a"}},
|
||||
LastCheck: time.Now(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *stubMetricsClient) GetNodeMetrics(ctx context.Context, clusterID string) ([]*entity.NodeMetrics, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (c *stubMetricsClient) GetPodResourceAllocations(ctx context.Context, clusterID string) ([]*entity.PodResourceAllocation, error) {
|
||||
return c.allocations, nil
|
||||
}
|
||||
400
backend/internal/domain/service/quota_precheck.go
Normal file
400
backend/internal/domain/service/quota_precheck.go
Normal file
@ -0,0 +1,400 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ocdp/cluster-service/internal/domain/entity"
|
||||
"github.com/ocdp/cluster-service/internal/domain/repository"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
|
||||
"k8s.io/apimachinery/pkg/util/yaml"
|
||||
)
|
||||
|
||||
// ErrQuotaExceeded is returned by the quota comparison helpers when at least
// one required resource is larger than its hard limit.
var ErrQuotaExceeded = errors.New("quota exceeded")
|
||||
|
||||
// QuotaExceededResource describes one resource whose required amount is
// larger than the quota's hard limit.
type QuotaExceededResource struct {
	Name     string // quota key, e.g. "requests.cpu"
	Required string // amount needed, formatted as text
	Hard     string // hard limit it was compared against, formatted as text
}
|
||||
|
||||
type QuotaPrecheckResult struct {
|
||||
Allowed bool
|
||||
Required repository.ResourceEstimate
|
||||
Hard repository.ResourceVector
|
||||
Exceeded []QuotaExceededResource
|
||||
}
|
||||
|
||||
type QuotaPrecheckService struct {
|
||||
helmClient repository.HelmClient
|
||||
}
|
||||
|
||||
func NewQuotaPrecheckService(helmClient repository.HelmClient) *QuotaPrecheckService {
|
||||
return &QuotaPrecheckService{helmClient: helmClient}
|
||||
}
|
||||
|
||||
func (s *QuotaPrecheckService) EstimateAndCompare(ctx context.Context, cluster *entity.Cluster, workspace *entity.Workspace, instance *entity.Instance) (*QuotaPrecheckResult, error) {
|
||||
if s == nil || s.helmClient == nil {
|
||||
return nil, errors.New("quota precheck requires helm client")
|
||||
}
|
||||
estimate, err := s.helmClient.EstimateInstanceResources(ctx, cluster, instance)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
result, err := CompareWorkspaceQuota(workspace, estimate)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (s *QuotaPrecheckService) EstimateAndCompareBinding(ctx context.Context, cluster *entity.Cluster, binding *entity.WorkspaceClusterBinding, usage *repository.ResourceQuotaUsage, target *entity.Instance, current *entity.Instance) (*QuotaPrecheckResult, error) {
|
||||
if s == nil || s.helmClient == nil {
|
||||
return nil, errors.New("quota precheck requires helm client")
|
||||
}
|
||||
targetEstimate, err := s.helmClient.EstimateInstanceResources(ctx, cluster, target)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var currentEstimate *repository.ResourceEstimate
|
||||
if current != nil {
|
||||
currentEstimate, err = s.helmClient.EstimateInstanceResources(ctx, cluster, current)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
result, err := CompareBindingQuota(binding, usage, targetEstimate, currentEstimate)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func CompareWorkspaceQuota(workspace *entity.Workspace, estimate *repository.ResourceEstimate) (*QuotaPrecheckResult, error) {
|
||||
return compareQuotaList(resourceQuotaHard(workspace), nil, estimate, nil)
|
||||
}
|
||||
|
||||
func CompareBindingQuota(binding *entity.WorkspaceClusterBinding, usage *repository.ResourceQuotaUsage, targetEstimate, currentEstimate *repository.ResourceEstimate) (*QuotaPrecheckResult, error) {
|
||||
return compareQuotaList(bindingQuotaHard(binding), usage, targetEstimate, currentEstimate)
|
||||
}
|
||||
|
||||
func compareQuotaList(hardList corev1.ResourceList, usage *repository.ResourceQuotaUsage, targetEstimate, currentEstimate *repository.ResourceEstimate) (*QuotaPrecheckResult, error) {
|
||||
if targetEstimate == nil {
|
||||
targetEstimate = &repository.ResourceEstimate{}
|
||||
}
|
||||
current := effectiveQuotaRequests(currentEstimate)
|
||||
target := effectiveQuotaRequests(targetEstimate)
|
||||
used := repository.ResourceVector{}
|
||||
if usage != nil {
|
||||
used = usage.Used
|
||||
}
|
||||
required := addResourceVector(subtractResourceVectorFloorZero(used, current), target)
|
||||
hard := resourceVectorFromQuotaHard(hardList)
|
||||
result := &QuotaPrecheckResult{
|
||||
Allowed: true,
|
||||
Required: repository.ResourceEstimate{
|
||||
Requests: required,
|
||||
},
|
||||
Hard: hard,
|
||||
}
|
||||
addExceeded := func(name, required, limit string) {
|
||||
result.Allowed = false
|
||||
result.Exceeded = append(result.Exceeded, QuotaExceededResource{
|
||||
Name: name,
|
||||
Required: required,
|
||||
Hard: limit,
|
||||
})
|
||||
}
|
||||
if quantity, ok := hardList[corev1.ResourceName("requests.cpu")]; ok && required.CPU.Cmp(quantity) > 0 {
|
||||
addExceeded("requests.cpu", required.CPU.String(), quantity.String())
|
||||
}
|
||||
if quantity, ok := hardList[corev1.ResourceName("requests.memory")]; ok && required.Memory.Cmp(quantity) > 0 {
|
||||
addExceeded("requests.memory", required.Memory.String(), quantity.String())
|
||||
}
|
||||
if quantity, ok := hardList[corev1.ResourceName("requests.nvidia.com/gpu")]; ok && required.GPU > quantity.Value() {
|
||||
addExceeded("requests.nvidia.com/gpu", strconv.FormatInt(required.GPU, 10), quantity.String())
|
||||
}
|
||||
if quantity, ok := hardList[corev1.ResourceName("requests.nvidia.com/gpumem")]; ok && required.GPUMemoryMB > quantity.Value() {
|
||||
addExceeded("requests.nvidia.com/gpumem", strconv.FormatInt(required.GPUMemoryMB, 10), quantity.String())
|
||||
}
|
||||
sort.Slice(result.Exceeded, func(i, j int) bool {
|
||||
return result.Exceeded[i].Name < result.Exceeded[j].Name
|
||||
})
|
||||
if !result.Allowed {
|
||||
return result, ErrQuotaExceeded
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func legacyCompareWorkspaceQuota(workspace *entity.Workspace, estimate *repository.ResourceEstimate) (*QuotaPrecheckResult, error) {
|
||||
if estimate == nil {
|
||||
estimate = &repository.ResourceEstimate{}
|
||||
}
|
||||
hardList := resourceQuotaHard(workspace)
|
||||
hard := resourceVectorFromQuotaHard(hardList)
|
||||
result := &QuotaPrecheckResult{
|
||||
Allowed: true,
|
||||
Required: *estimate,
|
||||
Hard: hard,
|
||||
}
|
||||
effectiveRequests := effectiveQuotaRequests(estimate)
|
||||
addExceeded := func(name, required, limit string) {
|
||||
result.Allowed = false
|
||||
result.Exceeded = append(result.Exceeded, QuotaExceededResource{
|
||||
Name: name,
|
||||
Required: required,
|
||||
Hard: limit,
|
||||
})
|
||||
}
|
||||
if quantity, ok := hardList[corev1.ResourceName("requests.cpu")]; ok && effectiveRequests.CPU.Cmp(quantity) > 0 {
|
||||
addExceeded("requests.cpu", effectiveRequests.CPU.String(), quantity.String())
|
||||
}
|
||||
if quantity, ok := hardList[corev1.ResourceName("requests.memory")]; ok && effectiveRequests.Memory.Cmp(quantity) > 0 {
|
||||
addExceeded("requests.memory", effectiveRequests.Memory.String(), quantity.String())
|
||||
}
|
||||
if quantity, ok := hardList[corev1.ResourceName("requests.nvidia.com/gpu")]; ok && effectiveRequests.GPU > quantity.Value() {
|
||||
addExceeded("requests.nvidia.com/gpu", strconv.FormatInt(effectiveRequests.GPU, 10), quantity.String())
|
||||
}
|
||||
if quantity, ok := hardList[corev1.ResourceName("requests.nvidia.com/gpumem")]; ok && effectiveRequests.GPUMemoryMB > quantity.Value() {
|
||||
addExceeded("requests.nvidia.com/gpumem", strconv.FormatInt(effectiveRequests.GPUMemoryMB, 10), quantity.String())
|
||||
}
|
||||
sort.Slice(result.Exceeded, func(i, j int) bool {
|
||||
return result.Exceeded[i].Name < result.Exceeded[j].Name
|
||||
})
|
||||
if !result.Allowed {
|
||||
return result, ErrQuotaExceeded
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func effectiveQuotaRequests(estimate *repository.ResourceEstimate) repository.ResourceVector {
|
||||
if estimate == nil {
|
||||
return repository.ResourceVector{}
|
||||
}
|
||||
return repository.ResourceVector{
|
||||
CPU: maxQuantity(estimate.Requests.CPU, estimate.Limits.CPU),
|
||||
Memory: maxQuantity(estimate.Requests.Memory, estimate.Limits.Memory),
|
||||
GPU: maxInt64(estimate.Requests.GPU, estimate.Limits.GPU),
|
||||
GPUMemoryMB: maxInt64(estimate.Requests.GPUMemoryMB, estimate.Limits.GPUMemoryMB),
|
||||
}
|
||||
}
|
||||
|
||||
func addResourceVector(left, right repository.ResourceVector) repository.ResourceVector {
|
||||
out := left
|
||||
out.CPU.Add(right.CPU)
|
||||
out.Memory.Add(right.Memory)
|
||||
out.GPU += right.GPU
|
||||
out.GPUMemoryMB += right.GPUMemoryMB
|
||||
return out
|
||||
}
|
||||
|
||||
func subtractResourceVectorFloorZero(left, right repository.ResourceVector) repository.ResourceVector {
|
||||
out := left
|
||||
out.CPU.Sub(right.CPU)
|
||||
if out.CPU.Sign() < 0 {
|
||||
out.CPU = resource.Quantity{}
|
||||
}
|
||||
out.Memory.Sub(right.Memory)
|
||||
if out.Memory.Sign() < 0 {
|
||||
out.Memory = resource.Quantity{}
|
||||
}
|
||||
out.GPU -= right.GPU
|
||||
if out.GPU < 0 {
|
||||
out.GPU = 0
|
||||
}
|
||||
out.GPUMemoryMB -= right.GPUMemoryMB
|
||||
if out.GPUMemoryMB < 0 {
|
||||
out.GPUMemoryMB = 0
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func maxQuantity(left, right resource.Quantity) resource.Quantity {
|
||||
if left.Cmp(right) >= 0 {
|
||||
return left
|
||||
}
|
||||
return right
|
||||
}
|
||||
|
||||
// maxInt64 returns the larger of left and right.
func maxInt64(left, right int64) int64 {
	if right > left {
		return right
	}
	return left
}
|
||||
|
||||
func EstimateRenderedManifestResources(manifest string) (*repository.ResourceEstimate, error) {
|
||||
decoder := yaml.NewYAMLOrJSONDecoder(strings.NewReader(manifest), 4096)
|
||||
estimate := &repository.ResourceEstimate{}
|
||||
for {
|
||||
var obj unstructured.Unstructured
|
||||
if err := decoder.Decode(&obj); err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
return nil, fmt.Errorf("failed to decode rendered manifest: %w", err)
|
||||
}
|
||||
if obj.GetKind() == "" {
|
||||
continue
|
||||
}
|
||||
podSpec, replicas, ok := podTemplateSpec(obj.Object)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
addPodSpecResources(estimate, podSpec, replicas)
|
||||
}
|
||||
return estimate, nil
|
||||
}
|
||||
|
||||
func resourceVectorFromQuotaHard(hard corev1.ResourceList) repository.ResourceVector {
|
||||
gpu := hard[corev1.ResourceName("requests.nvidia.com/gpu")]
|
||||
gpuMemory := hard[corev1.ResourceName("requests.nvidia.com/gpumem")]
|
||||
return repository.ResourceVector{
|
||||
CPU: hard[corev1.ResourceName("requests.cpu")],
|
||||
Memory: hard[corev1.ResourceName("requests.memory")],
|
||||
GPU: gpu.Value(),
|
||||
GPUMemoryMB: gpuMemory.Value(),
|
||||
}
|
||||
}
|
||||
|
||||
func bindingQuotaHard(binding *entity.WorkspaceClusterBinding) corev1.ResourceList {
|
||||
hard := corev1.ResourceList{}
|
||||
if binding == nil {
|
||||
return hard
|
||||
}
|
||||
addQuantity := func(name corev1.ResourceName, value string) {
|
||||
value = normalizeStandardQuotaQuantity(value)
|
||||
if value == "" {
|
||||
return
|
||||
}
|
||||
if quantity, err := resource.ParseQuantity(value); err == nil {
|
||||
hard[name] = quantity
|
||||
}
|
||||
}
|
||||
addGPUMemoryQuantity := func(value string) {
|
||||
value, err := normalizeGPUMemoryQuota(value)
|
||||
if err != nil || value == "" {
|
||||
return
|
||||
}
|
||||
if quantity, err := resource.ParseQuantity(value); err == nil {
|
||||
hard[corev1.ResourceName("requests.nvidia.com/gpumem")] = quantity
|
||||
}
|
||||
}
|
||||
addQuantity(corev1.ResourceName("requests.cpu"), binding.QuotaCPU)
|
||||
addQuantity(corev1.ResourceName("requests.memory"), binding.QuotaMemory)
|
||||
addQuantity(corev1.ResourceName("requests.nvidia.com/gpu"), binding.QuotaGPU)
|
||||
addGPUMemoryQuantity(binding.QuotaGPUMem)
|
||||
return hard
|
||||
}
|
||||
|
||||
func podTemplateSpec(obj map[string]interface{}) (map[string]interface{}, int64, bool) {
|
||||
kind, _, _ := unstructured.NestedString(obj, "kind")
|
||||
switch kind {
|
||||
case "Pod":
|
||||
spec, ok := nestedMap(obj, "spec")
|
||||
return spec, 1, ok
|
||||
case "Deployment", "ReplicaSet", "StatefulSet", "ReplicationController":
|
||||
spec, replicas, ok := workloadTemplateSpec(obj)
|
||||
return spec, replicas, ok
|
||||
case "DaemonSet", "Job":
|
||||
spec, ok := nestedMap(obj, "spec", "template", "spec")
|
||||
return spec, 1, ok
|
||||
case "CronJob":
|
||||
spec, ok := nestedMap(obj, "spec", "jobTemplate", "spec", "template", "spec")
|
||||
return spec, 1, ok
|
||||
default:
|
||||
return nil, 0, false
|
||||
}
|
||||
}
|
||||
|
||||
func workloadTemplateSpec(obj map[string]interface{}) (map[string]interface{}, int64, bool) {
|
||||
spec, ok := nestedMap(obj, "spec", "template", "spec")
|
||||
if !ok {
|
||||
return nil, 0, false
|
||||
}
|
||||
replicas, _, err := unstructured.NestedInt64(obj, "spec", "replicas")
|
||||
if err != nil || replicas < 1 {
|
||||
replicas = 1
|
||||
}
|
||||
return spec, replicas, true
|
||||
}
|
||||
|
||||
func nestedMap(obj map[string]interface{}, fields ...string) (map[string]interface{}, bool) {
|
||||
value, ok, err := unstructured.NestedMap(obj, fields...)
|
||||
return value, ok && err == nil
|
||||
}
|
||||
|
||||
func addPodSpecResources(estimate *repository.ResourceEstimate, podSpec map[string]interface{}, replicas int64) {
|
||||
if replicas < 1 {
|
||||
replicas = 1
|
||||
}
|
||||
for _, field := range []string{"initContainers", "containers"} {
|
||||
containers, ok, err := unstructured.NestedSlice(podSpec, field)
|
||||
if err != nil || !ok {
|
||||
continue
|
||||
}
|
||||
for _, item := range containers {
|
||||
container, ok := item.(map[string]interface{})
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
addContainerResourceList(&estimate.Requests, replicas, container, "resources", "requests")
|
||||
addContainerResourceList(&estimate.Limits, replicas, container, "resources", "limits")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func addContainerResourceList(target *repository.ResourceVector, replicas int64, container map[string]interface{}, fields ...string) {
|
||||
resources, ok := nestedMap(container, fields...)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for name, value := range resources {
|
||||
switch name {
|
||||
case "cpu":
|
||||
addQuantity(&target.CPU, value, replicas)
|
||||
case "memory":
|
||||
addQuantity(&target.Memory, value, replicas)
|
||||
case "nvidia.com/gpu", "requests.nvidia.com/gpu", "limits.nvidia.com/gpu":
|
||||
target.GPU += parseIntegerResource(value) * replicas
|
||||
case "nvidia.com/gpumem", "requests.nvidia.com/gpumem", "limits.nvidia.com/gpumem":
|
||||
target.GPUMemoryMB += parseGPUMemoryResource(value) * replicas
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func addQuantity(target *resource.Quantity, value interface{}, replicas int64) {
|
||||
quantity, err := resource.ParseQuantity(fmt.Sprint(value))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
quantity.Mul(replicas)
|
||||
target.Add(quantity)
|
||||
}
|
||||
|
||||
func parseIntegerResource(value interface{}) int64 {
|
||||
quantity, err := resource.ParseQuantity(fmt.Sprint(value))
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return quantity.Value()
|
||||
}
|
||||
|
||||
func parseGPUMemoryResource(value interface{}) int64 {
|
||||
normalized, err := normalizeGPUMemoryQuota(fmt.Sprint(value))
|
||||
if err != nil || normalized == "" {
|
||||
return 0
|
||||
}
|
||||
parsed, err := strconv.ParseInt(normalized, 10, 64)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return parsed
|
||||
}
|
||||
241
backend/internal/domain/service/quota_precheck_test.go
Normal file
241
backend/internal/domain/service/quota_precheck_test.go
Normal file
@ -0,0 +1,241 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
|
||||
"github.com/ocdp/cluster-service/internal/domain/entity"
|
||||
"github.com/ocdp/cluster-service/internal/domain/repository"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
)
|
||||
|
||||
func TestCompareWorkspaceQuotaReportsExceededRequests(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
workspace := &entity.Workspace{
|
||||
QuotaCPU: "2",
|
||||
QuotaMemory: "4Gi",
|
||||
QuotaGPU: "1",
|
||||
QuotaGPUMem: "10000",
|
||||
}
|
||||
estimate := &repository.ResourceEstimate{
|
||||
Requests: repository.ResourceVector{
|
||||
CPU: resource.MustParse("2500m"),
|
||||
Memory: resource.MustParse("3Gi"),
|
||||
GPU: 1,
|
||||
GPUMemoryMB: 12000,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := CompareWorkspaceQuota(workspace, estimate)
|
||||
if !errors.Is(err, ErrQuotaExceeded) {
|
||||
t.Fatalf("expected ErrQuotaExceeded, got %v", err)
|
||||
}
|
||||
if result == nil || result.Allowed {
|
||||
t.Fatalf("expected denied result, got %#v", result)
|
||||
}
|
||||
if len(result.Exceeded) != 2 {
|
||||
t.Fatalf("expected 2 exceeded resources, got %#v", result.Exceeded)
|
||||
}
|
||||
if result.Exceeded[0].Name != "requests.cpu" {
|
||||
t.Fatalf("expected requests.cpu exceeded first, got %#v", result.Exceeded)
|
||||
}
|
||||
if result.Exceeded[1].Name != "requests.nvidia.com/gpumem" {
|
||||
t.Fatalf("expected requests.nvidia.com/gpumem exceeded second, got %#v", result.Exceeded)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCompareWorkspaceQuotaUsesLimitsAsEffectiveRequests(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
workspace := &entity.Workspace{
|
||||
QuotaGPU: "0",
|
||||
QuotaGPUMem: "9999",
|
||||
}
|
||||
estimate := &repository.ResourceEstimate{
|
||||
Limits: repository.ResourceVector{
|
||||
GPU: 1,
|
||||
GPUMemoryMB: 10000,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := CompareWorkspaceQuota(workspace, estimate)
|
||||
if !errors.Is(err, ErrQuotaExceeded) {
|
||||
t.Fatalf("expected ErrQuotaExceeded from limits-only GPU resources, got %v", err)
|
||||
}
|
||||
if result == nil || len(result.Exceeded) != 2 {
|
||||
t.Fatalf("expected gpu and gpumem to be exceeded, got %#v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCompareBindingQuotaSubtractsCurrentReleaseFromUsedQuota(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
binding := &entity.WorkspaceClusterBinding{
|
||||
QuotaCPU: "1",
|
||||
QuotaMemory: "2Gi",
|
||||
QuotaGPU: "1",
|
||||
QuotaGPUMem: "10000",
|
||||
}
|
||||
usage := &repository.ResourceQuotaUsage{
|
||||
Used: repository.ResourceVector{
|
||||
CPU: resource.MustParse("1"),
|
||||
Memory: resource.MustParse("2Gi"),
|
||||
GPU: 1,
|
||||
GPUMemoryMB: 10000,
|
||||
},
|
||||
}
|
||||
current := &repository.ResourceEstimate{
|
||||
Requests: repository.ResourceVector{
|
||||
CPU: resource.MustParse("1"),
|
||||
Memory: resource.MustParse("2Gi"),
|
||||
GPU: 1,
|
||||
GPUMemoryMB: 10000,
|
||||
},
|
||||
}
|
||||
targetSameSize := &repository.ResourceEstimate{
|
||||
Requests: repository.ResourceVector{
|
||||
CPU: resource.MustParse("1"),
|
||||
Memory: resource.MustParse("2Gi"),
|
||||
GPU: 1,
|
||||
GPUMemoryMB: 10000,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := CompareBindingQuota(binding, usage, targetSameSize, current)
|
||||
if err != nil {
|
||||
t.Fatalf("expected update with same resource footprint to fit quota, got %v", err)
|
||||
}
|
||||
if result.Required.Requests.GPU != 1 || result.Required.Requests.GPUMemoryMB != 10000 {
|
||||
t.Fatalf("expected required resources to subtract current release before target, got %#v", result.Required.Requests)
|
||||
}
|
||||
|
||||
targetScaledUp := &repository.ResourceEstimate{
|
||||
Requests: repository.ResourceVector{
|
||||
CPU: resource.MustParse("2"),
|
||||
Memory: resource.MustParse("4Gi"),
|
||||
GPU: 2,
|
||||
GPUMemoryMB: 20000,
|
||||
},
|
||||
}
|
||||
result, err = CompareBindingQuota(binding, usage, targetScaledUp, current)
|
||||
if !errors.Is(err, ErrQuotaExceeded) {
|
||||
t.Fatalf("expected scale-up beyond quota to be rejected, got %v", err)
|
||||
}
|
||||
if result == nil || result.Allowed {
|
||||
t.Fatalf("expected denied quota result, got %#v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCompareBindingQuotaTreatsExplicitZeroGPUAsNoGPUAllowed(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
binding := &entity.WorkspaceClusterBinding{
|
||||
QuotaCPU: "8",
|
||||
QuotaMemory: "32Gi",
|
||||
QuotaGPU: "0",
|
||||
QuotaGPUMem: "0",
|
||||
}
|
||||
vllmLikeEstimate := &repository.ResourceEstimate{
|
||||
Requests: repository.ResourceVector{
|
||||
CPU: resource.MustParse("2"),
|
||||
Memory: resource.MustParse("8Gi"),
|
||||
GPU: 1,
|
||||
GPUMemoryMB: 10000,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := CompareBindingQuota(binding, &repository.ResourceQuotaUsage{}, vllmLikeEstimate, nil)
|
||||
if !errors.Is(err, ErrQuotaExceeded) {
|
||||
t.Fatalf("expected GPU request to exceed explicit zero quota, got %v", err)
|
||||
}
|
||||
exceeded := map[string]bool{}
|
||||
for _, item := range result.Exceeded {
|
||||
exceeded[item.Name] = true
|
||||
}
|
||||
for _, name := range []string{"requests.nvidia.com/gpu", "requests.nvidia.com/gpumem"} {
|
||||
if !exceeded[name] {
|
||||
t.Fatalf("expected %s to be exceeded, got %#v", name, result.Exceeded)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBindingQuotaHardKeepsGPUMemoryAsIntegerMB(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
hard := bindingQuotaHard(&entity.WorkspaceClusterBinding{QuotaGPU: "1", QuotaGPUMem: "10000"})
|
||||
gpuMem := hard[corev1.ResourceName("requests.nvidia.com/gpumem")]
|
||||
if gpuMem.Value() != 10000 {
|
||||
t.Fatalf("expected gpumem quota to remain integer MB 10000, got %s value=%d", gpuMem.String(), gpuMem.Value())
|
||||
}
|
||||
}
|
||||
|
||||
func TestEstimateRenderedManifestResourcesSumsPodTemplates(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
manifest := `
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: gpu-worker
|
||||
spec:
|
||||
replicas: 3
|
||||
template:
|
||||
spec:
|
||||
initContainers:
|
||||
- name: init
|
||||
image: busybox
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
containers:
|
||||
- name: app
|
||||
image: busybox
|
||||
resources:
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
nvidia.com/gpu: "1"
|
||||
nvidia.com/gpumem: "10000"
|
||||
limits:
|
||||
cpu: "1"
|
||||
memory: 2Gi
|
||||
nvidia.com/gpu: "1"
|
||||
nvidia.com/gpumem: "12000"
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: ignored
|
||||
`
|
||||
estimate, err := EstimateRenderedManifestResources(manifest)
|
||||
if err != nil {
|
||||
t.Fatalf("EstimateRenderedManifestResources returned error: %v", err)
|
||||
}
|
||||
if estimate.Requests.CPU.Cmp(resource.MustParse("1800m")) != 0 {
|
||||
t.Fatalf("expected requests cpu 1800m, got %s", estimate.Requests.CPU.String())
|
||||
}
|
||||
if estimate.Requests.Memory.Cmp(resource.MustParse("3456Mi")) != 0 {
|
||||
t.Fatalf("expected requests memory 3456Mi, got %s", estimate.Requests.Memory.String())
|
||||
}
|
||||
if estimate.Requests.GPU != 3 {
|
||||
t.Fatalf("expected requests gpu 3, got %d", estimate.Requests.GPU)
|
||||
}
|
||||
if estimate.Requests.GPUMemoryMB != 30000 {
|
||||
t.Fatalf("expected requests gpumem 30000, got %d", estimate.Requests.GPUMemoryMB)
|
||||
}
|
||||
if estimate.Limits.CPU.Cmp(resource.MustParse("3")) != 0 {
|
||||
t.Fatalf("expected limits cpu 3, got %s", estimate.Limits.CPU.String())
|
||||
}
|
||||
if estimate.Limits.Memory.Cmp(resource.MustParse("6Gi")) != 0 {
|
||||
t.Fatalf("expected limits memory 6Gi, got %s", estimate.Limits.Memory.String())
|
||||
}
|
||||
if estimate.Limits.GPU != 3 {
|
||||
t.Fatalf("expected limits gpu 3, got %d", estimate.Limits.GPU)
|
||||
}
|
||||
if estimate.Limits.GPUMemoryMB != 36000 {
|
||||
t.Fatalf("expected limits gpumem 36000, got %d", estimate.Limits.GPUMemoryMB)
|
||||
}
|
||||
}
|
||||
@ -9,6 +9,10 @@ import (
|
||||
|
||||
func normalizeStandardQuotaQuantity(value string) string {
|
||||
value = strings.TrimSpace(value)
|
||||
switch strings.ToLower(value) {
|
||||
case "unlimited", "none", "no-limit", "nolimit":
|
||||
return ""
|
||||
}
|
||||
upper := strings.ToUpper(value)
|
||||
switch {
|
||||
case strings.HasSuffix(upper, "MB"):
|
||||
|
||||
@ -3,6 +3,7 @@ package service
|
||||
import (
|
||||
"context"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
@ -94,17 +95,17 @@ func (s *WorkspaceService) EnsureClusterBinding(ctx context.Context, workspaceID
|
||||
ClusterID: cluster.ID,
|
||||
Namespace: workspace.K8sNamespace,
|
||||
ServiceAccount: workspace.K8sSAName,
|
||||
QuotaCPU: workspace.QuotaCPU,
|
||||
QuotaMemory: workspace.QuotaMemory,
|
||||
QuotaGPU: workspace.QuotaGPU,
|
||||
QuotaGPUMem: workspace.QuotaGPUMem,
|
||||
QuotaCPU: strings.TrimSpace(workspace.QuotaCPU),
|
||||
QuotaMemory: strings.TrimSpace(workspace.QuotaMemory),
|
||||
QuotaGPU: zeroIfEmptyQuota(workspace.QuotaGPU),
|
||||
QuotaGPUMem: zeroIfEmptyQuota(workspace.QuotaGPUMem),
|
||||
Status: "active",
|
||||
CreatedAt: time.Now(),
|
||||
UpdatedAt: time.Now(),
|
||||
}
|
||||
tenantBinding := entity.NewTenantBinding(binding.Namespace)
|
||||
tenantBinding.ServiceAccountName = binding.ServiceAccount
|
||||
tenantBinding.ResourceQuotaHard = resourceQuotaHard(workspace)
|
||||
tenantBinding.ResourceQuotaHard = bindingQuotaHard(binding)
|
||||
if s.tenantClient != nil {
|
||||
if err := s.tenantClient.EnsureTenant(ctx, cluster, tenantBinding); err != nil {
|
||||
return nil, err
|
||||
@ -145,10 +146,22 @@ func (s *WorkspaceService) IssueKubeconfig(ctx context.Context, workspaceID, clu
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
binding.QuotaCPU = strings.TrimSpace(workspace.QuotaCPU)
|
||||
binding.QuotaMemory = strings.TrimSpace(workspace.QuotaMemory)
|
||||
binding.QuotaGPU = zeroIfEmptyQuota(workspace.QuotaGPU)
|
||||
binding.QuotaGPUMem = zeroIfEmptyQuota(workspace.QuotaGPUMem)
|
||||
binding.UpdatedAt = time.Now()
|
||||
}
|
||||
tenantBinding := entity.NewTenantBinding(binding.Namespace)
|
||||
tenantBinding.ServiceAccountName = binding.ServiceAccount
|
||||
tenantBinding.ResourceQuotaHard = resourceQuotaHard(workspace)
|
||||
tenantBinding.ResourceQuotaHard = bindingQuotaHard(binding)
|
||||
if s.tenantClient != nil {
|
||||
if err := s.tenantClient.EnsureTenant(ctx, cluster, tenantBinding); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
_ = s.bindingRepo.Upsert(ctx, binding)
|
||||
kubeconfig, err := s.tenantClient.IssueKubeconfig(ctx, cluster, tenantBinding, ttl)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
||||
Reference in New Issue
Block a user