refactor: full-stack restructure with multi-tenancy, workspace management, and K8s diagnostics

- Add Workspace domain (entity, repository, service, handler, DTO)
- Add multi-tenant K8s client with tenant binding and quota management
- Add K8s diagnostics client (instance diagnostics)
- Add authorization middleware (authz package)
- Restructure frontend to feature-based architecture (features/)
- Add User Management page in configuration
- Add AccessDenied page and route guards
- Refactor shared components (form inputs, layout, UI)
- Update Tailwind config for new design system
- Add comprehensive documentation (docs/, tasks/, plans)
- Improve cluster service with better kubeconfig handling
- Add tests for crypto, config, helm client, tenant binding
This commit is contained in:
Ivan087
2026-05-12 16:15:14 +08:00
parent c5e51ed069
commit 7f238a3168
172 changed files with 15703 additions and 3162 deletions

View File

@ -11,16 +11,23 @@ import (
"github.com/google/uuid"
"github.com/ocdp/cluster-service/internal/domain/entity"
"github.com/ocdp/cluster-service/internal/domain/repository"
"github.com/ocdp/cluster-service/internal/pkg/authz"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)
// InstanceService Helm 实例管理领域服务
type InstanceService struct {
instanceRepo repository.InstanceRepository
clusterRepo repository.ClusterRepository
registryRepo repository.RegistryRepository
helmClient repository.HelmClient
ociClient repository.OCIClient
entryClient repository.InstanceEntryClient
instanceRepo repository.InstanceRepository
clusterRepo repository.ClusterRepository
registryRepo repository.RegistryRepository
bindingRepo repository.WorkspaceClusterBindingRepository
helmClient repository.HelmClient
ociClient repository.OCIClient
entryClient repository.InstanceEntryClient
diagClient repository.InstanceDiagnosticsClient
workspaceRepo repository.WorkspaceRepository
tenantClient repository.TenantKubeClient
}
// NewInstanceService 创建实例服务
@ -31,17 +38,32 @@ func NewInstanceService(
helmClient repository.HelmClient,
ociClient repository.OCIClient,
entryClient repository.InstanceEntryClient,
bindingRepo ...repository.WorkspaceClusterBindingRepository,
) *InstanceService {
var workspaceBindingRepo repository.WorkspaceClusterBindingRepository
if len(bindingRepo) > 0 {
workspaceBindingRepo = bindingRepo[0]
}
return &InstanceService{
instanceRepo: instanceRepo,
clusterRepo: clusterRepo,
registryRepo: registryRepo,
bindingRepo: workspaceBindingRepo,
helmClient: helmClient,
ociClient: ociClient,
entryClient: entryClient,
}
}
func (s *InstanceService) SetDiagnosticsClient(client repository.InstanceDiagnosticsClient) {
s.diagClient = client
}
func (s *InstanceService) SetTenantProvisioning(workspaceRepo repository.WorkspaceRepository, tenantClient repository.TenantKubeClient) {
s.workspaceRepo = workspaceRepo
s.tenantClient = tenantClient
}
const chartCacheDir = "/tmp/charts"
func (s *InstanceService) chartArchivePath(instance *entity.Instance) string {
@ -62,8 +84,14 @@ func (s *InstanceService) downloadChart(ctx context.Context, registry *entity.Re
// CreateInstance 创建(安装)新实例
func (s *InstanceService) CreateInstance(ctx context.Context, instance *entity.Instance) error {
principal, err := authz.RequirePrincipal(ctx)
if err != nil {
return entity.ErrUnauthorized
}
// 生成 ID
instance.ID = uuid.New().String()
instance.WorkspaceID = principal.WorkspaceID
instance.OwnerID = principal.UserID
// 验证
if err := instance.Validate(); err != nil {
@ -75,12 +103,25 @@ func (s *InstanceService) CreateInstance(ctx context.Context, instance *entity.I
if err != nil {
return entity.ErrClusterNotFound
}
if !authz.CanReadResource(principal, cluster.WorkspaceID, cluster.OwnerID, cluster.Visibility) {
return entity.ErrClusterNotFound
}
// 检查 Registry 是否存在
registry, err := s.registryRepo.GetByID(ctx, instance.RegistryID)
if err != nil {
return entity.ErrRegistryNotFound
}
if !authz.CanReadResource(principal, registry.WorkspaceID, registry.OwnerID, registry.Visibility) {
return entity.ErrRegistryNotFound
}
if err := s.applyNamespacePolicy(ctx, principal, cluster, instance); err != nil {
return err
}
enforceNamespaceValues(instance)
if err := s.ensureTenantForInstance(ctx, principal, cluster, instance); err != nil {
return err
}
// 检查实例是否已存在
existingInstance, _ := s.instanceRepo.GetByClusterAndName(ctx, instance.ClusterID, instance.Name)
@ -111,13 +152,24 @@ func (s *InstanceService) CreateInstance(ctx context.Context, instance *entity.I
// GetInstance 获取实例
func (s *InstanceService) GetInstance(ctx context.Context, id string) (*entity.Instance, error) {
return s.instanceRepo.GetByID(ctx, id)
principal, err := authz.RequirePrincipal(ctx)
if err != nil {
return nil, entity.ErrUnauthorized
}
instance, err := s.instanceRepo.GetByID(ctx, id)
if err != nil {
return nil, err
}
if !s.canReadInstance(principal, instance) {
return nil, entity.ErrInstanceNotFound
}
return instance, nil
}
// GetInstanceStatus 获取实例实时状态
func (s *InstanceService) GetInstanceStatus(ctx context.Context, id string) (*entity.Instance, error) {
// 从数据库获取基本信息
instance, err := s.instanceRepo.GetByID(ctx, id)
instance, err := s.GetInstance(ctx, id)
if err != nil {
return nil, entity.ErrInstanceNotFound
}
@ -143,11 +195,20 @@ func (s *InstanceService) GetInstanceStatus(ctx context.Context, id string) (*en
// UpdateInstance 更新(升级)实例
func (s *InstanceService) UpdateInstance(ctx context.Context, instance *entity.Instance) error {
principal, err := authz.RequirePrincipal(ctx)
if err != nil {
return entity.ErrUnauthorized
}
// 检查实例是否存在
existingInstance, err := s.instanceRepo.GetByID(ctx, instance.ID)
if err != nil {
return entity.ErrInstanceNotFound
}
if !s.canWriteInstance(principal, existingInstance) {
return entity.ErrForbidden
}
instance.WorkspaceID = existingInstance.WorkspaceID
instance.OwnerID = existingInstance.OwnerID
// 获取集群信息
cluster, err := s.clusterRepo.GetByID(ctx, existingInstance.ClusterID)
@ -161,6 +222,8 @@ func (s *InstanceService) UpdateInstance(ctx context.Context, instance *entity.I
return entity.ErrRegistryNotFound
}
instance.Namespace = existingInstance.Namespace
enforceNamespaceValues(instance)
instance.BeginOperation(entity.OperationUpgrade, "Pending upgrade")
if err := s.instanceRepo.Update(ctx, instance); err != nil {
return err
@ -182,11 +245,18 @@ func (s *InstanceService) UpdateInstance(ctx context.Context, instance *entity.I
// DeleteInstance 删除(卸载)实例
func (s *InstanceService) DeleteInstance(ctx context.Context, id string) error {
principal, err := authz.RequirePrincipal(ctx)
if err != nil {
return entity.ErrUnauthorized
}
// 检查实例是否存在
instance, err := s.instanceRepo.GetByID(ctx, id)
if err != nil {
return entity.ErrInstanceNotFound
}
if !s.canWriteInstance(principal, instance) {
return entity.ErrForbidden
}
// 获取集群信息
cluster, err := s.clusterRepo.GetByID(ctx, instance.ClusterID)
@ -208,11 +278,18 @@ func (s *InstanceService) DeleteInstance(ctx context.Context, id string) error {
// RollbackInstance 回滚实例
func (s *InstanceService) RollbackInstance(ctx context.Context, id string, revision int) error {
principal, err := authz.RequirePrincipal(ctx)
if err != nil {
return entity.ErrUnauthorized
}
// 检查实例是否存在
instance, err := s.instanceRepo.GetByID(ctx, id)
if err != nil {
return entity.ErrInstanceNotFound
}
if !s.canWriteInstance(principal, instance) {
return entity.ErrForbidden
}
// 获取集群信息
cluster, err := s.clusterRepo.GetByID(ctx, instance.ClusterID)
@ -235,7 +312,7 @@ func (s *InstanceService) RollbackInstance(ctx context.Context, id string, revis
// GetInstanceHistory 获取实例历史
func (s *InstanceService) GetInstanceHistory(ctx context.Context, id string) ([]*entity.ReleaseHistory, error) {
// 检查实例是否存在
instance, err := s.instanceRepo.GetByID(ctx, id)
instance, err := s.GetInstance(ctx, id)
if err != nil {
return nil, entity.ErrInstanceNotFound
}
@ -252,18 +329,35 @@ func (s *InstanceService) GetInstanceHistory(ctx context.Context, id string) ([]
// ListInstancesByCluster 列出集群的所有实例
func (s *InstanceService) ListInstancesByCluster(ctx context.Context, clusterID string) ([]*entity.Instance, error) {
principal, err := authz.RequirePrincipal(ctx)
if err != nil {
return nil, entity.ErrUnauthorized
}
// 检查集群是否存在
_, err := s.clusterRepo.GetByID(ctx, clusterID)
cluster, err := s.clusterRepo.GetByID(ctx, clusterID)
if err != nil {
return nil, entity.ErrClusterNotFound
}
if !authz.CanReadResource(principal, cluster.WorkspaceID, cluster.OwnerID, cluster.Visibility) {
return nil, entity.ErrClusterNotFound
}
return s.instanceRepo.ListByCluster(ctx, clusterID)
instances, err := s.instanceRepo.ListByCluster(ctx, clusterID)
if err != nil {
return nil, err
}
visible := make([]*entity.Instance, 0, len(instances))
for _, instance := range instances {
if s.canReadInstance(principal, instance) {
visible = append(visible, instance)
}
}
return visible, nil
}
// ListInstanceEntries 列出实例关联的入口信息Service / Ingress
func (s *InstanceService) ListInstanceEntries(ctx context.Context, clusterID, instanceID string) ([]*entity.InstanceEntry, error) {
instance, err := s.instanceRepo.GetByID(ctx, instanceID)
instance, err := s.GetInstance(ctx, instanceID)
if err != nil {
return nil, entity.ErrInstanceNotFound
}
@ -283,6 +377,187 @@ func (s *InstanceService) ListInstanceEntries(ctx context.Context, clusterID, in
return s.entryClient.ListEntries(ctx, cluster, instance)
}
func (s *InstanceService) GetInstanceDiagnostics(ctx context.Context, clusterID, instanceID string, tailLines int64) (*entity.InstanceDiagnostics, error) {
instance, err := s.GetInstance(ctx, instanceID)
if err != nil {
return nil, entity.ErrInstanceNotFound
}
if instance.ClusterID != clusterID {
return nil, entity.ErrInstanceNotFound
}
cluster, err := s.clusterRepo.GetByID(ctx, clusterID)
if err != nil {
return nil, entity.ErrClusterNotFound
}
if s.diagClient == nil {
return nil, fmt.Errorf("instance diagnostics client is not configured")
}
return s.diagClient.GetDiagnostics(ctx, cluster, instance, tailLines)
}
func (s *InstanceService) canReadInstance(principal *authz.Principal, instance *entity.Instance) bool {
if principal.IsAdmin() {
return true
}
return instance.WorkspaceID == principal.WorkspaceID && instance.OwnerID == principal.UserID
}
func (s *InstanceService) canWriteInstance(principal *authz.Principal, instance *entity.Instance) bool {
if principal.IsAdmin() {
return true
}
return instance.WorkspaceID == principal.WorkspaceID && instance.OwnerID == principal.UserID
}
func enforceNamespaceValues(instance *entity.Instance) {
if instance == nil || instance.Namespace == "" {
return
}
if instance.Values == nil {
instance.Values = map[string]interface{}{}
}
instance.Values["namespace"] = instance.Namespace
setExistingStringValue(instance.Values, "namespaceOverride", instance.Namespace)
setExistingStringValue(instance.Values, "targetNamespace", instance.Namespace)
setExistingNestedStringValue(instance.Values, "global", "namespace", instance.Namespace)
setExistingNestedStringValue(instance.Values, "global", "namespaceOverride", instance.Namespace)
}
func setExistingStringValue(values map[string]interface{}, key, namespace string) {
if _, ok := values[key]; ok {
values[key] = namespace
}
}
func setExistingNestedStringValue(values map[string]interface{}, parent, key, namespace string) {
child, ok := values[parent].(map[string]interface{})
if !ok {
return
}
if _, ok := child[key]; ok {
child[key] = namespace
}
}
func (s *InstanceService) applyNamespacePolicy(ctx context.Context, principal *authz.Principal, cluster *entity.Cluster, instance *entity.Instance) error {
if principal.IsAdmin() {
if isProtectedSystemNamespace(instance.Namespace) {
return entity.ErrInvalidNamespace
}
return nil
}
if isReservedNamespace(instance.Namespace) {
return entity.ErrInvalidNamespace
}
if cluster.Visibility != authz.VisibilityPrivate || cluster.OwnerID != principal.UserID {
namespace := principal.Namespace
if namespace == "" {
namespace = entity.NamespaceForWorkspace(principal.WorkspaceName)
}
if s.bindingRepo != nil {
if binding, err := s.bindingRepo.Get(ctx, principal.WorkspaceID, cluster.ID); err == nil && binding != nil && binding.Namespace != "" {
namespace = binding.Namespace
}
}
instance.Namespace = namespace
return nil
}
if instance.Namespace == "" {
if cluster.DefaultNamespace != "" {
instance.Namespace = cluster.DefaultNamespace
} else if principal.Namespace != "" {
instance.Namespace = principal.Namespace
} else {
instance.Namespace = entity.NamespaceForWorkspace(principal.Username)
}
}
return nil
}
func (s *InstanceService) ensureTenantForInstance(ctx context.Context, principal *authz.Principal, cluster *entity.Cluster, instance *entity.Instance) error {
if principal.IsAdmin() || s.workspaceRepo == nil || s.tenantClient == nil {
return nil
}
workspace, err := s.workspaceRepo.GetByID(ctx, principal.WorkspaceID)
if err != nil {
return err
}
if workspace.Status == entity.WorkspaceSuspended {
return entity.ErrWorkspaceSuspended
}
binding := entity.NewTenantBinding(instance.Namespace)
binding.ServiceAccountName = workspace.K8sSAName
binding.ResourceQuotaHard = instanceResourceQuotaHard(workspace)
if err := s.tenantClient.EnsureTenant(ctx, cluster, binding); err != nil {
return err
}
if s.bindingRepo != nil {
_ = s.bindingRepo.Upsert(ctx, &entity.WorkspaceClusterBinding{
ID: uuid.New().String(),
WorkspaceID: workspace.ID,
ClusterID: cluster.ID,
Namespace: instance.Namespace,
ServiceAccount: workspace.K8sSAName,
QuotaCPU: workspace.QuotaCPU,
QuotaMemory: workspace.QuotaMemory,
QuotaGPU: workspace.QuotaGPU,
QuotaGPUMem: workspace.QuotaGPUMem,
Status: "active",
CreatedAt: time.Now(),
UpdatedAt: time.Now(),
})
}
return nil
}
func instanceResourceQuotaHard(workspace *entity.Workspace) corev1.ResourceList {
hard := corev1.ResourceList{}
addQuantity := func(name corev1.ResourceName, value string) {
value = normalizeStandardQuotaQuantity(value)
if value == "" {
return
}
if quantity, err := resource.ParseQuantity(value); err == nil {
hard[name] = quantity
}
}
addGPUMemoryQuantity := func(value string) {
value, err := normalizeGPUMemoryQuota(value)
if err != nil || value == "" {
return
}
if quantity, err := resource.ParseQuantity(value); err == nil {
hard[corev1.ResourceName("requests.nvidia.com/gpumem")] = quantity
}
}
if workspace == nil {
return hard
}
addQuantity(corev1.ResourceName("requests.cpu"), workspace.QuotaCPU)
addQuantity(corev1.ResourceName("requests.memory"), workspace.QuotaMemory)
addQuantity(corev1.ResourceName("requests.nvidia.com/gpu"), workspace.QuotaGPU)
addGPUMemoryQuantity(workspace.QuotaGPUMem)
return hard
}
func isReservedNamespace(namespace string) bool {
switch namespace {
case "default", "kube-system", "kube-public", "kube-node-lease":
return true
default:
return false
}
}
func isProtectedSystemNamespace(namespace string) bool {
switch namespace {
case "kube-system", "kube-public", "kube-node-lease":
return true
default:
return false
}
}
// executeAndSyncInstall 异步执行安装并监控状态
func (s *InstanceService) executeAndSyncInstall(ctx context.Context, instanceID string, cluster *entity.Cluster, registry *entity.Registry, instance *entity.Instance) {
// 执行 Helm 安装
@ -338,7 +613,7 @@ func (s *InstanceService) executeAndSyncRollback(ctx context.Context, instanceID
func (s *InstanceService) executeAndSyncUninstall(ctx context.Context, instanceID string, cluster *entity.Cluster, releaseName, namespace string) {
// 执行 Helm 卸载
err := s.helmClient.Uninstall(ctx, cluster, releaseName, namespace)
// 获取实例
instance, getErr := s.instanceRepo.GetByID(ctx, instanceID)
if getErr != nil {
@ -360,7 +635,7 @@ func (s *InstanceService) executeAndSyncUninstall(ctx context.Context, instanceI
// 卸载成功,标记为已卸载
instance.MarkSuccess(entity.StatusUninstalled, instance.Revision, "Instance uninstalled successfully")
_ = s.instanceRepo.Update(ctx, instance)
// 验证卸载是否完成:尝试获取状态,如果获取不到说明已卸载
time.Sleep(3 * time.Second)
_, statusErr := s.helmClient.GetStatus(ctx, cluster, releaseName, namespace)
@ -377,7 +652,7 @@ func (s *InstanceService) executeAndSyncUninstall(ctx context.Context, instanceI
// syncInstanceStatus 同步实例状态(定期检查 Helm 状态并更新数据库)
func (s *InstanceService) syncInstanceStatus(ctx context.Context, instanceID string, cluster *entity.Cluster, releaseName, namespace string, operation entity.InstanceOperation) {
maxAttempts := 30 // 最多尝试30次约5分钟
maxAttempts := 30 // 最多尝试30次约5分钟
interval := 10 * time.Second // 每10秒检查一次
for i := 0; i < maxAttempts; i++ {