fix: scale replicas in response, K8s metrics client, quota precheck, auth tests
- Add GetMetrics method to MetricsClient interface and implement cluster metrics API
- Add QuotaPrecheck service for validating resource quotas before deployment
- Add auth DTO with role/permission models and auth handler tests
- Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics
- Update workspace handler with GetWorkspace endpoint and shared-user list
- Fix monitoring handler to use correct service method name
- Add tail_lines fallback in instance handler for snake_case query params
- Update nginx config for SSE log streaming support (no buffering)
- Add comprehensive test coverage: auth_service_test, auth_handler_test, auth_dto_test, metrics_client_test, quota_precheck_test
- Update error messages for quota validation and instance operations
- ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit
- InstanceCard: correctly disable scale-minus when replicas <= 0
- SidebarLayout: add hover transition for sidebar items
- Update todo.md and lessons.md with latest fixes
This commit is contained in:
@ -4,7 +4,7 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
|
||||
"github.com/ocdp/cluster-service/internal/domain/entity"
|
||||
"github.com/ocdp/cluster-service/internal/domain/repository"
|
||||
)
|
||||
@ -12,38 +12,47 @@ import (
|
||||
// HelmClientMock Helm 客户端 Mock 实现
|
||||
type HelmClientMock struct {
|
||||
// Mock 数据存储
|
||||
releases map[string]map[string]*entity.Instance // clusterID -> releaseName -> instance
|
||||
history map[string]map[string][]*entity.ReleaseHistory // clusterID -> releaseName -> []history
|
||||
releases map[string]map[string]*entity.Instance // clusterID -> releaseName -> instance
|
||||
history map[string]map[string][]*entity.ReleaseHistory // clusterID -> releaseName -> []history
|
||||
estimates map[string]map[string]*repository.ResourceEstimate // clusterID -> releaseName -> estimate
|
||||
}
|
||||
|
||||
// NewHelmClientMock 创建 Mock 实现
|
||||
func NewHelmClientMock() repository.HelmClient {
|
||||
return &HelmClientMock{
|
||||
releases: make(map[string]map[string]*entity.Instance),
|
||||
history: make(map[string]map[string][]*entity.ReleaseHistory),
|
||||
releases: make(map[string]map[string]*entity.Instance),
|
||||
history: make(map[string]map[string][]*entity.ReleaseHistory),
|
||||
estimates: make(map[string]map[string]*repository.ResourceEstimate),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *HelmClientMock) SetResourceEstimate(clusterID, namespace, releaseName string, estimate *repository.ResourceEstimate) {
|
||||
if c.estimates[clusterID] == nil {
|
||||
c.estimates[clusterID] = make(map[string]*repository.ResourceEstimate)
|
||||
}
|
||||
c.estimates[clusterID][fmt.Sprintf("%s/%s", namespace, releaseName)] = estimate
|
||||
}
|
||||
|
||||
func (c *HelmClientMock) Install(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance) error {
|
||||
// 初始化集群数据
|
||||
if c.releases[cluster.ID] == nil {
|
||||
c.releases[cluster.ID] = make(map[string]*entity.Instance)
|
||||
c.history[cluster.ID] = make(map[string][]*entity.ReleaseHistory)
|
||||
}
|
||||
|
||||
|
||||
// 检查是否已存在
|
||||
key := fmt.Sprintf("%s/%s", instance.Namespace, instance.Name)
|
||||
if _, exists := c.releases[cluster.ID][key]; exists {
|
||||
return entity.ErrInstanceExists
|
||||
}
|
||||
|
||||
|
||||
// Mock 安装
|
||||
instance.Status = entity.StatusDeployed
|
||||
instance.Revision = 1
|
||||
instance.UpdatedAt = time.Now()
|
||||
|
||||
|
||||
c.releases[cluster.ID][key] = instance
|
||||
|
||||
|
||||
// 添加历史记录
|
||||
c.history[cluster.ID][key] = []*entity.ReleaseHistory{
|
||||
{
|
||||
@ -55,25 +64,25 @@ func (c *HelmClientMock) Install(ctx context.Context, cluster *entity.Cluster, i
|
||||
Description: "Install complete",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *HelmClientMock) Upgrade(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance) error {
|
||||
key := fmt.Sprintf("%s/%s", instance.Namespace, instance.Name)
|
||||
|
||||
|
||||
existing, exists := c.releases[cluster.ID][key]
|
||||
if !exists {
|
||||
return entity.ErrInstanceNotFound
|
||||
}
|
||||
|
||||
|
||||
// Mock 升级
|
||||
instance.Revision = existing.Revision + 1
|
||||
instance.Status = entity.StatusDeployed
|
||||
instance.UpdatedAt = time.Now()
|
||||
|
||||
|
||||
c.releases[cluster.ID][key] = instance
|
||||
|
||||
|
||||
// 添加历史记录
|
||||
history := &entity.ReleaseHistory{
|
||||
Revision: instance.Revision,
|
||||
@ -84,44 +93,44 @@ func (c *HelmClientMock) Upgrade(ctx context.Context, cluster *entity.Cluster, i
|
||||
Description: "Upgrade complete",
|
||||
}
|
||||
c.history[cluster.ID][key] = append(c.history[cluster.ID][key], history)
|
||||
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *HelmClientMock) Uninstall(ctx context.Context, cluster *entity.Cluster, releaseName, namespace string) error {
|
||||
key := fmt.Sprintf("%s/%s", namespace, releaseName)
|
||||
|
||||
|
||||
if _, exists := c.releases[cluster.ID][key]; !exists {
|
||||
return entity.ErrInstanceNotFound
|
||||
}
|
||||
|
||||
|
||||
// Mock 卸载
|
||||
delete(c.releases[cluster.ID], key)
|
||||
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *HelmClientMock) Rollback(ctx context.Context, cluster *entity.Cluster, releaseName, namespace string, revision int) error {
|
||||
key := fmt.Sprintf("%s/%s", namespace, releaseName)
|
||||
|
||||
|
||||
instance, exists := c.releases[cluster.ID][key]
|
||||
if !exists {
|
||||
return entity.ErrInstanceNotFound
|
||||
}
|
||||
|
||||
|
||||
// 检查历史记录是否存在
|
||||
histories := c.history[cluster.ID][key]
|
||||
if revision > len(histories) || revision < 1 {
|
||||
return fmt.Errorf("revision %d not found", revision)
|
||||
}
|
||||
|
||||
|
||||
// Mock 回滚
|
||||
instance.Revision = len(histories) + 1
|
||||
instance.Status = entity.StatusDeployed
|
||||
instance.UpdatedAt = time.Now()
|
||||
|
||||
|
||||
c.releases[cluster.ID][key] = instance
|
||||
|
||||
|
||||
// 添加回滚历史记录
|
||||
history := &entity.ReleaseHistory{
|
||||
Revision: instance.Revision,
|
||||
@ -132,33 +141,33 @@ func (c *HelmClientMock) Rollback(ctx context.Context, cluster *entity.Cluster,
|
||||
Description: fmt.Sprintf("Rollback to revision %d", revision),
|
||||
}
|
||||
c.history[cluster.ID][key] = append(c.history[cluster.ID][key], history)
|
||||
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *HelmClientMock) GetStatus(ctx context.Context, cluster *entity.Cluster, releaseName, namespace string) (*entity.Instance, error) {
|
||||
key := fmt.Sprintf("%s/%s", namespace, releaseName)
|
||||
|
||||
|
||||
instance, exists := c.releases[cluster.ID][key]
|
||||
if !exists {
|
||||
return nil, entity.ErrInstanceNotFound
|
||||
}
|
||||
|
||||
|
||||
return instance, nil
|
||||
}
|
||||
|
||||
func (c *HelmClientMock) GetHistory(ctx context.Context, cluster *entity.Cluster, releaseName, namespace string) ([]*entity.ReleaseHistory, error) {
|
||||
key := fmt.Sprintf("%s/%s", namespace, releaseName)
|
||||
|
||||
|
||||
if _, exists := c.releases[cluster.ID][key]; !exists {
|
||||
return nil, entity.ErrInstanceNotFound
|
||||
}
|
||||
|
||||
|
||||
histories := c.history[cluster.ID][key]
|
||||
if histories == nil {
|
||||
return []*entity.ReleaseHistory{}, nil
|
||||
}
|
||||
|
||||
|
||||
return histories, nil
|
||||
}
|
||||
|
||||
@ -167,7 +176,7 @@ func (c *HelmClientMock) List(ctx context.Context, cluster *entity.Cluster, name
|
||||
if clusterReleases == nil {
|
||||
return []*entity.Instance{}, nil
|
||||
}
|
||||
|
||||
|
||||
instances := make([]*entity.Instance, 0)
|
||||
for key, instance := range clusterReleases {
|
||||
// 如果指定了 namespace,只返回该 namespace 的
|
||||
@ -179,18 +188,18 @@ func (c *HelmClientMock) List(ctx context.Context, cluster *entity.Cluster, name
|
||||
}
|
||||
instances = append(instances, c.releases[cluster.ID][key])
|
||||
}
|
||||
|
||||
|
||||
return instances, nil
|
||||
}
|
||||
|
||||
func (c *HelmClientMock) GetValues(ctx context.Context, cluster *entity.Cluster, releaseName, namespace string) (map[string]interface{}, error) {
|
||||
key := fmt.Sprintf("%s/%s", namespace, releaseName)
|
||||
|
||||
|
||||
instance, exists := c.releases[cluster.ID][key]
|
||||
if !exists {
|
||||
return nil, entity.ErrInstanceNotFound
|
||||
}
|
||||
|
||||
|
||||
return instance.Values, nil
|
||||
}
|
||||
|
||||
@ -204,3 +213,16 @@ func (c *HelmClientMock) GetChartDefaultValues(chartPath string) (map[string]int
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *HelmClientMock) EstimateInstanceResources(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance) (*repository.ResourceEstimate, error) {
|
||||
clusterID := ""
|
||||
if cluster != nil {
|
||||
clusterID = cluster.ID
|
||||
}
|
||||
key := fmt.Sprintf("%s/%s", instance.Namespace, instance.Name)
|
||||
if c.estimates[clusterID] != nil {
|
||||
if estimate := c.estimates[clusterID][key]; estimate != nil {
|
||||
return estimate, nil
|
||||
}
|
||||
}
|
||||
return &repository.ResourceEstimate{}, nil
|
||||
}
|
||||
|
||||
@ -10,6 +10,7 @@ import (
|
||||
|
||||
"github.com/ocdp/cluster-service/internal/domain/entity"
|
||||
"github.com/ocdp/cluster-service/internal/domain/repository"
|
||||
domainservice "github.com/ocdp/cluster-service/internal/domain/service"
|
||||
"helm.sh/helm/v3/pkg/action"
|
||||
"helm.sh/helm/v3/pkg/chart/loader"
|
||||
"helm.sh/helm/v3/pkg/cli"
|
||||
@ -346,6 +347,41 @@ func (h *HelmClient) GetChartDefaultValues(chartPath string) (map[string]interfa
|
||||
return vals, nil
|
||||
}
|
||||
|
||||
func (h *HelmClient) EstimateInstanceResources(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance) (*repository.ResourceEstimate, error) {
|
||||
chartPath := fmt.Sprintf("/tmp/charts/%s-%s.tgz", instance.Chart, instance.Version)
|
||||
chart, err := loader.Load(chartPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load chart: %w", err)
|
||||
}
|
||||
actionConfig := new(action.Configuration)
|
||||
actionConfig.Log = func(format string, v ...interface{}) {}
|
||||
|
||||
install := action.NewInstall(actionConfig)
|
||||
install.ReleaseName = instance.Name
|
||||
if install.ReleaseName == "" {
|
||||
install.ReleaseName = "quota-precheck"
|
||||
}
|
||||
install.Namespace = instance.Namespace
|
||||
if install.Namespace == "" {
|
||||
install.Namespace = "default"
|
||||
}
|
||||
install.DryRun = true
|
||||
install.DryRunOption = "client"
|
||||
install.ClientOnly = true
|
||||
install.Replace = true
|
||||
install.SkipSchemaValidation = true
|
||||
|
||||
values := instance.Values
|
||||
if values == nil {
|
||||
values = map[string]interface{}{}
|
||||
}
|
||||
release, err := install.RunWithContext(ctx, chart, values)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to render chart for quota estimate: %w", err)
|
||||
}
|
||||
return domainservice.EstimateRenderedManifestResources(release.Manifest)
|
||||
}
|
||||
|
||||
// convertReleaseToInstance 转换 Helm Release 为 Instance
|
||||
func (h *HelmClient) convertReleaseToInstance(rel *release.Release) *entity.Instance {
|
||||
return &entity.Instance{
|
||||
|
||||
@ -63,7 +63,7 @@ func (c *MetricsClient) GetClusterMetrics(ctx context.Context, clusterID string)
|
||||
|
||||
// 计算集群级别汇总
|
||||
metrics := c.aggregateClusterMetrics(cluster, nodes.Items, pods.Items, nodeMetrics)
|
||||
|
||||
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
@ -87,6 +87,37 @@ func (c *MetricsClient) GetNodeMetrics(ctx context.Context, clusterID string) ([
|
||||
return c.getNodeMetricsData(ctx, clientset, metricsClient, nodes.Items)
|
||||
}
|
||||
|
||||
// GetPodResourceAllocations returns Kubernetes Pod requests/limits without
|
||||
// inventing utilization values. GPU memory is treated as vendor integer MB.
|
||||
func (c *MetricsClient) GetPodResourceAllocations(ctx context.Context, clusterID string) ([]*entity.PodResourceAllocation, error) {
|
||||
cluster, err := c.clusterRepo.GetByID(ctx, clusterID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get cluster: %w", err)
|
||||
}
|
||||
|
||||
clientset, _, err := c.createK8sClients(cluster)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create k8s client: %w", err)
|
||||
}
|
||||
|
||||
pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to list pods: %w", err)
|
||||
}
|
||||
|
||||
result := make([]*entity.PodResourceAllocation, 0, len(pods.Items))
|
||||
for _, pod := range pods.Items {
|
||||
result = append(result, &entity.PodResourceAllocation{
|
||||
ClusterID: clusterID,
|
||||
Namespace: pod.Namespace,
|
||||
PodName: pod.Name,
|
||||
InstanceName: inferHelmReleaseName(pod.Labels),
|
||||
Allocation: podResourceAllocation(&pod),
|
||||
})
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// createK8sClients 创建 Kubernetes 客户端
|
||||
func (c *MetricsClient) createK8sClients(cluster *entity.Cluster) (*kubernetes.Clientset, *metricsv.Clientset, error) {
|
||||
config, err := clientcmd.RESTConfigFromKubeConfig([]byte(cluster.GetKubeConfig()))
|
||||
@ -127,14 +158,14 @@ func (c *MetricsClient) getNodeMetricsData(
|
||||
|
||||
for _, node := range nodes {
|
||||
nodeMetric := &entity.NodeMetrics{
|
||||
NodeName: node.Name,
|
||||
Status: getNodeStatus(&node),
|
||||
Role: getNodeRole(&node),
|
||||
Age: getNodeAge(&node),
|
||||
OSImage: node.Status.NodeInfo.OSImage,
|
||||
KernelVersion: node.Status.NodeInfo.KernelVersion,
|
||||
ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
|
||||
KubeletVersion: node.Status.NodeInfo.KubeletVersion,
|
||||
NodeName: node.Name,
|
||||
Status: getNodeStatus(&node),
|
||||
Role: getNodeRole(&node),
|
||||
Age: getNodeAge(&node),
|
||||
OSImage: node.Status.NodeInfo.OSImage,
|
||||
KernelVersion: node.Status.NodeInfo.KernelVersion,
|
||||
ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
|
||||
KubeletVersion: node.Status.NodeInfo.KubeletVersion,
|
||||
}
|
||||
|
||||
// CPU
|
||||
@ -213,7 +244,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
|
||||
var totalCPU, totalMem, usedCPU, usedMem int64
|
||||
var totalGPU, usedGPU int
|
||||
healthyNodes := 0
|
||||
|
||||
|
||||
// 单机最大值
|
||||
var maxNodeCPU, maxNodeMem int64
|
||||
var maxNodeGPU int
|
||||
@ -251,7 +282,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
|
||||
// 从 nodeMetrics 获取使用情况
|
||||
if i < len(nodeMetrics) && nodeMetrics[i] != nil {
|
||||
metrics.Nodes = append(metrics.Nodes, *nodeMetrics[i])
|
||||
|
||||
|
||||
// 更新单机最大使用率
|
||||
if nodeMetrics[i].CPUPercent > maxNodeCPUUsage {
|
||||
maxNodeCPUUsage = nodeMetrics[i].CPUPercent
|
||||
@ -274,7 +305,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
|
||||
metrics.TotalCPU = fmt.Sprintf("%.2f cores", float64(totalCPU)/1000.0)
|
||||
metrics.TotalMemory = formatBytes(totalMem)
|
||||
metrics.TotalGPU = totalGPU
|
||||
|
||||
|
||||
// 格式化单机最大值
|
||||
metrics.MaxNodeCPU = fmt.Sprintf("%.2f cores", float64(maxNodeCPU)/1000.0)
|
||||
metrics.MaxNodeMemory = formatBytes(maxNodeMem)
|
||||
@ -292,7 +323,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
|
||||
usedMem += int64(nm.MemoryPercent * float64(totalMem) / 100.0)
|
||||
usedGPU += nm.GPUUsage
|
||||
}
|
||||
|
||||
|
||||
if totalCPU > 0 {
|
||||
metrics.CPUUsage = float64(usedCPU) / float64(totalCPU) * 100
|
||||
}
|
||||
@ -302,7 +333,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
|
||||
if totalGPU > 0 {
|
||||
metrics.GPUUsage = float64(usedGPU) / float64(totalGPU) * 100
|
||||
}
|
||||
|
||||
|
||||
metrics.UsedCPU = fmt.Sprintf("%.2f cores", float64(usedCPU)/1000.0)
|
||||
metrics.UsedMemory = formatBytes(usedMem)
|
||||
metrics.UsedGPU = usedGPU
|
||||
@ -348,7 +379,7 @@ func getNodeAge(node *corev1.Node) string {
|
||||
age := time.Since(node.CreationTimestamp.Time)
|
||||
days := int(age.Hours() / 24)
|
||||
hours := int(age.Hours()) % 24
|
||||
|
||||
|
||||
if days > 0 {
|
||||
return fmt.Sprintf("%dd %dh", days, hours)
|
||||
}
|
||||
@ -368,3 +399,110 @@ func formatBytes(bytes int64) string {
|
||||
return fmt.Sprintf("%.1f %ciB", float64(bytes)/float64(div), "KMGTPE"[exp])
|
||||
}
|
||||
|
||||
// inferHelmReleaseName returns the first non-empty value found among a fixed,
// ordered list of label keys, or "" when labels is nil or none of the keys
// carries a non-empty value.
func inferHelmReleaseName(labels map[string]string) string {
	if labels == nil {
		return ""
	}

	// Checked in order; the first key with a non-empty value wins.
	candidates := [...]string{
		"app.kubernetes.io/instance",
		"release",
		"helm.sh/release",
		"meta.helm.sh/release-name",
		"app",
	}
	for i := 0; i < len(candidates); i++ {
		name, present := labels[candidates[i]]
		if !present || name == "" {
			continue
		}
		return name
	}
	return ""
}
|
||||
|
||||
func podResourceAllocation(pod *corev1.Pod) entity.ResourceAllocation {
|
||||
if pod == nil {
|
||||
return entity.ResourceAllocation{}
|
||||
}
|
||||
sum := entity.ResourceAllocation{}
|
||||
for _, container := range pod.Spec.Containers {
|
||||
sum = addContainerAllocation(sum, container)
|
||||
}
|
||||
initMax := entity.ResourceAllocation{}
|
||||
for _, container := range pod.Spec.InitContainers {
|
||||
initMax = maxAllocation(initMax, containerAllocation(container))
|
||||
}
|
||||
return maxAllocation(sum, initMax)
|
||||
}
|
||||
|
||||
func addContainerAllocation(base entity.ResourceAllocation, container corev1.Container) entity.ResourceAllocation {
|
||||
return addAllocation(base, containerAllocation(container))
|
||||
}
|
||||
|
||||
func containerAllocation(container corev1.Container) entity.ResourceAllocation {
|
||||
requests := container.Resources.Requests
|
||||
limits := container.Resources.Limits
|
||||
return entity.ResourceAllocation{
|
||||
CPURequestsMilli: quantityMilliValue(requests, corev1.ResourceCPU),
|
||||
CPULimitsMilli: quantityMilliValue(limits, corev1.ResourceCPU),
|
||||
MemoryRequestsBytes: quantityValue(requests, corev1.ResourceMemory),
|
||||
MemoryLimitsBytes: quantityValue(limits, corev1.ResourceMemory),
|
||||
GPURequests: quantityValue(requests, corev1.ResourceName("nvidia.com/gpu")),
|
||||
GPULimits: quantityValue(limits, corev1.ResourceName("nvidia.com/gpu")),
|
||||
GPUMemoryRequestsMB: quantityValueAny(requests, corev1.ResourceName("nvidia.com/gpumem"), corev1.ResourceName("requests.nvidia.com/gpumem")),
|
||||
GPUMemoryLimitsMB: quantityValueAny(limits, corev1.ResourceName("nvidia.com/gpumem"), corev1.ResourceName("requests.nvidia.com/gpumem")),
|
||||
}
|
||||
}
|
||||
|
||||
func addAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
|
||||
return entity.ResourceAllocation{
|
||||
CPURequestsMilli: left.CPURequestsMilli + right.CPURequestsMilli,
|
||||
CPULimitsMilli: left.CPULimitsMilli + right.CPULimitsMilli,
|
||||
MemoryRequestsBytes: left.MemoryRequestsBytes + right.MemoryRequestsBytes,
|
||||
MemoryLimitsBytes: left.MemoryLimitsBytes + right.MemoryLimitsBytes,
|
||||
GPURequests: left.GPURequests + right.GPURequests,
|
||||
GPULimits: left.GPULimits + right.GPULimits,
|
||||
GPUMemoryRequestsMB: left.GPUMemoryRequestsMB + right.GPUMemoryRequestsMB,
|
||||
GPUMemoryLimitsMB: left.GPUMemoryLimitsMB + right.GPUMemoryLimitsMB,
|
||||
}
|
||||
}
|
||||
|
||||
func maxAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
|
||||
return entity.ResourceAllocation{
|
||||
CPURequestsMilli: maxInt64(left.CPURequestsMilli, right.CPURequestsMilli),
|
||||
CPULimitsMilli: maxInt64(left.CPULimitsMilli, right.CPULimitsMilli),
|
||||
MemoryRequestsBytes: maxInt64(left.MemoryRequestsBytes, right.MemoryRequestsBytes),
|
||||
MemoryLimitsBytes: maxInt64(left.MemoryLimitsBytes, right.MemoryLimitsBytes),
|
||||
GPURequests: maxInt64(left.GPURequests, right.GPURequests),
|
||||
GPULimits: maxInt64(left.GPULimits, right.GPULimits),
|
||||
GPUMemoryRequestsMB: maxInt64(left.GPUMemoryRequestsMB, right.GPUMemoryRequestsMB),
|
||||
GPUMemoryLimitsMB: maxInt64(left.GPUMemoryLimitsMB, right.GPUMemoryLimitsMB),
|
||||
}
|
||||
}
|
||||
|
||||
func quantityMilliValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
|
||||
if quantity, ok := resources[name]; ok {
|
||||
return quantity.MilliValue()
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func quantityValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
|
||||
if quantity, ok := resources[name]; ok {
|
||||
return quantity.Value()
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func quantityValueAny(resources corev1.ResourceList, names ...corev1.ResourceName) int64 {
|
||||
for _, name := range names {
|
||||
if quantity, ok := resources[name]; ok {
|
||||
return quantity.Value()
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// maxInt64 returns the larger of two int64 values.
func maxInt64(left, right int64) int64 {
	if right >= left {
		return right
	}
	return left
}
|
||||
|
||||
29
backend/internal/adapter/output/k8s/metrics_client_test.go
Normal file
29
backend/internal/adapter/output/k8s/metrics_client_test.go
Normal file
@ -0,0 +1,29 @@
|
||||
package k8s
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
)
|
||||
|
||||
func TestContainerAllocationCountsVendorGPUMemoryKey(t *testing.T) {
|
||||
container := corev1.Container{
|
||||
Resources: corev1.ResourceRequirements{
|
||||
Requests: corev1.ResourceList{
|
||||
corev1.ResourceName("nvidia.com/gpumem"): resource.MustParse("10000"),
|
||||
},
|
||||
Limits: corev1.ResourceList{
|
||||
corev1.ResourceName("nvidia.com/gpumem"): resource.MustParse("12000"),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
allocation := containerAllocation(container)
|
||||
if allocation.GPUMemoryRequestsMB != 10000 {
|
||||
t.Fatalf("expected GPU memory requests 10000 MB, got %d", allocation.GPUMemoryRequestsMB)
|
||||
}
|
||||
if allocation.GPUMemoryLimitsMB != 12000 {
|
||||
t.Fatalf("expected GPU memory limits 12000 MB, got %d", allocation.GPUMemoryLimitsMB)
|
||||
}
|
||||
}
|
||||
@ -106,6 +106,25 @@ func (c *TenantClient) IssueKubeconfig(ctx context.Context, cluster *entity.Clus
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *TenantClient) GetResourceQuotaUsage(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) (*repository.ResourceQuotaUsage, error) {
|
||||
binding = binding.WithDefaults()
|
||||
if err := binding.Validate(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
clientset, _, err := c.clientsetForCluster(cluster)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
quota, err := clientset.CoreV1().ResourceQuotas(binding.Namespace).Get(ctx, binding.ResourceQuotaName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get tenant resource quota usage: %w", err)
|
||||
}
|
||||
return &repository.ResourceQuotaUsage{
|
||||
Hard: resourceVectorFromList(quota.Status.Hard),
|
||||
Used: resourceVectorFromList(quota.Status.Used),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// SuspendTenant revokes tenant API access by deleting only the RoleBinding.
|
||||
func (c *TenantClient) SuspendTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
|
||||
binding = binding.WithDefaults()
|
||||
@ -128,6 +147,82 @@ func (c *TenantClient) SuspendTenant(ctx context.Context, cluster *entity.Cluste
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *TenantClient) DeleteTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
|
||||
binding = binding.WithDefaults()
|
||||
if err := binding.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if isProtectedTenantNamespace(binding.Namespace) {
|
||||
return entity.ErrProtectedNamespace
|
||||
}
|
||||
clientset, _, err := c.clientsetForCluster(cluster)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := deleteIgnoringNotFound(ctx, func() error {
|
||||
return clientset.RbacV1().RoleBindings(binding.Namespace).Delete(ctx, binding.RoleBindingName, metav1.DeleteOptions{})
|
||||
}); err != nil {
|
||||
return fmt.Errorf("failed to delete tenant role binding: %w", err)
|
||||
}
|
||||
if err := deleteIgnoringNotFound(ctx, func() error {
|
||||
return clientset.CoreV1().ResourceQuotas(binding.Namespace).Delete(ctx, binding.ResourceQuotaName, metav1.DeleteOptions{})
|
||||
}); err != nil {
|
||||
return fmt.Errorf("failed to delete tenant resource quota: %w", err)
|
||||
}
|
||||
if err := deleteIgnoringNotFound(ctx, func() error {
|
||||
return clientset.CoreV1().ServiceAccounts(binding.Namespace).Delete(ctx, binding.ServiceAccountName, metav1.DeleteOptions{})
|
||||
}); err != nil {
|
||||
return fmt.Errorf("failed to delete tenant service account: %w", err)
|
||||
}
|
||||
namespace, err := clientset.CoreV1().Namespaces().Get(ctx, binding.Namespace, metav1.GetOptions{})
|
||||
if apierrors.IsNotFound(err) {
|
||||
return nil
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get tenant namespace before deletion: %w", err)
|
||||
}
|
||||
if namespace.Labels["ocdp.io/managed-by"] != "ocdp" || namespace.Labels["ocdp.io/tenant"] != binding.Namespace {
|
||||
return fmt.Errorf("refusing to delete unmanaged namespace %q", binding.Namespace)
|
||||
}
|
||||
if err := deleteIgnoringNotFound(ctx, func() error {
|
||||
return clientset.CoreV1().Namespaces().Delete(ctx, binding.Namespace, metav1.DeleteOptions{})
|
||||
}); err != nil {
|
||||
return fmt.Errorf("failed to delete tenant namespace: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func deleteIgnoringNotFound(ctx context.Context, deleteFn func() error) error {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
err := deleteFn()
|
||||
if apierrors.IsNotFound(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// isProtectedTenantNamespace reports whether the namespace, after trimming
// surrounding whitespace, is empty or one of default, kube-system,
// kube-public or kube-node-lease.
func isProtectedTenantNamespace(namespace string) bool {
	trimmed := strings.TrimSpace(namespace)
	for _, protected := range [...]string{"", "default", "kube-system", "kube-public", "kube-node-lease"} {
		if trimmed == protected {
			return true
		}
	}
	return false
}
|
||||
|
||||
func resourceVectorFromList(values corev1.ResourceList) repository.ResourceVector {
|
||||
gpu := values[corev1.ResourceName("requests.nvidia.com/gpu")]
|
||||
gpuMem := values[corev1.ResourceName("requests.nvidia.com/gpumem")]
|
||||
return repository.ResourceVector{
|
||||
CPU: values[corev1.ResourceName("requests.cpu")],
|
||||
Memory: values[corev1.ResourceName("requests.memory")],
|
||||
GPU: gpu.Value(),
|
||||
GPUMemoryMB: gpuMem.Value(),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *TenantClient) clientsetForCluster(cluster *entity.Cluster) (kubernetes.Interface, *rest.Config, error) {
|
||||
if c.clientset != nil {
|
||||
config := &rest.Config{Host: "https://kubernetes.default.svc"}
|
||||
|
||||
@ -2,6 +2,7 @@ package k8s
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
@ -58,7 +59,7 @@ func TestTenantClientEnsureTenantUpdatesExistingResources(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
binding := tenantBinding()
|
||||
clientset := fake.NewSimpleClientset(
|
||||
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace}},
|
||||
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace, Labels: binding.Labels}},
|
||||
&corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: binding.ServiceAccountName, Namespace: binding.Namespace}},
|
||||
&rbacv1.RoleBinding{
|
||||
ObjectMeta: metav1.ObjectMeta{Name: binding.RoleBindingName, Namespace: binding.Namespace},
|
||||
@ -100,7 +101,7 @@ func TestTenantClientSuspendTenantDeletesOnlyRoleBinding(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
binding := tenantBinding()
|
||||
clientset := fake.NewSimpleClientset(
|
||||
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace}},
|
||||
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace, Labels: binding.Labels}},
|
||||
&corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: binding.ServiceAccountName, Namespace: binding.Namespace}},
|
||||
desiredRoleBinding(binding),
|
||||
)
|
||||
@ -117,6 +118,47 @@ func TestTenantClientSuspendTenantDeletesOnlyRoleBinding(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTenantClientDeleteTenantDeletesTenantResources(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
binding := tenantBinding()
|
||||
clientset := fake.NewSimpleClientset(
|
||||
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace, Labels: binding.Labels}},
|
||||
&corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: binding.ServiceAccountName, Namespace: binding.Namespace}},
|
||||
desiredRoleBinding(binding),
|
||||
&corev1.ResourceQuota{ObjectMeta: metav1.ObjectMeta{Name: binding.ResourceQuotaName, Namespace: binding.Namespace}},
|
||||
)
|
||||
client := NewTenantClientForClientset(clientset)
|
||||
|
||||
if err := client.DeleteTenant(ctx, nil, binding); err != nil {
|
||||
t.Fatalf("DeleteTenant returned error: %v", err)
|
||||
}
|
||||
if _, err := clientset.RbacV1().RoleBindings(binding.Namespace).Get(ctx, binding.RoleBindingName, metav1.GetOptions{}); !apierrors.IsNotFound(err) {
|
||||
t.Fatalf("expected role binding deleted, got %v", err)
|
||||
}
|
||||
if _, err := clientset.CoreV1().ResourceQuotas(binding.Namespace).Get(ctx, binding.ResourceQuotaName, metav1.GetOptions{}); !apierrors.IsNotFound(err) {
|
||||
t.Fatalf("expected resource quota deleted, got %v", err)
|
||||
}
|
||||
if _, err := clientset.CoreV1().ServiceAccounts(binding.Namespace).Get(ctx, binding.ServiceAccountName, metav1.GetOptions{}); !apierrors.IsNotFound(err) {
|
||||
t.Fatalf("expected service account deleted, got %v", err)
|
||||
}
|
||||
if _, err := clientset.CoreV1().Namespaces().Get(ctx, binding.Namespace, metav1.GetOptions{}); !apierrors.IsNotFound(err) {
|
||||
t.Fatalf("expected namespace deleted, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTenantClientDeleteTenantRejectsProtectedNamespace(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
client := NewTenantClientForClientset(fake.NewSimpleClientset(
|
||||
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "default"}},
|
||||
))
|
||||
binding := entity.NewTenantBinding("default")
|
||||
|
||||
err := client.DeleteTenant(ctx, nil, binding)
|
||||
if !errors.Is(err, entity.ErrProtectedNamespace) {
|
||||
t.Fatalf("expected protected namespace error, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTenantClientIssueKubeconfigCapsTokenTTL(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
binding := tenantBinding()
|
||||
|
||||
@ -31,6 +31,28 @@ func (c *MockTenantClient) IssueKubeconfig(ctx context.Context, cluster *entity.
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *MockTenantClient) GetResourceQuotaUsage(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) (*repository.ResourceQuotaUsage, error) {
|
||||
if err := binding.Validate(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &repository.ResourceQuotaUsage{
|
||||
Hard: resourceVectorFromList(binding.ResourceQuotaHard),
|
||||
Used: repository.ResourceVector{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *MockTenantClient) SuspendTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
|
||||
return binding.Validate()
|
||||
}
|
||||
|
||||
func (c *MockTenantClient) DeleteTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
|
||||
if err := binding.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
switch binding.Namespace {
|
||||
case "", "default", "kube-system", "kube-public", "kube-node-lease":
|
||||
return entity.ErrProtectedNamespace
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
@ -72,6 +72,16 @@ func (r *WorkspaceRepositoryMock) Update(ctx context.Context, workspace *entity.
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *WorkspaceRepositoryMock) Delete(ctx context.Context, id string) error {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
if _, ok := r.workspaces[id]; !ok {
|
||||
return entity.ErrWorkspaceNotFound
|
||||
}
|
||||
delete(r.workspaces, id)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *WorkspaceRepositoryMock) List(ctx context.Context) ([]*entity.Workspace, error) {
|
||||
r.mu.RLock()
|
||||
defer r.mu.RUnlock()
|
||||
@ -118,6 +128,20 @@ func (r *WorkspaceClusterBindingRepositoryMock) Get(ctx context.Context, workspa
|
||||
return ©, nil
|
||||
}
|
||||
|
||||
func (r *WorkspaceClusterBindingRepositoryMock) ListByWorkspace(ctx context.Context, workspaceID string) ([]*entity.WorkspaceClusterBinding, error) {
|
||||
r.mu.RLock()
|
||||
defer r.mu.RUnlock()
|
||||
result := make([]*entity.WorkspaceClusterBinding, 0)
|
||||
for _, binding := range r.bindings {
|
||||
if binding.WorkspaceID != workspaceID {
|
||||
continue
|
||||
}
|
||||
copy := *binding
|
||||
result = append(result, ©)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (r *WorkspaceClusterBindingRepositoryMock) Delete(ctx context.Context, workspaceID, clusterID string) error {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
|
||||
@ -27,8 +27,9 @@ func (r *WorkspaceRepository) Create(ctx context.Context, workspace *entity.Work
|
||||
query := `
|
||||
INSERT INTO workspaces (id, name, status, k8s_namespace, k8s_sa_name, default_cluster_id, quota_cpu, quota_memory, quota_gpu, quota_gpu_memory, created_by, created_at, updated_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
|
||||
ON CONFLICT (name) DO NOTHING
|
||||
`
|
||||
_, err := r.db.conn.ExecContext(ctx, query,
|
||||
result, err := r.db.conn.ExecContext(ctx, query,
|
||||
workspace.ID,
|
||||
workspace.Name,
|
||||
workspace.Status,
|
||||
@ -46,6 +47,13 @@ func (r *WorkspaceRepository) Create(ctx context.Context, workspace *entity.Work
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create workspace: %w", err)
|
||||
}
|
||||
rows, err := result.RowsAffected()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get affected rows: %w", err)
|
||||
}
|
||||
if rows == 0 {
|
||||
return entity.ErrWorkspaceExists
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -132,6 +140,21 @@ func (r *WorkspaceRepository) Update(ctx context.Context, workspace *entity.Work
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *WorkspaceRepository) Delete(ctx context.Context, id string) error {
|
||||
result, err := r.db.conn.ExecContext(ctx, `DELETE FROM workspaces WHERE id = $1`, id)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to delete workspace: %w", err)
|
||||
}
|
||||
rows, err := result.RowsAffected()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get affected rows: %w", err)
|
||||
}
|
||||
if rows == 0 {
|
||||
return entity.ErrWorkspaceNotFound
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *WorkspaceRepository) List(ctx context.Context) ([]*entity.Workspace, error) {
|
||||
query := `
|
||||
SELECT id, name, status, k8s_namespace, k8s_sa_name, default_cluster_id, quota_cpu, quota_memory, quota_gpu, quota_gpu_memory, created_by, created_at, updated_at
|
||||
@ -256,6 +279,42 @@ func (r *WorkspaceClusterBindingRepository) Get(ctx context.Context, workspaceID
|
||||
return binding, nil
|
||||
}
|
||||
|
||||
func (r *WorkspaceClusterBindingRepository) ListByWorkspace(ctx context.Context, workspaceID string) ([]*entity.WorkspaceClusterBinding, error) {
|
||||
query := `
|
||||
SELECT id, workspace_id, cluster_id, namespace, service_account, quota_cpu, quota_memory, quota_gpu, quota_gpu_memory, status, created_at, updated_at
|
||||
FROM workspace_cluster_bindings
|
||||
WHERE workspace_id = $1
|
||||
ORDER BY created_at ASC
|
||||
`
|
||||
rows, err := r.db.conn.QueryContext(ctx, query, workspaceID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to list workspace cluster bindings: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
bindings := make([]*entity.WorkspaceClusterBinding, 0)
|
||||
for rows.Next() {
|
||||
binding := &entity.WorkspaceClusterBinding{}
|
||||
if err := rows.Scan(
|
||||
&binding.ID,
|
||||
&binding.WorkspaceID,
|
||||
&binding.ClusterID,
|
||||
&binding.Namespace,
|
||||
&binding.ServiceAccount,
|
||||
&binding.QuotaCPU,
|
||||
&binding.QuotaMemory,
|
||||
&binding.QuotaGPU,
|
||||
&binding.QuotaGPUMem,
|
||||
&binding.Status,
|
||||
&binding.CreatedAt,
|
||||
&binding.UpdatedAt,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("failed to scan workspace cluster binding: %w", err)
|
||||
}
|
||||
bindings = append(bindings, binding)
|
||||
}
|
||||
return bindings, rows.Err()
|
||||
}
|
||||
|
||||
func (r *WorkspaceClusterBindingRepository) Delete(ctx context.Context, workspaceID, clusterID string) error {
|
||||
_, err := r.db.conn.ExecContext(ctx, `DELETE FROM workspace_cluster_bindings WHERE workspace_id = $1 AND cluster_id = $2`, workspaceID, clusterID)
|
||||
return err
|
||||
|
||||
Reference in New Issue
Block a user