fix: scale replicas in response, K8s metrics client, quota precheck, auth tests

- Add GetMetrics method to MetricsClient interface and implement cluster metrics API
- Add QuotaPrecheck service for validating resource quotas before deployment
- Add auth DTO with role/permission models and auth handler tests
- Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics
- Update workspace handler with GetWorkspace endpoint and shared-user list
- Fix monitoring handler to use correct service method name
- Add tail_lines fallback in instance handler for snake_case query params
- Update nginx config for SSE log streaming support (no buffering)
- Add comprehensive test coverage: auth_service_test, auth_handler_test,
  auth_dto_test, metrics_client_test, quota_precheck_test
- Update error messages for quota validation and instance operations
- ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit
- InstanceCard: correctly disable scale-minus when replicas <= 0
- SidebarLayout: add hover transition for sidebar items
- Update todo.md and lessons.md with latest fixes
This commit is contained in:
Ivan087
2026-05-20 16:56:29 +08:00
parent 8f90cf0f0d
commit 33ddaf97db
59 changed files with 4805 additions and 457 deletions

View File

@ -63,7 +63,7 @@ func (c *MetricsClient) GetClusterMetrics(ctx context.Context, clusterID string)
// 计算集群级别汇总
metrics := c.aggregateClusterMetrics(cluster, nodes.Items, pods.Items, nodeMetrics)
return metrics, nil
}
@ -87,6 +87,37 @@ func (c *MetricsClient) GetNodeMetrics(ctx context.Context, clusterID string) ([
return c.getNodeMetricsData(ctx, clientset, metricsClient, nodes.Items)
}
// GetPodResourceAllocations returns Kubernetes Pod requests/limits without
// inventing utilization values. GPU memory is treated as vendor integer MB.
func (c *MetricsClient) GetPodResourceAllocations(ctx context.Context, clusterID string) ([]*entity.PodResourceAllocation, error) {
cluster, err := c.clusterRepo.GetByID(ctx, clusterID)
if err != nil {
return nil, fmt.Errorf("failed to get cluster: %w", err)
}
clientset, _, err := c.createK8sClients(cluster)
if err != nil {
return nil, fmt.Errorf("failed to create k8s client: %w", err)
}
pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
if err != nil {
return nil, fmt.Errorf("failed to list pods: %w", err)
}
result := make([]*entity.PodResourceAllocation, 0, len(pods.Items))
for _, pod := range pods.Items {
result = append(result, &entity.PodResourceAllocation{
ClusterID: clusterID,
Namespace: pod.Namespace,
PodName: pod.Name,
InstanceName: inferHelmReleaseName(pod.Labels),
Allocation: podResourceAllocation(&pod),
})
}
return result, nil
}
// createK8sClients 创建 Kubernetes 客户端
func (c *MetricsClient) createK8sClients(cluster *entity.Cluster) (*kubernetes.Clientset, *metricsv.Clientset, error) {
config, err := clientcmd.RESTConfigFromKubeConfig([]byte(cluster.GetKubeConfig()))
@ -127,14 +158,14 @@ func (c *MetricsClient) getNodeMetricsData(
for _, node := range nodes {
nodeMetric := &entity.NodeMetrics{
NodeName: node.Name,
Status: getNodeStatus(&node),
Role: getNodeRole(&node),
Age: getNodeAge(&node),
OSImage: node.Status.NodeInfo.OSImage,
KernelVersion: node.Status.NodeInfo.KernelVersion,
ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
KubeletVersion: node.Status.NodeInfo.KubeletVersion,
NodeName: node.Name,
Status: getNodeStatus(&node),
Role: getNodeRole(&node),
Age: getNodeAge(&node),
OSImage: node.Status.NodeInfo.OSImage,
KernelVersion: node.Status.NodeInfo.KernelVersion,
ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
KubeletVersion: node.Status.NodeInfo.KubeletVersion,
}
// CPU
@ -213,7 +244,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
var totalCPU, totalMem, usedCPU, usedMem int64
var totalGPU, usedGPU int
healthyNodes := 0
// 单机最大值
var maxNodeCPU, maxNodeMem int64
var maxNodeGPU int
@ -251,7 +282,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
// 从 nodeMetrics 获取使用情况
if i < len(nodeMetrics) && nodeMetrics[i] != nil {
metrics.Nodes = append(metrics.Nodes, *nodeMetrics[i])
// 更新单机最大使用率
if nodeMetrics[i].CPUPercent > maxNodeCPUUsage {
maxNodeCPUUsage = nodeMetrics[i].CPUPercent
@ -274,7 +305,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
metrics.TotalCPU = fmt.Sprintf("%.2f cores", float64(totalCPU)/1000.0)
metrics.TotalMemory = formatBytes(totalMem)
metrics.TotalGPU = totalGPU
// 格式化单机最大值
metrics.MaxNodeCPU = fmt.Sprintf("%.2f cores", float64(maxNodeCPU)/1000.0)
metrics.MaxNodeMemory = formatBytes(maxNodeMem)
@ -292,7 +323,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
usedMem += int64(nm.MemoryPercent * float64(totalMem) / 100.0)
usedGPU += nm.GPUUsage
}
if totalCPU > 0 {
metrics.CPUUsage = float64(usedCPU) / float64(totalCPU) * 100
}
@ -302,7 +333,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
if totalGPU > 0 {
metrics.GPUUsage = float64(usedGPU) / float64(totalGPU) * 100
}
metrics.UsedCPU = fmt.Sprintf("%.2f cores", float64(usedCPU)/1000.0)
metrics.UsedMemory = formatBytes(usedMem)
metrics.UsedGPU = usedGPU
@ -348,7 +379,7 @@ func getNodeAge(node *corev1.Node) string {
age := time.Since(node.CreationTimestamp.Time)
days := int(age.Hours() / 24)
hours := int(age.Hours()) % 24
if days > 0 {
return fmt.Sprintf("%dd %dh", days, hours)
}
@ -368,3 +399,110 @@ func formatBytes(bytes int64) string {
return fmt.Sprintf("%.1f %ciB", float64(bytes)/float64(div), "KMGTPE"[exp])
}
func inferHelmReleaseName(labels map[string]string) string {
if labels == nil {
return ""
}
for _, key := range []string{
"app.kubernetes.io/instance",
"release",
"helm.sh/release",
"meta.helm.sh/release-name",
"app",
} {
if value := labels[key]; value != "" {
return value
}
}
return ""
}
func podResourceAllocation(pod *corev1.Pod) entity.ResourceAllocation {
if pod == nil {
return entity.ResourceAllocation{}
}
sum := entity.ResourceAllocation{}
for _, container := range pod.Spec.Containers {
sum = addContainerAllocation(sum, container)
}
initMax := entity.ResourceAllocation{}
for _, container := range pod.Spec.InitContainers {
initMax = maxAllocation(initMax, containerAllocation(container))
}
return maxAllocation(sum, initMax)
}
func addContainerAllocation(base entity.ResourceAllocation, container corev1.Container) entity.ResourceAllocation {
return addAllocation(base, containerAllocation(container))
}
func containerAllocation(container corev1.Container) entity.ResourceAllocation {
requests := container.Resources.Requests
limits := container.Resources.Limits
return entity.ResourceAllocation{
CPURequestsMilli: quantityMilliValue(requests, corev1.ResourceCPU),
CPULimitsMilli: quantityMilliValue(limits, corev1.ResourceCPU),
MemoryRequestsBytes: quantityValue(requests, corev1.ResourceMemory),
MemoryLimitsBytes: quantityValue(limits, corev1.ResourceMemory),
GPURequests: quantityValue(requests, corev1.ResourceName("nvidia.com/gpu")),
GPULimits: quantityValue(limits, corev1.ResourceName("nvidia.com/gpu")),
GPUMemoryRequestsMB: quantityValueAny(requests, corev1.ResourceName("nvidia.com/gpumem"), corev1.ResourceName("requests.nvidia.com/gpumem")),
GPUMemoryLimitsMB: quantityValueAny(limits, corev1.ResourceName("nvidia.com/gpumem"), corev1.ResourceName("requests.nvidia.com/gpumem")),
}
}
func addAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
return entity.ResourceAllocation{
CPURequestsMilli: left.CPURequestsMilli + right.CPURequestsMilli,
CPULimitsMilli: left.CPULimitsMilli + right.CPULimitsMilli,
MemoryRequestsBytes: left.MemoryRequestsBytes + right.MemoryRequestsBytes,
MemoryLimitsBytes: left.MemoryLimitsBytes + right.MemoryLimitsBytes,
GPURequests: left.GPURequests + right.GPURequests,
GPULimits: left.GPULimits + right.GPULimits,
GPUMemoryRequestsMB: left.GPUMemoryRequestsMB + right.GPUMemoryRequestsMB,
GPUMemoryLimitsMB: left.GPUMemoryLimitsMB + right.GPUMemoryLimitsMB,
}
}
func maxAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
return entity.ResourceAllocation{
CPURequestsMilli: maxInt64(left.CPURequestsMilli, right.CPURequestsMilli),
CPULimitsMilli: maxInt64(left.CPULimitsMilli, right.CPULimitsMilli),
MemoryRequestsBytes: maxInt64(left.MemoryRequestsBytes, right.MemoryRequestsBytes),
MemoryLimitsBytes: maxInt64(left.MemoryLimitsBytes, right.MemoryLimitsBytes),
GPURequests: maxInt64(left.GPURequests, right.GPURequests),
GPULimits: maxInt64(left.GPULimits, right.GPULimits),
GPUMemoryRequestsMB: maxInt64(left.GPUMemoryRequestsMB, right.GPUMemoryRequestsMB),
GPUMemoryLimitsMB: maxInt64(left.GPUMemoryLimitsMB, right.GPUMemoryLimitsMB),
}
}
func quantityMilliValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
if quantity, ok := resources[name]; ok {
return quantity.MilliValue()
}
return 0
}
func quantityValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
if quantity, ok := resources[name]; ok {
return quantity.Value()
}
return 0
}
func quantityValueAny(resources corev1.ResourceList, names ...corev1.ResourceName) int64 {
for _, name := range names {
if quantity, ok := resources[name]; ok {
return quantity.Value()
}
}
return 0
}
func maxInt64(left, right int64) int64 {
if left > right {
return left
}
return right
}

View File

@ -0,0 +1,29 @@
package k8s
import (
"testing"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)
func TestContainerAllocationCountsVendorGPUMemoryKey(t *testing.T) {
container := corev1.Container{
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceName("nvidia.com/gpumem"): resource.MustParse("10000"),
},
Limits: corev1.ResourceList{
corev1.ResourceName("nvidia.com/gpumem"): resource.MustParse("12000"),
},
},
}
allocation := containerAllocation(container)
if allocation.GPUMemoryRequestsMB != 10000 {
t.Fatalf("expected GPU memory requests 10000 MB, got %d", allocation.GPUMemoryRequestsMB)
}
if allocation.GPUMemoryLimitsMB != 12000 {
t.Fatalf("expected GPU memory limits 12000 MB, got %d", allocation.GPUMemoryLimitsMB)
}
}

View File

@ -106,6 +106,25 @@ func (c *TenantClient) IssueKubeconfig(ctx context.Context, cluster *entity.Clus
}, nil
}
func (c *TenantClient) GetResourceQuotaUsage(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) (*repository.ResourceQuotaUsage, error) {
binding = binding.WithDefaults()
if err := binding.Validate(); err != nil {
return nil, err
}
clientset, _, err := c.clientsetForCluster(cluster)
if err != nil {
return nil, err
}
quota, err := clientset.CoreV1().ResourceQuotas(binding.Namespace).Get(ctx, binding.ResourceQuotaName, metav1.GetOptions{})
if err != nil {
return nil, fmt.Errorf("failed to get tenant resource quota usage: %w", err)
}
return &repository.ResourceQuotaUsage{
Hard: resourceVectorFromList(quota.Status.Hard),
Used: resourceVectorFromList(quota.Status.Used),
}, nil
}
// SuspendTenant revokes tenant API access by deleting only the RoleBinding.
func (c *TenantClient) SuspendTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
binding = binding.WithDefaults()
@ -128,6 +147,82 @@ func (c *TenantClient) SuspendTenant(ctx context.Context, cluster *entity.Cluste
return nil
}
func (c *TenantClient) DeleteTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
binding = binding.WithDefaults()
if err := binding.Validate(); err != nil {
return err
}
if isProtectedTenantNamespace(binding.Namespace) {
return entity.ErrProtectedNamespace
}
clientset, _, err := c.clientsetForCluster(cluster)
if err != nil {
return err
}
if err := deleteIgnoringNotFound(ctx, func() error {
return clientset.RbacV1().RoleBindings(binding.Namespace).Delete(ctx, binding.RoleBindingName, metav1.DeleteOptions{})
}); err != nil {
return fmt.Errorf("failed to delete tenant role binding: %w", err)
}
if err := deleteIgnoringNotFound(ctx, func() error {
return clientset.CoreV1().ResourceQuotas(binding.Namespace).Delete(ctx, binding.ResourceQuotaName, metav1.DeleteOptions{})
}); err != nil {
return fmt.Errorf("failed to delete tenant resource quota: %w", err)
}
if err := deleteIgnoringNotFound(ctx, func() error {
return clientset.CoreV1().ServiceAccounts(binding.Namespace).Delete(ctx, binding.ServiceAccountName, metav1.DeleteOptions{})
}); err != nil {
return fmt.Errorf("failed to delete tenant service account: %w", err)
}
namespace, err := clientset.CoreV1().Namespaces().Get(ctx, binding.Namespace, metav1.GetOptions{})
if apierrors.IsNotFound(err) {
return nil
}
if err != nil {
return fmt.Errorf("failed to get tenant namespace before deletion: %w", err)
}
if namespace.Labels["ocdp.io/managed-by"] != "ocdp" || namespace.Labels["ocdp.io/tenant"] != binding.Namespace {
return fmt.Errorf("refusing to delete unmanaged namespace %q", binding.Namespace)
}
if err := deleteIgnoringNotFound(ctx, func() error {
return clientset.CoreV1().Namespaces().Delete(ctx, binding.Namespace, metav1.DeleteOptions{})
}); err != nil {
return fmt.Errorf("failed to delete tenant namespace: %w", err)
}
return nil
}
func deleteIgnoringNotFound(ctx context.Context, deleteFn func() error) error {
if err := ctx.Err(); err != nil {
return err
}
err := deleteFn()
if apierrors.IsNotFound(err) {
return nil
}
return err
}
func isProtectedTenantNamespace(namespace string) bool {
switch strings.TrimSpace(namespace) {
case "", "default", "kube-system", "kube-public", "kube-node-lease":
return true
default:
return false
}
}
func resourceVectorFromList(values corev1.ResourceList) repository.ResourceVector {
gpu := values[corev1.ResourceName("requests.nvidia.com/gpu")]
gpuMem := values[corev1.ResourceName("requests.nvidia.com/gpumem")]
return repository.ResourceVector{
CPU: values[corev1.ResourceName("requests.cpu")],
Memory: values[corev1.ResourceName("requests.memory")],
GPU: gpu.Value(),
GPUMemoryMB: gpuMem.Value(),
}
}
func (c *TenantClient) clientsetForCluster(cluster *entity.Cluster) (kubernetes.Interface, *rest.Config, error) {
if c.clientset != nil {
config := &rest.Config{Host: "https://kubernetes.default.svc"}

View File

@ -2,6 +2,7 @@ package k8s
import (
"context"
"errors"
"strings"
"testing"
"time"
@ -58,7 +59,7 @@ func TestTenantClientEnsureTenantUpdatesExistingResources(t *testing.T) {
ctx := context.Background()
binding := tenantBinding()
clientset := fake.NewSimpleClientset(
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace}},
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace, Labels: binding.Labels}},
&corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: binding.ServiceAccountName, Namespace: binding.Namespace}},
&rbacv1.RoleBinding{
ObjectMeta: metav1.ObjectMeta{Name: binding.RoleBindingName, Namespace: binding.Namespace},
@ -100,7 +101,7 @@ func TestTenantClientSuspendTenantDeletesOnlyRoleBinding(t *testing.T) {
ctx := context.Background()
binding := tenantBinding()
clientset := fake.NewSimpleClientset(
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace}},
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace, Labels: binding.Labels}},
&corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: binding.ServiceAccountName, Namespace: binding.Namespace}},
desiredRoleBinding(binding),
)
@ -117,6 +118,47 @@ func TestTenantClientSuspendTenantDeletesOnlyRoleBinding(t *testing.T) {
}
}
func TestTenantClientDeleteTenantDeletesTenantResources(t *testing.T) {
ctx := context.Background()
binding := tenantBinding()
clientset := fake.NewSimpleClientset(
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace, Labels: binding.Labels}},
&corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: binding.ServiceAccountName, Namespace: binding.Namespace}},
desiredRoleBinding(binding),
&corev1.ResourceQuota{ObjectMeta: metav1.ObjectMeta{Name: binding.ResourceQuotaName, Namespace: binding.Namespace}},
)
client := NewTenantClientForClientset(clientset)
if err := client.DeleteTenant(ctx, nil, binding); err != nil {
t.Fatalf("DeleteTenant returned error: %v", err)
}
if _, err := clientset.RbacV1().RoleBindings(binding.Namespace).Get(ctx, binding.RoleBindingName, metav1.GetOptions{}); !apierrors.IsNotFound(err) {
t.Fatalf("expected role binding deleted, got %v", err)
}
if _, err := clientset.CoreV1().ResourceQuotas(binding.Namespace).Get(ctx, binding.ResourceQuotaName, metav1.GetOptions{}); !apierrors.IsNotFound(err) {
t.Fatalf("expected resource quota deleted, got %v", err)
}
if _, err := clientset.CoreV1().ServiceAccounts(binding.Namespace).Get(ctx, binding.ServiceAccountName, metav1.GetOptions{}); !apierrors.IsNotFound(err) {
t.Fatalf("expected service account deleted, got %v", err)
}
if _, err := clientset.CoreV1().Namespaces().Get(ctx, binding.Namespace, metav1.GetOptions{}); !apierrors.IsNotFound(err) {
t.Fatalf("expected namespace deleted, got %v", err)
}
}
func TestTenantClientDeleteTenantRejectsProtectedNamespace(t *testing.T) {
ctx := context.Background()
client := NewTenantClientForClientset(fake.NewSimpleClientset(
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "default"}},
))
binding := entity.NewTenantBinding("default")
err := client.DeleteTenant(ctx, nil, binding)
if !errors.Is(err, entity.ErrProtectedNamespace) {
t.Fatalf("expected protected namespace error, got %v", err)
}
}
func TestTenantClientIssueKubeconfigCapsTokenTTL(t *testing.T) {
ctx := context.Background()
binding := tenantBinding()

View File

@ -31,6 +31,28 @@ func (c *MockTenantClient) IssueKubeconfig(ctx context.Context, cluster *entity.
}, nil
}
func (c *MockTenantClient) GetResourceQuotaUsage(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) (*repository.ResourceQuotaUsage, error) {
if err := binding.Validate(); err != nil {
return nil, err
}
return &repository.ResourceQuotaUsage{
Hard: resourceVectorFromList(binding.ResourceQuotaHard),
Used: repository.ResourceVector{},
}, nil
}
func (c *MockTenantClient) SuspendTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
return binding.Validate()
}
func (c *MockTenantClient) DeleteTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
if err := binding.Validate(); err != nil {
return err
}
switch binding.Namespace {
case "", "default", "kube-system", "kube-public", "kube-node-lease":
return entity.ErrProtectedNamespace
default:
return nil
}
}