fix: scale replicas in response, K8s metrics client, quota precheck, auth tests

- Add GetMetrics method to MetricsClient interface and implement cluster metrics API
- Add QuotaPrecheck service for validating resource quotas before deployment
- Add auth DTO with role/permission models and auth handler tests
- Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics
- Update workspace handler with GetWorkspace endpoint and shared-user list
- Fix monitoring handler to use correct service method name
- Add tail_lines fallback in instance handler for snake_case query params
- Update nginx config for SSE log streaming support (no buffering)
- Add comprehensive test coverage: auth_service_test, auth_handler_test,
  auth_dto_test, metrics_client_test, quota_precheck_test
- Update error messages for quota validation and instance operations
- ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit
- InstanceCard: correctly disable scale-minus when replicas <= 0
- SidebarLayout: add hover transition for sidebar items
- Update todo.md and lessons.md with latest fixes
This commit is contained in:
Ivan087
2026-05-20 16:56:29 +08:00
parent 8f90cf0f0d
commit 33ddaf97db
59 changed files with 4805 additions and 457 deletions

View File

@ -63,7 +63,7 @@ func (c *MetricsClient) GetClusterMetrics(ctx context.Context, clusterID string)
// 计算集群级别汇总
metrics := c.aggregateClusterMetrics(cluster, nodes.Items, pods.Items, nodeMetrics)
return metrics, nil
}
@ -87,6 +87,37 @@ func (c *MetricsClient) GetNodeMetrics(ctx context.Context, clusterID string) ([
return c.getNodeMetricsData(ctx, clientset, metricsClient, nodes.Items)
}
// GetPodResourceAllocations lists the pods of the cluster identified by
// clusterID and reports, for each pod, the resource requests/limits declared
// in its spec. Only declared values are returned; no utilization figures are
// derived. GPU memory is treated as a vendor-defined integer number of MB.
//
// An error is returned when the cluster record cannot be loaded, the
// Kubernetes client cannot be created, or the pod list call fails.
func (c *MetricsClient) GetPodResourceAllocations(ctx context.Context, clusterID string) ([]*entity.PodResourceAllocation, error) {
	cluster, err := c.clusterRepo.GetByID(ctx, clusterID)
	if err != nil {
		return nil, fmt.Errorf("failed to get cluster: %w", err)
	}

	// Only the core clientset is used here; the metrics clientset is discarded.
	clientset, _, err := c.createK8sClients(cluster)
	if err != nil {
		return nil, fmt.Errorf("failed to create k8s client: %w", err)
	}

	// An empty namespace argument asks the API for pods in all namespaces.
	podList, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to list pods: %w", err)
	}

	// One output entry per listed pod, in the order returned by the API.
	allocations := make([]*entity.PodResourceAllocation, len(podList.Items))
	for i := range podList.Items {
		current := &podList.Items[i]
		allocations[i] = &entity.PodResourceAllocation{
			ClusterID:    clusterID,
			Namespace:    current.Namespace,
			PodName:      current.Name,
			InstanceName: inferHelmReleaseName(current.Labels),
			Allocation:   podResourceAllocation(current),
		}
	}
	return allocations, nil
}
// createK8sClients 创建 Kubernetes 客户端
func (c *MetricsClient) createK8sClients(cluster *entity.Cluster) (*kubernetes.Clientset, *metricsv.Clientset, error) {
config, err := clientcmd.RESTConfigFromKubeConfig([]byte(cluster.GetKubeConfig()))
@ -127,14 +158,14 @@ func (c *MetricsClient) getNodeMetricsData(
for _, node := range nodes {
nodeMetric := &entity.NodeMetrics{
NodeName: node.Name,
Status: getNodeStatus(&node),
Role: getNodeRole(&node),
Age: getNodeAge(&node),
OSImage: node.Status.NodeInfo.OSImage,
KernelVersion: node.Status.NodeInfo.KernelVersion,
ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
KubeletVersion: node.Status.NodeInfo.KubeletVersion,
NodeName: node.Name,
Status: getNodeStatus(&node),
Role: getNodeRole(&node),
Age: getNodeAge(&node),
OSImage: node.Status.NodeInfo.OSImage,
KernelVersion: node.Status.NodeInfo.KernelVersion,
ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
KubeletVersion: node.Status.NodeInfo.KubeletVersion,
}
// CPU
@ -213,7 +244,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
var totalCPU, totalMem, usedCPU, usedMem int64
var totalGPU, usedGPU int
healthyNodes := 0
// 单机最大值
var maxNodeCPU, maxNodeMem int64
var maxNodeGPU int
@ -251,7 +282,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
// 从 nodeMetrics 获取使用情况
if i < len(nodeMetrics) && nodeMetrics[i] != nil {
metrics.Nodes = append(metrics.Nodes, *nodeMetrics[i])
// 更新单机最大使用率
if nodeMetrics[i].CPUPercent > maxNodeCPUUsage {
maxNodeCPUUsage = nodeMetrics[i].CPUPercent
@ -274,7 +305,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
metrics.TotalCPU = fmt.Sprintf("%.2f cores", float64(totalCPU)/1000.0)
metrics.TotalMemory = formatBytes(totalMem)
metrics.TotalGPU = totalGPU
// 格式化单机最大值
metrics.MaxNodeCPU = fmt.Sprintf("%.2f cores", float64(maxNodeCPU)/1000.0)
metrics.MaxNodeMemory = formatBytes(maxNodeMem)
@ -292,7 +323,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
usedMem += int64(nm.MemoryPercent * float64(totalMem) / 100.0)
usedGPU += nm.GPUUsage
}
if totalCPU > 0 {
metrics.CPUUsage = float64(usedCPU) / float64(totalCPU) * 100
}
@ -302,7 +333,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
if totalGPU > 0 {
metrics.GPUUsage = float64(usedGPU) / float64(totalGPU) * 100
}
metrics.UsedCPU = fmt.Sprintf("%.2f cores", float64(usedCPU)/1000.0)
metrics.UsedMemory = formatBytes(usedMem)
metrics.UsedGPU = usedGPU
@ -348,7 +379,7 @@ func getNodeAge(node *corev1.Node) string {
age := time.Since(node.CreationTimestamp.Time)
days := int(age.Hours() / 24)
hours := int(age.Hours()) % 24
if days > 0 {
return fmt.Sprintf("%dd %dh", days, hours)
}
@ -368,3 +399,110 @@ func formatBytes(bytes int64) string {
return fmt.Sprintf("%.1f %ciB", float64(bytes)/float64(div), "KMGTPE"[exp])
}
// inferHelmReleaseName derives a release/instance name from a set of pod
// labels. It probes a fixed list of well-known label keys in priority order
// and returns the first value that is present and non-empty. It returns ""
// when labels is nil or none of the keys carries a non-empty value.
func inferHelmReleaseName(labels map[string]string) string {
	// Candidate keys, highest priority first.
	candidates := [...]string{
		"app.kubernetes.io/instance",
		"release",
		"helm.sh/release",
		"meta.helm.sh/release-name",
		"app",
	}

	// Reading from a nil map is safe in Go and yields "", so a nil label set
	// simply falls through to the final return.
	for i := range candidates {
		if name, found := labels[candidates[i]]; found && name != "" {
			return name
		}
	}
	return ""
}
// podResourceAllocation computes a single allocation figure for a pod from
// its container specs. Regular containers are summed field by field; init
// containers contribute only their field-wise maximum (presumably because
// they run before, not alongside, the regular containers). The result is the
// field-wise maximum of those two figures. A nil pod yields the zero value.
func podResourceAllocation(pod *corev1.Pod) entity.ResourceAllocation {
	var total entity.ResourceAllocation
	if pod == nil {
		return total
	}

	// Sum of all regular containers.
	regular := pod.Spec.Containers
	for i := range regular {
		total = addAllocation(total, containerAllocation(regular[i]))
	}

	// Largest single init container, per field.
	var initPeak entity.ResourceAllocation
	inits := pod.Spec.InitContainers
	for i := range inits {
		initPeak = maxAllocation(initPeak, containerAllocation(inits[i]))
	}

	return maxAllocation(total, initPeak)
}
// addContainerAllocation returns base with the requests/limits declared by
// container added to it field by field. base itself is not modified.
func addContainerAllocation(base entity.ResourceAllocation, container corev1.Container) entity.ResourceAllocation {
	contribution := containerAllocation(container)
	return addAllocation(base, contribution)
}
// containerAllocation extracts the requests and limits declared on a single
// container. CPU is read in milli-units; memory, GPU count and GPU memory
// are read as whole-number quantity values. Resources that are not declared
// are reported as 0.
func containerAllocation(container corev1.Container) entity.ResourceAllocation {
	const (
		gpuCount    = corev1.ResourceName("nvidia.com/gpu")
		gpuMem      = corev1.ResourceName("nvidia.com/gpumem")
		gpuMemAlias = corev1.ResourceName("requests.nvidia.com/gpumem")
	)

	req, lim := container.Resources.Requests, container.Resources.Limits

	var alloc entity.ResourceAllocation

	alloc.CPURequestsMilli = quantityMilliValue(req, corev1.ResourceCPU)
	alloc.CPULimitsMilli = quantityMilliValue(lim, corev1.ResourceCPU)

	alloc.MemoryRequestsBytes = quantityValue(req, corev1.ResourceMemory)
	alloc.MemoryLimitsBytes = quantityValue(lim, corev1.ResourceMemory)

	alloc.GPURequests = quantityValue(req, gpuCount)
	alloc.GPULimits = quantityValue(lim, gpuCount)

	// GPU memory: the plain name is tried first, then the prefixed alias.
	// NOTE(review): the limits lookup also falls back to the
	// "requests."-prefixed name — confirm this is intended.
	alloc.GPUMemoryRequestsMB = quantityValueAny(req, gpuMem, gpuMemAlias)
	alloc.GPUMemoryLimitsMB = quantityValueAny(lim, gpuMem, gpuMemAlias)

	return alloc
}
// addAllocation returns the field-by-field sum of left and right for the
// CPU, memory, GPU and GPU-memory request/limit fields.
func addAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
	var sum entity.ResourceAllocation

	sum.CPURequestsMilli = left.CPURequestsMilli + right.CPURequestsMilli
	sum.CPULimitsMilli = left.CPULimitsMilli + right.CPULimitsMilli

	sum.MemoryRequestsBytes = left.MemoryRequestsBytes + right.MemoryRequestsBytes
	sum.MemoryLimitsBytes = left.MemoryLimitsBytes + right.MemoryLimitsBytes

	sum.GPURequests = left.GPURequests + right.GPURequests
	sum.GPULimits = left.GPULimits + right.GPULimits

	sum.GPUMemoryRequestsMB = left.GPUMemoryRequestsMB + right.GPUMemoryRequestsMB
	sum.GPUMemoryLimitsMB = left.GPUMemoryLimitsMB + right.GPUMemoryLimitsMB

	return sum
}
// maxAllocation returns an allocation in which each CPU, memory, GPU and
// GPU-memory request/limit field is the larger of the corresponding fields
// of left and right.
func maxAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
	var peak entity.ResourceAllocation

	peak.CPURequestsMilli = maxInt64(left.CPURequestsMilli, right.CPURequestsMilli)
	peak.CPULimitsMilli = maxInt64(left.CPULimitsMilli, right.CPULimitsMilli)

	peak.MemoryRequestsBytes = maxInt64(left.MemoryRequestsBytes, right.MemoryRequestsBytes)
	peak.MemoryLimitsBytes = maxInt64(left.MemoryLimitsBytes, right.MemoryLimitsBytes)

	peak.GPURequests = maxInt64(left.GPURequests, right.GPURequests)
	peak.GPULimits = maxInt64(left.GPULimits, right.GPULimits)

	peak.GPUMemoryRequestsMB = maxInt64(left.GPUMemoryRequestsMB, right.GPUMemoryRequestsMB)
	peak.GPUMemoryLimitsMB = maxInt64(left.GPUMemoryLimitsMB, right.GPUMemoryLimitsMB)

	return peak
}
// quantityMilliValue returns the milli-unit value of the resource called
// name in resources, or 0 when that resource is not present.
func quantityMilliValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
	q, found := resources[name]
	if !found {
		return 0
	}
	return q.MilliValue()
}
// quantityValue returns the integer value of the resource called name in
// resources, or 0 when that resource is not present.
func quantityValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
	q, found := resources[name]
	if !found {
		return 0
	}
	return q.Value()
}
// quantityValueAny tries each of names in order and returns the integer
// value of the first one present in resources (even if that value is 0).
// It returns 0 when none of the names is present.
func quantityValueAny(resources corev1.ResourceList, names ...corev1.ResourceName) int64 {
	for i := range names {
		q, found := resources[names[i]]
		if !found {
			continue
		}
		return q.Value()
	}
	return 0
}
// maxInt64 returns the larger of left and right.
func maxInt64(left, right int64) int64 {
	larger := right
	if left > right {
		larger = left
	}
	return larger
}