fix: scale replicas in response, K8s metrics client, quota precheck, auth tests
- Add GetMetrics method to MetricsClient interface and implement cluster metrics API
- Add QuotaPrecheck service for validating resource quotas before deployment
- Add auth DTO with role/permission models and auth handler tests
- Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics
- Update workspace handler with GetWorkspace endpoint and shared-user list
- Fix monitoring handler to use correct service method name
- Add tail_lines fallback in instance handler for snake_case query params
- Update nginx config for SSE log streaming support (no buffering)
- Add comprehensive test coverage: auth_service_test, auth_handler_test, auth_dto_test, metrics_client_test, quota_precheck_test
- Update error messages for quota validation and instance operations
- ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit
- InstanceCard: correctly disable scale-minus when replicas <= 0
- SidebarLayout: add hover transition for sidebar items
- Update todo.md and lessons.md with latest fixes
This commit is contained in:
@ -63,7 +63,7 @@ func (c *MetricsClient) GetClusterMetrics(ctx context.Context, clusterID string)
|
||||
|
||||
// 计算集群级别汇总
|
||||
metrics := c.aggregateClusterMetrics(cluster, nodes.Items, pods.Items, nodeMetrics)
|
||||
|
||||
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
@ -87,6 +87,37 @@ func (c *MetricsClient) GetNodeMetrics(ctx context.Context, clusterID string) ([
|
||||
return c.getNodeMetricsData(ctx, clientset, metricsClient, nodes.Items)
|
||||
}
|
||||
|
||||
// GetPodResourceAllocations returns Kubernetes Pod requests/limits without
|
||||
// inventing utilization values. GPU memory is treated as vendor integer MB.
|
||||
func (c *MetricsClient) GetPodResourceAllocations(ctx context.Context, clusterID string) ([]*entity.PodResourceAllocation, error) {
|
||||
cluster, err := c.clusterRepo.GetByID(ctx, clusterID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get cluster: %w", err)
|
||||
}
|
||||
|
||||
clientset, _, err := c.createK8sClients(cluster)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create k8s client: %w", err)
|
||||
}
|
||||
|
||||
pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to list pods: %w", err)
|
||||
}
|
||||
|
||||
result := make([]*entity.PodResourceAllocation, 0, len(pods.Items))
|
||||
for _, pod := range pods.Items {
|
||||
result = append(result, &entity.PodResourceAllocation{
|
||||
ClusterID: clusterID,
|
||||
Namespace: pod.Namespace,
|
||||
PodName: pod.Name,
|
||||
InstanceName: inferHelmReleaseName(pod.Labels),
|
||||
Allocation: podResourceAllocation(&pod),
|
||||
})
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// createK8sClients 创建 Kubernetes 客户端
|
||||
func (c *MetricsClient) createK8sClients(cluster *entity.Cluster) (*kubernetes.Clientset, *metricsv.Clientset, error) {
|
||||
config, err := clientcmd.RESTConfigFromKubeConfig([]byte(cluster.GetKubeConfig()))
|
||||
@ -127,14 +158,14 @@ func (c *MetricsClient) getNodeMetricsData(
|
||||
|
||||
for _, node := range nodes {
|
||||
nodeMetric := &entity.NodeMetrics{
|
||||
NodeName: node.Name,
|
||||
Status: getNodeStatus(&node),
|
||||
Role: getNodeRole(&node),
|
||||
Age: getNodeAge(&node),
|
||||
OSImage: node.Status.NodeInfo.OSImage,
|
||||
KernelVersion: node.Status.NodeInfo.KernelVersion,
|
||||
ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
|
||||
KubeletVersion: node.Status.NodeInfo.KubeletVersion,
|
||||
NodeName: node.Name,
|
||||
Status: getNodeStatus(&node),
|
||||
Role: getNodeRole(&node),
|
||||
Age: getNodeAge(&node),
|
||||
OSImage: node.Status.NodeInfo.OSImage,
|
||||
KernelVersion: node.Status.NodeInfo.KernelVersion,
|
||||
ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
|
||||
KubeletVersion: node.Status.NodeInfo.KubeletVersion,
|
||||
}
|
||||
|
||||
// CPU
|
||||
@ -213,7 +244,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
|
||||
var totalCPU, totalMem, usedCPU, usedMem int64
|
||||
var totalGPU, usedGPU int
|
||||
healthyNodes := 0
|
||||
|
||||
|
||||
// 单机最大值
|
||||
var maxNodeCPU, maxNodeMem int64
|
||||
var maxNodeGPU int
|
||||
@ -251,7 +282,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
|
||||
// 从 nodeMetrics 获取使用情况
|
||||
if i < len(nodeMetrics) && nodeMetrics[i] != nil {
|
||||
metrics.Nodes = append(metrics.Nodes, *nodeMetrics[i])
|
||||
|
||||
|
||||
// 更新单机最大使用率
|
||||
if nodeMetrics[i].CPUPercent > maxNodeCPUUsage {
|
||||
maxNodeCPUUsage = nodeMetrics[i].CPUPercent
|
||||
@ -274,7 +305,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
|
||||
metrics.TotalCPU = fmt.Sprintf("%.2f cores", float64(totalCPU)/1000.0)
|
||||
metrics.TotalMemory = formatBytes(totalMem)
|
||||
metrics.TotalGPU = totalGPU
|
||||
|
||||
|
||||
// 格式化单机最大值
|
||||
metrics.MaxNodeCPU = fmt.Sprintf("%.2f cores", float64(maxNodeCPU)/1000.0)
|
||||
metrics.MaxNodeMemory = formatBytes(maxNodeMem)
|
||||
@ -292,7 +323,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
|
||||
usedMem += int64(nm.MemoryPercent * float64(totalMem) / 100.0)
|
||||
usedGPU += nm.GPUUsage
|
||||
}
|
||||
|
||||
|
||||
if totalCPU > 0 {
|
||||
metrics.CPUUsage = float64(usedCPU) / float64(totalCPU) * 100
|
||||
}
|
||||
@ -302,7 +333,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
|
||||
if totalGPU > 0 {
|
||||
metrics.GPUUsage = float64(usedGPU) / float64(totalGPU) * 100
|
||||
}
|
||||
|
||||
|
||||
metrics.UsedCPU = fmt.Sprintf("%.2f cores", float64(usedCPU)/1000.0)
|
||||
metrics.UsedMemory = formatBytes(usedMem)
|
||||
metrics.UsedGPU = usedGPU
|
||||
@ -348,7 +379,7 @@ func getNodeAge(node *corev1.Node) string {
|
||||
age := time.Since(node.CreationTimestamp.Time)
|
||||
days := int(age.Hours() / 24)
|
||||
hours := int(age.Hours()) % 24
|
||||
|
||||
|
||||
if days > 0 {
|
||||
return fmt.Sprintf("%dd %dh", days, hours)
|
||||
}
|
||||
@ -368,3 +399,110 @@ func formatBytes(bytes int64) string {
|
||||
return fmt.Sprintf("%.1f %ciB", float64(bytes)/float64(div), "KMGTPE"[exp])
|
||||
}
|
||||
|
||||
// inferHelmReleaseName picks a release/instance name out of a Pod's labels.
// The candidate keys are checked in a fixed priority order and the first one
// carrying a non-empty value wins. It returns "" when labels is nil or empty,
// or when none of the candidate keys has a non-empty value.
func inferHelmReleaseName(labels map[string]string) string {
	if len(labels) == 0 {
		return ""
	}

	// Ordered from most to least specific; earlier keys take precedence.
	candidates := [...]string{
		"app.kubernetes.io/instance",
		"release",
		"helm.sh/release",
		"meta.helm.sh/release-name",
		"app",
	}
	for i := 0; i < len(candidates); i++ {
		name, present := labels[candidates[i]]
		if !present || name == "" {
			continue
		}
		return name
	}
	return ""
}
|
||||
|
||||
func podResourceAllocation(pod *corev1.Pod) entity.ResourceAllocation {
|
||||
if pod == nil {
|
||||
return entity.ResourceAllocation{}
|
||||
}
|
||||
sum := entity.ResourceAllocation{}
|
||||
for _, container := range pod.Spec.Containers {
|
||||
sum = addContainerAllocation(sum, container)
|
||||
}
|
||||
initMax := entity.ResourceAllocation{}
|
||||
for _, container := range pod.Spec.InitContainers {
|
||||
initMax = maxAllocation(initMax, containerAllocation(container))
|
||||
}
|
||||
return maxAllocation(sum, initMax)
|
||||
}
|
||||
|
||||
func addContainerAllocation(base entity.ResourceAllocation, container corev1.Container) entity.ResourceAllocation {
|
||||
return addAllocation(base, containerAllocation(container))
|
||||
}
|
||||
|
||||
func containerAllocation(container corev1.Container) entity.ResourceAllocation {
|
||||
requests := container.Resources.Requests
|
||||
limits := container.Resources.Limits
|
||||
return entity.ResourceAllocation{
|
||||
CPURequestsMilli: quantityMilliValue(requests, corev1.ResourceCPU),
|
||||
CPULimitsMilli: quantityMilliValue(limits, corev1.ResourceCPU),
|
||||
MemoryRequestsBytes: quantityValue(requests, corev1.ResourceMemory),
|
||||
MemoryLimitsBytes: quantityValue(limits, corev1.ResourceMemory),
|
||||
GPURequests: quantityValue(requests, corev1.ResourceName("nvidia.com/gpu")),
|
||||
GPULimits: quantityValue(limits, corev1.ResourceName("nvidia.com/gpu")),
|
||||
GPUMemoryRequestsMB: quantityValueAny(requests, corev1.ResourceName("nvidia.com/gpumem"), corev1.ResourceName("requests.nvidia.com/gpumem")),
|
||||
GPUMemoryLimitsMB: quantityValueAny(limits, corev1.ResourceName("nvidia.com/gpumem"), corev1.ResourceName("requests.nvidia.com/gpumem")),
|
||||
}
|
||||
}
|
||||
|
||||
func addAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
|
||||
return entity.ResourceAllocation{
|
||||
CPURequestsMilli: left.CPURequestsMilli + right.CPURequestsMilli,
|
||||
CPULimitsMilli: left.CPULimitsMilli + right.CPULimitsMilli,
|
||||
MemoryRequestsBytes: left.MemoryRequestsBytes + right.MemoryRequestsBytes,
|
||||
MemoryLimitsBytes: left.MemoryLimitsBytes + right.MemoryLimitsBytes,
|
||||
GPURequests: left.GPURequests + right.GPURequests,
|
||||
GPULimits: left.GPULimits + right.GPULimits,
|
||||
GPUMemoryRequestsMB: left.GPUMemoryRequestsMB + right.GPUMemoryRequestsMB,
|
||||
GPUMemoryLimitsMB: left.GPUMemoryLimitsMB + right.GPUMemoryLimitsMB,
|
||||
}
|
||||
}
|
||||
|
||||
func maxAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
|
||||
return entity.ResourceAllocation{
|
||||
CPURequestsMilli: maxInt64(left.CPURequestsMilli, right.CPURequestsMilli),
|
||||
CPULimitsMilli: maxInt64(left.CPULimitsMilli, right.CPULimitsMilli),
|
||||
MemoryRequestsBytes: maxInt64(left.MemoryRequestsBytes, right.MemoryRequestsBytes),
|
||||
MemoryLimitsBytes: maxInt64(left.MemoryLimitsBytes, right.MemoryLimitsBytes),
|
||||
GPURequests: maxInt64(left.GPURequests, right.GPURequests),
|
||||
GPULimits: maxInt64(left.GPULimits, right.GPULimits),
|
||||
GPUMemoryRequestsMB: maxInt64(left.GPUMemoryRequestsMB, right.GPUMemoryRequestsMB),
|
||||
GPUMemoryLimitsMB: maxInt64(left.GPUMemoryLimitsMB, right.GPUMemoryLimitsMB),
|
||||
}
|
||||
}
|
||||
|
||||
func quantityMilliValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
|
||||
if quantity, ok := resources[name]; ok {
|
||||
return quantity.MilliValue()
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func quantityValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
|
||||
if quantity, ok := resources[name]; ok {
|
||||
return quantity.Value()
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func quantityValueAny(resources corev1.ResourceList, names ...corev1.ResourceName) int64 {
|
||||
for _, name := range names {
|
||||
if quantity, ok := resources[name]; ok {
|
||||
return quantity.Value()
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// maxInt64 returns the larger of left and right.
func maxInt64(left, right int64) int64 {
	if right >= left {
		return right
	}
	return left
}
|
||||
|
||||
Reference in New Issue
Block a user