package k8s import ( "context" "fmt" "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" metricsv "k8s.io/metrics/pkg/client/clientset/versioned" "github.com/ocdp/cluster-service/internal/domain/entity" "github.com/ocdp/cluster-service/internal/domain/repository" ) // MetricsClient 实现从 Kubernetes 集群获取监控指标 type MetricsClient struct { clusterRepo repository.ClusterRepository } // NewMetricsClient 创建 MetricsClient func NewMetricsClient(clusterRepo repository.ClusterRepository) *MetricsClient { return &MetricsClient{ clusterRepo: clusterRepo, } } // GetClusterMetrics 获取集群监控指标 func (c *MetricsClient) GetClusterMetrics(ctx context.Context, clusterID string) (*entity.ClusterMetrics, error) { // 获取集群信息 cluster, err := c.clusterRepo.GetByID(ctx, clusterID) if err != nil { return nil, fmt.Errorf("failed to get cluster: %w", err) } // 创建 Kubernetes 客户端 clientset, metricsClient, err := c.createK8sClients(cluster) if err != nil { return nil, fmt.Errorf("failed to create k8s client: %w", err) } // 获取节点列表 nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { return nil, fmt.Errorf("failed to list nodes: %w", err) } // 获取所有 Pods pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{}) if err != nil { return nil, fmt.Errorf("failed to list pods: %w", err) } // 获取节点指标(CPU/内存使用情况) nodeMetrics, err := c.getNodeMetricsData(ctx, clientset, metricsClient, nodes.Items) if err != nil { // 如果无法获取 metrics,记录错误但继续 fmt.Printf("Warning: failed to get node metrics: %v\n", err) } // 计算集群级别汇总 metrics := c.aggregateClusterMetrics(cluster, nodes.Items, pods.Items, nodeMetrics) return metrics, nil } // GetNodeMetrics 获取集群节点指标 func (c *MetricsClient) GetNodeMetrics(ctx context.Context, clusterID string) ([]*entity.NodeMetrics, error) { cluster, err := c.clusterRepo.GetByID(ctx, clusterID) if err != nil { return nil, fmt.Errorf("failed to get cluster: %w", err) } clientset, metricsClient, err := c.createK8sClients(cluster) if err != nil { return nil, fmt.Errorf("failed to create k8s client: %w", err) } nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { return nil, fmt.Errorf("failed to list nodes: %w", err) } return c.getNodeMetricsData(ctx, clientset, metricsClient, nodes.Items) } // createK8sClients 创建 Kubernetes 客户端 func (c *MetricsClient) createK8sClients(cluster *entity.Cluster) (*kubernetes.Clientset, *metricsv.Clientset, error) { config, err := clientcmd.RESTConfigFromKubeConfig([]byte(cluster.GetKubeConfig())) if err != nil { // 如果无法从 kubeconfig 创建,尝试使用集群配置 config = &rest.Config{ Host: cluster.Host, TLSClientConfig: rest.TLSClientConfig{ CAData: []byte(cluster.CAData), CertData: []byte(cluster.CertData), KeyData: []byte(cluster.KeyData), }, } } clientset, err := kubernetes.NewForConfig(config) if err != nil { return nil, nil, fmt.Errorf("failed to create clientset: %w", err) } metricsClient, err := metricsv.NewForConfig(config) if err != nil { // Metrics API 可能不可用,返回 nil 但不报错 return clientset, nil, nil } return clientset, metricsClient, nil } // getNodeMetricsData 获取节点详细指标 func (c *MetricsClient) getNodeMetricsData( ctx context.Context, clientset *kubernetes.Clientset, metricsClient *metricsv.Clientset, nodes []corev1.Node, ) ([]*entity.NodeMetrics, error) { result := make([]*entity.NodeMetrics, 0, len(nodes)) for _, node := range nodes { nodeMetric := &entity.NodeMetrics{ NodeName: node.Name, Status: getNodeStatus(&node), Role: getNodeRole(&node), Age: getNodeAge(&node), OSImage: node.Status.NodeInfo.OSImage, KernelVersion: node.Status.NodeInfo.KernelVersion, ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion, KubeletVersion: node.Status.NodeInfo.KubeletVersion, } // CPU cpuCapacity := node.Status.Capacity.Cpu() cpuAllocatable := node.Status.Allocatable.Cpu() nodeMetric.CPUCapacity = fmt.Sprintf("%.2f cores", float64(cpuCapacity.MilliValue())/1000.0) nodeMetric.CPUAllocatable = fmt.Sprintf("%.2f cores", float64(cpuAllocatable.MilliValue())/1000.0) // Memory memCapacity := node.Status.Capacity.Memory() memAllocatable := node.Status.Allocatable.Memory() nodeMetric.MemoryCapacity = formatBytes(memCapacity.Value()) nodeMetric.MemoryAllocatable = formatBytes(memAllocatable.Value()) // GPU (从 node allocatable 中查找) if gpu, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok { nodeMetric.GPUCapacity = int(gpu.Value()) // 尝试获取 GPU 类型 if gpuType, ok := node.Labels["nvidia.com/gpu.product"]; ok { nodeMetric.GPUType = gpuType } } // 获取 Pod 数量 pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{ FieldSelector: fmt.Sprintf("spec.nodeName=%s", node.Name), }) if err == nil { nodeMetric.PodCount = len(pods.Items) } // 如果有 metrics client,获取实时使用情况 if metricsClient != nil { nodeMetricData, err := metricsClient.MetricsV1beta1().NodeMetricses().Get(ctx, node.Name, metav1.GetOptions{}) if err == nil { // CPU 使用 cpuUsage := nodeMetricData.Usage.Cpu() nodeMetric.CPUUsage = fmt.Sprintf("%.2f cores", float64(cpuUsage.MilliValue())/1000.0) if cpuAllocatable.MilliValue() > 0 { nodeMetric.CPUPercent = float64(cpuUsage.MilliValue()) / float64(cpuAllocatable.MilliValue()) * 100 } // Memory 使用 memUsage := nodeMetricData.Usage.Memory() nodeMetric.MemoryUsage = formatBytes(memUsage.Value()) if memAllocatable.Value() > 0 { nodeMetric.MemoryPercent = float64(memUsage.Value()) / float64(memAllocatable.Value()) * 100 } } } result = append(result, nodeMetric) } return result, nil } // aggregateClusterMetrics 聚合集群级别指标 func (c *MetricsClient) aggregateClusterMetrics( cluster *entity.Cluster, nodes []corev1.Node, pods []corev1.Pod, nodeMetrics []*entity.NodeMetrics, ) *entity.ClusterMetrics { metrics := &entity.ClusterMetrics{ ClusterID: cluster.ID, ClusterName: cluster.Name, Status: "healthy", NodeCount: len(nodes), PodCount: len(pods), LastCheck: time.Now(), Nodes: make([]entity.NodeMetrics, 0), } // 汇总资源 var totalCPU, totalMem, usedCPU, usedMem int64 var totalGPU, usedGPU int healthyNodes := 0 // 单机最大值 var maxNodeCPU, maxNodeMem int64 var maxNodeGPU int var maxNodeCPUUsage, maxNodeMemUsage, maxNodeGPUUsage float64 for i, node := range nodes { // CPU cpuCap := node.Status.Capacity.Cpu() totalCPU += cpuCap.MilliValue() if cpuCap.MilliValue() > maxNodeCPU { maxNodeCPU = cpuCap.MilliValue() } // Memory memCap := node.Status.Capacity.Memory() totalMem += memCap.Value() if memCap.Value() > maxNodeMem { maxNodeMem = memCap.Value() } // GPU if gpu, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok { gpuCount := int(gpu.Value()) totalGPU += gpuCount if gpuCount > maxNodeGPU { maxNodeGPU = gpuCount } } // Node status if getNodeStatus(&node) == "Ready" { healthyNodes++ } // 从 nodeMetrics 获取使用情况 if i < len(nodeMetrics) && nodeMetrics[i] != nil { metrics.Nodes = append(metrics.Nodes, *nodeMetrics[i]) // 更新单机最大使用率 if nodeMetrics[i].CPUPercent > maxNodeCPUUsage { maxNodeCPUUsage = nodeMetrics[i].CPUPercent } if nodeMetrics[i].MemoryPercent > maxNodeMemUsage { maxNodeMemUsage = nodeMetrics[i].MemoryPercent } if nodeMetrics[i].GPUPercent > maxNodeGPUUsage { maxNodeGPUUsage = nodeMetrics[i].GPUPercent } } } // 计算集群 uptime(简化:使用最老节点的年龄) if len(nodes) > 0 { metrics.Uptime = getNodeAge(&nodes[0]) } // 格式化总资源 metrics.TotalCPU = fmt.Sprintf("%.2f cores", float64(totalCPU)/1000.0) metrics.TotalMemory = formatBytes(totalMem) metrics.TotalGPU = totalGPU // 格式化单机最大值 metrics.MaxNodeCPU = fmt.Sprintf("%.2f cores", float64(maxNodeCPU)/1000.0) metrics.MaxNodeMemory = formatBytes(maxNodeMem) metrics.MaxNodeGPU = maxNodeGPU metrics.MaxNodeCPUUsage = maxNodeCPUUsage metrics.MaxNodeMemUsage = maxNodeMemUsage metrics.MaxNodeGPUUsage = maxNodeGPUUsage // 使用情况(简化处理) if len(nodeMetrics) > 0 { for _, nm := range nodeMetrics { // 解析使用的 CPU 和内存 // 这里简化处理,实际应该解析字符串 usedCPU += int64(nm.CPUPercent * float64(totalCPU) / 100.0) usedMem += int64(nm.MemoryPercent * float64(totalMem) / 100.0) usedGPU += nm.GPUUsage } if totalCPU > 0 { metrics.CPUUsage = float64(usedCPU) / float64(totalCPU) * 100 } if totalMem > 0 { metrics.MemoryUsage = float64(usedMem) / float64(totalMem) * 100 } if totalGPU > 0 { metrics.GPUUsage = float64(usedGPU) / float64(totalGPU) * 100 } metrics.UsedCPU = fmt.Sprintf("%.2f cores", float64(usedCPU)/1000.0) metrics.UsedMemory = formatBytes(usedMem) metrics.UsedGPU = usedGPU } // 确定集群状态 if healthyNodes == len(nodes) { metrics.Status = "healthy" } else if healthyNodes > 0 { metrics.Status = "warning" } else { metrics.Status = "error" } return metrics } // Helper functions func getNodeStatus(node *corev1.Node) string { for _, condition := range node.Status.Conditions { if condition.Type == corev1.NodeReady { if condition.Status == corev1.ConditionTrue { return "Ready" } return "NotReady" } } return "Unknown" } func getNodeRole(node *corev1.Node) string { if _, ok := node.Labels["node-role.kubernetes.io/control-plane"]; ok { return "control-plane" } if _, ok := node.Labels["node-role.kubernetes.io/master"]; ok { return "control-plane" } return "worker" } func getNodeAge(node *corev1.Node) string { age := time.Since(node.CreationTimestamp.Time) days := int(age.Hours() / 24) hours := int(age.Hours()) % 24 if days > 0 { return fmt.Sprintf("%dd %dh", days, hours) } return fmt.Sprintf("%dh", hours) } func formatBytes(bytes int64) string { const unit = 1024 if bytes < unit { return fmt.Sprintf("%d B", bytes) } div, exp := int64(unit), 0 for n := bytes / unit; n >= unit; n /= unit { div *= unit exp++ } return fmt.Sprintf("%.1f %ciB", float64(bytes)/float64(div), "KMGTPE"[exp]) }