ocdp v1

2025-11-13 02:54:06 +00:00
commit c5e51ed069
254 changed files with 54901 additions and 0 deletions
--- a/backend/internal/adapter/output/k8s/metrics_client.go
+++ b/backend/internal/adapter/output/k8s/metrics_client.go
@ -0,0 +1,370 @@
+package k8s
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/clientcmd"
+	metricsv "k8s.io/metrics/pkg/client/clientset/versioned"
+
+	"github.com/ocdp/cluster-service/internal/domain/entity"
+	"github.com/ocdp/cluster-service/internal/domain/repository"
+)
+
+// MetricsClient retrieves monitoring metrics from Kubernetes clusters.
+// Its methods resolve a cluster by ID through clusterRepo before talking to
+// that cluster's API server.
+type MetricsClient struct {
+	clusterRepo repository.ClusterRepository
+}
+
+// NewMetricsClient returns a MetricsClient that looks clusters up through
+// clusterRepo.
+func NewMetricsClient(clusterRepo repository.ClusterRepository) *MetricsClient {
+	client := new(MetricsClient)
+	client.clusterRepo = clusterRepo
+	return client
+}
+
+// GetClusterMetrics returns the aggregated metrics of the cluster identified
+// by clusterID.
+//
+// It loads the cluster record, builds Kubernetes clients for it, lists every
+// node and every pod across all namespaces, collects per-node metrics and
+// folds everything into a single entity.ClusterMetrics. Failing to load the
+// cluster, to build the clients, or to list nodes or pods aborts with a
+// wrapped error. Failing to collect node metrics only prints a warning to
+// stdout; aggregation then proceeds with whatever was returned.
+func (c *MetricsClient) GetClusterMetrics(ctx context.Context, clusterID string) (*entity.ClusterMetrics, error) {
+	cluster, err := c.clusterRepo.GetByID(ctx, clusterID)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get cluster: %w", err)
+	}
+
+	clientset, metricsClient, err := c.createK8sClients(cluster)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create k8s client: %w", err)
+	}
+
+	core := clientset.CoreV1()
+
+	nodeList, err := core.Nodes().List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to list nodes: %w", err)
+	}
+
+	// metav1.NamespaceAll is the empty string: pods from every namespace.
+	podList, err := core.Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to list pods: %w", err)
+	}
+
+	perNode, metricsErr := c.getNodeMetricsData(ctx, clientset, metricsClient, nodeList.Items)
+	if metricsErr != nil {
+		// Node metrics are best effort: report the problem and keep going.
+		fmt.Printf("Warning: failed to get node metrics: %v\n", metricsErr)
+	}
+
+	return c.aggregateClusterMetrics(cluster, nodeList.Items, podList.Items, perNode), nil
+}
+
+// GetNodeMetrics returns one entity.NodeMetrics per node of the cluster
+// identified by clusterID.
+//
+// Failing to load the cluster, to build the Kubernetes clients, or to list
+// the nodes yields a wrapped error; otherwise the result of
+// getNodeMetricsData is returned unchanged.
+func (c *MetricsClient) GetNodeMetrics(ctx context.Context, clusterID string) ([]*entity.NodeMetrics, error) {
+	cluster, lookupErr := c.clusterRepo.GetByID(ctx, clusterID)
+	if lookupErr != nil {
+		return nil, fmt.Errorf("failed to get cluster: %w", lookupErr)
+	}
+
+	clientset, metricsClient, clientErr := c.createK8sClients(cluster)
+	if clientErr != nil {
+		return nil, fmt.Errorf("failed to create k8s client: %w", clientErr)
+	}
+
+	nodeList, listErr := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	if listErr != nil {
+		return nil, fmt.Errorf("failed to list nodes: %w", listErr)
+	}
+
+	perNode, metricsErr := c.getNodeMetricsData(ctx, clientset, metricsClient, nodeList.Items)
+	return perNode, metricsErr
+}
+
+// createK8sClients builds the core Kubernetes clientset and the metrics
+// clientset for cluster.
+//
+// The REST config is parsed from the cluster's kubeconfig. When parsing fails,
+// the config is assembled from the cluster's Host and TLS material instead.
+// If Host is empty as well, the kubeconfig error is returned: client-go
+// defaults an empty Host to "localhost", which would silently point the
+// clients at the wrong API server.
+//
+// Failing to build the metrics clientset is not treated as an error because
+// the metrics API is optional; the returned metrics clientset is then nil and
+// callers must check for that.
+func (c *MetricsClient) createK8sClients(cluster *entity.Cluster) (*kubernetes.Clientset, *metricsv.Clientset, error) {
+	config, err := clientcmd.RESTConfigFromKubeConfig([]byte(cluster.GetKubeConfig()))
+	if err != nil {
+		if cluster.Host == "" {
+			return nil, nil, fmt.Errorf("failed to build rest config: invalid kubeconfig and no host configured: %w", err)
+		}
+		// Fall back to the connection details stored on the cluster record.
+		config = &rest.Config{
+			Host: cluster.Host,
+			TLSClientConfig: rest.TLSClientConfig{
+				CAData:   []byte(cluster.CAData),
+				CertData: []byte(cluster.CertData),
+				KeyData:  []byte(cluster.KeyData),
+			},
+		}
+	}
+
+	clientset, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create clientset: %w", err)
+	}
+
+	metricsClient, err := metricsv.NewForConfig(config)
+	if err != nil {
+		// The metrics API may be unavailable; hand back a nil metrics client
+		// instead of failing the whole call.
+		return clientset, nil, nil
+	}
+
+	return clientset, metricsClient, nil
+}
+
+// getNodeMetricsData builds one entity.NodeMetrics per entry of nodes, in the
+// same order.
+//
+// Status, role, age, node info, CPU/memory capacity and allocatable, and the
+// NVIDIA GPU count and product label are read from the node objects.
+// Pod counts come from a single pod list over all namespaces, grouped by
+// spec.nodeName. Live CPU and memory usage come from a single node-metrics
+// list. This replaces one API round trip per node for each of the two.
+//
+// Both lookups are best effort: when one fails, or when metricsClient is nil,
+// the corresponding fields keep their zero values. Usage percentages are
+// relative to the node's allocatable resources. The returned error is
+// currently always nil.
+func (c *MetricsClient) getNodeMetricsData(
+	ctx context.Context,
+	clientset *kubernetes.Clientset,
+	metricsClient *metricsv.Clientset,
+	nodes []corev1.Node,
+) ([]*entity.NodeMetrics, error) {
+	result := make([]*entity.NodeMetrics, 0, len(nodes))
+	if len(nodes) == 0 {
+		// Nothing to describe, so skip the API calls entirely.
+		return result, nil
+	}
+
+	// Count pods per node from one list call. Pods that have not been
+	// scheduled yet have an empty node name and are ignored.
+	podsPerNode := make(map[string]int, len(nodes))
+	if podList, err := clientset.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{}); err == nil {
+		for i := range podList.Items {
+			if nodeName := podList.Items[i].Spec.NodeName; nodeName != "" {
+				podsPerNode[nodeName]++
+			}
+		}
+	}
+
+	// Index live usage by node name from one list call.
+	usageByNode := make(map[string]corev1.ResourceList, len(nodes))
+	if metricsClient != nil {
+		if usageList, err := metricsClient.MetricsV1beta1().NodeMetricses().List(ctx, metav1.ListOptions{}); err == nil {
+			for i := range usageList.Items {
+				usageByNode[usageList.Items[i].Name] = usageList.Items[i].Usage
+			}
+		}
+	}
+
+	for i := range nodes {
+		node := &nodes[i] // index instead of ranging by value: Node is a large struct
+
+		nodeMetric := &entity.NodeMetrics{
+			NodeName:         node.Name,
+			Status:           getNodeStatus(node),
+			Role:             getNodeRole(node),
+			Age:              getNodeAge(node),
+			OSImage:          node.Status.NodeInfo.OSImage,
+			KernelVersion:    node.Status.NodeInfo.KernelVersion,
+			ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
+			KubeletVersion:   node.Status.NodeInfo.KubeletVersion,
+			PodCount:         podsPerNode[node.Name],
+		}
+
+		// CPU
+		cpuCapacity := node.Status.Capacity.Cpu()
+		cpuAllocatable := node.Status.Allocatable.Cpu()
+		nodeMetric.CPUCapacity = fmt.Sprintf("%.2f cores", float64(cpuCapacity.MilliValue())/1000.0)
+		nodeMetric.CPUAllocatable = fmt.Sprintf("%.2f cores", float64(cpuAllocatable.MilliValue())/1000.0)
+
+		// Memory
+		memCapacity := node.Status.Capacity.Memory()
+		memAllocatable := node.Status.Allocatable.Memory()
+		nodeMetric.MemoryCapacity = formatBytes(memCapacity.Value())
+		nodeMetric.MemoryAllocatable = formatBytes(memAllocatable.Value())
+
+		// GPU: the count is taken from the node's allocatable resources and
+		// the model from the nvidia.com/gpu.product label when present.
+		if gpu, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok {
+			nodeMetric.GPUCapacity = int(gpu.Value())
+			if gpuType, ok := node.Labels["nvidia.com/gpu.product"]; ok {
+				nodeMetric.GPUType = gpuType
+			}
+		}
+
+		// Live usage, when the metrics API reported this node.
+		if usage, ok := usageByNode[node.Name]; ok {
+			cpuUsage := usage.Cpu()
+			nodeMetric.CPUUsage = fmt.Sprintf("%.2f cores", float64(cpuUsage.MilliValue())/1000.0)
+			if cpuAllocatable.MilliValue() > 0 {
+				nodeMetric.CPUPercent = float64(cpuUsage.MilliValue()) / float64(cpuAllocatable.MilliValue()) * 100
+			}
+
+			memUsage := usage.Memory()
+			nodeMetric.MemoryUsage = formatBytes(memUsage.Value())
+			if memAllocatable.Value() > 0 {
+				nodeMetric.MemoryPercent = float64(memUsage.Value()) / float64(memAllocatable.Value()) * 100
+			}
+		}
+
+		result = append(result, nodeMetric)
+	}
+
+	return result, nil
+}
+
+// aggregateClusterMetrics folds node objects, the pod list and per-node
+// metrics into a single cluster-level entity.ClusterMetrics.
+//
+// nodeMetrics is expected to be index-aligned with nodes; entries that are
+// missing or nil are skipped.
+//
+// Totals and per-node maxima for CPU and memory come from node capacity. GPU
+// counts come from the nvidia.com/gpu allocatable resource. Used CPU and
+// memory are rebuilt per node from that node's usage percentage and its own
+// allocatable amount, then summed, so the cluster-wide percentages stay
+// within a meaningful range instead of adding node percentages together.
+//
+// Uptime is the age of the node with the earliest creation timestamp.
+//
+// Status is "healthy" when every node is Ready, "warning" when only some are,
+// and "error" when none is. A cluster without nodes is reported as "healthy"
+// because the Ready count trivially equals the node count.
+func (c *MetricsClient) aggregateClusterMetrics(
+	cluster *entity.Cluster,
+	nodes []corev1.Node,
+	pods []corev1.Pod,
+	nodeMetrics []*entity.NodeMetrics,
+) *entity.ClusterMetrics {
+	metrics := &entity.ClusterMetrics{
+		ClusterID:   cluster.ID,
+		ClusterName: cluster.Name,
+		Status:      "healthy",
+		NodeCount:   len(nodes),
+		PodCount:    len(pods),
+		LastCheck:   time.Now(),
+		Nodes:       make([]entity.NodeMetrics, 0, len(nodeMetrics)),
+	}
+
+	// Cluster-wide sums (CPU in millicores, memory in bytes).
+	var totalCPU, totalMem, usedCPU, usedMem int64
+	var totalGPU, usedGPU int
+	healthyNodes := 0
+
+	// Largest single-node values.
+	var maxNodeCPU, maxNodeMem int64
+	var maxNodeGPU int
+	var maxNodeCPUUsage, maxNodeMemUsage, maxNodeGPUUsage float64
+
+	// Node with the earliest creation timestamp, used for Uptime.
+	var oldest *corev1.Node
+
+	for i := range nodes {
+		node := &nodes[i]
+
+		// CPU
+		cpuCap := node.Status.Capacity.Cpu().MilliValue()
+		totalCPU += cpuCap
+		if cpuCap > maxNodeCPU {
+			maxNodeCPU = cpuCap
+		}
+
+		// Memory
+		memCap := node.Status.Capacity.Memory().Value()
+		totalMem += memCap
+		if memCap > maxNodeMem {
+			maxNodeMem = memCap
+		}
+
+		// GPU
+		if gpu, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok {
+			gpuCount := int(gpu.Value())
+			totalGPU += gpuCount
+			if gpuCount > maxNodeGPU {
+				maxNodeGPU = gpuCount
+			}
+		}
+
+		if getNodeStatus(node) == "Ready" {
+			healthyNodes++
+		}
+
+		if oldest == nil || node.CreationTimestamp.Time.Before(oldest.CreationTimestamp.Time) {
+			oldest = node
+		}
+
+		// Usage figures are only available when a metrics entry exists.
+		if i >= len(nodeMetrics) || nodeMetrics[i] == nil {
+			continue
+		}
+		nm := nodeMetrics[i]
+		metrics.Nodes = append(metrics.Nodes, *nm)
+
+		if nm.CPUPercent > maxNodeCPUUsage {
+			maxNodeCPUUsage = nm.CPUPercent
+		}
+		if nm.MemoryPercent > maxNodeMemUsage {
+			maxNodeMemUsage = nm.MemoryPercent
+		}
+		if nm.GPUPercent > maxNodeGPUUsage {
+			maxNodeGPUUsage = nm.GPUPercent
+		}
+
+		// The node percentages are relative to the node's allocatable
+		// resources, so multiply back by that amount to recover absolute
+		// usage. Adding 0.5 rounds to nearest so float error cannot drop a unit.
+		allocCPU := float64(node.Status.Allocatable.Cpu().MilliValue())
+		allocMem := float64(node.Status.Allocatable.Memory().Value())
+		usedCPU += int64(nm.CPUPercent/100.0*allocCPU + 0.5)
+		usedMem += int64(nm.MemoryPercent/100.0*allocMem + 0.5)
+		usedGPU += nm.GPUUsage
+	}
+
+	// Cluster uptime: age of the oldest node.
+	if oldest != nil {
+		metrics.Uptime = getNodeAge(oldest)
+	}
+
+	// Totals
+	metrics.TotalCPU = fmt.Sprintf("%.2f cores", float64(totalCPU)/1000.0)
+	metrics.TotalMemory = formatBytes(totalMem)
+	metrics.TotalGPU = totalGPU
+
+	// Per-node maxima
+	metrics.MaxNodeCPU = fmt.Sprintf("%.2f cores", float64(maxNodeCPU)/1000.0)
+	metrics.MaxNodeMemory = formatBytes(maxNodeMem)
+	metrics.MaxNodeGPU = maxNodeGPU
+	metrics.MaxNodeCPUUsage = maxNodeCPUUsage
+	metrics.MaxNodeMemUsage = maxNodeMemUsage
+	metrics.MaxNodeGPUUsage = maxNodeGPUUsage
+
+	// Usage is only reported when per-node metrics were supplied.
+	if len(nodeMetrics) > 0 {
+		if totalCPU > 0 {
+			metrics.CPUUsage = float64(usedCPU) / float64(totalCPU) * 100
+		}
+		if totalMem > 0 {
+			metrics.MemoryUsage = float64(usedMem) / float64(totalMem) * 100
+		}
+		if totalGPU > 0 {
+			metrics.GPUUsage = float64(usedGPU) / float64(totalGPU) * 100
+		}
+
+		metrics.UsedCPU = fmt.Sprintf("%.2f cores", float64(usedCPU)/1000.0)
+		metrics.UsedMemory = formatBytes(usedMem)
+		metrics.UsedGPU = usedGPU
+	}
+
+	// Overall status from the share of Ready nodes.
+	switch {
+	case healthyNodes == len(nodes):
+		metrics.Status = "healthy"
+	case healthyNodes > 0:
+		metrics.Status = "warning"
+	default:
+		metrics.Status = "error"
+	}
+
+	return metrics
+}
+
+// Helper functions
+
+// getNodeStatus reports "Ready" or "NotReady" according to the node's
+// NodeReady condition, or "Unknown" when the node carries no such condition.
+func getNodeStatus(node *corev1.Node) string {
+	conditions := node.Status.Conditions
+	for i := range conditions {
+		if conditions[i].Type != corev1.NodeReady {
+			continue
+		}
+		if conditions[i].Status != corev1.ConditionTrue {
+			return "NotReady"
+		}
+		return "Ready"
+	}
+	return "Unknown"
+}
+
+// getNodeRole returns "control-plane" when the node carries either the
+// control-plane or the legacy master role label, and "worker" otherwise.
+func getNodeRole(node *corev1.Node) string {
+	controlPlaneLabels := [...]string{
+		"node-role.kubernetes.io/control-plane",
+		"node-role.kubernetes.io/master",
+	}
+	for _, label := range controlPlaneLabels {
+		if _, present := node.Labels[label]; present {
+			return "control-plane"
+		}
+	}
+	return "worker"
+}
+
+// getNodeAge formats the time elapsed since the node's creation timestamp as
+// "<days>d <hours>h", or as "<hours>h" when less than a full day has passed.
+func getNodeAge(node *corev1.Node) string {
+	elapsedHours := time.Since(node.CreationTimestamp.Time).Hours()
+	wholeDays := int(elapsedHours / 24)
+	remainingHours := int(elapsedHours) % 24
+
+	if wholeDays <= 0 {
+		return fmt.Sprintf("%dh", remainingHours)
+	}
+	return fmt.Sprintf("%dd %dh", wholeDays, remainingHours)
+}
+
+// formatBytes renders a byte count with binary (1024-based) units: values
+// below 1024 are printed as "<n> B", larger ones with one decimal and a
+// KiB/MiB/GiB/TiB/PiB/EiB suffix.
+func formatBytes(bytes int64) string {
+	const unit = 1024
+	const prefixes = "KMGTPE"
+
+	if bytes < unit {
+		return fmt.Sprintf("%d B", bytes)
+	}
+
+	// Grow the divisor by one unit step until the quotient fits below 1024.
+	divisor := int64(unit)
+	prefixIdx := 0
+	for bytes/divisor >= unit {
+		divisor *= unit
+		prefixIdx++
+	}
+
+	scaled := float64(bytes) / float64(divisor)
+	return fmt.Sprintf("%.1f %ciB", scaled, prefixes[prefixIdx])
+}
+