// ocdp-go/backend/internal/adapter/output/k8s/metrics_client.go

package k8s

import (
	"context"
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/clientcmd"
	metricsv "k8s.io/metrics/pkg/client/clientset/versioned"

	"github.com/ocdp/cluster-service/internal/domain/entity"
	"github.com/ocdp/cluster-service/internal/domain/repository"
)

// MetricsClient retrieves monitoring metrics from Kubernetes clusters.
type MetricsClient struct {
	// clusterRepo is used to look up a cluster record by ID before connecting to it.
	clusterRepo repository.ClusterRepository
}

// NewMetricsClient returns a MetricsClient that resolves clusters through
// the given repository.
func NewMetricsClient(clusterRepo repository.ClusterRepository) *MetricsClient {
	client := new(MetricsClient)
	client.clusterRepo = clusterRepo
	return client
}

// GetClusterMetrics returns the aggregated monitoring metrics of the cluster
// identified by clusterID.
//
// It loads the cluster record, connects to the cluster, lists all nodes and
// all pods (every namespace), collects per-node metrics and folds everything
// into a single entity.ClusterMetrics. A failure while loading the cluster,
// creating the clients, or listing nodes/pods is returned as a wrapped error.
// A failure while collecting per-node metrics is only printed as a warning and
// aggregation continues with whatever node data is available.
func (c *MetricsClient) GetClusterMetrics(ctx context.Context, clusterID string) (*entity.ClusterMetrics, error) {
	// Resolve the cluster record.
	target, err := c.clusterRepo.GetByID(ctx, clusterID)
	if err != nil {
		return nil, fmt.Errorf("failed to get cluster: %w", err)
	}

	// Build the core and metrics API clients.
	kube, usageClient, err := c.createK8sClients(target)
	if err != nil {
		return nil, fmt.Errorf("failed to create k8s client: %w", err)
	}
	core := kube.CoreV1()

	// List nodes.
	nodeList, err := core.Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to list nodes: %w", err)
	}

	// List pods across all namespaces.
	podList, err := core.Pods("").List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to list pods: %w", err)
	}

	// Per-node metrics (CPU/memory usage) are best effort: warn and carry on.
	perNode, metricsErr := c.getNodeMetricsData(ctx, kube, usageClient, nodeList.Items)
	if metricsErr != nil {
		fmt.Printf("Warning: failed to get node metrics: %v\n", metricsErr)
	}

	// Fold everything into the cluster-level summary.
	return c.aggregateClusterMetrics(target, nodeList.Items, podList.Items, perNode), nil
}

// GetNodeMetrics returns per-node metrics for the cluster identified by
// clusterID.
//
// Errors from loading the cluster, creating the clients, or listing nodes are
// wrapped and returned; otherwise the result of getNodeMetricsData is returned
// unchanged.
func (c *MetricsClient) GetNodeMetrics(ctx context.Context, clusterID string) ([]*entity.NodeMetrics, error) {
	target, err := c.clusterRepo.GetByID(ctx, clusterID)
	if err != nil {
		return nil, fmt.Errorf("failed to get cluster: %w", err)
	}

	kube, usageClient, err := c.createK8sClients(target)
	if err != nil {
		return nil, fmt.Errorf("failed to create k8s client: %w", err)
	}

	nodeList, err := kube.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to list nodes: %w", err)
	}

	perNode, err := c.getNodeMetricsData(ctx, kube, usageClient, nodeList.Items)
	return perNode, err
}

// GetPodResourceAllocations reports the Kubernetes resource requests/limits of
// every pod in the cluster (all namespaces). Only declared allocations are
// returned; no utilization values are invented. GPU memory is treated as
// vendor integer MB.
//
// Each entry carries the cluster ID, the pod's namespace and name, an instance
// name inferred from the pod labels, and the pod-level allocation computed by
// podResourceAllocation.
func (c *MetricsClient) GetPodResourceAllocations(ctx context.Context, clusterID string) ([]*entity.PodResourceAllocation, error) {
	target, err := c.clusterRepo.GetByID(ctx, clusterID)
	if err != nil {
		return nil, fmt.Errorf("failed to get cluster: %w", err)
	}

	// The metrics client is not needed here; only the core API is queried.
	kube, _, err := c.createK8sClients(target)
	if err != nil {
		return nil, fmt.Errorf("failed to create k8s client: %w", err)
	}

	podList, err := kube.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to list pods: %w", err)
	}

	items := podList.Items
	allocations := make([]*entity.PodResourceAllocation, len(items))
	for i := range items {
		current := &items[i]
		allocations[i] = &entity.PodResourceAllocation{
			ClusterID:    clusterID,
			Namespace:    current.Namespace,
			PodName:      current.Name,
			InstanceName: inferHelmReleaseName(current.Labels),
			Allocation:   podResourceAllocation(current),
		}
	}
	return allocations, nil
}

// createK8sClients builds the core Kubernetes clientset and the metrics
// clientset for the given cluster.
//
// The REST configuration is derived from the cluster's kubeconfig. If the
// kubeconfig cannot be parsed, the explicit connection fields of the cluster
// (Host plus CA/cert/key data) are used instead. When that fallback is not
// possible because no Host is set, the kubeconfig error is returned rather
// than silently building a client with an empty host, which would hide the
// real cause behind a later connection failure.
//
// Returns:
//   - the core clientset (nil on error),
//   - the metrics clientset, or nil when it could not be constructed (this is
//     not treated as an error; callers must nil-check it),
//   - an error when no usable REST config exists or the core clientset cannot
//     be created.
func (c *MetricsClient) createK8sClients(cluster *entity.Cluster) (*kubernetes.Clientset, *metricsv.Clientset, error) {
	config, err := clientcmd.RESTConfigFromKubeConfig([]byte(cluster.GetKubeConfig()))
	if err != nil {
		// Without a host there is nothing to fall back to; surface the
		// kubeconfig problem instead of discarding it.
		if cluster.Host == "" {
			return nil, nil, fmt.Errorf("failed to build rest config from kubeconfig and no cluster host is configured: %w", err)
		}

		// Fall back to the connection details stored on the cluster record.
		config = &rest.Config{
			Host: cluster.Host,
			TLSClientConfig: rest.TLSClientConfig{
				CAData:   []byte(cluster.CAData),
				CertData: []byte(cluster.CertData),
				KeyData:  []byte(cluster.KeyData),
			},
		}
	}

	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create clientset: %w", err)
	}

	metricsClient, err := metricsv.NewForConfig(config)
	if err != nil {
		// The metrics API is optional: return the core client alone and let
		// callers skip live usage data.
		return clientset, nil, nil
	}

	return clientset, metricsClient, nil
}

// getNodeMetricsData builds one entity.NodeMetrics entry per node, in the same
// order as nodes.
//
// Static fields (status, role, age, node info, CPU/memory capacity and
// allocatable, GPU count and product label) are read from the Node objects.
// Pod counts come from a single pod list across all namespaces, grouped by
// spec.nodeName, instead of one list request per node. When metricsClient is
// non-nil, live CPU/memory usage is fetched per node from the metrics API and
// the usage percentages are computed against the node's allocatable resources.
//
// Pod counts and live usage are best effort: if the pod list fails a warning
// is printed and every PodCount stays 0; if a node's usage lookup fails its
// usage fields stay at their zero values.
//
// Returns an error only when ctx is cancelled or expired while iterating; in
// that case no partial result is returned.
func (c *MetricsClient) getNodeMetricsData(
	ctx context.Context,
	clientset *kubernetes.Clientset,
	metricsClient *metricsv.Clientset,
	nodes []corev1.Node,
) ([]*entity.NodeMetrics, error) {
	result := make([]*entity.NodeMetrics, 0, len(nodes))
	if len(nodes) == 0 {
		return result, nil
	}

	// Count pods per node with one list call rather than one call per node.
	podsPerNode := make(map[string]int, len(nodes))
	pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
	if err != nil {
		fmt.Printf("Warning: failed to list pods for node pod counts: %v\n", err)
	} else {
		for i := range pods.Items {
			// Pods that are not bound to a node yet have an empty NodeName.
			if nodeName := pods.Items[i].Spec.NodeName; nodeName != "" {
				podsPerNode[nodeName]++
			}
		}
	}

	for i := range nodes {
		// Stop early instead of issuing further requests that can only fail.
		if err := ctx.Err(); err != nil {
			return nil, fmt.Errorf("failed to collect node metrics: %w", err)
		}

		node := &nodes[i]
		nodeMetric := &entity.NodeMetrics{
			NodeName:         node.Name,
			Status:           getNodeStatus(node),
			Role:             getNodeRole(node),
			Age:              getNodeAge(node),
			OSImage:          node.Status.NodeInfo.OSImage,
			KernelVersion:    node.Status.NodeInfo.KernelVersion,
			ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
			KubeletVersion:   node.Status.NodeInfo.KubeletVersion,
			PodCount:         podsPerNode[node.Name],
		}

		// CPU (milli-cores rendered as fractional cores).
		cpuCapacity := node.Status.Capacity.Cpu()
		cpuAllocatable := node.Status.Allocatable.Cpu()
		nodeMetric.CPUCapacity = fmt.Sprintf("%.2f cores", float64(cpuCapacity.MilliValue())/1000.0)
		nodeMetric.CPUAllocatable = fmt.Sprintf("%.2f cores", float64(cpuAllocatable.MilliValue())/1000.0)

		// Memory.
		memCapacity := node.Status.Capacity.Memory()
		memAllocatable := node.Status.Allocatable.Memory()
		nodeMetric.MemoryCapacity = formatBytes(memCapacity.Value())
		nodeMetric.MemoryAllocatable = formatBytes(memAllocatable.Value())

		// GPU count is taken from the node's allocatable resources; the GPU
		// type, when present, comes from the node's product label.
		if gpu, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok {
			nodeMetric.GPUCapacity = int(gpu.Value())
			if gpuType, ok := node.Labels["nvidia.com/gpu.product"]; ok {
				nodeMetric.GPUType = gpuType
			}
		}

		// Live usage, only when a metrics client is available.
		if metricsClient != nil {
			nodeMetricData, err := metricsClient.MetricsV1beta1().NodeMetricses().Get(ctx, node.Name, metav1.GetOptions{})
			if err == nil {
				cpuUsage := nodeMetricData.Usage.Cpu()
				nodeMetric.CPUUsage = fmt.Sprintf("%.2f cores", float64(cpuUsage.MilliValue())/1000.0)
				if cpuAllocatable.MilliValue() > 0 {
					nodeMetric.CPUPercent = float64(cpuUsage.MilliValue()) / float64(cpuAllocatable.MilliValue()) * 100
				}

				memUsage := nodeMetricData.Usage.Memory()
				nodeMetric.MemoryUsage = formatBytes(memUsage.Value())
				if memAllocatable.Value() > 0 {
					nodeMetric.MemoryPercent = float64(memUsage.Value()) / float64(memAllocatable.Value()) * 100
				}
			}
		}

		result = append(result, nodeMetric)
	}

	return result, nil
}

// aggregateClusterMetrics folds node, pod and per-node metric data into one
// cluster-level entity.ClusterMetrics.
//
// Parameters:
//   - cluster: supplies the cluster ID and name.
//   - nodes: the cluster's nodes; totals and per-node maxima are taken from
//     each node's capacity (GPU count from allocatable "nvidia.com/gpu").
//   - pods: only their count is used.
//   - nodeMetrics: per-node metrics, matched to nodes by index (the order
//     getNodeMetricsData produces). May be nil/shorter than nodes and may
//     contain nil entries; such nodes contribute no usage data.
//
// Used CPU/memory are reconstructed per node as "percent of that node's
// allocatable", because the node percentages were computed against
// allocatable. Scaling every node's percentage by the cluster-wide total, as
// was done previously, counted the cluster total once per node and could
// report usage far above 100%.
//
// Uptime is the age of the node with the earliest creation timestamp.
// Status is "healthy" when every node is Ready, "warning" when only some are,
// and "error" when none are.
// NOTE(review): a cluster with zero nodes is still reported as "healthy"
// (unchanged behavior) — confirm this is intended.
func (c *MetricsClient) aggregateClusterMetrics(
	cluster *entity.Cluster,
	nodes []corev1.Node,
	pods []corev1.Pod,
	nodeMetrics []*entity.NodeMetrics,
) *entity.ClusterMetrics {
	metrics := &entity.ClusterMetrics{
		ClusterID:   cluster.ID,
		ClusterName: cluster.Name,
		Status:      "healthy",
		NodeCount:   len(nodes),
		PodCount:    len(pods),
		LastCheck:   time.Now(),
		Nodes:       make([]entity.NodeMetrics, 0, len(nodeMetrics)),
	}

	// Cluster-wide sums.
	var totalCPU, totalMem, usedCPU, usedMem int64
	var totalGPU, usedGPU int
	healthyNodes := 0

	// Largest single-node values.
	var maxNodeCPU, maxNodeMem int64
	var maxNodeGPU int
	var maxNodeCPUUsage, maxNodeMemUsage, maxNodeGPUUsage float64

	// Node with the earliest creation timestamp, used for the uptime.
	var oldest *corev1.Node

	for i := range nodes {
		node := &nodes[i]

		// CPU capacity in milli-cores.
		cpuMilli := node.Status.Capacity.Cpu().MilliValue()
		totalCPU += cpuMilli
		if cpuMilli > maxNodeCPU {
			maxNodeCPU = cpuMilli
		}

		// Memory capacity in bytes.
		memBytes := node.Status.Capacity.Memory().Value()
		totalMem += memBytes
		if memBytes > maxNodeMem {
			maxNodeMem = memBytes
		}

		// GPU count.
		if gpu, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok {
			gpuCount := int(gpu.Value())
			totalGPU += gpuCount
			if gpuCount > maxNodeGPU {
				maxNodeGPU = gpuCount
			}
		}

		if getNodeStatus(node) == "Ready" {
			healthyNodes++
		}

		if oldest == nil || node.CreationTimestamp.Time.Before(oldest.CreationTimestamp.Time) {
			oldest = node
		}

		// Usage data for this node, if any.
		if i >= len(nodeMetrics) || nodeMetrics[i] == nil {
			continue
		}
		nm := nodeMetrics[i]
		metrics.Nodes = append(metrics.Nodes, *nm)

		if nm.CPUPercent > maxNodeCPUUsage {
			maxNodeCPUUsage = nm.CPUPercent
		}
		if nm.MemoryPercent > maxNodeMemUsage {
			maxNodeMemUsage = nm.MemoryPercent
		}
		if nm.GPUPercent > maxNodeGPUUsage {
			maxNodeGPUUsage = nm.GPUPercent
		}

		// Convert the node's percentages back into absolute usage using the
		// same base they were computed against (this node's allocatable).
		// The +0.5 rounds to nearest so float error cannot drop a unit.
		allocCPU := node.Status.Allocatable.Cpu().MilliValue()
		allocMem := node.Status.Allocatable.Memory().Value()
		usedCPU += int64(nm.CPUPercent/100.0*float64(allocCPU) + 0.5)
		usedMem += int64(nm.MemoryPercent/100.0*float64(allocMem) + 0.5)
		usedGPU += nm.GPUUsage
	}

	if oldest != nil {
		metrics.Uptime = getNodeAge(oldest)
	}

	// Totals.
	metrics.TotalCPU = fmt.Sprintf("%.2f cores", float64(totalCPU)/1000.0)
	metrics.TotalMemory = formatBytes(totalMem)
	metrics.TotalGPU = totalGPU

	// Single-node maxima.
	metrics.MaxNodeCPU = fmt.Sprintf("%.2f cores", float64(maxNodeCPU)/1000.0)
	metrics.MaxNodeMemory = formatBytes(maxNodeMem)
	metrics.MaxNodeGPU = maxNodeGPU
	metrics.MaxNodeCPUUsage = maxNodeCPUUsage
	metrics.MaxNodeMemUsage = maxNodeMemUsage
	metrics.MaxNodeGPUUsage = maxNodeGPUUsage

	// Usage fields are only filled in when per-node metrics were provided.
	if len(nodeMetrics) > 0 {
		if totalCPU > 0 {
			metrics.CPUUsage = float64(usedCPU) / float64(totalCPU) * 100
		}
		if totalMem > 0 {
			metrics.MemoryUsage = float64(usedMem) / float64(totalMem) * 100
		}
		if totalGPU > 0 {
			metrics.GPUUsage = float64(usedGPU) / float64(totalGPU) * 100
		}

		metrics.UsedCPU = fmt.Sprintf("%.2f cores", float64(usedCPU)/1000.0)
		metrics.UsedMemory = formatBytes(usedMem)
		metrics.UsedGPU = usedGPU
	}

	// Overall cluster status from node readiness.
	switch {
	case healthyNodes == len(nodes):
		metrics.Status = "healthy"
	case healthyNodes > 0:
		metrics.Status = "warning"
	default:
		metrics.Status = "error"
	}

	return metrics
}

// Helper functions

// getNodeStatus maps the node's NodeReady condition to a display string:
// "Ready" when the condition is true, "NotReady" when it has any other
// status, and "Unknown" when the node reports no NodeReady condition at all.
// Only the first NodeReady condition found is considered.
func getNodeStatus(node *corev1.Node) string {
	conditions := node.Status.Conditions
	for i := range conditions {
		if conditions[i].Type != corev1.NodeReady {
			continue
		}
		if conditions[i].Status != corev1.ConditionTrue {
			return "NotReady"
		}
		return "Ready"
	}
	return "Unknown"
}

// getNodeRole returns "control-plane" when the node carries either the
// control-plane or the master role label (the label value is ignored), and
// "worker" otherwise.
func getNodeRole(node *corev1.Node) string {
	roleLabels := [...]string{
		"node-role.kubernetes.io/control-plane",
		"node-role.kubernetes.io/master",
	}
	for _, label := range roleLabels {
		if _, present := node.Labels[label]; present {
			return "control-plane"
		}
	}
	return "worker"
}

// getNodeAge renders the time elapsed since the node's creation timestamp as
// "<days>d <hours>h", or just "<hours>h" when less than a full day has passed.
func getNodeAge(node *corev1.Node) string {
	totalHours := time.Since(node.CreationTimestamp.Time).Hours()
	days := int(totalHours / 24)
	hours := int(totalHours) % 24

	if days <= 0 {
		return fmt.Sprintf("%dh", hours)
	}
	return fmt.Sprintf("%dd %dh", days, hours)
}

// formatBytes renders a byte count as a human-readable string using binary
// (1024-based) units. Values below 1024 are printed as whole bytes ("512 B");
// larger values are printed with one decimal place and a KiB/MiB/GiB/TiB/PiB/EiB
// suffix ("1.5 KiB").
func formatBytes(bytes int64) string {
	const (
		base     = 1024
		prefixes = "KMGTPE"
	)
	if bytes < base {
		return fmt.Sprintf("%d B", bytes)
	}

	// Find the largest power of 1024 that does not exceed the value.
	scale := int64(base)
	idx := 0
	remaining := bytes / base
	for remaining >= base {
		scale *= base
		idx++
		remaining /= base
	}
	return fmt.Sprintf("%.1f %ciB", float64(bytes)/float64(scale), prefixes[idx])
}

// inferHelmReleaseName picks an instance/release name from a label set. The
// candidate keys are checked in a fixed priority order and the first one with
// a non-empty value wins. It returns "" when labels is nil or none of the
// candidate keys has a non-empty value.
func inferHelmReleaseName(labels map[string]string) string {
	// Highest priority first.
	candidates := [...]string{
		"app.kubernetes.io/instance",
		"release",
		"helm.sh/release",
		"meta.helm.sh/release-name",
		"app",
	}
	// Reading from a nil map yields "", so nil needs no separate guard.
	for i := range candidates {
		name := labels[candidates[i]]
		if name == "" {
			continue
		}
		return name
	}
	return ""
}

// podResourceAllocation computes the pod-level resource allocation: the sum
// over the regular containers, combined field by field (maximum) with the
// largest single init-container allocation. A nil pod yields the zero
// allocation.
func podResourceAllocation(pod *corev1.Pod) entity.ResourceAllocation {
	var total, initPeak entity.ResourceAllocation
	if pod == nil {
		return total
	}

	// Regular containers are added together.
	for i := range pod.Spec.Containers {
		total = addContainerAllocation(total, pod.Spec.Containers[i])
	}

	// Of the init containers only the per-field maximum is kept.
	for i := range pod.Spec.InitContainers {
		initPeak = maxAllocation(initPeak, containerAllocation(pod.Spec.InitContainers[i]))
	}

	return maxAllocation(total, initPeak)
}

// addContainerAllocation returns base plus the allocation declared by the
// given container.
func addContainerAllocation(base entity.ResourceAllocation, container corev1.Container) entity.ResourceAllocation {
	declared := containerAllocation(container)
	return addAllocation(base, declared)
}

// containerAllocation extracts the CPU (milli-cores), memory (bytes), GPU
// count and GPU memory requests/limits declared on a single container.
// Resources that are not declared are reported as 0. GPU memory is looked up
// under "nvidia.com/gpumem" first, then "requests.nvidia.com/gpumem".
func containerAllocation(container corev1.Container) entity.ResourceAllocation {
	const (
		gpuResource       = corev1.ResourceName("nvidia.com/gpu")
		gpuMemResource    = corev1.ResourceName("nvidia.com/gpumem")
		gpuMemAltResource = corev1.ResourceName("requests.nvidia.com/gpumem")
	)
	req, lim := container.Resources.Requests, container.Resources.Limits

	var alloc entity.ResourceAllocation
	alloc.CPURequestsMilli = quantityMilliValue(req, corev1.ResourceCPU)
	alloc.CPULimitsMilli = quantityMilliValue(lim, corev1.ResourceCPU)
	alloc.MemoryRequestsBytes = quantityValue(req, corev1.ResourceMemory)
	alloc.MemoryLimitsBytes = quantityValue(lim, corev1.ResourceMemory)
	alloc.GPURequests = quantityValue(req, gpuResource)
	alloc.GPULimits = quantityValue(lim, gpuResource)
	alloc.GPUMemoryRequestsMB = quantityValueAny(req, gpuMemResource, gpuMemAltResource)
	alloc.GPUMemoryLimitsMB = quantityValueAny(lim, gpuMemResource, gpuMemAltResource)
	return alloc
}

// addAllocation returns the field-by-field sum of two allocations.
func addAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
	// Start from the zero value so only the fields summed here are populated.
	var sum entity.ResourceAllocation
	sum.CPURequestsMilli = left.CPURequestsMilli + right.CPURequestsMilli
	sum.CPULimitsMilli = left.CPULimitsMilli + right.CPULimitsMilli
	sum.MemoryRequestsBytes = left.MemoryRequestsBytes + right.MemoryRequestsBytes
	sum.MemoryLimitsBytes = left.MemoryLimitsBytes + right.MemoryLimitsBytes
	sum.GPURequests = left.GPURequests + right.GPURequests
	sum.GPULimits = left.GPULimits + right.GPULimits
	sum.GPUMemoryRequestsMB = left.GPUMemoryRequestsMB + right.GPUMemoryRequestsMB
	sum.GPUMemoryLimitsMB = left.GPUMemoryLimitsMB + right.GPUMemoryLimitsMB
	return sum
}

// maxAllocation returns an allocation whose every field is the larger of the
// corresponding fields of left and right.
func maxAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
	// Start from the zero value so only the fields compared here are populated.
	var peak entity.ResourceAllocation
	peak.CPURequestsMilli = maxInt64(left.CPURequestsMilli, right.CPURequestsMilli)
	peak.CPULimitsMilli = maxInt64(left.CPULimitsMilli, right.CPULimitsMilli)
	peak.MemoryRequestsBytes = maxInt64(left.MemoryRequestsBytes, right.MemoryRequestsBytes)
	peak.MemoryLimitsBytes = maxInt64(left.MemoryLimitsBytes, right.MemoryLimitsBytes)
	peak.GPURequests = maxInt64(left.GPURequests, right.GPURequests)
	peak.GPULimits = maxInt64(left.GPULimits, right.GPULimits)
	peak.GPUMemoryRequestsMB = maxInt64(left.GPUMemoryRequestsMB, right.GPUMemoryRequestsMB)
	peak.GPUMemoryLimitsMB = maxInt64(left.GPUMemoryLimitsMB, right.GPUMemoryLimitsMB)
	return peak
}

// quantityMilliValue returns the named resource's quantity in milli-units, or
// 0 when the resource is not present in the list.
func quantityMilliValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
	qty, found := resources[name]
	if !found {
		return 0
	}
	return qty.MilliValue()
}

// quantityValue returns the named resource's quantity as an integer value, or
// 0 when the resource is not present in the list.
func quantityValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
	qty, found := resources[name]
	if !found {
		return 0
	}
	return qty.Value()
}

// quantityValueAny returns the integer value of the first resource in names
// that is present in the list, or 0 when none of them is present.
func quantityValueAny(resources corev1.ResourceList, names ...corev1.ResourceName) int64 {
	for i := range names {
		qty, found := resources[names[i]]
		if !found {
			continue
		}
		return qty.Value()
	}
	return 0
}

// maxInt64 returns the larger of the two values.
func maxInt64(left, right int64) int64 {
	if right >= left {
		return right
	}
	return left
}