Files
ocdp-go/backend/internal/adapter/output/k8s/metrics_client.go
mangomqy c5e51ed069 ocdp v1
2025-11-13 02:54:06 +00:00

371 lines
10 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package k8s
import (
"context"
"fmt"
"time"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
metricsv "k8s.io/metrics/pkg/client/clientset/versioned"
"github.com/ocdp/cluster-service/internal/domain/entity"
"github.com/ocdp/cluster-service/internal/domain/repository"
)
// MetricsClient retrieves monitoring metrics from Kubernetes clusters.
type MetricsClient struct {
// clusterRepo is used to look up a cluster record by ID before connecting to it.
clusterRepo repository.ClusterRepository
}
// NewMetricsClient returns a MetricsClient that resolves clusters through
// the supplied repository.
func NewMetricsClient(clusterRepo repository.ClusterRepository) *MetricsClient {
	client := new(MetricsClient)
	client.clusterRepo = clusterRepo
	return client
}
// GetClusterMetrics collects a cluster-wide metrics summary for the cluster
// identified by clusterID.
//
// It loads the cluster record, builds the Kubernetes clients, lists all nodes
// and all pods (every namespace), gathers per-node metrics and aggregates the
// result into a single entity.ClusterMetrics. Failures in the lookup, client
// creation or list calls are returned wrapped. A failure while gathering
// per-node metrics is only printed as a warning and aggregation continues.
func (c *MetricsClient) GetClusterMetrics(ctx context.Context, clusterID string) (*entity.ClusterMetrics, error) {
	// Resolve the cluster record first; everything else depends on it.
	target, err := c.clusterRepo.GetByID(ctx, clusterID)
	if err != nil {
		return nil, fmt.Errorf("failed to get cluster: %w", err)
	}

	kube, metricsAPI, err := c.createK8sClients(target)
	if err != nil {
		return nil, fmt.Errorf("failed to create k8s client: %w", err)
	}

	core := kube.CoreV1()

	nodeList, err := core.Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to list nodes: %w", err)
	}

	// The empty namespace selects pods across all namespaces.
	podList, err := core.Pods("").List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to list pods: %w", err)
	}

	// Per-node metrics (CPU / memory usage) are best-effort: warn and carry on.
	perNode, err := c.getNodeMetricsData(ctx, kube, metricsAPI, nodeList.Items)
	if err != nil {
		fmt.Printf("Warning: failed to get node metrics: %v\n", err)
	}

	// Roll everything up into the cluster-level summary.
	return c.aggregateClusterMetrics(target, nodeList.Items, podList.Items, perNode), nil
}
// GetNodeMetrics returns the per-node metrics of the cluster identified by
// clusterID.
//
// Errors from the cluster lookup, client creation and node listing are
// returned wrapped; the result of getNodeMetricsData is passed through as-is.
func (c *MetricsClient) GetNodeMetrics(ctx context.Context, clusterID string) ([]*entity.NodeMetrics, error) {
	target, err := c.clusterRepo.GetByID(ctx, clusterID)
	if err != nil {
		return nil, fmt.Errorf("failed to get cluster: %w", err)
	}

	kube, metricsAPI, err := c.createK8sClients(target)
	if err != nil {
		return nil, fmt.Errorf("failed to create k8s client: %w", err)
	}

	nodeList, err := kube.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to list nodes: %w", err)
	}

	perNode, err := c.getNodeMetricsData(ctx, kube, metricsAPI, nodeList.Items)
	return perNode, err
}
// createK8sClients builds the API clients used to talk to the given cluster.
//
// The REST configuration is derived from the cluster's kubeconfig. If the
// kubeconfig cannot be turned into a REST config, the function falls back to
// the cluster's explicit connection fields (Host, CAData, CertData, KeyData).
// When that fallback is unusable because Host is empty, the kubeconfig error
// is returned (wrapped) instead of being discarded, since a config without a
// host cannot identify the cluster's API server.
//
// It returns the core clientset and the metrics clientset. The metrics
// clientset is nil, with a nil error, when it cannot be constructed; callers
// must nil-check it before use.
func (c *MetricsClient) createK8sClients(cluster *entity.Cluster) (*kubernetes.Clientset, *metricsv.Clientset, error) {
	config, err := clientcmd.RESTConfigFromKubeConfig([]byte(cluster.GetKubeConfig()))
	if err != nil {
		if cluster.Host == "" {
			// No usable fallback: surface the real cause rather than hiding it.
			return nil, nil, fmt.Errorf("failed to build rest config from kubeconfig and no host is configured: %w", err)
		}
		// Kubeconfig unusable: fall back to the explicit cluster settings.
		config = &rest.Config{
			Host: cluster.Host,
			TLSClientConfig: rest.TLSClientConfig{
				CAData:   []byte(cluster.CAData),
				CertData: []byte(cluster.CertData),
				KeyData:  []byte(cluster.KeyData),
			},
		}
	}

	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create clientset: %w", err)
	}

	metricsClient, err := metricsv.NewForConfig(config)
	if err != nil {
		// The Metrics API is optional: report "no metrics client" without failing.
		return clientset, nil, nil
	}

	return clientset, metricsClient, nil
}
// getNodeMetricsData builds one entity.NodeMetrics entry per node, in the same
// order as nodes.
//
// Static information (status, role, age, node info, CPU/memory capacity and
// allocatable, GPU count and type) is read from the Node objects. Pod counts
// come from a single pod list across all namespaces that is grouped by
// spec.nodeName, instead of issuing one filtered list request per node. When
// metricsClient is non-nil, live CPU and memory usage is fetched per node from
// the Metrics API and stored both as a formatted value and as a percentage of
// the node's allocatable resources.
//
// Pod listing and Metrics API failures are best-effort: the affected fields
// keep their zero values and no error is reported. The returned error is
// currently always nil.
func (c *MetricsClient) getNodeMetricsData(
	ctx context.Context,
	clientset *kubernetes.Clientset,
	metricsClient *metricsv.Clientset,
	nodes []corev1.Node,
) ([]*entity.NodeMetrics, error) {
	result := make([]*entity.NodeMetrics, 0, len(nodes))
	if len(nodes) == 0 {
		return result, nil
	}

	// Count pods per node with one request. If the list fails, every node
	// simply reports a pod count of zero.
	podsPerNode := make(map[string]int, len(nodes))
	if pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{}); err == nil {
		for i := range pods.Items {
			// Pods that are not scheduled yet have an empty node name.
			if nodeName := pods.Items[i].Spec.NodeName; nodeName != "" {
				podsPerNode[nodeName]++
			}
		}
	}

	for i := range nodes {
		// Index into the slice to avoid copying each Node value.
		node := &nodes[i]

		nodeMetric := &entity.NodeMetrics{
			NodeName:         node.Name,
			Status:           getNodeStatus(node),
			Role:             getNodeRole(node),
			Age:              getNodeAge(node),
			OSImage:          node.Status.NodeInfo.OSImage,
			KernelVersion:    node.Status.NodeInfo.KernelVersion,
			ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
			KubeletVersion:   node.Status.NodeInfo.KubeletVersion,
			PodCount:         podsPerNode[node.Name],
		}

		// CPU, reported in cores (milli-cores / 1000).
		cpuCapacity := node.Status.Capacity.Cpu()
		cpuAllocatable := node.Status.Allocatable.Cpu()
		nodeMetric.CPUCapacity = fmt.Sprintf("%.2f cores", float64(cpuCapacity.MilliValue())/1000.0)
		nodeMetric.CPUAllocatable = fmt.Sprintf("%.2f cores", float64(cpuAllocatable.MilliValue())/1000.0)

		// Memory, reported in human-readable binary units.
		memCapacity := node.Status.Capacity.Memory()
		memAllocatable := node.Status.Allocatable.Memory()
		nodeMetric.MemoryCapacity = formatBytes(memCapacity.Value())
		nodeMetric.MemoryAllocatable = formatBytes(memAllocatable.Value())

		// GPU count is taken from the node's allocatable resources.
		if gpu, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok {
			nodeMetric.GPUCapacity = int(gpu.Value())
			// The GPU model, when the node carries the product label.
			if gpuType, ok := node.Labels["nvidia.com/gpu.product"]; ok {
				nodeMetric.GPUType = gpuType
			}
		}

		// Live usage is only available when a metrics client was created.
		if metricsClient != nil {
			usage, err := metricsClient.MetricsV1beta1().NodeMetricses().Get(ctx, node.Name, metav1.GetOptions{})
			if err == nil {
				cpuUsage := usage.Usage.Cpu()
				nodeMetric.CPUUsage = fmt.Sprintf("%.2f cores", float64(cpuUsage.MilliValue())/1000.0)
				// Guard against division by zero when nothing is allocatable.
				if cpuAllocatable.MilliValue() > 0 {
					nodeMetric.CPUPercent = float64(cpuUsage.MilliValue()) / float64(cpuAllocatable.MilliValue()) * 100
				}

				memUsage := usage.Usage.Memory()
				nodeMetric.MemoryUsage = formatBytes(memUsage.Value())
				if memAllocatable.Value() > 0 {
					nodeMetric.MemoryPercent = float64(memUsage.Value()) / float64(memAllocatable.Value()) * 100
				}
			}
		}

		result = append(result, nodeMetric)
	}

	return result, nil
}
// aggregateClusterMetrics rolls node, pod and per-node metric data up into a
// cluster-level entity.ClusterMetrics.
//
// Parameters:
//   - cluster: supplies the cluster ID and name.
//   - nodes: all nodes of the cluster; totals and per-node maxima are derived
//     from their capacity (CPU, memory) and allocatable (GPU) resources.
//   - pods: all pods of the cluster; only their number is used.
//   - nodeMetrics: per-node metrics, matched to nodes by index (nodeMetrics[i]
//     is taken to describe nodes[i]); nil entries and missing entries are
//     skipped.
//
// Used CPU and memory are reconstructed per node from that node's usage
// percentage and its own allocatable amount, then summed. Cluster usage
// percentages are the summed usage relative to total capacity. Uptime is the
// age of the node with the earliest creation timestamp. Status is "healthy"
// when every node is Ready, "warning" when only some are, and "error" when
// none is.
func (c *MetricsClient) aggregateClusterMetrics(
	cluster *entity.Cluster,
	nodes []corev1.Node,
	pods []corev1.Pod,
	nodeMetrics []*entity.NodeMetrics,
) *entity.ClusterMetrics {
	metrics := &entity.ClusterMetrics{
		ClusterID:   cluster.ID,
		ClusterName: cluster.Name,
		Status:      "healthy",
		NodeCount:   len(nodes),
		PodCount:    len(pods),
		LastCheck:   time.Now(),
		Nodes:       make([]entity.NodeMetrics, 0, len(nodeMetrics)),
	}

	// Cluster-wide totals (CPU in milli-cores, memory in bytes).
	var totalCPU, totalMem, usedCPU, usedMem int64
	var totalGPU, usedGPU int
	healthyNodes := 0

	// Largest single-node values.
	var maxNodeCPU, maxNodeMem int64
	var maxNodeGPU int
	var maxNodeCPUUsage, maxNodeMemUsage, maxNodeGPUUsage float64

	// Index of the node with the earliest creation timestamp, -1 if none.
	oldest := -1

	for i := range nodes {
		// Index into the slice to avoid copying each Node value.
		node := &nodes[i]

		// CPU capacity.
		cpuCap := node.Status.Capacity.Cpu().MilliValue()
		totalCPU += cpuCap
		if cpuCap > maxNodeCPU {
			maxNodeCPU = cpuCap
		}

		// Memory capacity.
		memCap := node.Status.Capacity.Memory().Value()
		totalMem += memCap
		if memCap > maxNodeMem {
			maxNodeMem = memCap
		}

		// GPU count, taken from allocatable resources.
		if gpu, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok {
			gpuCount := int(gpu.Value())
			totalGPU += gpuCount
			if gpuCount > maxNodeGPU {
				maxNodeGPU = gpuCount
			}
		}

		if getNodeStatus(node) == "Ready" {
			healthyNodes++
		}

		if oldest < 0 || node.CreationTimestamp.Time.Before(nodes[oldest].CreationTimestamp.Time) {
			oldest = i
		}

		// Per-node usage, when metrics for this node are available.
		if i >= len(nodeMetrics) || nodeMetrics[i] == nil {
			continue
		}
		nm := nodeMetrics[i]
		metrics.Nodes = append(metrics.Nodes, *nm)

		if nm.CPUPercent > maxNodeCPUUsage {
			maxNodeCPUUsage = nm.CPUPercent
		}
		if nm.MemoryPercent > maxNodeMemUsage {
			maxNodeMemUsage = nm.MemoryPercent
		}
		if nm.GPUPercent > maxNodeGPUUsage {
			maxNodeGPUUsage = nm.GPUPercent
		}

		// The node percentages are relative to the node's own allocatable
		// resources, so that is the base needed to recover absolute usage.
		// Multiplying by the cluster total would count every node's share
		// against the whole cluster. Adding 0.5 rounds to the nearest unit.
		nodeCPU := float64(node.Status.Allocatable.Cpu().MilliValue())
		nodeMem := float64(node.Status.Allocatable.Memory().Value())
		usedCPU += int64(nm.CPUPercent*nodeCPU/100.0 + 0.5)
		usedMem += int64(nm.MemoryPercent*nodeMem/100.0 + 0.5)
		usedGPU += nm.GPUUsage
	}

	// Cluster uptime is approximated by the age of the oldest node.
	if oldest >= 0 {
		metrics.Uptime = getNodeAge(&nodes[oldest])
	}

	// Formatted totals.
	metrics.TotalCPU = fmt.Sprintf("%.2f cores", float64(totalCPU)/1000.0)
	metrics.TotalMemory = formatBytes(totalMem)
	metrics.TotalGPU = totalGPU

	// Formatted single-node maxima.
	metrics.MaxNodeCPU = fmt.Sprintf("%.2f cores", float64(maxNodeCPU)/1000.0)
	metrics.MaxNodeMemory = formatBytes(maxNodeMem)
	metrics.MaxNodeGPU = maxNodeGPU
	metrics.MaxNodeCPUUsage = maxNodeCPUUsage
	metrics.MaxNodeMemUsage = maxNodeMemUsage
	metrics.MaxNodeGPUUsage = maxNodeGPUUsage

	// Usage fields are only filled in when per-node metrics were supplied.
	if len(nodeMetrics) > 0 {
		if totalCPU > 0 {
			metrics.CPUUsage = float64(usedCPU) / float64(totalCPU) * 100
		}
		if totalMem > 0 {
			metrics.MemoryUsage = float64(usedMem) / float64(totalMem) * 100
		}
		if totalGPU > 0 {
			metrics.GPUUsage = float64(usedGPU) / float64(totalGPU) * 100
		}
		metrics.UsedCPU = fmt.Sprintf("%.2f cores", float64(usedCPU)/1000.0)
		metrics.UsedMemory = formatBytes(usedMem)
		metrics.UsedGPU = usedGPU
	}

	// Overall status from the share of Ready nodes.
	// NOTE(review): a cluster with zero nodes is reported as "healthy" —
	// confirm this is intended.
	switch {
	case healthyNodes == len(nodes):
		metrics.Status = "healthy"
	case healthyNodes > 0:
		metrics.Status = "warning"
	default:
		metrics.Status = "error"
	}

	return metrics
}
// Helper functions

// getNodeStatus reports a node's readiness: "Ready" when its Ready condition
// is true, "NotReady" when the Ready condition has any other status, and
// "Unknown" when the node has no Ready condition at all.
func getNodeStatus(node *corev1.Node) string {
	conditions := node.Status.Conditions
	for i := range conditions {
		if conditions[i].Type != corev1.NodeReady {
			continue
		}
		// The first Ready condition found decides the result.
		if conditions[i].Status == corev1.ConditionTrue {
			return "Ready"
		}
		return "NotReady"
	}
	return "Unknown"
}
// getNodeRole returns "control-plane" when the node carries either the
// control-plane or the master role label, and "worker" otherwise.
func getNodeRole(node *corev1.Node) string {
	controlPlaneLabels := [...]string{
		"node-role.kubernetes.io/control-plane",
		"node-role.kubernetes.io/master",
	}
	for _, label := range controlPlaneLabels {
		// Only the presence of the label matters, not its value.
		if _, present := node.Labels[label]; present {
			return "control-plane"
		}
	}
	return "worker"
}
// getNodeAge formats the time elapsed since the node's creation timestamp as
// "<days>d <hours>h", or as "<hours>h" when less than one full day has passed.
func getNodeAge(node *corev1.Node) string {
	elapsed := time.Since(node.CreationTimestamp.Time)
	wholeDays := int(elapsed.Hours() / 24)
	leftoverHours := int(elapsed.Hours()) % 24
	if wholeDays <= 0 {
		return fmt.Sprintf("%dh", leftoverHours)
	}
	return fmt.Sprintf("%dd %dh", wholeDays, leftoverHours)
}
// formatBytes renders a byte count in binary units. Values below 1024 are
// printed as whole bytes ("512 B"); larger values are scaled to the matching
// unit from KiB up to EiB and printed with one decimal place ("1.5 KiB").
func formatBytes(bytes int64) string {
	const unit = 1024
	const prefixes = "KMGTPE"

	if bytes < unit {
		return fmt.Sprintf("%d B", bytes)
	}

	// Find the largest power of 1024 that does not exceed the value.
	divisor := int64(unit)
	prefixIdx := 0
	remaining := bytes / unit
	for remaining >= unit {
		divisor *= unit
		prefixIdx++
		remaining /= unit
	}

	scaled := float64(bytes) / float64(divisor)
	return fmt.Sprintf("%.1f %ciB", scaled, prefixes[prefixIdx])
}