This commit is contained in:
mangomqy
2025-11-13 02:54:06 +00:00
commit c5e51ed069
254 changed files with 54901 additions and 0 deletions

View File

@ -0,0 +1,370 @@
package k8s
import (
"context"
"fmt"
"time"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
metricsv "k8s.io/metrics/pkg/client/clientset/versioned"
"github.com/ocdp/cluster-service/internal/domain/entity"
"github.com/ocdp/cluster-service/internal/domain/repository"
)
// MetricsClient fetches monitoring metrics from Kubernetes clusters.
type MetricsClient struct {
// clusterRepo looks up the stored cluster record for a given cluster ID.
clusterRepo repository.ClusterRepository
}
// NewMetricsClient builds a MetricsClient that resolves clusters through the
// given repository.
func NewMetricsClient(clusterRepo repository.ClusterRepository) *MetricsClient {
	client := new(MetricsClient)
	client.clusterRepo = clusterRepo
	return client
}
// GetClusterMetrics returns the aggregated monitoring metrics of one cluster.
//
// It looks the cluster up by clusterID, connects to it, lists every node and
// every pod (all namespaces), collects per-node metrics and folds everything
// into a single ClusterMetrics value.
//
// An error is returned when the cluster lookup, the client construction, the
// node listing or the pod listing fails. A failure while collecting per-node
// metrics is only printed as a warning; aggregation then proceeds with
// whatever node metrics were returned.
func (c *MetricsClient) GetClusterMetrics(ctx context.Context, clusterID string) (*entity.ClusterMetrics, error) {
	// Resolve the cluster record.
	target, err := c.clusterRepo.GetByID(ctx, clusterID)
	if err != nil {
		return nil, fmt.Errorf("failed to get cluster: %w", err)
	}

	// Build the Kubernetes API clients for that cluster.
	kube, metricsAPI, err := c.createK8sClients(target)
	if err != nil {
		return nil, fmt.Errorf("failed to create k8s client: %w", err)
	}
	core := kube.CoreV1()

	// All nodes of the cluster.
	nodeList, err := core.Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to list nodes: %w", err)
	}

	// All pods, across every namespace.
	podList, err := core.Pods("").List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, fmt.Errorf("failed to list pods: %w", err)
	}

	// Per-node metrics (CPU / memory usage). Failing here is not fatal:
	// the problem is reported and aggregation continues.
	perNode, err := c.getNodeMetricsData(ctx, kube, metricsAPI, nodeList.Items)
	if err != nil {
		fmt.Printf("Warning: failed to get node metrics: %v\n", err)
	}

	// Cluster-level summary.
	return c.aggregateClusterMetrics(target, nodeList.Items, podList.Items, perNode), nil
}
// GetNodeMetrics returns one NodeMetrics entry per node of the given cluster.
//
// An error is returned when the cluster lookup, the client construction or
// the node listing fails; otherwise the result of getNodeMetricsData is
// passed through unchanged.
func (c *MetricsClient) GetNodeMetrics(ctx context.Context, clusterID string) ([]*entity.NodeMetrics, error) {
	target, lookupErr := c.clusterRepo.GetByID(ctx, clusterID)
	if lookupErr != nil {
		return nil, fmt.Errorf("failed to get cluster: %w", lookupErr)
	}

	kube, metricsAPI, clientErr := c.createK8sClients(target)
	if clientErr != nil {
		return nil, fmt.Errorf("failed to create k8s client: %w", clientErr)
	}

	nodeList, listErr := kube.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if listErr != nil {
		return nil, fmt.Errorf("failed to list nodes: %w", listErr)
	}

	return c.getNodeMetricsData(ctx, kube, metricsAPI, nodeList.Items)
}
// createK8sClients builds the core Kubernetes clientset and the metrics
// clientset for a cluster.
//
// The REST configuration is parsed from the cluster's kubeconfig. If that
// fails, a configuration is assembled from the cluster's host and TLS
// material instead, and the parse error is discarded.
//
// A failure to build the core clientset is returned as an error. A failure to
// build the metrics clientset is tolerated: the core clientset is returned
// together with a nil metrics clientset and a nil error, so callers must
// nil-check the second result.
func (c *MetricsClient) createK8sClients(cluster *entity.Cluster) (*kubernetes.Clientset, *metricsv.Clientset, error) {
	restCfg, parseErr := clientcmd.RESTConfigFromKubeConfig([]byte(cluster.GetKubeConfig()))
	if parseErr != nil {
		// Kubeconfig unusable: fall back to the discrete connection fields.
		tlsCfg := rest.TLSClientConfig{
			CAData:   []byte(cluster.CAData),
			CertData: []byte(cluster.CertData),
			KeyData:  []byte(cluster.KeyData),
		}
		restCfg = &rest.Config{
			Host:            cluster.Host,
			TLSClientConfig: tlsCfg,
		}
	}

	kube, err := kubernetes.NewForConfig(restCfg)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create clientset: %w", err)
	}

	if metricsAPI, metricsErr := metricsv.NewForConfig(restCfg); metricsErr == nil {
		return kube, metricsAPI, nil
	}

	// The Metrics API may be unavailable; report that as a nil client, not an error.
	return kube, nil, nil
}
// getNodeMetricsData builds one NodeMetrics entry per node, in the same order
// as nodes.
//
// Static data (status, role, age, OS/runtime versions, CPU/memory capacity
// and allocatable, NVIDIA GPU count and product label) comes from the node
// object itself. The pod count is obtained with one pod list call per node,
// filtered on spec.nodeName. When metricsClient is non-nil, current CPU and
// memory usage are fetched per node and expressed both as formatted values
// and as a percentage of the node's allocatable amount.
//
// Failures of the per-node pod list or metrics calls are ignored and leave
// the corresponding fields at their zero value. The returned error is always
// nil.
func (c *MetricsClient) getNodeMetricsData(
	ctx context.Context,
	clientset *kubernetes.Clientset,
	metricsClient *metricsv.Clientset,
	nodes []corev1.Node,
) ([]*entity.NodeMetrics, error) {
	const coresFormat = "%.2f cores"

	collected := make([]*entity.NodeMetrics, 0, len(nodes))

	for i := range nodes {
		node := &nodes[i]
		info := node.Status.NodeInfo

		// Allocatable amounts are the denominators for the usage percentages below.
		allocCPUMilli := node.Status.Allocatable.Cpu().MilliValue()
		allocMemBytes := node.Status.Allocatable.Memory().Value()

		entry := &entity.NodeMetrics{
			NodeName:          node.Name,
			Status:            getNodeStatus(node),
			Role:              getNodeRole(node),
			Age:               getNodeAge(node),
			OSImage:           info.OSImage,
			KernelVersion:     info.KernelVersion,
			ContainerRuntime:  info.ContainerRuntimeVersion,
			KubeletVersion:    info.KubeletVersion,
			CPUCapacity:       fmt.Sprintf(coresFormat, float64(node.Status.Capacity.Cpu().MilliValue())/1000.0),
			CPUAllocatable:    fmt.Sprintf(coresFormat, float64(allocCPUMilli)/1000.0),
			MemoryCapacity:    formatBytes(node.Status.Capacity.Memory().Value()),
			MemoryAllocatable: formatBytes(allocMemBytes),
		}

		// GPU count is read from the node's allocatable resources; the
		// product name, when present, comes from a node label.
		if gpuQty, found := node.Status.Allocatable["nvidia.com/gpu"]; found {
			entry.GPUCapacity = int(gpuQty.Value())
			if product, labelled := node.Labels["nvidia.com/gpu.product"]; labelled {
				entry.GPUType = product
			}
		}

		// Number of pods scheduled on this node (best effort).
		onThisNode := metav1.ListOptions{
			FieldSelector: fmt.Sprintf("spec.nodeName=%s", node.Name),
		}
		if podList, err := clientset.CoreV1().Pods("").List(ctx, onThisNode); err == nil {
			entry.PodCount = len(podList.Items)
		}

		// Live usage, only when a metrics client is available (best effort).
		if metricsClient != nil {
			sample, err := metricsClient.MetricsV1beta1().NodeMetricses().Get(ctx, node.Name, metav1.GetOptions{})
			if err == nil {
				usedCPUMilli := sample.Usage.Cpu().MilliValue()
				entry.CPUUsage = fmt.Sprintf(coresFormat, float64(usedCPUMilli)/1000.0)
				if allocCPUMilli > 0 {
					entry.CPUPercent = float64(usedCPUMilli) / float64(allocCPUMilli) * 100
				}

				usedMemBytes := sample.Usage.Memory().Value()
				entry.MemoryUsage = formatBytes(usedMemBytes)
				if allocMemBytes > 0 {
					entry.MemoryPercent = float64(usedMemBytes) / float64(allocMemBytes) * 100
				}
			}
		}

		collected = append(collected, entry)
	}

	return collected, nil
}
// aggregateClusterMetrics folds node, pod and per-node metric data into a
// cluster-level summary.
//
// Parameters:
//   - cluster: supplies the ID and name copied into the result.
//   - nodes: the cluster's nodes; capacity totals, per-node maxima, the
//     healthy-node count and the uptime are derived from them.
//   - pods: only its length is used (PodCount).
//   - nodeMetrics: per-node metrics, matched to nodes by index (nodeMetrics[i]
//     describes nodes[i]); may be nil or shorter than nodes, and may contain
//     nil entries, which are skipped.
//
// Totals (TotalCPU, TotalMemory) are based on node capacity, TotalGPU on the
// allocatable "nvidia.com/gpu" resource. Usage fields are filled in only when
// nodeMetrics is non-empty. Status is "healthy" when every node is Ready,
// "warning" when only some are, and "error" when none is.
func (c *MetricsClient) aggregateClusterMetrics(
	cluster *entity.Cluster,
	nodes []corev1.Node,
	pods []corev1.Pod,
	nodeMetrics []*entity.NodeMetrics,
) *entity.ClusterMetrics {
	metrics := &entity.ClusterMetrics{
		ClusterID:   cluster.ID,
		ClusterName: cluster.Name,
		Status:      "healthy",
		NodeCount:   len(nodes),
		PodCount:    len(pods),
		LastCheck:   time.Now(),
		// Kept non-nil even when empty.
		Nodes: make([]entity.NodeMetrics, 0, len(nodeMetrics)),
	}

	// Cluster-wide sums.
	var totalCPU, totalMem, usedCPU, usedMem int64
	var totalGPU, usedGPU int
	healthyNodes := 0

	// Largest single-node values.
	var maxNodeCPU, maxNodeMem int64
	var maxNodeGPU int
	var maxNodeCPUUsage, maxNodeMemUsage, maxNodeGPUUsage float64

	// Node with the earliest creation timestamp, used for the uptime.
	var oldest *corev1.Node

	for i := range nodes {
		node := &nodes[i]

		// CPU capacity (millicores).
		cpuCap := node.Status.Capacity.Cpu().MilliValue()
		totalCPU += cpuCap
		if cpuCap > maxNodeCPU {
			maxNodeCPU = cpuCap
		}

		// Memory capacity (bytes).
		memCap := node.Status.Capacity.Memory().Value()
		totalMem += memCap
		if memCap > maxNodeMem {
			maxNodeMem = memCap
		}

		// GPU count.
		if gpu, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok {
			gpuCount := int(gpu.Value())
			totalGPU += gpuCount
			if gpuCount > maxNodeGPU {
				maxNodeGPU = gpuCount
			}
		}

		if getNodeStatus(node) == "Ready" {
			healthyNodes++
		}

		if oldest == nil || node.CreationTimestamp.Time.Before(oldest.CreationTimestamp.Time) {
			oldest = node
		}

		// Per-node usage, when an entry exists for this node.
		if i >= len(nodeMetrics) || nodeMetrics[i] == nil {
			continue
		}
		nm := nodeMetrics[i]
		metrics.Nodes = append(metrics.Nodes, *nm)

		if nm.CPUPercent > maxNodeCPUUsage {
			maxNodeCPUUsage = nm.CPUPercent
		}
		if nm.MemoryPercent > maxNodeMemUsage {
			maxNodeMemUsage = nm.MemoryPercent
		}
		if nm.GPUPercent > maxNodeGPUUsage {
			maxNodeGPUUsage = nm.GPUPercent
		}

		// getNodeMetricsData computes CPUPercent/MemoryPercent relative to the
		// node's own allocatable amount, so the absolute usage must be
		// recovered with that same per-node denominator. Scaling by the
		// cluster-wide total would count every node's share against the whole
		// cluster and inflate the sum by roughly the node count.
		cpuAlloc := node.Status.Allocatable.Cpu().MilliValue()
		memAlloc := node.Status.Allocatable.Memory().Value()
		// +0.5 rounds to nearest instead of truncating (values are non-negative).
		usedCPU += int64(nm.CPUPercent*float64(cpuAlloc)/100.0 + 0.5)
		usedMem += int64(nm.MemoryPercent*float64(memAlloc)/100.0 + 0.5)
		usedGPU += nm.GPUUsage
	}

	// Cluster uptime (simplified): the age of the oldest node.
	if oldest != nil {
		metrics.Uptime = getNodeAge(oldest)
	}

	// Totals.
	metrics.TotalCPU = fmt.Sprintf("%.2f cores", float64(totalCPU)/1000.0)
	metrics.TotalMemory = formatBytes(totalMem)
	metrics.TotalGPU = totalGPU

	// Single-node maxima.
	metrics.MaxNodeCPU = fmt.Sprintf("%.2f cores", float64(maxNodeCPU)/1000.0)
	metrics.MaxNodeMemory = formatBytes(maxNodeMem)
	metrics.MaxNodeGPU = maxNodeGPU
	metrics.MaxNodeCPUUsage = maxNodeCPUUsage
	metrics.MaxNodeMemUsage = maxNodeMemUsage
	metrics.MaxNodeGPUUsage = maxNodeGPUUsage

	// Usage, as a share of total capacity; only when node metrics were supplied.
	if len(nodeMetrics) > 0 {
		if totalCPU > 0 {
			metrics.CPUUsage = float64(usedCPU) / float64(totalCPU) * 100
		}
		if totalMem > 0 {
			metrics.MemoryUsage = float64(usedMem) / float64(totalMem) * 100
		}
		if totalGPU > 0 {
			metrics.GPUUsage = float64(usedGPU) / float64(totalGPU) * 100
		}
		metrics.UsedCPU = fmt.Sprintf("%.2f cores", float64(usedCPU)/1000.0)
		metrics.UsedMemory = formatBytes(usedMem)
		metrics.UsedGPU = usedGPU
	}

	// Overall status. A cluster with zero nodes matches the first case and is
	// therefore reported as "healthy".
	switch {
	case healthyNodes == len(nodes):
		metrics.Status = "healthy"
	case healthyNodes > 0:
		metrics.Status = "warning"
	default:
		metrics.Status = "error"
	}

	return metrics
}
// Helper functions
// getNodeStatus reports a node's readiness based on its NodeReady condition:
// "Ready" when that condition is true, "NotReady" when it has any other
// status, and "Unknown" when the node carries no NodeReady condition at all.
func getNodeStatus(node *corev1.Node) string {
	conditions := node.Status.Conditions
	for i := range conditions {
		if conditions[i].Type != corev1.NodeReady {
			continue
		}
		// Only the first NodeReady condition is considered.
		switch conditions[i].Status {
		case corev1.ConditionTrue:
			return "Ready"
		default:
			return "NotReady"
		}
	}
	return "Unknown"
}
// getNodeRole returns "control-plane" when the node carries either the
// control-plane or the master node-role label (the label value is ignored),
// and "worker" otherwise.
func getNodeRole(node *corev1.Node) string {
	controlPlaneLabels := [...]string{
		"node-role.kubernetes.io/control-plane",
		"node-role.kubernetes.io/master",
	}
	for _, label := range controlPlaneLabels {
		if _, present := node.Labels[label]; present {
			return "control-plane"
		}
	}
	return "worker"
}
// getNodeAge formats the time elapsed since the node's creation timestamp as
// "<days>d <hours>h", or as "<hours>h" alone when less than one full day has
// passed.
func getNodeAge(node *corev1.Node) string {
	elapsedHours := time.Since(node.CreationTimestamp.Time).Hours()
	wholeDays := int(elapsedHours / 24)
	leftoverHours := int(elapsedHours) % 24

	if wholeDays <= 0 {
		return fmt.Sprintf("%dh", leftoverHours)
	}
	return fmt.Sprintf("%dd %dh", wholeDays, leftoverHours)
}
// formatBytes renders a byte count in binary (1024-based) units with one
// decimal place, e.g. "1.5 KiB" or "4.0 GiB". Values below 1024, including
// negative ones, are printed verbatim as "<n> B".
func formatBytes(bytes int64) string {
	const (
		unit     = 1024
		prefixes = "KMGTPE"
	)

	if bytes < unit {
		return fmt.Sprintf("%d B", bytes)
	}

	// Find the largest power of 1024 that does not exceed the value.
	divisor := int64(unit)
	prefixIdx := 0
	remaining := bytes / unit
	for remaining >= unit {
		divisor *= unit
		prefixIdx++
		remaining /= unit
	}

	scaled := float64(bytes) / float64(divisor)
	return fmt.Sprintf("%.1f %ciB", scaled, prefixes[prefixIdx])
}