ocdp v1
This commit is contained in:
321
backend/internal/adapter/output/k8s/entry_client.go
Normal file
321
backend/internal/adapter/output/k8s/entry_client.go
Normal file
@ -0,0 +1,321 @@
|
||||
package k8s
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
networkingv1 "k8s.io/api/networking/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/util/intstr"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"k8s.io/client-go/rest"
|
||||
"k8s.io/client-go/tools/clientcmd"
|
||||
|
||||
"github.com/ocdp/cluster-service/internal/domain/entity"
|
||||
"github.com/ocdp/cluster-service/internal/domain/repository"
|
||||
)
|
||||
|
||||
// EntryClient queries the Kubernetes API for the Service and Ingress
// resources associated with an instance.
type EntryClient struct{}
|
||||
|
||||
// NewEntryClient 创建 EntryClient
|
||||
func NewEntryClient() repository.InstanceEntryClient {
|
||||
return &EntryClient{}
|
||||
}
|
||||
|
||||
// ListEntries 查询实例的 Service/Ingress 入口
|
||||
func (c *EntryClient) ListEntries(
|
||||
ctx context.Context,
|
||||
cluster *entity.Cluster,
|
||||
instance *entity.Instance,
|
||||
) ([]*entity.InstanceEntry, error) {
|
||||
clientset, err := c.createClientset(cluster)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
selector := fmt.Sprintf("app.kubernetes.io/instance=%s", instance.Name)
|
||||
|
||||
serviceEntries, err := c.collectServiceEntries(ctx, clientset, instance, selector)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ingressEntries, err := c.collectIngressEntries(ctx, clientset, instance, selector)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return append(serviceEntries, ingressEntries...), nil
|
||||
}
|
||||
|
||||
func (c *EntryClient) collectServiceEntries(
|
||||
ctx context.Context,
|
||||
clientset *kubernetes.Clientset,
|
||||
instance *entity.Instance,
|
||||
selector string,
|
||||
) ([]*entity.InstanceEntry, error) {
|
||||
services, err := c.listServices(ctx, clientset, instance.Namespace, selector)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
entries := convertServicesToEntries(services, instance, selector == "")
|
||||
if len(entries) == 0 && selector != "" {
|
||||
// Fallback: widen the search scope and filter manually.
|
||||
services, err = c.listServices(ctx, clientset, instance.Namespace, "")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
entries = convertServicesToEntries(services, instance, true)
|
||||
}
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
func (c *EntryClient) collectIngressEntries(
|
||||
ctx context.Context,
|
||||
clientset *kubernetes.Clientset,
|
||||
instance *entity.Instance,
|
||||
selector string,
|
||||
) ([]*entity.InstanceEntry, error) {
|
||||
ingresses, err := c.listIngresses(ctx, clientset, instance.Namespace, selector)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
entries := convertIngressesToEntries(ingresses, instance, selector == "")
|
||||
if len(entries) == 0 && selector != "" {
|
||||
ingresses, err = c.listIngresses(ctx, clientset, instance.Namespace, "")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
entries = convertIngressesToEntries(ingresses, instance, true)
|
||||
}
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
func (c *EntryClient) listServices(
|
||||
ctx context.Context,
|
||||
clientset *kubernetes.Clientset,
|
||||
namespace, selector string,
|
||||
) ([]corev1.Service, error) {
|
||||
listOptions := metav1.ListOptions{}
|
||||
if selector != "" {
|
||||
listOptions.LabelSelector = selector
|
||||
}
|
||||
services, err := clientset.CoreV1().
|
||||
Services(namespace).
|
||||
List(ctx, listOptions)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to list services: %w", err)
|
||||
}
|
||||
return services.Items, nil
|
||||
}
|
||||
|
||||
func (c *EntryClient) listIngresses(
|
||||
ctx context.Context,
|
||||
clientset *kubernetes.Clientset,
|
||||
namespace, selector string,
|
||||
) ([]networkingv1.Ingress, error) {
|
||||
listOptions := metav1.ListOptions{}
|
||||
if selector != "" {
|
||||
listOptions.LabelSelector = selector
|
||||
}
|
||||
ingresses, err := clientset.NetworkingV1().
|
||||
Ingresses(namespace).
|
||||
List(ctx, listOptions)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to list ingresses: %w", err)
|
||||
}
|
||||
return ingresses.Items, nil
|
||||
}
|
||||
|
||||
func convertServicesToEntries(services []corev1.Service, instance *entity.Instance, enforceMatch bool) []*entity.InstanceEntry {
|
||||
entries := make([]*entity.InstanceEntry, 0, len(services))
|
||||
for _, svc := range services {
|
||||
if enforceMatch && !resourceMatchesInstance(svc.ObjectMeta, instance) {
|
||||
continue
|
||||
}
|
||||
entries = append(entries, convertServiceToEntry(&svc))
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func convertIngressesToEntries(ingresses []networkingv1.Ingress, instance *entity.Instance, enforceMatch bool) []*entity.InstanceEntry {
|
||||
entries := make([]*entity.InstanceEntry, 0, len(ingresses))
|
||||
for _, ing := range ingresses {
|
||||
if enforceMatch && !resourceMatchesInstance(ing.ObjectMeta, instance) {
|
||||
continue
|
||||
}
|
||||
entries = append(entries, convertIngressToEntry(&ing))
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func (c *EntryClient) createClientset(cluster *entity.Cluster) (*kubernetes.Clientset, error) {
|
||||
config, err := clientcmd.RESTConfigFromKubeConfig([]byte(cluster.GetKubeConfig()))
|
||||
if err != nil {
|
||||
config = &rest.Config{
|
||||
Host: cluster.Host,
|
||||
TLSClientConfig: rest.TLSClientConfig{
|
||||
CAData: []byte(cluster.CAData),
|
||||
CertData: []byte(cluster.CertData),
|
||||
KeyData: []byte(cluster.KeyData),
|
||||
},
|
||||
BearerToken: cluster.Token,
|
||||
}
|
||||
}
|
||||
|
||||
clientset, err := kubernetes.NewForConfig(config)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create kubernetes client: %w", err)
|
||||
}
|
||||
|
||||
return clientset, nil
|
||||
}
|
||||
|
||||
func convertServiceToEntry(svc *corev1.Service) *entity.InstanceEntry {
|
||||
clusterIP := svc.Spec.ClusterIP
|
||||
if clusterIP == corev1.ClusterIPNone {
|
||||
clusterIP = ""
|
||||
}
|
||||
|
||||
lbIngress := make([]string, 0, len(svc.Status.LoadBalancer.Ingress))
|
||||
for _, ing := range svc.Status.LoadBalancer.Ingress {
|
||||
if ing.IP != "" {
|
||||
lbIngress = append(lbIngress, ing.IP)
|
||||
}
|
||||
if ing.Hostname != "" {
|
||||
lbIngress = append(lbIngress, ing.Hostname)
|
||||
}
|
||||
}
|
||||
|
||||
ports := make([]entity.InstanceEntryPort, 0, len(svc.Spec.Ports))
|
||||
for _, port := range svc.Spec.Ports {
|
||||
ports = append(ports, entity.InstanceEntryPort{
|
||||
Name: port.Name,
|
||||
Protocol: string(port.Protocol),
|
||||
Port: port.Port,
|
||||
TargetPort: intOrStringToString(port.TargetPort),
|
||||
NodePort: port.NodePort,
|
||||
})
|
||||
}
|
||||
|
||||
return &entity.InstanceEntry{
|
||||
Kind: "Service",
|
||||
Name: svc.Name,
|
||||
Namespace: svc.Namespace,
|
||||
Type: string(svc.Spec.Type),
|
||||
ClusterIP: clusterIP,
|
||||
ExternalIPs: append([]string{}, svc.Spec.ExternalIPs...),
|
||||
LoadBalancerIngress: lbIngress,
|
||||
Ports: ports,
|
||||
}
|
||||
}
|
||||
|
||||
func convertIngressToEntry(ing *networkingv1.Ingress) *entity.InstanceEntry {
|
||||
lbIngress := make([]string, 0, len(ing.Status.LoadBalancer.Ingress))
|
||||
for _, addr := range ing.Status.LoadBalancer.Ingress {
|
||||
if addr.IP != "" {
|
||||
lbIngress = append(lbIngress, addr.IP)
|
||||
}
|
||||
if addr.Hostname != "" {
|
||||
lbIngress = append(lbIngress, addr.Hostname)
|
||||
}
|
||||
}
|
||||
|
||||
hosts := make([]entity.InstanceEntryHost, 0, len(ing.Spec.Rules))
|
||||
for _, rule := range ing.Spec.Rules {
|
||||
hostEntry := entity.InstanceEntryHost{
|
||||
Host: rule.Host,
|
||||
}
|
||||
if rule.HTTP != nil {
|
||||
paths := make([]entity.InstanceEntryPath, 0, len(rule.HTTP.Paths))
|
||||
for _, path := range rule.HTTP.Paths {
|
||||
name := ""
|
||||
port := ""
|
||||
if path.Backend.Service != nil {
|
||||
name = path.Backend.Service.Name
|
||||
port = serviceBackendPortString(path.Backend.Service.Port)
|
||||
}
|
||||
paths = append(paths, entity.InstanceEntryPath{
|
||||
Path: path.Path,
|
||||
ServiceName: name,
|
||||
ServicePort: port,
|
||||
})
|
||||
}
|
||||
hostEntry.Paths = paths
|
||||
}
|
||||
hosts = append(hosts, hostEntry)
|
||||
}
|
||||
|
||||
tlsEntries := make([]entity.InstanceEntryTLS, 0, len(ing.Spec.TLS))
|
||||
for _, tls := range ing.Spec.TLS {
|
||||
tlsEntries = append(tlsEntries, entity.InstanceEntryTLS{
|
||||
Hosts: append([]string{}, tls.Hosts...),
|
||||
SecretName: tls.SecretName,
|
||||
})
|
||||
}
|
||||
|
||||
entryType := "Ingress"
|
||||
if ing.Spec.IngressClassName != nil {
|
||||
entryType = *ing.Spec.IngressClassName
|
||||
}
|
||||
|
||||
return &entity.InstanceEntry{
|
||||
Kind: "Ingress",
|
||||
Name: ing.Name,
|
||||
Namespace: ing.Namespace,
|
||||
Type: entryType,
|
||||
LoadBalancerIngress: lbIngress,
|
||||
Hosts: hosts,
|
||||
TLS: tlsEntries,
|
||||
}
|
||||
}
|
||||
|
||||
func intOrStringToString(v intstr.IntOrString) string {
|
||||
if v.Type == intstr.String {
|
||||
return v.StrVal
|
||||
}
|
||||
return fmt.Sprintf("%d", v.IntValue())
|
||||
}
|
||||
|
||||
func serviceBackendPortString(port networkingv1.ServiceBackendPort) string {
|
||||
if port.Name != "" {
|
||||
return port.Name
|
||||
}
|
||||
if port.Number != 0 {
|
||||
return fmt.Sprintf("%d", port.Number)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func resourceMatchesInstance(meta metav1.ObjectMeta, instance *entity.Instance) bool {
|
||||
if instance == nil {
|
||||
return false
|
||||
}
|
||||
labels := meta.GetLabels()
|
||||
if labels != nil {
|
||||
if labels["app.kubernetes.io/instance"] == instance.Name {
|
||||
return true
|
||||
}
|
||||
labelKeys := []string{"app", "app.kubernetes.io/name", "app.kubernetes.io/component", "release"}
|
||||
for _, key := range labelKeys {
|
||||
if labels[key] == instance.Name {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
annotations := meta.GetAnnotations()
|
||||
if annotations != nil {
|
||||
if annotations["meta.helm.sh/release-name"] == instance.Name {
|
||||
if ns := annotations["meta.helm.sh/release-namespace"]; ns == "" || ns == instance.Namespace {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
name := meta.GetName()
|
||||
if name == instance.Name || strings.HasPrefix(name, instance.Name+"-") {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
54
backend/internal/adapter/output/k8s/entry_client_test.go
Normal file
54
backend/internal/adapter/output/k8s/entry_client_test.go
Normal file
@ -0,0 +1,54 @@
|
||||
package k8s
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
|
||||
"github.com/ocdp/cluster-service/internal/domain/entity"
|
||||
)
|
||||
|
||||
func TestResourceMatchesInstance(t *testing.T) {
|
||||
instance := &entity.Instance{
|
||||
Name: "demo",
|
||||
Namespace: "default",
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
meta metav1.ObjectMeta
|
||||
want bool
|
||||
}{
|
||||
{
|
||||
name: "matches by standard label",
|
||||
meta: metav1.ObjectMeta{Labels: map[string]string{
|
||||
"app.kubernetes.io/instance": "demo",
|
||||
}},
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "matches by helm annotations",
|
||||
meta: metav1.ObjectMeta{Annotations: map[string]string{
|
||||
"meta.helm.sh/release-name": "demo",
|
||||
"meta.helm.sh/release-namespace": "default",
|
||||
}},
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "matches by resource name prefix",
|
||||
meta: metav1.ObjectMeta{Name: "demo-nginx"},
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "does not match unrelated resource",
|
||||
meta: metav1.ObjectMeta{Name: "other"},
|
||||
want: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
if got := resourceMatchesInstance(tc.meta, instance); got != tc.want {
|
||||
t.Fatalf("%s: expected %v, got %v", tc.name, tc.want, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
370
backend/internal/adapter/output/k8s/metrics_client.go
Normal file
370
backend/internal/adapter/output/k8s/metrics_client.go
Normal file
@ -0,0 +1,370 @@
|
||||
package k8s
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"k8s.io/client-go/rest"
|
||||
"k8s.io/client-go/tools/clientcmd"
|
||||
metricsv "k8s.io/metrics/pkg/client/clientset/versioned"
|
||||
|
||||
"github.com/ocdp/cluster-service/internal/domain/entity"
|
||||
"github.com/ocdp/cluster-service/internal/domain/repository"
|
||||
)
|
||||
|
||||
// MetricsClient 实现从 Kubernetes 集群获取监控指标
|
||||
type MetricsClient struct {
|
||||
clusterRepo repository.ClusterRepository
|
||||
}
|
||||
|
||||
// NewMetricsClient 创建 MetricsClient
|
||||
func NewMetricsClient(clusterRepo repository.ClusterRepository) *MetricsClient {
|
||||
return &MetricsClient{
|
||||
clusterRepo: clusterRepo,
|
||||
}
|
||||
}
|
||||
|
||||
// GetClusterMetrics 获取集群监控指标
|
||||
func (c *MetricsClient) GetClusterMetrics(ctx context.Context, clusterID string) (*entity.ClusterMetrics, error) {
|
||||
// 获取集群信息
|
||||
cluster, err := c.clusterRepo.GetByID(ctx, clusterID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get cluster: %w", err)
|
||||
}
|
||||
|
||||
// 创建 Kubernetes 客户端
|
||||
clientset, metricsClient, err := c.createK8sClients(cluster)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create k8s client: %w", err)
|
||||
}
|
||||
|
||||
// 获取节点列表
|
||||
nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to list nodes: %w", err)
|
||||
}
|
||||
|
||||
// 获取所有 Pods
|
||||
pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to list pods: %w", err)
|
||||
}
|
||||
|
||||
// 获取节点指标(CPU/内存使用情况)
|
||||
nodeMetrics, err := c.getNodeMetricsData(ctx, clientset, metricsClient, nodes.Items)
|
||||
if err != nil {
|
||||
// 如果无法获取 metrics,记录错误但继续
|
||||
fmt.Printf("Warning: failed to get node metrics: %v\n", err)
|
||||
}
|
||||
|
||||
// 计算集群级别汇总
|
||||
metrics := c.aggregateClusterMetrics(cluster, nodes.Items, pods.Items, nodeMetrics)
|
||||
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
// GetNodeMetrics 获取集群节点指标
|
||||
func (c *MetricsClient) GetNodeMetrics(ctx context.Context, clusterID string) ([]*entity.NodeMetrics, error) {
|
||||
cluster, err := c.clusterRepo.GetByID(ctx, clusterID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get cluster: %w", err)
|
||||
}
|
||||
|
||||
clientset, metricsClient, err := c.createK8sClients(cluster)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create k8s client: %w", err)
|
||||
}
|
||||
|
||||
nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to list nodes: %w", err)
|
||||
}
|
||||
|
||||
return c.getNodeMetricsData(ctx, clientset, metricsClient, nodes.Items)
|
||||
}
|
||||
|
||||
// createK8sClients 创建 Kubernetes 客户端
|
||||
func (c *MetricsClient) createK8sClients(cluster *entity.Cluster) (*kubernetes.Clientset, *metricsv.Clientset, error) {
|
||||
config, err := clientcmd.RESTConfigFromKubeConfig([]byte(cluster.GetKubeConfig()))
|
||||
if err != nil {
|
||||
// 如果无法从 kubeconfig 创建,尝试使用集群配置
|
||||
config = &rest.Config{
|
||||
Host: cluster.Host,
|
||||
TLSClientConfig: rest.TLSClientConfig{
|
||||
CAData: []byte(cluster.CAData),
|
||||
CertData: []byte(cluster.CertData),
|
||||
KeyData: []byte(cluster.KeyData),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
clientset, err := kubernetes.NewForConfig(config)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("failed to create clientset: %w", err)
|
||||
}
|
||||
|
||||
metricsClient, err := metricsv.NewForConfig(config)
|
||||
if err != nil {
|
||||
// Metrics API 可能不可用,返回 nil 但不报错
|
||||
return clientset, nil, nil
|
||||
}
|
||||
|
||||
return clientset, metricsClient, nil
|
||||
}
|
||||
|
||||
// getNodeMetricsData 获取节点详细指标
|
||||
func (c *MetricsClient) getNodeMetricsData(
|
||||
ctx context.Context,
|
||||
clientset *kubernetes.Clientset,
|
||||
metricsClient *metricsv.Clientset,
|
||||
nodes []corev1.Node,
|
||||
) ([]*entity.NodeMetrics, error) {
|
||||
result := make([]*entity.NodeMetrics, 0, len(nodes))
|
||||
|
||||
for _, node := range nodes {
|
||||
nodeMetric := &entity.NodeMetrics{
|
||||
NodeName: node.Name,
|
||||
Status: getNodeStatus(&node),
|
||||
Role: getNodeRole(&node),
|
||||
Age: getNodeAge(&node),
|
||||
OSImage: node.Status.NodeInfo.OSImage,
|
||||
KernelVersion: node.Status.NodeInfo.KernelVersion,
|
||||
ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
|
||||
KubeletVersion: node.Status.NodeInfo.KubeletVersion,
|
||||
}
|
||||
|
||||
// CPU
|
||||
cpuCapacity := node.Status.Capacity.Cpu()
|
||||
cpuAllocatable := node.Status.Allocatable.Cpu()
|
||||
nodeMetric.CPUCapacity = fmt.Sprintf("%.2f cores", float64(cpuCapacity.MilliValue())/1000.0)
|
||||
nodeMetric.CPUAllocatable = fmt.Sprintf("%.2f cores", float64(cpuAllocatable.MilliValue())/1000.0)
|
||||
|
||||
// Memory
|
||||
memCapacity := node.Status.Capacity.Memory()
|
||||
memAllocatable := node.Status.Allocatable.Memory()
|
||||
nodeMetric.MemoryCapacity = formatBytes(memCapacity.Value())
|
||||
nodeMetric.MemoryAllocatable = formatBytes(memAllocatable.Value())
|
||||
|
||||
// GPU (从 node allocatable 中查找)
|
||||
if gpu, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok {
|
||||
nodeMetric.GPUCapacity = int(gpu.Value())
|
||||
// 尝试获取 GPU 类型
|
||||
if gpuType, ok := node.Labels["nvidia.com/gpu.product"]; ok {
|
||||
nodeMetric.GPUType = gpuType
|
||||
}
|
||||
}
|
||||
|
||||
// 获取 Pod 数量
|
||||
pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{
|
||||
FieldSelector: fmt.Sprintf("spec.nodeName=%s", node.Name),
|
||||
})
|
||||
if err == nil {
|
||||
nodeMetric.PodCount = len(pods.Items)
|
||||
}
|
||||
|
||||
// 如果有 metrics client,获取实时使用情况
|
||||
if metricsClient != nil {
|
||||
nodeMetricData, err := metricsClient.MetricsV1beta1().NodeMetricses().Get(ctx, node.Name, metav1.GetOptions{})
|
||||
if err == nil {
|
||||
// CPU 使用
|
||||
cpuUsage := nodeMetricData.Usage.Cpu()
|
||||
nodeMetric.CPUUsage = fmt.Sprintf("%.2f cores", float64(cpuUsage.MilliValue())/1000.0)
|
||||
if cpuAllocatable.MilliValue() > 0 {
|
||||
nodeMetric.CPUPercent = float64(cpuUsage.MilliValue()) / float64(cpuAllocatable.MilliValue()) * 100
|
||||
}
|
||||
|
||||
// Memory 使用
|
||||
memUsage := nodeMetricData.Usage.Memory()
|
||||
nodeMetric.MemoryUsage = formatBytes(memUsage.Value())
|
||||
if memAllocatable.Value() > 0 {
|
||||
nodeMetric.MemoryPercent = float64(memUsage.Value()) / float64(memAllocatable.Value()) * 100
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result = append(result, nodeMetric)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// aggregateClusterMetrics 聚合集群级别指标
|
||||
func (c *MetricsClient) aggregateClusterMetrics(
|
||||
cluster *entity.Cluster,
|
||||
nodes []corev1.Node,
|
||||
pods []corev1.Pod,
|
||||
nodeMetrics []*entity.NodeMetrics,
|
||||
) *entity.ClusterMetrics {
|
||||
metrics := &entity.ClusterMetrics{
|
||||
ClusterID: cluster.ID,
|
||||
ClusterName: cluster.Name,
|
||||
Status: "healthy",
|
||||
NodeCount: len(nodes),
|
||||
PodCount: len(pods),
|
||||
LastCheck: time.Now(),
|
||||
Nodes: make([]entity.NodeMetrics, 0),
|
||||
}
|
||||
|
||||
// 汇总资源
|
||||
var totalCPU, totalMem, usedCPU, usedMem int64
|
||||
var totalGPU, usedGPU int
|
||||
healthyNodes := 0
|
||||
|
||||
// 单机最大值
|
||||
var maxNodeCPU, maxNodeMem int64
|
||||
var maxNodeGPU int
|
||||
var maxNodeCPUUsage, maxNodeMemUsage, maxNodeGPUUsage float64
|
||||
|
||||
for i, node := range nodes {
|
||||
// CPU
|
||||
cpuCap := node.Status.Capacity.Cpu()
|
||||
totalCPU += cpuCap.MilliValue()
|
||||
if cpuCap.MilliValue() > maxNodeCPU {
|
||||
maxNodeCPU = cpuCap.MilliValue()
|
||||
}
|
||||
|
||||
// Memory
|
||||
memCap := node.Status.Capacity.Memory()
|
||||
totalMem += memCap.Value()
|
||||
if memCap.Value() > maxNodeMem {
|
||||
maxNodeMem = memCap.Value()
|
||||
}
|
||||
|
||||
// GPU
|
||||
if gpu, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok {
|
||||
gpuCount := int(gpu.Value())
|
||||
totalGPU += gpuCount
|
||||
if gpuCount > maxNodeGPU {
|
||||
maxNodeGPU = gpuCount
|
||||
}
|
||||
}
|
||||
|
||||
// Node status
|
||||
if getNodeStatus(&node) == "Ready" {
|
||||
healthyNodes++
|
||||
}
|
||||
|
||||
// 从 nodeMetrics 获取使用情况
|
||||
if i < len(nodeMetrics) && nodeMetrics[i] != nil {
|
||||
metrics.Nodes = append(metrics.Nodes, *nodeMetrics[i])
|
||||
|
||||
// 更新单机最大使用率
|
||||
if nodeMetrics[i].CPUPercent > maxNodeCPUUsage {
|
||||
maxNodeCPUUsage = nodeMetrics[i].CPUPercent
|
||||
}
|
||||
if nodeMetrics[i].MemoryPercent > maxNodeMemUsage {
|
||||
maxNodeMemUsage = nodeMetrics[i].MemoryPercent
|
||||
}
|
||||
if nodeMetrics[i].GPUPercent > maxNodeGPUUsage {
|
||||
maxNodeGPUUsage = nodeMetrics[i].GPUPercent
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 计算集群 uptime(简化:使用最老节点的年龄)
|
||||
if len(nodes) > 0 {
|
||||
metrics.Uptime = getNodeAge(&nodes[0])
|
||||
}
|
||||
|
||||
// 格式化总资源
|
||||
metrics.TotalCPU = fmt.Sprintf("%.2f cores", float64(totalCPU)/1000.0)
|
||||
metrics.TotalMemory = formatBytes(totalMem)
|
||||
metrics.TotalGPU = totalGPU
|
||||
|
||||
// 格式化单机最大值
|
||||
metrics.MaxNodeCPU = fmt.Sprintf("%.2f cores", float64(maxNodeCPU)/1000.0)
|
||||
metrics.MaxNodeMemory = formatBytes(maxNodeMem)
|
||||
metrics.MaxNodeGPU = maxNodeGPU
|
||||
metrics.MaxNodeCPUUsage = maxNodeCPUUsage
|
||||
metrics.MaxNodeMemUsage = maxNodeMemUsage
|
||||
metrics.MaxNodeGPUUsage = maxNodeGPUUsage
|
||||
|
||||
// 使用情况(简化处理)
|
||||
if len(nodeMetrics) > 0 {
|
||||
for _, nm := range nodeMetrics {
|
||||
// 解析使用的 CPU 和内存
|
||||
// 这里简化处理,实际应该解析字符串
|
||||
usedCPU += int64(nm.CPUPercent * float64(totalCPU) / 100.0)
|
||||
usedMem += int64(nm.MemoryPercent * float64(totalMem) / 100.0)
|
||||
usedGPU += nm.GPUUsage
|
||||
}
|
||||
|
||||
if totalCPU > 0 {
|
||||
metrics.CPUUsage = float64(usedCPU) / float64(totalCPU) * 100
|
||||
}
|
||||
if totalMem > 0 {
|
||||
metrics.MemoryUsage = float64(usedMem) / float64(totalMem) * 100
|
||||
}
|
||||
if totalGPU > 0 {
|
||||
metrics.GPUUsage = float64(usedGPU) / float64(totalGPU) * 100
|
||||
}
|
||||
|
||||
metrics.UsedCPU = fmt.Sprintf("%.2f cores", float64(usedCPU)/1000.0)
|
||||
metrics.UsedMemory = formatBytes(usedMem)
|
||||
metrics.UsedGPU = usedGPU
|
||||
}
|
||||
|
||||
// 确定集群状态
|
||||
if healthyNodes == len(nodes) {
|
||||
metrics.Status = "healthy"
|
||||
} else if healthyNodes > 0 {
|
||||
metrics.Status = "warning"
|
||||
} else {
|
||||
metrics.Status = "error"
|
||||
}
|
||||
|
||||
return metrics
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
|
||||
func getNodeStatus(node *corev1.Node) string {
|
||||
for _, condition := range node.Status.Conditions {
|
||||
if condition.Type == corev1.NodeReady {
|
||||
if condition.Status == corev1.ConditionTrue {
|
||||
return "Ready"
|
||||
}
|
||||
return "NotReady"
|
||||
}
|
||||
}
|
||||
return "Unknown"
|
||||
}
|
||||
|
||||
func getNodeRole(node *corev1.Node) string {
|
||||
if _, ok := node.Labels["node-role.kubernetes.io/control-plane"]; ok {
|
||||
return "control-plane"
|
||||
}
|
||||
if _, ok := node.Labels["node-role.kubernetes.io/master"]; ok {
|
||||
return "control-plane"
|
||||
}
|
||||
return "worker"
|
||||
}
|
||||
|
||||
func getNodeAge(node *corev1.Node) string {
|
||||
age := time.Since(node.CreationTimestamp.Time)
|
||||
days := int(age.Hours() / 24)
|
||||
hours := int(age.Hours()) % 24
|
||||
|
||||
if days > 0 {
|
||||
return fmt.Sprintf("%dd %dh", days, hours)
|
||||
}
|
||||
return fmt.Sprintf("%dh", hours)
|
||||
}
|
||||
|
||||
// formatBytes renders a byte count using binary (1024-based) units. Values
// below 1024 are printed as "<n> B"; larger values are printed with one
// decimal place and a KiB, MiB, GiB, TiB, PiB or EiB suffix.
func formatBytes(bytes int64) string {
	const unit = 1024
	const prefixes = "KMGTPE"

	if bytes < unit {
		return fmt.Sprintf("%d B", bytes)
	}

	// Find the largest power of 1024 that does not exceed bytes.
	divisor := int64(unit)
	idx := 0
	remaining := bytes / unit
	for remaining >= unit {
		divisor *= unit
		idx++
		remaining /= unit
	}

	return fmt.Sprintf("%.1f %ciB", float64(bytes)/float64(divisor), prefixes[idx])
}
|
||||
|
||||
Reference in New Issue
Block a user