Files
ocdp-go/backend/internal/adapter/output/k8s/metrics_client.go
Ivan087 33ddaf97db fix: scale replicas in response, K8s metrics client, quota precheck, auth tests
- Add GetMetrics method to MetricsClient interface and implement cluster metrics API
- Add QuotaPrecheck service for validating resource quotas before deployment
- Add auth DTO with role/permission models and auth handler tests
- Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics
- Update workspace handler with GetWorkspace endpoint and shared-user list
- Fix monitoring handler to use correct service method name
- Add tail_lines fallback in instance handler for snake_case query params
- Update nginx config for SSE log streaming support (no buffering)
- Add comprehensive test coverage: auth_service_test, auth_handler_test,
  auth_dto_test, metrics_client_test, quota_precheck_test
- Update error messages for quota validation and instance operations
- ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit
- InstanceCard: correctly disable scale-minus when replicas <= 0
- SidebarLayout: add hover transition for sidebar items
- Update todo.md and lessons.md with latest fixes
2026-05-20 16:56:29 +08:00

509 lines
16 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package k8s
import (
"context"
"fmt"
"time"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
metricsv "k8s.io/metrics/pkg/client/clientset/versioned"
"github.com/ocdp/cluster-service/internal/domain/entity"
"github.com/ocdp/cluster-service/internal/domain/repository"
)
// MetricsClient 实现从 Kubernetes 集群获取监控指标
type MetricsClient struct {
clusterRepo repository.ClusterRepository
}
// NewMetricsClient 创建 MetricsClient
func NewMetricsClient(clusterRepo repository.ClusterRepository) *MetricsClient {
return &MetricsClient{
clusterRepo: clusterRepo,
}
}
// GetClusterMetrics 获取集群监控指标
func (c *MetricsClient) GetClusterMetrics(ctx context.Context, clusterID string) (*entity.ClusterMetrics, error) {
// 获取集群信息
cluster, err := c.clusterRepo.GetByID(ctx, clusterID)
if err != nil {
return nil, fmt.Errorf("failed to get cluster: %w", err)
}
// 创建 Kubernetes 客户端
clientset, metricsClient, err := c.createK8sClients(cluster)
if err != nil {
return nil, fmt.Errorf("failed to create k8s client: %w", err)
}
// 获取节点列表
nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
return nil, fmt.Errorf("failed to list nodes: %w", err)
}
// 获取所有 Pods
pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
if err != nil {
return nil, fmt.Errorf("failed to list pods: %w", err)
}
// 获取节点指标CPU/内存使用情况)
nodeMetrics, err := c.getNodeMetricsData(ctx, clientset, metricsClient, nodes.Items)
if err != nil {
// 如果无法获取 metrics记录错误但继续
fmt.Printf("Warning: failed to get node metrics: %v\n", err)
}
// 计算集群级别汇总
metrics := c.aggregateClusterMetrics(cluster, nodes.Items, pods.Items, nodeMetrics)
return metrics, nil
}
// GetNodeMetrics 获取集群节点指标
func (c *MetricsClient) GetNodeMetrics(ctx context.Context, clusterID string) ([]*entity.NodeMetrics, error) {
cluster, err := c.clusterRepo.GetByID(ctx, clusterID)
if err != nil {
return nil, fmt.Errorf("failed to get cluster: %w", err)
}
clientset, metricsClient, err := c.createK8sClients(cluster)
if err != nil {
return nil, fmt.Errorf("failed to create k8s client: %w", err)
}
nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
return nil, fmt.Errorf("failed to list nodes: %w", err)
}
return c.getNodeMetricsData(ctx, clientset, metricsClient, nodes.Items)
}
// GetPodResourceAllocations returns Kubernetes Pod requests/limits without
// inventing utilization values. GPU memory is treated as vendor integer MB.
func (c *MetricsClient) GetPodResourceAllocations(ctx context.Context, clusterID string) ([]*entity.PodResourceAllocation, error) {
cluster, err := c.clusterRepo.GetByID(ctx, clusterID)
if err != nil {
return nil, fmt.Errorf("failed to get cluster: %w", err)
}
clientset, _, err := c.createK8sClients(cluster)
if err != nil {
return nil, fmt.Errorf("failed to create k8s client: %w", err)
}
pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
if err != nil {
return nil, fmt.Errorf("failed to list pods: %w", err)
}
result := make([]*entity.PodResourceAllocation, 0, len(pods.Items))
for _, pod := range pods.Items {
result = append(result, &entity.PodResourceAllocation{
ClusterID: clusterID,
Namespace: pod.Namespace,
PodName: pod.Name,
InstanceName: inferHelmReleaseName(pod.Labels),
Allocation: podResourceAllocation(&pod),
})
}
return result, nil
}
// createK8sClients 创建 Kubernetes 客户端
func (c *MetricsClient) createK8sClients(cluster *entity.Cluster) (*kubernetes.Clientset, *metricsv.Clientset, error) {
config, err := clientcmd.RESTConfigFromKubeConfig([]byte(cluster.GetKubeConfig()))
if err != nil {
// 如果无法从 kubeconfig 创建,尝试使用集群配置
config = &rest.Config{
Host: cluster.Host,
TLSClientConfig: rest.TLSClientConfig{
CAData: []byte(cluster.CAData),
CertData: []byte(cluster.CertData),
KeyData: []byte(cluster.KeyData),
},
}
}
clientset, err := kubernetes.NewForConfig(config)
if err != nil {
return nil, nil, fmt.Errorf("failed to create clientset: %w", err)
}
metricsClient, err := metricsv.NewForConfig(config)
if err != nil {
// Metrics API 可能不可用,返回 nil 但不报错
return clientset, nil, nil
}
return clientset, metricsClient, nil
}
// getNodeMetricsData 获取节点详细指标
func (c *MetricsClient) getNodeMetricsData(
ctx context.Context,
clientset *kubernetes.Clientset,
metricsClient *metricsv.Clientset,
nodes []corev1.Node,
) ([]*entity.NodeMetrics, error) {
result := make([]*entity.NodeMetrics, 0, len(nodes))
for _, node := range nodes {
nodeMetric := &entity.NodeMetrics{
NodeName: node.Name,
Status: getNodeStatus(&node),
Role: getNodeRole(&node),
Age: getNodeAge(&node),
OSImage: node.Status.NodeInfo.OSImage,
KernelVersion: node.Status.NodeInfo.KernelVersion,
ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
KubeletVersion: node.Status.NodeInfo.KubeletVersion,
}
// CPU
cpuCapacity := node.Status.Capacity.Cpu()
cpuAllocatable := node.Status.Allocatable.Cpu()
nodeMetric.CPUCapacity = fmt.Sprintf("%.2f cores", float64(cpuCapacity.MilliValue())/1000.0)
nodeMetric.CPUAllocatable = fmt.Sprintf("%.2f cores", float64(cpuAllocatable.MilliValue())/1000.0)
// Memory
memCapacity := node.Status.Capacity.Memory()
memAllocatable := node.Status.Allocatable.Memory()
nodeMetric.MemoryCapacity = formatBytes(memCapacity.Value())
nodeMetric.MemoryAllocatable = formatBytes(memAllocatable.Value())
// GPU (从 node allocatable 中查找)
if gpu, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok {
nodeMetric.GPUCapacity = int(gpu.Value())
// 尝试获取 GPU 类型
if gpuType, ok := node.Labels["nvidia.com/gpu.product"]; ok {
nodeMetric.GPUType = gpuType
}
}
// 获取 Pod 数量
pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{
FieldSelector: fmt.Sprintf("spec.nodeName=%s", node.Name),
})
if err == nil {
nodeMetric.PodCount = len(pods.Items)
}
// 如果有 metrics client获取实时使用情况
if metricsClient != nil {
nodeMetricData, err := metricsClient.MetricsV1beta1().NodeMetricses().Get(ctx, node.Name, metav1.GetOptions{})
if err == nil {
// CPU 使用
cpuUsage := nodeMetricData.Usage.Cpu()
nodeMetric.CPUUsage = fmt.Sprintf("%.2f cores", float64(cpuUsage.MilliValue())/1000.0)
if cpuAllocatable.MilliValue() > 0 {
nodeMetric.CPUPercent = float64(cpuUsage.MilliValue()) / float64(cpuAllocatable.MilliValue()) * 100
}
// Memory 使用
memUsage := nodeMetricData.Usage.Memory()
nodeMetric.MemoryUsage = formatBytes(memUsage.Value())
if memAllocatable.Value() > 0 {
nodeMetric.MemoryPercent = float64(memUsage.Value()) / float64(memAllocatable.Value()) * 100
}
}
}
result = append(result, nodeMetric)
}
return result, nil
}
// aggregateClusterMetrics 聚合集群级别指标
func (c *MetricsClient) aggregateClusterMetrics(
cluster *entity.Cluster,
nodes []corev1.Node,
pods []corev1.Pod,
nodeMetrics []*entity.NodeMetrics,
) *entity.ClusterMetrics {
metrics := &entity.ClusterMetrics{
ClusterID: cluster.ID,
ClusterName: cluster.Name,
Status: "healthy",
NodeCount: len(nodes),
PodCount: len(pods),
LastCheck: time.Now(),
Nodes: make([]entity.NodeMetrics, 0),
}
// 汇总资源
var totalCPU, totalMem, usedCPU, usedMem int64
var totalGPU, usedGPU int
healthyNodes := 0
// 单机最大值
var maxNodeCPU, maxNodeMem int64
var maxNodeGPU int
var maxNodeCPUUsage, maxNodeMemUsage, maxNodeGPUUsage float64
for i, node := range nodes {
// CPU
cpuCap := node.Status.Capacity.Cpu()
totalCPU += cpuCap.MilliValue()
if cpuCap.MilliValue() > maxNodeCPU {
maxNodeCPU = cpuCap.MilliValue()
}
// Memory
memCap := node.Status.Capacity.Memory()
totalMem += memCap.Value()
if memCap.Value() > maxNodeMem {
maxNodeMem = memCap.Value()
}
// GPU
if gpu, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok {
gpuCount := int(gpu.Value())
totalGPU += gpuCount
if gpuCount > maxNodeGPU {
maxNodeGPU = gpuCount
}
}
// Node status
if getNodeStatus(&node) == "Ready" {
healthyNodes++
}
// 从 nodeMetrics 获取使用情况
if i < len(nodeMetrics) && nodeMetrics[i] != nil {
metrics.Nodes = append(metrics.Nodes, *nodeMetrics[i])
// 更新单机最大使用率
if nodeMetrics[i].CPUPercent > maxNodeCPUUsage {
maxNodeCPUUsage = nodeMetrics[i].CPUPercent
}
if nodeMetrics[i].MemoryPercent > maxNodeMemUsage {
maxNodeMemUsage = nodeMetrics[i].MemoryPercent
}
if nodeMetrics[i].GPUPercent > maxNodeGPUUsage {
maxNodeGPUUsage = nodeMetrics[i].GPUPercent
}
}
}
// 计算集群 uptime简化使用最老节点的年龄
if len(nodes) > 0 {
metrics.Uptime = getNodeAge(&nodes[0])
}
// 格式化总资源
metrics.TotalCPU = fmt.Sprintf("%.2f cores", float64(totalCPU)/1000.0)
metrics.TotalMemory = formatBytes(totalMem)
metrics.TotalGPU = totalGPU
// 格式化单机最大值
metrics.MaxNodeCPU = fmt.Sprintf("%.2f cores", float64(maxNodeCPU)/1000.0)
metrics.MaxNodeMemory = formatBytes(maxNodeMem)
metrics.MaxNodeGPU = maxNodeGPU
metrics.MaxNodeCPUUsage = maxNodeCPUUsage
metrics.MaxNodeMemUsage = maxNodeMemUsage
metrics.MaxNodeGPUUsage = maxNodeGPUUsage
// 使用情况(简化处理)
if len(nodeMetrics) > 0 {
for _, nm := range nodeMetrics {
// 解析使用的 CPU 和内存
// 这里简化处理,实际应该解析字符串
usedCPU += int64(nm.CPUPercent * float64(totalCPU) / 100.0)
usedMem += int64(nm.MemoryPercent * float64(totalMem) / 100.0)
usedGPU += nm.GPUUsage
}
if totalCPU > 0 {
metrics.CPUUsage = float64(usedCPU) / float64(totalCPU) * 100
}
if totalMem > 0 {
metrics.MemoryUsage = float64(usedMem) / float64(totalMem) * 100
}
if totalGPU > 0 {
metrics.GPUUsage = float64(usedGPU) / float64(totalGPU) * 100
}
metrics.UsedCPU = fmt.Sprintf("%.2f cores", float64(usedCPU)/1000.0)
metrics.UsedMemory = formatBytes(usedMem)
metrics.UsedGPU = usedGPU
}
// 确定集群状态
if healthyNodes == len(nodes) {
metrics.Status = "healthy"
} else if healthyNodes > 0 {
metrics.Status = "warning"
} else {
metrics.Status = "error"
}
return metrics
}
// Helper functions
func getNodeStatus(node *corev1.Node) string {
for _, condition := range node.Status.Conditions {
if condition.Type == corev1.NodeReady {
if condition.Status == corev1.ConditionTrue {
return "Ready"
}
return "NotReady"
}
}
return "Unknown"
}
func getNodeRole(node *corev1.Node) string {
if _, ok := node.Labels["node-role.kubernetes.io/control-plane"]; ok {
return "control-plane"
}
if _, ok := node.Labels["node-role.kubernetes.io/master"]; ok {
return "control-plane"
}
return "worker"
}
func getNodeAge(node *corev1.Node) string {
age := time.Since(node.CreationTimestamp.Time)
days := int(age.Hours() / 24)
hours := int(age.Hours()) % 24
if days > 0 {
return fmt.Sprintf("%dd %dh", days, hours)
}
return fmt.Sprintf("%dh", hours)
}
func formatBytes(bytes int64) string {
const unit = 1024
if bytes < unit {
return fmt.Sprintf("%d B", bytes)
}
div, exp := int64(unit), 0
for n := bytes / unit; n >= unit; n /= unit {
div *= unit
exp++
}
return fmt.Sprintf("%.1f %ciB", float64(bytes)/float64(div), "KMGTPE"[exp])
}
func inferHelmReleaseName(labels map[string]string) string {
if labels == nil {
return ""
}
for _, key := range []string{
"app.kubernetes.io/instance",
"release",
"helm.sh/release",
"meta.helm.sh/release-name",
"app",
} {
if value := labels[key]; value != "" {
return value
}
}
return ""
}
func podResourceAllocation(pod *corev1.Pod) entity.ResourceAllocation {
if pod == nil {
return entity.ResourceAllocation{}
}
sum := entity.ResourceAllocation{}
for _, container := range pod.Spec.Containers {
sum = addContainerAllocation(sum, container)
}
initMax := entity.ResourceAllocation{}
for _, container := range pod.Spec.InitContainers {
initMax = maxAllocation(initMax, containerAllocation(container))
}
return maxAllocation(sum, initMax)
}
func addContainerAllocation(base entity.ResourceAllocation, container corev1.Container) entity.ResourceAllocation {
return addAllocation(base, containerAllocation(container))
}
func containerAllocation(container corev1.Container) entity.ResourceAllocation {
requests := container.Resources.Requests
limits := container.Resources.Limits
return entity.ResourceAllocation{
CPURequestsMilli: quantityMilliValue(requests, corev1.ResourceCPU),
CPULimitsMilli: quantityMilliValue(limits, corev1.ResourceCPU),
MemoryRequestsBytes: quantityValue(requests, corev1.ResourceMemory),
MemoryLimitsBytes: quantityValue(limits, corev1.ResourceMemory),
GPURequests: quantityValue(requests, corev1.ResourceName("nvidia.com/gpu")),
GPULimits: quantityValue(limits, corev1.ResourceName("nvidia.com/gpu")),
GPUMemoryRequestsMB: quantityValueAny(requests, corev1.ResourceName("nvidia.com/gpumem"), corev1.ResourceName("requests.nvidia.com/gpumem")),
GPUMemoryLimitsMB: quantityValueAny(limits, corev1.ResourceName("nvidia.com/gpumem"), corev1.ResourceName("requests.nvidia.com/gpumem")),
}
}
func addAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
return entity.ResourceAllocation{
CPURequestsMilli: left.CPURequestsMilli + right.CPURequestsMilli,
CPULimitsMilli: left.CPULimitsMilli + right.CPULimitsMilli,
MemoryRequestsBytes: left.MemoryRequestsBytes + right.MemoryRequestsBytes,
MemoryLimitsBytes: left.MemoryLimitsBytes + right.MemoryLimitsBytes,
GPURequests: left.GPURequests + right.GPURequests,
GPULimits: left.GPULimits + right.GPULimits,
GPUMemoryRequestsMB: left.GPUMemoryRequestsMB + right.GPUMemoryRequestsMB,
GPUMemoryLimitsMB: left.GPUMemoryLimitsMB + right.GPUMemoryLimitsMB,
}
}
func maxAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
return entity.ResourceAllocation{
CPURequestsMilli: maxInt64(left.CPURequestsMilli, right.CPURequestsMilli),
CPULimitsMilli: maxInt64(left.CPULimitsMilli, right.CPULimitsMilli),
MemoryRequestsBytes: maxInt64(left.MemoryRequestsBytes, right.MemoryRequestsBytes),
MemoryLimitsBytes: maxInt64(left.MemoryLimitsBytes, right.MemoryLimitsBytes),
GPURequests: maxInt64(left.GPURequests, right.GPURequests),
GPULimits: maxInt64(left.GPULimits, right.GPULimits),
GPUMemoryRequestsMB: maxInt64(left.GPUMemoryRequestsMB, right.GPUMemoryRequestsMB),
GPUMemoryLimitsMB: maxInt64(left.GPUMemoryLimitsMB, right.GPUMemoryLimitsMB),
}
}
func quantityMilliValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
if quantity, ok := resources[name]; ok {
return quantity.MilliValue()
}
return 0
}
func quantityValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
if quantity, ok := resources[name]; ok {
return quantity.Value()
}
return 0
}
func quantityValueAny(resources corev1.ResourceList, names ...corev1.ResourceName) int64 {
for _, name := range names {
if quantity, ok := resources[name]; ok {
return quantity.Value()
}
}
return 0
}
func maxInt64(left, right int64) int64 {
if left > right {
return left
}
return right
}