Files
ocdp-go/backend/internal/adapter/output/k8s/diagnostics_client.go
Ivan087 7d9545f827 feat: fix YAML field conversion, admin namespace, streaming logs, and vllm-serve deploy
- Fix Axios keysToSnake converting user values map keys (gpuMem -> gpu_mem)
  - Add skipRecurseKeys to keysToSnake for values/valuesYaml fields
  - Add values_yaml alt json tag and Normalize() in DTOs
  - Check both camelCase/snake_case in enforceNamespaceValues
  - Read both tailLines/tail_lines query param for diagnostics
- Admin users can freely choose namespace in LaunchModal (free-text input)
  - Block only kube-system/kube-public/kube-node-lease for admin
  - Regular users keep existing namespace restrictions
- Add SSE streaming pod logs endpoint (backend + frontend)
  - New PodLogStreamer interface and K8s Follow:true implementation
  - SSE handler with text/event-stream output
  - Frontend DiagnosticsModal: Stream button, auto-scroll, live indicator
- Remove per-card Refresh button from InstanceCard (redundant with page refresh)
- Deploy bge-m3 on vllm-serve 0.6.0 (gpuMem=10000, status=deployed)
2026-05-12 16:50:25 +08:00

375 lines
11 KiB
Go

package k8s
import (
"bufio"
"context"
"fmt"
"io"
"sort"
"strings"
"time"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"github.com/ocdp/cluster-service/internal/domain/entity"
"github.com/ocdp/cluster-service/internal/domain/repository"
)
type DiagnosticsClient struct{}
func NewDiagnosticsClient() repository.InstanceDiagnosticsClient {
return &DiagnosticsClient{}
}
type MockDiagnosticsClient struct{}
func NewMockDiagnosticsClient() repository.InstanceDiagnosticsClient {
return &MockDiagnosticsClient{}
}
func (*MockDiagnosticsClient) GetDiagnostics(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance, tailLines int64) (*entity.InstanceDiagnostics, error) {
return &entity.InstanceDiagnostics{
InstanceName: instance.Name,
Namespace: instance.Namespace,
CollectedAt: time.Now(),
}, nil
}
func (*MockDiagnosticsClient) StreamPodLogs(ctx context.Context, cluster *entity.Cluster, namespace, podName, containerName string, tailLines int64) (<-chan string, <-chan error, error) {
lines := make(chan string, 10)
errs := make(chan error, 1)
go func() {
defer close(lines)
defer close(errs)
select {
case <-ctx.Done():
return
case lines <- "[mock] Streaming pod logs...":
case lines <- "[mock] Container started successfully":
case lines <- "[mock] Listening on :8080":
}
}()
return lines, errs, nil
}
func (c *DiagnosticsClient) GetDiagnostics(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance, tailLines int64) (*entity.InstanceDiagnostics, error) {
clientset, err := diagnosticsClientset(cluster)
if err != nil {
return nil, err
}
if tailLines <= 0 {
tailLines = 200
}
if tailLines > 2000 {
tailLines = 2000
}
pods, err := listInstancePods(ctx, clientset, instance)
if err != nil {
return nil, err
}
services, err := listInstanceServices(ctx, clientset, instance)
if err != nil {
return nil, err
}
events, err := listInstanceEvents(ctx, clientset, instance, pods, services)
if err != nil {
return nil, err
}
logs := collectPodLogs(ctx, clientset, pods, tailLines)
return &entity.InstanceDiagnostics{
InstanceName: instance.Name,
Namespace: instance.Namespace,
Pods: convertPodsToDiagnostics(pods),
Services: convertServicesToDiagnostics(services),
Events: convertEventsToDiagnostics(events),
Logs: logs,
CollectedAt: time.Now(),
}, nil
}
func (c *DiagnosticsClient) StreamPodLogs(ctx context.Context, cluster *entity.Cluster, namespace, podName, containerName string, tailLines int64) (<-chan string, <-chan error, error) {
clientset, err := diagnosticsClientset(cluster)
if err != nil {
return nil, nil, err
}
if tailLines <= 0 {
tailLines = 200
}
if tailLines > 2000 {
tailLines = 2000
}
req := clientset.CoreV1().Pods(namespace).GetLogs(podName, &corev1.PodLogOptions{
Container: containerName,
Follow: true,
TailLines: &tailLines,
})
stream, err := req.Stream(ctx)
if err != nil {
return nil, nil, fmt.Errorf("failed to open log stream for %s/%s: %w", podName, containerName, err)
}
lines := make(chan string, 64)
errs := make(chan error, 1)
go func() {
defer close(lines)
defer close(errs)
defer func() { _ = stream.Close() }()
scanner := bufio.NewScanner(stream)
// Allow long lines; Kubernetes log entries can exceed the default 64 KiB
scanner.Buffer(make([]byte, 0, 64*1024), 2*1024*1024)
for scanner.Scan() {
select {
case <-ctx.Done():
return
default:
}
line := scanner.Text()
if line == "" {
continue
}
select {
case lines <- line:
case <-ctx.Done():
return
}
}
if err := scanner.Err(); err != nil {
select {
case errs <- err:
case <-ctx.Done():
}
}
}()
return lines, errs, nil
}
func diagnosticsClientset(cluster *entity.Cluster) (kubernetes.Interface, error) {
config, err := restConfigFromCluster(cluster)
if err != nil {
return nil, err
}
clientset, err := kubernetes.NewForConfig(config)
if err != nil {
return nil, fmt.Errorf("failed to create diagnostics kubernetes client: %w", err)
}
return clientset, nil
}
func listInstancePods(ctx context.Context, clientset kubernetes.Interface, instance *entity.Instance) ([]corev1.Pod, error) {
selector := fmt.Sprintf("app.kubernetes.io/instance=%s", instance.Name)
pods, err := clientset.CoreV1().Pods(instance.Namespace).List(ctx, metav1.ListOptions{LabelSelector: selector})
if err != nil {
return nil, fmt.Errorf("failed to list instance pods: %w", err)
}
if len(pods.Items) > 0 {
return pods.Items, nil
}
all, err := clientset.CoreV1().Pods(instance.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return nil, fmt.Errorf("failed to list namespace pods: %w", err)
}
filtered := make([]corev1.Pod, 0)
for _, pod := range all.Items {
if resourceMatchesInstance(pod.ObjectMeta, instance) {
filtered = append(filtered, pod)
}
}
return filtered, nil
}
func listInstanceServices(ctx context.Context, clientset kubernetes.Interface, instance *entity.Instance) ([]corev1.Service, error) {
selector := fmt.Sprintf("app.kubernetes.io/instance=%s", instance.Name)
services, err := clientset.CoreV1().Services(instance.Namespace).List(ctx, metav1.ListOptions{LabelSelector: selector})
if err != nil {
return nil, fmt.Errorf("failed to list instance services: %w", err)
}
if len(services.Items) > 0 {
return services.Items, nil
}
all, err := clientset.CoreV1().Services(instance.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return nil, fmt.Errorf("failed to list namespace services: %w", err)
}
filtered := make([]corev1.Service, 0)
for _, svc := range all.Items {
if resourceMatchesInstance(svc.ObjectMeta, instance) {
filtered = append(filtered, svc)
}
}
return filtered, nil
}
func listInstanceEvents(ctx context.Context, clientset kubernetes.Interface, instance *entity.Instance, pods []corev1.Pod, services []corev1.Service) ([]corev1.Event, error) {
events, err := clientset.CoreV1().Events(instance.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return nil, fmt.Errorf("failed to list instance events: %w", err)
}
names := map[string]bool{instance.Name: true}
for _, pod := range pods {
names[pod.Name] = true
}
for _, svc := range services {
names[svc.Name] = true
}
filtered := make([]corev1.Event, 0)
for _, event := range events.Items {
if names[event.InvolvedObject.Name] || strings.Contains(event.Message, instance.Name) {
filtered = append(filtered, event)
}
}
sort.SliceStable(filtered, func(i, j int) bool {
return filtered[i].LastTimestamp.Time.After(filtered[j].LastTimestamp.Time)
})
if len(filtered) > 100 {
filtered = filtered[:100]
}
return filtered, nil
}
func collectPodLogs(ctx context.Context, clientset kubernetes.Interface, pods []corev1.Pod, tailLines int64) []entity.InstancePodLog {
logs := make([]entity.InstancePodLog, 0)
for _, pod := range pods {
for _, container := range pod.Spec.Containers {
item := entity.InstancePodLog{Pod: pod.Name, Container: container.Name, TailLines: tailLines}
req := clientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &corev1.PodLogOptions{
Container: container.Name,
TailLines: &tailLines,
})
stream, err := req.Stream(ctx)
if err != nil {
item.Error = err.Error()
logs = append(logs, item)
continue
}
data, err := io.ReadAll(io.LimitReader(stream, 1<<20))
_ = stream.Close()
if err != nil {
item.Error = err.Error()
} else {
item.Log = string(data)
}
logs = append(logs, item)
}
}
return logs
}
func convertPodsToDiagnostics(pods []corev1.Pod) []entity.InstancePodDiagnostics {
out := make([]entity.InstancePodDiagnostics, 0, len(pods))
for _, pod := range pods {
containers := make([]entity.InstanceContainerDiagnostics, 0, len(pod.Status.ContainerStatuses))
var restarts int32
for _, status := range pod.Status.ContainerStatuses {
restarts += status.RestartCount
containers = append(containers, entity.InstanceContainerDiagnostics{
Name: status.Name,
Image: status.Image,
Ready: status.Ready,
RestartCount: status.RestartCount,
State: containerStateName(status.State),
Reason: containerStateReason(status.State),
Message: containerStateMessage(status.State),
})
}
conditions := make([]entity.InstanceConditionDiagnostics, 0, len(pod.Status.Conditions))
for _, condition := range pod.Status.Conditions {
conditions = append(conditions, entity.InstanceConditionDiagnostics{
Type: string(condition.Type),
Status: string(condition.Status),
Reason: condition.Reason,
Message: condition.Message,
})
}
out = append(out, entity.InstancePodDiagnostics{
Name: pod.Name,
Namespace: pod.Namespace,
Phase: string(pod.Status.Phase),
NodeName: pod.Spec.NodeName,
PodIP: pod.Status.PodIP,
HostIP: pod.Status.HostIP,
RestartCount: restarts,
Containers: containers,
Conditions: conditions,
CreationTimestamp: pod.CreationTimestamp.Time,
})
}
return out
}
func convertServicesToDiagnostics(services []corev1.Service) []entity.InstanceServiceDiagnostics {
out := make([]entity.InstanceServiceDiagnostics, 0, len(services))
for _, svc := range services {
entry := convertServiceToEntry(&svc)
out = append(out, entity.InstanceServiceDiagnostics{
Name: svc.Name,
Namespace: svc.Namespace,
Type: string(svc.Spec.Type),
ClusterIP: svc.Spec.ClusterIP,
Ports: entry.Ports,
})
}
return out
}
func convertEventsToDiagnostics(events []corev1.Event) []entity.InstanceEventDiagnostics {
out := make([]entity.InstanceEventDiagnostics, 0, len(events))
for _, event := range events {
out = append(out, entity.InstanceEventDiagnostics{
Type: event.Type,
Reason: event.Reason,
Message: event.Message,
InvolvedKind: event.InvolvedObject.Kind,
InvolvedName: event.InvolvedObject.Name,
Count: event.Count,
FirstTimestamp: event.FirstTimestamp.Time,
LastTimestamp: event.LastTimestamp.Time,
})
}
return out
}
func containerStateName(state corev1.ContainerState) string {
switch {
case state.Running != nil:
return "running"
case state.Waiting != nil:
return "waiting"
case state.Terminated != nil:
return "terminated"
default:
return "unknown"
}
}
func containerStateReason(state corev1.ContainerState) string {
switch {
case state.Waiting != nil:
return state.Waiting.Reason
case state.Terminated != nil:
return state.Terminated.Reason
default:
return ""
}
}
func containerStateMessage(state corev1.ContainerState) string {
switch {
case state.Waiting != nil:
return state.Waiting.Message
case state.Terminated != nil:
return state.Terminated.Message
default:
return ""
}
}