fix: scale replicas in response, K8s metrics client, quota precheck, auth tests

- Add GetMetrics method to MetricsClient interface and implement cluster metrics API
- Add QuotaPrecheck service for validating resource quotas before deployment
- Add auth DTO with role/permission models and auth handler tests
- Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics
- Update workspace handler with GetWorkspace endpoint and shared-user list
- Fix monitoring handler to use correct service method name
- Add tail_lines fallback in instance handler for snake_case query params
- Update nginx config for SSE log streaming support (no buffering)
- Add comprehensive test coverage: auth_service_test, auth_handler_test,
  auth_dto_test, metrics_client_test, quota_precheck_test
- Update error messages for quota validation and instance operations
- ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit
- InstanceCard: correctly disable scale-minus when replicas <= 0
- SidebarLayout: add hover transition for sidebar items
- Update todo.md and lessons.md with latest fixes
This commit is contained in:
Ivan087
2026-05-20 16:56:29 +08:00
parent 8f90cf0f0d
commit 33ddaf97db
59 changed files with 4805 additions and 457 deletions

View File

@ -1,19 +1,47 @@
package dto
import "strings"
// RegisterRequest 用户注册请求
type RegisterRequest struct {
Username string `json:"username" binding:"required"`
Password string `json:"password" binding:"required,min=6"`
Role string `json:"role,omitempty"`
WorkspaceID string `json:"workspaceId,omitempty"`
Namespace string `json:"namespace,omitempty"`
DefaultClusterID string `json:"defaultClusterId,omitempty"`
QuotaCPU string `json:"quotaCpu,omitempty"`
QuotaMemory string `json:"quotaMemory,omitempty"`
QuotaGPU string `json:"quotaGpu,omitempty"`
QuotaGPUMem string `json:"quotaGpuMemory,omitempty"`
IsActive *bool `json:"isActive,omitempty"`
MustChangePassword *bool `json:"mustChangePassword,omitempty"`
Username string `json:"username" binding:"required"`
Password string `json:"password" binding:"required,min=6"`
Role string `json:"role,omitempty"`
WorkspaceID string `json:"workspaceId,omitempty"`
WorkspaceIDSnake string `json:"workspace_id,omitempty"`
Namespace string `json:"namespace,omitempty"`
DefaultClusterID string `json:"defaultClusterId,omitempty"`
DefaultClusterIDSnake string `json:"default_cluster_id,omitempty"`
QuotaCPU string `json:"quotaCpu,omitempty"`
QuotaCPUSnake string `json:"quota_cpu,omitempty"`
QuotaMemory string `json:"quotaMemory,omitempty"`
QuotaMemorySnake string `json:"quota_memory,omitempty"`
QuotaGPU string `json:"quotaGpu,omitempty"`
QuotaGPUSnake string `json:"quota_gpu,omitempty"`
QuotaGPUMem string `json:"quotaGpuMemory,omitempty"`
QuotaGPUMemSnake string `json:"quota_gpu_memory,omitempty"`
IsActive *bool `json:"isActive,omitempty"`
IsActiveSnake *bool `json:"is_active,omitempty"`
MustChangePassword *bool `json:"mustChangePassword,omitempty"`
MustChangePasswordSnake *bool `json:"must_change_password,omitempty"`
}
func (r *RegisterRequest) Normalize() {
if r == nil {
return
}
r.WorkspaceID = firstNonBlank(r.WorkspaceID, r.WorkspaceIDSnake)
r.DefaultClusterID = firstNonBlank(r.DefaultClusterID, r.DefaultClusterIDSnake)
r.QuotaCPU = firstNonBlank(r.QuotaCPU, r.QuotaCPUSnake)
r.QuotaMemory = firstNonBlank(r.QuotaMemory, r.QuotaMemorySnake)
r.QuotaGPU = firstNonBlank(r.QuotaGPU, r.QuotaGPUSnake)
r.QuotaGPUMem = firstNonBlank(r.QuotaGPUMem, r.QuotaGPUMemSnake)
if r.IsActive == nil {
r.IsActive = r.IsActiveSnake
}
if r.MustChangePassword == nil {
r.MustChangePassword = r.MustChangePasswordSnake
}
}
// LoginRequest 用户登录请求
@ -68,14 +96,47 @@ type UserResponse struct {
// UpdateUserRequest 管理员更新用户状态/角色请求
type UpdateUserRequest struct {
Role string `json:"role,omitempty"`
WorkspaceID string `json:"workspaceId,omitempty"`
Namespace string `json:"namespace,omitempty"`
DefaultClusterID string `json:"defaultClusterId,omitempty"`
QuotaCPU string `json:"quotaCpu,omitempty"`
QuotaMemory string `json:"quotaMemory,omitempty"`
QuotaGPU string `json:"quotaGpu,omitempty"`
QuotaGPUMem string `json:"quotaGpuMemory,omitempty"`
IsActive *bool `json:"isActive,omitempty"`
MustChangePassword *bool `json:"mustChangePassword,omitempty"`
Role string `json:"role,omitempty"`
WorkspaceID string `json:"workspaceId,omitempty"`
WorkspaceIDSnake string `json:"workspace_id,omitempty"`
Namespace string `json:"namespace,omitempty"`
DefaultClusterID string `json:"defaultClusterId,omitempty"`
DefaultClusterIDSnake string `json:"default_cluster_id,omitempty"`
QuotaCPU string `json:"quotaCpu,omitempty"`
QuotaCPUSnake string `json:"quota_cpu,omitempty"`
QuotaMemory string `json:"quotaMemory,omitempty"`
QuotaMemorySnake string `json:"quota_memory,omitempty"`
QuotaGPU string `json:"quotaGpu,omitempty"`
QuotaGPUSnake string `json:"quota_gpu,omitempty"`
QuotaGPUMem string `json:"quotaGpuMemory,omitempty"`
QuotaGPUMemSnake string `json:"quota_gpu_memory,omitempty"`
IsActive *bool `json:"isActive,omitempty"`
IsActiveSnake *bool `json:"is_active,omitempty"`
MustChangePassword *bool `json:"mustChangePassword,omitempty"`
MustChangePasswordSnake *bool `json:"must_change_password,omitempty"`
}
func (r *UpdateUserRequest) Normalize() {
if r == nil {
return
}
r.WorkspaceID = firstNonBlank(r.WorkspaceID, r.WorkspaceIDSnake)
r.DefaultClusterID = firstNonBlank(r.DefaultClusterID, r.DefaultClusterIDSnake)
r.QuotaCPU = firstNonBlank(r.QuotaCPU, r.QuotaCPUSnake)
r.QuotaMemory = firstNonBlank(r.QuotaMemory, r.QuotaMemorySnake)
r.QuotaGPU = firstNonBlank(r.QuotaGPU, r.QuotaGPUSnake)
r.QuotaGPUMem = firstNonBlank(r.QuotaGPUMem, r.QuotaGPUMemSnake)
if r.IsActive == nil {
r.IsActive = r.IsActiveSnake
}
if r.MustChangePassword == nil {
r.MustChangePassword = r.MustChangePasswordSnake
}
}
func firstNonBlank(primary, alternate string) string {
if strings.TrimSpace(primary) != "" {
return primary
}
return alternate
}

View File

@ -0,0 +1,51 @@
package dto
import "testing"
func TestRegisterRequestNormalizeUsesSnakeCaseAlternates(t *testing.T) {
active := false
mustChange := true
req := RegisterRequest{
WorkspaceIDSnake: "workspace-1",
DefaultClusterIDSnake: "cluster-1",
QuotaCPUSnake: "2",
QuotaMemorySnake: "4Gi",
QuotaGPUSnake: "1",
QuotaGPUMemSnake: "10000",
IsActiveSnake: &active,
MustChangePasswordSnake: &mustChange,
}
req.Normalize()
if req.WorkspaceID != "workspace-1" || req.DefaultClusterID != "cluster-1" {
t.Fatalf("expected snake case workspace/cluster fields to normalize, got %#v", req)
}
if req.QuotaCPU != "2" || req.QuotaMemory != "4Gi" || req.QuotaGPU != "1" || req.QuotaGPUMem != "10000" {
t.Fatalf("expected snake case quota fields to normalize, got %#v", req)
}
if req.IsActive == nil || *req.IsActive {
t.Fatalf("expected is_active=false to normalize, got %#v", req.IsActive)
}
if req.MustChangePassword == nil || !*req.MustChangePassword {
t.Fatalf("expected must_change_password=true to normalize, got %#v", req.MustChangePassword)
}
}
func TestUpdateUserRequestNormalizeKeepsCamelCasePrimary(t *testing.T) {
req := UpdateUserRequest{
DefaultClusterID: "camel-cluster",
DefaultClusterIDSnake: "snake-cluster",
QuotaCPU: "3",
QuotaCPUSnake: "4",
}
req.Normalize()
if req.DefaultClusterID != "camel-cluster" {
t.Fatalf("expected camelCase defaultClusterId to win, got %q", req.DefaultClusterID)
}
if req.QuotaCPU != "3" {
t.Fatalf("expected camelCase quotaCpu to win, got %q", req.QuotaCPU)
}
}

View File

@ -2,25 +2,25 @@ package dto
// CreateInstanceRequest 创建实例请求
type CreateInstanceRequest struct {
Name string `json:"name" binding:"required"`
Namespace string `json:"namespace" binding:"required"`
RegistryID string `json:"registryId" binding:"required"`
RegistryIDAlt string `json:"registry_id"`
Repository string `json:"repository" binding:"required"`
Tag string `json:"tag" binding:"required"`
Description string `json:"description"`
Values map[string]interface{} `json:"values"`
ValuesYAML string `json:"valuesYaml"`
ValuesYAMLAlt string `json:"values_yaml"`
Name string `json:"name" binding:"required"`
Namespace string `json:"namespace" binding:"required"`
RegistryID string `json:"registryId" binding:"required"`
RegistryIDAlt string `json:"registry_id"`
Repository string `json:"repository" binding:"required"`
Tag string `json:"tag" binding:"required"`
Description string `json:"description"`
Values map[string]interface{} `json:"values"`
ValuesYAML string `json:"valuesYaml"`
ValuesYAMLAlt string `json:"values_yaml"`
}
// UpdateInstanceRequest 更新实例请求
type UpdateInstanceRequest struct {
Version string `json:"version"`
Description string `json:"description"`
Values map[string]interface{} `json:"values"`
ValuesYAML string `json:"valuesYaml"`
ValuesYAMLAlt string `json:"values_yaml"`
Version string `json:"version"`
Description string `json:"description"`
Values map[string]interface{} `json:"values"`
ValuesYAML string `json:"valuesYaml"`
ValuesYAMLAlt string `json:"values_yaml"`
}
// Normalize 将多种命名风格的字段合并到统一字段
@ -67,6 +67,7 @@ type InstanceResponse struct {
Status string `json:"status"`
WorkspaceID string `json:"workspaceId"`
OwnerID string `json:"ownerId"`
OwnerUsername string `json:"ownerUsername,omitempty"`
AllowedActions []string `json:"allowedActions,omitempty"`
StatusReason string `json:"statusReason,omitempty"`
LastOperation string `json:"lastOperation,omitempty"`

View File

@ -8,29 +8,56 @@ import (
// ClusterMetricsResponse 集群监控响应
type ClusterMetricsResponse struct {
ClusterID string `json:"clusterId"`
ClusterName string `json:"clusterName"`
Status string `json:"status"`
Uptime string `json:"uptime"`
NodeCount int `json:"nodeCount"`
PodCount int `json:"podCount"`
LastCheck time.Time `json:"lastCheck"`
TotalCPU string `json:"totalCpu"`
TotalMemory string `json:"totalMemory"`
TotalGPU int `json:"totalGpu"`
UsedCPU string `json:"usedCpu"`
UsedMemory string `json:"usedMemory"`
UsedGPU int `json:"usedGpu"`
CPUUsage float64 `json:"cpuUsage"`
MemoryUsage float64 `json:"memoryUsage"`
GPUUsage float64 `json:"gpuUsage"`
MaxNodeCPU string `json:"maxNodeCpu"`
MaxNodeMemory string `json:"maxNodeMemory"`
MaxNodeGPU int `json:"maxNodeGpu"`
MaxNodeCPUUsage float64 `json:"maxNodeCpuUsage"`
MaxNodeMemUsage float64 `json:"maxNodeMemUsage"`
MaxNodeGPUUsage float64 `json:"maxNodeGpuUsage"`
Nodes []NodeMetricsResponse `json:"nodes,omitempty"`
ClusterID string `json:"clusterId"`
ClusterName string `json:"clusterName"`
Status string `json:"status"`
Uptime string `json:"uptime"`
NodeCount int `json:"nodeCount"`
PodCount int `json:"podCount"`
LastCheck time.Time `json:"lastCheck"`
TotalCPU string `json:"totalCpu"`
TotalMemory string `json:"totalMemory"`
TotalGPU int `json:"totalGpu"`
UsedCPU string `json:"usedCpu"`
UsedMemory string `json:"usedMemory"`
UsedGPU int `json:"usedGpu"`
CPUUsage float64 `json:"cpuUsage"`
MemoryUsage float64 `json:"memoryUsage"`
GPUUsage float64 `json:"gpuUsage"`
CPURequests string `json:"cpuRequests,omitempty"`
CPULimits string `json:"cpuLimits,omitempty"`
MemoryRequests string `json:"memoryRequests,omitempty"`
MemoryLimits string `json:"memoryLimits,omitempty"`
GPURequests int64 `json:"gpuRequests,omitempty"`
GPULimits int64 `json:"gpuLimits,omitempty"`
GPUMemoryRequestsMB int64 `json:"gpuMemoryRequestsMb,omitempty"`
GPUMemoryLimitsMB int64 `json:"gpuMemoryLimitsMb,omitempty"`
AllocatedGPU int64 `json:"allocatedGpu,omitempty"`
AllocatedGPUMemoryMB int64 `json:"allocatedGpuMemoryMb,omitempty"`
ResourceUsageByUser []UserResourceUsageResponse `json:"resourceUsageByUser,omitempty"`
MaxNodeCPU string `json:"maxNodeCpu"`
MaxNodeMemory string `json:"maxNodeMemory"`
MaxNodeGPU int `json:"maxNodeGpu"`
MaxNodeCPUUsage float64 `json:"maxNodeCpuUsage"`
MaxNodeMemUsage float64 `json:"maxNodeMemUsage"`
MaxNodeGPUUsage float64 `json:"maxNodeGpuUsage"`
Nodes []NodeMetricsResponse `json:"nodes,omitempty"`
}
type UserResourceUsageResponse struct {
UserID string `json:"userId"`
Username string `json:"username"`
WorkspaceID string `json:"workspaceId"`
InstanceCount int `json:"instanceCount"`
PodCount int `json:"podCount"`
CPURequests string `json:"cpuRequests"`
CPULimits string `json:"cpuLimits"`
MemoryRequests string `json:"memoryRequests"`
MemoryLimits string `json:"memoryLimits"`
GPURequests int64 `json:"gpuRequests"`
GPULimits int64 `json:"gpuLimits"`
GPUMemoryRequestsMB int64 `json:"gpuMemoryRequestsMb"`
GPUMemoryLimitsMB int64 `json:"gpuMemoryLimitsMb"`
}
// NodeMetricsResponse 节点监控响应
@ -72,28 +99,59 @@ type MonitoringSummaryResponse struct {
// ToClusterMetricsResponse 转换为响应
func ToClusterMetricsResponse(m *entity.ClusterMetrics) *ClusterMetricsResponse {
resp := &ClusterMetricsResponse{
ClusterID: m.ClusterID,
ClusterName: m.ClusterName,
Status: m.Status,
Uptime: m.Uptime,
NodeCount: m.NodeCount,
PodCount: m.PodCount,
LastCheck: m.LastCheck,
TotalCPU: m.TotalCPU,
TotalMemory: m.TotalMemory,
TotalGPU: m.TotalGPU,
UsedCPU: m.UsedCPU,
UsedMemory: m.UsedMemory,
UsedGPU: m.UsedGPU,
CPUUsage: m.CPUUsage,
MemoryUsage: m.MemoryUsage,
GPUUsage: m.GPUUsage,
MaxNodeCPU: m.MaxNodeCPU,
MaxNodeMemory: m.MaxNodeMemory,
MaxNodeGPU: m.MaxNodeGPU,
MaxNodeCPUUsage: m.MaxNodeCPUUsage,
MaxNodeMemUsage: m.MaxNodeMemUsage,
MaxNodeGPUUsage: m.MaxNodeGPUUsage,
ClusterID: m.ClusterID,
ClusterName: m.ClusterName,
Status: m.Status,
Uptime: m.Uptime,
NodeCount: m.NodeCount,
PodCount: m.PodCount,
LastCheck: m.LastCheck,
TotalCPU: m.TotalCPU,
TotalMemory: m.TotalMemory,
TotalGPU: m.TotalGPU,
UsedCPU: m.UsedCPU,
UsedMemory: m.UsedMemory,
UsedGPU: m.UsedGPU,
CPUUsage: m.CPUUsage,
MemoryUsage: m.MemoryUsage,
GPUUsage: m.GPUUsage,
CPURequests: m.CPURequests,
CPULimits: m.CPULimits,
MemoryRequests: m.MemoryRequests,
MemoryLimits: m.MemoryLimits,
GPURequests: m.GPURequests,
GPULimits: m.GPULimits,
GPUMemoryRequestsMB: m.GPUMemoryRequestsMB,
GPUMemoryLimitsMB: m.GPUMemoryLimitsMB,
AllocatedGPU: m.AllocatedGPU,
AllocatedGPUMemoryMB: m.AllocatedGPUMemoryMB,
MaxNodeCPU: m.MaxNodeCPU,
MaxNodeMemory: m.MaxNodeMemory,
MaxNodeGPU: m.MaxNodeGPU,
MaxNodeCPUUsage: m.MaxNodeCPUUsage,
MaxNodeMemUsage: m.MaxNodeMemUsage,
MaxNodeGPUUsage: m.MaxNodeGPUUsage,
}
if len(m.ResourceUsageByUser) > 0 {
resp.ResourceUsageByUser = make([]UserResourceUsageResponse, len(m.ResourceUsageByUser))
for i, usage := range m.ResourceUsageByUser {
resp.ResourceUsageByUser[i] = UserResourceUsageResponse{
UserID: usage.UserID,
Username: usage.Username,
WorkspaceID: usage.WorkspaceID,
InstanceCount: usage.InstanceCount,
PodCount: usage.PodCount,
CPURequests: usage.CPURequests,
CPULimits: usage.CPULimits,
MemoryRequests: usage.MemoryRequests,
MemoryLimits: usage.MemoryLimits,
GPURequests: usage.GPURequests,
GPULimits: usage.GPULimits,
GPUMemoryRequestsMB: usage.GPUMemoryRequestsMB,
GPUMemoryLimitsMB: usage.GPUMemoryLimitsMB,
}
}
}
if len(m.Nodes) > 0 {

View File

@ -126,6 +126,25 @@ func (h *ArtifactHandler) ListArtifacts(w http.ResponseWriter, r *http.Request)
respondJSON(w, http.StatusOK, tagResponses)
}
// ListRepositoryTags is a compatibility alias for clients that request tags
// directly instead of the canonical artifacts endpoint.
func (h *ArtifactHandler) ListRepositoryTags(w http.ResponseWriter, r *http.Request) {
vars := mux.Vars(r)
if vars["registry_id"] == "" {
registryID := r.URL.Query().Get("registry_id")
if registryID == "" {
registryID = r.URL.Query().Get("registryId")
}
if registryID == "" {
respondError(w, http.StatusBadRequest, "Missing registry ID", "registry_id query parameter is required")
return
}
vars["registry_id"] = registryID
r = mux.SetURLVars(r, vars)
}
h.ListArtifacts(w, r)
}
// GetArtifact 获取 artifact 详情
// @Summary 获取 Artifact 详情
// @Description 获取指定 Artifact 的详细信息

View File

@ -3,8 +3,11 @@ package rest
import (
"context"
"encoding/json"
"net"
"net/http"
"strings"
"sync"
"time"
"github.com/gorilla/mux"
"github.com/ocdp/cluster-service/internal/adapter/input/http/dto"
@ -18,6 +21,74 @@ type AuthHandler struct {
authService *service.AuthService
}
const (
loginRateLimitWindow = time.Minute
loginRateLimitFailures = 5
)
var defaultLoginRateLimiter = newLoginRateLimiter(loginRateLimitWindow, loginRateLimitFailures)
type loginRateLimiter struct {
mu sync.Mutex
window time.Duration
limit int
failures map[string]loginFailureState
now func() time.Time
}
type loginFailureState struct {
count int
windowEnds time.Time
}
func newLoginRateLimiter(window time.Duration, limit int) *loginRateLimiter {
return &loginRateLimiter{
window: window,
limit: limit,
failures: make(map[string]loginFailureState),
now: time.Now,
}
}
func (l *loginRateLimiter) Allow(key string) bool {
if l == nil || key == "" {
return true
}
l.mu.Lock()
defer l.mu.Unlock()
state, ok := l.failures[key]
now := l.now()
if !ok || now.After(state.windowEnds) {
return true
}
return state.count < l.limit
}
func (l *loginRateLimiter) RecordFailure(key string) {
if l == nil || key == "" {
return
}
l.mu.Lock()
defer l.mu.Unlock()
now := l.now()
state, ok := l.failures[key]
if !ok || now.After(state.windowEnds) {
l.failures[key] = loginFailureState{count: 1, windowEnds: now.Add(l.window)}
return
}
state.count++
l.failures[key] = state
}
func (l *loginRateLimiter) Reset(key string) {
if l == nil || key == "" {
return
}
l.mu.Lock()
defer l.mu.Unlock()
delete(l.failures, key)
}
// NewAuthHandler 创建认证 Handler
func NewAuthHandler(authService *service.AuthService) *AuthHandler {
return &AuthHandler{
@ -41,6 +112,7 @@ func (h *AuthHandler) Register(w http.ResponseWriter, r *http.Request) {
respondError(w, http.StatusBadRequest, "Invalid request body", err.Error())
return
}
req.Normalize()
// 调用领域服务
user, err := h.authService.Register(r.Context(), req.Username, req.Password, req.Role, req.WorkspaceID, service.UserWorkspaceOptions{
@ -79,6 +151,7 @@ func (h *AuthHandler) UpdateUser(w http.ResponseWriter, r *http.Request) {
respondError(w, http.StatusBadRequest, "Invalid request body", err.Error())
return
}
req.Normalize()
user, err := h.authService.UpdateUser(r.Context(), userID, req.Role, req.WorkspaceID, service.UserWorkspaceOptions{
Namespace: req.Namespace,
DefaultClusterID: req.DefaultClusterID,
@ -120,12 +193,21 @@ func (h *AuthHandler) Login(w http.ResponseWriter, r *http.Request) {
return
}
rateLimitKey := loginRateLimitKey(r, req.Username)
if !defaultLoginRateLimiter.Allow(rateLimitKey) {
w.Header().Set("Retry-After", "60")
respondError(w, http.StatusTooManyRequests, "Too many login attempts", "too many login attempts; retry later")
return
}
// 调用领域服务
accessToken, refreshToken, user, err := h.authService.Login(r.Context(), req.Username, req.Password)
if err != nil {
respondError(w, http.StatusUnauthorized, "Login failed", err.Error())
defaultLoginRateLimiter.RecordFailure(rateLimitKey)
respondError(w, http.StatusUnauthorized, "Invalid username or password", "invalid username or password")
return
}
defaultLoginRateLimiter.Reset(rateLimitKey)
workspace, _ := h.authService.GetWorkspaceByID(r.Context(), user.WorkspaceID)
@ -151,6 +233,23 @@ func (h *AuthHandler) Login(w http.ResponseWriter, r *http.Request) {
respondJSON(w, http.StatusOK, response)
}
func loginRateLimitKey(r *http.Request, username string) string {
client := strings.TrimSpace(r.Header.Get("X-Forwarded-For"))
if idx := strings.Index(client, ","); idx >= 0 {
client = strings.TrimSpace(client[:idx])
}
if client == "" {
client = strings.TrimSpace(r.Header.Get("X-Real-IP"))
}
if client == "" {
client = r.RemoteAddr
if host, _, err := net.SplitHostPort(client); err == nil {
client = host
}
}
return strings.ToLower(strings.TrimSpace(username)) + "|" + client
}
func (h *AuthHandler) convertUserResponse(ctx context.Context, user *entity.User) *dto.UserResponse {
workspace, _ := h.authService.GetWorkspaceByID(ctx, user.WorkspaceID)
return &dto.UserResponse{

View File

@ -0,0 +1,44 @@
package rest
import (
"testing"
"time"
)
func TestLoginRateLimiterBlocksAfterConfiguredFailures(t *testing.T) {
now := time.Date(2026, 5, 14, 12, 0, 0, 0, time.UTC)
limiter := newLoginRateLimiter(time.Minute, 2)
limiter.now = func() time.Time { return now }
key := "user|127.0.0.1"
if !limiter.Allow(key) {
t.Fatal("expected first attempt to be allowed")
}
limiter.RecordFailure(key)
if !limiter.Allow(key) {
t.Fatal("expected second attempt to be allowed")
}
limiter.RecordFailure(key)
if limiter.Allow(key) {
t.Fatal("expected third attempt inside the window to be blocked")
}
now = now.Add(time.Minute + time.Second)
if !limiter.Allow(key) {
t.Fatal("expected attempts to be allowed after the window expires")
}
}
func TestLoginRateLimiterResetClearsFailures(t *testing.T) {
limiter := newLoginRateLimiter(time.Minute, 1)
key := "user|127.0.0.1"
limiter.RecordFailure(key)
if limiter.Allow(key) {
t.Fatal("expected key to be blocked after one failure")
}
limiter.Reset(key)
if !limiter.Allow(key) {
t.Fatal("expected reset key to be allowed")
}
}

View File

@ -4,6 +4,7 @@ import (
"encoding/json"
"fmt"
"net/http"
"reflect"
"strconv"
"strings"
"time"
@ -49,6 +50,11 @@ func (h *InstanceHandler) CreateInstance(w http.ResponseWriter, r *http.Request)
return
}
req.Normalize()
parsedYAML, hasValuesYAML, err := parseAndCompareValues(req.Values, req.ValuesYAML)
if err != nil {
respondError(w, http.StatusBadRequest, "Invalid values", err.Error())
return
}
// Extract chart name from repository (e.g., "charts/nginx" -> "nginx")
chart := req.Repository
@ -71,21 +77,16 @@ func (h *InstanceHandler) CreateInstance(w http.ResponseWriter, r *http.Request)
if req.Values != nil {
instance.SetValues(req.Values)
}
if req.ValuesYAML != "" {
if hasValuesYAML {
instance.SetValuesYAML(req.ValuesYAML)
if req.Values == nil {
values, err := parseValuesYAML(req.ValuesYAML)
if err != nil {
respondError(w, http.StatusBadRequest, "Invalid values YAML", err.Error())
return
}
instance.SetValues(values)
instance.SetValues(parsedYAML)
}
}
// 调用领域服务
if err := h.instanceService.CreateInstance(r.Context(), instance); err != nil {
respondError(w, http.StatusBadRequest, "Failed to create instance", err.Error())
respondServiceError(w, err, "Failed to create instance")
return
}
@ -116,6 +117,7 @@ func (h *InstanceHandler) GetInstance(w http.ResponseWriter, r *http.Request) {
respondError(w, http.StatusNotFound, "Instance not found", "resource does not belong to cluster")
return
}
h.instanceService.EnrichReplicas(r.Context(), clusterID, []*entity.Instance{instance})
respondJSON(w, http.StatusOK, convertInstanceResponse(instance, true))
}
@ -144,7 +146,7 @@ func (h *InstanceHandler) ListInstances(w http.ResponseWriter, r *http.Request)
responses := make([]*dto.InstanceResponse, 0, len(instances))
for _, instance := range instances {
responses = append(responses, convertInstanceResponse(instance, false))
responses = append(responses, convertInstanceResponse(instance, true))
}
response := &dto.InstanceListResponse{
@ -177,6 +179,11 @@ func (h *InstanceHandler) UpdateInstance(w http.ResponseWriter, r *http.Request)
return
}
req.Normalize()
parsedYAML, hasValuesYAML, err := parseAndCompareValues(req.Values, req.ValuesYAML)
if err != nil {
respondError(w, http.StatusBadRequest, "Invalid values", err.Error())
return
}
// 获取现有实例
instance, err := h.instanceService.GetInstance(r.Context(), instanceID)
@ -194,21 +201,16 @@ func (h *InstanceHandler) UpdateInstance(w http.ResponseWriter, r *http.Request)
if req.Description != "" {
instance.Description = req.Description
}
if req.ValuesYAML != "" {
if hasValuesYAML {
instance.SetValuesYAML(req.ValuesYAML)
if req.Values == nil {
values, err := parseValuesYAML(req.ValuesYAML)
if err != nil {
respondError(w, http.StatusBadRequest, "Invalid values YAML", err.Error())
return
}
instance.SetValues(values)
instance.SetValues(parsedYAML)
}
}
// 调用领域服务
if err := h.instanceService.UpdateInstance(r.Context(), instance); err != nil {
respondError(w, http.StatusBadRequest, "Failed to update instance", err.Error())
respondServiceError(w, err, "Failed to update instance")
return
}
@ -345,7 +347,6 @@ func (h *InstanceHandler) StreamInstanceLogs(w http.ResponseWriter, r *http.Requ
w.Header().Set("Content-Type", "text/event-stream")
w.Header().Set("Cache-Control", "no-cache")
w.Header().Set("Connection", "keep-alive")
w.Header().Set("Access-Control-Allow-Origin", "*")
flusher, ok := w.(http.Flusher)
if !ok {
@ -585,6 +586,7 @@ func convertInstanceResponse(instance *entity.Instance, includeValues bool) *dto
Status: string(instance.Status),
WorkspaceID: instance.WorkspaceID,
OwnerID: instance.OwnerID,
OwnerUsername: instance.OwnerUsername,
StatusReason: instance.StatusReason,
LastOperation: string(instance.LastOperation),
LastError: instance.LastError,
@ -622,6 +624,43 @@ func parseValuesYAML(valuesYAML string) (map[string]interface{}, error) {
return values, nil
}
func parseAndCompareValues(values map[string]interface{}, valuesYAML string) (map[string]interface{}, bool, error) {
if strings.TrimSpace(valuesYAML) == "" {
return nil, false, nil
}
parsed, err := parseValuesYAML(valuesYAML)
if err != nil {
return nil, true, fmt.Errorf("invalid values YAML: %w", err)
}
if values == nil {
return parsed, true, nil
}
normalizedValues, err := normalizeJSONComparable(values)
if err != nil {
return nil, true, fmt.Errorf("invalid values: %w", err)
}
normalizedYAML, err := normalizeJSONComparable(parsed)
if err != nil {
return nil, true, fmt.Errorf("invalid values YAML: %w", err)
}
if !reflect.DeepEqual(normalizedValues, normalizedYAML) {
return nil, true, fmt.Errorf("values and valuesYaml conflict")
}
return parsed, true, nil
}
func normalizeJSONComparable(value interface{}) (interface{}, error) {
data, err := json.Marshal(value)
if err != nil {
return nil, err
}
var normalized interface{}
if err := json.Unmarshal(data, &normalized); err != nil {
return nil, err
}
return normalized, nil
}
func normalizeYAMLValue(value interface{}) (interface{}, error) {
switch typed := value.(type) {
case map[string]interface{}:

View File

@ -43,6 +43,12 @@ func (h *MonitoringHandler) GetClusterMonitoring(w http.ResponseWriter, r *http.
respondJSON(w, http.StatusOK, response)
}
// GetClusterStats is a compatibility alias for cluster detail dashboards that
// historically read stats from /clusters/{id}/stats.
func (h *MonitoringHandler) GetClusterStats(w http.ResponseWriter, r *http.Request) {
h.GetClusterMonitoring(w, r)
}
// ListClusterMonitoring 获取所有集群的监控信息
// @Summary 列出集群监控
// @Tags Monitoring

View File

@ -2,6 +2,7 @@ package rest
import (
"encoding/json"
"errors"
"net/http"
"time"
@ -113,6 +114,15 @@ func (h *WorkspaceHandler) IssueCurrentKubeconfig(w http.ResponseWriter, r *http
if clusterID == "" {
clusterID = r.URL.Query().Get("cluster_id")
}
h.issueCurrentKubeconfigForCluster(w, r, clusterID)
}
func (h *WorkspaceHandler) IssueClusterKubeconfig(w http.ResponseWriter, r *http.Request) {
clusterID := mux.Vars(r)["cluster_id"]
h.issueCurrentKubeconfigForCluster(w, r, clusterID)
}
func (h *WorkspaceHandler) issueCurrentKubeconfigForCluster(w http.ResponseWriter, r *http.Request, clusterID string) {
kubeconfig, err := h.workspaceService.IssueCurrentKubeconfig(r.Context(), clusterID, 2*time.Hour)
if err != nil {
respondServiceError(w, err, "Failed to issue kubeconfig")
@ -152,11 +162,19 @@ func toWorkspaceResponse(workspace *entity.Workspace) workspaceResponse {
}
func respondServiceError(w http.ResponseWriter, err error, fallback string) {
if errors.Is(err, service.ErrQuotaExceeded) {
respondError(w, http.StatusUnprocessableEntity, "Quota exceeded", err.Error())
return
}
switch err {
case entity.ErrUnauthorized, authz.ErrUnauthenticated:
respondError(w, http.StatusUnauthorized, "Unauthorized", err.Error())
case entity.ErrForbidden, authz.ErrForbidden, entity.ErrUserInactive, entity.ErrWorkspaceSuspended:
respondError(w, http.StatusForbidden, "Forbidden", err.Error())
case entity.ErrWorkspaceNamespaceConflict, entity.ErrUserHasInstances, entity.ErrWorkspaceExists, entity.ErrInstanceExists:
respondError(w, http.StatusConflict, "Conflict", err.Error())
case entity.ErrProtectedNamespace:
respondError(w, http.StatusForbidden, "Forbidden", err.Error())
case entity.ErrClusterNotFound, entity.ErrRegistryNotFound, entity.ErrInstanceNotFound, entity.ErrWorkspaceNotFound:
respondError(w, http.StatusNotFound, fallback, err.Error())
default:

View File

@ -4,7 +4,7 @@ import (
"context"
"fmt"
"time"
"github.com/ocdp/cluster-service/internal/domain/entity"
"github.com/ocdp/cluster-service/internal/domain/repository"
)
@ -12,38 +12,47 @@ import (
// HelmClientMock Helm 客户端 Mock 实现
type HelmClientMock struct {
// Mock 数据存储
releases map[string]map[string]*entity.Instance // clusterID -> releaseName -> instance
history map[string]map[string][]*entity.ReleaseHistory // clusterID -> releaseName -> []history
releases map[string]map[string]*entity.Instance // clusterID -> releaseName -> instance
history map[string]map[string][]*entity.ReleaseHistory // clusterID -> releaseName -> []history
estimates map[string]map[string]*repository.ResourceEstimate // clusterID -> releaseName -> estimate
}
// NewHelmClientMock 创建 Mock 实现
func NewHelmClientMock() repository.HelmClient {
return &HelmClientMock{
releases: make(map[string]map[string]*entity.Instance),
history: make(map[string]map[string][]*entity.ReleaseHistory),
releases: make(map[string]map[string]*entity.Instance),
history: make(map[string]map[string][]*entity.ReleaseHistory),
estimates: make(map[string]map[string]*repository.ResourceEstimate),
}
}
func (c *HelmClientMock) SetResourceEstimate(clusterID, namespace, releaseName string, estimate *repository.ResourceEstimate) {
if c.estimates[clusterID] == nil {
c.estimates[clusterID] = make(map[string]*repository.ResourceEstimate)
}
c.estimates[clusterID][fmt.Sprintf("%s/%s", namespace, releaseName)] = estimate
}
func (c *HelmClientMock) Install(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance) error {
// 初始化集群数据
if c.releases[cluster.ID] == nil {
c.releases[cluster.ID] = make(map[string]*entity.Instance)
c.history[cluster.ID] = make(map[string][]*entity.ReleaseHistory)
}
// 检查是否已存在
key := fmt.Sprintf("%s/%s", instance.Namespace, instance.Name)
if _, exists := c.releases[cluster.ID][key]; exists {
return entity.ErrInstanceExists
}
// Mock 安装
instance.Status = entity.StatusDeployed
instance.Revision = 1
instance.UpdatedAt = time.Now()
c.releases[cluster.ID][key] = instance
// 添加历史记录
c.history[cluster.ID][key] = []*entity.ReleaseHistory{
{
@ -55,25 +64,25 @@ func (c *HelmClientMock) Install(ctx context.Context, cluster *entity.Cluster, i
Description: "Install complete",
},
}
return nil
}
func (c *HelmClientMock) Upgrade(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance) error {
key := fmt.Sprintf("%s/%s", instance.Namespace, instance.Name)
existing, exists := c.releases[cluster.ID][key]
if !exists {
return entity.ErrInstanceNotFound
}
// Mock 升级
instance.Revision = existing.Revision + 1
instance.Status = entity.StatusDeployed
instance.UpdatedAt = time.Now()
c.releases[cluster.ID][key] = instance
// 添加历史记录
history := &entity.ReleaseHistory{
Revision: instance.Revision,
@ -84,44 +93,44 @@ func (c *HelmClientMock) Upgrade(ctx context.Context, cluster *entity.Cluster, i
Description: "Upgrade complete",
}
c.history[cluster.ID][key] = append(c.history[cluster.ID][key], history)
return nil
}
func (c *HelmClientMock) Uninstall(ctx context.Context, cluster *entity.Cluster, releaseName, namespace string) error {
key := fmt.Sprintf("%s/%s", namespace, releaseName)
if _, exists := c.releases[cluster.ID][key]; !exists {
return entity.ErrInstanceNotFound
}
// Mock 卸载
delete(c.releases[cluster.ID], key)
return nil
}
func (c *HelmClientMock) Rollback(ctx context.Context, cluster *entity.Cluster, releaseName, namespace string, revision int) error {
key := fmt.Sprintf("%s/%s", namespace, releaseName)
instance, exists := c.releases[cluster.ID][key]
if !exists {
return entity.ErrInstanceNotFound
}
// 检查历史记录是否存在
histories := c.history[cluster.ID][key]
if revision > len(histories) || revision < 1 {
return fmt.Errorf("revision %d not found", revision)
}
// Mock 回滚
instance.Revision = len(histories) + 1
instance.Status = entity.StatusDeployed
instance.UpdatedAt = time.Now()
c.releases[cluster.ID][key] = instance
// 添加回滚历史记录
history := &entity.ReleaseHistory{
Revision: instance.Revision,
@ -132,33 +141,33 @@ func (c *HelmClientMock) Rollback(ctx context.Context, cluster *entity.Cluster,
Description: fmt.Sprintf("Rollback to revision %d", revision),
}
c.history[cluster.ID][key] = append(c.history[cluster.ID][key], history)
return nil
}
func (c *HelmClientMock) GetStatus(ctx context.Context, cluster *entity.Cluster, releaseName, namespace string) (*entity.Instance, error) {
key := fmt.Sprintf("%s/%s", namespace, releaseName)
instance, exists := c.releases[cluster.ID][key]
if !exists {
return nil, entity.ErrInstanceNotFound
}
return instance, nil
}
func (c *HelmClientMock) GetHistory(ctx context.Context, cluster *entity.Cluster, releaseName, namespace string) ([]*entity.ReleaseHistory, error) {
key := fmt.Sprintf("%s/%s", namespace, releaseName)
if _, exists := c.releases[cluster.ID][key]; !exists {
return nil, entity.ErrInstanceNotFound
}
histories := c.history[cluster.ID][key]
if histories == nil {
return []*entity.ReleaseHistory{}, nil
}
return histories, nil
}
@ -167,7 +176,7 @@ func (c *HelmClientMock) List(ctx context.Context, cluster *entity.Cluster, name
if clusterReleases == nil {
return []*entity.Instance{}, nil
}
instances := make([]*entity.Instance, 0)
for key, instance := range clusterReleases {
// 如果指定了 namespace只返回该 namespace 的
@ -179,18 +188,18 @@ func (c *HelmClientMock) List(ctx context.Context, cluster *entity.Cluster, name
}
instances = append(instances, c.releases[cluster.ID][key])
}
return instances, nil
}
func (c *HelmClientMock) GetValues(ctx context.Context, cluster *entity.Cluster, releaseName, namespace string) (map[string]interface{}, error) {
key := fmt.Sprintf("%s/%s", namespace, releaseName)
instance, exists := c.releases[cluster.ID][key]
if !exists {
return nil, entity.ErrInstanceNotFound
}
return instance.Values, nil
}
@ -204,3 +213,16 @@ func (c *HelmClientMock) GetChartDefaultValues(chartPath string) (map[string]int
}, nil
}
func (c *HelmClientMock) EstimateInstanceResources(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance) (*repository.ResourceEstimate, error) {
clusterID := ""
if cluster != nil {
clusterID = cluster.ID
}
key := fmt.Sprintf("%s/%s", instance.Namespace, instance.Name)
if c.estimates[clusterID] != nil {
if estimate := c.estimates[clusterID][key]; estimate != nil {
return estimate, nil
}
}
return &repository.ResourceEstimate{}, nil
}

View File

@ -10,6 +10,7 @@ import (
"github.com/ocdp/cluster-service/internal/domain/entity"
"github.com/ocdp/cluster-service/internal/domain/repository"
domainservice "github.com/ocdp/cluster-service/internal/domain/service"
"helm.sh/helm/v3/pkg/action"
"helm.sh/helm/v3/pkg/chart/loader"
"helm.sh/helm/v3/pkg/cli"
@ -346,6 +347,41 @@ func (h *HelmClient) GetChartDefaultValues(chartPath string) (map[string]interfa
return vals, nil
}
func (h *HelmClient) EstimateInstanceResources(ctx context.Context, cluster *entity.Cluster, instance *entity.Instance) (*repository.ResourceEstimate, error) {
chartPath := fmt.Sprintf("/tmp/charts/%s-%s.tgz", instance.Chart, instance.Version)
chart, err := loader.Load(chartPath)
if err != nil {
return nil, fmt.Errorf("failed to load chart: %w", err)
}
actionConfig := new(action.Configuration)
actionConfig.Log = func(format string, v ...interface{}) {}
install := action.NewInstall(actionConfig)
install.ReleaseName = instance.Name
if install.ReleaseName == "" {
install.ReleaseName = "quota-precheck"
}
install.Namespace = instance.Namespace
if install.Namespace == "" {
install.Namespace = "default"
}
install.DryRun = true
install.DryRunOption = "client"
install.ClientOnly = true
install.Replace = true
install.SkipSchemaValidation = true
values := instance.Values
if values == nil {
values = map[string]interface{}{}
}
release, err := install.RunWithContext(ctx, chart, values)
if err != nil {
return nil, fmt.Errorf("failed to render chart for quota estimate: %w", err)
}
return domainservice.EstimateRenderedManifestResources(release.Manifest)
}
// convertReleaseToInstance 转换 Helm Release 为 Instance
func (h *HelmClient) convertReleaseToInstance(rel *release.Release) *entity.Instance {
return &entity.Instance{

View File

@ -63,7 +63,7 @@ func (c *MetricsClient) GetClusterMetrics(ctx context.Context, clusterID string)
// 计算集群级别汇总
metrics := c.aggregateClusterMetrics(cluster, nodes.Items, pods.Items, nodeMetrics)
return metrics, nil
}
@ -87,6 +87,37 @@ func (c *MetricsClient) GetNodeMetrics(ctx context.Context, clusterID string) ([
return c.getNodeMetricsData(ctx, clientset, metricsClient, nodes.Items)
}
// GetPodResourceAllocations returns Kubernetes Pod requests/limits without
// inventing utilization values. GPU memory is treated as vendor integer MB.
func (c *MetricsClient) GetPodResourceAllocations(ctx context.Context, clusterID string) ([]*entity.PodResourceAllocation, error) {
cluster, err := c.clusterRepo.GetByID(ctx, clusterID)
if err != nil {
return nil, fmt.Errorf("failed to get cluster: %w", err)
}
clientset, _, err := c.createK8sClients(cluster)
if err != nil {
return nil, fmt.Errorf("failed to create k8s client: %w", err)
}
pods, err := clientset.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
if err != nil {
return nil, fmt.Errorf("failed to list pods: %w", err)
}
result := make([]*entity.PodResourceAllocation, 0, len(pods.Items))
for _, pod := range pods.Items {
result = append(result, &entity.PodResourceAllocation{
ClusterID: clusterID,
Namespace: pod.Namespace,
PodName: pod.Name,
InstanceName: inferHelmReleaseName(pod.Labels),
Allocation: podResourceAllocation(&pod),
})
}
return result, nil
}
// createK8sClients 创建 Kubernetes 客户端
func (c *MetricsClient) createK8sClients(cluster *entity.Cluster) (*kubernetes.Clientset, *metricsv.Clientset, error) {
config, err := clientcmd.RESTConfigFromKubeConfig([]byte(cluster.GetKubeConfig()))
@ -127,14 +158,14 @@ func (c *MetricsClient) getNodeMetricsData(
for _, node := range nodes {
nodeMetric := &entity.NodeMetrics{
NodeName: node.Name,
Status: getNodeStatus(&node),
Role: getNodeRole(&node),
Age: getNodeAge(&node),
OSImage: node.Status.NodeInfo.OSImage,
KernelVersion: node.Status.NodeInfo.KernelVersion,
ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
KubeletVersion: node.Status.NodeInfo.KubeletVersion,
NodeName: node.Name,
Status: getNodeStatus(&node),
Role: getNodeRole(&node),
Age: getNodeAge(&node),
OSImage: node.Status.NodeInfo.OSImage,
KernelVersion: node.Status.NodeInfo.KernelVersion,
ContainerRuntime: node.Status.NodeInfo.ContainerRuntimeVersion,
KubeletVersion: node.Status.NodeInfo.KubeletVersion,
}
// CPU
@ -213,7 +244,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
var totalCPU, totalMem, usedCPU, usedMem int64
var totalGPU, usedGPU int
healthyNodes := 0
// 单机最大值
var maxNodeCPU, maxNodeMem int64
var maxNodeGPU int
@ -251,7 +282,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
// 从 nodeMetrics 获取使用情况
if i < len(nodeMetrics) && nodeMetrics[i] != nil {
metrics.Nodes = append(metrics.Nodes, *nodeMetrics[i])
// 更新单机最大使用率
if nodeMetrics[i].CPUPercent > maxNodeCPUUsage {
maxNodeCPUUsage = nodeMetrics[i].CPUPercent
@ -274,7 +305,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
metrics.TotalCPU = fmt.Sprintf("%.2f cores", float64(totalCPU)/1000.0)
metrics.TotalMemory = formatBytes(totalMem)
metrics.TotalGPU = totalGPU
// 格式化单机最大值
metrics.MaxNodeCPU = fmt.Sprintf("%.2f cores", float64(maxNodeCPU)/1000.0)
metrics.MaxNodeMemory = formatBytes(maxNodeMem)
@ -292,7 +323,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
usedMem += int64(nm.MemoryPercent * float64(totalMem) / 100.0)
usedGPU += nm.GPUUsage
}
if totalCPU > 0 {
metrics.CPUUsage = float64(usedCPU) / float64(totalCPU) * 100
}
@ -302,7 +333,7 @@ func (c *MetricsClient) aggregateClusterMetrics(
if totalGPU > 0 {
metrics.GPUUsage = float64(usedGPU) / float64(totalGPU) * 100
}
metrics.UsedCPU = fmt.Sprintf("%.2f cores", float64(usedCPU)/1000.0)
metrics.UsedMemory = formatBytes(usedMem)
metrics.UsedGPU = usedGPU
@ -348,7 +379,7 @@ func getNodeAge(node *corev1.Node) string {
age := time.Since(node.CreationTimestamp.Time)
days := int(age.Hours() / 24)
hours := int(age.Hours()) % 24
if days > 0 {
return fmt.Sprintf("%dd %dh", days, hours)
}
@ -368,3 +399,110 @@ func formatBytes(bytes int64) string {
return fmt.Sprintf("%.1f %ciB", float64(bytes)/float64(div), "KMGTPE"[exp])
}
func inferHelmReleaseName(labels map[string]string) string {
if labels == nil {
return ""
}
for _, key := range []string{
"app.kubernetes.io/instance",
"release",
"helm.sh/release",
"meta.helm.sh/release-name",
"app",
} {
if value := labels[key]; value != "" {
return value
}
}
return ""
}
func podResourceAllocation(pod *corev1.Pod) entity.ResourceAllocation {
if pod == nil {
return entity.ResourceAllocation{}
}
sum := entity.ResourceAllocation{}
for _, container := range pod.Spec.Containers {
sum = addContainerAllocation(sum, container)
}
initMax := entity.ResourceAllocation{}
for _, container := range pod.Spec.InitContainers {
initMax = maxAllocation(initMax, containerAllocation(container))
}
return maxAllocation(sum, initMax)
}
func addContainerAllocation(base entity.ResourceAllocation, container corev1.Container) entity.ResourceAllocation {
return addAllocation(base, containerAllocation(container))
}
func containerAllocation(container corev1.Container) entity.ResourceAllocation {
requests := container.Resources.Requests
limits := container.Resources.Limits
return entity.ResourceAllocation{
CPURequestsMilli: quantityMilliValue(requests, corev1.ResourceCPU),
CPULimitsMilli: quantityMilliValue(limits, corev1.ResourceCPU),
MemoryRequestsBytes: quantityValue(requests, corev1.ResourceMemory),
MemoryLimitsBytes: quantityValue(limits, corev1.ResourceMemory),
GPURequests: quantityValue(requests, corev1.ResourceName("nvidia.com/gpu")),
GPULimits: quantityValue(limits, corev1.ResourceName("nvidia.com/gpu")),
GPUMemoryRequestsMB: quantityValueAny(requests, corev1.ResourceName("nvidia.com/gpumem"), corev1.ResourceName("requests.nvidia.com/gpumem")),
GPUMemoryLimitsMB: quantityValueAny(limits, corev1.ResourceName("nvidia.com/gpumem"), corev1.ResourceName("requests.nvidia.com/gpumem")),
}
}
func addAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
return entity.ResourceAllocation{
CPURequestsMilli: left.CPURequestsMilli + right.CPURequestsMilli,
CPULimitsMilli: left.CPULimitsMilli + right.CPULimitsMilli,
MemoryRequestsBytes: left.MemoryRequestsBytes + right.MemoryRequestsBytes,
MemoryLimitsBytes: left.MemoryLimitsBytes + right.MemoryLimitsBytes,
GPURequests: left.GPURequests + right.GPURequests,
GPULimits: left.GPULimits + right.GPULimits,
GPUMemoryRequestsMB: left.GPUMemoryRequestsMB + right.GPUMemoryRequestsMB,
GPUMemoryLimitsMB: left.GPUMemoryLimitsMB + right.GPUMemoryLimitsMB,
}
}
func maxAllocation(left, right entity.ResourceAllocation) entity.ResourceAllocation {
return entity.ResourceAllocation{
CPURequestsMilli: maxInt64(left.CPURequestsMilli, right.CPURequestsMilli),
CPULimitsMilli: maxInt64(left.CPULimitsMilli, right.CPULimitsMilli),
MemoryRequestsBytes: maxInt64(left.MemoryRequestsBytes, right.MemoryRequestsBytes),
MemoryLimitsBytes: maxInt64(left.MemoryLimitsBytes, right.MemoryLimitsBytes),
GPURequests: maxInt64(left.GPURequests, right.GPURequests),
GPULimits: maxInt64(left.GPULimits, right.GPULimits),
GPUMemoryRequestsMB: maxInt64(left.GPUMemoryRequestsMB, right.GPUMemoryRequestsMB),
GPUMemoryLimitsMB: maxInt64(left.GPUMemoryLimitsMB, right.GPUMemoryLimitsMB),
}
}
func quantityMilliValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
if quantity, ok := resources[name]; ok {
return quantity.MilliValue()
}
return 0
}
func quantityValue(resources corev1.ResourceList, name corev1.ResourceName) int64 {
if quantity, ok := resources[name]; ok {
return quantity.Value()
}
return 0
}
func quantityValueAny(resources corev1.ResourceList, names ...corev1.ResourceName) int64 {
for _, name := range names {
if quantity, ok := resources[name]; ok {
return quantity.Value()
}
}
return 0
}
func maxInt64(left, right int64) int64 {
if left > right {
return left
}
return right
}

View File

@ -0,0 +1,29 @@
package k8s
import (
"testing"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)
func TestContainerAllocationCountsVendorGPUMemoryKey(t *testing.T) {
container := corev1.Container{
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceName("nvidia.com/gpumem"): resource.MustParse("10000"),
},
Limits: corev1.ResourceList{
corev1.ResourceName("nvidia.com/gpumem"): resource.MustParse("12000"),
},
},
}
allocation := containerAllocation(container)
if allocation.GPUMemoryRequestsMB != 10000 {
t.Fatalf("expected GPU memory requests 10000 MB, got %d", allocation.GPUMemoryRequestsMB)
}
if allocation.GPUMemoryLimitsMB != 12000 {
t.Fatalf("expected GPU memory limits 12000 MB, got %d", allocation.GPUMemoryLimitsMB)
}
}

View File

@ -106,6 +106,25 @@ func (c *TenantClient) IssueKubeconfig(ctx context.Context, cluster *entity.Clus
}, nil
}
func (c *TenantClient) GetResourceQuotaUsage(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) (*repository.ResourceQuotaUsage, error) {
binding = binding.WithDefaults()
if err := binding.Validate(); err != nil {
return nil, err
}
clientset, _, err := c.clientsetForCluster(cluster)
if err != nil {
return nil, err
}
quota, err := clientset.CoreV1().ResourceQuotas(binding.Namespace).Get(ctx, binding.ResourceQuotaName, metav1.GetOptions{})
if err != nil {
return nil, fmt.Errorf("failed to get tenant resource quota usage: %w", err)
}
return &repository.ResourceQuotaUsage{
Hard: resourceVectorFromList(quota.Status.Hard),
Used: resourceVectorFromList(quota.Status.Used),
}, nil
}
// SuspendTenant revokes tenant API access by deleting only the RoleBinding.
func (c *TenantClient) SuspendTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
binding = binding.WithDefaults()
@ -128,6 +147,82 @@ func (c *TenantClient) SuspendTenant(ctx context.Context, cluster *entity.Cluste
return nil
}
func (c *TenantClient) DeleteTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
binding = binding.WithDefaults()
if err := binding.Validate(); err != nil {
return err
}
if isProtectedTenantNamespace(binding.Namespace) {
return entity.ErrProtectedNamespace
}
clientset, _, err := c.clientsetForCluster(cluster)
if err != nil {
return err
}
if err := deleteIgnoringNotFound(ctx, func() error {
return clientset.RbacV1().RoleBindings(binding.Namespace).Delete(ctx, binding.RoleBindingName, metav1.DeleteOptions{})
}); err != nil {
return fmt.Errorf("failed to delete tenant role binding: %w", err)
}
if err := deleteIgnoringNotFound(ctx, func() error {
return clientset.CoreV1().ResourceQuotas(binding.Namespace).Delete(ctx, binding.ResourceQuotaName, metav1.DeleteOptions{})
}); err != nil {
return fmt.Errorf("failed to delete tenant resource quota: %w", err)
}
if err := deleteIgnoringNotFound(ctx, func() error {
return clientset.CoreV1().ServiceAccounts(binding.Namespace).Delete(ctx, binding.ServiceAccountName, metav1.DeleteOptions{})
}); err != nil {
return fmt.Errorf("failed to delete tenant service account: %w", err)
}
namespace, err := clientset.CoreV1().Namespaces().Get(ctx, binding.Namespace, metav1.GetOptions{})
if apierrors.IsNotFound(err) {
return nil
}
if err != nil {
return fmt.Errorf("failed to get tenant namespace before deletion: %w", err)
}
if namespace.Labels["ocdp.io/managed-by"] != "ocdp" || namespace.Labels["ocdp.io/tenant"] != binding.Namespace {
return fmt.Errorf("refusing to delete unmanaged namespace %q", binding.Namespace)
}
if err := deleteIgnoringNotFound(ctx, func() error {
return clientset.CoreV1().Namespaces().Delete(ctx, binding.Namespace, metav1.DeleteOptions{})
}); err != nil {
return fmt.Errorf("failed to delete tenant namespace: %w", err)
}
return nil
}
func deleteIgnoringNotFound(ctx context.Context, deleteFn func() error) error {
if err := ctx.Err(); err != nil {
return err
}
err := deleteFn()
if apierrors.IsNotFound(err) {
return nil
}
return err
}
func isProtectedTenantNamespace(namespace string) bool {
switch strings.TrimSpace(namespace) {
case "", "default", "kube-system", "kube-public", "kube-node-lease":
return true
default:
return false
}
}
func resourceVectorFromList(values corev1.ResourceList) repository.ResourceVector {
gpu := values[corev1.ResourceName("requests.nvidia.com/gpu")]
gpuMem := values[corev1.ResourceName("requests.nvidia.com/gpumem")]
return repository.ResourceVector{
CPU: values[corev1.ResourceName("requests.cpu")],
Memory: values[corev1.ResourceName("requests.memory")],
GPU: gpu.Value(),
GPUMemoryMB: gpuMem.Value(),
}
}
func (c *TenantClient) clientsetForCluster(cluster *entity.Cluster) (kubernetes.Interface, *rest.Config, error) {
if c.clientset != nil {
config := &rest.Config{Host: "https://kubernetes.default.svc"}

View File

@ -2,6 +2,7 @@ package k8s
import (
"context"
"errors"
"strings"
"testing"
"time"
@ -58,7 +59,7 @@ func TestTenantClientEnsureTenantUpdatesExistingResources(t *testing.T) {
ctx := context.Background()
binding := tenantBinding()
clientset := fake.NewSimpleClientset(
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace}},
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace, Labels: binding.Labels}},
&corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: binding.ServiceAccountName, Namespace: binding.Namespace}},
&rbacv1.RoleBinding{
ObjectMeta: metav1.ObjectMeta{Name: binding.RoleBindingName, Namespace: binding.Namespace},
@ -100,7 +101,7 @@ func TestTenantClientSuspendTenantDeletesOnlyRoleBinding(t *testing.T) {
ctx := context.Background()
binding := tenantBinding()
clientset := fake.NewSimpleClientset(
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace}},
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace, Labels: binding.Labels}},
&corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: binding.ServiceAccountName, Namespace: binding.Namespace}},
desiredRoleBinding(binding),
)
@ -117,6 +118,47 @@ func TestTenantClientSuspendTenantDeletesOnlyRoleBinding(t *testing.T) {
}
}
func TestTenantClientDeleteTenantDeletesTenantResources(t *testing.T) {
ctx := context.Background()
binding := tenantBinding()
clientset := fake.NewSimpleClientset(
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: binding.Namespace, Labels: binding.Labels}},
&corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: binding.ServiceAccountName, Namespace: binding.Namespace}},
desiredRoleBinding(binding),
&corev1.ResourceQuota{ObjectMeta: metav1.ObjectMeta{Name: binding.ResourceQuotaName, Namespace: binding.Namespace}},
)
client := NewTenantClientForClientset(clientset)
if err := client.DeleteTenant(ctx, nil, binding); err != nil {
t.Fatalf("DeleteTenant returned error: %v", err)
}
if _, err := clientset.RbacV1().RoleBindings(binding.Namespace).Get(ctx, binding.RoleBindingName, metav1.GetOptions{}); !apierrors.IsNotFound(err) {
t.Fatalf("expected role binding deleted, got %v", err)
}
if _, err := clientset.CoreV1().ResourceQuotas(binding.Namespace).Get(ctx, binding.ResourceQuotaName, metav1.GetOptions{}); !apierrors.IsNotFound(err) {
t.Fatalf("expected resource quota deleted, got %v", err)
}
if _, err := clientset.CoreV1().ServiceAccounts(binding.Namespace).Get(ctx, binding.ServiceAccountName, metav1.GetOptions{}); !apierrors.IsNotFound(err) {
t.Fatalf("expected service account deleted, got %v", err)
}
if _, err := clientset.CoreV1().Namespaces().Get(ctx, binding.Namespace, metav1.GetOptions{}); !apierrors.IsNotFound(err) {
t.Fatalf("expected namespace deleted, got %v", err)
}
}
func TestTenantClientDeleteTenantRejectsProtectedNamespace(t *testing.T) {
ctx := context.Background()
client := NewTenantClientForClientset(fake.NewSimpleClientset(
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "default"}},
))
binding := entity.NewTenantBinding("default")
err := client.DeleteTenant(ctx, nil, binding)
if !errors.Is(err, entity.ErrProtectedNamespace) {
t.Fatalf("expected protected namespace error, got %v", err)
}
}
func TestTenantClientIssueKubeconfigCapsTokenTTL(t *testing.T) {
ctx := context.Background()
binding := tenantBinding()

View File

@ -31,6 +31,28 @@ func (c *MockTenantClient) IssueKubeconfig(ctx context.Context, cluster *entity.
}, nil
}
func (c *MockTenantClient) GetResourceQuotaUsage(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) (*repository.ResourceQuotaUsage, error) {
if err := binding.Validate(); err != nil {
return nil, err
}
return &repository.ResourceQuotaUsage{
Hard: resourceVectorFromList(binding.ResourceQuotaHard),
Used: repository.ResourceVector{},
}, nil
}
func (c *MockTenantClient) SuspendTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
return binding.Validate()
}
func (c *MockTenantClient) DeleteTenant(ctx context.Context, cluster *entity.Cluster, binding entity.TenantBinding) error {
if err := binding.Validate(); err != nil {
return err
}
switch binding.Namespace {
case "", "default", "kube-system", "kube-public", "kube-node-lease":
return entity.ErrProtectedNamespace
default:
return nil
}
}

View File

@ -72,6 +72,16 @@ func (r *WorkspaceRepositoryMock) Update(ctx context.Context, workspace *entity.
return nil
}
func (r *WorkspaceRepositoryMock) Delete(ctx context.Context, id string) error {
r.mu.Lock()
defer r.mu.Unlock()
if _, ok := r.workspaces[id]; !ok {
return entity.ErrWorkspaceNotFound
}
delete(r.workspaces, id)
return nil
}
func (r *WorkspaceRepositoryMock) List(ctx context.Context) ([]*entity.Workspace, error) {
r.mu.RLock()
defer r.mu.RUnlock()
@ -118,6 +128,20 @@ func (r *WorkspaceClusterBindingRepositoryMock) Get(ctx context.Context, workspa
return &copy, nil
}
func (r *WorkspaceClusterBindingRepositoryMock) ListByWorkspace(ctx context.Context, workspaceID string) ([]*entity.WorkspaceClusterBinding, error) {
r.mu.RLock()
defer r.mu.RUnlock()
result := make([]*entity.WorkspaceClusterBinding, 0)
for _, binding := range r.bindings {
if binding.WorkspaceID != workspaceID {
continue
}
copy := *binding
result = append(result, &copy)
}
return result, nil
}
func (r *WorkspaceClusterBindingRepositoryMock) Delete(ctx context.Context, workspaceID, clusterID string) error {
r.mu.Lock()
defer r.mu.Unlock()

View File

@ -27,8 +27,9 @@ func (r *WorkspaceRepository) Create(ctx context.Context, workspace *entity.Work
query := `
INSERT INTO workspaces (id, name, status, k8s_namespace, k8s_sa_name, default_cluster_id, quota_cpu, quota_memory, quota_gpu, quota_gpu_memory, created_by, created_at, updated_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
ON CONFLICT (name) DO NOTHING
`
_, err := r.db.conn.ExecContext(ctx, query,
result, err := r.db.conn.ExecContext(ctx, query,
workspace.ID,
workspace.Name,
workspace.Status,
@ -46,6 +47,13 @@ func (r *WorkspaceRepository) Create(ctx context.Context, workspace *entity.Work
if err != nil {
return fmt.Errorf("failed to create workspace: %w", err)
}
rows, err := result.RowsAffected()
if err != nil {
return fmt.Errorf("failed to get affected rows: %w", err)
}
if rows == 0 {
return entity.ErrWorkspaceExists
}
return nil
}
@ -132,6 +140,21 @@ func (r *WorkspaceRepository) Update(ctx context.Context, workspace *entity.Work
return nil
}
func (r *WorkspaceRepository) Delete(ctx context.Context, id string) error {
result, err := r.db.conn.ExecContext(ctx, `DELETE FROM workspaces WHERE id = $1`, id)
if err != nil {
return fmt.Errorf("failed to delete workspace: %w", err)
}
rows, err := result.RowsAffected()
if err != nil {
return fmt.Errorf("failed to get affected rows: %w", err)
}
if rows == 0 {
return entity.ErrWorkspaceNotFound
}
return nil
}
func (r *WorkspaceRepository) List(ctx context.Context) ([]*entity.Workspace, error) {
query := `
SELECT id, name, status, k8s_namespace, k8s_sa_name, default_cluster_id, quota_cpu, quota_memory, quota_gpu, quota_gpu_memory, created_by, created_at, updated_at
@ -256,6 +279,42 @@ func (r *WorkspaceClusterBindingRepository) Get(ctx context.Context, workspaceID
return binding, nil
}
func (r *WorkspaceClusterBindingRepository) ListByWorkspace(ctx context.Context, workspaceID string) ([]*entity.WorkspaceClusterBinding, error) {
query := `
SELECT id, workspace_id, cluster_id, namespace, service_account, quota_cpu, quota_memory, quota_gpu, quota_gpu_memory, status, created_at, updated_at
FROM workspace_cluster_bindings
WHERE workspace_id = $1
ORDER BY created_at ASC
`
rows, err := r.db.conn.QueryContext(ctx, query, workspaceID)
if err != nil {
return nil, fmt.Errorf("failed to list workspace cluster bindings: %w", err)
}
defer rows.Close()
bindings := make([]*entity.WorkspaceClusterBinding, 0)
for rows.Next() {
binding := &entity.WorkspaceClusterBinding{}
if err := rows.Scan(
&binding.ID,
&binding.WorkspaceID,
&binding.ClusterID,
&binding.Namespace,
&binding.ServiceAccount,
&binding.QuotaCPU,
&binding.QuotaMemory,
&binding.QuotaGPU,
&binding.QuotaGPUMem,
&binding.Status,
&binding.CreatedAt,
&binding.UpdatedAt,
); err != nil {
return nil, fmt.Errorf("failed to scan workspace cluster binding: %w", err)
}
bindings = append(bindings, binding)
}
return bindings, rows.Err()
}
func (r *WorkspaceClusterBindingRepository) Delete(ctx context.Context, workspaceID, clusterID string) error {
_, err := r.db.conn.ExecContext(ctx, `DELETE FROM workspace_cluster_bindings WHERE workspace_id = $1 AND cluster_id = $2`, workspaceID, clusterID)
return err