- Add GetMetrics method to MetricsClient interface and implement cluster metrics API - Add QuotaPrecheck service for validating resource quotas before deployment - Add auth DTO with role/permission models and auth handler tests - Add instance diagnostics: mounted NFS volumes, labels, annotations in pod diagnostics - Update workspace handler with GetWorkspace endpoint and shared-user list - Fix monitoring handler to use correct service method name - Add tail_lines fallback in instance handler for snake_case query params - Update nginx config for SSE log streaming support (no buffering) - Add comprehensive test coverage: auth_service_test, auth_handler_test, auth_dto_test, metrics_client_test, quota_precheck_test - Update error messages for quota validation and instance operations - ModifyModal: fix YAML lineWidth:0, modified keys summary, delta-only submit - InstanceCard: correctly disable scale-minus when replicas <= 0 - SidebarLayout: add hover transition for sidebar items - Update todo.md and lessons.md with latest fixes
401 lines
13 KiB
Go
401 lines
13 KiB
Go
package service
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/ocdp/cluster-service/internal/domain/entity"
|
|
"github.com/ocdp/cluster-service/internal/domain/repository"
|
|
corev1 "k8s.io/api/core/v1"
|
|
"k8s.io/apimachinery/pkg/api/resource"
|
|
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
|
|
"k8s.io/apimachinery/pkg/util/yaml"
|
|
)
|
|
|
|
// ErrQuotaExceeded is the sentinel error returned by the quota comparison
// helpers in this file when at least one resource exceeds its hard limit.
// Compare against it with errors.Is.
var ErrQuotaExceeded = errors.New("quota exceeded")
|
|
|
|
// QuotaExceededResource describes a single quota entry whose required amount
// is larger than the configured hard limit.
type QuotaExceededResource struct {
	// Name is the quota key, for example "requests.cpu".
	Name string
	// Required is the string form of the amount that would be needed.
	Required string
	// Hard is the string form of the hard limit that was exceeded.
	Hard string
}
|
|
|
|
type QuotaPrecheckResult struct {
|
|
Allowed bool
|
|
Required repository.ResourceEstimate
|
|
Hard repository.ResourceVector
|
|
Exceeded []QuotaExceededResource
|
|
}
|
|
|
|
type QuotaPrecheckService struct {
|
|
helmClient repository.HelmClient
|
|
}
|
|
|
|
func NewQuotaPrecheckService(helmClient repository.HelmClient) *QuotaPrecheckService {
|
|
return &QuotaPrecheckService{helmClient: helmClient}
|
|
}
|
|
|
|
func (s *QuotaPrecheckService) EstimateAndCompare(ctx context.Context, cluster *entity.Cluster, workspace *entity.Workspace, instance *entity.Instance) (*QuotaPrecheckResult, error) {
|
|
if s == nil || s.helmClient == nil {
|
|
return nil, errors.New("quota precheck requires helm client")
|
|
}
|
|
estimate, err := s.helmClient.EstimateInstanceResources(ctx, cluster, instance)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
result, err := CompareWorkspaceQuota(workspace, estimate)
|
|
if err != nil {
|
|
return result, err
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func (s *QuotaPrecheckService) EstimateAndCompareBinding(ctx context.Context, cluster *entity.Cluster, binding *entity.WorkspaceClusterBinding, usage *repository.ResourceQuotaUsage, target *entity.Instance, current *entity.Instance) (*QuotaPrecheckResult, error) {
|
|
if s == nil || s.helmClient == nil {
|
|
return nil, errors.New("quota precheck requires helm client")
|
|
}
|
|
targetEstimate, err := s.helmClient.EstimateInstanceResources(ctx, cluster, target)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
var currentEstimate *repository.ResourceEstimate
|
|
if current != nil {
|
|
currentEstimate, err = s.helmClient.EstimateInstanceResources(ctx, cluster, current)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
result, err := CompareBindingQuota(binding, usage, targetEstimate, currentEstimate)
|
|
if err != nil {
|
|
return result, err
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func CompareWorkspaceQuota(workspace *entity.Workspace, estimate *repository.ResourceEstimate) (*QuotaPrecheckResult, error) {
|
|
return compareQuotaList(resourceQuotaHard(workspace), nil, estimate, nil)
|
|
}
|
|
|
|
func CompareBindingQuota(binding *entity.WorkspaceClusterBinding, usage *repository.ResourceQuotaUsage, targetEstimate, currentEstimate *repository.ResourceEstimate) (*QuotaPrecheckResult, error) {
|
|
return compareQuotaList(bindingQuotaHard(binding), usage, targetEstimate, currentEstimate)
|
|
}
|
|
|
|
func compareQuotaList(hardList corev1.ResourceList, usage *repository.ResourceQuotaUsage, targetEstimate, currentEstimate *repository.ResourceEstimate) (*QuotaPrecheckResult, error) {
|
|
if targetEstimate == nil {
|
|
targetEstimate = &repository.ResourceEstimate{}
|
|
}
|
|
current := effectiveQuotaRequests(currentEstimate)
|
|
target := effectiveQuotaRequests(targetEstimate)
|
|
used := repository.ResourceVector{}
|
|
if usage != nil {
|
|
used = usage.Used
|
|
}
|
|
required := addResourceVector(subtractResourceVectorFloorZero(used, current), target)
|
|
hard := resourceVectorFromQuotaHard(hardList)
|
|
result := &QuotaPrecheckResult{
|
|
Allowed: true,
|
|
Required: repository.ResourceEstimate{
|
|
Requests: required,
|
|
},
|
|
Hard: hard,
|
|
}
|
|
addExceeded := func(name, required, limit string) {
|
|
result.Allowed = false
|
|
result.Exceeded = append(result.Exceeded, QuotaExceededResource{
|
|
Name: name,
|
|
Required: required,
|
|
Hard: limit,
|
|
})
|
|
}
|
|
if quantity, ok := hardList[corev1.ResourceName("requests.cpu")]; ok && required.CPU.Cmp(quantity) > 0 {
|
|
addExceeded("requests.cpu", required.CPU.String(), quantity.String())
|
|
}
|
|
if quantity, ok := hardList[corev1.ResourceName("requests.memory")]; ok && required.Memory.Cmp(quantity) > 0 {
|
|
addExceeded("requests.memory", required.Memory.String(), quantity.String())
|
|
}
|
|
if quantity, ok := hardList[corev1.ResourceName("requests.nvidia.com/gpu")]; ok && required.GPU > quantity.Value() {
|
|
addExceeded("requests.nvidia.com/gpu", strconv.FormatInt(required.GPU, 10), quantity.String())
|
|
}
|
|
if quantity, ok := hardList[corev1.ResourceName("requests.nvidia.com/gpumem")]; ok && required.GPUMemoryMB > quantity.Value() {
|
|
addExceeded("requests.nvidia.com/gpumem", strconv.FormatInt(required.GPUMemoryMB, 10), quantity.String())
|
|
}
|
|
sort.Slice(result.Exceeded, func(i, j int) bool {
|
|
return result.Exceeded[i].Name < result.Exceeded[j].Name
|
|
})
|
|
if !result.Allowed {
|
|
return result, ErrQuotaExceeded
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func legacyCompareWorkspaceQuota(workspace *entity.Workspace, estimate *repository.ResourceEstimate) (*QuotaPrecheckResult, error) {
|
|
if estimate == nil {
|
|
estimate = &repository.ResourceEstimate{}
|
|
}
|
|
hardList := resourceQuotaHard(workspace)
|
|
hard := resourceVectorFromQuotaHard(hardList)
|
|
result := &QuotaPrecheckResult{
|
|
Allowed: true,
|
|
Required: *estimate,
|
|
Hard: hard,
|
|
}
|
|
effectiveRequests := effectiveQuotaRequests(estimate)
|
|
addExceeded := func(name, required, limit string) {
|
|
result.Allowed = false
|
|
result.Exceeded = append(result.Exceeded, QuotaExceededResource{
|
|
Name: name,
|
|
Required: required,
|
|
Hard: limit,
|
|
})
|
|
}
|
|
if quantity, ok := hardList[corev1.ResourceName("requests.cpu")]; ok && effectiveRequests.CPU.Cmp(quantity) > 0 {
|
|
addExceeded("requests.cpu", effectiveRequests.CPU.String(), quantity.String())
|
|
}
|
|
if quantity, ok := hardList[corev1.ResourceName("requests.memory")]; ok && effectiveRequests.Memory.Cmp(quantity) > 0 {
|
|
addExceeded("requests.memory", effectiveRequests.Memory.String(), quantity.String())
|
|
}
|
|
if quantity, ok := hardList[corev1.ResourceName("requests.nvidia.com/gpu")]; ok && effectiveRequests.GPU > quantity.Value() {
|
|
addExceeded("requests.nvidia.com/gpu", strconv.FormatInt(effectiveRequests.GPU, 10), quantity.String())
|
|
}
|
|
if quantity, ok := hardList[corev1.ResourceName("requests.nvidia.com/gpumem")]; ok && effectiveRequests.GPUMemoryMB > quantity.Value() {
|
|
addExceeded("requests.nvidia.com/gpumem", strconv.FormatInt(effectiveRequests.GPUMemoryMB, 10), quantity.String())
|
|
}
|
|
sort.Slice(result.Exceeded, func(i, j int) bool {
|
|
return result.Exceeded[i].Name < result.Exceeded[j].Name
|
|
})
|
|
if !result.Allowed {
|
|
return result, ErrQuotaExceeded
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func effectiveQuotaRequests(estimate *repository.ResourceEstimate) repository.ResourceVector {
|
|
if estimate == nil {
|
|
return repository.ResourceVector{}
|
|
}
|
|
return repository.ResourceVector{
|
|
CPU: maxQuantity(estimate.Requests.CPU, estimate.Limits.CPU),
|
|
Memory: maxQuantity(estimate.Requests.Memory, estimate.Limits.Memory),
|
|
GPU: maxInt64(estimate.Requests.GPU, estimate.Limits.GPU),
|
|
GPUMemoryMB: maxInt64(estimate.Requests.GPUMemoryMB, estimate.Limits.GPUMemoryMB),
|
|
}
|
|
}
|
|
|
|
func addResourceVector(left, right repository.ResourceVector) repository.ResourceVector {
|
|
out := left
|
|
out.CPU.Add(right.CPU)
|
|
out.Memory.Add(right.Memory)
|
|
out.GPU += right.GPU
|
|
out.GPUMemoryMB += right.GPUMemoryMB
|
|
return out
|
|
}
|
|
|
|
func subtractResourceVectorFloorZero(left, right repository.ResourceVector) repository.ResourceVector {
|
|
out := left
|
|
out.CPU.Sub(right.CPU)
|
|
if out.CPU.Sign() < 0 {
|
|
out.CPU = resource.Quantity{}
|
|
}
|
|
out.Memory.Sub(right.Memory)
|
|
if out.Memory.Sign() < 0 {
|
|
out.Memory = resource.Quantity{}
|
|
}
|
|
out.GPU -= right.GPU
|
|
if out.GPU < 0 {
|
|
out.GPU = 0
|
|
}
|
|
out.GPUMemoryMB -= right.GPUMemoryMB
|
|
if out.GPUMemoryMB < 0 {
|
|
out.GPUMemoryMB = 0
|
|
}
|
|
return out
|
|
}
|
|
|
|
func maxQuantity(left, right resource.Quantity) resource.Quantity {
|
|
if left.Cmp(right) >= 0 {
|
|
return left
|
|
}
|
|
return right
|
|
}
|
|
|
|
// maxInt64 returns the larger of left and right.
func maxInt64(left, right int64) int64 {
	if right > left {
		return right
	}
	return left
}
|
|
|
|
func EstimateRenderedManifestResources(manifest string) (*repository.ResourceEstimate, error) {
|
|
decoder := yaml.NewYAMLOrJSONDecoder(strings.NewReader(manifest), 4096)
|
|
estimate := &repository.ResourceEstimate{}
|
|
for {
|
|
var obj unstructured.Unstructured
|
|
if err := decoder.Decode(&obj); err != nil {
|
|
if errors.Is(err, io.EOF) {
|
|
break
|
|
}
|
|
return nil, fmt.Errorf("failed to decode rendered manifest: %w", err)
|
|
}
|
|
if obj.GetKind() == "" {
|
|
continue
|
|
}
|
|
podSpec, replicas, ok := podTemplateSpec(obj.Object)
|
|
if !ok {
|
|
continue
|
|
}
|
|
addPodSpecResources(estimate, podSpec, replicas)
|
|
}
|
|
return estimate, nil
|
|
}
|
|
|
|
func resourceVectorFromQuotaHard(hard corev1.ResourceList) repository.ResourceVector {
|
|
gpu := hard[corev1.ResourceName("requests.nvidia.com/gpu")]
|
|
gpuMemory := hard[corev1.ResourceName("requests.nvidia.com/gpumem")]
|
|
return repository.ResourceVector{
|
|
CPU: hard[corev1.ResourceName("requests.cpu")],
|
|
Memory: hard[corev1.ResourceName("requests.memory")],
|
|
GPU: gpu.Value(),
|
|
GPUMemoryMB: gpuMemory.Value(),
|
|
}
|
|
}
|
|
|
|
func bindingQuotaHard(binding *entity.WorkspaceClusterBinding) corev1.ResourceList {
|
|
hard := corev1.ResourceList{}
|
|
if binding == nil {
|
|
return hard
|
|
}
|
|
addQuantity := func(name corev1.ResourceName, value string) {
|
|
value = normalizeStandardQuotaQuantity(value)
|
|
if value == "" {
|
|
return
|
|
}
|
|
if quantity, err := resource.ParseQuantity(value); err == nil {
|
|
hard[name] = quantity
|
|
}
|
|
}
|
|
addGPUMemoryQuantity := func(value string) {
|
|
value, err := normalizeGPUMemoryQuota(value)
|
|
if err != nil || value == "" {
|
|
return
|
|
}
|
|
if quantity, err := resource.ParseQuantity(value); err == nil {
|
|
hard[corev1.ResourceName("requests.nvidia.com/gpumem")] = quantity
|
|
}
|
|
}
|
|
addQuantity(corev1.ResourceName("requests.cpu"), binding.QuotaCPU)
|
|
addQuantity(corev1.ResourceName("requests.memory"), binding.QuotaMemory)
|
|
addQuantity(corev1.ResourceName("requests.nvidia.com/gpu"), binding.QuotaGPU)
|
|
addGPUMemoryQuantity(binding.QuotaGPUMem)
|
|
return hard
|
|
}
|
|
|
|
func podTemplateSpec(obj map[string]interface{}) (map[string]interface{}, int64, bool) {
|
|
kind, _, _ := unstructured.NestedString(obj, "kind")
|
|
switch kind {
|
|
case "Pod":
|
|
spec, ok := nestedMap(obj, "spec")
|
|
return spec, 1, ok
|
|
case "Deployment", "ReplicaSet", "StatefulSet", "ReplicationController":
|
|
spec, replicas, ok := workloadTemplateSpec(obj)
|
|
return spec, replicas, ok
|
|
case "DaemonSet", "Job":
|
|
spec, ok := nestedMap(obj, "spec", "template", "spec")
|
|
return spec, 1, ok
|
|
case "CronJob":
|
|
spec, ok := nestedMap(obj, "spec", "jobTemplate", "spec", "template", "spec")
|
|
return spec, 1, ok
|
|
default:
|
|
return nil, 0, false
|
|
}
|
|
}
|
|
|
|
func workloadTemplateSpec(obj map[string]interface{}) (map[string]interface{}, int64, bool) {
|
|
spec, ok := nestedMap(obj, "spec", "template", "spec")
|
|
if !ok {
|
|
return nil, 0, false
|
|
}
|
|
replicas, _, err := unstructured.NestedInt64(obj, "spec", "replicas")
|
|
if err != nil || replicas < 1 {
|
|
replicas = 1
|
|
}
|
|
return spec, replicas, true
|
|
}
|
|
|
|
func nestedMap(obj map[string]interface{}, fields ...string) (map[string]interface{}, bool) {
|
|
value, ok, err := unstructured.NestedMap(obj, fields...)
|
|
return value, ok && err == nil
|
|
}
|
|
|
|
func addPodSpecResources(estimate *repository.ResourceEstimate, podSpec map[string]interface{}, replicas int64) {
|
|
if replicas < 1 {
|
|
replicas = 1
|
|
}
|
|
for _, field := range []string{"initContainers", "containers"} {
|
|
containers, ok, err := unstructured.NestedSlice(podSpec, field)
|
|
if err != nil || !ok {
|
|
continue
|
|
}
|
|
for _, item := range containers {
|
|
container, ok := item.(map[string]interface{})
|
|
if !ok {
|
|
continue
|
|
}
|
|
addContainerResourceList(&estimate.Requests, replicas, container, "resources", "requests")
|
|
addContainerResourceList(&estimate.Limits, replicas, container, "resources", "limits")
|
|
}
|
|
}
|
|
}
|
|
|
|
func addContainerResourceList(target *repository.ResourceVector, replicas int64, container map[string]interface{}, fields ...string) {
|
|
resources, ok := nestedMap(container, fields...)
|
|
if !ok {
|
|
return
|
|
}
|
|
for name, value := range resources {
|
|
switch name {
|
|
case "cpu":
|
|
addQuantity(&target.CPU, value, replicas)
|
|
case "memory":
|
|
addQuantity(&target.Memory, value, replicas)
|
|
case "nvidia.com/gpu", "requests.nvidia.com/gpu", "limits.nvidia.com/gpu":
|
|
target.GPU += parseIntegerResource(value) * replicas
|
|
case "nvidia.com/gpumem", "requests.nvidia.com/gpumem", "limits.nvidia.com/gpumem":
|
|
target.GPUMemoryMB += parseGPUMemoryResource(value) * replicas
|
|
}
|
|
}
|
|
}
|
|
|
|
func addQuantity(target *resource.Quantity, value interface{}, replicas int64) {
|
|
quantity, err := resource.ParseQuantity(fmt.Sprint(value))
|
|
if err != nil {
|
|
return
|
|
}
|
|
quantity.Mul(replicas)
|
|
target.Add(quantity)
|
|
}
|
|
|
|
func parseIntegerResource(value interface{}) int64 {
|
|
quantity, err := resource.ParseQuantity(fmt.Sprint(value))
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
return quantity.Value()
|
|
}
|
|
|
|
func parseGPUMemoryResource(value interface{}) int64 {
|
|
normalized, err := normalizeGPUMemoryQuota(fmt.Sprint(value))
|
|
if err != nil || normalized == "" {
|
|
return 0
|
|
}
|
|
parsed, err := strconv.ParseInt(normalized, 10, 64)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
return parsed
|
|
}
|