feat: Add orchestration models and services for Kubernetes cluster management

- Implemented Pydantic models for Kubernetes cluster state representation in `cluster.py`.
- Created a `Resource` class for converting JSON/dict to Python objects in `resource.py`.
- Established user models and services for user management, including password hashing and JWT token generation.
- Developed application orchestration services for managing Kubernetes applications, including installation and uninstallation.
- Added cluster service for retrieving cluster status and health reports.
- Introduced node service for fetching node resource details and health status.
- Implemented user service for handling user authentication and management.
This commit is contained in:
2025-09-02 02:50:42 +00:00
parent 00039b2fe1
commit c7f8e69d61
65 changed files with 3649 additions and 0 deletions

13
.env.template Normal file
View File

@ -0,0 +1,13 @@
# config
OCDP_CONFIG_FILE="~/.ocdp/config.yaml"
# password
DATABASE_MYSQL_PASSWORD="****"
# token
TOKEN_JWT_SECRET_KEY="****"
# admin
ADMIN_USERNAME="admin"
ADMIN_PASSWORD="****"

17
.gitignore vendored Normal file
View File

@ -0,0 +1,17 @@
# python
__pycache__/
*.py[cod]
# data
*.csv
*.json
*.xlsx
*.yaml
*.yml
# env variable
.env
*.ini

View File

@ -0,0 +1 @@

28
config.yaml.template Normal file
View File

@ -0,0 +1,28 @@
orchestration:
kube:
kubectl_file: "~/.ocdp/kube/config"
applications_dir: "~/.ocdp/kube/applications"
logs_dir: "~/.ocdp/kube/logs"
logger:
loki:
url: "https://loki.bwgdi.com/loki/api/v1/push"
labels: "application=myapp,environment=develop"
label_keys: ""
database:
mysql:
host: "localhost"
port: 3306
db_name: "ocdp"
username: "root"
# ❗️ Password should be read from environment variables, not provided here
password:
hash:
algorithm: "ARGON2"
token:
jwt:
signing_algorithm: "HS256"
# ❗️ Secret should be read from environment variables, not provided here

515
frontend/frontend.html Normal file
View File

@ -0,0 +1,515 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Cluster Status Dashboard - Dynamic</title>
<!-- Tailwind CSS for styling -->
<script src="https://cdn.tailwindcss.com"></script>
<!-- Chart.js for beautiful charts -->
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<!-- Lucide Icons for a modern look -->
<script src="https://unpkg.com/lucide@latest"></script>
<style>
/* Custom styles for a better dark mode and overall look */
body {
font-family: 'Inter', sans-serif;
background: linear-gradient(135deg, #1a202c 0%, #2d3748 50%, #4a5568 100%);
background-size: 400% 400%;
animation: gradientBG 15s ease infinite;
}
@keyframes gradientBG {
0% { background-position: 0% 50%; }
50% { background-position: 100% 50%; }
100% { background-position: 0% 50%; }
}
.chart-container {
position: relative;
height: 120px;
width: 120px;
}
.chart-label {
position: absolute;
top: 50%;
left: 50%;
transform: translate(-50%, -50%);
font-size: 1.1rem;
font-weight: 600;
}
/* Glassmorphism Card Style */
.card {
background: rgba(31, 41, 55, 0.5); /* gray-800 with transparency */
backdrop-filter: blur(12px);
-webkit-backdrop-filter: blur(12px);
border-radius: 1rem; /* 16px */
border: 1px solid rgba(255, 255, 255, 0.1);
transition: all 0.3s ease;
}
.card:hover {
transform: translateY(-5px) scale(1.01);
box-shadow: 0 20px 25px -5px rgba(0, 0, 0, 0.2), 0 10px 10px -5px rgba(0, 0, 0, 0.1);
}
/* Custom scrollbar for pod lists */
.pod-list::-webkit-scrollbar {
width: 6px;
}
.pod-list::-webkit-scrollbar-track {
background: rgba(45, 55, 72, 0.5); /* gray-700 with transparency */
}
.pod-list::-webkit-scrollbar-thumb {
background: #90cdf4; /* blue-300 */
border-radius: 3px;
}
/* Tooltip for pressure status */
.tooltip {
position: relative;
display: inline-block;
}
.tooltip .tooltiptext {
visibility: hidden;
width: 140px;
background-color: #111827;
color: #fff;
text-align: center;
border-radius: 6px;
padding: 5px 0;
position: absolute;
z-index: 1;
bottom: 125%;
left: 50%;
margin-left: -70px;
opacity: 0;
transition: opacity 0.3s;
}
.tooltip:hover .tooltiptext {
visibility: visible;
opacity: 1;
}
</style>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
</head>
<body class="bg-gray-900 text-gray-200">
<div class="container mx-auto p-4 md:p-8">
<!-- API URL Input Section -->
<div class="card p-4 mb-8">
<div class="flex flex-col sm:flex-row items-center gap-4">
<label for="apiUrl" class="font-semibold text-white flex-shrink-0">API URL:</label>
<input type="text" id="apiUrl" placeholder="http://127.0.0.1:8000/api/v1/orchestration/cluster/cluster-status" class="w-full bg-gray-900/50 text-white border border-gray-600 rounded-lg px-4 py-2 focus:ring-2 focus:ring-blue-500 focus:border-blue-500 outline-none transition">
<button id="fetchDataBtn" class="bg-blue-600 hover:bg-blue-700 text-white font-bold py-2 px-6 rounded-lg flex items-center gap-2 transition w-full sm:w-auto">
<i data-lucide="refresh-cw" class="w-4 h-4"></i>
<span>Get Data</span>
</button>
</div>
<div id="status-message" class="mt-3 text-center min-h-[20px]"></div>
</div>
<!-- Header -->
<header class="mb-8 flex items-center gap-4">
<i data-lucide="layout-dashboard" class="w-10 h-10 text-blue-400"></i>
<div>
<h1 class="text-3xl font-bold text-white">Orchestration Cluster Status</h1>
<p class="text-gray-400">Real-time overview of cluster health and resource allocation.</p>
</div>
</header>
<main id="dashboard-content" class="hidden">
<!-- Cluster Summary Section -->
<div id="summary-section" class="mb-8">
<h2 class="text-2xl font-semibold text-white mb-4 flex items-center gap-2"><i data-lucide="server" class="w-6 h-6"></i>Cluster Summary</h2>
<div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-6">
<div id="health-card" class="card p-6 flex flex-col justify-center items-center"></div>
<div class="card p-6 col-span-1 md:col-span-2 lg:col-span-3">
<h3 class="text-lg font-semibold mb-4 text-white flex items-center gap-2"><i data-lucide="pie-chart" class="w-5 h-5"></i>Core Resource Usage</h3>
<div class="grid grid-cols-2 md:grid-cols-4 gap-6 text-center">
<div class="flex flex-col items-center">
<div class="chart-container"><canvas id="cpuChart"></canvas><div id="cpuChartLabel" class="chart-label text-white"></div></div>
<p class="mt-2 text-gray-300 font-medium">CPU Usage</p><p id="cpu-usage-text" class="text-sm text-gray-400"></p>
</div>
<div class="flex flex-col items-center">
<div class="chart-container"><canvas id="memoryChart"></canvas><div id="memoryChartLabel" class="chart-label text-white"></div></div>
<p class="mt-2 text-gray-300 font-medium">Memory Usage</p><p id="memory-usage-text" class="text-sm text-gray-400"></p>
</div>
<div class="flex flex-col items-center">
<div class="chart-container"><canvas id="storageChart"></canvas><div id="storageChartLabel" class="chart-label text-white"></div></div>
<p class="mt-2 text-gray-300 font-medium">Ephemeral Storage</p><p id="storage-usage-text" class="text-sm text-gray-400"></p>
</div>
<div class="flex flex-col items-center">
<div class="chart-container"><canvas id="podsChart"></canvas><div id="podsChartLabel" class="chart-label text-white"></div></div>
<p class="mt-2 text-gray-300 font-medium">Pod Allocation</p><p id="pods-usage-text" class="text-sm text-gray-400"></p>
</div>
</div>
</div>
</div>
<div class="grid grid-cols-1 md:grid-cols-3 gap-6 mt-6">
<div class="card p-6">
<h3 class="text-lg font-semibold text-white mb-3 flex items-center gap-2"><i data-lucide="lightbulb" class="w-5 h-5 text-yellow-300"></i>Scheduling Hints</h3>
<div id="scheduling-hints" class="space-y-3 text-gray-300"></div>
</div>
<div class="card p-6 md:col-span-2">
<h3 class="text-lg font-semibold text-white mb-3 flex items-center gap-2"><i data-lucide="gpu-chip" class="w-5 h-5 text-green-400"></i>GPU Availability</h3>
<div id="gpu-availability" class="space-y-2 text-gray-300"></div>
</div>
</div>
</div>
<!-- Nodes Section -->
<div>
<h2 class="text-2xl font-semibold text-white mb-4 flex items-center gap-2"><i data-lucide="hard-drive" class="w-6 h-6"></i>Node Details</h2>
<div id="nodes-grid" class="space-y-6"></div>
</div>
</main>
</div>
<script>
// Cached DOM handles and auto-refresh state for the dashboard page.
const apiUrlInput = document.getElementById('apiUrl'); // endpoint text field
const fetchDataBtn = document.getElementById('fetchDataBtn'); // "Get Data" button
const statusMessage = document.getElementById('status-message'); // inline status / error line under the input
const dashboardContent = document.getElementById('dashboard-content'); // hidden until the first successful fetch
let refreshInterval = null; // auto-refresh timer handle; cleared before each manual fetch
const REFRESH_INTERVAL_MS = 30000; // 30 seconds
// --- CORE LOGIC ---
// Pull the cluster-status JSON from the user-supplied endpoint, paint the
// dashboard, and (re)start the 30-second auto-refresh cycle. Every outcome
// (loading / success / failure) is reported through the status line.
async function fetchAndRenderDashboard() {
    const url = apiUrlInput.value.trim();
    if (!url) {
        showStatus('Please enter a valid API URL.', 'error');
        return;
    }
    // A manual fetch restarts the refresh cycle from scratch.
    if (refreshInterval) clearInterval(refreshInterval);
    fetchDataBtn.disabled = true;
    showStatus('Fetching data...', 'loading');
    try {
        const resp = await fetch(url);
        if (!resp.ok) {
            throw new Error(`HTTP error! Status: ${resp.status}`);
        }
        renderDashboard(await resp.json());
        dashboardContent.classList.remove('hidden');
        showStatus(`Data updated successfully. Next refresh in ${REFRESH_INTERVAL_MS / 1000}s.`, 'success');
        // Remember the working endpoint so the page can auto-load it next visit.
        localStorage.setItem('clusterApiUrl', url);
        refreshInterval = setInterval(fetchAndRenderDashboard, REFRESH_INTERVAL_MS);
    } catch (err) {
        console.error('Failed to fetch or render dashboard:', err);
        // A TypeError with this exact message is the browser's signature for a
        // blocked cross-origin request, so give a CORS-specific hint.
        const isLikelyCors = err instanceof TypeError && err.message === 'Failed to fetch';
        showStatus(
            isLikelyCors
                ? 'Network Error: Failed to fetch. This is likely a CORS issue. Please ensure your API server at the specified URL is running and has CORS enabled (e.g., with the "Access-Control-Allow-Origin: *" header).'
                : `Failed to load data: ${err.message}`,
            'error'
        );
        dashboardContent.classList.add('hidden');
    } finally {
        fetchDataBtn.disabled = false;
    }
}
// --- UTILITY FUNCTIONS ---
// Normalize a Kubernetes-style resource quantity string to a plain number:
// Gi passes through, Mi and Ki are scaled down to GiB, and a trailing "m"
// (millicores) is scaled to whole cores. Non-string input is coerced with
// parseFloat (falling back to 0).
function parseResourceValue(valueStr) {
    if (typeof valueStr !== 'string') return parseFloat(valueStr) || 0;
    const amount = parseFloat(valueStr);
    const normalized = valueStr.toLowerCase();
    // Order matters: "mi" must be matched before the bare "m" suffix.
    if (normalized.includes('gi')) return amount;
    if (normalized.includes('mi')) return amount / 1024;
    if (normalized.includes('ki')) return amount / 1024 / 1024;
    if (normalized.includes('m')) return amount / 1000;
    return amount;
}
// Write a message into the status line, colored by message type
// ('success' | 'error' | 'loading'; anything else renders gray).
function showStatus(message, type) {
    const colorByType = {
        success: 'text-green-400',
        error: 'text-red-400',
        loading: 'text-blue-400',
    };
    statusMessage.textContent = message;
    // Reset layout classes first so colors from previous calls don't stack.
    statusMessage.className = 'mt-3 text-center min-h-[20px] ';
    statusMessage.classList.add(colorByType[type] || 'text-gray-400');
}
// Draw (or redraw) a used-vs-free doughnut chart on the given canvas and
// write the usage percentage into its center label. Any chart previously
// bound to the same canvas is destroyed first so periodic refreshes don't
// leak Chart.js instances.
function createDonutChart(canvasId, labelId, used, total, color) {
    const free = total - used;
    const percentage = total > 0 ? ((used / total) * 100).toFixed(1) : 0;
    // Per-canvas registry of live Chart.js instances, kept on window.
    window.chartInstances = window.chartInstances || {};
    const existing = window.chartInstances[canvasId];
    if (existing) existing.destroy();
    const drawingCtx = document.getElementById(canvasId).getContext('2d');
    window.chartInstances[canvasId] = new Chart(drawingCtx, {
        type: 'doughnut',
        data: {
            datasets: [{
                // Clamp the "free" slice at 0 so over-committed resources
                // don't render a negative segment.
                data: [used, free > 0 ? free : 0],
                backgroundColor: [color, 'rgba(74, 85, 104, 0.5)'],
                borderWidth: 0,
                hoverBackgroundColor: [color, 'rgba(74, 85, 104, 0.7)']
            }]
        },
        options: {
            responsive: true, maintainAspectRatio: false, cutout: '75%',
            plugins: { legend: { display: false }, tooltip: { enabled: false } },
            animation: { duration: 500 }
        }
    });
    document.getElementById(labelId).innerText = `${percentage}%`;
}
// Summarize a node's pressure conditions: report the first pressure-type
// condition whose status is "True", or "No Pressure" when none apply.
// Missing / non-array input yields an "Unknown" reason.
function getNodePressureStatus(conditions) {
    if (!Array.isArray(conditions)) {
        return { hasPressure: false, reason: 'Unknown' };
    }
    const PRESSURE_TYPES = ['MemoryPressure', 'DiskPressure', 'PIDPressure'];
    const active = conditions.find(
        (cond) => PRESSURE_TYPES.includes(cond.type) && cond.status === 'True'
    );
    if (active) {
        return { hasPressure: true, reason: active.type };
    }
    return { hasPressure: false, reason: 'No Pressure' };
}
// --- RENDER FUNCTIONS ---
// Repaint the entire dashboard (summary section + node cards) from one API
// response, then re-run Lucide so icons inside the fresh markup render.
function renderDashboard(data) {
renderSummary(data); // Pass full data object (renderSummary also reads data.nodes)
renderNodes(data.nodes);
lucide.createIcons();
}
// Fill the cluster-summary section: the health card, the four resource
// donut charts, the scheduling hints, and the GPU availability list.
// Takes the full API payload (not just data.summary) because the GPU hint
// cross-references data.nodes for the recommended node's product name.
function renderSummary(data) {
const summary = data.summary;
const healthCard = document.getElementById('health-card');
// "Healthy" here means the API reported zero unhealthy nodes.
const isHealthy = summary.health.unhealthy_nodes === 0;
healthCard.innerHTML = `
<div class="text-center">
<p class="text-lg font-semibold ${isHealthy ? 'text-green-300' : 'text-red-400'} mb-2 flex items-center gap-2">
<i data-lucide="${isHealthy ? 'shield-check' : 'shield-alert'}" class="w-6 h-6"></i>
${isHealthy ? 'Cluster Healthy' : 'Cluster Unhealthy'}
</p>
<p class="text-4xl font-bold text-white">${summary.health.ready_nodes} <span class="text-2xl font-normal">/ ${summary.health.total_nodes}</span></p>
<p class="text-gray-400">Nodes Ready</p>
</div>
`;
const { cluster_total_cpu, cluster_total_memory, cluster_total_pods, cluster_total_ephemeral_storage, best_node_for_gpu_app } = summary.resources;
// CPU values go straight through parseFloat (no unit handling) —
// presumably the API reports plain core counts; TODO confirm.
const cpuUsed = parseFloat(cluster_total_cpu.used);
const cpuTotal = parseFloat(cluster_total_cpu.total);
createDonutChart('cpuChart', 'cpuChartLabel', cpuUsed, cpuTotal, '#6ee7b7');
document.getElementById('cpu-usage-text').innerText = `${cpuUsed.toFixed(2)} / ${cpuTotal.toFixed(2)} Cores`;
// Memory and storage may carry Ki/Mi/Gi suffixes; parseResourceValue
// normalizes everything to GiB.
const memUsed = parseResourceValue(cluster_total_memory.used);
const memTotal = parseResourceValue(cluster_total_memory.total);
createDonutChart('memoryChart', 'memoryChartLabel', memUsed, memTotal, '#93c5fd');
document.getElementById('memory-usage-text').innerText = `${memUsed.toFixed(2)} / ${memTotal.toFixed(2)} GiB`;
// Pod counts are used without parsing — presumably numeric in the payload.
createDonutChart('podsChart', 'podsChartLabel', cluster_total_pods.used, cluster_total_pods.total, '#fca5a5');
document.getElementById('pods-usage-text').innerText = `${cluster_total_pods.used} / ${cluster_total_pods.total} Pods`;
const storageUsed = parseResourceValue(cluster_total_ephemeral_storage.used);
const storageTotal = parseResourceValue(cluster_total_ephemeral_storage.total);
createDonutChart('storageChart', 'storageChartLabel', storageUsed, storageTotal, '#fde047');
document.getElementById('storage-usage-text').innerText = `${storageUsed.toFixed(2)} / ${storageTotal.toFixed(2)} GiB`;
// Scheduling hints: look up the recommended GPU node in the node list to
// display its GPU product alongside the free-card counts.
const hintsContainer = document.getElementById('scheduling-hints');
const bestGpuNode = data.nodes.find(n => n.name === best_node_for_gpu_app.node_name);
const gpuProduct = bestGpuNode?.gpu_info?.types[0]?.product || 'N/A';
const gpuMemoryGB = (best_node_for_gpu_app.memory_per_gpu_mb / 1024).toFixed(1); // MB -> GB per card
hintsContainer.innerHTML = `
<p><strong class="font-semibold text-blue-300">For CPU:</strong> ${summary.resources.best_node_for_cpu.node_name} (${summary.resources.best_node_for_cpu.free_amount} free)</p>
<p><strong class="font-semibold text-emerald-300">For Memory:</strong> ${summary.resources.best_node_for_memory.node_name} (${summary.resources.best_node_for_memory.free_amount} free)</p>
<p>
<strong class="font-semibold text-purple-300">For GPU App:</strong> ${best_node_for_gpu_app.node_name}
<span class="block text-sm text-gray-400 pl-4">
- Product: ${gpuProduct} <br>
- Free: ${best_node_for_gpu_app.free_gpu_count} cards (${gpuMemoryGB} GB/card) <br>
- Total Available: ${best_node_for_gpu_app.total_potential_memory_gb.toFixed(1)} GB
</span>
</p>
`;
// GPU availability list: hide unidentified products and exhausted pools;
// fall back to a placeholder paragraph when nothing remains.
const gpuContainer = document.getElementById('gpu-availability');
gpuContainer.innerHTML = summary.resources.distributed_gpu_availability
.filter(gpu => gpu.product !== 'Unknown' && gpu.total_free_count > 0)
.map(gpu => {
const totalMemoryGB = (gpu.total_free_count * gpu.memory_per_gpu_mb) / 1024;
return `
<div class="flex justify-between items-center bg-gray-900/40 p-3 rounded-lg">
<div>
<p class="font-semibold text-white">${gpu.product}</p>
<p class="text-sm text-gray-400">${gpu.total_free_count} cards &times; ${(gpu.memory_per_gpu_mb / 1024).toFixed(1)} GB/card</p>
</div>
<div class="text-right">
<p class="text-2xl font-bold text-green-300">${totalMemoryGB.toFixed(1)} GB</p>
<p class="text-sm text-gray-400">Total Available</p>
</div>
</div>
`}).join('') || `<p class="text-gray-400">No dedicated GPUs available in the cluster.</p>`;
}
// Render one detail card per node: header (name, roles, pressure tooltip,
// ready status), four usage bars, the optional GPU section, the running-pod
// table, and a collapsible labels panel toggled by toggleDetails(index).
function renderNodes(nodes) {
const nodesGrid = document.getElementById('nodes-grid');
nodesGrid.innerHTML = nodes.map((node, index) => {
const cpuUsed = parseResourceValue(node.cpu.used);
const cpuTotal = parseResourceValue(node.cpu.total);
const cpuPercentage = cpuTotal > 0 ? (cpuUsed / cpuTotal) * 100 : 0;
const memUsed = parseResourceValue(node.memory.used);
const memTotal = parseResourceValue(node.memory.total);
const memPercentage = memTotal > 0 ? (memUsed / memTotal) * 100 : 0;
// NOTE(review): total is treated as raw bytes here, but used goes through
// parseResourceValue (Ki/Mi/Gi handling) — confirm both fields really use
// different units in the API, otherwise the percentage is wrong.
const diskTotalBytes = parseFloat(node.ephemeral_storage.total);
const diskTotal = diskTotalBytes > 0 ? diskTotalBytes / (1024 * 1024 * 1024) : 0; // Convert bytes to GiB
const diskUsed = parseResourceValue(node.ephemeral_storage.used);
const diskPercentage = diskTotal > 0 ? (diskUsed / diskTotal) * 100 : 0;
const podsPercentage = node.pods.total > 0 ? (node.pods.used / node.pods.total) * 100 : 0;
const isReady = node.health.overall_status === 'Ready';
const pressureStatus = getNodePressureStatus(node.health.conditions);
const gpuSectionHtml = createGpuSection(node.gpu_info); // '' for GPU-less nodes
return `
<div class="card overflow-hidden">
<div class="p-4 bg-gray-900/30 flex flex-col sm:flex-row justify-between items-start sm:items-center">
<div>
<h3 class="text-xl font-bold text-white">${node.name}</h3>
<div class="flex flex-wrap gap-2 mt-2">
${node.roles.map(role => `<span class="bg-blue-500/50 text-blue-200 text-xs font-semibold px-2.5 py-1 rounded-full">${role}</span>`).join('')}
</div>
</div>
<div class="mt-3 sm:mt-0 flex items-center gap-4">
<div class="tooltip">
<i data-lucide="${pressureStatus.hasPressure ? 'shield-alert' : 'shield-check'}" class="w-6 h-6 ${pressureStatus.hasPressure ? 'text-orange-400' : 'text-green-400'}"></i>
<span class="tooltiptext">${pressureStatus.reason}</span>
</div>
<div class="flex items-center gap-2 text-lg font-semibold ${isReady ? 'text-green-300' : 'text-red-400'}">
<span class="w-3 h-3 rounded-full ${isReady ? 'bg-green-400' : 'bg-red-400'}"></span>
${node.health.overall_status}
</div>
</div>
</div>
<div class="p-4">
<div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-4 mb-4">
${createResourceBar('CPU', cpuUsed.toFixed(2), cpuTotal, 'Cores', cpuPercentage, 'bg-emerald-400')}
${createResourceBar('Memory', memUsed.toFixed(2), memTotal.toFixed(2), 'GiB', memPercentage, 'bg-blue-400')}
${createResourceBar('Ephemeral Storage', diskUsed.toFixed(2), diskTotal.toFixed(2), 'GiB', diskPercentage, 'bg-yellow-400')}
${createResourceBar('Pods', node.pods.used, node.pods.total, '', podsPercentage, 'bg-red-400')}
</div>
${gpuSectionHtml}
<h4 class="font-semibold text-gray-300 mt-4 mb-2">Running Pods (${node.running_pods.length})</h4>
<div class="overflow-x-auto pod-list max-h-60 bg-gray-900/50 rounded-lg">
<table class="w-full text-sm text-left text-gray-300">
<thead class="text-xs text-gray-400 uppercase bg-gray-900/70 sticky top-0">
<tr>
<th scope="col" class="px-4 py-2">Namespace</th><th scope="col" class="px-4 py-2">Name</th>
<th scope="col" class="px-4 py-2">CPU Req.</th><th scope="col" class="px-4 py-2">Memory Req.</th>
<th scope="col" class="px-4 py-2">Age</th>
</tr>
</thead>
<tbody>
${node.running_pods.map(pod => `
<tr class="border-b border-gray-700/50 hover:bg-gray-700/50">
<td class="px-4 py-2">${pod.namespace}</td>
<td class="px-4 py-2 font-medium text-white whitespace-nowrap">${pod.name}</td>
<td class="px-4 py-2">${pod.cpu_requests}</td>
<td class="px-4 py-2">${pod.memory_requests}</td>
<td class="px-4 py-2">${pod.age}</td>
</tr>`).join('')}
${node.running_pods.length === 0 ? `<tr><td colspan="5" class="text-center py-4 text-gray-500">No running pods</td></tr>` : ''}
</tbody>
</table>
</div>
<div class="text-right mt-2">
<button onclick="toggleDetails(${index})" class="text-sm text-blue-300 hover:underline">Show Labels & Conditions</button>
</div>
<div id="details-${index}" class="hidden mt-4 p-4 bg-black/30 rounded-lg">
<h5 class="font-semibold text-white mb-2">Labels</h5>
<pre class="text-xs bg-black/50 p-3 rounded-md max-h-48 overflow-auto"><code>${JSON.stringify(node.labels, null, 2)}</code></pre>
</div>
</div>
</div>
`;
}).join('');
}
// Build the optional GPU sub-section of a node card: a usage bar plus a
// product / total-memory summary line. Returns '' for nodes without GPUs
// (missing gpu_info, missing usage, or zero total cards).
function createGpuSection(gpu_info) {
if (!gpu_info || !gpu_info.usage || gpu_info.usage.total === 0) {
return '';
}
const { used, total } = gpu_info.usage;
const percentage = total > 0 ? (used / total) * 100 : 0;
// Only the first reported GPU type is summarized; NOTE(review): confirm
// nodes never mix GPU products, otherwise the others are not shown here.
const gpuType = gpu_info.types[0] || {};
const product = gpuType.product || 'N/A';
const memoryPerCard = gpuType.memory_mb ? (gpuType.memory_mb / 1024) : 0; // MB -> GB per card
const totalMemoryOnNode = total * memoryPerCard;
const detailsHtml = `
<div class="text-sm mt-2 text-gray-400">
<span>${product}</span>
<span class="float-right font-medium text-white">${totalMemoryOnNode.toFixed(1)} GB Total
<span class="text-gray-500">(${total} &times; ${memoryPerCard.toFixed(1)} GB)</span>
</span>
</div>
`;
return `
<div class="mt-4 pt-4 border-t border-gray-700/50">
${createResourceBar('GPU', used, total, 'Cards', percentage, 'bg-purple-400')}
${detailsHtml}
</div>
`;
}
// Render one labelled usage bar: "used / total unit" text above a track
// filled to `percentage` percent in `colorClass`. Resources with zero
// capacity return '' (hidden) — except when label is 'Pods'.
function createResourceBar(label, used, total, unit, percentage, colorClass) {
// Hide bar if total is 0, except for pods
if (total === 0 && label !== 'Pods') return '';
return `
<div>
<div class="flex justify-between mb-1">
<span class="text-sm font-medium text-gray-300">${label}</span>
<span class="text-sm font-medium text-gray-400">${used} / ${total} ${unit}</span>
</div>
<div class="w-full bg-gray-700/50 rounded-full h-2.5">
<div class="${colorClass} h-2.5 rounded-full" style="width: ${percentage}%"></div>
</div>
</div>
`;
}
// Show/hide the labels & conditions panel of the node card at `index`.
function toggleDetails(index) {
    document.getElementById(`details-${index}`).classList.toggle('hidden');
}
// --- INITIALIZATION ---
// Page bootstrap: render the static Lucide icons, wire the fetch button,
// and auto-load the endpoint saved by the last successful fetch (if any).
window.onload = () => {
lucide.createIcons();
fetchDataBtn.addEventListener('click', fetchAndRenderDashboard);
const savedUrl = localStorage.getItem('clusterApiUrl');
if (savedUrl) {
apiUrlInput.value = savedUrl;
fetchAndRenderDashboard();
}
};
</script>
</body>
</html>

406
frontend/other.html Normal file
View File

@ -0,0 +1,406 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OCDP Application Manager</title>
<script src="https://cdn.tailwindcss.com"></script>
<script src="https://unpkg.com/lucide@latest"></script>
<style>
/* Custom styles for glassmorphism and animations */
body {
font-family: 'Inter', sans-serif;
background: linear-gradient(135deg, #1a202c 0%, #2d3748 50%, #4a5568 100%);
background-size: 400% 400%;
animation: gradientBG 15s ease infinite;
}
@keyframes gradientBG {
0% { background-position: 0% 50%; }
50% { background-position: 100% 50%; }
100% { background-position: 0% 50%; }
}
.card {
background: rgba(31, 41, 55, 0.5);
backdrop-filter: blur(12px);
-webkit-backdrop-filter: blur(12px);
border-radius: 1rem;
border: 1px solid rgba(255, 255, 255, 0.1);
transition: all 0.3s ease;
}
.card:hover {
transform: translateY(-5px) scale(1.01);
box-shadow: 0 20px 25px -5px rgba(0, 0, 0, 0.2), 0 10px 10px -5px rgba(0, 0, 0, 0.1);
}
/* Tabs styling */
.tab-button.active {
background: rgba(55, 65, 81, 0.5);
color: #93c5fd;
border-bottom: 2px solid #3b82f6;
}
</style>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
</head>
<body class="bg-gray-900 text-gray-200">
<div class="container mx-auto p-4 md:p-8">
<header class="mb-8 flex flex-col sm:flex-row justify-between items-center gap-4">
<div class="flex items-center gap-4">
<i data-lucide="layout-dashboard" class="w-10 h-10 text-blue-400"></i>
<div>
<h1 class="text-3xl font-bold text-white">Application Manager</h1>
<p class="text-gray-400">Manage and deploy applications on your cluster.</p>
</div>
</div>
<div class="w-full sm:w-auto mt-4 sm:mt-0">
<input type="text" id="tokenInput" placeholder="Enter JWT Token" class="w-full sm:w-80 bg-gray-900/50 text-white border border-gray-600 rounded-lg px-4 py-2 focus:ring-2 focus:ring-blue-500 focus:border-blue-500 outline-none transition">
</div>
</header>
<main>
<div class="flex border-b border-gray-700/50 mb-6">
<button id="tabAvailable" class="tab-button px-6 py-3 font-semibold text-gray-400 hover:text-white transition active">Available Applications</button>
<button id="tabInstalled" class="tab-button px-6 py-3 font-semibold text-gray-400 hover:text-white transition">Installed Applications</button>
</div>
<div id="contentAvailable" class="tab-content">
<div id="availableAppsList" class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-6">
</div>
</div>
<div id="contentInstalled" class="tab-content hidden">
<div id="installedAppsList" class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-6">
</div>
</div>
</main>
</div>
<div id="installModal" class="fixed inset-0 bg-gray-900/50 backdrop-blur-md hidden justify-center items-center z-50">
<div class="card p-8 w-full max-w-lg">
<h3 class="text-2xl font-bold text-white mb-4">Install Application</h3>
<p id="modalAppName" class="text-gray-400 mb-4"></p>
<form id="installForm" class="space-y-4">
<div>
<label for="installMode" class="block text-sm font-medium text-gray-300">Deployment Mode</label>
<select id="installMode" class="mt-1 block w-full bg-gray-900/50 border border-gray-600 rounded-md shadow-sm p-2 text-white"></select>
</div>
<div>
<label for="userOverrides" class="block text-sm font-medium text-gray-300">User Overrides (JSON)</label>
<textarea id="userOverrides" rows="5" class="mt-1 block w-full bg-gray-900/50 border border-gray-600 rounded-md shadow-sm p-2 text-white"></textarea>
</div>
<div class="flex justify-end gap-3 pt-4">
<button type="button" id="cancelInstallBtn" class="bg-gray-600 hover:bg-gray-700 text-white font-bold py-2 px-4 rounded-lg transition">Cancel</button>
<button type="submit" id="submitInstallBtn" class="bg-blue-600 hover:bg-blue-700 text-white font-bold py-2 px-4 rounded-lg transition">Install</button>
</div>
</form>
</div>
</div>
<div id="statusModal" class="fixed inset-0 bg-gray-900/50 backdrop-blur-md hidden justify-center items-center z-50">
<div class="card p-8 w-full max-w-2xl">
<div class="flex justify-between items-center mb-4">
<h3 class="text-2xl font-bold text-white">Application Status</h3>
<button id="closeStatusModal" class="text-gray-400 hover:text-white"><i data-lucide="x" class="w-6 h-6"></i></button>
</div>
<div id="statusContent" class="space-y-4"></div>
</div>
</div>
<script>
// Base URL of the orchestration API; every endpoint below is relative to it.
const API_PREFIX = "http://localhost:8000/api/v1/orchestration";
// JWT used on authenticated calls. NOTE(review): no assignment from
// tokenInput is visible in this chunk — confirm it is wired up elsewhere.
let TOKEN = "";
// --- DOM Elements ---
const tokenInput = document.getElementById('tokenInput'); // JWT paste field in the header
const tabAvailable = document.getElementById('tabAvailable');
const tabInstalled = document.getElementById('tabInstalled');
const contentAvailable = document.getElementById('contentAvailable');
const contentInstalled = document.getElementById('contentInstalled');
const availableAppsList = document.getElementById('availableAppsList'); // card grid, "Available" tab
const installedAppsList = document.getElementById('installedAppsList'); // card grid, "Installed" tab
const installModal = document.getElementById('installModal');
const statusModal = document.getElementById('statusModal');
const closeStatusModal = document.getElementById('closeStatusModal');
const installForm = document.getElementById('installForm');
// NOTE(review): unused in this chunk — presumably driven by status polling defined later.
let activeRefreshInterval = null;
// --- Core Logic ---
// Load the catalog of installable application templates (no auth required)
// and render it into the "Available Applications" tab. Failures show an
// inline error card instead of throwing.
async function fetchAvailableApps() {
    try {
        const resp = await fetch(`${API_PREFIX}/application-templates`);
        if (!resp.ok) throw new Error("Failed to fetch available apps.");
        renderAvailableApps(await resp.json());
    } catch (err) {
        console.error("Error fetching available apps:", err);
        availableAppsList.innerHTML = `<p class="text-center text-red-400">Failed to load available applications. Please check your API server.</p>`;
    }
}
// Load the installed application instances (requires a JWT) and render
// them into the "Installed Applications" tab. Without a token, shows a
// prompt instead of calling the API.
async function fetchInstalledApps() {
    if (!TOKEN) {
        installedAppsList.innerHTML = `<p class="text-center text-yellow-400 col-span-full">Please enter a valid JWT token to view installed applications.</p>`;
        return;
    }
    try {
        const resp = await fetch(`${API_PREFIX}/application-instances`, {
            headers: { "Authorization": `Bearer ${TOKEN}` }
        });
        if (!resp.ok) throw new Error("Failed to fetch installed apps.");
        renderInstalledApps(await resp.json());
    } catch (err) {
        console.error("Error fetching installed apps:", err);
        installedAppsList.innerHTML = `<p class="text-center text-red-400">Failed to load installed applications. Token might be invalid or expired.</p>`;
    }
}
// Fetch live status for one application instance and render it inside the
// status modal. Silently no-ops when no token has been provided; fetch
// failures are written into the modal body.
async function fetchAndRenderStatus(namespace, app_template_name, mode) {
    if (!TOKEN) return;
    try {
        const resp = await fetch(`${API_PREFIX}/application-instances/${namespace}/${app_template_name}/status?mode=${mode}`, {
            headers: { "Authorization": `Bearer ${TOKEN}` }
        });
        if (!resp.ok) throw new Error("Failed to fetch status.");
        renderStatusDetails(await resp.json());
    } catch (err) {
        console.error("Error fetching status:", err);
        document.getElementById('statusContent').innerHTML = `<p class="text-red-400">Error: ${err.message}</p>`;
    }
}
// Uninstall the Helm release behind an installed application after a
// confirmation prompt, then refresh the installed-apps list. `mode`
// selects the deployment flavor (e.g. "distributed") on the API side.
async function uninstallRelease(namespace, app_template_name, mode) {
    if (!confirm(`Are you sure you want to uninstall the Helm release for ${app_template_name}?`)) return;
    try {
        const response = await fetch(`${API_PREFIX}/application-instances/${namespace}/${app_template_name}?mode=${mode}`, {
            method: 'DELETE',
            headers: { "Authorization": `Bearer ${TOKEN}` }
        });
        // Fix: guard before .json(). Error responses may not carry a JSON
        // body, and parsing them used to surface an opaque SyntaxError
        // instead of the HTTP status.
        if (!response.ok) throw new Error(`Uninstall failed with HTTP status ${response.status}`);
        const result = await response.json();
        alert(`Uninstall Release Result:\n${result.message}`);
        fetchInstalledApps(); // Refresh the list
    } catch (error) {
        console.error("Error uninstalling release:", error);
        alert(`Failed to uninstall release: ${error.message}`);
    }
}
// Permanently delete an entire namespace (and everything in it) after a
// strong confirmation prompt, then refresh the installed-apps list.
async function deleteNamespace(namespace) {
    if (!confirm(`WARNING: This will permanently delete the entire namespace '${namespace}' and all its resources.`)) return;
    try {
        const response = await fetch(`${API_PREFIX}/application-instances/${namespace}`, {
            method: 'DELETE',
            headers: { "Authorization": `Bearer ${TOKEN}` }
        });
        // Fix: guard before .json(). A non-2xx response may have no JSON
        // body, and parsing it used to surface an opaque SyntaxError
        // instead of the HTTP status.
        if (!response.ok) throw new Error(`Delete failed with HTTP status ${response.status}`);
        const result = await response.json();
        alert(`Delete Namespace Result:\n${result.message}`);
        fetchInstalledApps(); // Refresh the list
    } catch (error) {
        console.error("Error deleting namespace:", error);
        alert(`Failed to delete namespace: ${error.message}`);
    }
}
// --- Render Functions ---
// Render the "Available Applications" catalog cards. Each card's Install
// button opens the install modal pre-loaded with the template's metadata.
function renderAvailableApps(apps) {
availableAppsList.innerHTML = apps.map(app => {
// Bug fix: JSON.stringify emits double quotes, which previously terminated
// the double-quoted onclick attribute and produced broken markup. HTML-escape
// the serialized metadata (ampersands first, then quotes); the browser
// decodes the entities before evaluating the handler, so openInstallModal
// still receives the metadata object unchanged.
const metadataAttr = JSON.stringify(app.metadata).replace(/&/g, '&amp;').replace(/"/g, '&quot;');
return `
<div class="card p-6 flex flex-col justify-between">
<div>
<h3 class="text-xl font-bold text-white mb-2">${app.name}</h3>
<p class="text-sm text-gray-400">Business Name: ${app.metadata.application_name}</p>
<p class="text-sm text-gray-400">Chart: ${app.metadata.distributed.chart}</p>
</div>
<div class="mt-4">
<button onclick="openInstallModal('${app.name}', ${metadataAttr})" class="w-full bg-green-600 hover:bg-green-700 text-white font-bold py-2 px-4 rounded-lg flex items-center justify-center gap-2 transition">
<i data-lucide="plus-circle" class="w-4 h-4"></i> Install
</button>
</div>
</div>
`;
}).join('');
lucide.createIcons();
}
// Render one card per installed application instance, with buttons to view
// status, uninstall the Helm release (step 1), and delete the namespace (step 2).
// Status is colored green only for the Helm "deployed" state.
// NOTE(review): values are interpolated into HTML without escaping — assumes
// API-provided names are trusted; confirm.
function renderInstalledApps(apps) {
installedAppsList.innerHTML = apps.map(app => `
<div class="card p-6 flex flex-col justify-between">
<div>
<h3 class="text-xl font-bold text-white mb-2">${app.application_name}</h3>
<p class="text-sm text-gray-400">Namespace: <span class="text-blue-300 font-semibold">${app.namespace}</span></p>
<p class="text-sm text-gray-400">Release: ${app.release_name}</p>
<p class="text-sm text-gray-400">Status: <span class="font-semibold ${app.status === 'deployed' ? 'text-green-300' : 'text-yellow-300'}">${app.status}</span></p>
</div>
<div class="mt-4 flex gap-2">
<button onclick="viewStatusDetails('${app.namespace}', '${app.application_name}', 'distributed')" class="flex-1 bg-blue-600 hover:bg-blue-700 text-white font-bold py-2 px-2 rounded-lg transition text-sm">
View Status
</button>
<button onclick="uninstallRelease('${app.namespace}', '${app.application_name}', 'distributed')" class="flex-1 bg-red-600 hover:bg-red-700 text-white font-bold py-2 px-2 rounded-lg transition text-sm">
Uninstall
</button>
</div>
<div class="mt-2">
<button onclick="deleteNamespace('${app.namespace}')" class="w-full bg-red-800 hover:bg-red-900 text-white font-bold py-2 px-2 rounded-lg transition text-sm">
Delete Namespace
</button>
</div>
</div>
`).join('');
lucide.createIcons();
}
// Fill the status modal with an application's readiness summary and per-pod
// details, then show it. `base_access_url` is rendered only when present.
// Expects the ApplicationStatus shape returned by the status endpoint
// (application_name, namespace, is_ready, details[], optional base_access_url).
function renderStatusDetails(statusData) {
const content = document.getElementById('statusContent');
content.innerHTML = `
<p class="text-xl font-bold text-white">Application: ${statusData.application_name}</p>
<p class="text-gray-400">Namespace: ${statusData.namespace}</p>
<p class="text-gray-400">Ready: <span class="font-semibold ${statusData.is_ready ? 'text-green-300' : 'text-yellow-300'}">${statusData.is_ready}</span></p>
${statusData.base_access_url ? `<p class="text-gray-400">Access URL: <a href="${statusData.base_access_url}" target="_blank" class="text-blue-400 hover:underline">${statusData.base_access_url}</a></p>` : ''}
<h4 class="font-semibold text-white mt-4 mb-2">Pod Details</h4>
<ul class="space-y-2">
${statusData.details.map(pod => `
<li class="bg-gray-800/50 p-3 rounded-lg flex items-center justify-between">
<div>
<p class="font-bold text-white">${pod.pod_name}</p>
<p class="text-sm text-gray-400">Phase: ${pod.status_phase} | Ready: ${pod.ready_status}</p>
</div>
<span class="w-3 h-3 rounded-full ${pod.is_ready ? 'bg-green-400' : 'bg-red-400'}"></span>
</li>
`).join('')}
</ul>
`;
statusModal.classList.remove('hidden');
statusModal.classList.add('flex');
}
// --- Event Handlers & Modal Functions ---
// Populate and show the install dialog for the chosen application template.
function openInstallModal(appName, metadata) {
    document.getElementById('modalAppName').innerText = `Installing ${appName}`;

    // Offer only the deployment modes the template actually defines.
    const modeSelect = document.getElementById('installMode');
    modeSelect.innerHTML = '';
    [
        { name: 'distributed', data: metadata.distributed },
        { name: 'monolithic', data: metadata.monolithic }
    ].filter(candidate => candidate.data).forEach(candidate => {
        const opt = document.createElement('option');
        opt.value = candidate.name;
        opt.innerText = candidate.name;
        modeSelect.appendChild(opt);
    });

    installModal.classList.remove('hidden');
    installModal.classList.add('flex');
    // Remember which template the submit handler should install.
    installForm.dataset.appName = appName;
}
// Submit handler for the install dialog: validates the optional JSON
// overrides, POSTs the install request, and refreshes the installed list.
installForm.addEventListener('submit', async (e) => {
    e.preventDefault();

    const appName = installForm.dataset.appName;
    const mode = document.getElementById('installMode').value;
    const overridesText = document.getElementById('userOverrides').value;

    // Parse optional Helm value overrides; abort on malformed JSON.
    let userOverrides = {};
    if (overridesText) {
        try {
            userOverrides = JSON.parse(overridesText);
        } catch (parseErr) {
            alert("Invalid JSON for user overrides.");
            return;
        }
    }

    try {
        const payload = {
            app_template_name: appName,
            mode: mode,
            user_overrides: userOverrides
        };
        const response = await fetch(`${API_PREFIX}/application-instances`, {
            method: 'POST',
            headers: {
                "Content-Type": "application/json",
                "Authorization": `Bearer ${TOKEN}`
            },
            body: JSON.stringify(payload)
        });
        const result = await response.json();
        if (!response.ok) throw new Error(result.detail || 'Installation failed.');
        alert(`Installation started successfully:\nNamespace: ${result.namespace}\nMessage: ${result.message}`);
        installModal.classList.remove('flex');
        installModal.classList.add('hidden');
        fetchInstalledApps(); // Refresh installed list
    } catch (error) {
        console.error("Installation error:", error);
        alert(`Installation failed: ${error.message}`);
    }
});
// Close the install dialog without submitting.
document.getElementById('cancelInstallBtn').addEventListener('click', () => {
installModal.classList.remove('flex');
installModal.classList.add('hidden');
});
// Dismiss the status-details dialog.
closeStatusModal.addEventListener('click', () => {
statusModal.classList.remove('flex');
statusModal.classList.add('hidden');
});
// Show a loading placeholder immediately, then fetch and render the
// application's live status into the modal.
function viewStatusDetails(namespace, app_template_name, mode) {
    const content = document.getElementById('statusContent');
    content.innerHTML = `<p class="text-center text-blue-400">Loading status...</p>`;
    fetchAndRenderStatus(namespace, app_template_name, mode);
}
// --- Tab Switching Logic ---
// Activate the requested tab, show its panel, and (re)load its data.
function switchTab(tabId) {
    // Reset every tab button and hide every panel first.
    document.querySelectorAll('.tab-button').forEach(btn => btn.classList.remove('active'));
    document.querySelectorAll('.tab-content').forEach(panel => panel.classList.add('hidden'));

    // Dispatch table: tab id -> button, panel, and loader.
    const tabs = {
        tabAvailable: { button: tabAvailable, panel: contentAvailable, load: fetchAvailableApps },
        tabInstalled: { button: tabInstalled, panel: contentInstalled, load: fetchInstalledApps }
    };
    const entry = tabs[tabId];
    if (entry) {
        entry.button.classList.add('active');
        entry.panel.classList.remove('hidden');
        entry.load();
    }
    lucide.createIcons();
}
tabAvailable.addEventListener('click', () => switchTab('tabAvailable'));
tabInstalled.addEventListener('click', () => switchTab('tabInstalled'));
// --- Initialization ---
// Page bootstrap: restore a saved token, pick the initial tab, and keep the
// token input synced with localStorage.
window.onload = () => {
    const storedToken = localStorage.getItem('jwtToken');
    if (storedToken) {
        tokenInput.value = storedToken;
        TOKEN = storedToken;
        switchTab('tabInstalled');
    } else {
        switchTab('tabAvailable');
    }

    // Persist the token as the user types; once a token exists, refresh
    // whichever tab is currently active so authorized data loads.
    tokenInput.addEventListener('input', (e) => {
        TOKEN = e.target.value;
        localStorage.setItem('jwtToken', TOKEN);
        if (!TOKEN) return;
        const activeTab = document.querySelector('.tab-button.active');
        if (activeTab) switchTab(activeTab.id);
    });

    lucide.createIcons();
};
</script>
</body>
</html>

View File

@ -0,0 +1,147 @@
# A generic, single database configuration.
[alembic]
# path to migration scripts.
# this is typically a path given in POSIX (e.g. forward slashes)
# format, relative to the token %(here)s which refers to the location of this
# ini file
script_location = %(here)s/alembic
# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
# for all available tokens
# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory. for multiple paths, the path separator
# is defined by "path_separator" below.
prepend_sys_path = .
# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the python>=3.9 or backports.zoneinfo library and tzdata library.
# Any required deps can installed by adding `alembic[tz]` to the pip requirements
# string value is passed to ZoneInfo()
# leave blank for localtime
# timezone =
# max length of characters to apply to the "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; This defaults
# to <script_location>/versions. When using multiple version
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "path_separator"
# below.
# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions
# path_separator; This indicates what character is used to split lists of file
# paths, including version_locations and prepend_sys_path within configparser
# files such as alembic.ini.
# The default rendered in new alembic.ini files is "os", which uses os.pathsep
# to provide os-dependent path splitting.
#
# Note that in order to support legacy alembic.ini files, this default does NOT
# take place if path_separator is not present in alembic.ini. If this
# option is omitted entirely, fallback logic is as follows:
#
# 1. Parsing of the version_locations option falls back to using the legacy
# "version_path_separator" key, which if absent then falls back to the legacy
# behavior of splitting on spaces and/or commas.
# 2. Parsing of the prepend_sys_path option falls back to the legacy
# behavior of splitting on spaces, commas, or colons.
#
# Valid values for path_separator are:
#
# path_separator = :
# path_separator = ;
# path_separator = space
# path_separator = newline
#
# Use os.pathsep. Default configuration used for new projects.
path_separator = os
# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
# database URL. This is consumed by the user-maintained env.py script only.
# other means of configuring database URLs may be customized within the env.py
# file.
# SECURITY NOTE: credentials are committed in plain text here. Prefer reading
# the password from an environment variable (as .env.template and
# config.yaml.template do for the application) and injecting it in env.py.
sqlalchemy.url = mysql+pymysql://root:GDIP%%40ssw0rd@localhost:3306/ocdp
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME
# lint with attempts to fix using "ruff" - use the module runner, against the "ruff" module
# hooks = ruff
# ruff.type = module
# ruff.module = ruff
# ruff.options = check --fix REVISION_SCRIPT_FILENAME
# Alternatively, use the exec runner to execute a binary found on your PATH
# hooks = ruff
# ruff.type = exec
# ruff.executable = ruff
# ruff.options = check --fix REVISION_SCRIPT_FILENAME
# Logging configuration. This is also consumed by the user-maintained
# env.py script only.
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARNING
handlers = console
qualname =
[logger_sqlalchemy]
level = WARNING
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

1
migration/alembic/README Normal file
View File

@ -0,0 +1 @@
Generic single-database configuration.

81
migration/alembic/env.py Normal file
View File

@ -0,0 +1,81 @@
from logging.config import fileConfig
from sqlalchemy import engine_from_config
from sqlalchemy import pool
from alembic import context

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# Import the project's declarative Base and all model modules so every table
# is registered on Base.metadata before autogenerate inspects it.
# (The wildcard import is deliberate: importing the package registers models
# as a side effect.)
from ocdp.database import Base
from ocdp.models import *
target_metadata = Base.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well. By skipping the Engine creation
    we don't even need a DBAPI to be available.

    Calls to context.execute() here emit the given string to the
    script output.
    """
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
        url=url,
        target_metadata=target_metadata,
        literal_binds=True,  # render bound parameters inline in the emitted SQL
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()
def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    In this scenario we need to create an Engine
    and associate a connection with the context.
    """
    # Build the engine from the [alembic] section of alembic.ini
    # (keys prefixed "sqlalchemy."); NullPool avoids holding connections open
    # beyond the migration run.
    connectable = engine_from_config(
        config.get_section(config.config_ini_section, {}),
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with connectable.connect() as connection:
        context.configure(
            connection=connection, target_metadata=target_metadata
        )

        with context.begin_transaction():
            context.run_migrations()


# Alembic executes this module directly; dispatch on the requested mode.
if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()

View File

@ -0,0 +1,28 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
down_revision: Union[str, Sequence[str], None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
def upgrade() -> None:
"""Upgrade schema."""
${upgrades if upgrades else "pass"}
def downgrade() -> None:
"""Downgrade schema."""
${downgrades if downgrades else "pass"}

View File

@ -0,0 +1,49 @@
"""create initial tables
Revision ID: 796b67d23c1c
Revises:
Create Date: 2025-08-23 16:05:51.420713
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision: str = '796b67d23c1c'
down_revision: Union[str, Sequence[str], None] = None  # first migration in the chain
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Upgrade schema: create the `users` table and its lookup indexes.

    NOTE: once this migration has been applied anywhere, do not edit it —
    add a new revision instead.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('users',
        sa.Column('user_id', sa.Integer(), autoincrement=True, nullable=False),
        sa.Column('username', sa.String(length=64), nullable=False),
        sa.Column('email', sa.String(length=128), nullable=False),
        sa.Column('hashed_password', sa.String(length=128), nullable=False),
        sa.Column('is_active', sa.Boolean(), nullable=False),
        sa.Column('is_admin', sa.Boolean(), nullable=False),
        sa.Column('created_at', sa.TIMESTAMP(timezone=True), server_default=sa.text('now()'), nullable=False),
        sa.Column('updated_at', sa.TIMESTAMP(timezone=True), server_default=sa.text('now()'), nullable=False),
        sa.Column('last_login_at', sa.TIMESTAMP(timezone=True), nullable=True),
        sa.PrimaryKeyConstraint('user_id')
    )
    # Unique lookups by email and username; plain secondary index on the PK column.
    op.create_index(op.f('ix_users_email'), 'users', ['email'], unique=True)
    op.create_index(op.f('ix_users_user_id'), 'users', ['user_id'], unique=False)
    op.create_index(op.f('ix_users_username'), 'users', ['username'], unique=True)
    # ### end Alembic commands ###
def downgrade() -> None:
    """Downgrade schema: drop the `users` table and its indexes (reverse of upgrade)."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Indexes are dropped before the table, mirroring upgrade() in reverse order.
    op.drop_index(op.f('ix_users_username'), table_name='users')
    op.drop_index(op.f('ix_users_user_id'), table_name='users')
    op.drop_index(op.f('ix_users_email'), table_name='users')
    op.drop_table('users')
    # ### end Alembic commands ###

5
ocdp/__init__.py Normal file
View File

@ -0,0 +1,5 @@
# env
# Load variables from a local .env file (DB password, JWT secret, admin
# credentials) BEFORE any submodule reads os.environ at import time.
from dotenv import load_dotenv
load_dotenv()
# Importing the logger package configures application-wide logging as a side
# effect of package import.
from . import logger

30
ocdp/__main__.py Normal file
View File

@ -0,0 +1,30 @@
"""Application entry point: `python -m ocdp` starts the API server."""
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import uvicorn

from ocdp.controllers.v1 import router as api_v1_router

app = FastAPI(title="One Click Deployment API")
app.include_router(api_v1_router, tags=["v1"])
# app.include_router(api_v2_router, tags=["v2"])

# Allowed CORS origins (e.g. the frontend at http://localhost:3000).
# NOTE(review): wildcard origins together with allow_credentials=True is not
# honored by browsers/Starlette for credentialed requests — lock this down to
# concrete frontend origins in production.
origins = [
    "*",  # allow all origins; restrict to specific domains in production
    # "http://localhost:3000",
    # "https://yourdomain.com",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,       # origins allowed to call the API
    allow_credentials=True,      # allow cookies / Authorization headers
    allow_methods=["*"],         # allow all HTTP methods
    allow_headers=["*"],         # allow all request headers
)

if __name__ == "__main__":
    # Guard so importing ocdp.__main__ (e.g. by tooling) does not start the server.
    uvicorn.run(app, host="0.0.0.0", port=8000)

137
ocdp/config.py Normal file
View File

@ -0,0 +1,137 @@
import os
import yaml
from dataclasses import dataclass
# ----------------------------
# 1. Configuration data structures (mirror config.yaml plus env-provided secrets)
# ----------------------------
@dataclass
class HashConfig:
    """Password-hashing settings."""
    algorithm: str  # Password hashing algorithm
@dataclass
class PasswordConfig:
    """Password section of the configuration."""
    hash: HashConfig
@dataclass
class KubeConfig:
    """Kubernetes client and application-layout paths."""
    kubectl_file: str  # Path to kubectl config file
    applications_dir: str  # Directory to store Kubernetes applications
    logs_dir: str  # Directory for logs
@dataclass
class OrchestrationConfig:
    """Orchestration section of the configuration."""
    kube: KubeConfig
@dataclass
class LokiConfig:
    """Loki log-shipping settings."""
    url: str  # Loki push endpoint
    labels: str  # Default labels for logs
    label_keys: str  # Optional: comma-separated label keys
@dataclass
class LoggerConfig:
    """Logger section of the configuration."""
    loki: LokiConfig
@dataclass
class MySQLConfig:
    """MySQL connection settings; password is injected from the environment."""
    host: str  # MySQL host
    port: int  # MySQL port
    db_name: str  # Database db_name
    username: str  # Database username
    password: str  # Password read from environment variable
@dataclass
class DatabaseConfig:
    """Database section of the configuration."""
    mysql: MySQLConfig
@dataclass
class JWTConfig:
    """JWT signing settings; secret key is injected from the environment."""
    signing_algorithm: str  # JWT signing algorithm
    secret_key: str  # Secret key read from environment variable
@dataclass
class TokenConfig:
    """Token section of the configuration."""
    jwt: JWTConfig
@dataclass
class AdminConfig:
    """Bootstrap admin credentials, taken from environment variables."""
    username: str
    password: str
@dataclass
class Config:
    """Top-level application configuration aggregate."""
    orchestration: OrchestrationConfig
    logger: LoggerConfig
    database: DatabaseConfig
    password: PasswordConfig
    token: TokenConfig
    admin: AdminConfig
# ----------------------------
# 2. Load YAML configuration and environment variables
# ----------------------------
def load_config(yaml_path: str) -> Config:
    """Build a Config from a YAML file plus secrets taken from the environment.

    Secrets (database password, JWT secret key, admin credentials) are never
    read from YAML; they must be provided as environment variables.

    Args:
        yaml_path: Path to the config YAML file (already expanduser'd).

    Raises:
        ValueError: if a required environment variable is missing.
    """
    with open(yaml_path, "r") as f:
        raw = yaml.safe_load(f)

    # orchestration.kube
    kube_cfg = KubeConfig(**raw["orchestration"]["kube"])
    orchestration_cfg = OrchestrationConfig(kube=kube_cfg)

    # logger.loki
    loki_cfg = LokiConfig(**raw["logger"]["loki"])
    logger_cfg = LoggerConfig(loki=loki_cfg)

    # database.mysql -- password comes from the environment, not YAML
    mysql_raw = raw["database"]["mysql"]
    mysql_password = os.environ.get("DATABASE_MYSQL_PASSWORD")
    if not mysql_password:
        raise ValueError("Environment variable DATABASE_MYSQL_PASSWORD not set")
    mysql_cfg = MySQLConfig(**mysql_raw, password=mysql_password)
    database_cfg = DatabaseConfig(mysql=mysql_cfg)

    # password.hash
    hash_cfg = HashConfig(**raw["password"]["hash"])
    password_cfg = PasswordConfig(hash=hash_cfg)

    # token.jwt -- secret comes from the environment, not YAML
    jwt_raw = raw["token"]["jwt"]
    jwt_secret_key = os.environ.get("TOKEN_JWT_SECRET_KEY")
    if not jwt_secret_key:
        raise ValueError("Environment variable TOKEN_JWT_SECRET_KEY not set")
    jwt_cfg = JWTConfig(**jwt_raw, secret_key=jwt_secret_key)
    token_cfg = TokenConfig(jwt=jwt_cfg)

    # admin -- FIX: validate like the other required secrets instead of
    # silently storing None, which would only fail much later at use time.
    admin_username = os.environ.get("ADMIN_USERNAME")
    admin_password = os.environ.get("ADMIN_PASSWORD")
    if not admin_username or not admin_password:
        raise ValueError("Environment variables ADMIN_USERNAME / ADMIN_PASSWORD not set")
    admin_cfg = AdminConfig(username=admin_username, password=admin_password)

    # Return final Config object
    return Config(
        orchestration=orchestration_cfg,
        logger=logger_cfg,
        database=database_cfg,
        password=password_cfg,
        token=token_cfg,
        admin=admin_cfg,
    )
# ----------------------------
# 3. Module-level configuration loading
# ----------------------------
from dotenv import load_dotenv
load_dotenv()  # Load environment variables from .env file

_config_path = os.environ.get("OCDP_CONFIG_FILE")
if not _config_path:
    # FIX: fail fast with a clear message instead of letting
    # os.path.expanduser(None) raise a TypeError.
    raise ValueError("Environment variable OCDP_CONFIG_FILE not set")
# load_config() either returns a Config or raises, so no None-check is needed.
CONFIG = load_config(os.path.expanduser(_config_path))

# Log only non-sensitive settings.
# SECURITY FIX: the previous version printed the MySQL password and the JWT
# secret key to stdout, leaking secrets into console output and log files.
print("Kube config path:", CONFIG.orchestration.kube.kubectl_file)
print("Password hash algorithm:", CONFIG.password.hash.algorithm)
print("Loki URL:", CONFIG.logger.loki.url)
print("Admin username:", CONFIG.admin.username)

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,20 @@
from fastapi import APIRouter
from .user import router as user_router
from .auth import router as auth_router
from .orchestration import router as orchestration_router

# Aggregate all v1 sub-routers under the common /api/v1 prefix.
router = APIRouter(prefix="/api/v1")
router.include_router(user_router)
router.include_router(auth_router)
router.include_router(orchestration_router)

View File

@ -0,0 +1,10 @@
from fastapi import APIRouter
from . import (
    login
)

# Authentication endpoints (currently only login) mounted under /auth.
router = APIRouter(prefix="/auth")
router.include_router(login.router)

View File

@ -0,0 +1,60 @@
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
from sqlalchemy.orm import Session
from ocdp.database import get_db
from ocdp.services.user import user_service
# Create the API router for the login endpoint.
router = APIRouter()


# Request model for the login endpoint.
class LoginRequest(BaseModel):
    """
    Request body for user login.
    """
    username: str
    password: str
# Response model returned after a successful login.
class LoginResponse(BaseModel):
    """
    Response body returned after a successful login.
    """
    access_token: str  # signed access token issued by the user service
    token_type: str = "bearer"  # OAuth2 bearer scheme
# (The router instance is created above.)
# Login route: exchange a username/password pair for an access token.
@router.post("/login", response_model=LoginResponse)
def login(
    login_in: LoginRequest,
    db: Session = Depends(get_db)
):
    """
    Obtain an access token using a username and password.

    Raises:
        HTTPException 401: if authentication fails.
    """
    # Delegate authentication and token creation to the service layer.
    token = user_service.login_for_access_token(
        username=login_in.username,
        password=login_in.password,
        db=db
    )
    # An empty/None token from the service means authentication failed.
    if not token:
        # 401 Unauthorized, with the WWW-Authenticate header per RFC 6750.
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid username or password",
            headers={"WWW-Authenticate": "Bearer"},
        )
    # Success: LoginResponse supplies token_type="bearer" by default.
    return {"access_token": token}

View File

@ -0,0 +1,11 @@
from fastapi import APIRouter
from .cluster import router as cluster_router
from .application_controller import router as application_router

# Orchestration endpoints (cluster + application) mounted under /orchestration.
router = APIRouter(prefix="/orchestration")
router.include_router(cluster_router)
router.include_router(application_router)

View File

@ -0,0 +1,7 @@
from fastapi import APIRouter

# Placeholder router for application endpoints; no routes are registered yet.
router = APIRouter(prefix="/application")

View File

@ -0,0 +1,30 @@
# list_application_template.py
"""
Controller for Application Templates.
"""
from pydantic import BaseModel, Field
from fastapi import APIRouter, Depends
from cluster_tool import Cluster, get_cluster
from services import application_service
from models.application import ApplicationMetadata
# --- FastAPI Router ---
router = APIRouter(
prefix="/application-templates",
)
@router.get("/", response_model=list[ApplicationTemplate], summary="获取所有可安装的应用模板")
def list_application_templates(cluster: Cluster = Depends(get_cluster)):
"""列出在 `applications_dir` 中所有可供安装的应用及其元数据。"""
return application_service.list_available_applications(cluster)

View File

@ -0,0 +1,87 @@
# ocdp/controllers/application_instances.py
"""
Controller for Application Instances.
(Authentication updated to use OAuth2 Password Bearer flow)
"""
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi.security import OAuth2PasswordBearer
from sqlalchemy.orm import Session
from ocdp.orchestration.cluster import Cluster, get_cluster
from ocdp.services.orchestration import application_service
from ocdp.models.orchestration.application import (
InstalledApplicationInstance, InstallReceipt,
UninstallReceipt, NamespaceDeleteReceipt, ApplicationStatus, ApplicationTemplate
)
# 假设的依赖和 Service 函数导入路径
from ocdp.database import get_db
from ocdp.services.user import user_service
# ----------------
from pydantic import BaseModel
from typing import Optional, Dict, Any
class InstallRequest(BaseModel):
    """Request body for installing a new application instance."""
    app_template_name: str  # name of the template to install
    mode: str  # deployment mode, presumably "distributed" or "monolithic" — confirm against templates
    user_overrides: Optional[Dict[str, Any]] = None  # user-supplied value overrides for the install
# --- FastAPI Router ---
router = APIRouter()

# tokenUrl points at the login endpoint so interactive docs can obtain a token.
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/auth/login")


def get_current_user_id_dependency(token: str = Depends(oauth2_scheme)):
    """
    Dependency that validates the bearer token via user_service and returns
    the authenticated user's id.
    """
    # Delegate token verification/decoding to the user service.
    return user_service.get_user_id_by_token(token)
@router.get("/application-templates", response_model=list[ApplicationTemplate], summary="列出所有可用的应用模板")
def list_application_templates(
cluster: Cluster = Depends(get_cluster)
):
"""
获取系统中所有可用的应用模板列表。
"""
return application_service.list_available_applications(cluster)
@router.get("/application-instances", response_model=list[InstalledApplicationInstance], summary="列出当前用户已安装的应用实例")
def list_application_instances(
# 使用新的依赖函数
user_id: str = Depends(get_current_user_id_dependency),
cluster: Cluster = Depends(get_cluster)
):
# 这里直接使用 user_id业务逻辑保持不变
return application_service.list_user_applications(cluster, user_id)
@router.post("/application-instances", response_model=InstallReceipt, status_code=status.HTTP_202_ACCEPTED, summary="安装一个新的应用实例")
def install_application_instance(
request: InstallRequest,
user_id: str = Depends(get_current_user_id_dependency),
cluster: Cluster = Depends(get_cluster)
):
try:
return application_service.install_new_application(
cluster=cluster, user_id=user_id, app_template_name=request.app_template_name,
mode=request.mode, user_overrides=request.user_overrides
)
except (ValueError, FileNotFoundError) as e:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(e))
except Exception as e:
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An unexpected error occurred: {e}")
# ... (the status / uninstall / delete endpoints below do not depend on user_id) ...
# Status of a single instance; `mode` is a required query parameter.
@router.get("/application-instances/{namespace}/{app_template_name}/status", response_model=ApplicationStatus, summary="获取指定应用实例的状态")
def get_application_instance_status(namespace: str, app_template_name: str, mode: str, cluster: Cluster = Depends(get_cluster)):
    """Return the runtime status of one installed application instance."""
    return application_service.get_instance_status(cluster, namespace, app_template_name, mode)


# Teardown step 1: uninstall the release, leaving the namespace in place.
@router.delete("/application-instances/{namespace}/{app_template_name}", response_model=UninstallReceipt, summary="步骤1卸载应用实例的 Release")
def uninstall_instance_release(namespace: str, app_template_name: str, mode: str, cluster: Cluster = Depends(get_cluster)):
    """Uninstall the application's release (teardown step 1)."""
    return application_service.uninstall_application_release(cluster, namespace, app_template_name, mode)


# Teardown step 2: delete the namespace and everything remaining inside it.
@router.delete("/application-instances/{namespace}", response_model=NamespaceDeleteReceipt, summary="步骤2删除应用实例的命名空间")
def delete_instance_namespace(namespace: str, cluster: Cluster = Depends(get_cluster)):
    """Delete the application's namespace (teardown step 2)."""
    return application_service.delete_application_namespace(cluster, namespace)

View File

@ -0,0 +1,8 @@
from fastapi import APIRouter
from .get_cluster_status import router as get_cluster_status_router

# Cluster-level endpoints mounted under /cluster.
router = APIRouter(prefix="/cluster")
router.include_router(get_cluster_status_router)

View File

@ -0,0 +1,24 @@
# get_cluster_status.py
from fastapi import APIRouter, Depends
from ocdp.orchestration.cluster import Cluster, get_cluster
from ocdp.services.orchestration import cluster_service
from ocdp.models.orchestration.cluster import ClusterStatus
router = APIRouter()


# Helper used with Depends so the endpoint receives a ready-made status object.
def get_status_from_service(cluster: Cluster = Depends(get_cluster)) -> ClusterStatus:
    """Fetch the full cluster status from the service layer (for use in Depends)."""
    return cluster_service.get_cluster_status(cluster)


@router.get(
    "/cluster-status",
    response_model=ClusterStatus,
    summary="Get Comprehensive Cluster Status"
)
def get_comprehensive_cluster_status(status: ClusterStatus = Depends(get_status_from_service)):
    """
    Provides a complete snapshot of the cluster's health and resources.
    """
    return status

View File

@ -0,0 +1,34 @@
from pydantic import BaseModel
from fastapi import APIRouter, Depends, HTTPException
from ocdp.orchestration import Cluster
# 假设的依赖和 Service 函数导入路径
from ocdp.orchestration import get_cluster
from ocdp.services.orchestration import node_service
# --- Response Models for this endpoint ---
class NodeHealthStatus(BaseModel):
    """Health summary for one cluster node."""
    is_ready: bool  # True when the node is Ready
    pressures: dict[str, bool]  # pressure condition name -> True if pressure exists

# Mapping of node name -> health status.
HealthReportResponse = dict[str, NodeHealthStatus]

# --- Router Definition ---
router = APIRouter()


@router.get(
    "/health",
    response_model=HealthReportResponse,
    # summary="Get cluster node health status"
)
def get_health(cluster: Cluster = Depends(get_cluster)):
    """
    Return a health report for every node in the cluster.

    - **is_ready**: whether the node is Ready.
    - **pressures**: per-condition pressure flags; `true` means pressure exists.
    """
    try:
        return node_service.get_cluster_health_report(cluster)
    except RuntimeError as e:
        # Surface service-layer failures as HTTP 500 with the error message.
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,39 @@
from pydantic import BaseModel
from fastapi import APIRouter, Depends, HTTPException
from ocdp.orchestration import Cluster, get_cluster
from ocdp.services.orchestration import node_service
# --- Response Models for this endpoint ---
class ResourceDetail(BaseModel):
    """Aggregate amounts for one resource type (human-readable strings)."""
    total: str
    used: str  # renamed from 'used_by_system' to match the service layer
    free: str  # renamed from 'free_for_pods' to match the service layer

class GPUSummary(BaseModel):
    """Cluster-wide GPU totals."""
    total_count: int
    allocatable_count: int
    models_summary: dict[str, int]  # GPU model name -> count

class ClusterSummaryResponse(BaseModel):
    """Aggregated resource summary for the whole cluster."""
    cpu: ResourceDetail
    memory: ResourceDetail
    storage: ResourceDetail
    gpu: GPUSummary

# --- Router Definition ---
router = APIRouter()


@router.get(
    "/summary/resources",
    response_model=ClusterSummaryResponse,
    # summary="Get aggregated cluster resources"
)
def get_summary_resources(cluster: Cluster = Depends(get_cluster)):
    """
    Return an aggregated resource summary for the entire cluster.
    """
    try:
        return node_service.get_cluster_summary_report(cluster)
    except RuntimeError as e:
        # Surface service-layer failures as HTTP 500 with the error message.
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,44 @@
from pydantic import BaseModel
from fastapi import APIRouter, Depends, HTTPException
from ocdp.orchestration import Cluster, get_cluster
from ocdp.services.orchestration import node_service
# --- Response Models for this endpoint ---
class ResourceDetail(BaseModel):
    """Per-node amounts for one resource type (human-readable strings)."""
    total: str
    used: str  # renamed from 'used_by_system' to match the service layer
    free: str  # renamed from 'free_for_pods' to match the service layer

class GPUInfo(BaseModel):
    """GPU details for one node."""
    count: int
    allocatable_count: int  # added to match the allocatable GPU count returned by the service
    model: str
    memory_mb: int

class NodeResourceDetail(BaseModel):
    """Complete per-node resource report."""
    cpu: ResourceDetail
    memory: ResourceDetail
    storage: ResourceDetail
    gpu: GPUInfo

# Mapping of node name -> resource detail.
NodeResourcesResponse = dict[str, NodeResourceDetail]

# --- Router Definition ---
router = APIRouter()


@router.get(
    "/nodes/resources",
    response_model=NodeResourcesResponse,
    # summary="Get per-node resource details"
)
def list_nodes_resources(cluster: Cluster = Depends(get_cluster)):
    """
    Return a detailed resource-usage report for every node in the cluster.

    - **used**: resources consumed/reserved (formerly `used_by_system`).
    - **free**: resources left for pod scheduling (formerly `free_for_pods`).
    """
    try:
        return node_service.get_per_node_resource_report(cluster)
    except RuntimeError as e:
        # Surface service-layer failures as HTTP 500 with the error message.
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,15 @@
from fastapi import APIRouter
from . import (
    register_user,
    get_current_user
)

# User endpoints (registration, current-user lookup) mounted under /users.
router = APIRouter(prefix="/users")
router.include_router(register_user.router)
router.include_router(get_current_user.router)

View File

@ -0,0 +1,39 @@
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
from pydantic import BaseModel, constr, EmailStr, validator
from sqlalchemy.orm import Session
from ocdp.database import get_db
from ocdp.services.user import user_service
from ocdp.services.user import user_exceptions
router = APIRouter()


class GetCurrentUserResponse(BaseModel):
    """
    Response body describing the currently authenticated user.
    """
    user_id: int
    username: str
    email: EmailStr

    class Config:
        # Pydantic V2 style: build the model from ORM object attributes.
        from_attributes = True
        # Pydantic V1 equivalent:
        # orm_mode = True


# Extracts the bearer token; tokenUrl points at the login endpoint.
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/auth/login")


@router.get("/me", response_model=GetCurrentUserResponse)
def get_current_user(
    db: Session = Depends(get_db),
    token: str = Depends(oauth2_scheme)
):
    """
    Return the profile of the currently authenticated user.

    Raises:
        HTTPException 401: if the token is missing, invalid, or does not
        resolve to a user.
    """
    # Resolve the token to a user via the service layer.
    current_user = user_service.get_current_user(token, db)
    if not current_user:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid authentication credentials")
    return current_user

View File

@ -0,0 +1,79 @@
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, constr, EmailStr, validator
from sqlalchemy.orm import Session
from ocdp.database import get_db
from ocdp.services.user import user_service
from ocdp.services.user import user_exceptions
# 创建一个 API 路由器
router = APIRouter()
# Characters permitted in passwords: ASCII letters, digits and a fixed symbol set.
ALLOWED_PASSWORD_CHARS = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#$%&*_-")
class RegisterUserRequest(BaseModel):
    """
    Request body for user registration.
    """
    username: str
    password: str
    email: EmailStr
    # NOTE(review): `@validator` is the Pydantic V1 API while sibling models use
    # the V2 `from_attributes` config — confirm which Pydantic major version is pinned.
    @validator('password')
    def validate_password(cls, v):
        # Policy: 8-32 chars, must contain a letter and a digit, no whitespace,
        # only characters from ALLOWED_PASSWORD_CHARS.
        if len(v) < 8 or len(v) > 32:
            raise ValueError('密码长度应在8~32位')
        if not any(c.isalpha() for c in v):
            raise ValueError('密码必须包含字母')
        if not any(c.isdigit() for c in v):
            raise ValueError('密码必须包含数字')
        if any(c.isspace() for c in v):
            raise ValueError('密码不能包含空格')
        if any(c not in ALLOWED_PASSWORD_CHARS for c in v):
            raise ValueError('密码包含非法字符')
        return v
class RegisterUserResponse(BaseModel):
    """
    User information returned after successful registration.

    Deliberately excludes sensitive data such as the password.
    """
    id: int
    username: str
    email: EmailStr
    class Config:
        # Pydantic V2 style: allow building the model from ORM attributes.
        from_attributes = True
        # Pydantic V1 equivalent:
        # orm_mode = True
@router.post("/", response_model=RegisterUserResponse, status_code=status.HTTP_201_CREATED)
def register_user(
    user_in: RegisterUserRequest,
    db: Session = Depends(get_db)
):
    """
    Register a new user.

    - **username**: unique user name.
    - **email**: unique e-mail address.
    - **password**: plaintext password (validated by the request model).

    Responds 400 when the username or e-mail is already taken.
    """
    try:
        # Delegate user creation (hashing, persistence) to the service layer.
        created_user = user_service.create_user(
            username=user_in.username,
            password=user_in.password,
            email=user_in.email,
            db=db
        )
        return created_user
    except user_exceptions.UserAlreadyExistsError as e:
        # Duplicate username/email: answer 400 with a clear message.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(e),
        )

View File

@ -0,0 +1,15 @@
from fastapi import APIRouter
# Placeholder v2 API router; no routes registered yet.
router = APIRouter(prefix="/api/v2")

2
ocdp/daos/__init__.py Normal file
View File

@ -0,0 +1,2 @@
from .user import user_dao

View File

View File

@ -0,0 +1,304 @@
# dao.py
import os
import yaml
import json
import time
from pydantic import ValidationError
from ocdp.orchestration.cluster import Cluster
from ocdp.models.orchestration.application import (ApplicationTemplate, ApplicationMetadata, InstallReceipt,
ApplicationStatus, UninstallReceipt, PodStatusDetail,
InstalledApplicationInstance, NamespaceDeleteReceipt) # <-- 导入更新后的模型
# ... (list_application_templates, list_application_instances, _deep_merge 保持不变) ...
def list_application_templates(cluster: Cluster) -> list[ApplicationTemplate]:
    """Return an ApplicationTemplate for each application directory on the cluster.

    Directories whose metadata cannot be loaded or validated are skipped,
    with a warning printed to stdout.
    """
    templates: list[ApplicationTemplate] = []
    for app_dir in cluster.list_applications():
        try:
            raw_metadata = cluster.get_application_metadata(app_dir)
            templates.append(
                ApplicationTemplate(name=app_dir, metadata=ApplicationMetadata(**raw_metadata))
            )
        except Exception as e:
            print(f"⚠️ Warning: Could not load or validate metadata for '{app_dir}': {e}")
    return templates
def list_application_instances(cluster: Cluster, user_id: str) -> list[InstalledApplicationInstance]:
    """List Helm releases installed in namespaces belonging to *user_id*.

    Namespaces are matched by the "<user_id>-" prefix; the application name is
    recovered from the second dash-separated namespace segment.
    Returns [] (after printing the error) on kubectl/helm or JSON failures.
    """
    prefix = f"{user_id}-";
    try:
        # Collect all namespaces named "<user_id>-...".
        ns_json_str = cluster.get("namespaces"); all_ns_data = json.loads(ns_json_str).get("items", [])
        user_namespaces = [ns['metadata']['name'] for ns in all_ns_data if ns['metadata']['name'].startswith(prefix)]
        if not user_namespaces: return []
        releases_json_str = cluster.list_releases(all_namespaces=True, output="json"); all_releases = json.loads(releases_json_str)
        instances = []
        for rel in all_releases:
            if rel.get("namespace") in user_namespaces:
                # Namespace layout assumed "<user>-<app>-<suffix>" — TODO confirm.
                ns_parts = rel.get("namespace").split('-'); app_name = ns_parts[1] if len(ns_parts) > 2 else "unknown"
                instances.append(InstalledApplicationInstance(
                    application_name=app_name, release_name=rel.get("name"), namespace=rel.get("namespace"), chart=rel.get("chart"), status=rel.get("status")
                ))
        return instances
    except (RuntimeError, json.JSONDecodeError) as e:
        print(f"❌ Error listing application instances: {e}"); return []
def _deep_merge(source: dict, destination: dict) -> dict:
for key, value in source.items():
if isinstance(value, dict) and key in destination and isinstance(destination[key], dict):
destination[key] = _deep_merge(value, destination[key])
else:
destination[key] = value
return destination
def install_application(
    cluster,
    namespace,
    app_template_name,
    mode,
    user_overrides=None
) -> InstallReceipt:
    """Install an application template into *namespace* via Helm.

    Parameters
    ----------
    cluster: orchestration cluster wrapper (metadata lookup + helm calls).
    namespace: target namespace (created if missing).
    app_template_name: directory name of the application template.
    mode: deployment-mode attribute on the metadata (e.g. "monolithic").
    user_overrides: optional dict of values merged over the mode's defaults
        (user values win, via _deep_merge).

    Raises
    ------
    ValueError: if *mode* is not defined in the template metadata.
    """
    import tempfile  # local import: only needed for the transient values file

    metadata = cluster.get_application_metadata(app_template_name)
    print(f"Metadata for '{app_template_name}': {metadata}")
    app_meta = ApplicationMetadata(**metadata)
    deployment_mode = getattr(app_meta, mode, None)
    if not deployment_mode:
        raise ValueError(f"Mode '{mode}' not found.")
    release_name = deployment_mode.release_name
    chart_source = deployment_mode.chart
    values_to_set = deployment_mode.sets
    if user_overrides:
        values_to_set = _deep_merge(user_overrides, values_to_set)
    # Write the merged values to a private, unpredictable temp file instead of a
    # fixed /tmp path: avoids collisions between concurrent installs into the
    # same namespace and symlink attacks on a world-writable location.
    with tempfile.NamedTemporaryFile(
        mode='w', prefix=f"values-{namespace}-", suffix=".yaml", delete=False
    ) as f:
        temp_values_path = f.name
        yaml.dump(values_to_set, f)
    try:
        output = cluster.install_release(
            release_name=release_name,
            chart_source=chart_source,
            namespace=namespace,
            config_file=temp_values_path,
            create_namespace=True
        )
        print(output)
        return InstallReceipt(
            application_name=app_meta.application_name,
            release_name=release_name,
            namespace=namespace,
            message=f"Installation triggered successfully. Raw output: {output.strip()}"
        )
    finally:
        # Always remove the transient values file, even when helm fails.
        if os.path.exists(temp_values_path):
            os.remove(temp_values_path)
def uninstall_application_release(cluster: Cluster, namespace: str, app_name: str, mode: str) -> UninstallReceipt:
    """Uninstall the Helm release for *app_name*/*mode* and verify its removal.

    Never raises: every failure path is reported through the returned
    UninstallReceipt (uninstalled_successfully / is_clean / message).
    """
    try:
        # 1. Load and validate the application metadata.
        metadata = cluster.get_application_metadata(app_name)
        app_meta = ApplicationMetadata(**metadata)
        deployment_mode = getattr(app_meta, mode, None)
        if not deployment_mode:
            raise ValueError(f"Mode '{mode}' not found in metadata.")
        release_name = deployment_mode.release_name
        # 2. Uninstall the Helm release.
        output = cluster.uninstall_release(release_name, namespace=namespace, wait=True)
        uninstalled_successfully = True
        # 3. Verify the release is actually gone from Helm's list.
        verification_message = "Verification successful: Release is no longer listed by Helm."
        is_clean = True
        try:
            time.sleep(2)
            releases_json_str = cluster.list_releases(namespace=namespace, output="json")
            releases = json.loads(releases_json_str)
            release_found = any(r['name'] == release_name for r in releases)
            if release_found:
                is_clean = False
                verification_message = "Verification failed: Release is still present in Helm's list."
        except Exception as e:
            verification_message = f"Verification check failed: {e}"
    except (ValidationError, ValueError, RuntimeError) as e:
        # Known pre-processing / runtime errors.
        return UninstallReceipt(
            application_name=app_name,
            release_name=release_name if 'release_name' in locals() else 'unknown',
            namespace=namespace,
            uninstalled_successfully=False,
            is_clean=False,
            message=f"Operation failed due to an error: {e}"
        )
    except Exception as e:
        # Any other unexpected error.
        return UninstallReceipt(
            application_name=app_name,
            release_name=release_name if 'release_name' in locals() else 'unknown',
            namespace=namespace,
            uninstalled_successfully=False,
            is_clean=False,
            message=f"An unexpected error occurred: {e}"
        )
    return UninstallReceipt(
        application_name=app_name,
        release_name=release_name,
        namespace=namespace,
        uninstalled_successfully=uninstalled_successfully,
        is_clean=is_clean,
        message=f"{output.strip()}. {verification_message}"
    )
def delete_namespace(cluster: Cluster, namespace: str) -> NamespaceDeleteReceipt:
    """Delete *namespace* and poll (up to 60s) until it is gone.

    Never raises: failures are reported via the returned receipt.
    """
    app_name = "unknown"
    try:
        # Try to recover the application name from the namespace name
        # (assumed "<user>-<app>-<suffix>" layout — TODO confirm).
        ns_parts = namespace.split('-')
        if len(ns_parts) > 2:
            app_name = ns_parts[1]
    except Exception:
        pass  # if parsing fails, app_name stays 'unknown'
    try:
        # 1. Submit the namespace deletion command.
        output = cluster.delete(resource_type="namespace", name=namespace)
        deleted_successfully = True
        # 2. Verify the namespace has actually been removed.
        is_clean = False
        verification_message = "Delete command submitted. Namespace is terminating."
        try:
            # Poll until the namespace no longer exists or the timeout elapses.
            timeout = 60
            start_time = time.time()
            while time.time() - start_time < timeout:
                try:
                    cluster.get("namespace", name=namespace)
                    time.sleep(5)
                except RuntimeError as e:
                    if "not found" in str(e).lower():
                        is_clean = True
                        verification_message = "Verification successful: Namespace not found."
                        break
                    else:
                        raise e  # re-raise other runtime errors
            if not is_clean:
                verification_message = "Verification failed: Namespace still exists after timeout."
        except Exception as e:
            verification_message = f"Verification check failed: {e}"
    except RuntimeError as e:
        # The delete command itself failed.
        return NamespaceDeleteReceipt(
            application_name=app_name,
            namespace=namespace,
            deleted_successfully=False,
            is_clean=False,
            message=f"Delete namespace command failed: {e}"
        )
    return NamespaceDeleteReceipt(
        application_name=app_name,
        namespace=namespace,
        deleted_successfully=deleted_successfully,
        is_clean=is_clean,
        message=f"{output.strip()}. {verification_message}"
    )
def get_application_status(
    cluster,
    namespace: str,
    app_template_name: str,
    mode: str
):
    """Build an ApplicationStatus report for one installed application.

    Pods are matched by the metadata's pod-name prefix; the app is "ready"
    only when every matched pod is Running with all containers ready.
    Never raises: on any error a degraded ApplicationStatus is returned with
    the error embedded in a single PodStatusDetail entry.
    """
    app_name = "Unknown"
    base_access_url = None
    paths = None
    try:
        # Resolve the deployment mode and pod-name prefix from the metadata.
        metadata_dict = cluster.get_application_metadata(app_template_name)
        app_meta = ApplicationMetadata(**metadata_dict)
        deployment_mode = getattr(app_meta, mode, None)
        if not deployment_mode:
            raise ValueError(f"Mode '{mode}' not found.")
        app_name = app_meta.application_name
        if not deployment_mode.pod or not deployment_mode.pod.name:
            raise ValueError("Pod name pattern is not defined.")
        pod_name_pattern = deployment_mode.pod.name
        if deployment_mode.svc:
            base_access_url = deployment_mode.svc.url
            paths = deployment_mode.svc.paths
        # Fetch all pods in the namespace and keep those matching the prefix.
        pods_json_str = cluster.get("pods", namespace=namespace)
        all_pods = json.loads(pods_json_str).get("items", [])
        target_pods = [p for p in all_pods if p.get('metadata', {}).get('name', '').startswith(pod_name_pattern)]
        if not target_pods:
            # No matching pods yet: report not-ready with empty details.
            return ApplicationStatus(
                application_name=app_name,
                namespace=namespace,
                is_ready=False,
                base_access_url=base_access_url,
                paths=paths,
                details=[]
            )
        all_ready = True
        pod_details = []
        for pod in target_pods:
            pod_name = pod['metadata']['name']
            container_statuses = pod.get('status', {}).get('containerStatuses', [])
            pod_phase = pod.get('status', {}).get('phase', '')
            ready_count = sum(1 for s in container_statuses if s.get('ready'))
            total_count = len(container_statuses)
            # Ready means phase Running AND every container reports ready.
            pod_is_ready = (pod_phase == 'Running') and (ready_count == total_count)
            if not pod_is_ready:
                all_ready = False
            pod_details.append(
                PodStatusDetail(
                    pod_name=pod_name,
                    is_ready=pod_is_ready,
                    ready_status=f"{ready_count}/{total_count}",
                    status_phase=pod_phase
                )
            )
        return ApplicationStatus(
            application_name=app_name,
            namespace=namespace,
            is_ready=all_ready,
            base_access_url=base_access_url,
            paths=paths,
            details=pod_details
        )
    except (ValidationError, json.JSONDecodeError, KeyError, ValueError, AttributeError) as e:
        # Known parsing/validation errors: degrade to an error status entry.
        return ApplicationStatus(
            application_name=app_name,
            namespace=namespace,
            is_ready=False,
            base_access_url=base_access_url,
            paths=paths,
            details=[PodStatusDetail(pod_name="Error", is_ready=False, ready_status="0/0", status_phase=f"Error: {e}")]
        )
    except Exception as e:
        # Anything else unexpected: same degraded shape, different label.
        return ApplicationStatus(
            application_name=app_name,
            namespace=namespace,
            is_ready=False,
            base_access_url=base_access_url,
            paths=paths,
            details=[PodStatusDetail(pod_name="Unexpected Error", is_ready=False, ready_status="0/0", status_phase=f"Error: {e}")]
        )

View File

@ -0,0 +1,241 @@
# cluster_dao.py
"""
Data Access Object (DAO) 层 - 函数式实现。
本模块负责执行 kubectl 命令,并解析其输出,将其转换为结构化的 Pydantic 模型。
所有与数据获取、解析、转换和计算相关的逻辑都集中在此。
"""
import re
from ocdp.orchestration.cluster import Cluster
from ocdp.models.orchestration.cluster import (
ClusterStatus, ClusterSummary, ClusterHealthSummary, ClusterResourceSummary,
NodeInfo, NodeHealth, NodeCondition, ResourceUsage, GPUType, GPUUsage, GPUInfo,
PodsUsage, PodDetail, TotalResourceUsage, PodsTotalUsage, MaxFreeNodeInfo,
MaxFreeGPUNodeInfo, DistributedGPUAvailability
)
# ... (辅助函数 _parse_size_to_kib, _parse_cpu 等保持不变) ...
def _parse_size_to_kib(size_str: str | None) -> int:
if not size_str or not size_str[0].isdigit(): return 0
size_str = size_str.lower()
val_match = re.search(r'(\d+)', size_str)
if not val_match: return 0
val = int(val_match.group(1))
if 'gi' in size_str: return val * 1024 * 1024
if 'mi' in size_str: return val * 1024
if 'ki' in size_str: return val
return val // 1024
def _format_size_from_kib(kib: int) -> str:
if kib < 0: kib = 0
if kib >= 1024 * 1024: return f"{round(kib / (1024 * 1024), 2)}Gi"
if kib >= 1024: return f"{round(kib / 1024, 2)}Mi"
return f"{kib}Ki"
def _parse_cpu(cpu_str: str | None) -> int:
if not cpu_str or not cpu_str[0].isdigit(): return 0
if 'm' in cpu_str: return int(cpu_str.replace('m', ''))
if '.' in cpu_str: return int(float(cpu_str) * 1000)
if cpu_str.isdigit(): return int(cpu_str) * 1000
return 0
def _format_cpu(millicores: int) -> str:
if millicores < 0: millicores = 0
if millicores < 1000: return f"{millicores}m"
return str(round(millicores / 1000, 3))
def _find_value(pattern: str, text: str, flags: int = 0) -> str | None:
match = re.search(pattern, text, flags)
return match.group(1).strip() if match else None
def _parse_pods_table(full_node_text: str) -> list[PodDetail]:
    """Parse the "Non-terminated Pods" table of `kubectl describe node` output.

    Returns one PodDetail per data row; rows that do not split into exactly
    7 whitespace-separated columns are skipped. Returns [] when the table or
    its header cannot be located.
    """
    # The table sits between "Non-terminated Pods:" and the next section.
    pods_block_match = re.search(r"Non-terminated Pods:(.*?)(?=\nAllocated resources:|\nEvents:)", full_node_text, re.DOTALL)
    if not pods_block_match: return []
    pods_text = pods_block_match.group(1).strip()
    lines = pods_text.split('\n')
    header_index = -1
    for i, line in enumerate(lines):
        if "Namespace" in line and "Name" in line and "CPU Requests" in line:
            header_index = i
            break
    if header_index == -1 or len(lines) <= header_index + 1: return []
    header = lines[header_index]
    # NOTE(review): col_starts records column offsets but is never used below;
    # rows are actually split on runs of 2+ spaces instead.
    col_starts = {"ns": header.find("Namespace"), "name": header.find("Name"), "cpu_req": header.find("CPU Requests"), "cpu_lim": header.find("CPU Limits"), "mem_req": header.find("Memory Requests"), "mem_lim": header.find("Memory Limits"), "age": header.find("Age"),}
    pod_list = []
    for line in lines[header_index + 1:]:
        if not line.strip() or "---" in line: continue
        parts = re.split(r'\s{2,}', line.strip())
        if len(parts) != 7: continue
        pod_list.append(PodDetail(
            namespace=parts[0], name=parts[1], cpu_requests=parts[2], cpu_limits=parts[3],
            memory_requests=parts[4], memory_limits=parts[5], age=parts[6]
        ))
    return pod_list
def _parse_key_value_block(text: str) -> dict[str, str]:
data = {}
for line in text.strip().split('\n'):
parts = line.split(':', 1)
if len(parts) == 2:
key = parts[0].strip()
value = parts[1].strip()
data[key] = value
return data
def _parse_single_node(text: str) -> NodeInfo | None:
    """Parse one node's section of `kubectl describe nodes` into a NodeInfo.

    Extracts name, roles, labels, conditions, Allocatable totals, "Allocated
    resources" requests, the running-pods table and (if present) GPU info.
    All `free` fields are initialised to placeholder zeros; the caller
    (get_cluster_status) computes them afterwards. Returns None when no
    "Name:" line is found. (A leftover debug print of the optional resources
    was removed here — it spammed stdout on every node parse.)
    """
    name = _find_value(r"Name:\s*(\S+)", text)
    if not name: return None
    # Roles: "<none>" is mapped to the single role "worker".
    roles_str = _find_value(r"Roles:\s*([^\n]*)", text) or "<none>"
    roles = [r.strip() for r in roles_str.split(',')] if roles_str != '<none>' else ['worker']
    # Labels block: one "key=value" per line.
    labels_text = _find_value(r"Labels:(.*?)(?=\nAnnotations:)", text, re.DOTALL) or ""
    labels = {}
    if labels_text:
        for line in labels_text.strip().split('\n'):
            key_value = line.strip().split('=', 1)
            if len(key_value) == 2:
                labels[key_value[0]] = key_value[1]
    # Conditions table: skip the two header lines, split rows on 2+ spaces.
    conditions_text = _find_value(r"Conditions:(.*?)(?=\nAddresses:)", text, re.DOTALL) or ""
    conditions = []
    for line in conditions_text.strip().split('\n')[2:]:
        parts = re.split(r'\s{2,}', line.strip())
        if len(parts) >= 6:
            conditions.append(NodeCondition(type=parts[0], status=parts[1], last_heartbeat_time=parts[2], last_transition_time=parts[3], reason=parts[4], message=parts[5]))
    health = NodeHealth(conditions=conditions)
    # Allocatable: resource totals available for pods.
    allocatable_text = _find_value(r"Allocatable:(.*?)(?=\nSystem Info:)", text, re.DOTALL) or ""
    allocatable = _parse_key_value_block(allocatable_text)
    # Allocated resources: the "Requests" column per resource.
    allocated_block = _find_value(r"Allocated resources:(.*?)(?=\nEvents:)", text, re.DOTALL) or ""
    allocated_requests = {}
    for line in allocated_block.strip().split('\n'):
        line = line.strip()
        if not line or line.startswith('(') or line.startswith('Resource') or line.startswith('---'):
            continue
        match = re.match(r'^(\S+)\s+(\S+)', line)
        if match:
            resource, request_val = match.groups()
            allocated_requests[resource] = request_val
    pods = PodsUsage(
        total=int(allocatable.get("pods", "0")),
        used=int(_find_value(r"Non-terminated Pods:\s*\((\d+) in total\)", text) or "0"),
        free=0
    )
    cpu = ResourceUsage(total=allocatable.get("cpu", "0"), used=allocated_requests.get("cpu", "0m"), free="0")
    memory = ResourceUsage(total=allocatable.get("memory", "0Ki"), used=allocated_requests.get("memory", "0Ki"), free="0")
    # Optional resources: only present on nodes that expose them.
    ephemeral_storage, hugepages_1Gi, hugepages_2Mi, rdma_shared_device_a = None, None, None, None
    if "ephemeral-storage" in allocatable:
        ephemeral_storage = ResourceUsage(
            total=allocatable["ephemeral-storage"],
            used=allocated_requests.get("ephemeral-storage", "0Ki"),
            free="0"
        )
    if "hugepages-1Gi" in allocatable:
        hugepages_1Gi = ResourceUsage(
            total=allocatable["hugepages-1Gi"],
            used=allocated_requests.get("hugepages-1Gi", "0"),
            free="0"
        )
    if "hugepages-2Mi" in allocatable:
        hugepages_2Mi = ResourceUsage(
            total=allocatable["hugepages-2Mi"],
            used=allocated_requests.get("hugepages-2Mi", "0"),
            free="0"
        )
    if "rdma/rdma_shared_device_a" in allocatable:
        rdma_shared_device_a = ResourceUsage(
            total=allocatable["rdma/rdma_shared_device_a"],
            used=allocated_requests.get("rdma/rdma_shared_device_a", "0"),
            free="0"
        )
    running_pods = _parse_pods_table(text)
    # GPU info only when the node advertises nvidia.com/gpu devices.
    gpu_info = None
    gpu_total = int(allocatable.get("nvidia.com/gpu", "0"))
    if gpu_total > 0:
        gpu_usage = GPUUsage(total=gpu_total, used=int(allocated_requests.get("nvidia.com/gpu", "0")), free=0)
        gpu_type = GPUType(product=labels.get("nvidia.com/gpu.product", "Unknown"), memory_mb=int(labels.get("nvidia.com/gpu.memory", "0")))
        gpu_info = GPUInfo(usage=gpu_usage, types=[gpu_type])
    return NodeInfo(
        name=name, roles=roles, labels=labels, health=health, cpu=cpu, memory=memory, pods=pods,
        ephemeral_storage=ephemeral_storage, hugepages_1Gi=hugepages_1Gi,
        hugepages_2Mi=hugepages_2Mi, rdma_shared_device_a=rdma_shared_device_a,
        gpu_info=gpu_info, running_pods=running_pods
    )
def get_cluster_status(cluster: Cluster) -> ClusterStatus:
    """Build the full ClusterStatus: per-node details plus cluster-wide summary.

    Runs `kubectl describe nodes`, splits the output into per-node sections,
    parses each, fills in every `free` value, then aggregates health, totals,
    per-resource "best node" picks and per-model free GPU counts.
    """
    raw_output = cluster.describe("nodes")
    # Each node section starts with a "Name:" line.
    node_texts = re.split(r'\n(?=Name:\s+)', raw_output.strip())
    nodes = [_parse_single_node(text) for text in node_texts if text.strip()]
    nodes = [node for node in nodes if node is not None]
    # --- Compute the `free` value for every ResourceUsage object ---
    for node in nodes:
        node.cpu.free = _format_cpu(_parse_cpu(node.cpu.total) - _parse_cpu(node.cpu.used))
        node.memory.free = _format_size_from_kib(_parse_size_to_kib(node.memory.total) - _parse_size_to_kib(node.memory.used))
        node.pods.free = node.pods.total - node.pods.used
        if node.gpu_info:
            node.gpu_info.usage.free = node.gpu_info.usage.total - node.gpu_info.usage.used
        if node.ephemeral_storage:
            node.ephemeral_storage.free = _format_size_from_kib(_parse_size_to_kib(node.ephemeral_storage.total) - _parse_size_to_kib(node.ephemeral_storage.used))
        # hugepages and rdma are plain integers; compute directly.
        if node.hugepages_1Gi:
            node.hugepages_1Gi.free = str(int(node.hugepages_1Gi.total) - int(node.hugepages_1Gi.used))
        if node.hugepages_2Mi:
            node.hugepages_2Mi.free = str(int(node.hugepages_2Mi.total) - int(node.hugepages_2Mi.used))
        if node.rdma_shared_device_a:
            node.rdma_shared_device_a.free = str(int(node.rdma_shared_device_a.total) - int(node.rdma_shared_device_a.used))
    # --- Cluster-wide aggregation ---
    health_summary = ClusterHealthSummary(total_nodes=len(nodes), ready_nodes=sum(1 for n in nodes if n.health.overall_status == "Ready"), unhealthy_nodes=sum(1 for n in nodes if n.health.overall_status != "Ready"))
    total_cpu_m = sum(_parse_cpu(n.cpu.total) for n in nodes)
    used_cpu_m = sum(_parse_cpu(n.cpu.used) for n in nodes)
    total_mem_kib = sum(_parse_size_to_kib(n.memory.total) for n in nodes)
    used_mem_kib = sum(_parse_size_to_kib(n.memory.used) for n in nodes)
    total_storage_kib = sum(_parse_size_to_kib(n.ephemeral_storage.total if n.ephemeral_storage else None) for n in nodes)
    used_storage_kib = sum(_parse_size_to_kib(n.ephemeral_storage.used if n.ephemeral_storage else None) for n in nodes)
    cluster_total_cpu = TotalResourceUsage(total=_format_cpu(total_cpu_m), used=_format_cpu(used_cpu_m), free=_format_cpu(total_cpu_m - used_cpu_m))
    cluster_total_memory = TotalResourceUsage(total=_format_size_from_kib(total_mem_kib), used=_format_size_from_kib(used_mem_kib), free=_format_size_from_kib(total_mem_kib - used_mem_kib))
    # Storage summary only when at least one node reports ephemeral storage.
    cluster_total_storage = TotalResourceUsage(total=_format_size_from_kib(total_storage_kib), used=_format_size_from_kib(used_storage_kib), free=_format_size_from_kib(total_storage_kib - used_storage_kib)) if total_storage_kib > 0 else None
    total_pods = sum(n.pods.total for n in nodes)
    used_pods = sum(n.pods.used for n in nodes)
    cluster_total_pods = PodsTotalUsage(total=total_pods, used=used_pods, free=total_pods - used_pods)
    # "Best node" picks: most free CPU, most free memory, and the GPU node with
    # the largest free-GPU-count x per-GPU-memory product.
    best_cpu_node, best_mem_node, best_gpu_node = None, None, None
    if nodes:
        cpu_leader = max(nodes, key=lambda n: _parse_cpu(n.cpu.free))
        best_cpu_node = MaxFreeNodeInfo(node_name=cpu_leader.name, free_amount=cpu_leader.cpu.free)
        mem_leader = max(nodes, key=lambda n: _parse_size_to_kib(n.memory.free))
        best_mem_node = MaxFreeNodeInfo(node_name=mem_leader.name, free_amount=mem_leader.memory.free)
        gpu_nodes = [n for n in nodes if n.gpu_info and n.gpu_info.types]
        if gpu_nodes:
            gpu_leader = max(gpu_nodes, key=lambda n: n.gpu_info.usage.free * n.gpu_info.types[0].memory_mb)
            best_gpu_node = MaxFreeGPUNodeInfo(
                node_name=gpu_leader.name,
                free_gpu_count=gpu_leader.gpu_info.usage.free,
                memory_per_gpu_mb=gpu_leader.gpu_info.types[0].memory_mb,
                total_potential_memory_gb=round(gpu_leader.gpu_info.usage.free * gpu_leader.gpu_info.types[0].memory_mb / 1024, 2)
            )
    # Free GPUs across the cluster, aggregated per GPU product model.
    dist_gpu_map = {}
    for node in nodes:
        if node.gpu_info:
            for gpu_type in node.gpu_info.types:
                if gpu_type.product not in dist_gpu_map:
                    dist_gpu_map[gpu_type.product] = {"product": gpu_type.product, "memory_per_gpu_mb": gpu_type.memory_mb, "total_free_count": 0}
                dist_gpu_map[gpu_type.product]["total_free_count"] += node.gpu_info.usage.free
    distributed_gpu_availability = [DistributedGPUAvailability(**data) for data in dist_gpu_map.values()]
    resource_summary = ClusterResourceSummary(
        cluster_total_cpu=cluster_total_cpu,
        cluster_total_memory=cluster_total_memory,
        cluster_total_pods=cluster_total_pods,
        cluster_total_ephemeral_storage=cluster_total_storage,
        best_node_for_cpu=best_cpu_node,
        best_node_for_memory=best_mem_node,
        best_node_for_gpu_app=best_gpu_node,
        distributed_gpu_availability=distributed_gpu_availability
    )
    cluster_summary = ClusterSummary(health=health_summary, resources=resource_summary)
    return ClusterStatus(summary=cluster_summary, nodes=nodes)

View File

View File

@ -0,0 +1,36 @@
from sqlalchemy.orm import Session
from ocdp.models.user import User
def get_user_by_id(user_id: int, db: Session = None):
    """Return the User with *user_id*, or None if absent."""
    # NOTE(review): the `db=None` default is unusable — callers must pass a session.
    return db.query(User).filter(User.user_id == user_id).first()
def get_user_by_username(username: str, db: Session = None):
    """Return the User with *username*, or None if absent."""
    return db.query(User).filter(User.username == username).first()
def get_user_by_email(email: str, db: Session = None):
    """Return the User with *email*, or None if absent."""
    return db.query(User).filter(User.email == email).first()
def add_user(user: User, db: Session = None):
    """Persist a new User, commit, and return it refreshed (with DB-generated fields)."""
    db.add(user)
    db.commit()
    db.refresh(user)
    return user
def update_user(user: User, db: Session = None):
    """Commit pending changes on *user* (must already be attached to the session) and refresh it."""
    db.commit()
    db.refresh(user)
    return user
def delete_user(user: User, db: Session = None):
    """Delete *user* and commit; always returns True."""
    db.delete(user)
    db.commit()
    return True

View File

@ -0,0 +1,3 @@
from .database import get_db, Base

36
ocdp/database/database.py Normal file
View File

@ -0,0 +1,36 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from urllib.parse import quote_plus
from ocdp.config import CONFIG
# --- MySQL engine / session setup ---
# Connection settings come from the application config; the password is
# URL-escaped with quote_plus because it is interpolated into the DSN.
mysql_host = CONFIG.database.mysql.host
mysql_port = CONFIG.database.mysql.port
mysql_username = CONFIG.database.mysql.username
mysql_password = quote_plus(CONFIG.database.mysql.password)
mysql_db_name = CONFIG.database.mysql.db_name
mysql_url = f"mysql+pymysql://{mysql_username}:{mysql_password}@{mysql_host}:{mysql_port}/{mysql_db_name}?charset=utf8mb4"
engine = create_engine(
    mysql_url
)
# Session factory and declarative base shared by all ORM models.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()
def get_db():
    """FastAPI dependency yielding a SQLAlchemy session.

    Rolls back the transaction if the request raises, and always closes the
    session afterwards.

    Yields:
        Session: a session scoped to the current request.
    """
    db = SessionLocal()
    try:
        yield db
    except Exception:
        # Roll back the aborted transaction, then re-raise the active
        # exception unchanged (bare `raise` is the idiomatic form).
        db.rollback()
        raise
    finally:
        db.close()

42
ocdp/logger.py Normal file
View File

@ -0,0 +1,42 @@
import os
import sys
from loguru import logger
from loguru_loki_handler import loki_handler
from ocdp.config import CONFIG
# Loki labels arrive from config as "k=v,k2=v2"; parse into a dict.
LABELS = CONFIG.logger.loki.labels
LABELS = dict(item.split("=") for item in LABELS.split(",")) if LABELS else {}
# Comma-separated key list from config; presumably record fields promoted to
# Loki labels — confirm against loguru_loki_handler's docs.
LABEL_KEYS = CONFIG.logger.loki.label_keys
LABEL_KEYS = LABEL_KEYS.split(",") if LABEL_KEYS else []
URL = CONFIG.logger.loki.url
LOGS_DIR = CONFIG.orchestration.kube.logs_dir
LOGS_DIR = os.path.expanduser(LOGS_DIR)
# Four sinks: stdout (INFO+), stderr (ERROR+), a daily-rotated zipped JSON
# file (DEBUG+), and the Loki push endpoint (INFO+).
logger.configure(handlers=[
    {
        "sink": sys.stdout,
        "level": "INFO",
    },
    {
        "sink": sys.stderr,
        "level": "ERROR",
    },
    {
        "sink": f"{LOGS_DIR}/app.log",
        "serialize": True,
        "level": "DEBUG",
        "rotation": "1 day",
        "compression": "zip"
    },
    {
        "sink": loki_handler(
            url=URL,
            labels=LABELS,
            labelKeys=LABEL_KEYS
        ),
        "level": "INFO"
    },
])

1
ocdp/models/__init__.py Normal file
View File

@ -0,0 +1 @@

View File

View File

@ -0,0 +1,74 @@
# models.py
"""
定义了用于应用编排的所有 Pydantic 数据模型。
"""
from pydantic import BaseModel, Field
# ... (InstallationConfig, ApplicationDeploymentMode, ApplicationMetadata 保持不变) ...
class SvcInfo(BaseModel):
    """Service exposure info for a deployment mode."""
    # Per the metadata YAML, url may be "~" (i.e. None).
    svc_type: str
    protocol: str
    hostname: str
    port: int
    url: str | None = None  # str or None
    paths: dict[str, str] | None = None  # dict or None
class PodInfo(BaseModel):
    """Pod-name pattern (matched as a prefix) used to locate an app's pods."""
    name: str
class ApplicationDeploymentMode(BaseModel):
    """One installable variant of an application (a Helm release recipe)."""
    method: str
    release_name: str
    chart: str
    sets: dict = Field(default_factory=dict)  # default Helm values for this mode
    svc: SvcInfo
    pod: PodInfo
class ApplicationMetadata(BaseModel):
    """Parsed metadata.yaml: the application name and its two deployment modes."""
    application_name: str
    distributed: ApplicationDeploymentMode
    monolithic: ApplicationDeploymentMode
class ApplicationTemplate(BaseModel):
    """An application template available for installation."""
    name: str = Field(..., description="应用模板的名称 (文件夹名)")
    metadata: ApplicationMetadata = Field(..., description="从 metadata.yaml 解析出的完整配置")
class InstalledApplicationInstance(BaseModel):
    """An installed application instance (one Helm release in a namespace)."""
    application_name: str = Field(..., description="应用的业务名称")
    release_name: str = Field(..., description="部署的 Helm Release 名称")
    namespace: str = Field(..., description="应用实例所在的唯一命名空间")
    chart: str = Field(..., description="所使用的 Helm Chart")
    status: str = Field(..., description="Helm Release 的状态 (e.g., 'deployed', 'failed')")
class InstallReceipt(BaseModel):
    """Receipt returned to the client after an install is successfully triggered."""
    application_name: str; release_name: str; namespace: str; message: str
class UninstallReceipt(BaseModel):
    """Receipt for uninstalling a Helm Release."""
    application_name: str; release_name: str; namespace: str;
    uninstalled_successfully: bool; is_clean: bool; message: str
# --- Key change: added the application_name field ---
class NamespaceDeleteReceipt(BaseModel):
    """Receipt for deleting a Namespace."""
    application_name: str = Field(..., description="被删除实例的应用业务名称")
    namespace: str = Field(..., description="被删除的命名空间")
    deleted_successfully: bool = Field(..., description="delete 命令是否成功提交")
    is_clean: bool = Field(..., description="验证步骤Namespace 是否已从集群中清除")
    message: str = Field(..., description="操作结果消息")
class PodStatusDetail(BaseModel):
    """Detailed status of a single Pod."""
    pod_name: str; is_ready: bool; ready_status: str; status_phase: str | None
class ApplicationStatus(BaseModel):
    """Aggregated readiness report for an installed application."""
    application_name: str
    namespace: str
    is_ready: bool
    base_access_url: str | None  # str or None
    paths: dict | None  # dict or None
    details: list[PodStatusDetail]

View File

@ -0,0 +1,156 @@
# models.py
"""
定义了用于解析和展示 Kubernetes 集群状态的所有 Pydantic 数据模型。
这些模型被用于 API 的请求响应、数据校验以及各层之间的数据传输。
"""
from pydantic import BaseModel, Field, computed_field
# ---------------------------------------------------------------------------
# I. 单个 K8s 对象的详细模型 (Detailed Models for Single K8s Objects)
# ---------------------------------------------------------------------------
class PodDetail(BaseModel):
"""代表节点上运行的一个 Pod 的详细资源占用信息。"""
namespace: str = Field(..., description="Pod 所在的命名空间")
name: str = Field(..., description="Pod 的名称")
cpu_requests: str = Field(..., description="CPU 请求量")
cpu_limits: str = Field(..., description="CPU 限制量")
memory_requests: str = Field(..., description="内存请求量")
memory_limits: str = Field(..., description="内存限制量")
age: str = Field(..., description="Pod 的运行时长")
class NodeCondition(BaseModel):
"""代表从 `kubectl describe node` 输出中解析出的单个 Condition 行。"""
type: str
status: str
last_heartbeat_time: str
last_transition_time: str
reason: str
message: str
class NodeHealth(BaseModel):
    """Node health wrapper built from the kubectl Conditions list."""
    conditions: list[NodeCondition] = Field(..., description="节点的健康状况条件列表")
    @computed_field
    @property
    def overall_status(self) -> str:
        """Collapse conditions to "Ready"/"NotReady" via the 'Ready' condition, else "Unknown"."""
        for condition in self.conditions:
            if condition.type == "Ready":
                return "Ready" if condition.status == "True" else "NotReady"
        return "Unknown"
class ResourceUsage(BaseModel):
"""通用资源使用情况模型用于表示带单位的资源如CPU, Memory, 存储)。"""
total: str = Field(..., description="资源总量 (来自 Allocatable)")
used: str = Field(..., description="已用资源量 (来自 Allocated Requests)")
free: str = Field(..., description="剩余可用资源量 (计算得出)")
class PodsUsage(BaseModel):
"""Pod 使用情况模型,表示可调度 Pod 的数量。"""
total: int = Field(..., description="节点可容纳的 Pod 总数 (Capacity)")
used: int = Field(..., description="节点上当前运行的 Pod 数量")
free: int = Field(..., description="剩余可调度的 Pod 数量 (计算得出)")
class GPUUsage(BaseModel):
"""表示节点上 GPU 设备的调度使用情况 (使用整数)。"""
total: int = Field(..., description="GPU 设备总数")
used: int = Field(..., description="已被 Pod 请求的 GPU 数量")
free: int = Field(..., description="空闲可用的 GPU 数量")
class GPUType(BaseModel):
"""表示节点上 GPU 的物理规格。"""
product: str = Field(..., description="GPU 产品型号")
memory_mb: int = Field(..., description="单块 GPU 的显存大小 (MB)")
class GPUInfo(BaseModel):
"""统一的、包含嵌套信息的 GPU 汇总对象。"""
usage: GPUUsage = Field(..., description="GPU 数量统计")
types: list[GPUType] = Field(..., description="节点上的 GPU 型号列表")
class NodeInfo(BaseModel):
"""核心模型,完整地描述了一个节点的所有相关信息。"""
name: str = Field(..., description="节点名称")
roles: list[str] = Field(..., description="节点角色")
labels: dict[str, str] = Field(..., description="节点的标签集合")
health: NodeHealth = Field(..., description="节点健康状况")
cpu: ResourceUsage = Field(..., description="CPU 资源使用情况")
memory: ResourceUsage = Field(..., description="内存资源使用情况")
pods: PodsUsage = Field(..., description="Pod 使用情况")
# 注意: 以下资源并非在所有节点上都存在,因此使用 `| None` (可选)。
# 在 API 输出时,如果值为 None这些字段将不会出现。
ephemeral_storage: ResourceUsage | None = Field(None, description="临时存储资源使用情况")
hugepages_1Gi: ResourceUsage | None = Field(None, description="1Gi 大页内存使用情况")
hugepages_2Mi: ResourceUsage | None = Field(None, description="2Mi 大页内存使用情况")
rdma_shared_device_a: ResourceUsage | None = Field(None, description="RDMA 共享设备使用情况")
gpu_info: GPUInfo | None = Field(None, description="节点上所有 GPU 的汇总信息")
running_pods: list[PodDetail] = Field(..., description="在该节点上运行的 Pod 列表")
# ---------------------------------------------------------------------------
# II. 集群级别汇总信息模型 (Cluster-Level Summary Information Models)
# ---------------------------------------------------------------------------
# Section 1: 集群资源总览模型
class TotalResourceUsage(BaseModel):
"""用于集群总览的资源使用情况模型(字符串类型)。"""
total: str; used: str; free: str
class PodsTotalUsage(BaseModel):
"""用于集群总览的 Pod 使用情况模型(整数类型)。"""
total: int; used: int; free: int
# Section 2: 单机最大承载能力模型
class MaxFreeNodeInfo(BaseModel):
"""用于在汇总信息中标识拥有最多空闲资源的节点。"""
node_name: str = Field(..., description="节点名称")
free_amount: str = Field(..., description="空闲资源量(带单位)")
class MaxFreeGPUNodeInfo(BaseModel):
"""专用于标识最适合部署大型单机 GPU 应用的节点。"""
node_name: str = Field(..., description="节点名称")
free_gpu_count: int = Field(..., description="该节点上的空闲 GPU 数量")
memory_per_gpu_mb: int = Field(..., description="该型号 GPU 的单卡显存")
total_potential_memory_gb: float = Field(..., description="空闲 GPU 总显存潜力 (GB), 计算公式: free_gpu_count * memory_per_gpu_mb")
# Section 3: 分布式应用潜力模型
class DistributedGPUAvailability(BaseModel):
"""按型号汇总集群中所有可用(空闲)的 GPU用于评估分布式应用潜力。"""
product: str = Field(..., description="GPU 产品型号")
memory_per_gpu_mb: int = Field(..., description="该型号 GPU 的单卡显存")
total_free_count: int = Field(..., description="该型号 GPU 在整个集群中的空闲总数")
# --- Main summary model ---
class ClusterResourceSummary(BaseModel):
    """Detailed cluster resource summary: overview, best single-node capacity, distributed potential."""
    # Part 1: cluster-wide resource overview
    cluster_total_cpu: TotalResourceUsage = Field(..., description="集群 CPU 资源总览")
    cluster_total_memory: TotalResourceUsage = Field(..., description="集群内存资源总览")
    cluster_total_pods: PodsTotalUsage = Field(..., description="集群 Pod 容量总览")
    cluster_total_ephemeral_storage: TotalResourceUsage | None = Field(None, description="集群临时存储资源总览")
    # Part 2: maximum single-node capacity
    best_node_for_cpu: MaxFreeNodeInfo | None = Field(None, description="拥有最多空闲CPU的节点")
    best_node_for_memory: MaxFreeNodeInfo | None = Field(None, description="拥有最多空闲内存的节点")
    best_node_for_gpu_app: MaxFreeGPUNodeInfo | None = Field(None, description="最适合部署大型单机GPU应用的节点")
    # Part 3: distributed-application potential
    distributed_gpu_availability: list[DistributedGPUAvailability] = Field(..., description="按型号汇总的、整个集群的空闲GPU数量")
class ClusterHealthSummary(BaseModel):
    """Brief summary of overall cluster health."""

    total_nodes: int
    ready_nodes: int
    unhealthy_nodes: int
class ClusterSummary(BaseModel):
    """Intermediate model combining the health and resource summaries."""
    health: ClusterHealthSummary
    resources: ClusterResourceSummary
class ClusterStatus(BaseModel):
    """Top-level API response model carrying the complete cluster state."""
    summary: ClusterSummary = Field(..., description="集群的整体汇总信息")
    nodes: list[NodeInfo] = Field(..., description="集群中所有节点的详细信息列表")

View File

@ -0,0 +1,55 @@
import json
class Resource:
    """Recursively wrap a JSON/dict structure as attribute-accessible objects.

    Nested dicts become Resource instances; dicts found inside lists are
    converted element-wise.  Non-dict input is stored on the ``value``
    attribute unchanged.
    """

    def __init__(self, data):
        if not isinstance(data, dict):
            self.value = data  # plain (non-dict) values live under `value`
            return
        for key, val in data.items():
            setattr(self, key, self._wrap(val))

    @staticmethod
    def _wrap(val):
        """Convert one value: dict -> Resource, list -> element-wise, else unchanged."""
        if isinstance(val, dict):
            return Resource(val)
        if isinstance(val, list):
            return [Resource(item) if isinstance(item, dict) else item for item in val]
        return val

    def __repr__(self):
        return f"{self.__dict__}"

    def to_dict(self):
        """Optional inverse: convert a Resource tree back into plain dicts."""
        out = {}
        for key, val in self.__dict__.items():
            if isinstance(val, Resource):
                out[key] = val.to_dict()
            elif isinstance(val, list):
                out[key] = [item.to_dict() if isinstance(item, Resource) else item for item in val]
            else:
                out[key] = val
        return out
# ---------------- Usage example ----------------
if __name__ == "__main__":
    kubectl_json = '''
    {
        "metadata": {"name": "nginx", "namespace": "default"},
        "spec": {"containers": [{"name": "nginx", "image": "nginx:latest"}]},
        "status": {"phase": "Running"}
    }
    '''
    # Convert the parsed JSON into a Resource object
    data_dict = json.loads(kubectl_json)
    pod = Resource(data_dict)
    # Access fields via attribute chains
    print(pod.metadata.name)             # nginx
    print(pod.spec.containers[0].image)  # nginx:latest
    print(pod.status.phase)              # Running
    # Optionally convert back to a plain dict
    print(pod.to_dict())

View File

@ -0,0 +1,2 @@
from .user import User

21
ocdp/models/user/user.py Normal file
View File

@ -0,0 +1,21 @@
from sqlalchemy import Column, Integer, String, Boolean, TIMESTAMP, func
import datetime
from ocdp.database import Base
class User(Base):
    """SQLAlchemy ORM model for the `users` table."""

    __tablename__ = "users"

    user_id = Column(Integer, primary_key=True, index=True, autoincrement=True)
    username = Column(String(64), unique=True, index=True, nullable=False)
    email = Column(String(128), unique=True, index=True, nullable=False)
    hashed_password = Column(String(128), nullable=False)
    # Account flags
    is_active = Column(Boolean, nullable=False, default=True)
    is_admin = Column(Boolean, nullable=False, default=False)
    # Audit timestamps (server-side defaults; updated_at refreshes on every UPDATE)
    created_at = Column(TIMESTAMP(timezone=True), nullable=False, server_default=func.now())
    updated_at = Column(TIMESTAMP(timezone=True), nullable=False, server_default=func.now(), onupdate=func.now())
    last_login_at = Column(TIMESTAMP(timezone=True), nullable=True)

View File

@ -0,0 +1,2 @@
from .cluster import Cluster, get_cluster

View File

@ -0,0 +1,265 @@
import os
import time
import yaml
import subprocess
from ocdp.config import CONFIG
class Cluster:
    """Facade over the `kubectl` and `helm` command-line tools.

    Every command runs as a subprocess with the KUBECONFIG environment
    variable pinned to the configured kubeconfig file.  Application charts
    and their metadata.yaml files are read from the configured
    applications directory.
    """
    def __init__(self, kubeconfig: str | None = None):
        # Fall back to the globally configured kubeconfig when none is given.
        self.kubeconfig = kubeconfig or os.path.expanduser(CONFIG.orchestration.kube.kubectl_file)
        self.applications_dir = os.path.expanduser(CONFIG.orchestration.kube.applications_dir)
        # Copy the parent environment and pin KUBECONFIG so that kubectl
        # and helm both target the same cluster.
        self.env = os.environ.copy()
        self.env['KUBECONFIG'] = self.kubeconfig
    # ----------------- Application metadata interface -----------------
    def list_applications(self) -> list[str]:
        """List all applications (folder names) inside applications_dir."""
        if not os.path.exists(self.applications_dir):
            raise FileNotFoundError(f"The applications directory {self.applications_dir} does not exist.")
        return [f for f in os.listdir(self.applications_dir)
                if os.path.isdir(os.path.join(self.applications_dir, f))]
    def get_application_metadata(self, application_dir: str) -> dict:
        """
        Load and post-process the metadata of one application.

        Args:
            application_dir (str): Folder name of the application inside applications_dir.

        Returns:
            dict: Parsed content of metadata.yaml, with helm chart names
                resolved to absolute paths and service URLs assembled.

        Raises:
            FileNotFoundError: If metadata.yaml is missing.
            RuntimeError: If the YAML cannot be parsed.
        """
        meta_path = os.path.join(self.applications_dir, application_dir, "metadata.yaml")
        if not os.path.exists(meta_path):
            raise FileNotFoundError(f"metadata.yaml not found in {application_dir}")
        with open(meta_path, 'r') as f:
            try:
                # full_load supports YAML anchors (&) and aliases (*).
                metadata = yaml.full_load(f)
            except yaml.YAMLError as e:
                raise RuntimeError(f"Error parsing metadata.yaml: {e}")
        # --- Post-process the metadata dynamically ---
        # Iterate over every top-level key (e.g. 'distributed', 'monolithic').
        for mode, config in metadata.items():
            if isinstance(config, dict) and config.get('method') == 'helm':
                # 1. Resolve the chart name to an absolute path.
                if 'chart' in config:
                    chart_name = config['chart']
                    config['chart'] = os.path.join(self.applications_dir, application_dir, chart_name)
                # 2. Assemble the service URL.
                # Read the svc config directly from `config`, not from `sets`.
                svc_config = config.get('svc')
                if isinstance(svc_config, dict):
                    protocol = svc_config.get('protocol')
                    hostname = svc_config.get('hostname')
                    port = svc_config.get('port')
                    if protocol and hostname and port:
                        svc_config['url'] = f"{protocol}://{hostname}:{port}"
        return metadata
    # ----------------- High-level kubectl interface -----------------
    def apply(self, file_name: str) -> str:
        """Apply a Kubernetes manifest file (kubectl apply -f)."""
        return self._run_kubectl_cmd(["apply", "-f", self._resolve_file(file_name)])
    def delete(
        self,
        file_name: str | None = None,
        resource_type: str | None = None,
        name: str | None = None,
        namespace: str | None = None,
        force: bool = False
    ) -> str:
        """Delete Kubernetes resources, either by manifest file or by type + name.

        Raises:
            ValueError: If neither `file_name` nor both `resource_type`
                and `name` are provided.
        """
        if file_name:
            cmd = ["delete", "-f", self._resolve_file(file_name)]
        elif resource_type and name:
            cmd = ["delete", resource_type, name]
            if namespace:
                cmd.extend(["--namespace", namespace])
        else:
            raise ValueError("Invalid arguments: Provide 'file_name' or both 'resource_type' and 'name'.")
        if force:
            # --grace-period=0 together with --force removes the resource immediately.
            cmd.extend(["--force", "--grace-period=0"])
        return self._run_kubectl_cmd(cmd)
    def create(self, namespace_name: str) -> str:
        """Create a new Kubernetes namespace."""
        cmd = ["create", "namespace", namespace_name]
        return self._run_kubectl_cmd(cmd)
    def get(self, resource_type: str, namespace: str | None = None, name: str | None = None,
            output: str = "json",
            all_namespaces: bool = False) -> str:
        """Generic resource getter. Returns JSON output by default."""
        cmd = ["get", resource_type.lower()]
        if name: cmd.append(name)
        if all_namespaces: cmd.append("-A")
        elif namespace: cmd.extend(["-n", namespace])
        if output: cmd.extend(["-o", output])
        return self._run_kubectl_cmd(cmd)
    def describe(self, resource_type: str, name: str | None = None, namespace: str | None = None) -> str:
        """Describe the given resource (kubectl describe)."""
        cmd = ["describe", resource_type.lower()]
        if name: cmd.append(name)
        if namespace: cmd.extend(["-n", namespace])
        return self._run_kubectl_cmd(cmd)
    # ----------------- Helm repository management interface -----------------
    def add_repo(
        self,
        repo_name: str,
        repo_url: str,
        username: str | None = None,
        password: str | None = None
    ) -> str:
        """
        Add a Helm chart repository (helm repo add).

        Args:
            repo_name (str): Local alias for the repository.
            repo_url (str): URL of the repository.
            username (str, optional): Username for a private repository.
            password (str, optional): Password for a private repository.

        Returns:
            str: Command output.
        """
        cmd = ["repo", "add", repo_name, repo_url]
        if username:
            cmd.extend(["--username", username])
        if password:
            cmd.extend(["--password", password, "--pass-credentials"])
        return self._run_helm_cmd(cmd)
    def update_repos(self, repo_names: list[str] | None = None) -> str:
        """
        Update one or more Helm chart repositories (helm repo update).

        Args:
            repo_names (list[str], optional): Repositories to update; all
                repositories are updated when None.

        Returns:
            str: Command output.
        """
        cmd = ["repo", "update"]
        if repo_names:
            cmd.extend(repo_names)
        return self._run_helm_cmd(cmd)
    # ----------------- Helm release management interface -----------------
    def install_release(
        self,
        release_name: str,
        chart_source: str,
        namespace: str,
        config_file: str | None = None,
        create_namespace: bool = True
    ) -> str:
        """
        Install a Helm release (application instance).
        """
        cmd = ["install", release_name, chart_source, "--namespace", namespace]
        if create_namespace:
            cmd.append("--create-namespace")
        # Use config_file as-is: it is a full path supplied by the caller
        # (e.g. the DAO layer's /tmp/temp-values.yaml), so it must NOT be
        # resolved relative to applications_dir via self._resolve_file.
        if config_file:
            if not os.path.exists(config_file):
                raise FileNotFoundError(f"Provided config_file does not exist: {config_file}")
            cmd.extend(["-f", config_file])
        return self._run_helm_cmd(cmd)
    def uninstall_release(
        self,
        release_name: str,
        namespace: str | None = None,
        wait: bool = False
    ) -> str:
        """Uninstall a Helm release (application instance)."""
        cmd = ["uninstall", release_name]
        if namespace:
            cmd.extend(["--namespace", namespace])
        if wait:
            # Block until all release resources have been removed.
            cmd.append("--wait")
        return self._run_helm_cmd(cmd)
    def list_releases(
        self,
        namespace: str | None = None,
        all_namespaces: bool = False,
        output: str | None = None
    ) -> str:
        """List installed Helm releases (application instances)."""
        cmd = ["list"]
        if all_namespaces:
            cmd.append("--all-namespaces")
        elif namespace:
            cmd.extend(["--namespace", namespace])
        if output:
            cmd.extend(["--output", output])
        return self._run_helm_cmd(cmd)
    # ----------------- Private helpers -----------------
    def _run_kubectl_cmd(self, cmd_args: list[str]) -> str:
        """Run a kubectl subcommand and return its stdout.

        Raises:
            RuntimeError: If kubectl is not installed or exits non-zero.
        """
        try:
            command = ["kubectl"] + cmd_args
            print(f"🚀 Executing Kubectl: {' '.join(command)}")
            result = subprocess.run(
                command, check=True, capture_output=True, text=True, env=self.env
            )
            return result.stdout
        except FileNotFoundError:
            raise RuntimeError("`kubectl` command not found. Is it installed and in your PATH?")
        except subprocess.CalledProcessError as e:
            error_message = f"kubectl command failed with exit code {e.returncode}:\n{e.stderr.strip()}"
            raise RuntimeError(error_message) from e
    def _run_helm_cmd(self, cmd_args: list[str]) -> str:
        """[internal] Run a helm subcommand and return its stdout.

        Raises:
            RuntimeError: If helm is not installed or exits non-zero.
        """
        try:
            command = ["helm", "--kubeconfig", self.kubeconfig] + cmd_args
            print(f"🚀 Executing Helm: {' '.join(command)}")
            result = subprocess.run(
                command, check=True, capture_output=True, text=True, env=self.env
            )
            return result.stdout
        except FileNotFoundError:
            raise RuntimeError("`helm` command not found. Is it installed and in your PATH?")
        except subprocess.CalledProcessError as e:
            error_message = f"Helm command failed with exit code {e.returncode}:\n{e.stderr.strip()}"
            raise RuntimeError(error_message) from e
    def _resolve_file(self, file_name: str) -> str:
        """Resolve a resource file path relative to applications_dir."""
        file_path = os.path.join(self.applications_dir, file_name)
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The file {file_path} does not exist.")
        return file_path
def get_cluster(kubeconfig: str | None = None) -> Cluster:
    """Factory for a Cluster client.

    Args:
        kubeconfig: Optional path to a kubeconfig file.  When omitted the
            path from CONFIG is used, which keeps the previous
            zero-argument call form fully backward compatible.

    Returns:
        Cluster: A client bound to the selected kubeconfig.
    """
    return Cluster(kubeconfig)
if __name__ == "__main__":
    # Smoke test: print the processed metadata of the "infer" application.
    cluster = get_cluster()
    print(cluster.get_application_metadata("infer"))

View File

View File

View File

@ -0,0 +1,71 @@
# service.py
"""
Service (服务) 层 - 应用编排。
负责处理核心业务逻辑(如权限、命名),并调用 DAO 层来执行数据操作。
"""
import ulid
from ocdp.orchestration.cluster import Cluster
from ocdp.daos.orchestration import application_dao as dao
from ocdp.models.orchestration.application import (ApplicationTemplate, InstallReceipt, ApplicationStatus,
InstalledApplicationInstance, UninstallReceipt, NamespaceDeleteReceipt, ApplicationMetadata)
# ... (list_available_applications, list_user_applications 保持不变) ...
def list_available_applications(cluster: Cluster) -> list[ApplicationTemplate]:
    """(Service) Return every application template available for installation."""
    templates = dao.list_application_templates(cluster)
    return templates
def list_user_applications(cluster: Cluster, user_id: str) -> list[InstalledApplicationInstance]:
    """(Service) Return the application instances installed by *user_id*."""
    instances = dao.list_application_instances(cluster, user_id)
    return instances
def _namespace_label(value: str) -> str:
    """Normalize *value* into a fragment that is legal in a namespace name.

    Kubernetes namespace names are RFC 1123 labels: lowercase
    alphanumerics and '-' only.  Illegal runs are replaced by a single
    '-' and a non-empty fallback ("x") is returned for degenerate input.
    """
    import re
    cleaned = re.sub(r"[^a-z0-9-]+", "-", str(value).lower()).strip("-")
    return cleaned or "x"


def install_new_application(
    cluster: Cluster,
    user_id: str,
    app_template_name: str,
    mode: str,
    user_overrides: dict | None = None
) -> InstallReceipt:
    """
    (Service) Trigger the installation of a new application.

    Core responsibility: generate a unique, *valid* namespace according to
    the business naming rule <user>-<application>-<instance-id>.

    Args:
        cluster: Cluster client used to run kubectl/helm.
        user_id: Owner of the new application instance.
        app_template_name: Folder name of the application template.
        mode: Deployment mode key in the template metadata (e.g. 'monolithic').
        user_overrides: Optional helm value overrides supplied by the user.

    Returns:
        InstallReceipt: Receipt produced by the DAO layer.
    """
    # 1. (Service duty) Fetch the application's business name for the
    #    namespace.  The DAO fetches the metadata again for its own needs.
    metadata = cluster.get_application_metadata(app_template_name)
    application_name = metadata.get("application_name", app_template_name)
    # 2. (Service duty) Generate a unique namespace.  ULIDs are sortable
    #    and lowercase-safe; sanitize the user/application parts and cap
    #    the total length at the Kubernetes namespace limit of 63 chars.
    instance_id = str(ulid.new()).lower()
    prefix = f"{_namespace_label(user_id)}-{_namespace_label(application_name)}"
    namespace = f"{prefix[:63 - len(instance_id) - 1]}-{instance_id}".strip("-")
    # 3. (Service duty) Delegate execution, namespace included, to the DAO.
    return dao.install_application(
        cluster=cluster,
        namespace=namespace,
        app_template_name=app_template_name,
        mode=mode,
        user_overrides=user_overrides
    )
def get_instance_status(
    cluster: Cluster,
    namespace: str,
    app_template_name: str,
    mode: str
) -> ApplicationStatus:
    """(Service) Return the detailed status of one application instance."""
    status = dao.get_application_status(cluster, namespace, app_template_name, mode)
    return status
def uninstall_application_release(
    cluster: Cluster,
    namespace: str,
    app_template_name: str,
    mode: str
) -> UninstallReceipt:
    """(Service) Uninstall an application instance (its Helm release)."""
    receipt = dao.uninstall_application_release(cluster, namespace, app_template_name, mode)
    return receipt
def delete_application_namespace(cluster: Cluster, namespace: str) -> NamespaceDeleteReceipt:
    """(Service) Delete the namespace of an application instance."""
    receipt = dao.delete_namespace(cluster, namespace)
    return receipt

View File

@ -0,0 +1,12 @@
# services.py
from ocdp.orchestration.cluster import Cluster
from ocdp.daos.orchestration import cluster_dao
from ocdp.models.orchestration.cluster import ClusterStatus
def get_cluster_status(cluster: Cluster) -> ClusterStatus:
    """Service-layer entry point for cluster status.

    The request is delegated straight to the DAO layer, which does all
    the actual data gathering.
    """
    return cluster_dao.get_cluster_status(cluster)

View File

@ -0,0 +1,135 @@
# node_service.py
import json
from collections import defaultdict
from ocdp.orchestration import Cluster, get_cluster
# 从我们的 DAO 模块导入函数
from ocdp.daos.orchestration.node_dao import (
get_all_nodes_health_status,
get_all_nodes_resource_details,
refresh_nodes_cache as refresh_dao_cache
)
# --- Service 层的格式化工具 ---
def _format_bytes(byte_count: int) -> str:
"""将字节数格式化为人类可读的字符串"""
if byte_count < 0: return "N/A"
power = 1024
n = 0
power_labels = {0: 'B', 1: 'KiB', 2: 'MiB', 3: 'GiB', 4: 'TiB'}
while byte_count >= power and n < len(power_labels) -1 :
byte_count /= power
n += 1
return f"{byte_count:.2f} {power_labels[n]}"
# --- 对外暴露的 Service 接口函数 ---
def get_cluster_health_report(cluster: Cluster) -> dict:
    """1. (Service) Health-status report for every node in the cluster."""
    # The DAO already returns a presentation-ready structure.
    report = get_all_nodes_health_status(cluster)
    return report
def get_per_node_resource_report(cluster: Cluster) -> dict:
    """2. (Service) Per-node resource report with human-readable values."""
    # Numeric data comes from the DAO; this layer only formats it.
    raw = get_all_nodes_resource_details(cluster)
    report = {}
    for node_name, node in raw.items():
        cpu, mem, sto = node['cpu'], node['memory'], node['storage']
        report[node_name] = {
            "cpu": {
                "total": f"{cpu['total']:.2f} Cores",
                "used": f"{cpu['total'] - cpu['allocatable']:.2f} Cores",
                "free": f"{cpu['allocatable']:.2f} Cores",
            },
            "memory": {
                "total": _format_bytes(mem['total']),
                "used": _format_bytes(mem['total'] - mem['allocatable']),
                "free": _format_bytes(mem['allocatable']),
            },
            "storage": {
                "total": _format_bytes(sto['total']),
                "used": _format_bytes(sto['total'] - sto['allocatable']),
                "free": _format_bytes(sto['allocatable']),
            },
            # GPU data arrives pre-formatted from the DAO.
            "gpu": node['gpu'],
        }
    return report
def get_cluster_summary_report(cluster: Cluster) -> dict:
    """3. (Service) Aggregated, human-readable cluster-wide resource summary."""
    raw = get_all_nodes_resource_details(cluster)
    # Accumulators for cluster-wide totals and allocatable amounts.
    totals = {"cpu": 0.0, "mem": 0, "sto": 0}
    allocs = {"cpu": 0.0, "mem": 0, "sto": 0}
    gpu_total = 0
    gpu_alloc = 0
    gpu_models = defaultdict(int)
    for node in raw.values():
        totals["cpu"] += node['cpu']['total']
        allocs["cpu"] += node['cpu']['allocatable']
        totals["mem"] += node['memory']['total']
        allocs["mem"] += node['memory']['allocatable']
        totals["sto"] += node['storage']['total']
        allocs["sto"] += node['storage']['allocatable']
        gpu = node['gpu']
        if gpu['count'] > 0:
            gpu_total += gpu['count']
            gpu_alloc += gpu['allocatable_count']
            gpu_models[gpu['model']] += gpu['count']
    return {
        "note": "'used' 代表被系统或 Kubelet 预留的资源, 'free' 代表可供 Pod 调度的资源。",
        "cpu": {
            "total": f"{totals['cpu']:.2f} Cores",
            "used": f"{totals['cpu'] - allocs['cpu']:.2f} Cores",
            "free": f"{allocs['cpu']:.2f} Cores",
        },
        "memory": {
            "total": _format_bytes(totals['mem']),
            "used": _format_bytes(totals['mem'] - allocs['mem']),
            "free": _format_bytes(allocs['mem']),
        },
        "storage": {
            "total": _format_bytes(totals['sto']),
            "used": _format_bytes(totals['sto'] - allocs['sto']),
            "free": _format_bytes(allocs['sto']),
        },
        "gpu": {
            "total_count": gpu_total,
            "allocatable_count": gpu_alloc,
            "models_summary": dict(gpu_models),
        },
    }
# --- Usage example ---
if __name__ == "__main__":
    try:
        # A single cluster client serves all three reports.
        cluster_client = get_cluster()
        # Call the Service-layer functions one by one.
        print("\n" + "="*20 + " 1. 集群健康状态 " + "="*20)
        health_report = get_cluster_health_report(cluster_client)
        print(json.dumps(health_report, indent=2))
        print("\n" + "="*20 + " 2. 各节点资源详情 " + "="*20)
        per_node_report = get_per_node_resource_report(cluster_client)
        print(json.dumps(per_node_report, indent=2))
        print("\n" + "="*20 + " 3. 集群资源汇总 " + "="*20)
        summary_report = get_cluster_summary_report(cluster_client)
        print(json.dumps(summary_report, indent=2))
    except RuntimeError as e:
        print(f"\n发生错误: {e}")

View File

View File

@ -0,0 +1,3 @@
from .password_handler import hash_password, verify_password
from .token_handler import generate_token, verify_token

View File

@ -0,0 +1,11 @@
from argon2 import PasswordHasher
# Module-wide Argon2 hasher with library defaults.
PH = PasswordHasher()


def hash_password(password: str, ph: PasswordHasher = PH) -> str:
    """Hash *password* with Argon2 and return the encoded hash string."""
    return ph.hash(password)
def verify_password(hashed, password, ph=PH):
    """Return True if *password* matches *hashed*, False otherwise.

    argon2's PasswordHasher.verify raises VerifyMismatchError on a wrong
    password instead of returning False.  Callers (e.g. the login flow)
    use this function as a boolean predicate, so translate the mismatch
    into False rather than letting the exception propagate.
    """
    from argon2.exceptions import VerifyMismatchError

    try:
        return ph.verify(hashed, password)
    except VerifyMismatchError:
        return False

View File

@ -0,0 +1,73 @@
import os
import jwt
import datetime
from ocdp.config import CONFIG
# Best practice: secrets come from the environment / config layer, never
# hard-coded (python-dotenv can supply them during local development).
SECRET_KEY = CONFIG.token.jwt.secret_key
ALGORITHM = CONFIG.token.jwt.signing_algorithm


def generate_token(
    user_id: str,
    expires_delta: datetime.timedelta = datetime.timedelta(minutes=30),
    secret_key: str = SECRET_KEY,
    algorithm: str = ALGORITHM
) -> str:
    """
    Generate a signed JWT for *user_id* that expires after *expires_delta*.

    Raises:
        ValueError: If no signing key is configured.
    """
    # Refuse to sign with an empty or missing key.
    if not secret_key:
        raise ValueError("SECRET_KEY not found in environment variables.")
    issued_at = datetime.datetime.now(datetime.timezone.utc)
    claims = {
        "sub": user_id,                     # subject: the user's unique id
        "iat": issued_at,                   # issued-at timestamp
        "exp": issued_at + expires_delta,   # expiration timestamp
    }
    return jwt.encode(claims, secret_key, algorithm=algorithm)
def verify_token(
    token: str,
    secret_key: str = SECRET_KEY,
    algorithm: str = ALGORITHM
) -> dict | None:
    """
    Verify *token* and return its decoded payload.

    Returns None when the token is expired, malformed, or carries a bad
    signature.

    Raises:
        ValueError: If no signing key is configured.
    """
    if not secret_key:
        raise ValueError("SECRET_KEY not found in environment variables.")
    try:
        # jwt.decode checks signature, expiration and algorithm in one call;
        # pinning `algorithms` guards against algorithm-confusion attacks.
        return jwt.decode(
            token,
            secret_key,
            algorithms=[algorithm]
        )
    except jwt.ExpiredSignatureError:
        # Most common failure: the token is past its expiration date.
        print("Token verification failed: Token has expired.")
        return None
    except jwt.InvalidTokenError as e:
        # Any other JWT problem: bad signature, malformed token, etc.
        print(f"Token verification failed: Invalid token. Error: {e}")
        return None

View File

@ -0,0 +1,4 @@
class UserAlreadyExistsError(Exception):
    """Raised when creating a user whose username or email is already taken."""
    pass

View File

@ -0,0 +1,119 @@
# 文件名: user_service.py
from sqlalchemy.orm import Session
import datetime
from .helpers import hash_password, verify_password, generate_token, verify_token
from .user_exceptions import UserAlreadyExistsError
# 从 DAO 层导入具体的数据库操作函数
from ocdp.daos.user import user_dao
# 从模型和 DTOs/Schemas 中导入
from ocdp.models.user import User
# 从辅助模块导入密码和 Token 相关函数
# 第二层
# --- 业务逻辑函数 ---
def login_for_access_token(username: str, password: str, db: Session) -> str | None:
    """Authenticate *username*/*password*; return a JWT, or None on failure."""
    # Look the user up through the DAO layer.
    user = user_dao.get_user_by_username(username, db)
    # Unknown user or wrong password -> authentication failure.
    if user is None or not verify_password(user.hashed_password, password):
        return None
    # Success: issue a token keyed on the primary key.
    access_token = generate_token(user_id=str(user.user_id))
    # Best-effort bookkeeping of the last login time.
    set_last_login(user.user_id, db)
    return access_token
# 第一层
def get_user_by_id(user_id: int, db: Session):
    """Thin pass-through to the DAO layer: fetch a user by primary key."""
    return user_dao.get_user_by_id(user_id, db)
def get_user_by_username(username: str, db: Session):
    """Thin pass-through to the DAO layer: fetch a user by username."""
    return user_dao.get_user_by_username(username, db)
def get_user_by_email(email: str, db: Session):
    """Thin pass-through to the DAO layer: fetch a user by email address."""
    return user_dao.get_user_by_email(email, db)
def get_current_user(token: str, db: Session):
    """Resolve *token* to its User row, or None when the token is invalid.

    verify_token returns None for expired/invalid tokens; the previous
    implementation called `.get("sub")` on that result directly and
    crashed with AttributeError instead of rejecting the request.
    """
    payload = verify_token(token)
    if not payload:
        return None
    user_id = payload.get("sub")
    if not user_id:
        return None
    return user_dao.get_user_by_id(user_id, db)
def get_user_id_by_token(token: str):
    """Extract the user id (the 'sub' claim) from *token*; None if invalid.

    Guards against verify_token returning None (expired/invalid token),
    which the previous implementation dereferenced with `.get` and
    crashed with AttributeError.
    """
    payload = verify_token(token)
    if not payload:
        return None
    user_id = payload.get("sub")
    if not user_id:
        return None
    return user_id
def create_user(username: str, password: str, email: str, db: Session):
    """Create a new user after enforcing username/email uniqueness.

    Raises:
        UserAlreadyExistsError: If the username or the email is taken.
    """
    # Uniqueness validation belongs to the service layer.
    if user_dao.get_user_by_username(username, db):
        raise UserAlreadyExistsError(f"User with username '{username}' already exists.")
    if user_dao.get_user_by_email(email, db):
        raise UserAlreadyExistsError(f"User with email '{email}' already exists.")
    # Only the Argon2 hash is ever persisted.
    new_user = User(
        username=username,
        email=email,
        hashed_password=hash_password(password)
    )
    return user_dao.add_user(new_user, db)
def update_user_password(user_id: int, new_password: str, db: Session):
    """Set a new (hashed) password for *user_id*; None if the user is missing."""
    user = user_dao.get_user_by_id(user_id, db)
    if user is None:
        return None
    # Business rule: hash before storing — never persist plaintext.
    user.hashed_password = hash_password(new_password)
    return user_dao.update_user(user, db)
def set_last_login(user_id: int, db: Session):
    """Stamp the user's last_login_at with the current UTC time."""
    user = user_dao.get_user_by_id(user_id, db)
    if user is None:
        return None
    user.last_login_at = datetime.datetime.now(datetime.timezone.utc)
    return user_dao.update_user(user, db)
def deactivate_user(user_id: int, db: Session):
    """Soft-disable the account by clearing its is_active flag."""
    user = user_dao.get_user_by_id(user_id, db)
    if user is None:
        return None
    user.is_active = False
    return user_dao.update_user(user, db)
def delete_user(user_id: int, db: Session):
    """Hard-delete the user row; returns False when the user does not exist."""
    user = user_dao.get_user_by_id(user_id, db)
    if user is None:
        return False  # nothing to delete
    return user_dao.delete_user(user, db)

0
ocdp/utils/__init__.py Normal file
View File