feat: Add orchestration models and services for Kubernetes cluster management

- Implemented Pydantic models for Kubernetes cluster state representation in `cluster.py`.
- Created a `Resource` class for converting JSON/dict to Python objects in `resource.py`.
- Established user models and services for user management, including password hashing and JWT token generation.
- Developed application orchestration services for managing Kubernetes applications, including installation and uninstallation.
- Added cluster service for retrieving cluster status and health reports.
- Introduced node service for fetching node resource details and health status.
- Implemented user service for handling user authentication and management.
This commit is contained in:
2025-09-02 02:50:42 +00:00
parent 00039b2fe1
commit c7f8e69d61
65 changed files with 3649 additions and 0 deletions

13
.env.template Normal file
View File

@ -0,0 +1,13 @@
# config
OCDP_CONFIG_FILE="~/.ocdp/config.yaml"
# password
DATABASE_MYSQL_PASSWORD="****"
# token
TOKEN_JWT_SECRET_KEY="****"
# admin
ADMIN_USERNAME="admin"
ADMIN_PASSWORD="****"

17
.gitignore vendored Normal file
View File

@ -0,0 +1,17 @@
# python
__pycache__/
*.py[cod]
# data
*.csv
*.json
*.xlsx
*.yaml
*.yml
# env variable
.env
*.ini

View File

@ -0,0 +1 @@

28
config.yaml.template Normal file
View File

@ -0,0 +1,28 @@
orchestration:
kube:
kubectl_file: "~/.ocdp/kube/config"
applications_dir: "~/.ocdp/kube/applications"
logs_dir: "~/.ocdp/kube/logs"
logger:
loki:
url: "https://loki.bwgdi.com/loki/api/v1/push"
labels: "application=myapp,environment=develop"
label_keys: ""
database:
mysql:
host: "localhost"
port: 3306
db_name: "ocdp"
username: "root"
# ❗️ Password should be read from environment variables, not provided here
password:
hash:
algorithm: "ARGON2"
token:
jwt:
signing_algorithm: "HS256"
# ❗️ Secret should be read from environment variables, not provided here

515
frontend/frontend.html Normal file
View File

@ -0,0 +1,515 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Cluster Status Dashboard - Dynamic</title>
<!-- Tailwind CSS for styling -->
<script src="https://cdn.tailwindcss.com"></script>
<!-- Chart.js for beautiful charts -->
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<!-- Lucide Icons for a modern look -->
<script src="https://unpkg.com/lucide@latest"></script>
<style>
/* Custom styles for a better dark mode and overall look */
body {
font-family: 'Inter', sans-serif;
background: linear-gradient(135deg, #1a202c 0%, #2d3748 50%, #4a5568 100%);
background-size: 400% 400%;
animation: gradientBG 15s ease infinite;
}
@keyframes gradientBG {
0% { background-position: 0% 50%; }
50% { background-position: 100% 50%; }
100% { background-position: 0% 50%; }
}
.chart-container {
position: relative;
height: 120px;
width: 120px;
}
.chart-label {
position: absolute;
top: 50%;
left: 50%;
transform: translate(-50%, -50%);
font-size: 1.1rem;
font-weight: 600;
}
/* Glassmorphism Card Style */
.card {
background: rgba(31, 41, 55, 0.5); /* gray-800 with transparency */
backdrop-filter: blur(12px);
-webkit-backdrop-filter: blur(12px);
border-radius: 1rem; /* 16px */
border: 1px solid rgba(255, 255, 255, 0.1);
transition: all 0.3s ease;
}
.card:hover {
transform: translateY(-5px) scale(1.01);
box-shadow: 0 20px 25px -5px rgba(0, 0, 0, 0.2), 0 10px 10px -5px rgba(0, 0, 0, 0.1);
}
/* Custom scrollbar for pod lists */
.pod-list::-webkit-scrollbar {
width: 6px;
}
.pod-list::-webkit-scrollbar-track {
background: rgba(45, 55, 72, 0.5); /* gray-700 with transparency */
}
.pod-list::-webkit-scrollbar-thumb {
background: #90cdf4; /* blue-300 */
border-radius: 3px;
}
/* Tooltip for pressure status */
.tooltip {
position: relative;
display: inline-block;
}
.tooltip .tooltiptext {
visibility: hidden;
width: 140px;
background-color: #111827;
color: #fff;
text-align: center;
border-radius: 6px;
padding: 5px 0;
position: absolute;
z-index: 1;
bottom: 125%;
left: 50%;
margin-left: -70px;
opacity: 0;
transition: opacity 0.3s;
}
.tooltip:hover .tooltiptext {
visibility: visible;
opacity: 1;
}
</style>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
</head>
<body class="bg-gray-900 text-gray-200">
<div class="container mx-auto p-4 md:p-8">
<!-- API URL Input Section -->
<div class="card p-4 mb-8">
<div class="flex flex-col sm:flex-row items-center gap-4">
<label for="apiUrl" class="font-semibold text-white flex-shrink-0">API URL:</label>
<input type="text" id="apiUrl" placeholder="http://127.0.0.1:8000/api/v1/orchestration/cluster/cluster-status" class="w-full bg-gray-900/50 text-white border border-gray-600 rounded-lg px-4 py-2 focus:ring-2 focus:ring-blue-500 focus:border-blue-500 outline-none transition">
<button id="fetchDataBtn" class="bg-blue-600 hover:bg-blue-700 text-white font-bold py-2 px-6 rounded-lg flex items-center gap-2 transition w-full sm:w-auto">
<i data-lucide="refresh-cw" class="w-4 h-4"></i>
<span>Get Data</span>
</button>
</div>
<div id="status-message" class="mt-3 text-center min-h-[20px]"></div>
</div>
<!-- Header -->
<header class="mb-8 flex items-center gap-4">
<i data-lucide="layout-dashboard" class="w-10 h-10 text-blue-400"></i>
<div>
<h1 class="text-3xl font-bold text-white">Orchestration Cluster Status</h1>
<p class="text-gray-400">Real-time overview of cluster health and resource allocation.</p>
</div>
</header>
<main id="dashboard-content" class="hidden">
<!-- Cluster Summary Section -->
<div id="summary-section" class="mb-8">
<h2 class="text-2xl font-semibold text-white mb-4 flex items-center gap-2"><i data-lucide="server" class="w-6 h-6"></i>Cluster Summary</h2>
<div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-6">
<div id="health-card" class="card p-6 flex flex-col justify-center items-center"></div>
<div class="card p-6 col-span-1 md:col-span-2 lg:col-span-3">
<h3 class="text-lg font-semibold mb-4 text-white flex items-center gap-2"><i data-lucide="pie-chart" class="w-5 h-5"></i>Core Resource Usage</h3>
<div class="grid grid-cols-2 md:grid-cols-4 gap-6 text-center">
<div class="flex flex-col items-center">
<div class="chart-container"><canvas id="cpuChart"></canvas><div id="cpuChartLabel" class="chart-label text-white"></div></div>
<p class="mt-2 text-gray-300 font-medium">CPU Usage</p><p id="cpu-usage-text" class="text-sm text-gray-400"></p>
</div>
<div class="flex flex-col items-center">
<div class="chart-container"><canvas id="memoryChart"></canvas><div id="memoryChartLabel" class="chart-label text-white"></div></div>
<p class="mt-2 text-gray-300 font-medium">Memory Usage</p><p id="memory-usage-text" class="text-sm text-gray-400"></p>
</div>
<div class="flex flex-col items-center">
<div class="chart-container"><canvas id="storageChart"></canvas><div id="storageChartLabel" class="chart-label text-white"></div></div>
<p class="mt-2 text-gray-300 font-medium">Ephemeral Storage</p><p id="storage-usage-text" class="text-sm text-gray-400"></p>
</div>
<div class="flex flex-col items-center">
<div class="chart-container"><canvas id="podsChart"></canvas><div id="podsChartLabel" class="chart-label text-white"></div></div>
<p class="mt-2 text-gray-300 font-medium">Pod Allocation</p><p id="pods-usage-text" class="text-sm text-gray-400"></p>
</div>
</div>
</div>
</div>
<div class="grid grid-cols-1 md:grid-cols-3 gap-6 mt-6">
<div class="card p-6">
<h3 class="text-lg font-semibold text-white mb-3 flex items-center gap-2"><i data-lucide="lightbulb" class="w-5 h-5 text-yellow-300"></i>Scheduling Hints</h3>
<div id="scheduling-hints" class="space-y-3 text-gray-300"></div>
</div>
<div class="card p-6 md:col-span-2">
<h3 class="text-lg font-semibold text-white mb-3 flex items-center gap-2"><i data-lucide="gpu-chip" class="w-5 h-5 text-green-400"></i>GPU Availability</h3>
<div id="gpu-availability" class="space-y-2 text-gray-300"></div>
</div>
</div>
</div>
<!-- Nodes Section -->
<div>
<h2 class="text-2xl font-semibold text-white mb-4 flex items-center gap-2"><i data-lucide="hard-drive" class="w-6 h-6"></i>Node Details</h2>
<div id="nodes-grid" class="space-y-6"></div>
</div>
</main>
</div>
<script>
// Cached DOM handles and auto-refresh state for the dashboard page.
const apiUrlInput = document.getElementById('apiUrl'); // endpoint text field
const fetchDataBtn = document.getElementById('fetchDataBtn'); // "Get Data" button
const statusMessage = document.getElementById('status-message'); // inline status / error line under the input
const dashboardContent = document.getElementById('dashboard-content'); // hidden until the first successful fetch
let refreshInterval = null; // auto-refresh timer handle; cleared before each manual fetch
const REFRESH_INTERVAL_MS = 30000; // 30 seconds
// --- CORE LOGIC ---
// Pull the cluster-status JSON from the user-supplied endpoint, paint the
// dashboard, and (re)start the 30-second auto-refresh cycle. Every outcome
// (loading / success / failure) is reported through the status line.
async function fetchAndRenderDashboard() {
    const url = apiUrlInput.value.trim();
    if (!url) {
        showStatus('Please enter a valid API URL.', 'error');
        return;
    }
    // A manual fetch restarts the refresh cycle from scratch.
    if (refreshInterval) clearInterval(refreshInterval);
    fetchDataBtn.disabled = true;
    showStatus('Fetching data...', 'loading');
    try {
        const resp = await fetch(url);
        if (!resp.ok) {
            throw new Error(`HTTP error! Status: ${resp.status}`);
        }
        renderDashboard(await resp.json());
        dashboardContent.classList.remove('hidden');
        showStatus(`Data updated successfully. Next refresh in ${REFRESH_INTERVAL_MS / 1000}s.`, 'success');
        // Remember the working endpoint so the page can auto-load it next visit.
        localStorage.setItem('clusterApiUrl', url);
        refreshInterval = setInterval(fetchAndRenderDashboard, REFRESH_INTERVAL_MS);
    } catch (err) {
        console.error('Failed to fetch or render dashboard:', err);
        // A TypeError with this exact message is the browser's signature for a
        // blocked cross-origin request, so give a CORS-specific hint.
        const isLikelyCors = err instanceof TypeError && err.message === 'Failed to fetch';
        showStatus(
            isLikelyCors
                ? 'Network Error: Failed to fetch. This is likely a CORS issue. Please ensure your API server at the specified URL is running and has CORS enabled (e.g., with the "Access-Control-Allow-Origin: *" header).'
                : `Failed to load data: ${err.message}`,
            'error'
        );
        dashboardContent.classList.add('hidden');
    } finally {
        fetchDataBtn.disabled = false;
    }
}
// --- UTILITY FUNCTIONS ---
// Normalize a Kubernetes-style resource quantity string to a plain number:
// Gi passes through, Mi and Ki are scaled down to GiB, and a trailing "m"
// (millicores) is scaled to whole cores. Non-string input is coerced with
// parseFloat (falling back to 0).
function parseResourceValue(valueStr) {
    if (typeof valueStr !== 'string') return parseFloat(valueStr) || 0;
    const amount = parseFloat(valueStr);
    const normalized = valueStr.toLowerCase();
    // Order matters: "mi" must be matched before the bare "m" suffix.
    if (normalized.includes('gi')) return amount;
    if (normalized.includes('mi')) return amount / 1024;
    if (normalized.includes('ki')) return amount / 1024 / 1024;
    if (normalized.includes('m')) return amount / 1000;
    return amount;
}
// Write a message into the status line, colored by message type
// ('success' | 'error' | 'loading'; anything else renders gray).
function showStatus(message, type) {
    const colorByType = {
        success: 'text-green-400',
        error: 'text-red-400',
        loading: 'text-blue-400',
    };
    statusMessage.textContent = message;
    // Reset layout classes first so colors from previous calls don't stack.
    statusMessage.className = 'mt-3 text-center min-h-[20px] ';
    statusMessage.classList.add(colorByType[type] || 'text-gray-400');
}
// Draw (or redraw) a used-vs-free doughnut chart on the given canvas and
// write the usage percentage into its center label. Any chart previously
// bound to the same canvas is destroyed first so periodic refreshes don't
// leak Chart.js instances.
function createDonutChart(canvasId, labelId, used, total, color) {
    const free = total - used;
    const percentage = total > 0 ? ((used / total) * 100).toFixed(1) : 0;
    // Per-canvas registry of live Chart.js instances, kept on window.
    window.chartInstances = window.chartInstances || {};
    const existing = window.chartInstances[canvasId];
    if (existing) existing.destroy();
    const drawingCtx = document.getElementById(canvasId).getContext('2d');
    window.chartInstances[canvasId] = new Chart(drawingCtx, {
        type: 'doughnut',
        data: {
            datasets: [{
                // Clamp the "free" slice at 0 so over-committed resources
                // don't render a negative segment.
                data: [used, free > 0 ? free : 0],
                backgroundColor: [color, 'rgba(74, 85, 104, 0.5)'],
                borderWidth: 0,
                hoverBackgroundColor: [color, 'rgba(74, 85, 104, 0.7)']
            }]
        },
        options: {
            responsive: true, maintainAspectRatio: false, cutout: '75%',
            plugins: { legend: { display: false }, tooltip: { enabled: false } },
            animation: { duration: 500 }
        }
    });
    document.getElementById(labelId).innerText = `${percentage}%`;
}
// Summarize a node's pressure conditions: report the first pressure-type
// condition whose status is "True", or "No Pressure" when none apply.
// Missing / non-array input yields an "Unknown" reason.
function getNodePressureStatus(conditions) {
    if (!Array.isArray(conditions)) {
        return { hasPressure: false, reason: 'Unknown' };
    }
    const PRESSURE_TYPES = ['MemoryPressure', 'DiskPressure', 'PIDPressure'];
    const active = conditions.find(
        (cond) => PRESSURE_TYPES.includes(cond.type) && cond.status === 'True'
    );
    if (active) {
        return { hasPressure: true, reason: active.type };
    }
    return { hasPressure: false, reason: 'No Pressure' };
}
// --- RENDER FUNCTIONS ---
// Repaint the entire dashboard (summary section + node cards) from one API
// response, then re-run Lucide so icons inside the fresh markup render.
function renderDashboard(data) {
renderSummary(data); // Pass full data object (renderSummary also reads data.nodes)
renderNodes(data.nodes);
lucide.createIcons();
}
// Fill the cluster-summary section: the health card, the four resource
// donut charts, the scheduling hints, and the GPU availability list.
// Takes the full API payload (not just data.summary) because the GPU hint
// cross-references data.nodes for the recommended node's product name.
function renderSummary(data) {
const summary = data.summary;
const healthCard = document.getElementById('health-card');
// "Healthy" here means the API reported zero unhealthy nodes.
const isHealthy = summary.health.unhealthy_nodes === 0;
healthCard.innerHTML = `
<div class="text-center">
<p class="text-lg font-semibold ${isHealthy ? 'text-green-300' : 'text-red-400'} mb-2 flex items-center gap-2">
<i data-lucide="${isHealthy ? 'shield-check' : 'shield-alert'}" class="w-6 h-6"></i>
${isHealthy ? 'Cluster Healthy' : 'Cluster Unhealthy'}
</p>
<p class="text-4xl font-bold text-white">${summary.health.ready_nodes} <span class="text-2xl font-normal">/ ${summary.health.total_nodes}</span></p>
<p class="text-gray-400">Nodes Ready</p>
</div>
`;
const { cluster_total_cpu, cluster_total_memory, cluster_total_pods, cluster_total_ephemeral_storage, best_node_for_gpu_app } = summary.resources;
// CPU values go straight through parseFloat (no unit handling) —
// presumably the API reports plain core counts; TODO confirm.
const cpuUsed = parseFloat(cluster_total_cpu.used);
const cpuTotal = parseFloat(cluster_total_cpu.total);
createDonutChart('cpuChart', 'cpuChartLabel', cpuUsed, cpuTotal, '#6ee7b7');
document.getElementById('cpu-usage-text').innerText = `${cpuUsed.toFixed(2)} / ${cpuTotal.toFixed(2)} Cores`;
// Memory and storage may carry Ki/Mi/Gi suffixes; parseResourceValue
// normalizes everything to GiB.
const memUsed = parseResourceValue(cluster_total_memory.used);
const memTotal = parseResourceValue(cluster_total_memory.total);
createDonutChart('memoryChart', 'memoryChartLabel', memUsed, memTotal, '#93c5fd');
document.getElementById('memory-usage-text').innerText = `${memUsed.toFixed(2)} / ${memTotal.toFixed(2)} GiB`;
// Pod counts are used without parsing — presumably numeric in the payload.
createDonutChart('podsChart', 'podsChartLabel', cluster_total_pods.used, cluster_total_pods.total, '#fca5a5');
document.getElementById('pods-usage-text').innerText = `${cluster_total_pods.used} / ${cluster_total_pods.total} Pods`;
const storageUsed = parseResourceValue(cluster_total_ephemeral_storage.used);
const storageTotal = parseResourceValue(cluster_total_ephemeral_storage.total);
createDonutChart('storageChart', 'storageChartLabel', storageUsed, storageTotal, '#fde047');
document.getElementById('storage-usage-text').innerText = `${storageUsed.toFixed(2)} / ${storageTotal.toFixed(2)} GiB`;
// Scheduling hints: look up the recommended GPU node in the node list to
// display its GPU product alongside the free-card counts.
const hintsContainer = document.getElementById('scheduling-hints');
const bestGpuNode = data.nodes.find(n => n.name === best_node_for_gpu_app.node_name);
const gpuProduct = bestGpuNode?.gpu_info?.types[0]?.product || 'N/A';
const gpuMemoryGB = (best_node_for_gpu_app.memory_per_gpu_mb / 1024).toFixed(1); // MB -> GB per card
hintsContainer.innerHTML = `
<p><strong class="font-semibold text-blue-300">For CPU:</strong> ${summary.resources.best_node_for_cpu.node_name} (${summary.resources.best_node_for_cpu.free_amount} free)</p>
<p><strong class="font-semibold text-emerald-300">For Memory:</strong> ${summary.resources.best_node_for_memory.node_name} (${summary.resources.best_node_for_memory.free_amount} free)</p>
<p>
<strong class="font-semibold text-purple-300">For GPU App:</strong> ${best_node_for_gpu_app.node_name}
<span class="block text-sm text-gray-400 pl-4">
- Product: ${gpuProduct} <br>
- Free: ${best_node_for_gpu_app.free_gpu_count} cards (${gpuMemoryGB} GB/card) <br>
- Total Available: ${best_node_for_gpu_app.total_potential_memory_gb.toFixed(1)} GB
</span>
</p>
`;
// GPU availability list: hide unidentified products and exhausted pools;
// fall back to a placeholder paragraph when nothing remains.
const gpuContainer = document.getElementById('gpu-availability');
gpuContainer.innerHTML = summary.resources.distributed_gpu_availability
.filter(gpu => gpu.product !== 'Unknown' && gpu.total_free_count > 0)
.map(gpu => {
const totalMemoryGB = (gpu.total_free_count * gpu.memory_per_gpu_mb) / 1024;
return `
<div class="flex justify-between items-center bg-gray-900/40 p-3 rounded-lg">
<div>
<p class="font-semibold text-white">${gpu.product}</p>
<p class="text-sm text-gray-400">${gpu.total_free_count} cards &times; ${(gpu.memory_per_gpu_mb / 1024).toFixed(1)} GB/card</p>
</div>
<div class="text-right">
<p class="text-2xl font-bold text-green-300">${totalMemoryGB.toFixed(1)} GB</p>
<p class="text-sm text-gray-400">Total Available</p>
</div>
</div>
`}).join('') || `<p class="text-gray-400">No dedicated GPUs available in the cluster.</p>`;
}
// Render one detail card per node: header (name, roles, pressure tooltip,
// ready status), four usage bars, the optional GPU section, the running-pod
// table, and a collapsible labels panel toggled by toggleDetails(index).
function renderNodes(nodes) {
const nodesGrid = document.getElementById('nodes-grid');
nodesGrid.innerHTML = nodes.map((node, index) => {
const cpuUsed = parseResourceValue(node.cpu.used);
const cpuTotal = parseResourceValue(node.cpu.total);
const cpuPercentage = cpuTotal > 0 ? (cpuUsed / cpuTotal) * 100 : 0;
const memUsed = parseResourceValue(node.memory.used);
const memTotal = parseResourceValue(node.memory.total);
const memPercentage = memTotal > 0 ? (memUsed / memTotal) * 100 : 0;
// NOTE(review): total is treated as raw bytes here, but used goes through
// parseResourceValue (Ki/Mi/Gi handling) — confirm both fields really use
// different units in the API, otherwise the percentage is wrong.
const diskTotalBytes = parseFloat(node.ephemeral_storage.total);
const diskTotal = diskTotalBytes > 0 ? diskTotalBytes / (1024 * 1024 * 1024) : 0; // Convert bytes to GiB
const diskUsed = parseResourceValue(node.ephemeral_storage.used);
const diskPercentage = diskTotal > 0 ? (diskUsed / diskTotal) * 100 : 0;
const podsPercentage = node.pods.total > 0 ? (node.pods.used / node.pods.total) * 100 : 0;
const isReady = node.health.overall_status === 'Ready';
const pressureStatus = getNodePressureStatus(node.health.conditions);
const gpuSectionHtml = createGpuSection(node.gpu_info); // '' for GPU-less nodes
return `
<div class="card overflow-hidden">
<div class="p-4 bg-gray-900/30 flex flex-col sm:flex-row justify-between items-start sm:items-center">
<div>
<h3 class="text-xl font-bold text-white">${node.name}</h3>
<div class="flex flex-wrap gap-2 mt-2">
${node.roles.map(role => `<span class="bg-blue-500/50 text-blue-200 text-xs font-semibold px-2.5 py-1 rounded-full">${role}</span>`).join('')}
</div>
</div>
<div class="mt-3 sm:mt-0 flex items-center gap-4">
<div class="tooltip">
<i data-lucide="${pressureStatus.hasPressure ? 'shield-alert' : 'shield-check'}" class="w-6 h-6 ${pressureStatus.hasPressure ? 'text-orange-400' : 'text-green-400'}"></i>
<span class="tooltiptext">${pressureStatus.reason}</span>
</div>
<div class="flex items-center gap-2 text-lg font-semibold ${isReady ? 'text-green-300' : 'text-red-400'}">
<span class="w-3 h-3 rounded-full ${isReady ? 'bg-green-400' : 'bg-red-400'}"></span>
${node.health.overall_status}
</div>
</div>
</div>
<div class="p-4">
<div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-4 mb-4">
${createResourceBar('CPU', cpuUsed.toFixed(2), cpuTotal, 'Cores', cpuPercentage, 'bg-emerald-400')}
${createResourceBar('Memory', memUsed.toFixed(2), memTotal.toFixed(2), 'GiB', memPercentage, 'bg-blue-400')}
${createResourceBar('Ephemeral Storage', diskUsed.toFixed(2), diskTotal.toFixed(2), 'GiB', diskPercentage, 'bg-yellow-400')}
${createResourceBar('Pods', node.pods.used, node.pods.total, '', podsPercentage, 'bg-red-400')}
</div>
${gpuSectionHtml}
<h4 class="font-semibold text-gray-300 mt-4 mb-2">Running Pods (${node.running_pods.length})</h4>
<div class="overflow-x-auto pod-list max-h-60 bg-gray-900/50 rounded-lg">
<table class="w-full text-sm text-left text-gray-300">
<thead class="text-xs text-gray-400 uppercase bg-gray-900/70 sticky top-0">
<tr>
<th scope="col" class="px-4 py-2">Namespace</th><th scope="col" class="px-4 py-2">Name</th>
<th scope="col" class="px-4 py-2">CPU Req.</th><th scope="col" class="px-4 py-2">Memory Req.</th>
<th scope="col" class="px-4 py-2">Age</th>
</tr>
</thead>
<tbody>
${node.running_pods.map(pod => `
<tr class="border-b border-gray-700/50 hover:bg-gray-700/50">
<td class="px-4 py-2">${pod.namespace}</td>
<td class="px-4 py-2 font-medium text-white whitespace-nowrap">${pod.name}</td>
<td class="px-4 py-2">${pod.cpu_requests}</td>
<td class="px-4 py-2">${pod.memory_requests}</td>
<td class="px-4 py-2">${pod.age}</td>
</tr>`).join('')}
${node.running_pods.length === 0 ? `<tr><td colspan="5" class="text-center py-4 text-gray-500">No running pods</td></tr>` : ''}
</tbody>
</table>
</div>
<div class="text-right mt-2">
<button onclick="toggleDetails(${index})" class="text-sm text-blue-300 hover:underline">Show Labels & Conditions</button>
</div>
<div id="details-${index}" class="hidden mt-4 p-4 bg-black/30 rounded-lg">
<h5 class="font-semibold text-white mb-2">Labels</h5>
<pre class="text-xs bg-black/50 p-3 rounded-md max-h-48 overflow-auto"><code>${JSON.stringify(node.labels, null, 2)}</code></pre>
</div>
</div>
</div>
`;
}).join('');
}
// Build the optional GPU sub-section of a node card: a usage bar plus a
// product / total-memory summary line. Returns '' for nodes without GPUs
// (missing gpu_info, missing usage, or zero total cards).
function createGpuSection(gpu_info) {
if (!gpu_info || !gpu_info.usage || gpu_info.usage.total === 0) {
return '';
}
const { used, total } = gpu_info.usage;
const percentage = total > 0 ? (used / total) * 100 : 0;
// Only the first reported GPU type is summarized; NOTE(review): confirm
// nodes never mix GPU products, otherwise the others are not shown here.
const gpuType = gpu_info.types[0] || {};
const product = gpuType.product || 'N/A';
const memoryPerCard = gpuType.memory_mb ? (gpuType.memory_mb / 1024) : 0; // MB -> GB per card
const totalMemoryOnNode = total * memoryPerCard;
const detailsHtml = `
<div class="text-sm mt-2 text-gray-400">
<span>${product}</span>
<span class="float-right font-medium text-white">${totalMemoryOnNode.toFixed(1)} GB Total
<span class="text-gray-500">(${total} &times; ${memoryPerCard.toFixed(1)} GB)</span>
</span>
</div>
`;
return `
<div class="mt-4 pt-4 border-t border-gray-700/50">
${createResourceBar('GPU', used, total, 'Cards', percentage, 'bg-purple-400')}
${detailsHtml}
</div>
`;
}
// Render one labelled usage bar: "used / total unit" text above a track
// filled to `percentage` percent in `colorClass`. Resources with zero
// capacity return '' (hidden) — except when label is 'Pods'.
function createResourceBar(label, used, total, unit, percentage, colorClass) {
// Hide bar if total is 0, except for pods
if (total === 0 && label !== 'Pods') return '';
return `
<div>
<div class="flex justify-between mb-1">
<span class="text-sm font-medium text-gray-300">${label}</span>
<span class="text-sm font-medium text-gray-400">${used} / ${total} ${unit}</span>
</div>
<div class="w-full bg-gray-700/50 rounded-full h-2.5">
<div class="${colorClass} h-2.5 rounded-full" style="width: ${percentage}%"></div>
</div>
</div>
`;
}
// Show/hide the labels & conditions panel of the node card at `index`.
function toggleDetails(index) {
    document.getElementById(`details-${index}`).classList.toggle('hidden');
}
// --- INITIALIZATION ---
// Page bootstrap: render the static Lucide icons, wire the fetch button,
// and auto-load the endpoint saved by the last successful fetch (if any).
window.onload = () => {
lucide.createIcons();
fetchDataBtn.addEventListener('click', fetchAndRenderDashboard);
const savedUrl = localStorage.getItem('clusterApiUrl');
if (savedUrl) {
apiUrlInput.value = savedUrl;
fetchAndRenderDashboard();
}
};
</script>
</body>
</html>

406
frontend/other.html Normal file
View File

@ -0,0 +1,406 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OCDP Application Manager</title>
<script src="https://cdn.tailwindcss.com"></script>
<script src="https://unpkg.com/lucide@latest"></script>
<style>
/* Custom styles for glassmorphism and animations */
body {
font-family: 'Inter', sans-serif;
background: linear-gradient(135deg, #1a202c 0%, #2d3748 50%, #4a5568 100%);
background-size: 400% 400%;
animation: gradientBG 15s ease infinite;
}
@keyframes gradientBG {
0% { background-position: 0% 50%; }
50% { background-position: 100% 50%; }
100% { background-position: 0% 50%; }
}
.card {
background: rgba(31, 41, 55, 0.5);
backdrop-filter: blur(12px);
-webkit-backdrop-filter: blur(12px);
border-radius: 1rem;
border: 1px solid rgba(255, 255, 255, 0.1);
transition: all 0.3s ease;
}
.card:hover {
transform: translateY(-5px) scale(1.01);
box-shadow: 0 20px 25px -5px rgba(0, 0, 0, 0.2), 0 10px 10px -5px rgba(0, 0, 0, 0.1);
}
/* Tabs styling */
.tab-button.active {
background: rgba(55, 65, 81, 0.5);
color: #93c5fd;
border-bottom: 2px solid #3b82f6;
}
</style>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
</head>
<body class="bg-gray-900 text-gray-200">
<div class="container mx-auto p-4 md:p-8">
<header class="mb-8 flex flex-col sm:flex-row justify-between items-center gap-4">
<div class="flex items-center gap-4">
<i data-lucide="layout-dashboard" class="w-10 h-10 text-blue-400"></i>
<div>
<h1 class="text-3xl font-bold text-white">Application Manager</h1>
<p class="text-gray-400">Manage and deploy applications on your cluster.</p>
</div>
</div>
<div class="w-full sm:w-auto mt-4 sm:mt-0">
<input type="text" id="tokenInput" placeholder="Enter JWT Token" class="w-full sm:w-80 bg-gray-900/50 text-white border border-gray-600 rounded-lg px-4 py-2 focus:ring-2 focus:ring-blue-500 focus:border-blue-500 outline-none transition">
</div>
</header>
<main>
<div class="flex border-b border-gray-700/50 mb-6">
<button id="tabAvailable" class="tab-button px-6 py-3 font-semibold text-gray-400 hover:text-white transition active">Available Applications</button>
<button id="tabInstalled" class="tab-button px-6 py-3 font-semibold text-gray-400 hover:text-white transition">Installed Applications</button>
</div>
<div id="contentAvailable" class="tab-content">
<div id="availableAppsList" class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-6">
</div>
</div>
<div id="contentInstalled" class="tab-content hidden">
<div id="installedAppsList" class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-6">
</div>
</div>
</main>
</div>
<div id="installModal" class="fixed inset-0 bg-gray-900/50 backdrop-blur-md hidden justify-center items-center z-50">
<div class="card p-8 w-full max-w-lg">
<h3 class="text-2xl font-bold text-white mb-4">Install Application</h3>
<p id="modalAppName" class="text-gray-400 mb-4"></p>
<form id="installForm" class="space-y-4">
<div>
<label for="installMode" class="block text-sm font-medium text-gray-300">Deployment Mode</label>
<select id="installMode" class="mt-1 block w-full bg-gray-900/50 border border-gray-600 rounded-md shadow-sm p-2 text-white"></select>
</div>
<div>
<label for="userOverrides" class="block text-sm font-medium text-gray-300">User Overrides (JSON)</label>
<textarea id="userOverrides" rows="5" class="mt-1 block w-full bg-gray-900/50 border border-gray-600 rounded-md shadow-sm p-2 text-white"></textarea>
</div>
<div class="flex justify-end gap-3 pt-4">
<button type="button" id="cancelInstallBtn" class="bg-gray-600 hover:bg-gray-700 text-white font-bold py-2 px-4 rounded-lg transition">Cancel</button>
<button type="submit" id="submitInstallBtn" class="bg-blue-600 hover:bg-blue-700 text-white font-bold py-2 px-4 rounded-lg transition">Install</button>
</div>
</form>
</div>
</div>
<div id="statusModal" class="fixed inset-0 bg-gray-900/50 backdrop-blur-md hidden justify-center items-center z-50">
<div class="card p-8 w-full max-w-2xl">
<div class="flex justify-between items-center mb-4">
<h3 class="text-2xl font-bold text-white">Application Status</h3>
<button id="closeStatusModal" class="text-gray-400 hover:text-white"><i data-lucide="x" class="w-6 h-6"></i></button>
</div>
<div id="statusContent" class="space-y-4"></div>
</div>
</div>
<script>
// Base URL of the orchestration API; every endpoint below is relative to it.
const API_PREFIX = "http://localhost:8000/api/v1/orchestration";
// JWT used on authenticated calls. NOTE(review): no assignment from
// tokenInput is visible in this chunk — confirm it is wired up elsewhere.
let TOKEN = "";
// --- DOM Elements ---
const tokenInput = document.getElementById('tokenInput'); // JWT paste field in the header
const tabAvailable = document.getElementById('tabAvailable');
const tabInstalled = document.getElementById('tabInstalled');
const contentAvailable = document.getElementById('contentAvailable');
const contentInstalled = document.getElementById('contentInstalled');
const availableAppsList = document.getElementById('availableAppsList'); // card grid, "Available" tab
const installedAppsList = document.getElementById('installedAppsList'); // card grid, "Installed" tab
const installModal = document.getElementById('installModal');
const statusModal = document.getElementById('statusModal');
const closeStatusModal = document.getElementById('closeStatusModal');
const installForm = document.getElementById('installForm');
// NOTE(review): unused in this chunk — presumably driven by status polling defined later.
let activeRefreshInterval = null;
// --- Core Logic ---
// Load the catalog of installable application templates (no auth required)
// and render it into the "Available Applications" tab. Failures show an
// inline error card instead of throwing.
async function fetchAvailableApps() {
    try {
        const resp = await fetch(`${API_PREFIX}/application-templates`);
        if (!resp.ok) throw new Error("Failed to fetch available apps.");
        renderAvailableApps(await resp.json());
    } catch (err) {
        console.error("Error fetching available apps:", err);
        availableAppsList.innerHTML = `<p class="text-center text-red-400">Failed to load available applications. Please check your API server.</p>`;
    }
}
// Load the installed application instances (requires a JWT) and render
// them into the "Installed Applications" tab. Without a token, shows a
// prompt instead of calling the API.
async function fetchInstalledApps() {
    if (!TOKEN) {
        installedAppsList.innerHTML = `<p class="text-center text-yellow-400 col-span-full">Please enter a valid JWT token to view installed applications.</p>`;
        return;
    }
    try {
        const resp = await fetch(`${API_PREFIX}/application-instances`, {
            headers: { "Authorization": `Bearer ${TOKEN}` }
        });
        if (!resp.ok) throw new Error("Failed to fetch installed apps.");
        renderInstalledApps(await resp.json());
    } catch (err) {
        console.error("Error fetching installed apps:", err);
        installedAppsList.innerHTML = `<p class="text-center text-red-400">Failed to load installed applications. Token might be invalid or expired.</p>`;
    }
}
// Fetch live status for one application instance and render it inside the
// status modal. Silently no-ops when no token has been provided; fetch
// failures are written into the modal body.
async function fetchAndRenderStatus(namespace, app_template_name, mode) {
    if (!TOKEN) return;
    try {
        const resp = await fetch(`${API_PREFIX}/application-instances/${namespace}/${app_template_name}/status?mode=${mode}`, {
            headers: { "Authorization": `Bearer ${TOKEN}` }
        });
        if (!resp.ok) throw new Error("Failed to fetch status.");
        renderStatusDetails(await resp.json());
    } catch (err) {
        console.error("Error fetching status:", err);
        document.getElementById('statusContent').innerHTML = `<p class="text-red-400">Error: ${err.message}</p>`;
    }
}
// Uninstall the Helm release behind an installed application after a
// confirmation prompt, then refresh the installed-apps list. `mode`
// selects the deployment flavor (e.g. "distributed") on the API side.
async function uninstallRelease(namespace, app_template_name, mode) {
    if (!confirm(`Are you sure you want to uninstall the Helm release for ${app_template_name}?`)) return;
    try {
        const response = await fetch(`${API_PREFIX}/application-instances/${namespace}/${app_template_name}?mode=${mode}`, {
            method: 'DELETE',
            headers: { "Authorization": `Bearer ${TOKEN}` }
        });
        // Fix: guard before .json(). Error responses may not carry a JSON
        // body, and parsing them used to surface an opaque SyntaxError
        // instead of the HTTP status.
        if (!response.ok) throw new Error(`Uninstall failed with HTTP status ${response.status}`);
        const result = await response.json();
        alert(`Uninstall Release Result:\n${result.message}`);
        fetchInstalledApps(); // Refresh the list
    } catch (error) {
        console.error("Error uninstalling release:", error);
        alert(`Failed to uninstall release: ${error.message}`);
    }
}
// Permanently delete an entire namespace (and everything in it) after a
// strong confirmation prompt, then refresh the installed-apps list.
async function deleteNamespace(namespace) {
    if (!confirm(`WARNING: This will permanently delete the entire namespace '${namespace}' and all its resources.`)) return;
    try {
        const response = await fetch(`${API_PREFIX}/application-instances/${namespace}`, {
            method: 'DELETE',
            headers: { "Authorization": `Bearer ${TOKEN}` }
        });
        // Fix: guard before .json(). A non-2xx response may have no JSON
        // body, and parsing it used to surface an opaque SyntaxError
        // instead of the HTTP status.
        if (!response.ok) throw new Error(`Delete failed with HTTP status ${response.status}`);
        const result = await response.json();
        alert(`Delete Namespace Result:\n${result.message}`);
        fetchInstalledApps(); // Refresh the list
    } catch (error) {
        console.error("Error deleting namespace:", error);
        alert(`Failed to delete namespace: ${error.message}`);
    }
}
// --- Render Functions ---
// Render the "Available Applications" catalog cards. Each card's Install
// button opens the install modal pre-loaded with the template's metadata.
function renderAvailableApps(apps) {
availableAppsList.innerHTML = apps.map(app => {
// Bug fix: JSON.stringify emits double quotes, which previously terminated
// the double-quoted onclick attribute and produced broken markup. HTML-escape
// the serialized metadata (ampersands first, then quotes); the browser
// decodes the entities before evaluating the handler, so openInstallModal
// still receives the metadata object unchanged.
const metadataAttr = JSON.stringify(app.metadata).replace(/&/g, '&amp;').replace(/"/g, '&quot;');
return `
<div class="card p-6 flex flex-col justify-between">
<div>
<h3 class="text-xl font-bold text-white mb-2">${app.name}</h3>
<p class="text-sm text-gray-400">Business Name: ${app.metadata.application_name}</p>
<p class="text-sm text-gray-400">Chart: ${app.metadata.distributed.chart}</p>
</div>
<div class="mt-4">
<button onclick="openInstallModal('${app.name}', ${metadataAttr})" class="w-full bg-green-600 hover:bg-green-700 text-white font-bold py-2 px-4 rounded-lg flex items-center justify-center gap-2 transition">
<i data-lucide="plus-circle" class="w-4 h-4"></i> Install
</button>
</div>
</div>
`;
}).join('');
lucide.createIcons();
}
// Render one card per installed application instance, with buttons to view
// status, uninstall the Helm release (step 1), and delete the namespace (step 2).
// Status is colored green only for the Helm "deployed" state.
// NOTE(review): values are interpolated into HTML without escaping — assumes
// API-provided names are trusted; confirm.
function renderInstalledApps(apps) {
installedAppsList.innerHTML = apps.map(app => `
<div class="card p-6 flex flex-col justify-between">
<div>
<h3 class="text-xl font-bold text-white mb-2">${app.application_name}</h3>
<p class="text-sm text-gray-400">Namespace: <span class="text-blue-300 font-semibold">${app.namespace}</span></p>
<p class="text-sm text-gray-400">Release: ${app.release_name}</p>
<p class="text-sm text-gray-400">Status: <span class="font-semibold ${app.status === 'deployed' ? 'text-green-300' : 'text-yellow-300'}">${app.status}</span></p>
</div>
<div class="mt-4 flex gap-2">
<button onclick="viewStatusDetails('${app.namespace}', '${app.application_name}', 'distributed')" class="flex-1 bg-blue-600 hover:bg-blue-700 text-white font-bold py-2 px-2 rounded-lg transition text-sm">
View Status
</button>
<button onclick="uninstallRelease('${app.namespace}', '${app.application_name}', 'distributed')" class="flex-1 bg-red-600 hover:bg-red-700 text-white font-bold py-2 px-2 rounded-lg transition text-sm">
Uninstall
</button>
</div>
<div class="mt-2">
<button onclick="deleteNamespace('${app.namespace}')" class="w-full bg-red-800 hover:bg-red-900 text-white font-bold py-2 px-2 rounded-lg transition text-sm">
Delete Namespace
</button>
</div>
</div>
`).join('');
lucide.createIcons();
}
// Fill the status modal with an application's readiness summary and per-pod
// details, then show it. `base_access_url` is rendered only when present.
// Expects the ApplicationStatus shape returned by the status endpoint
// (application_name, namespace, is_ready, details[], optional base_access_url).
function renderStatusDetails(statusData) {
const content = document.getElementById('statusContent');
content.innerHTML = `
<p class="text-xl font-bold text-white">Application: ${statusData.application_name}</p>
<p class="text-gray-400">Namespace: ${statusData.namespace}</p>
<p class="text-gray-400">Ready: <span class="font-semibold ${statusData.is_ready ? 'text-green-300' : 'text-yellow-300'}">${statusData.is_ready}</span></p>
${statusData.base_access_url ? `<p class="text-gray-400">Access URL: <a href="${statusData.base_access_url}" target="_blank" class="text-blue-400 hover:underline">${statusData.base_access_url}</a></p>` : ''}
<h4 class="font-semibold text-white mt-4 mb-2">Pod Details</h4>
<ul class="space-y-2">
${statusData.details.map(pod => `
<li class="bg-gray-800/50 p-3 rounded-lg flex items-center justify-between">
<div>
<p class="font-bold text-white">${pod.pod_name}</p>
<p class="text-sm text-gray-400">Phase: ${pod.status_phase} | Ready: ${pod.ready_status}</p>
</div>
<span class="w-3 h-3 rounded-full ${pod.is_ready ? 'bg-green-400' : 'bg-red-400'}"></span>
</li>
`).join('')}
</ul>
`;
statusModal.classList.remove('hidden');
statusModal.classList.add('flex');
}
// --- Event Handlers & Modal Functions ---
// Populate and show the install dialog for the chosen application template.
function openInstallModal(appName, metadata) {
    document.getElementById('modalAppName').innerText = `Installing ${appName}`;

    // Offer only the deployment modes the template actually defines.
    const modeSelect = document.getElementById('installMode');
    modeSelect.innerHTML = '';
    [
        { name: 'distributed', data: metadata.distributed },
        { name: 'monolithic', data: metadata.monolithic }
    ].filter(candidate => candidate.data).forEach(candidate => {
        const opt = document.createElement('option');
        opt.value = candidate.name;
        opt.innerText = candidate.name;
        modeSelect.appendChild(opt);
    });

    installModal.classList.remove('hidden');
    installModal.classList.add('flex');
    // Remember which template the submit handler should install.
    installForm.dataset.appName = appName;
}
// Submit handler for the install dialog: validates the optional JSON
// overrides, POSTs the install request, and refreshes the installed list.
installForm.addEventListener('submit', async (e) => {
    e.preventDefault();

    const appName = installForm.dataset.appName;
    const mode = document.getElementById('installMode').value;
    const overridesText = document.getElementById('userOverrides').value;

    // Parse optional Helm value overrides; abort on malformed JSON.
    let userOverrides = {};
    if (overridesText) {
        try {
            userOverrides = JSON.parse(overridesText);
        } catch (parseErr) {
            alert("Invalid JSON for user overrides.");
            return;
        }
    }

    try {
        const payload = {
            app_template_name: appName,
            mode: mode,
            user_overrides: userOverrides
        };
        const response = await fetch(`${API_PREFIX}/application-instances`, {
            method: 'POST',
            headers: {
                "Content-Type": "application/json",
                "Authorization": `Bearer ${TOKEN}`
            },
            body: JSON.stringify(payload)
        });
        const result = await response.json();
        if (!response.ok) throw new Error(result.detail || 'Installation failed.');
        alert(`Installation started successfully:\nNamespace: ${result.namespace}\nMessage: ${result.message}`);
        installModal.classList.remove('flex');
        installModal.classList.add('hidden');
        fetchInstalledApps(); // Refresh installed list
    } catch (error) {
        console.error("Installation error:", error);
        alert(`Installation failed: ${error.message}`);
    }
});
// Close the install dialog without submitting.
document.getElementById('cancelInstallBtn').addEventListener('click', () => {
installModal.classList.remove('flex');
installModal.classList.add('hidden');
});
// Dismiss the status-details dialog.
closeStatusModal.addEventListener('click', () => {
statusModal.classList.remove('flex');
statusModal.classList.add('hidden');
});
// Show a loading placeholder immediately, then fetch and render the
// application's live status into the modal.
function viewStatusDetails(namespace, app_template_name, mode) {
    const content = document.getElementById('statusContent');
    content.innerHTML = `<p class="text-center text-blue-400">Loading status...</p>`;
    fetchAndRenderStatus(namespace, app_template_name, mode);
}
// --- Tab Switching Logic ---
// Activate the requested tab, show its panel, and (re)load its data.
function switchTab(tabId) {
    // Reset every tab button and hide every panel first.
    document.querySelectorAll('.tab-button').forEach(btn => btn.classList.remove('active'));
    document.querySelectorAll('.tab-content').forEach(panel => panel.classList.add('hidden'));

    // Dispatch table: tab id -> button, panel, and loader.
    const tabs = {
        tabAvailable: { button: tabAvailable, panel: contentAvailable, load: fetchAvailableApps },
        tabInstalled: { button: tabInstalled, panel: contentInstalled, load: fetchInstalledApps }
    };
    const entry = tabs[tabId];
    if (entry) {
        entry.button.classList.add('active');
        entry.panel.classList.remove('hidden');
        entry.load();
    }
    lucide.createIcons();
}
tabAvailable.addEventListener('click', () => switchTab('tabAvailable'));
tabInstalled.addEventListener('click', () => switchTab('tabInstalled'));
// --- Initialization ---
// Page bootstrap: restore a saved token, pick the initial tab, and keep the
// token input synced with localStorage.
window.onload = () => {
    const storedToken = localStorage.getItem('jwtToken');
    if (storedToken) {
        tokenInput.value = storedToken;
        TOKEN = storedToken;
        switchTab('tabInstalled');
    } else {
        switchTab('tabAvailable');
    }

    // Persist the token as the user types; once a token exists, refresh
    // whichever tab is currently active so authorized data loads.
    tokenInput.addEventListener('input', (e) => {
        TOKEN = e.target.value;
        localStorage.setItem('jwtToken', TOKEN);
        if (!TOKEN) return;
        const activeTab = document.querySelector('.tab-button.active');
        if (activeTab) switchTab(activeTab.id);
    });

    lucide.createIcons();
};
</script>
</body>
</html>

View File

@ -0,0 +1,147 @@
# A generic, single database configuration.
[alembic]
# path to migration scripts.
# this is typically a path given in POSIX (e.g. forward slashes)
# format, relative to the token %(here)s which refers to the location of this
# ini file
script_location = %(here)s/alembic
# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
# for all available tokens
# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory. for multiple paths, the path separator
# is defined by "path_separator" below.
prepend_sys_path = .
# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the python>=3.9 or backports.zoneinfo library and tzdata library.
# Any required deps can installed by adding `alembic[tz]` to the pip requirements
# string value is passed to ZoneInfo()
# leave blank for localtime
# timezone =
# max length of characters to apply to the "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; This defaults
# to <script_location>/versions. When using multiple version
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "path_separator"
# below.
# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions
# path_separator; This indicates what character is used to split lists of file
# paths, including version_locations and prepend_sys_path within configparser
# files such as alembic.ini.
# The default rendered in new alembic.ini files is "os", which uses os.pathsep
# to provide os-dependent path splitting.
#
# Note that in order to support legacy alembic.ini files, this default does NOT
# take place if path_separator is not present in alembic.ini. If this
# option is omitted entirely, fallback logic is as follows:
#
# 1. Parsing of the version_locations option falls back to using the legacy
# "version_path_separator" key, which if absent then falls back to the legacy
# behavior of splitting on spaces and/or commas.
# 2. Parsing of the prepend_sys_path option falls back to the legacy
# behavior of splitting on spaces, commas, or colons.
#
# Valid values for path_separator are:
#
# path_separator = :
# path_separator = ;
# path_separator = space
# path_separator = newline
#
# Use os.pathsep. Default configuration used for new projects.
path_separator = os
# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
# database URL. This is consumed by the user-maintained env.py script only.
# other means of configuring database URLs may be customized within the env.py
# file.
# SECURITY NOTE: credentials are committed in plain text here. Prefer reading
# the password from an environment variable (as .env.template and
# config.yaml.template do for the application) and injecting it in env.py.
sqlalchemy.url = mysql+pymysql://root:GDIP%%40ssw0rd@localhost:3306/ocdp
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME
# lint with attempts to fix using "ruff" - use the module runner, against the "ruff" module
# hooks = ruff
# ruff.type = module
# ruff.module = ruff
# ruff.options = check --fix REVISION_SCRIPT_FILENAME
# Alternatively, use the exec runner to execute a binary found on your PATH
# hooks = ruff
# ruff.type = exec
# ruff.executable = ruff
# ruff.options = check --fix REVISION_SCRIPT_FILENAME
# Logging configuration. This is also consumed by the user-maintained
# env.py script only.
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARNING
handlers = console
qualname =
[logger_sqlalchemy]
level = WARNING
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

1
migration/alembic/README Normal file
View File

@ -0,0 +1 @@
Generic single-database configuration.

81
migration/alembic/env.py Normal file
View File

@ -0,0 +1,81 @@
from logging.config import fileConfig
from sqlalchemy import engine_from_config
from sqlalchemy import pool
from alembic import context

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# Import the project's declarative Base and all model modules so every table
# is registered on Base.metadata before autogenerate inspects it.
# (The wildcard import is deliberate: importing the package registers models
# as a side effect.)
from ocdp.database import Base
from ocdp.models import *
target_metadata = Base.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well. By skipping the Engine creation
    we don't even need a DBAPI to be available.

    Calls to context.execute() here emit the given string to the
    script output.
    """
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
        url=url,
        target_metadata=target_metadata,
        literal_binds=True,  # render bound parameters inline in the emitted SQL
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()
def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    In this scenario we need to create an Engine
    and associate a connection with the context.
    """
    # Build the engine from the [alembic] section of alembic.ini
    # (keys prefixed "sqlalchemy."); NullPool avoids holding connections open
    # beyond the migration run.
    connectable = engine_from_config(
        config.get_section(config.config_ini_section, {}),
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with connectable.connect() as connection:
        context.configure(
            connection=connection, target_metadata=target_metadata
        )

        with context.begin_transaction():
            context.run_migrations()


# Alembic executes this module directly; dispatch on the requested mode.
if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()

View File

@ -0,0 +1,28 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
down_revision: Union[str, Sequence[str], None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
def upgrade() -> None:
"""Upgrade schema."""
${upgrades if upgrades else "pass"}
def downgrade() -> None:
"""Downgrade schema."""
${downgrades if downgrades else "pass"}

View File

@ -0,0 +1,49 @@
"""create initial tables
Revision ID: 796b67d23c1c
Revises:
Create Date: 2025-08-23 16:05:51.420713
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision: str = '796b67d23c1c'
down_revision: Union[str, Sequence[str], None] = None  # first migration in the chain
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Upgrade schema: create the `users` table and its lookup indexes.

    NOTE: once this migration has been applied anywhere, do not edit it —
    add a new revision instead.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('users',
        sa.Column('user_id', sa.Integer(), autoincrement=True, nullable=False),
        sa.Column('username', sa.String(length=64), nullable=False),
        sa.Column('email', sa.String(length=128), nullable=False),
        sa.Column('hashed_password', sa.String(length=128), nullable=False),
        sa.Column('is_active', sa.Boolean(), nullable=False),
        sa.Column('is_admin', sa.Boolean(), nullable=False),
        sa.Column('created_at', sa.TIMESTAMP(timezone=True), server_default=sa.text('now()'), nullable=False),
        sa.Column('updated_at', sa.TIMESTAMP(timezone=True), server_default=sa.text('now()'), nullable=False),
        sa.Column('last_login_at', sa.TIMESTAMP(timezone=True), nullable=True),
        sa.PrimaryKeyConstraint('user_id')
    )
    # Unique lookups by email and username; plain secondary index on the PK column.
    op.create_index(op.f('ix_users_email'), 'users', ['email'], unique=True)
    op.create_index(op.f('ix_users_user_id'), 'users', ['user_id'], unique=False)
    op.create_index(op.f('ix_users_username'), 'users', ['username'], unique=True)
    # ### end Alembic commands ###
def downgrade() -> None:
    """Downgrade schema: drop the `users` table and its indexes (reverse of upgrade)."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Indexes are dropped before the table, mirroring upgrade() in reverse order.
    op.drop_index(op.f('ix_users_username'), table_name='users')
    op.drop_index(op.f('ix_users_user_id'), table_name='users')
    op.drop_index(op.f('ix_users_email'), table_name='users')
    op.drop_table('users')
    # ### end Alembic commands ###

5
ocdp/__init__.py Normal file
View File

@ -0,0 +1,5 @@
# env
# Load variables from a local .env file (DB password, JWT secret, admin
# credentials) BEFORE any submodule reads os.environ at import time.
from dotenv import load_dotenv
load_dotenv()
# Importing the logger package configures application-wide logging as a side
# effect of package import.
from . import logger

30
ocdp/__main__.py Normal file
View File

@ -0,0 +1,30 @@
"""Application entry point: `python -m ocdp` starts the API server."""
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import uvicorn

from ocdp.controllers.v1 import router as api_v1_router

app = FastAPI(title="One Click Deployment API")
app.include_router(api_v1_router, tags=["v1"])
# app.include_router(api_v2_router, tags=["v2"])

# Allowed CORS origins (e.g. the frontend at http://localhost:3000).
# NOTE(review): wildcard origins together with allow_credentials=True is not
# honored by browsers/Starlette for credentialed requests — lock this down to
# concrete frontend origins in production.
origins = [
    "*",  # allow all origins; restrict to specific domains in production
    # "http://localhost:3000",
    # "https://yourdomain.com",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,       # origins allowed to call the API
    allow_credentials=True,      # allow cookies / Authorization headers
    allow_methods=["*"],         # allow all HTTP methods
    allow_headers=["*"],         # allow all request headers
)

if __name__ == "__main__":
    # Guard so importing ocdp.__main__ (e.g. by tooling) does not start the server.
    uvicorn.run(app, host="0.0.0.0", port=8000)

137
ocdp/config.py Normal file
View File

@ -0,0 +1,137 @@
import os
import yaml
from dataclasses import dataclass
# ----------------------------
# 1. Configuration data structures (mirror config.yaml plus env-provided secrets)
# ----------------------------
@dataclass
class HashConfig:
    """Password-hashing settings."""
    algorithm: str  # Password hashing algorithm
@dataclass
class PasswordConfig:
    """Password section of the configuration."""
    hash: HashConfig
@dataclass
class KubeConfig:
    """Kubernetes client and application-layout paths."""
    kubectl_file: str  # Path to kubectl config file
    applications_dir: str  # Directory to store Kubernetes applications
    logs_dir: str  # Directory for logs
@dataclass
class OrchestrationConfig:
    """Orchestration section of the configuration."""
    kube: KubeConfig
@dataclass
class LokiConfig:
    """Loki log-shipping settings."""
    url: str  # Loki push endpoint
    labels: str  # Default labels for logs
    label_keys: str  # Optional: comma-separated label keys
@dataclass
class LoggerConfig:
    """Logger section of the configuration."""
    loki: LokiConfig
@dataclass
class MySQLConfig:
    """MySQL connection settings; password is injected from the environment."""
    host: str  # MySQL host
    port: int  # MySQL port
    db_name: str  # Database db_name
    username: str  # Database username
    password: str  # Password read from environment variable
@dataclass
class DatabaseConfig:
    """Database section of the configuration."""
    mysql: MySQLConfig
@dataclass
class JWTConfig:
    """JWT signing settings; secret key is injected from the environment."""
    signing_algorithm: str  # JWT signing algorithm
    secret_key: str  # Secret key read from environment variable
@dataclass
class TokenConfig:
    """Token section of the configuration."""
    jwt: JWTConfig
@dataclass
class AdminConfig:
    """Bootstrap admin credentials, taken from environment variables."""
    username: str
    password: str
@dataclass
class Config:
    """Top-level application configuration aggregate."""
    orchestration: OrchestrationConfig
    logger: LoggerConfig
    database: DatabaseConfig
    password: PasswordConfig
    token: TokenConfig
    admin: AdminConfig
# ----------------------------
# 2. Load YAML configuration and environment variables
# ----------------------------
def load_config(yaml_path: str) -> Config:
    """Build a Config from a YAML file plus secrets taken from the environment.

    Secrets (database password, JWT secret key, admin credentials) are never
    read from YAML; they must be provided as environment variables.

    Args:
        yaml_path: Path to the config YAML file (already expanduser'd).

    Raises:
        ValueError: if a required environment variable is missing.
    """
    with open(yaml_path, "r") as f:
        raw = yaml.safe_load(f)

    # orchestration.kube
    kube_cfg = KubeConfig(**raw["orchestration"]["kube"])
    orchestration_cfg = OrchestrationConfig(kube=kube_cfg)

    # logger.loki
    loki_cfg = LokiConfig(**raw["logger"]["loki"])
    logger_cfg = LoggerConfig(loki=loki_cfg)

    # database.mysql -- password comes from the environment, not YAML
    mysql_raw = raw["database"]["mysql"]
    mysql_password = os.environ.get("DATABASE_MYSQL_PASSWORD")
    if not mysql_password:
        raise ValueError("Environment variable DATABASE_MYSQL_PASSWORD not set")
    mysql_cfg = MySQLConfig(**mysql_raw, password=mysql_password)
    database_cfg = DatabaseConfig(mysql=mysql_cfg)

    # password.hash
    hash_cfg = HashConfig(**raw["password"]["hash"])
    password_cfg = PasswordConfig(hash=hash_cfg)

    # token.jwt -- secret comes from the environment, not YAML
    jwt_raw = raw["token"]["jwt"]
    jwt_secret_key = os.environ.get("TOKEN_JWT_SECRET_KEY")
    if not jwt_secret_key:
        raise ValueError("Environment variable TOKEN_JWT_SECRET_KEY not set")
    jwt_cfg = JWTConfig(**jwt_raw, secret_key=jwt_secret_key)
    token_cfg = TokenConfig(jwt=jwt_cfg)

    # admin -- FIX: validate like the other required secrets instead of
    # silently storing None, which would only fail much later at use time.
    admin_username = os.environ.get("ADMIN_USERNAME")
    admin_password = os.environ.get("ADMIN_PASSWORD")
    if not admin_username or not admin_password:
        raise ValueError("Environment variables ADMIN_USERNAME / ADMIN_PASSWORD not set")
    admin_cfg = AdminConfig(username=admin_username, password=admin_password)

    # Return final Config object
    return Config(
        orchestration=orchestration_cfg,
        logger=logger_cfg,
        database=database_cfg,
        password=password_cfg,
        token=token_cfg,
        admin=admin_cfg,
    )
# ----------------------------
# 3. Module-level configuration loading
# ----------------------------
from dotenv import load_dotenv
load_dotenv()  # Load environment variables from .env file

_config_path = os.environ.get("OCDP_CONFIG_FILE")
if not _config_path:
    # FIX: fail fast with a clear message instead of letting
    # os.path.expanduser(None) raise a TypeError.
    raise ValueError("Environment variable OCDP_CONFIG_FILE not set")
# load_config() either returns a Config or raises, so no None-check is needed.
CONFIG = load_config(os.path.expanduser(_config_path))

# Log only non-sensitive settings.
# SECURITY FIX: the previous version printed the MySQL password and the JWT
# secret key to stdout, leaking secrets into console output and log files.
print("Kube config path:", CONFIG.orchestration.kube.kubectl_file)
print("Password hash algorithm:", CONFIG.password.hash.algorithm)
print("Loki URL:", CONFIG.logger.loki.url)
print("Admin username:", CONFIG.admin.username)

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,20 @@
from fastapi import APIRouter
from .user import router as user_router
from .auth import router as auth_router
from .orchestration import router as orchestration_router

# Aggregate all v1 sub-routers under the common /api/v1 prefix.
router = APIRouter(prefix="/api/v1")
router.include_router(user_router)
router.include_router(auth_router)
router.include_router(orchestration_router)

View File

@ -0,0 +1,10 @@
from fastapi import APIRouter
from . import (
    login
)

# Authentication endpoints (currently only login) mounted under /auth.
router = APIRouter(prefix="/auth")
router.include_router(login.router)

View File

@ -0,0 +1,60 @@
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
from sqlalchemy.orm import Session
from ocdp.database import get_db
from ocdp.services.user import user_service
# Create the API router for the login endpoint.
router = APIRouter()


# Request model for the login endpoint.
class LoginRequest(BaseModel):
    """
    Request body for user login.
    """
    username: str
    password: str
# Response model returned after a successful login.
class LoginResponse(BaseModel):
    """
    Response body returned after a successful login.
    """
    access_token: str  # signed access token issued by the user service
    token_type: str = "bearer"  # OAuth2 bearer scheme
# (The router instance is created above.)
# Login route: exchange a username/password pair for an access token.
@router.post("/login", response_model=LoginResponse)
def login(
    login_in: LoginRequest,
    db: Session = Depends(get_db)
):
    """
    Obtain an access token using a username and password.

    Raises:
        HTTPException 401: if authentication fails.
    """
    # Delegate authentication and token creation to the service layer.
    token = user_service.login_for_access_token(
        username=login_in.username,
        password=login_in.password,
        db=db
    )
    # An empty/None token from the service means authentication failed.
    if not token:
        # 401 Unauthorized, with the WWW-Authenticate header per RFC 6750.
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid username or password",
            headers={"WWW-Authenticate": "Bearer"},
        )
    # Success: LoginResponse supplies token_type="bearer" by default.
    return {"access_token": token}

View File

@ -0,0 +1,11 @@
from fastapi import APIRouter
from .cluster import router as cluster_router
from .application_controller import router as application_router

# Orchestration endpoints (cluster + application) mounted under /orchestration.
router = APIRouter(prefix="/orchestration")
router.include_router(cluster_router)
router.include_router(application_router)

View File

@ -0,0 +1,7 @@
from fastapi import APIRouter

# Placeholder router for application endpoints; no routes are registered yet.
router = APIRouter(prefix="/application")

View File

@ -0,0 +1,30 @@
# list_application_template.py
"""
Controller for Application Templates.
"""
from pydantic import BaseModel, Field
from fastapi import APIRouter, Depends
from cluster_tool import Cluster, get_cluster
from services import application_service
from models.application import ApplicationMetadata
# --- FastAPI Router ---
router = APIRouter(
prefix="/application-templates",
)
@router.get("/", response_model=list[ApplicationTemplate], summary="获取所有可安装的应用模板")
def list_application_templates(cluster: Cluster = Depends(get_cluster)):
"""列出在 `applications_dir` 中所有可供安装的应用及其元数据。"""
return application_service.list_available_applications(cluster)

View File

@ -0,0 +1,87 @@
# ocdp/controllers/application_instances.py
"""
Controller for Application Instances.
(Authentication updated to use OAuth2 Password Bearer flow)
"""
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi.security import OAuth2PasswordBearer
from sqlalchemy.orm import Session
from ocdp.orchestration.cluster import Cluster, get_cluster
from ocdp.services.orchestration import application_service
from ocdp.models.orchestration.application import (
InstalledApplicationInstance, InstallReceipt,
UninstallReceipt, NamespaceDeleteReceipt, ApplicationStatus, ApplicationTemplate
)
# 假设的依赖和 Service 函数导入路径
from ocdp.database import get_db
from ocdp.services.user import user_service
# ----------------
from pydantic import BaseModel
from typing import Optional, Dict, Any
class InstallRequest(BaseModel):
    """Request body for installing a new application instance."""
    app_template_name: str  # name of the template to install
    mode: str  # deployment mode, presumably "distributed" or "monolithic" — confirm against templates
    user_overrides: Optional[Dict[str, Any]] = None  # user-supplied value overrides for the install
# --- FastAPI Router ---
router = APIRouter()

# tokenUrl points at the login endpoint so interactive docs can obtain a token.
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/auth/login")


def get_current_user_id_dependency(token: str = Depends(oauth2_scheme)):
    """
    Dependency that validates the bearer token via user_service and returns
    the authenticated user's id.
    """
    # Delegate token verification/decoding to the user service.
    return user_service.get_user_id_by_token(token)
@router.get("/application-templates", response_model=list[ApplicationTemplate], summary="列出所有可用的应用模板")
def list_application_templates(
cluster: Cluster = Depends(get_cluster)
):
"""
获取系统中所有可用的应用模板列表。
"""
return application_service.list_available_applications(cluster)
@router.get("/application-instances", response_model=list[InstalledApplicationInstance], summary="列出当前用户已安装的应用实例")
def list_application_instances(
# 使用新的依赖函数
user_id: str = Depends(get_current_user_id_dependency),
cluster: Cluster = Depends(get_cluster)
):
# 这里直接使用 user_id业务逻辑保持不变
return application_service.list_user_applications(cluster, user_id)
@router.post("/application-instances", response_model=InstallReceipt, status_code=status.HTTP_202_ACCEPTED, summary="安装一个新的应用实例")
def install_application_instance(
request: InstallRequest,
user_id: str = Depends(get_current_user_id_dependency),
cluster: Cluster = Depends(get_cluster)
):
try:
return application_service.install_new_application(
cluster=cluster, user_id=user_id, app_template_name=request.app_template_name,
mode=request.mode, user_overrides=request.user_overrides
)
except (ValueError, FileNotFoundError) as e:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(e))
except Exception as e:
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An unexpected error occurred: {e}")
# ... (the status / uninstall / delete endpoints below do not depend on user_id) ...
# Status of a single instance; `mode` is a required query parameter.
@router.get("/application-instances/{namespace}/{app_template_name}/status", response_model=ApplicationStatus, summary="获取指定应用实例的状态")
def get_application_instance_status(namespace: str, app_template_name: str, mode: str, cluster: Cluster = Depends(get_cluster)):
    """Return the runtime status of one installed application instance."""
    return application_service.get_instance_status(cluster, namespace, app_template_name, mode)


# Teardown step 1: uninstall the release, leaving the namespace in place.
@router.delete("/application-instances/{namespace}/{app_template_name}", response_model=UninstallReceipt, summary="步骤1卸载应用实例的 Release")
def uninstall_instance_release(namespace: str, app_template_name: str, mode: str, cluster: Cluster = Depends(get_cluster)):
    """Uninstall the application's release (teardown step 1)."""
    return application_service.uninstall_application_release(cluster, namespace, app_template_name, mode)


# Teardown step 2: delete the namespace and everything remaining inside it.
@router.delete("/application-instances/{namespace}", response_model=NamespaceDeleteReceipt, summary="步骤2删除应用实例的命名空间")
def delete_instance_namespace(namespace: str, cluster: Cluster = Depends(get_cluster)):
    """Delete the application's namespace (teardown step 2)."""
    return application_service.delete_application_namespace(cluster, namespace)

View File

@ -0,0 +1,8 @@
from fastapi import APIRouter
from .get_cluster_status import router as get_cluster_status_router

# Cluster-level endpoints mounted under /cluster.
router = APIRouter(prefix="/cluster")
router.include_router(get_cluster_status_router)

View File

@ -0,0 +1,24 @@
# get_cluster_status.py
from fastapi import APIRouter, Depends
from ocdp.orchestration.cluster import Cluster, get_cluster
from ocdp.services.orchestration import cluster_service
from ocdp.models.orchestration.cluster import ClusterStatus
router = APIRouter()


# Helper used with Depends so the endpoint receives a ready-made status object.
def get_status_from_service(cluster: Cluster = Depends(get_cluster)) -> ClusterStatus:
    """Fetch the full cluster status from the service layer (for use in Depends)."""
    return cluster_service.get_cluster_status(cluster)


@router.get(
    "/cluster-status",
    response_model=ClusterStatus,
    summary="Get Comprehensive Cluster Status"
)
def get_comprehensive_cluster_status(status: ClusterStatus = Depends(get_status_from_service)):
    """
    Provides a complete snapshot of the cluster's health and resources.
    """
    return status

View File

@ -0,0 +1,34 @@
from pydantic import BaseModel
from fastapi import APIRouter, Depends, HTTPException
from ocdp.orchestration import Cluster
# 假设的依赖和 Service 函数导入路径
from ocdp.orchestration import get_cluster
from ocdp.services.orchestration import node_service
# --- Response Models for this endpoint ---
class NodeHealthStatus(BaseModel):
    """Health summary for one cluster node."""
    is_ready: bool  # True when the node is Ready
    pressures: dict[str, bool]  # pressure condition name -> True if pressure exists

# Mapping of node name -> health status.
HealthReportResponse = dict[str, NodeHealthStatus]

# --- Router Definition ---
router = APIRouter()


@router.get(
    "/health",
    response_model=HealthReportResponse,
    # summary="Get cluster node health status"
)
def get_health(cluster: Cluster = Depends(get_cluster)):
    """
    Return a health report for every node in the cluster.

    - **is_ready**: whether the node is Ready.
    - **pressures**: per-condition pressure flags; `true` means pressure exists.
    """
    try:
        return node_service.get_cluster_health_report(cluster)
    except RuntimeError as e:
        # Surface service-layer failures as HTTP 500 with the error message.
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,39 @@
from pydantic import BaseModel
from fastapi import APIRouter, Depends, HTTPException
from ocdp.orchestration import Cluster, get_cluster
from ocdp.services.orchestration import node_service
# --- Response Models for this endpoint ---
class ResourceDetail(BaseModel):
    """Aggregate amounts for one resource type (human-readable strings)."""
    total: str
    used: str  # renamed from 'used_by_system' to match the service layer
    free: str  # renamed from 'free_for_pods' to match the service layer

class GPUSummary(BaseModel):
    """Cluster-wide GPU totals."""
    total_count: int
    allocatable_count: int
    models_summary: dict[str, int]  # GPU model name -> count

class ClusterSummaryResponse(BaseModel):
    """Aggregated resource summary for the whole cluster."""
    cpu: ResourceDetail
    memory: ResourceDetail
    storage: ResourceDetail
    gpu: GPUSummary

# --- Router Definition ---
router = APIRouter()


@router.get(
    "/summary/resources",
    response_model=ClusterSummaryResponse,
    # summary="Get aggregated cluster resources"
)
def get_summary_resources(cluster: Cluster = Depends(get_cluster)):
    """
    Return an aggregated resource summary for the entire cluster.
    """
    try:
        return node_service.get_cluster_summary_report(cluster)
    except RuntimeError as e:
        # Surface service-layer failures as HTTP 500 with the error message.
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,44 @@
from pydantic import BaseModel
from fastapi import APIRouter, Depends, HTTPException
from ocdp.orchestration import Cluster, get_cluster
from ocdp.services.orchestration import node_service
# --- Response Models for this endpoint ---
class ResourceDetail(BaseModel):
    """Per-node amounts for one resource type (human-readable strings)."""
    total: str
    used: str  # renamed from 'used_by_system' to match the service layer
    free: str  # renamed from 'free_for_pods' to match the service layer

class GPUInfo(BaseModel):
    """GPU details for one node."""
    count: int
    allocatable_count: int  # added to match the allocatable GPU count returned by the service
    model: str
    memory_mb: int

class NodeResourceDetail(BaseModel):
    """Complete per-node resource report."""
    cpu: ResourceDetail
    memory: ResourceDetail
    storage: ResourceDetail
    gpu: GPUInfo

# Mapping of node name -> resource detail.
NodeResourcesResponse = dict[str, NodeResourceDetail]

# --- Router Definition ---
router = APIRouter()


@router.get(
    "/nodes/resources",
    response_model=NodeResourcesResponse,
    # summary="Get per-node resource details"
)
def list_nodes_resources(cluster: Cluster = Depends(get_cluster)):
    """
    Return a detailed resource-usage report for every node in the cluster.

    - **used**: resources consumed/reserved (formerly `used_by_system`).
    - **free**: resources left for pod scheduling (formerly `free_for_pods`).
    """
    try:
        return node_service.get_per_node_resource_report(cluster)
    except RuntimeError as e:
        # Surface service-layer failures as HTTP 500 with the error message.
        raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,15 @@
from fastapi import APIRouter
from . import (
    register_user,
    get_current_user
)

# User endpoints (registration, current-user lookup) mounted under /users.
router = APIRouter(prefix="/users")
router.include_router(register_user.router)
router.include_router(get_current_user.router)

View File

@ -0,0 +1,39 @@
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
from pydantic import BaseModel, constr, EmailStr, validator
from sqlalchemy.orm import Session
from ocdp.database import get_db
from ocdp.services.user import user_service
from ocdp.services.user import user_exceptions
router = APIRouter()


class GetCurrentUserResponse(BaseModel):
    """
    Response body describing the currently authenticated user.
    """
    user_id: int
    username: str
    email: EmailStr

    class Config:
        # Pydantic V2 style: build the model from ORM object attributes.
        from_attributes = True
        # Pydantic V1 equivalent:
        # orm_mode = True


# Extracts the bearer token; tokenUrl points at the login endpoint.
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/auth/login")


@router.get("/me", response_model=GetCurrentUserResponse)
def get_current_user(
    db: Session = Depends(get_db),
    token: str = Depends(oauth2_scheme)
):
    """
    Return the profile of the currently authenticated user.

    Raises:
        HTTPException 401: if the token is missing, invalid, or does not
        resolve to a user.
    """
    # Resolve the token to a user via the service layer.
    current_user = user_service.get_current_user(token, db)
    if not current_user:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid authentication credentials")
    return current_user

View File

@ -0,0 +1,79 @@
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, constr, EmailStr, validator
from sqlalchemy.orm import Session
from ocdp.database import get_db
from ocdp.services.user import user_service
from ocdp.services.user import user_exceptions
# 创建一个 API 路由器
router = APIRouter()
# Characters permitted in passwords: ASCII letters, digits and a fixed symbol set.
ALLOWED_PASSWORD_CHARS = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@#$%&*_-")
class RegisterUserRequest(BaseModel):
    """
    Request body for user registration.
    """
    username: str
    password: str
    email: EmailStr
    # NOTE(review): `@validator` is the Pydantic V1 API while sibling models use
    # the V2 `from_attributes` config — confirm which Pydantic major version is pinned.
    @validator('password')
    def validate_password(cls, v):
        # Policy: 8-32 chars, must contain a letter and a digit, no whitespace,
        # only characters from ALLOWED_PASSWORD_CHARS.
        if len(v) < 8 or len(v) > 32:
            raise ValueError('密码长度应在8~32位')
        if not any(c.isalpha() for c in v):
            raise ValueError('密码必须包含字母')
        if not any(c.isdigit() for c in v):
            raise ValueError('密码必须包含数字')
        if any(c.isspace() for c in v):
            raise ValueError('密码不能包含空格')
        if any(c not in ALLOWED_PASSWORD_CHARS for c in v):
            raise ValueError('密码包含非法字符')
        return v
class RegisterUserResponse(BaseModel):
    """
    User information returned after successful registration.

    Deliberately excludes sensitive data such as the password.
    """
    id: int
    username: str
    email: EmailStr
    class Config:
        # Pydantic V2 style: allow building the model from ORM attributes.
        from_attributes = True
        # Pydantic V1 equivalent:
        # orm_mode = True
@router.post("/", response_model=RegisterUserResponse, status_code=status.HTTP_201_CREATED)
def register_user(
    user_in: RegisterUserRequest,
    db: Session = Depends(get_db)
):
    """
    Register a new user.

    - **username**: unique user name.
    - **email**: unique e-mail address.
    - **password**: plaintext password (validated by the request model).

    Responds 400 when the username or e-mail is already taken.
    """
    try:
        # Delegate user creation (hashing, persistence) to the service layer.
        created_user = user_service.create_user(
            username=user_in.username,
            password=user_in.password,
            email=user_in.email,
            db=db
        )
        return created_user
    except user_exceptions.UserAlreadyExistsError as e:
        # Duplicate username/email: answer 400 with a clear message.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(e),
        )

View File

@ -0,0 +1,15 @@
from fastapi import APIRouter
# Placeholder v2 API router; no routes registered yet.
router = APIRouter(prefix="/api/v2")

2
ocdp/daos/__init__.py Normal file
View File

@ -0,0 +1,2 @@
from .user import user_dao

View File

View File

@ -0,0 +1,304 @@
# dao.py
import os
import yaml
import json
import time
from pydantic import ValidationError
from ocdp.orchestration.cluster import Cluster
from ocdp.models.orchestration.application import (ApplicationTemplate, ApplicationMetadata, InstallReceipt,
ApplicationStatus, UninstallReceipt, PodStatusDetail,
InstalledApplicationInstance, NamespaceDeleteReceipt) # <-- 导入更新后的模型
# ... (list_application_templates, list_application_instances, _deep_merge 保持不变) ...
def list_application_templates(cluster: Cluster) -> list[ApplicationTemplate]:
    """Return an ApplicationTemplate for each application directory on the cluster.

    Directories whose metadata cannot be loaded or validated are skipped,
    with a warning printed to stdout.
    """
    templates: list[ApplicationTemplate] = []
    for app_dir in cluster.list_applications():
        try:
            raw_metadata = cluster.get_application_metadata(app_dir)
            templates.append(
                ApplicationTemplate(name=app_dir, metadata=ApplicationMetadata(**raw_metadata))
            )
        except Exception as e:
            print(f"⚠️ Warning: Could not load or validate metadata for '{app_dir}': {e}")
    return templates
def list_application_instances(cluster: Cluster, user_id: str) -> list[InstalledApplicationInstance]:
    """List Helm releases installed in namespaces belonging to *user_id*.

    Namespaces are matched by the "<user_id>-" prefix; the application name is
    recovered from the second dash-separated namespace segment.
    Returns [] (after printing the error) on kubectl/helm or JSON failures.
    """
    prefix = f"{user_id}-";
    try:
        # Collect all namespaces named "<user_id>-...".
        ns_json_str = cluster.get("namespaces"); all_ns_data = json.loads(ns_json_str).get("items", [])
        user_namespaces = [ns['metadata']['name'] for ns in all_ns_data if ns['metadata']['name'].startswith(prefix)]
        if not user_namespaces: return []
        releases_json_str = cluster.list_releases(all_namespaces=True, output="json"); all_releases = json.loads(releases_json_str)
        instances = []
        for rel in all_releases:
            if rel.get("namespace") in user_namespaces:
                # Namespace layout assumed "<user>-<app>-<suffix>" — TODO confirm.
                ns_parts = rel.get("namespace").split('-'); app_name = ns_parts[1] if len(ns_parts) > 2 else "unknown"
                instances.append(InstalledApplicationInstance(
                    application_name=app_name, release_name=rel.get("name"), namespace=rel.get("namespace"), chart=rel.get("chart"), status=rel.get("status")
                ))
        return instances
    except (RuntimeError, json.JSONDecodeError) as e:
        print(f"❌ Error listing application instances: {e}"); return []
def _deep_merge(source: dict, destination: dict) -> dict:
for key, value in source.items():
if isinstance(value, dict) and key in destination and isinstance(destination[key], dict):
destination[key] = _deep_merge(value, destination[key])
else:
destination[key] = value
return destination
def install_application(
    cluster,
    namespace,
    app_template_name,
    mode,
    user_overrides=None
) -> InstallReceipt:
    """Install an application template into *namespace* via Helm.

    Parameters
    ----------
    cluster: orchestration cluster wrapper (metadata lookup + helm calls).
    namespace: target namespace (created if missing).
    app_template_name: directory name of the application template.
    mode: deployment-mode attribute on the metadata (e.g. "monolithic").
    user_overrides: optional dict of values merged over the mode's defaults
        (user values win, via _deep_merge).

    Raises
    ------
    ValueError: if *mode* is not defined in the template metadata.
    """
    import tempfile  # local import: only needed for the transient values file

    metadata = cluster.get_application_metadata(app_template_name)
    print(f"Metadata for '{app_template_name}': {metadata}")
    app_meta = ApplicationMetadata(**metadata)
    deployment_mode = getattr(app_meta, mode, None)
    if not deployment_mode:
        raise ValueError(f"Mode '{mode}' not found.")
    release_name = deployment_mode.release_name
    chart_source = deployment_mode.chart
    values_to_set = deployment_mode.sets
    if user_overrides:
        values_to_set = _deep_merge(user_overrides, values_to_set)
    # Write the merged values to a private, unpredictable temp file instead of a
    # fixed /tmp path: avoids collisions between concurrent installs into the
    # same namespace and symlink attacks on a world-writable location.
    with tempfile.NamedTemporaryFile(
        mode='w', prefix=f"values-{namespace}-", suffix=".yaml", delete=False
    ) as f:
        temp_values_path = f.name
        yaml.dump(values_to_set, f)
    try:
        output = cluster.install_release(
            release_name=release_name,
            chart_source=chart_source,
            namespace=namespace,
            config_file=temp_values_path,
            create_namespace=True
        )
        print(output)
        return InstallReceipt(
            application_name=app_meta.application_name,
            release_name=release_name,
            namespace=namespace,
            message=f"Installation triggered successfully. Raw output: {output.strip()}"
        )
    finally:
        # Always remove the transient values file, even when helm fails.
        if os.path.exists(temp_values_path):
            os.remove(temp_values_path)
def uninstall_application_release(cluster: Cluster, namespace: str, app_name: str, mode: str) -> UninstallReceipt:
    """Uninstall the Helm release for *app_name*/*mode* and verify its removal.

    Never raises: every failure path is reported through the returned
    UninstallReceipt (uninstalled_successfully / is_clean / message).
    """
    try:
        # 1. Load and validate the application metadata.
        metadata = cluster.get_application_metadata(app_name)
        app_meta = ApplicationMetadata(**metadata)
        deployment_mode = getattr(app_meta, mode, None)
        if not deployment_mode:
            raise ValueError(f"Mode '{mode}' not found in metadata.")
        release_name = deployment_mode.release_name
        # 2. Uninstall the Helm release.
        output = cluster.uninstall_release(release_name, namespace=namespace, wait=True)
        uninstalled_successfully = True
        # 3. Verify the release is actually gone from Helm's list.
        verification_message = "Verification successful: Release is no longer listed by Helm."
        is_clean = True
        try:
            time.sleep(2)
            releases_json_str = cluster.list_releases(namespace=namespace, output="json")
            releases = json.loads(releases_json_str)
            release_found = any(r['name'] == release_name for r in releases)
            if release_found:
                is_clean = False
                verification_message = "Verification failed: Release is still present in Helm's list."
        except Exception as e:
            verification_message = f"Verification check failed: {e}"
    except (ValidationError, ValueError, RuntimeError) as e:
        # Known pre-processing / runtime errors.
        return UninstallReceipt(
            application_name=app_name,
            release_name=release_name if 'release_name' in locals() else 'unknown',
            namespace=namespace,
            uninstalled_successfully=False,
            is_clean=False,
            message=f"Operation failed due to an error: {e}"
        )
    except Exception as e:
        # Any other unexpected error.
        return UninstallReceipt(
            application_name=app_name,
            release_name=release_name if 'release_name' in locals() else 'unknown',
            namespace=namespace,
            uninstalled_successfully=False,
            is_clean=False,
            message=f"An unexpected error occurred: {e}"
        )
    return UninstallReceipt(
        application_name=app_name,
        release_name=release_name,
        namespace=namespace,
        uninstalled_successfully=uninstalled_successfully,
        is_clean=is_clean,
        message=f"{output.strip()}. {verification_message}"
    )
def delete_namespace(cluster: Cluster, namespace: str) -> NamespaceDeleteReceipt:
    """Delete *namespace* and poll (up to 60s) until it is gone.

    Never raises: failures are reported via the returned receipt.
    """
    app_name = "unknown"
    try:
        # Try to recover the application name from the namespace name
        # (assumed "<user>-<app>-<suffix>" layout — TODO confirm).
        ns_parts = namespace.split('-')
        if len(ns_parts) > 2:
            app_name = ns_parts[1]
    except Exception:
        pass  # if parsing fails, app_name stays 'unknown'
    try:
        # 1. Submit the namespace deletion command.
        output = cluster.delete(resource_type="namespace", name=namespace)
        deleted_successfully = True
        # 2. Verify the namespace has actually been removed.
        is_clean = False
        verification_message = "Delete command submitted. Namespace is terminating."
        try:
            # Poll until the namespace no longer exists or the timeout elapses.
            timeout = 60
            start_time = time.time()
            while time.time() - start_time < timeout:
                try:
                    cluster.get("namespace", name=namespace)
                    time.sleep(5)
                except RuntimeError as e:
                    if "not found" in str(e).lower():
                        is_clean = True
                        verification_message = "Verification successful: Namespace not found."
                        break
                    else:
                        raise e  # re-raise other runtime errors
            if not is_clean:
                verification_message = "Verification failed: Namespace still exists after timeout."
        except Exception as e:
            verification_message = f"Verification check failed: {e}"
    except RuntimeError as e:
        # The delete command itself failed.
        return NamespaceDeleteReceipt(
            application_name=app_name,
            namespace=namespace,
            deleted_successfully=False,
            is_clean=False,
            message=f"Delete namespace command failed: {e}"
        )
    return NamespaceDeleteReceipt(
        application_name=app_name,
        namespace=namespace,
        deleted_successfully=deleted_successfully,
        is_clean=is_clean,
        message=f"{output.strip()}. {verification_message}"
    )
def get_application_status(
    cluster,
    namespace: str,
    app_template_name: str,
    mode: str
):
    """Build an ApplicationStatus report for one installed application.

    Pods are matched by the metadata's pod-name prefix; the app is "ready"
    only when every matched pod is Running with all containers ready.
    Never raises: on any error a degraded ApplicationStatus is returned with
    the error embedded in a single PodStatusDetail entry.
    """
    app_name = "Unknown"
    base_access_url = None
    paths = None
    try:
        # Resolve the deployment mode and pod-name prefix from the metadata.
        metadata_dict = cluster.get_application_metadata(app_template_name)
        app_meta = ApplicationMetadata(**metadata_dict)
        deployment_mode = getattr(app_meta, mode, None)
        if not deployment_mode:
            raise ValueError(f"Mode '{mode}' not found.")
        app_name = app_meta.application_name
        if not deployment_mode.pod or not deployment_mode.pod.name:
            raise ValueError("Pod name pattern is not defined.")
        pod_name_pattern = deployment_mode.pod.name
        if deployment_mode.svc:
            base_access_url = deployment_mode.svc.url
            paths = deployment_mode.svc.paths
        # Fetch all pods in the namespace and keep those matching the prefix.
        pods_json_str = cluster.get("pods", namespace=namespace)
        all_pods = json.loads(pods_json_str).get("items", [])
        target_pods = [p for p in all_pods if p.get('metadata', {}).get('name', '').startswith(pod_name_pattern)]
        if not target_pods:
            # No matching pods yet: report not-ready with empty details.
            return ApplicationStatus(
                application_name=app_name,
                namespace=namespace,
                is_ready=False,
                base_access_url=base_access_url,
                paths=paths,
                details=[]
            )
        all_ready = True
        pod_details = []
        for pod in target_pods:
            pod_name = pod['metadata']['name']
            container_statuses = pod.get('status', {}).get('containerStatuses', [])
            pod_phase = pod.get('status', {}).get('phase', '')
            ready_count = sum(1 for s in container_statuses if s.get('ready'))
            total_count = len(container_statuses)
            # Ready means phase Running AND every container reports ready.
            pod_is_ready = (pod_phase == 'Running') and (ready_count == total_count)
            if not pod_is_ready:
                all_ready = False
            pod_details.append(
                PodStatusDetail(
                    pod_name=pod_name,
                    is_ready=pod_is_ready,
                    ready_status=f"{ready_count}/{total_count}",
                    status_phase=pod_phase
                )
            )
        return ApplicationStatus(
            application_name=app_name,
            namespace=namespace,
            is_ready=all_ready,
            base_access_url=base_access_url,
            paths=paths,
            details=pod_details
        )
    except (ValidationError, json.JSONDecodeError, KeyError, ValueError, AttributeError) as e:
        # Known parsing/validation errors: degrade to an error status entry.
        return ApplicationStatus(
            application_name=app_name,
            namespace=namespace,
            is_ready=False,
            base_access_url=base_access_url,
            paths=paths,
            details=[PodStatusDetail(pod_name="Error", is_ready=False, ready_status="0/0", status_phase=f"Error: {e}")]
        )
    except Exception as e:
        # Anything else unexpected: same degraded shape, different label.
        return ApplicationStatus(
            application_name=app_name,
            namespace=namespace,
            is_ready=False,
            base_access_url=base_access_url,
            paths=paths,
            details=[PodStatusDetail(pod_name="Unexpected Error", is_ready=False, ready_status="0/0", status_phase=f"Error: {e}")]
        )

View File

@ -0,0 +1,241 @@
# cluster_dao.py
"""
Data Access Object (DAO) 层 - 函数式实现。
本模块负责执行 kubectl 命令,并解析其输出,将其转换为结构化的 Pydantic 模型。
所有与数据获取、解析、转换和计算相关的逻辑都集中在此。
"""
import re
from ocdp.orchestration.cluster import Cluster
from ocdp.models.orchestration.cluster import (
ClusterStatus, ClusterSummary, ClusterHealthSummary, ClusterResourceSummary,
NodeInfo, NodeHealth, NodeCondition, ResourceUsage, GPUType, GPUUsage, GPUInfo,
PodsUsage, PodDetail, TotalResourceUsage, PodsTotalUsage, MaxFreeNodeInfo,
MaxFreeGPUNodeInfo, DistributedGPUAvailability
)
# ... (辅助函数 _parse_size_to_kib, _parse_cpu 等保持不变) ...
def _parse_size_to_kib(size_str: str | None) -> int:
if not size_str or not size_str[0].isdigit(): return 0
size_str = size_str.lower()
val_match = re.search(r'(\d+)', size_str)
if not val_match: return 0
val = int(val_match.group(1))
if 'gi' in size_str: return val * 1024 * 1024
if 'mi' in size_str: return val * 1024
if 'ki' in size_str: return val
return val // 1024
def _format_size_from_kib(kib: int) -> str:
if kib < 0: kib = 0
if kib >= 1024 * 1024: return f"{round(kib / (1024 * 1024), 2)}Gi"
if kib >= 1024: return f"{round(kib / 1024, 2)}Mi"
return f"{kib}Ki"
def _parse_cpu(cpu_str: str | None) -> int:
if not cpu_str or not cpu_str[0].isdigit(): return 0
if 'm' in cpu_str: return int(cpu_str.replace('m', ''))
if '.' in cpu_str: return int(float(cpu_str) * 1000)
if cpu_str.isdigit(): return int(cpu_str) * 1000
return 0
def _format_cpu(millicores: int) -> str:
if millicores < 0: millicores = 0
if millicores < 1000: return f"{millicores}m"
return str(round(millicores / 1000, 3))
def _find_value(pattern: str, text: str, flags: int = 0) -> str | None:
match = re.search(pattern, text, flags)
return match.group(1).strip() if match else None
def _parse_pods_table(full_node_text: str) -> list[PodDetail]:
    """Parse the "Non-terminated Pods" table of `kubectl describe node` output.

    Returns one PodDetail per data row; rows that do not split into exactly
    7 whitespace-separated columns are skipped. Returns [] when the table or
    its header cannot be located.
    """
    # The table sits between "Non-terminated Pods:" and the next section.
    pods_block_match = re.search(r"Non-terminated Pods:(.*?)(?=\nAllocated resources:|\nEvents:)", full_node_text, re.DOTALL)
    if not pods_block_match: return []
    pods_text = pods_block_match.group(1).strip()
    lines = pods_text.split('\n')
    header_index = -1
    for i, line in enumerate(lines):
        if "Namespace" in line and "Name" in line and "CPU Requests" in line:
            header_index = i
            break
    if header_index == -1 or len(lines) <= header_index + 1: return []
    header = lines[header_index]
    # NOTE(review): col_starts records column offsets but is never used below;
    # rows are actually split on runs of 2+ spaces instead.
    col_starts = {"ns": header.find("Namespace"), "name": header.find("Name"), "cpu_req": header.find("CPU Requests"), "cpu_lim": header.find("CPU Limits"), "mem_req": header.find("Memory Requests"), "mem_lim": header.find("Memory Limits"), "age": header.find("Age"),}
    pod_list = []
    for line in lines[header_index + 1:]:
        if not line.strip() or "---" in line: continue
        parts = re.split(r'\s{2,}', line.strip())
        if len(parts) != 7: continue
        pod_list.append(PodDetail(
            namespace=parts[0], name=parts[1], cpu_requests=parts[2], cpu_limits=parts[3],
            memory_requests=parts[4], memory_limits=parts[5], age=parts[6]
        ))
    return pod_list
def _parse_key_value_block(text: str) -> dict[str, str]:
data = {}
for line in text.strip().split('\n'):
parts = line.split(':', 1)
if len(parts) == 2:
key = parts[0].strip()
value = parts[1].strip()
data[key] = value
return data
def _parse_single_node(text: str) -> NodeInfo | None:
    """Parse one node's section of `kubectl describe nodes` into a NodeInfo.

    Extracts name, roles, labels, conditions, Allocatable totals, "Allocated
    resources" requests, the running-pods table and (if present) GPU info.
    All `free` fields are initialised to placeholder zeros; the caller
    (get_cluster_status) computes them afterwards. Returns None when no
    "Name:" line is found. (A leftover debug print of the optional resources
    was removed here — it spammed stdout on every node parse.)
    """
    name = _find_value(r"Name:\s*(\S+)", text)
    if not name: return None
    # Roles: "<none>" is mapped to the single role "worker".
    roles_str = _find_value(r"Roles:\s*([^\n]*)", text) or "<none>"
    roles = [r.strip() for r in roles_str.split(',')] if roles_str != '<none>' else ['worker']
    # Labels block: one "key=value" per line.
    labels_text = _find_value(r"Labels:(.*?)(?=\nAnnotations:)", text, re.DOTALL) or ""
    labels = {}
    if labels_text:
        for line in labels_text.strip().split('\n'):
            key_value = line.strip().split('=', 1)
            if len(key_value) == 2:
                labels[key_value[0]] = key_value[1]
    # Conditions table: skip the two header lines, split rows on 2+ spaces.
    conditions_text = _find_value(r"Conditions:(.*?)(?=\nAddresses:)", text, re.DOTALL) or ""
    conditions = []
    for line in conditions_text.strip().split('\n')[2:]:
        parts = re.split(r'\s{2,}', line.strip())
        if len(parts) >= 6:
            conditions.append(NodeCondition(type=parts[0], status=parts[1], last_heartbeat_time=parts[2], last_transition_time=parts[3], reason=parts[4], message=parts[5]))
    health = NodeHealth(conditions=conditions)
    # Allocatable: resource totals available for pods.
    allocatable_text = _find_value(r"Allocatable:(.*?)(?=\nSystem Info:)", text, re.DOTALL) or ""
    allocatable = _parse_key_value_block(allocatable_text)
    # Allocated resources: the "Requests" column per resource.
    allocated_block = _find_value(r"Allocated resources:(.*?)(?=\nEvents:)", text, re.DOTALL) or ""
    allocated_requests = {}
    for line in allocated_block.strip().split('\n'):
        line = line.strip()
        if not line or line.startswith('(') or line.startswith('Resource') or line.startswith('---'):
            continue
        match = re.match(r'^(\S+)\s+(\S+)', line)
        if match:
            resource, request_val = match.groups()
            allocated_requests[resource] = request_val
    pods = PodsUsage(
        total=int(allocatable.get("pods", "0")),
        used=int(_find_value(r"Non-terminated Pods:\s*\((\d+) in total\)", text) or "0"),
        free=0
    )
    cpu = ResourceUsage(total=allocatable.get("cpu", "0"), used=allocated_requests.get("cpu", "0m"), free="0")
    memory = ResourceUsage(total=allocatable.get("memory", "0Ki"), used=allocated_requests.get("memory", "0Ki"), free="0")
    # Optional resources: only present on nodes that expose them.
    ephemeral_storage, hugepages_1Gi, hugepages_2Mi, rdma_shared_device_a = None, None, None, None
    if "ephemeral-storage" in allocatable:
        ephemeral_storage = ResourceUsage(
            total=allocatable["ephemeral-storage"],
            used=allocated_requests.get("ephemeral-storage", "0Ki"),
            free="0"
        )
    if "hugepages-1Gi" in allocatable:
        hugepages_1Gi = ResourceUsage(
            total=allocatable["hugepages-1Gi"],
            used=allocated_requests.get("hugepages-1Gi", "0"),
            free="0"
        )
    if "hugepages-2Mi" in allocatable:
        hugepages_2Mi = ResourceUsage(
            total=allocatable["hugepages-2Mi"],
            used=allocated_requests.get("hugepages-2Mi", "0"),
            free="0"
        )
    if "rdma/rdma_shared_device_a" in allocatable:
        rdma_shared_device_a = ResourceUsage(
            total=allocatable["rdma/rdma_shared_device_a"],
            used=allocated_requests.get("rdma/rdma_shared_device_a", "0"),
            free="0"
        )
    running_pods = _parse_pods_table(text)
    # GPU info only when the node advertises nvidia.com/gpu devices.
    gpu_info = None
    gpu_total = int(allocatable.get("nvidia.com/gpu", "0"))
    if gpu_total > 0:
        gpu_usage = GPUUsage(total=gpu_total, used=int(allocated_requests.get("nvidia.com/gpu", "0")), free=0)
        gpu_type = GPUType(product=labels.get("nvidia.com/gpu.product", "Unknown"), memory_mb=int(labels.get("nvidia.com/gpu.memory", "0")))
        gpu_info = GPUInfo(usage=gpu_usage, types=[gpu_type])
    return NodeInfo(
        name=name, roles=roles, labels=labels, health=health, cpu=cpu, memory=memory, pods=pods,
        ephemeral_storage=ephemeral_storage, hugepages_1Gi=hugepages_1Gi,
        hugepages_2Mi=hugepages_2Mi, rdma_shared_device_a=rdma_shared_device_a,
        gpu_info=gpu_info, running_pods=running_pods
    )
def get_cluster_status(cluster: Cluster) -> ClusterStatus:
    """Build the full ClusterStatus: per-node details plus cluster-wide summary.

    Runs `kubectl describe nodes`, splits the output into per-node sections,
    parses each, fills in every `free` value, then aggregates health, totals,
    per-resource "best node" picks and per-model free GPU counts.
    """
    raw_output = cluster.describe("nodes")
    # Each node section starts with a "Name:" line.
    node_texts = re.split(r'\n(?=Name:\s+)', raw_output.strip())
    nodes = [_parse_single_node(text) for text in node_texts if text.strip()]
    nodes = [node for node in nodes if node is not None]
    # --- Compute the `free` value for every ResourceUsage object ---
    for node in nodes:
        node.cpu.free = _format_cpu(_parse_cpu(node.cpu.total) - _parse_cpu(node.cpu.used))
        node.memory.free = _format_size_from_kib(_parse_size_to_kib(node.memory.total) - _parse_size_to_kib(node.memory.used))
        node.pods.free = node.pods.total - node.pods.used
        if node.gpu_info:
            node.gpu_info.usage.free = node.gpu_info.usage.total - node.gpu_info.usage.used
        if node.ephemeral_storage:
            node.ephemeral_storage.free = _format_size_from_kib(_parse_size_to_kib(node.ephemeral_storage.total) - _parse_size_to_kib(node.ephemeral_storage.used))
        # hugepages and rdma are plain integers; compute directly.
        if node.hugepages_1Gi:
            node.hugepages_1Gi.free = str(int(node.hugepages_1Gi.total) - int(node.hugepages_1Gi.used))
        if node.hugepages_2Mi:
            node.hugepages_2Mi.free = str(int(node.hugepages_2Mi.total) - int(node.hugepages_2Mi.used))
        if node.rdma_shared_device_a:
            node.rdma_shared_device_a.free = str(int(node.rdma_shared_device_a.total) - int(node.rdma_shared_device_a.used))
    # --- Cluster-wide aggregation ---
    health_summary = ClusterHealthSummary(total_nodes=len(nodes), ready_nodes=sum(1 for n in nodes if n.health.overall_status == "Ready"), unhealthy_nodes=sum(1 for n in nodes if n.health.overall_status != "Ready"))
    total_cpu_m = sum(_parse_cpu(n.cpu.total) for n in nodes)
    used_cpu_m = sum(_parse_cpu(n.cpu.used) for n in nodes)
    total_mem_kib = sum(_parse_size_to_kib(n.memory.total) for n in nodes)
    used_mem_kib = sum(_parse_size_to_kib(n.memory.used) for n in nodes)
    total_storage_kib = sum(_parse_size_to_kib(n.ephemeral_storage.total if n.ephemeral_storage else None) for n in nodes)
    used_storage_kib = sum(_parse_size_to_kib(n.ephemeral_storage.used if n.ephemeral_storage else None) for n in nodes)
    cluster_total_cpu = TotalResourceUsage(total=_format_cpu(total_cpu_m), used=_format_cpu(used_cpu_m), free=_format_cpu(total_cpu_m - used_cpu_m))
    cluster_total_memory = TotalResourceUsage(total=_format_size_from_kib(total_mem_kib), used=_format_size_from_kib(used_mem_kib), free=_format_size_from_kib(total_mem_kib - used_mem_kib))
    # Storage summary only when at least one node reports ephemeral storage.
    cluster_total_storage = TotalResourceUsage(total=_format_size_from_kib(total_storage_kib), used=_format_size_from_kib(used_storage_kib), free=_format_size_from_kib(total_storage_kib - used_storage_kib)) if total_storage_kib > 0 else None
    total_pods = sum(n.pods.total for n in nodes)
    used_pods = sum(n.pods.used for n in nodes)
    cluster_total_pods = PodsTotalUsage(total=total_pods, used=used_pods, free=total_pods - used_pods)
    # "Best node" picks: most free CPU, most free memory, and the GPU node with
    # the largest free-GPU-count x per-GPU-memory product.
    best_cpu_node, best_mem_node, best_gpu_node = None, None, None
    if nodes:
        cpu_leader = max(nodes, key=lambda n: _parse_cpu(n.cpu.free))
        best_cpu_node = MaxFreeNodeInfo(node_name=cpu_leader.name, free_amount=cpu_leader.cpu.free)
        mem_leader = max(nodes, key=lambda n: _parse_size_to_kib(n.memory.free))
        best_mem_node = MaxFreeNodeInfo(node_name=mem_leader.name, free_amount=mem_leader.memory.free)
        gpu_nodes = [n for n in nodes if n.gpu_info and n.gpu_info.types]
        if gpu_nodes:
            gpu_leader = max(gpu_nodes, key=lambda n: n.gpu_info.usage.free * n.gpu_info.types[0].memory_mb)
            best_gpu_node = MaxFreeGPUNodeInfo(
                node_name=gpu_leader.name,
                free_gpu_count=gpu_leader.gpu_info.usage.free,
                memory_per_gpu_mb=gpu_leader.gpu_info.types[0].memory_mb,
                total_potential_memory_gb=round(gpu_leader.gpu_info.usage.free * gpu_leader.gpu_info.types[0].memory_mb / 1024, 2)
            )
    # Free GPUs across the cluster, aggregated per GPU product model.
    dist_gpu_map = {}
    for node in nodes:
        if node.gpu_info:
            for gpu_type in node.gpu_info.types:
                if gpu_type.product not in dist_gpu_map:
                    dist_gpu_map[gpu_type.product] = {"product": gpu_type.product, "memory_per_gpu_mb": gpu_type.memory_mb, "total_free_count": 0}
                dist_gpu_map[gpu_type.product]["total_free_count"] += node.gpu_info.usage.free
    distributed_gpu_availability = [DistributedGPUAvailability(**data) for data in dist_gpu_map.values()]
    resource_summary = ClusterResourceSummary(
        cluster_total_cpu=cluster_total_cpu,
        cluster_total_memory=cluster_total_memory,
        cluster_total_pods=cluster_total_pods,
        cluster_total_ephemeral_storage=cluster_total_storage,
        best_node_for_cpu=best_cpu_node,
        best_node_for_memory=best_mem_node,
        best_node_for_gpu_app=best_gpu_node,
        distributed_gpu_availability=distributed_gpu_availability
    )
    cluster_summary = ClusterSummary(health=health_summary, resources=resource_summary)
    return ClusterStatus(summary=cluster_summary, nodes=nodes)

View File

View File

@ -0,0 +1,36 @@
from sqlalchemy.orm import Session
from ocdp.models.user import User
def get_user_by_id(user_id: int, db: Session = None):
    """Return the User with *user_id*, or None if absent."""
    # NOTE(review): the `db=None` default is unusable — callers must pass a session.
    return db.query(User).filter(User.user_id == user_id).first()
def get_user_by_username(username: str, db: Session = None):
    """Return the User with *username*, or None if absent."""
    return db.query(User).filter(User.username == username).first()
def get_user_by_email(email: str, db: Session = None):
    """Return the User with *email*, or None if absent."""
    return db.query(User).filter(User.email == email).first()
def add_user(user: User, db: Session = None):
    """Persist a new User, commit, and return it refreshed (with DB-generated fields)."""
    db.add(user)
    db.commit()
    db.refresh(user)
    return user
def update_user(user: User, db: Session = None):
    """Commit pending changes on *user* (must already be attached to the session) and refresh it."""
    db.commit()
    db.refresh(user)
    return user
def delete_user(user: User, db: Session = None):
    """Delete *user* and commit; always returns True."""
    db.delete(user)
    db.commit()
    return True

View File

@ -0,0 +1,3 @@
from .database import get_db, Base

36
ocdp/database/database.py Normal file
View File

@ -0,0 +1,36 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from urllib.parse import quote_plus
from ocdp.config import CONFIG
# --- MySQL engine / session setup ---
# Connection settings come from the application config; the password is
# URL-escaped with quote_plus because it is interpolated into the DSN.
mysql_host = CONFIG.database.mysql.host
mysql_port = CONFIG.database.mysql.port
mysql_username = CONFIG.database.mysql.username
mysql_password = quote_plus(CONFIG.database.mysql.password)
mysql_db_name = CONFIG.database.mysql.db_name
mysql_url = f"mysql+pymysql://{mysql_username}:{mysql_password}@{mysql_host}:{mysql_port}/{mysql_db_name}?charset=utf8mb4"
engine = create_engine(
    mysql_url
)
# Session factory and declarative base shared by all ORM models.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()
def get_db():
    """FastAPI dependency yielding a SQLAlchemy session.

    Rolls back the transaction if the request raises, and always closes the
    session afterwards.

    Yields:
        Session: a session scoped to the current request.
    """
    db = SessionLocal()
    try:
        yield db
    except Exception:
        # Roll back the aborted transaction, then re-raise the active
        # exception unchanged (bare `raise` is the idiomatic form).
        db.rollback()
        raise
    finally:
        db.close()

42
ocdp/logger.py Normal file
View File

@ -0,0 +1,42 @@
import os
import sys
from loguru import logger
from loguru_loki_handler import loki_handler
from ocdp.config import CONFIG
# Loki labels arrive from config as "k=v,k2=v2"; parse into a dict.
LABELS = CONFIG.logger.loki.labels
LABELS = dict(item.split("=") for item in LABELS.split(",")) if LABELS else {}
# Comma-separated key list from config; presumably record fields promoted to
# Loki labels — confirm against loguru_loki_handler's docs.
LABEL_KEYS = CONFIG.logger.loki.label_keys
LABEL_KEYS = LABEL_KEYS.split(",") if LABEL_KEYS else []
URL = CONFIG.logger.loki.url
LOGS_DIR = CONFIG.orchestration.kube.logs_dir
LOGS_DIR = os.path.expanduser(LOGS_DIR)
# Four sinks: stdout (INFO+), stderr (ERROR+), a daily-rotated zipped JSON
# file (DEBUG+), and the Loki push endpoint (INFO+).
logger.configure(handlers=[
    {
        "sink": sys.stdout,
        "level": "INFO",
    },
    {
        "sink": sys.stderr,
        "level": "ERROR",
    },
    {
        "sink": f"{LOGS_DIR}/app.log",
        "serialize": True,
        "level": "DEBUG",
        "rotation": "1 day",
        "compression": "zip"
    },
    {
        "sink": loki_handler(
            url=URL,
            labels=LABELS,
            labelKeys=LABEL_KEYS
        ),
        "level": "INFO"
    },
])

1
ocdp/models/__init__.py Normal file
View File

@ -0,0 +1 @@

View File

View File

@ -0,0 +1,74 @@
# models.py
"""
定义了用于应用编排的所有 Pydantic 数据模型。
"""
from pydantic import BaseModel, Field
# ... (InstallationConfig, ApplicationDeploymentMode, ApplicationMetadata 保持不变) ...
class SvcInfo(BaseModel):
    """Service exposure info for a deployment mode."""
    # Per the metadata YAML, url may be "~" (i.e. None).
    svc_type: str
    protocol: str
    hostname: str
    port: int
    url: str | None = None  # str or None
    paths: dict[str, str] | None = None  # dict or None
class PodInfo(BaseModel):
    """Pod-name pattern (matched as a prefix) used to locate an app's pods."""
    name: str
class ApplicationDeploymentMode(BaseModel):
    """One installable variant of an application (a Helm release recipe)."""
    method: str
    release_name: str
    chart: str
    sets: dict = Field(default_factory=dict)  # default Helm values for this mode
    svc: SvcInfo
    pod: PodInfo
class ApplicationMetadata(BaseModel):
    """Parsed metadata.yaml: the application name and its two deployment modes."""
    application_name: str
    distributed: ApplicationDeploymentMode
    monolithic: ApplicationDeploymentMode
class ApplicationTemplate(BaseModel):
    """An application template available for installation."""
    name: str = Field(..., description="应用模板的名称 (文件夹名)")
    metadata: ApplicationMetadata = Field(..., description="从 metadata.yaml 解析出的完整配置")
class InstalledApplicationInstance(BaseModel):
    """An installed application instance (one Helm release in a namespace)."""
    application_name: str = Field(..., description="应用的业务名称")
    release_name: str = Field(..., description="部署的 Helm Release 名称")
    namespace: str = Field(..., description="应用实例所在的唯一命名空间")
    chart: str = Field(..., description="所使用的 Helm Chart")
    status: str = Field(..., description="Helm Release 的状态 (e.g., 'deployed', 'failed')")
class InstallReceipt(BaseModel):
    """Receipt returned to the client after an install is successfully triggered."""
    application_name: str; release_name: str; namespace: str; message: str
class UninstallReceipt(BaseModel):
    """Receipt for uninstalling a Helm Release."""
    application_name: str; release_name: str; namespace: str;
    uninstalled_successfully: bool; is_clean: bool; message: str
# --- Key change: added the application_name field ---
class NamespaceDeleteReceipt(BaseModel):
    """Receipt for deleting a Namespace."""
    application_name: str = Field(..., description="被删除实例的应用业务名称")
    namespace: str = Field(..., description="被删除的命名空间")
    deleted_successfully: bool = Field(..., description="delete 命令是否成功提交")
    is_clean: bool = Field(..., description="验证步骤Namespace 是否已从集群中清除")
    message: str = Field(..., description="操作结果消息")
class PodStatusDetail(BaseModel):
    """Detailed status of a single Pod."""
    pod_name: str; is_ready: bool; ready_status: str; status_phase: str | None
class ApplicationStatus(BaseModel):
    """Aggregated readiness report for an installed application."""
    application_name: str
    namespace: str
    is_ready: bool
    base_access_url: str | None  # str or None
    paths: dict | None  # dict or None
    details: list[PodStatusDetail]

View File

@ -0,0 +1,156 @@
# models.py
"""
定义了用于解析和展示 Kubernetes 集群状态的所有 Pydantic 数据模型。
这些模型被用于 API 的请求响应、数据校验以及各层之间的数据传输。
"""
from pydantic import BaseModel, Field, computed_field
# ---------------------------------------------------------------------------
# I. 单个 K8s 对象的详细模型 (Detailed Models for Single K8s Objects)
# ---------------------------------------------------------------------------
class PodDetail(BaseModel):
"""代表节点上运行的一个 Pod 的详细资源占用信息。"""
namespace: str = Field(..., description="Pod 所在的命名空间")
name: str = Field(..., description="Pod 的名称")
cpu_requests: str = Field(..., description="CPU 请求量")
cpu_limits: str = Field(..., description="CPU 限制量")
memory_requests: str = Field(..., description="内存请求量")
memory_limits: str = Field(..., description="内存限制量")
age: str = Field(..., description="Pod 的运行时长")
class NodeCondition(BaseModel):
"""代表从 `kubectl describe node` 输出中解析出的单个 Condition 行。"""
type: str
status: str
last_heartbeat_time: str
last_transition_time: str
reason: str
message: str
class NodeHealth(BaseModel):
    """Node health wrapper built from the kubectl Conditions list."""
    conditions: list[NodeCondition] = Field(..., description="节点的健康状况条件列表")
    @computed_field
    @property
    def overall_status(self) -> str:
        """Collapse conditions to "Ready"/"NotReady" via the 'Ready' condition, else "Unknown"."""
        for condition in self.conditions:
            if condition.type == "Ready":
                return "Ready" if condition.status == "True" else "NotReady"
        return "Unknown"
class ResourceUsage(BaseModel):
"""通用资源使用情况模型用于表示带单位的资源如CPU, Memory, 存储)。"""
total: str = Field(..., description="资源总量 (来自 Allocatable)")
used: str = Field(..., description="已用资源量 (来自 Allocated Requests)")
free: str = Field(..., description="剩余可用资源量 (计算得出)")
class PodsUsage(BaseModel):
"""Pod 使用情况模型,表示可调度 Pod 的数量。"""
total: int = Field(..., description="节点可容纳的 Pod 总数 (Capacity)")
used: int = Field(..., description="节点上当前运行的 Pod 数量")
free: int = Field(..., description="剩余可调度的 Pod 数量 (计算得出)")
class GPUUsage(BaseModel):
"""表示节点上 GPU 设备的调度使用情况 (使用整数)。"""
total: int = Field(..., description="GPU 设备总数")
used: int = Field(..., description="已被 Pod 请求的 GPU 数量")
free: int = Field(..., description="空闲可用的 GPU 数量")
class GPUType(BaseModel):
"""表示节点上 GPU 的物理规格。"""
product: str = Field(..., description="GPU 产品型号")
memory_mb: int = Field(..., description="单块 GPU 的显存大小 (MB)")
class GPUInfo(BaseModel):
"""统一的、包含嵌套信息的 GPU 汇总对象。"""
usage: GPUUsage = Field(..., description="GPU 数量统计")
types: list[GPUType] = Field(..., description="节点上的 GPU 型号列表")
class NodeInfo(BaseModel):
"""核心模型,完整地描述了一个节点的所有相关信息。"""
name: str = Field(..., description="节点名称")
roles: list[str] = Field(..., description="节点角色")
labels: dict[str, str] = Field(..., description="节点的标签集合")
health: NodeHealth = Field(..., description="节点健康状况")
cpu: ResourceUsage = Field(..., description="CPU 资源使用情况")
memory: ResourceUsage = Field(..., description="内存资源使用情况")
pods: PodsUsage = Field(..., description="Pod 使用情况")
# 注意: 以下资源并非在所有节点上都存在,因此使用 `| None` (可选)。
# 在 API 输出时,如果值为 None这些字段将不会出现。
ephemeral_storage: ResourceUsage | None = Field(None, description="临时存储资源使用情况")
hugepages_1Gi: ResourceUsage | None = Field(None, description="1Gi 大页内存使用情况")
hugepages_2Mi: ResourceUsage | None = Field(None, description="2Mi 大页内存使用情况")
rdma_shared_device_a: ResourceUsage | None = Field(None, description="RDMA 共享设备使用情况")
gpu_info: GPUInfo | None = Field(None, description="节点上所有 GPU 的汇总信息")
running_pods: list[PodDetail] = Field(..., description="在该节点上运行的 Pod 列表")
# ---------------------------------------------------------------------------
# II. 集群级别汇总信息模型 (Cluster-Level Summary Information Models)
# ---------------------------------------------------------------------------
# Section 1: 集群资源总览模型
class TotalResourceUsage(BaseModel):
"""用于集群总览的资源使用情况模型(字符串类型)。"""
total: str; used: str; free: str
class PodsTotalUsage(BaseModel):
"""用于集群总览的 Pod 使用情况模型(整数类型)。"""
total: int; used: int; free: int
# Section 2: 单机最大承载能力模型
class MaxFreeNodeInfo(BaseModel):
"""用于在汇总信息中标识拥有最多空闲资源的节点。"""
node_name: str = Field(..., description="节点名称")
free_amount: str = Field(..., description="空闲资源量(带单位)")
class MaxFreeGPUNodeInfo(BaseModel):
"""专用于标识最适合部署大型单机 GPU 应用的节点。"""
node_name: str = Field(..., description="节点名称")
free_gpu_count: int = Field(..., description="该节点上的空闲 GPU 数量")
memory_per_gpu_mb: int = Field(..., description="该型号 GPU 的单卡显存")
total_potential_memory_gb: float = Field(..., description="空闲 GPU 总显存潜力 (GB), 计算公式: free_gpu_count * memory_per_gpu_mb")
# Section 3: 分布式应用潜力模型
class DistributedGPUAvailability(BaseModel):
"""按型号汇总集群中所有可用(空闲)的 GPU用于评估分布式应用潜力。"""
product: str = Field(..., description="GPU 产品型号")
memory_per_gpu_mb: int = Field(..., description="该型号 GPU 的单卡显存")
total_free_count: int = Field(..., description="该型号 GPU 在整个集群中的空闲总数")
# --- Main summary model ---
class ClusterResourceSummary(BaseModel):
    """Detailed cluster resource summary: overview, best single-node capacity, distributed potential."""
    # Part 1: cluster-wide resource overview
    cluster_total_cpu: TotalResourceUsage = Field(..., description="集群 CPU 资源总览")
    cluster_total_memory: TotalResourceUsage = Field(..., description="集群内存资源总览")
    cluster_total_pods: PodsTotalUsage = Field(..., description="集群 Pod 容量总览")
    cluster_total_ephemeral_storage: TotalResourceUsage | None = Field(None, description="集群临时存储资源总览")
    # Part 2: maximum single-node capacity
    best_node_for_cpu: MaxFreeNodeInfo | None = Field(None, description="拥有最多空闲CPU的节点")
    best_node_for_memory: MaxFreeNodeInfo | None = Field(None, description="拥有最多空闲内存的节点")
    best_node_for_gpu_app: MaxFreeGPUNodeInfo | None = Field(None, description="最适合部署大型单机GPU应用的节点")
    # Part 3: distributed-application potential
    distributed_gpu_availability: list[DistributedGPUAvailability] = Field(..., description="按型号汇总的、整个集群的空闲GPU数量")
class ClusterHealthSummary(BaseModel):
    """Brief summary of overall cluster health."""

    total_nodes: int
    ready_nodes: int
    unhealthy_nodes: int
class ClusterSummary(BaseModel):
    """Intermediate model combining the health and resource summaries."""
    health: ClusterHealthSummary
    resources: ClusterResourceSummary
class ClusterStatus(BaseModel):
    """Top-level API response model carrying the complete cluster state."""
    summary: ClusterSummary = Field(..., description="集群的整体汇总信息")
    nodes: list[NodeInfo] = Field(..., description="集群中所有节点的详细信息列表")

View File

@ -0,0 +1,55 @@
import json
class Resource:
    """Recursively wrap a JSON/dict structure as attribute-accessible objects.

    Nested dicts become Resource instances; dicts found inside lists are
    converted element-wise.  Non-dict input is stored on the ``value``
    attribute unchanged.
    """

    def __init__(self, data):
        if not isinstance(data, dict):
            self.value = data  # plain (non-dict) values live under `value`
            return
        for key, val in data.items():
            setattr(self, key, self._wrap(val))

    @staticmethod
    def _wrap(val):
        """Convert one value: dict -> Resource, list -> element-wise, else unchanged."""
        if isinstance(val, dict):
            return Resource(val)
        if isinstance(val, list):
            return [Resource(item) if isinstance(item, dict) else item for item in val]
        return val

    def __repr__(self):
        return f"{self.__dict__}"

    def to_dict(self):
        """Optional inverse: convert a Resource tree back into plain dicts."""
        out = {}
        for key, val in self.__dict__.items():
            if isinstance(val, Resource):
                out[key] = val.to_dict()
            elif isinstance(val, list):
                out[key] = [item.to_dict() if isinstance(item, Resource) else item for item in val]
            else:
                out[key] = val
        return out
# ---------------- Usage example ----------------
if __name__ == "__main__":
    kubectl_json = '''
    {
        "metadata": {"name": "nginx", "namespace": "default"},
        "spec": {"containers": [{"name": "nginx", "image": "nginx:latest"}]},
        "status": {"phase": "Running"}
    }
    '''
    # Convert the parsed JSON into a Resource object
    data_dict = json.loads(kubectl_json)
    pod = Resource(data_dict)
    # Access fields via attribute chains
    print(pod.metadata.name)             # nginx
    print(pod.spec.containers[0].image)  # nginx:latest
    print(pod.status.phase)              # Running
    # Optionally convert back to a plain dict
    print(pod.to_dict())

View File

@ -0,0 +1,2 @@
from .user import User

21
ocdp/models/user/user.py Normal file
View File

@ -0,0 +1,21 @@
from sqlalchemy import Column, Integer, String, Boolean, TIMESTAMP, func
import datetime
from ocdp.database import Base
class User(Base):
    """SQLAlchemy ORM model for the `users` table."""

    __tablename__ = "users"

    user_id = Column(Integer, primary_key=True, index=True, autoincrement=True)
    username = Column(String(64), unique=True, index=True, nullable=False)
    email = Column(String(128), unique=True, index=True, nullable=False)
    hashed_password = Column(String(128), nullable=False)
    # Account flags
    is_active = Column(Boolean, nullable=False, default=True)
    is_admin = Column(Boolean, nullable=False, default=False)
    # Audit timestamps (server-side defaults; updated_at refreshes on every UPDATE)
    created_at = Column(TIMESTAMP(timezone=True), nullable=False, server_default=func.now())
    updated_at = Column(TIMESTAMP(timezone=True), nullable=False, server_default=func.now(), onupdate=func.now())
    last_login_at = Column(TIMESTAMP(timezone=True), nullable=True)

View File

@ -0,0 +1,2 @@
from .cluster import Cluster, get_cluster

View File

@ -0,0 +1,265 @@
import os
import time
import yaml
import subprocess
from ocdp.config import CONFIG
class Cluster:
    """Facade over the `kubectl` and `helm` command-line tools.

    Every command runs as a subprocess with the KUBECONFIG environment
    variable pinned to the configured kubeconfig file.  Application charts
    and their metadata.yaml files are read from the configured
    applications directory.
    """
    def __init__(self, kubeconfig: str | None = None):
        # Fall back to the globally configured kubeconfig when none is given.
        self.kubeconfig = kubeconfig or os.path.expanduser(CONFIG.orchestration.kube.kubectl_file)
        self.applications_dir = os.path.expanduser(CONFIG.orchestration.kube.applications_dir)
        # Copy the parent environment and pin KUBECONFIG so that kubectl
        # and helm both target the same cluster.
        self.env = os.environ.copy()
        self.env['KUBECONFIG'] = self.kubeconfig
    # ----------------- Application metadata interface -----------------
    def list_applications(self) -> list[str]:
        """List all applications (folder names) inside applications_dir."""
        if not os.path.exists(self.applications_dir):
            raise FileNotFoundError(f"The applications directory {self.applications_dir} does not exist.")
        return [f for f in os.listdir(self.applications_dir)
                if os.path.isdir(os.path.join(self.applications_dir, f))]
    def get_application_metadata(self, application_dir: str) -> dict:
        """
        Load and post-process the metadata of one application.

        Args:
            application_dir (str): Folder name of the application inside applications_dir.

        Returns:
            dict: Parsed content of metadata.yaml, with helm chart names
                resolved to absolute paths and service URLs assembled.

        Raises:
            FileNotFoundError: If metadata.yaml is missing.
            RuntimeError: If the YAML cannot be parsed.
        """
        meta_path = os.path.join(self.applications_dir, application_dir, "metadata.yaml")
        if not os.path.exists(meta_path):
            raise FileNotFoundError(f"metadata.yaml not found in {application_dir}")
        with open(meta_path, 'r') as f:
            try:
                # full_load supports YAML anchors (&) and aliases (*).
                metadata = yaml.full_load(f)
            except yaml.YAMLError as e:
                raise RuntimeError(f"Error parsing metadata.yaml: {e}")
        # --- Post-process the metadata dynamically ---
        # Iterate over every top-level key (e.g. 'distributed', 'monolithic').
        for mode, config in metadata.items():
            if isinstance(config, dict) and config.get('method') == 'helm':
                # 1. Resolve the chart name to an absolute path.
                if 'chart' in config:
                    chart_name = config['chart']
                    config['chart'] = os.path.join(self.applications_dir, application_dir, chart_name)
                # 2. Assemble the service URL.
                # Read the svc config directly from `config`, not from `sets`.
                svc_config = config.get('svc')
                if isinstance(svc_config, dict):
                    protocol = svc_config.get('protocol')
                    hostname = svc_config.get('hostname')
                    port = svc_config.get('port')
                    if protocol and hostname and port:
                        svc_config['url'] = f"{protocol}://{hostname}:{port}"
        return metadata
    # ----------------- High-level kubectl interface -----------------
    def apply(self, file_name: str) -> str:
        """Apply a Kubernetes manifest file (kubectl apply -f)."""
        return self._run_kubectl_cmd(["apply", "-f", self._resolve_file(file_name)])
    def delete(
        self,
        file_name: str | None = None,
        resource_type: str | None = None,
        name: str | None = None,
        namespace: str | None = None,
        force: bool = False
    ) -> str:
        """Delete Kubernetes resources, either by manifest file or by type + name.

        Raises:
            ValueError: If neither `file_name` nor both `resource_type`
                and `name` are provided.
        """
        if file_name:
            cmd = ["delete", "-f", self._resolve_file(file_name)]
        elif resource_type and name:
            cmd = ["delete", resource_type, name]
            if namespace:
                cmd.extend(["--namespace", namespace])
        else:
            raise ValueError("Invalid arguments: Provide 'file_name' or both 'resource_type' and 'name'.")
        if force:
            # --grace-period=0 together with --force removes the resource immediately.
            cmd.extend(["--force", "--grace-period=0"])
        return self._run_kubectl_cmd(cmd)
    def create(self, namespace_name: str) -> str:
        """Create a new Kubernetes namespace."""
        cmd = ["create", "namespace", namespace_name]
        return self._run_kubectl_cmd(cmd)
    def get(self, resource_type: str, namespace: str | None = None, name: str | None = None,
            output: str = "json",
            all_namespaces: bool = False) -> str:
        """Generic resource getter. Returns JSON output by default."""
        cmd = ["get", resource_type.lower()]
        if name: cmd.append(name)
        if all_namespaces: cmd.append("-A")
        elif namespace: cmd.extend(["-n", namespace])
        if output: cmd.extend(["-o", output])
        return self._run_kubectl_cmd(cmd)
    def describe(self, resource_type: str, name: str | None = None, namespace: str | None = None) -> str:
        """Describe the given resource (kubectl describe)."""
        cmd = ["describe", resource_type.lower()]
        if name: cmd.append(name)
        if namespace: cmd.extend(["-n", namespace])
        return self._run_kubectl_cmd(cmd)
    # ----------------- Helm repository management interface -----------------
    def add_repo(
        self,
        repo_name: str,
        repo_url: str,
        username: str | None = None,
        password: str | None = None
    ) -> str:
        """
        Add a Helm chart repository (helm repo add).

        Args:
            repo_name (str): Local alias for the repository.
            repo_url (str): URL of the repository.
            username (str, optional): Username for a private repository.
            password (str, optional): Password for a private repository.

        Returns:
            str: Command output.
        """
        cmd = ["repo", "add", repo_name, repo_url]
        if username:
            cmd.extend(["--username", username])
        if password:
            cmd.extend(["--password", password, "--pass-credentials"])
        return self._run_helm_cmd(cmd)
    def update_repos(self, repo_names: list[str] | None = None) -> str:
        """
        Update one or more Helm chart repositories (helm repo update).

        Args:
            repo_names (list[str], optional): Repositories to update; all
                repositories are updated when None.

        Returns:
            str: Command output.
        """
        cmd = ["repo", "update"]
        if repo_names:
            cmd.extend(repo_names)
        return self._run_helm_cmd(cmd)
    # ----------------- Helm release management interface -----------------
    def install_release(
        self,
        release_name: str,
        chart_source: str,
        namespace: str,
        config_file: str | None = None,
        create_namespace: bool = True
    ) -> str:
        """
        Install a Helm release (application instance).
        """
        cmd = ["install", release_name, chart_source, "--namespace", namespace]
        if create_namespace:
            cmd.append("--create-namespace")
        # Use config_file as-is: it is a full path supplied by the caller
        # (e.g. the DAO layer's /tmp/temp-values.yaml), so it must NOT be
        # resolved relative to applications_dir via self._resolve_file.
        if config_file:
            if not os.path.exists(config_file):
                raise FileNotFoundError(f"Provided config_file does not exist: {config_file}")
            cmd.extend(["-f", config_file])
        return self._run_helm_cmd(cmd)
    def uninstall_release(
        self,
        release_name: str,
        namespace: str | None = None,
        wait: bool = False
    ) -> str:
        """Uninstall a Helm release (application instance)."""
        cmd = ["uninstall", release_name]
        if namespace:
            cmd.extend(["--namespace", namespace])
        if wait:
            # Block until all release resources have been removed.
            cmd.append("--wait")
        return self._run_helm_cmd(cmd)
    def list_releases(
        self,
        namespace: str | None = None,
        all_namespaces: bool = False,
        output: str | None = None
    ) -> str:
        """List installed Helm releases (application instances)."""
        cmd = ["list"]
        if all_namespaces:
            cmd.append("--all-namespaces")
        elif namespace:
            cmd.extend(["--namespace", namespace])
        if output:
            cmd.extend(["--output", output])
        return self._run_helm_cmd(cmd)
    # ----------------- Private helpers -----------------
    def _run_kubectl_cmd(self, cmd_args: list[str]) -> str:
        """Run a kubectl subcommand and return its stdout.

        Raises:
            RuntimeError: If kubectl is not installed or exits non-zero.
        """
        try:
            command = ["kubectl"] + cmd_args
            print(f"🚀 Executing Kubectl: {' '.join(command)}")
            result = subprocess.run(
                command, check=True, capture_output=True, text=True, env=self.env
            )
            return result.stdout
        except FileNotFoundError:
            raise RuntimeError("`kubectl` command not found. Is it installed and in your PATH?")
        except subprocess.CalledProcessError as e:
            error_message = f"kubectl command failed with exit code {e.returncode}:\n{e.stderr.strip()}"
            raise RuntimeError(error_message) from e
    def _run_helm_cmd(self, cmd_args: list[str]) -> str:
        """[internal] Run a helm subcommand and return its stdout.

        Raises:
            RuntimeError: If helm is not installed or exits non-zero.
        """
        try:
            command = ["helm", "--kubeconfig", self.kubeconfig] + cmd_args
            print(f"🚀 Executing Helm: {' '.join(command)}")
            result = subprocess.run(
                command, check=True, capture_output=True, text=True, env=self.env
            )
            return result.stdout
        except FileNotFoundError:
            raise RuntimeError("`helm` command not found. Is it installed and in your PATH?")
        except subprocess.CalledProcessError as e:
            error_message = f"Helm command failed with exit code {e.returncode}:\n{e.stderr.strip()}"
            raise RuntimeError(error_message) from e
    def _resolve_file(self, file_name: str) -> str:
        """Resolve a resource file path relative to applications_dir."""
        file_path = os.path.join(self.applications_dir, file_name)
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The file {file_path} does not exist.")
        return file_path
def get_cluster(kubeconfig: str | None = None) -> Cluster:
    """Factory for a Cluster client.

    Args:
        kubeconfig: Optional path to a kubeconfig file.  When omitted the
            path from CONFIG is used, which keeps the previous
            zero-argument call form fully backward compatible.

    Returns:
        Cluster: A client bound to the selected kubeconfig.
    """
    return Cluster(kubeconfig)
if __name__ == "__main__":
    # Smoke test: print the processed metadata of the "infer" application.
    cluster = get_cluster()
    print(cluster.get_application_metadata("infer"))

View File

View File

View File

@ -0,0 +1,71 @@
# service.py
"""
Service (服务) 层 - 应用编排。
负责处理核心业务逻辑(如权限、命名),并调用 DAO 层来执行数据操作。
"""
import ulid
from ocdp.orchestration.cluster import Cluster
from ocdp.daos.orchestration import application_dao as dao
from ocdp.models.orchestration.application import (ApplicationTemplate, InstallReceipt, ApplicationStatus,
InstalledApplicationInstance, UninstallReceipt, NamespaceDeleteReceipt, ApplicationMetadata)
# ... (list_available_applications, list_user_applications 保持不变) ...
def list_available_applications(cluster: Cluster) -> list[ApplicationTemplate]:
    """(Service) Return every application template available for installation."""
    templates = dao.list_application_templates(cluster)
    return templates
def list_user_applications(cluster: Cluster, user_id: str) -> list[InstalledApplicationInstance]:
    """(Service) Return the application instances installed by *user_id*."""
    instances = dao.list_application_instances(cluster, user_id)
    return instances
def _namespace_label(value: str) -> str:
    """Normalize *value* into a fragment that is legal in a namespace name.

    Kubernetes namespace names are RFC 1123 labels: lowercase
    alphanumerics and '-' only.  Illegal runs are replaced by a single
    '-' and a non-empty fallback ("x") is returned for degenerate input.
    """
    import re
    cleaned = re.sub(r"[^a-z0-9-]+", "-", str(value).lower()).strip("-")
    return cleaned or "x"


def install_new_application(
    cluster: Cluster,
    user_id: str,
    app_template_name: str,
    mode: str,
    user_overrides: dict | None = None
) -> InstallReceipt:
    """
    (Service) Trigger the installation of a new application.

    Core responsibility: generate a unique, *valid* namespace according to
    the business naming rule <user>-<application>-<instance-id>.

    Args:
        cluster: Cluster client used to run kubectl/helm.
        user_id: Owner of the new application instance.
        app_template_name: Folder name of the application template.
        mode: Deployment mode key in the template metadata (e.g. 'monolithic').
        user_overrides: Optional helm value overrides supplied by the user.

    Returns:
        InstallReceipt: Receipt produced by the DAO layer.
    """
    # 1. (Service duty) Fetch the application's business name for the
    #    namespace.  The DAO fetches the metadata again for its own needs.
    metadata = cluster.get_application_metadata(app_template_name)
    application_name = metadata.get("application_name", app_template_name)
    # 2. (Service duty) Generate a unique namespace.  ULIDs are sortable
    #    and lowercase-safe; sanitize the user/application parts and cap
    #    the total length at the Kubernetes namespace limit of 63 chars.
    instance_id = str(ulid.new()).lower()
    prefix = f"{_namespace_label(user_id)}-{_namespace_label(application_name)}"
    namespace = f"{prefix[:63 - len(instance_id) - 1]}-{instance_id}".strip("-")
    # 3. (Service duty) Delegate execution, namespace included, to the DAO.
    return dao.install_application(
        cluster=cluster,
        namespace=namespace,
        app_template_name=app_template_name,
        mode=mode,
        user_overrides=user_overrides
    )
def get_instance_status(
    cluster: Cluster,
    namespace: str,
    app_template_name: str,
    mode: str
) -> ApplicationStatus:
    """(Service) Return the detailed status of one application instance."""
    status = dao.get_application_status(cluster, namespace, app_template_name, mode)
    return status
def uninstall_application_release(
    cluster: Cluster,
    namespace: str,
    app_template_name: str,
    mode: str
) -> UninstallReceipt:
    """(Service) Uninstall an application instance (its Helm release)."""
    receipt = dao.uninstall_application_release(cluster, namespace, app_template_name, mode)
    return receipt
def delete_application_namespace(cluster: Cluster, namespace: str) -> NamespaceDeleteReceipt:
    """(Service) Delete the namespace of an application instance."""
    receipt = dao.delete_namespace(cluster, namespace)
    return receipt

View File

@ -0,0 +1,12 @@
# services.py
from ocdp.orchestration.cluster import Cluster
from ocdp.daos.orchestration import cluster_dao
from ocdp.models.orchestration.cluster import ClusterStatus
def get_cluster_status(cluster: Cluster) -> ClusterStatus:
    """Service-layer entry point for cluster status.

    The request is delegated straight to the DAO layer, which does all
    the actual data gathering.
    """
    return cluster_dao.get_cluster_status(cluster)

View File

@ -0,0 +1,135 @@
# node_service.py
import json
from collections import defaultdict
from ocdp.orchestration import Cluster, get_cluster
# 从我们的 DAO 模块导入函数
from ocdp.daos.orchestration.node_dao import (
get_all_nodes_health_status,
get_all_nodes_resource_details,
refresh_nodes_cache as refresh_dao_cache
)
# --- Service 层的格式化工具 ---
def _format_bytes(byte_count: int) -> str:
"""将字节数格式化为人类可读的字符串"""
if byte_count < 0: return "N/A"
power = 1024
n = 0
power_labels = {0: 'B', 1: 'KiB', 2: 'MiB', 3: 'GiB', 4: 'TiB'}
while byte_count >= power and n < len(power_labels) -1 :
byte_count /= power
n += 1
return f"{byte_count:.2f} {power_labels[n]}"
# --- 对外暴露的 Service 接口函数 ---
def get_cluster_health_report(cluster: Cluster) -> dict:
    """1. (Service) Health-status report for every node in the cluster."""
    # The DAO already returns a presentation-ready structure.
    report = get_all_nodes_health_status(cluster)
    return report
def get_per_node_resource_report(cluster: Cluster) -> dict:
    """2. (Service) Per-node resource report with human-readable values."""
    # Numeric data comes from the DAO; this layer only formats it.
    raw = get_all_nodes_resource_details(cluster)
    report = {}
    for node_name, node in raw.items():
        cpu, mem, sto = node['cpu'], node['memory'], node['storage']
        report[node_name] = {
            "cpu": {
                "total": f"{cpu['total']:.2f} Cores",
                "used": f"{cpu['total'] - cpu['allocatable']:.2f} Cores",
                "free": f"{cpu['allocatable']:.2f} Cores",
            },
            "memory": {
                "total": _format_bytes(mem['total']),
                "used": _format_bytes(mem['total'] - mem['allocatable']),
                "free": _format_bytes(mem['allocatable']),
            },
            "storage": {
                "total": _format_bytes(sto['total']),
                "used": _format_bytes(sto['total'] - sto['allocatable']),
                "free": _format_bytes(sto['allocatable']),
            },
            # GPU data arrives pre-formatted from the DAO.
            "gpu": node['gpu'],
        }
    return report
def get_cluster_summary_report(cluster: Cluster) -> dict:
    """3. (Service) Aggregated, human-readable cluster-wide resource summary."""
    raw = get_all_nodes_resource_details(cluster)
    # Accumulators for cluster-wide totals and allocatable amounts.
    totals = {"cpu": 0.0, "mem": 0, "sto": 0}
    allocs = {"cpu": 0.0, "mem": 0, "sto": 0}
    gpu_total = 0
    gpu_alloc = 0
    gpu_models = defaultdict(int)
    for node in raw.values():
        totals["cpu"] += node['cpu']['total']
        allocs["cpu"] += node['cpu']['allocatable']
        totals["mem"] += node['memory']['total']
        allocs["mem"] += node['memory']['allocatable']
        totals["sto"] += node['storage']['total']
        allocs["sto"] += node['storage']['allocatable']
        gpu = node['gpu']
        if gpu['count'] > 0:
            gpu_total += gpu['count']
            gpu_alloc += gpu['allocatable_count']
            gpu_models[gpu['model']] += gpu['count']
    return {
        "note": "'used' 代表被系统或 Kubelet 预留的资源, 'free' 代表可供 Pod 调度的资源。",
        "cpu": {
            "total": f"{totals['cpu']:.2f} Cores",
            "used": f"{totals['cpu'] - allocs['cpu']:.2f} Cores",
            "free": f"{allocs['cpu']:.2f} Cores",
        },
        "memory": {
            "total": _format_bytes(totals['mem']),
            "used": _format_bytes(totals['mem'] - allocs['mem']),
            "free": _format_bytes(allocs['mem']),
        },
        "storage": {
            "total": _format_bytes(totals['sto']),
            "used": _format_bytes(totals['sto'] - allocs['sto']),
            "free": _format_bytes(allocs['sto']),
        },
        "gpu": {
            "total_count": gpu_total,
            "allocatable_count": gpu_alloc,
            "models_summary": dict(gpu_models),
        },
    }
# --- Usage example ---
if __name__ == "__main__":
    try:
        # A single cluster client serves all three reports.
        cluster_client = get_cluster()
        # Call the Service-layer functions one by one.
        print("\n" + "="*20 + " 1. 集群健康状态 " + "="*20)
        health_report = get_cluster_health_report(cluster_client)
        print(json.dumps(health_report, indent=2))
        print("\n" + "="*20 + " 2. 各节点资源详情 " + "="*20)
        per_node_report = get_per_node_resource_report(cluster_client)
        print(json.dumps(per_node_report, indent=2))
        print("\n" + "="*20 + " 3. 集群资源汇总 " + "="*20)
        summary_report = get_cluster_summary_report(cluster_client)
        print(json.dumps(summary_report, indent=2))
    except RuntimeError as e:
        print(f"\n发生错误: {e}")

View File

View File

@ -0,0 +1,3 @@
from .password_handler import hash_password, verify_password
from .token_handler import generate_token, verify_token

View File

@ -0,0 +1,11 @@
from argon2 import PasswordHasher
# Module-wide Argon2 hasher with library defaults.
PH = PasswordHasher()


def hash_password(password: str, ph: PasswordHasher = PH) -> str:
    """Hash *password* with Argon2 and return the encoded hash string."""
    return ph.hash(password)
def verify_password(hashed, password, ph=PH):
    """Return True if *password* matches *hashed*, False otherwise.

    argon2's PasswordHasher.verify raises VerifyMismatchError on a wrong
    password instead of returning False.  Callers (e.g. the login flow)
    use this function as a boolean predicate, so translate the mismatch
    into False rather than letting the exception propagate.
    """
    from argon2.exceptions import VerifyMismatchError

    try:
        return ph.verify(hashed, password)
    except VerifyMismatchError:
        return False

View File

@ -0,0 +1,73 @@
import os
import jwt
import datetime
from ocdp.config import CONFIG
# Best practice: secrets come from the environment / config layer, never
# hard-coded (python-dotenv can supply them during local development).
SECRET_KEY = CONFIG.token.jwt.secret_key
ALGORITHM = CONFIG.token.jwt.signing_algorithm


def generate_token(
    user_id: str,
    expires_delta: datetime.timedelta = datetime.timedelta(minutes=30),
    secret_key: str = SECRET_KEY,
    algorithm: str = ALGORITHM
) -> str:
    """
    Generate a signed JWT for *user_id* that expires after *expires_delta*.

    Raises:
        ValueError: If no signing key is configured.
    """
    # Refuse to sign with an empty or missing key.
    if not secret_key:
        raise ValueError("SECRET_KEY not found in environment variables.")
    issued_at = datetime.datetime.now(datetime.timezone.utc)
    claims = {
        "sub": user_id,                     # subject: the user's unique id
        "iat": issued_at,                   # issued-at timestamp
        "exp": issued_at + expires_delta,   # expiration timestamp
    }
    return jwt.encode(claims, secret_key, algorithm=algorithm)
def verify_token(
    token: str,
    secret_key: str = SECRET_KEY,
    algorithm: str = ALGORITHM
) -> dict | None:
    """
    Verify *token* and return its decoded payload.

    Returns None when the token is expired, malformed, or carries a bad
    signature.

    Raises:
        ValueError: If no signing key is configured.
    """
    if not secret_key:
        raise ValueError("SECRET_KEY not found in environment variables.")
    try:
        # jwt.decode checks signature, expiration and algorithm in one call;
        # pinning `algorithms` guards against algorithm-confusion attacks.
        return jwt.decode(
            token,
            secret_key,
            algorithms=[algorithm]
        )
    except jwt.ExpiredSignatureError:
        # Most common failure: the token is past its expiration date.
        print("Token verification failed: Token has expired.")
        return None
    except jwt.InvalidTokenError as e:
        # Any other JWT problem: bad signature, malformed token, etc.
        print(f"Token verification failed: Invalid token. Error: {e}")
        return None

View File

@ -0,0 +1,4 @@
class UserAlreadyExistsError(Exception):
    """Raised when creating a user whose username or email is already taken."""
    pass

View File

@ -0,0 +1,119 @@
# 文件名: user_service.py
from sqlalchemy.orm import Session
import datetime
from .helpers import hash_password, verify_password, generate_token, verify_token
from .user_exceptions import UserAlreadyExistsError
# 从 DAO 层导入具体的数据库操作函数
from ocdp.daos.user import user_dao
# 从模型和 DTOs/Schemas 中导入
from ocdp.models.user import User
# 从辅助模块导入密码和 Token 相关函数
# 第二层
# --- 业务逻辑函数 ---
def login_for_access_token(username: str, password: str, db: Session) -> str | None:
    """Authenticate *username*/*password*; return a JWT, or None on failure."""
    # Look the user up through the DAO layer.
    user = user_dao.get_user_by_username(username, db)
    # Unknown user or wrong password -> authentication failure.
    if user is None or not verify_password(user.hashed_password, password):
        return None
    # Success: issue a token keyed on the primary key.
    access_token = generate_token(user_id=str(user.user_id))
    # Best-effort bookkeeping of the last login time.
    set_last_login(user.user_id, db)
    return access_token
# 第一层
def get_user_by_id(user_id: int, db: Session):
    """Thin pass-through to the DAO layer: fetch a user by primary key."""
    return user_dao.get_user_by_id(user_id, db)
def get_user_by_username(username: str, db: Session):
    """Thin pass-through to the DAO layer: fetch a user by username."""
    return user_dao.get_user_by_username(username, db)
def get_user_by_email(email: str, db: Session):
    """Thin pass-through to the DAO layer: fetch a user by email address."""
    return user_dao.get_user_by_email(email, db)
def get_current_user(token: str, db: Session):
    """Resolve *token* to its User row, or None when the token is invalid.

    verify_token returns None for expired/invalid tokens; the previous
    implementation called `.get("sub")` on that result directly and
    crashed with AttributeError instead of rejecting the request.
    """
    payload = verify_token(token)
    if not payload:
        return None
    user_id = payload.get("sub")
    if not user_id:
        return None
    return user_dao.get_user_by_id(user_id, db)
def get_user_id_by_token(token: str):
    """Extract the user id (the 'sub' claim) from *token*; None if invalid.

    Guards against verify_token returning None (expired/invalid token),
    which the previous implementation dereferenced with `.get` and
    crashed with AttributeError.
    """
    payload = verify_token(token)
    if not payload:
        return None
    user_id = payload.get("sub")
    if not user_id:
        return None
    return user_id
def create_user(username: str, password: str, email: str, db: Session):
    """Create a new user after enforcing username/email uniqueness.

    Raises:
        UserAlreadyExistsError: If the username or the email is taken.
    """
    # Uniqueness validation belongs to the service layer.
    if user_dao.get_user_by_username(username, db):
        raise UserAlreadyExistsError(f"User with username '{username}' already exists.")
    if user_dao.get_user_by_email(email, db):
        raise UserAlreadyExistsError(f"User with email '{email}' already exists.")
    # Only the Argon2 hash is ever persisted.
    new_user = User(
        username=username,
        email=email,
        hashed_password=hash_password(password)
    )
    return user_dao.add_user(new_user, db)
def update_user_password(user_id: int, new_password: str, db: Session):
    """Set a new (hashed) password for *user_id*; None if the user is missing."""
    user = user_dao.get_user_by_id(user_id, db)
    if user is None:
        return None
    # Business rule: hash before storing — never persist plaintext.
    user.hashed_password = hash_password(new_password)
    return user_dao.update_user(user, db)
def set_last_login(user_id: int, db: Session):
    """Stamp the user's last_login_at with the current UTC time."""
    user = user_dao.get_user_by_id(user_id, db)
    if user is None:
        return None
    user.last_login_at = datetime.datetime.now(datetime.timezone.utc)
    return user_dao.update_user(user, db)
def deactivate_user(user_id: int, db: Session):
    """Soft-disable the account by clearing its is_active flag."""
    user = user_dao.get_user_by_id(user_id, db)
    if user is None:
        return None
    user.is_active = False
    return user_dao.update_user(user, db)
def delete_user(user_id: int, db: Session):
    """Hard-delete the user row; returns False when the user does not exist."""
    user = user_dao.get_user_by_id(user_id, db)
    if user is None:
        return False  # nothing to delete
    return user_dao.delete_user(user, db)

0
ocdp/utils/__init__.py Normal file
View File