feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

This commit is contained in:
2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
+21 -9
View File
@@ -96,9 +96,12 @@ export class GpuDetector {
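// Get CUDA version separately.
// NOTE(review): the command below prints the NVIDIA driver version first and
// the nvcc release second; the non-global regex match that follows returns the
// first "<digits>.<digits>" it finds, which would be the driver version prefix
// rather than the CUDA release — confirm this is intended.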
if (gpus.length > 0) {
try {
const { stdout: cudaOut } = await execAsync('nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1 && nvcc --version 2>/dev/null | grep "release" | sed "s/.*release \\([0-9.]*\\).*/\\1/"', {
timeout: 5000,
});
const { stdout: cudaOut } = await execAsync(
'nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1 && nvcc --version 2>/dev/null | grep "release" | sed "s/.*release \\([0-9.]*\\).*/\\1/"',
{
timeout: 5000,
},
);
const cudaMatch = cudaOut.match(/(\d+\.\d+)/);
if (cudaMatch) {
for (const gpu of gpus) {
@@ -142,7 +145,9 @@ export class GpuDetector {
id: `amd-${index}`,
vendor: 'amd',
model: String(cardData['Card series'] || cardData['card_series'] || 'AMD GPU'),
vram: this.parseMemory(String(cardData['VRAM Total Memory (B)'] || cardData['vram_total'] || '0')),
vram: this.parseMemory(
String(cardData['VRAM Total Memory (B)'] || cardData['vram_total'] || '0'),
),
driverVersion: String(cardData['Driver version'] || cardData['driver_version'] || ''),
rocmVersion: await this.getRocmVersion(),
pciSlot: String(cardData['PCI Bus'] || cardData['pci_bus'] || ''),
@@ -371,14 +376,17 @@ export class GpuDetector {
);
const parts = stdout.trim().split(',').map((p: string) => p.trim());
const [utilization, memUsed, memTotal, temp, power, powerLimit, fan, gpuClock, memClock] = parts;
const [utilization, memUsed, memTotal, temp, power, powerLimit, fan, gpuClock, memClock] =
parts;
return {
id: gpu.id,
utilization: parseInt(utilization, 10) || 0,
memoryUsed: parseInt(memUsed, 10) || 0,
memoryTotal: parseInt(memTotal, 10) || gpu.vram,
memoryPercent: memTotal ? Math.round((parseInt(memUsed, 10) / parseInt(memTotal, 10)) * 100) : 0,
memoryPercent: memTotal
? Math.round((parseInt(memUsed, 10) / parseInt(memTotal, 10)) * 100)
: 0,
temperature: parseInt(temp, 10) || 0,
powerUsage: parseFloat(power) || 0,
powerLimit: parseFloat(powerLimit) || 0,
@@ -513,7 +521,7 @@ export class GpuDetector {
bytes /= 1024;
break;
case 'B':
bytes /= (1024 * 1024);
bytes /= 1024 * 1024;
break;
}
@@ -542,7 +550,9 @@ export class GpuDetector {
*/
private async getRocmVersion(): Promise<string | undefined> {
try {
const { stdout } = await execAsync('cat /opt/rocm/.info/version 2>/dev/null || rocminfo 2>/dev/null | grep "ROCm" | head -1');
const { stdout } = await execAsync(
'cat /opt/rocm/.info/version 2>/dev/null || rocminfo 2>/dev/null | grep "ROCm" | head -1',
);
const match = stdout.match(/(\d+\.\d+(?:\.\d+)?)/);
return match ? match[1] : undefined;
} catch {
@@ -555,7 +565,9 @@ export class GpuDetector {
*/
private async getOneApiVersion(): Promise<string | undefined> {
try {
const { stdout } = await execAsync('source /opt/intel/oneapi/setvars.sh 2>/dev/null && echo $ONEAPI_ROOT 2>/dev/null || cat /opt/intel/oneapi/compiler/latest/env/vars.sh 2>/dev/null | grep VERSION');
const { stdout } = await execAsync(
'source /opt/intel/oneapi/setvars.sh 2>/dev/null && echo $ONEAPI_ROOT 2>/dev/null || cat /opt/intel/oneapi/compiler/latest/env/vars.sh 2>/dev/null | grep VERSION',
);
const match = stdout.match(/(\d+\.\d+(?:\.\d+)?)/);
return match ? match[1] : undefined;
} catch {
+11 -3
View File
@@ -105,7 +105,9 @@ export class SystemInfo {
*/
private async getNvidiaContainerVersion(): Promise<string | undefined> {
try {
const { stdout } = await execAsync('nvidia-container-cli --version 2>&1 | head -1', { timeout: 5000 });
const { stdout } = await execAsync('nvidia-container-cli --version 2>&1 | head -1', {
timeout: 5000,
});
const match = stdout.match(/version (\d+\.\d+\.\d+)/);
return match ? match[1] : undefined;
} catch {
@@ -156,7 +158,9 @@ export class SystemInfo {
*/
public async getAvailableDiskSpace(path: string = '/var/lib'): Promise<number> {
try {
const { stdout } = await execAsync(`df -m "${path}" | tail -1 | awk '{print $4}'`, { timeout: 5000 });
const { stdout } = await execAsync(`df -m "${path}" | tail -1 | awk '{print $4}'`, {
timeout: 5000,
});
return parseInt(stdout.trim(), 10) || 0;
} catch {
return 0;
@@ -198,7 +202,11 @@ export class SystemInfo {
logger.logBoxLine(`OS: ${info.os}`);
logger.logBoxLine(`Kernel: ${info.kernelVersion}`);
logger.logBoxLine(`CPU: ${info.cpuModel} (${info.cpuCores} cores)`);
logger.logBoxLine(`RAM: ${Math.round(info.ramTotal / 1024)} GB total, ${Math.round(info.ramAvailable / 1024)} GB available`);
logger.logBoxLine(
`RAM: ${Math.round(info.ramTotal / 1024)} GB total, ${
Math.round(info.ramAvailable / 1024)
} GB available`,
);
logger.logBoxLine('');
if (info.dockerVersion) {