feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

This commit is contained in:
2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
+27 -9
View File
@@ -20,10 +20,13 @@ export class AmdDriver extends BaseDriver {
*/
public async isInstalled(): Promise<boolean> {
try {
const { stdout } = await this.execCommand('rocm-smi --showdriverversion 2>/dev/null | head -1', {
timeout: 5000,
ignoreErrors: true,
});
const { stdout } = await this.execCommand(
'rocm-smi --showdriverversion 2>/dev/null | head -1',
{
timeout: 5000,
ignoreErrors: true,
},
);
return stdout.includes('Driver');
} catch {
return false;
@@ -114,7 +117,10 @@ export class AmdDriver extends BaseDriver {
try {
if (distro.id === 'ubuntu') {
return await this.installOnUbuntu(options);
} else if (distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' || distro.id === 'almalinux') {
} else if (
distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' ||
distro.id === 'almalinux'
) {
return await this.installOnRhel(options);
} else {
logger.error(`Unsupported distribution: ${distro.id}`);
@@ -122,7 +128,11 @@ export class AmdDriver extends BaseDriver {
return false;
}
} catch (error) {
logger.error(`Failed to install AMD ROCm drivers: ${error instanceof Error ? error.message : String(error)}`);
logger.error(
`Failed to install AMD ROCm drivers: ${
error instanceof Error ? error.message : String(error)
}`,
);
return false;
}
}
@@ -152,7 +162,9 @@ export class AmdDriver extends BaseDriver {
// Add AMDGPU repository
await this.execCommand(
`echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${rocmVersion}/ubuntu ${ubuntuVersion === '2204' ? 'jammy' : 'focal'} main" > /etc/apt/sources.list.d/amdgpu.list`,
`echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${rocmVersion}/ubuntu ${
ubuntuVersion === '2204' ? 'jammy' : 'focal'
} main" > /etc/apt/sources.list.d/amdgpu.list`,
);
await this.aptUpdate();
@@ -250,7 +262,9 @@ EOF`,
// No special runtime needed, just need to pass --device flags
// Verify device files exist
const { stdout: devices } = await this.execCommand('ls -la /dev/kfd /dev/dri/render* 2>/dev/null || true');
const { stdout: devices } = await this.execCommand(
'ls -la /dev/kfd /dev/dri/render* 2>/dev/null || true',
);
if (!devices.includes('/dev/kfd')) {
logger.warn('/dev/kfd not found. ROCm driver may not be properly loaded.');
@@ -266,7 +280,11 @@ EOF`,
logger.info(' --device=/dev/kfd --device=/dev/dri --group-add video');
return true;
} catch (error) {
logger.error(`Failed to configure ROCm container support: ${error instanceof Error ? error.message : String(error)}`);
logger.error(
`Failed to configure ROCm container support: ${
error instanceof Error ? error.message : String(error)
}`,
);
return false;
}
}