feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

This commit is contained in:
2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
+27 -9
View File
@@ -20,10 +20,13 @@ export class AmdDriver extends BaseDriver {
*/
public async isInstalled(): Promise<boolean> {
try {
const { stdout } = await this.execCommand('rocm-smi --showdriverversion 2>/dev/null | head -1', {
timeout: 5000,
ignoreErrors: true,
});
const { stdout } = await this.execCommand(
'rocm-smi --showdriverversion 2>/dev/null | head -1',
{
timeout: 5000,
ignoreErrors: true,
},
);
return stdout.includes('Driver');
} catch {
return false;
@@ -114,7 +117,10 @@ export class AmdDriver extends BaseDriver {
try {
if (distro.id === 'ubuntu') {
return await this.installOnUbuntu(options);
} else if (distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' || distro.id === 'almalinux') {
} else if (
distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' ||
distro.id === 'almalinux'
) {
return await this.installOnRhel(options);
} else {
logger.error(`Unsupported distribution: ${distro.id}`);
@@ -122,7 +128,11 @@ export class AmdDriver extends BaseDriver {
return false;
}
} catch (error) {
logger.error(`Failed to install AMD ROCm drivers: ${error instanceof Error ? error.message : String(error)}`);
logger.error(
`Failed to install AMD ROCm drivers: ${
error instanceof Error ? error.message : String(error)
}`,
);
return false;
}
}
@@ -152,7 +162,9 @@ export class AmdDriver extends BaseDriver {
// Add AMDGPU repository
await this.execCommand(
`echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${rocmVersion}/ubuntu ${ubuntuVersion === '2204' ? 'jammy' : 'focal'} main" > /etc/apt/sources.list.d/amdgpu.list`,
`echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${rocmVersion}/ubuntu ${
ubuntuVersion === '2204' ? 'jammy' : 'focal'
} main" > /etc/apt/sources.list.d/amdgpu.list`,
);
await this.aptUpdate();
@@ -250,7 +262,9 @@ EOF`,
// No special runtime needed, just need to pass --device flags
// Verify device files exist
const { stdout: devices } = await this.execCommand('ls -la /dev/kfd /dev/dri/render* 2>/dev/null || true');
const { stdout: devices } = await this.execCommand(
'ls -la /dev/kfd /dev/dri/render* 2>/dev/null || true',
);
if (!devices.includes('/dev/kfd')) {
logger.warn('/dev/kfd not found. ROCm driver may not be properly loaded.');
@@ -266,7 +280,11 @@ EOF`,
logger.info(' --device=/dev/kfd --device=/dev/dri --group-add video');
return true;
} catch (error) {
logger.error(`Failed to configure ROCm container support: ${error instanceof Error ? error.message : String(error)}`);
logger.error(
`Failed to configure ROCm container support: ${
error instanceof Error ? error.message : String(error)
}`,
);
return false;
}
}
+8 -2
View File
@@ -177,7 +177,9 @@ export abstract class BaseDriver {
/**
 * Register an APT repository, optionally importing its signing key first.
 *
 * When `keyUrl` is provided, the key is fetched with `curl`, converted with
 * `gpg --dearmor`, and written to `/usr/share/keyrings/<basename of keyUrl>.gpg`.
 * The repository is then added via `add-apt-repository -y`.
 *
 * No errors are caught here; whatever `execCommand` raises propagates to the caller.
 *
 * @param repo - Repository specification handed to `add-apt-repository`.
 * @param keyUrl - Optional URL of the repository's GPG key.
 */
protected async addAptRepository(repo: string, keyUrl?: string): Promise<void> {
  if (keyUrl) {
    // Import the key exactly once. The URL is double-quoted (matching how `repo`
    // is quoted below) so characters such as `&`, `?` or spaces in the URL are not
    // interpreted by the shell as separators or globs.
    await this.execCommand(
      `curl -fsSL "${keyUrl}" | gpg --dearmor -o "/usr/share/keyrings/$(basename "${keyUrl}").gpg"`,
    );
  }

  await this.execCommand(`add-apt-repository -y "${repo}"`);
}
@@ -188,7 +190,11 @@ export abstract class BaseDriver {
public async logStatus(): Promise<void> {
const status = await this.getStatus();
logger.logBoxTitle(`${this.displayName} Driver Status`, 60, status.installed ? 'success' : 'warning');
logger.logBoxTitle(
`${this.displayName} Driver Status`,
60,
status.installed ? 'success' : 'warning',
);
logger.logBoxLine(`Installed: ${status.installed ? 'Yes' : 'No'}`);
if (status.installed) {
+11 -8
View File
@@ -21,7 +21,7 @@ export class DriverManager {
constructor() {
this.gpuDetector = new GpuDetector();
this.drivers = new Map([
this.drivers = new Map<TGpuVendor, BaseDriver>([
['nvidia', new NvidiaDriver()],
['amd', new AmdDriver()],
['intel', new IntelDriver()],
@@ -197,10 +197,15 @@ export class DriverManager {
// Print status for each vendor
for (const [vendor, gpuList] of vendorGpus) {
if (vendor === 'unknown') {
logger.logBox('Unknown GPUs', [
`${gpuList.length} GPU(s) with unknown vendor`,
'Manual driver installation may be required',
], 50, 'warning');
logger.logBox(
'Unknown GPUs',
[
`${gpuList.length} GPU(s) with unknown vendor`,
'Manual driver installation may be required',
],
50,
'warning',
);
continue;
}
@@ -219,9 +224,7 @@ export class DriverManager {
const args: string[] = [];
// Filter to specific GPUs if provided
const targetGpus = gpuIds
? gpus.filter((g) => gpuIds.includes(g.id))
: gpus;
const targetGpus = gpuIds ? gpus.filter((g) => gpuIds.includes(g.id)) : gpus;
if (targetGpus.length === 0) {
return args;
+18 -4
View File
@@ -138,7 +138,11 @@ export class IntelDriver extends BaseDriver {
return false;
}
} catch (error) {
logger.error(`Failed to install Intel drivers: ${error instanceof Error ? error.message : String(error)}`);
logger.error(
`Failed to install Intel drivers: ${
error instanceof Error ? error.message : String(error)
}`,
);
return false;
}
}
@@ -159,7 +163,11 @@ export class IntelDriver extends BaseDriver {
);
const distro = await this.getLinuxDistro();
const ubuntuCodename = distro.version === '22.04' ? 'jammy' : distro.version === '24.04' ? 'noble' : 'jammy';
const ubuntuCodename = distro.version === '22.04'
? 'jammy'
: distro.version === '24.04'
? 'noble'
: 'jammy';
await this.execCommand(
`echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu ${ubuntuCodename} arc" > /etc/apt/sources.list.d/intel-graphics.list`,
@@ -308,7 +316,9 @@ EOF`,
try {
// Intel GPUs work by passing through device files
// Verify render devices exist
const { stdout: devices } = await this.execCommand('ls -la /dev/dri/renderD* 2>/dev/null || true');
const { stdout: devices } = await this.execCommand(
'ls -la /dev/dri/renderD* 2>/dev/null || true',
);
if (!devices.includes('renderD')) {
logger.warn('/dev/dri/renderD* not found. Intel GPU driver may not be properly loaded.');
@@ -323,7 +333,11 @@ EOF`,
logger.info(' --device=/dev/dri --group-add render');
return true;
} catch (error) {
logger.error(`Failed to configure Intel container support: ${error instanceof Error ? error.message : String(error)}`);
logger.error(
`Failed to configure Intel container support: ${
error instanceof Error ? error.message : String(error)
}`,
);
return false;
}
}
+37 -14
View File
@@ -20,10 +20,13 @@ export class NvidiaDriver extends BaseDriver {
*/
public async isInstalled(): Promise<boolean> {
try {
const { stdout } = await this.execCommand('nvidia-smi --query-gpu=driver_version --format=csv,noheader', {
timeout: 5000,
ignoreErrors: true,
});
const { stdout } = await this.execCommand(
'nvidia-smi --query-gpu=driver_version --format=csv,noheader',
{
timeout: 5000,
ignoreErrors: true,
},
);
return stdout.trim().length > 0;
} catch {
return false;
@@ -115,7 +118,10 @@ export class NvidiaDriver extends BaseDriver {
try {
if (distro.id === 'ubuntu' || distro.id === 'debian') {
return await this.installOnDebian(options);
} else if (distro.id === 'fedora' || distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' || distro.id === 'almalinux') {
} else if (
distro.id === 'fedora' || distro.id === 'rhel' || distro.id === 'centos' ||
distro.id === 'rocky' || distro.id === 'almalinux'
) {
return await this.installOnRhel(options);
} else {
logger.error(`Unsupported distribution: ${distro.id}`);
@@ -123,7 +129,11 @@ export class NvidiaDriver extends BaseDriver {
return false;
}
} catch (error) {
logger.error(`Failed to install NVIDIA drivers: ${error instanceof Error ? error.message : String(error)}`);
logger.error(
`Failed to install NVIDIA drivers: ${
error instanceof Error ? error.message : String(error)
}`,
);
return false;
}
}
@@ -181,7 +191,9 @@ export class NvidiaDriver extends BaseDriver {
// Add NVIDIA CUDA repository
const distro = await this.getLinuxDistro();
const repoUrl = `https://developer.download.nvidia.com/compute/cuda/repos/rhel${distro.version.split('.')[0]}/x86_64/cuda-rhel${distro.version.split('.')[0]}.repo`;
const repoUrl = `https://developer.download.nvidia.com/compute/cuda/repos/rhel${
distro.version.split('.')[0]
}/x86_64/cuda-rhel${distro.version.split('.')[0]}.repo`;
await this.execCommand(`dnf config-manager --add-repo ${repoUrl}`);
@@ -213,8 +225,11 @@ export class NvidiaDriver extends BaseDriver {
if (distro.id === 'ubuntu' || distro.id === 'debian') {
// Add CUDA repository
const cudaKeyUrl = 'https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb';
await this.execCommand(`wget -q ${cudaKeyUrl} -O /tmp/cuda-keyring.deb && dpkg -i /tmp/cuda-keyring.deb`);
const cudaKeyUrl =
'https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb';
await this.execCommand(
`wget -q ${cudaKeyUrl} -O /tmp/cuda-keyring.deb && dpkg -i /tmp/cuda-keyring.deb`,
);
await this.aptUpdate();
const cudaPackage = options.toolkitVersion
@@ -247,8 +262,8 @@ export class NvidiaDriver extends BaseDriver {
const distribution = `${distro.id}${distro.version}`;
await this.execCommand(
`curl -s -L https://nvidia.github.io/libnvidia-container/${distribution}/libnvidia-container.list | ` +
'sed "s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g" | ' +
'tee /etc/apt/sources.list.d/nvidia-container-toolkit.list',
'sed "s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g" | ' +
'tee /etc/apt/sources.list.d/nvidia-container-toolkit.list',
);
await this.aptUpdate();
@@ -257,7 +272,7 @@ export class NvidiaDriver extends BaseDriver {
// RHEL/Fedora
await this.execCommand(
'curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | ' +
'tee /etc/yum.repos.d/nvidia-container-toolkit.repo',
'tee /etc/yum.repos.d/nvidia-container-toolkit.repo',
);
await this.dnfInstall('nvidia-container-toolkit');
}
@@ -268,7 +283,11 @@ export class NvidiaDriver extends BaseDriver {
logger.success('NVIDIA Container Toolkit installed successfully');
return true;
} catch (error) {
logger.error(`Failed to install NVIDIA Container Toolkit: ${error instanceof Error ? error.message : String(error)}`);
logger.error(
`Failed to install NVIDIA Container Toolkit: ${
error instanceof Error ? error.message : String(error)
}`,
);
return false;
}
}
@@ -288,7 +307,11 @@ export class NvidiaDriver extends BaseDriver {
logger.success('Docker configured to use NVIDIA runtime');
} catch (error) {
logger.warn(`Could not configure Docker runtime automatically: ${error instanceof Error ? error.message : String(error)}`);
logger.warn(
`Could not configure Docker runtime automatically: ${
error instanceof Error ? error.message : String(error)
}`,
);
logger.info('Please run: nvidia-ctk runtime configure --runtime=docker');
}
}