feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

This commit is contained in:
2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
+37 -14
View File
@@ -20,10 +20,13 @@ export class NvidiaDriver extends BaseDriver {
*/
public async isInstalled(): Promise<boolean> {
try {
const { stdout } = await this.execCommand('nvidia-smi --query-gpu=driver_version --format=csv,noheader', {
timeout: 5000,
ignoreErrors: true,
});
const { stdout } = await this.execCommand(
'nvidia-smi --query-gpu=driver_version --format=csv,noheader',
{
timeout: 5000,
ignoreErrors: true,
},
);
return stdout.trim().length > 0;
} catch {
return false;
@@ -115,7 +118,10 @@ export class NvidiaDriver extends BaseDriver {
try {
if (distro.id === 'ubuntu' || distro.id === 'debian') {
return await this.installOnDebian(options);
} else if (distro.id === 'fedora' || distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' || distro.id === 'almalinux') {
} else if (
distro.id === 'fedora' || distro.id === 'rhel' || distro.id === 'centos' ||
distro.id === 'rocky' || distro.id === 'almalinux'
) {
return await this.installOnRhel(options);
} else {
logger.error(`Unsupported distribution: ${distro.id}`);
@@ -123,7 +129,11 @@ export class NvidiaDriver extends BaseDriver {
return false;
}
} catch (error) {
logger.error(`Failed to install NVIDIA drivers: ${error instanceof Error ? error.message : String(error)}`);
logger.error(
`Failed to install NVIDIA drivers: ${
error instanceof Error ? error.message : String(error)
}`,
);
return false;
}
}
@@ -181,7 +191,9 @@ export class NvidiaDriver extends BaseDriver {
// Add NVIDIA CUDA repository
const distro = await this.getLinuxDistro();
const repoUrl = `https://developer.download.nvidia.com/compute/cuda/repos/rhel${distro.version.split('.')[0]}/x86_64/cuda-rhel${distro.version.split('.')[0]}.repo`;
const repoUrl = `https://developer.download.nvidia.com/compute/cuda/repos/rhel${
distro.version.split('.')[0]
}/x86_64/cuda-rhel${distro.version.split('.')[0]}.repo`;
await this.execCommand(`dnf config-manager --add-repo ${repoUrl}`);
@@ -213,8 +225,11 @@ export class NvidiaDriver extends BaseDriver {
if (distro.id === 'ubuntu' || distro.id === 'debian') {
// Add CUDA repository
const cudaKeyUrl = 'https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb';
await this.execCommand(`wget -q ${cudaKeyUrl} -O /tmp/cuda-keyring.deb && dpkg -i /tmp/cuda-keyring.deb`);
const cudaKeyUrl =
'https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb';
await this.execCommand(
`wget -q ${cudaKeyUrl} -O /tmp/cuda-keyring.deb && dpkg -i /tmp/cuda-keyring.deb`,
);
await this.aptUpdate();
const cudaPackage = options.toolkitVersion
@@ -247,8 +262,8 @@ export class NvidiaDriver extends BaseDriver {
const distribution = `${distro.id}${distro.version}`;
await this.execCommand(
`curl -s -L https://nvidia.github.io/libnvidia-container/${distribution}/libnvidia-container.list | ` +
'sed "s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g" | ' +
'tee /etc/apt/sources.list.d/nvidia-container-toolkit.list',
'sed "s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g" | ' +
'tee /etc/apt/sources.list.d/nvidia-container-toolkit.list',
);
await this.aptUpdate();
@@ -257,7 +272,7 @@ export class NvidiaDriver extends BaseDriver {
// RHEL/Fedora
await this.execCommand(
'curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | ' +
'tee /etc/yum.repos.d/nvidia-container-toolkit.repo',
'tee /etc/yum.repos.d/nvidia-container-toolkit.repo',
);
await this.dnfInstall('nvidia-container-toolkit');
}
@@ -268,7 +283,11 @@ export class NvidiaDriver extends BaseDriver {
logger.success('NVIDIA Container Toolkit installed successfully');
return true;
} catch (error) {
logger.error(`Failed to install NVIDIA Container Toolkit: ${error instanceof Error ? error.message : String(error)}`);
logger.error(
`Failed to install NVIDIA Container Toolkit: ${
error instanceof Error ? error.message : String(error)
}`,
);
return false;
}
}
@@ -288,7 +307,11 @@ export class NvidiaDriver extends BaseDriver {
logger.success('Docker configured to use NVIDIA runtime');
} catch (error) {
logger.warn(`Could not configure Docker runtime automatically: ${error instanceof Error ? error.message : String(error)}`);
logger.warn(
`Could not configure Docker runtime automatically: ${
error instanceof Error ? error.message : String(error)
}`,
);
logger.info('Please run: nvidia-ctk runtime configure --runtime=docker');
}
}