feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
+37
-14
@@ -20,10 +20,13 @@ export class NvidiaDriver extends BaseDriver {
|
||||
*/
|
||||
public async isInstalled(): Promise<boolean> {
|
||||
try {
|
||||
const { stdout } = await this.execCommand('nvidia-smi --query-gpu=driver_version --format=csv,noheader', {
|
||||
timeout: 5000,
|
||||
ignoreErrors: true,
|
||||
});
|
||||
const { stdout } = await this.execCommand(
|
||||
'nvidia-smi --query-gpu=driver_version --format=csv,noheader',
|
||||
{
|
||||
timeout: 5000,
|
||||
ignoreErrors: true,
|
||||
},
|
||||
);
|
||||
return stdout.trim().length > 0;
|
||||
} catch {
|
||||
return false;
|
||||
@@ -115,7 +118,10 @@ export class NvidiaDriver extends BaseDriver {
|
||||
try {
|
||||
if (distro.id === 'ubuntu' || distro.id === 'debian') {
|
||||
return await this.installOnDebian(options);
|
||||
} else if (distro.id === 'fedora' || distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' || distro.id === 'almalinux') {
|
||||
} else if (
|
||||
distro.id === 'fedora' || distro.id === 'rhel' || distro.id === 'centos' ||
|
||||
distro.id === 'rocky' || distro.id === 'almalinux'
|
||||
) {
|
||||
return await this.installOnRhel(options);
|
||||
} else {
|
||||
logger.error(`Unsupported distribution: ${distro.id}`);
|
||||
@@ -123,7 +129,11 @@ export class NvidiaDriver extends BaseDriver {
|
||||
return false;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Failed to install NVIDIA drivers: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.error(
|
||||
`Failed to install NVIDIA drivers: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -181,7 +191,9 @@ export class NvidiaDriver extends BaseDriver {
|
||||
|
||||
// Add NVIDIA CUDA repository
|
||||
const distro = await this.getLinuxDistro();
|
||||
const repoUrl = `https://developer.download.nvidia.com/compute/cuda/repos/rhel${distro.version.split('.')[0]}/x86_64/cuda-rhel${distro.version.split('.')[0]}.repo`;
|
||||
const repoUrl = `https://developer.download.nvidia.com/compute/cuda/repos/rhel${
|
||||
distro.version.split('.')[0]
|
||||
}/x86_64/cuda-rhel${distro.version.split('.')[0]}.repo`;
|
||||
|
||||
await this.execCommand(`dnf config-manager --add-repo ${repoUrl}`);
|
||||
|
||||
@@ -213,8 +225,11 @@ export class NvidiaDriver extends BaseDriver {
|
||||
|
||||
if (distro.id === 'ubuntu' || distro.id === 'debian') {
|
||||
// Add CUDA repository
|
||||
const cudaKeyUrl = 'https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb';
|
||||
await this.execCommand(`wget -q ${cudaKeyUrl} -O /tmp/cuda-keyring.deb && dpkg -i /tmp/cuda-keyring.deb`);
|
||||
const cudaKeyUrl =
|
||||
'https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb';
|
||||
await this.execCommand(
|
||||
`wget -q ${cudaKeyUrl} -O /tmp/cuda-keyring.deb && dpkg -i /tmp/cuda-keyring.deb`,
|
||||
);
|
||||
await this.aptUpdate();
|
||||
|
||||
const cudaPackage = options.toolkitVersion
|
||||
@@ -247,8 +262,8 @@ export class NvidiaDriver extends BaseDriver {
|
||||
const distribution = `${distro.id}${distro.version}`;
|
||||
await this.execCommand(
|
||||
`curl -s -L https://nvidia.github.io/libnvidia-container/${distribution}/libnvidia-container.list | ` +
|
||||
'sed "s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g" | ' +
|
||||
'tee /etc/apt/sources.list.d/nvidia-container-toolkit.list',
|
||||
'sed "s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g" | ' +
|
||||
'tee /etc/apt/sources.list.d/nvidia-container-toolkit.list',
|
||||
);
|
||||
|
||||
await this.aptUpdate();
|
||||
@@ -257,7 +272,7 @@ export class NvidiaDriver extends BaseDriver {
|
||||
// RHEL/Fedora
|
||||
await this.execCommand(
|
||||
'curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | ' +
|
||||
'tee /etc/yum.repos.d/nvidia-container-toolkit.repo',
|
||||
'tee /etc/yum.repos.d/nvidia-container-toolkit.repo',
|
||||
);
|
||||
await this.dnfInstall('nvidia-container-toolkit');
|
||||
}
|
||||
@@ -268,7 +283,11 @@ export class NvidiaDriver extends BaseDriver {
|
||||
logger.success('NVIDIA Container Toolkit installed successfully');
|
||||
return true;
|
||||
} catch (error) {
|
||||
logger.error(`Failed to install NVIDIA Container Toolkit: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.error(
|
||||
`Failed to install NVIDIA Container Toolkit: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -288,7 +307,11 @@ export class NvidiaDriver extends BaseDriver {
|
||||
|
||||
logger.success('Docker configured to use NVIDIA runtime');
|
||||
} catch (error) {
|
||||
logger.warn(`Could not configure Docker runtime automatically: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.warn(
|
||||
`Could not configure Docker runtime automatically: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
logger.info('Please run: nvidia-ctk runtime configure --runtime=docker');
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user