Files

319 lines
10 KiB
TypeScript
Raw Permalink Normal View History

2026-01-30 03:16:57 +00:00
/**
* NVIDIA Driver Management
*
* Handles NVIDIA driver detection, installation, and container toolkit setup.
*/
import type { IDriverStatus } from '../interfaces/gpu.ts';
import { logger } from '../logger.ts';
import { BaseDriver, type IDriverInstallOptions } from './base-driver.ts';
/**
* NVIDIA Driver Manager
*/
export class NvidiaDriver extends BaseDriver {
  public readonly vendor = 'nvidia' as const;
  public readonly displayName = 'NVIDIA';

  /** Distribution IDs that take the apt-based installation path. */
  private static readonly DEBIAN_FAMILY_IDS: readonly string[] = ['ubuntu', 'debian'];

  /** Distribution IDs that take the dnf-based installation path. */
  private static readonly RHEL_FAMILY_IDS: readonly string[] = [
    'fedora',
    'rhel',
    'centos',
    'rocky',
    'almalinux',
  ];

  /**
   * Extract a driver version from `nvidia-smi --query-gpu=driver_version` output.
   *
   * Only the first line is considered (one line is printed per GPU). The line
   * must start with a dotted numeric version such as `535.129.03`; anything
   * else (empty output, an error banner) is treated as "no driver".
   *
   * @param output Raw stdout of the nvidia-smi query.
   * @returns The version string, or `undefined` when none could be recognized.
   */
  private extractDriverVersion(output: string): string | undefined {
    const firstLine = output.split('\n')[0]?.trim() ?? '';
    return /^\d+(?:\.\d+)+/.test(firstLine) ? firstLine : undefined;
  }

  /**
   * Check if the NVIDIA driver is installed.
   *
   * @returns `true` when nvidia-smi reports a recognizable driver version,
   *          `false` otherwise (including when the command cannot be run).
   */
  public async isInstalled(): Promise<boolean> {
    try {
      const { stdout } = await this.execCommand(
        'nvidia-smi --query-gpu=driver_version --format=csv,noheader',
        {
          timeout: 5000,
          // NOTE(review): presumably lets the call resolve on a non-zero exit — confirm in BaseDriver.
          ignoreErrors: true,
        },
      );
      // Non-empty stdout is not enough: require something that looks like a version.
      return this.extractDriverVersion(stdout) !== undefined;
    } catch {
      return false;
    }
  }

  /**
   * Get NVIDIA driver status.
   *
   * Probes, in order: the driver (nvidia-smi), the CUDA toolkit (nvcc), the
   * NVIDIA Container Toolkit (nvidia-container-cli) and Docker's runtime list.
   * When the driver itself is not detected the remaining probes are skipped.
   *
   * @returns A status object; human-readable problems are collected in `issues`.
   */
  public async getStatus(): Promise<IDriverStatus> {
    const status: IDriverStatus = {
      vendor: 'nvidia',
      installed: false,
      containerSupport: false,
      issues: [],
    };

    // Driver probe. No `| head -1` here: a shell pipeline reports the exit
    // status of its last command, which would hide a missing nvidia-smi.
    let driverVersion: string | undefined;
    try {
      const { stdout } = await this.execCommand(
        'nvidia-smi --query-gpu=driver_version --format=csv,noheader',
        { timeout: 5000 },
      );
      driverVersion = this.extractDriverVersion(stdout);
    } catch {
      driverVersion = undefined;
    }
    if (driverVersion === undefined) {
      status.issues.push('NVIDIA driver not installed or nvidia-smi not available');
      return status;
    }
    status.installed = true;
    status.version = driverVersion;

    // CUDA toolkit probe (optional component, absence is not an issue).
    try {
      const { stdout: cudaVersion } = await this.execCommand(
        'nvcc --version 2>/dev/null | grep "release" | sed "s/.*release \\([0-9.]*\\).*/\\1/"',
        { timeout: 5000, ignoreErrors: true },
      );
      const trimmedCuda = cudaVersion.trim();
      if (trimmedCuda) {
        status.toolkitVersion = trimmedCuda;
      }
    } catch {
      // CUDA toolkit not installed
    }

    // NVIDIA Container Toolkit probe.
    try {
      const { stdout: containerVersion } = await this.execCommand(
        'nvidia-container-cli --version 2>&1 | head -1',
        { timeout: 5000, ignoreErrors: true },
      );
      if (containerVersion.includes('version')) {
        status.containerSupport = true;
        // The CLI prints "cli-version: X.Y.Z"; accept the form with or without a colon.
        const match = containerVersion.match(/version:?\s*(\d+\.\d+\.\d+)/);
        if (match) {
          status.containerRuntimeVersion = match[1];
        }
      }
    } catch {
      // Treated the same as "not detected" below.
    }
    if (!status.containerSupport) {
      // Reported whenever the toolkit was not detected, not only when the probe threw.
      status.issues.push('NVIDIA Container Toolkit not installed');
    }

    // Docker runtime probe.
    try {
      const { stdout: dockerInfo } = await this.execCommand(
        'docker info --format "{{.Runtimes}}" 2>/dev/null',
        { timeout: 5000, ignoreErrors: true },
      );
      if (!dockerInfo.includes('nvidia')) {
        status.issues.push('Docker nvidia runtime not configured');
      }
    } catch {
      // Docker check failed
    }

    return status;
  }

  /**
   * Install NVIDIA driver and optionally CUDA toolkit / container support.
   *
   * Requires root. Dispatches to the apt- or dnf-based installer depending on
   * the detected distribution; unsupported distributions are rejected.
   *
   * @param options Which components/versions to install.
   * @returns `true` when everything that was requested got installed,
   *          `false` otherwise. Errors are logged, not thrown.
   */
  public async install(options: IDriverInstallOptions): Promise<boolean> {
    if (!await this.isRoot()) {
      logger.error('Root privileges required to install NVIDIA drivers');
      return false;
    }

    const distro = await this.getLinuxDistro();
    logger.info(`Detected Linux distribution: ${distro.id} ${distro.version}`);

    try {
      if (NvidiaDriver.DEBIAN_FAMILY_IDS.includes(distro.id)) {
        return await this.installOnDebian(options);
      }
      if (NvidiaDriver.RHEL_FAMILY_IDS.includes(distro.id)) {
        return await this.installOnRhel(options);
      }
      logger.error(`Unsupported distribution: ${distro.id}`);
      logger.info('Please install NVIDIA drivers manually');
      return false;
    } catch (error) {
      logger.error(`Failed to install NVIDIA drivers: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Install on Debian/Ubuntu.
   *
   * @param options Which components/versions to install.
   * @returns `false` when container support was requested but its
   *          installation failed, `true` otherwise.
   */
  private async installOnDebian(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing NVIDIA drivers on Debian/Ubuntu...');

    // Refresh package lists, then install build prerequisites
    await this.aptUpdate();
    await this.aptInstall(['software-properties-common', 'build-essential', 'dkms']);

    // Add NVIDIA PPA (for Ubuntu); best effort, failures are ignored
    try {
      await this.execCommand('add-apt-repository -y ppa:graphics-drivers/ppa 2>/dev/null || true');
      await this.aptUpdate();
    } catch {
      // PPA might not be available on all systems
    }

    // Install NVIDIA driver
    const driverPackage = options.driverVersion
      ? `nvidia-driver-${options.driverVersion}`
      : 'nvidia-driver-535'; // Default to stable version
    await this.aptInstall(driverPackage);

    // Install CUDA toolkit if requested
    if (options.installToolkit) {
      await this.installCudaToolkit(options);
    }

    // Install container support if requested; keep its result instead of discarding it
    let containerSupportOk = true;
    if (options.installContainerSupport) {
      containerSupportOk = await this.installContainerSupport();
    }

    logger.success('NVIDIA driver installation completed');
    logger.warn('A system reboot is required to load the new driver');
    if (!containerSupportOk) {
      logger.warn('NVIDIA Container Toolkit installation did not complete; see errors above');
    }
    return containerSupportOk;
  }

  /**
   * Install on RHEL/Fedora.
   *
   * @param options Which components/versions to install.
   * @returns `false` when container support was requested but its
   *          installation failed, `true` otherwise.
   */
  private async installOnRhel(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing NVIDIA drivers on RHEL/Fedora...');

    // Install prerequisites
    await this.dnfInstall(['kernel-devel', 'kernel-headers', 'gcc', 'make', 'dkms', 'acpid']);

    // Add NVIDIA CUDA repository. Fedora has its own repo tree ("fedoraNN");
    // the RHEL rebuilds (CentOS, Rocky, AlmaLinux) share the "rhelN" tree.
    const distro = await this.getLinuxDistro();
    const majorVersion = distro.version.split('.')[0];
    const repoFamily = distro.id === 'fedora' ? 'fedora' : 'rhel';
    const repoSlug = `${repoFamily}${majorVersion}`;
    const repoUrl = `https://developer.download.nvidia.com/compute/cuda/repos/${repoSlug}/x86_64/cuda-${repoSlug}.repo`;
    await this.execCommand(`dnf config-manager --add-repo ${repoUrl}`);

    // Install NVIDIA driver
    await this.dnfInstall('nvidia-driver-latest-dkms');

    // Install CUDA toolkit if requested
    if (options.installToolkit) {
      await this.dnfInstall('cuda');
    }

    // Install container support if requested; keep its result instead of discarding it
    let containerSupportOk = true;
    if (options.installContainerSupport) {
      containerSupportOk = await this.installContainerSupport();
    }

    logger.success('NVIDIA driver installation completed');
    logger.warn('A system reboot is required to load the new driver');
    if (!containerSupportOk) {
      logger.warn('NVIDIA Container Toolkit installation did not complete; see errors above');
    }
    return containerSupportOk;
  }

  /**
   * Install CUDA toolkit (apt-based distributions only; a no-op elsewhere).
   *
   * @param options `toolkitVersion` selects a specific `cuda-toolkit-<major>-<minor>`
   *                package; without it the unversioned `cuda-toolkit` is installed.
   */
  private async installCudaToolkit(options: IDriverInstallOptions): Promise<void> {
    logger.info('Installing CUDA toolkit...');

    const distro = await this.getLinuxDistro();
    if (!NvidiaDriver.DEBIAN_FAMILY_IDS.includes(distro.id)) {
      return;
    }

    // Repo directories are named after the release: "ubuntu2204" for Ubuntu
    // 22.04, "debian12" for Debian 12.
    // NOTE(review): assumes distro.version is the os-release VERSION_ID ("22.04", "12") — confirm.
    const repoSlug = distro.id === 'ubuntu'
      ? `ubuntu${distro.version.replace(/\./g, '')}`
      : `debian${distro.version.split('.')[0]}`;

    // Add CUDA repository
    const cudaKeyUrl = `https://developer.download.nvidia.com/compute/cuda/repos/${repoSlug}/x86_64/cuda-keyring_1.1-1_all.deb`;
    await this.execCommand(`wget -q ${cudaKeyUrl} -O /tmp/cuda-keyring.deb && dpkg -i /tmp/cuda-keyring.deb`);
    await this.aptUpdate();

    // Package names carry only major-minor joined by a dash, so "12.4.1"
    // and "12.4" both map to "cuda-toolkit-12-4".
    const cudaPackage = options.toolkitVersion
      ? `cuda-toolkit-${options.toolkitVersion.split('.').slice(0, 2).join('-')}`
      : 'cuda-toolkit';
    await this.aptInstall(cudaPackage);
  }

  /**
   * Install NVIDIA Container Toolkit.
   *
   * Requires root. Registers NVIDIA's package repository, installs
   * `nvidia-container-toolkit` and then tries to configure Docker.
   *
   * @returns `true` on success, `false` when not root or when installation
   *          failed. Errors are logged, not thrown.
   */
  public async installContainerSupport(): Promise<boolean> {
    if (!await this.isRoot()) {
      logger.error('Root privileges required to install NVIDIA Container Toolkit');
      return false;
    }

    const distro = await this.getLinuxDistro();
    logger.info('Installing NVIDIA Container Toolkit...');

    try {
      if (NvidiaDriver.DEBIAN_FAMILY_IDS.includes(distro.id)) {
        // Import the signing key. --batch --yes lets a re-run overwrite an
        // existing keyring instead of prompting or failing.
        await this.execCommand(
          'curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --batch --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg',
        );
        // Use the distribution-independent "stable/deb" list (mirrors the
        // "stable/rpm" repo used below) and pin it to the imported key.
        await this.execCommand(
          'curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | ' +
            'sed "s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g" | ' +
            'tee /etc/apt/sources.list.d/nvidia-container-toolkit.list',
        );
        await this.aptUpdate();
        await this.aptInstall('nvidia-container-toolkit');
      } else {
        // RHEL/Fedora (and any other distribution: dnf is the fallback path)
        await this.execCommand(
          'curl -fsSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | ' +
            'tee /etc/yum.repos.d/nvidia-container-toolkit.repo',
        );
        await this.dnfInstall('nvidia-container-toolkit');
      }

      // Configure Docker runtime (best effort; it logs its own warnings)
      await this.configureDockerRuntime();

      logger.success('NVIDIA Container Toolkit installed successfully');
      return true;
    } catch (error) {
      logger.error(`Failed to install NVIDIA Container Toolkit: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Configure Docker to use NVIDIA runtime.
   *
   * Best effort: a failure is logged as a warning together with the manual
   * command, and is not propagated to the caller.
   */
  private async configureDockerRuntime(): Promise<void> {
    logger.info('Configuring Docker to use NVIDIA runtime...');
    try {
      // Run nvidia-ctk to configure Docker
      await this.execCommand('nvidia-ctk runtime configure --runtime=docker');
      // Restart Docker so the new runtime configuration is picked up
      await this.execCommand('systemctl restart docker');
      logger.success('Docker configured to use NVIDIA runtime');
    } catch (error) {
      logger.warn(`Could not configure Docker runtime automatically: ${error instanceof Error ? error.message : String(error)}`);
      logger.info('Please run: nvidia-ctk runtime configure --runtime=docker');
    }
  }

  /**
   * Get available driver versions.
   *
   * Only implemented for apt-based distributions, where the major versions of
   * all `nvidia-driver-<N>` packages are listed (highest first).
   *
   * @returns The version numbers as strings; empty when nothing was found,
   *          on other distributions, or when the lookup failed.
   */
  public async getAvailableVersions(): Promise<string[]> {
    const versions: string[] = [];
    try {
      const distro = await this.getLinuxDistro();
      if (NvidiaDriver.DEBIAN_FAMILY_IDS.includes(distro.id)) {
        const { stdout } = await this.execCommand(
          'apt-cache search nvidia-driver | grep "^nvidia-driver-[0-9]" | sed "s/nvidia-driver-\\([0-9]*\\).*/\\1/" | sort -rn | uniq',
          { ignoreErrors: true },
        );
        for (const line of stdout.split('\n')) {
          const version = line.trim();
          if (version) {
            versions.push(version);
          }
        }
      }
    } catch {
      // Failed to get versions
    }
    return versions;
  }
}