/**
 * NVIDIA Driver Management
 *
 * Handles NVIDIA driver detection, installation, and container toolkit setup.
 */

import type { IDriverStatus } from '../interfaces/gpu.ts';
import { logger } from '../logger.ts';
import { BaseDriver, type IDriverInstallOptions } from './base-driver.ts';

/** Driver package installed on Debian/Ubuntu when the caller does not request a version. */
const DEFAULT_DEBIAN_DRIVER_PACKAGE = 'nvidia-driver-535';

/** CUDA apt repository used when the running release has no known dedicated repository. */
const FALLBACK_CUDA_DEB_REPO = 'ubuntu2204';

/** CUDA apt repositories this module knows how to map a Debian/Ubuntu release onto. */
const KNOWN_CUDA_DEB_REPOS: ReadonlySet<string> = new Set([
  'ubuntu2004',
  'ubuntu2204',
  'ubuntu2404',
  'debian11',
  'debian12',
]);

/** Distribution ids handled by the dnf-based install path. */
const RHEL_FAMILY: readonly string[] = ['fedora', 'rhel', 'centos', 'rocky', 'almalinux'];

/**
 * Extract a driver version from `nvidia-smi --query-gpu=driver_version` output.
 *
 * Only the first line is considered (one line is printed per GPU). Returns
 * `undefined` when that line is not a dotted numeric version, which covers
 * both empty output (binary missing behind a pipe) and nvidia-smi's
 * "has failed" diagnostic text.
 */
function parseDriverVersion(output: string): string | undefined {
  const firstLine = (output.split('\n')[0] ?? '').trim();
  return /^\d+(\.\d+)*$/.test(firstLine) ? firstLine : undefined;
}

/**
 * Map a Debian/Ubuntu release onto the directory name of NVIDIA's CUDA apt
 * repository (e.g. `ubuntu` + `24.04` -> `ubuntu2404`, `debian` + `12` -> `debian12`).
 * Releases without a known repository fall back to `ubuntu2204`.
 */
function cudaDebRepoSlug(id: string, version: string): string {
  const candidate = id === 'ubuntu'
    ? `ubuntu${version.replace(/\./g, '')}`
    : `${id}${version.split('.')[0] ?? ''}`;
  return KNOWN_CUDA_DEB_REPOS.has(candidate) ? candidate : FALLBACK_CUDA_DEB_REPO;
}

/**
 * NVIDIA Driver Manager
 *
 * Detects the NVIDIA driver, CUDA toolkit and container toolkit, and installs
 * them through apt (Debian/Ubuntu) or dnf (RHEL family).
 */
export class NvidiaDriver extends BaseDriver {
  public readonly vendor = 'nvidia' as const;
  public readonly displayName = 'NVIDIA';

  /**
   * Check if the NVIDIA driver is installed.
   *
   * @returns `true` only when `nvidia-smi` prints a version-looking string;
   *          any error or diagnostic output yields `false`.
   */
  public async isInstalled(): Promise<boolean> {
    try {
      const { stdout } = await this.execCommand('nvidia-smi --query-gpu=driver_version --format=csv,noheader', {
        timeout: 5000,
        ignoreErrors: true,
      });
      // Non-empty output is not enough: nvidia-smi also prints failure text on stdout.
      return parseDriverVersion(stdout) !== undefined;
    } catch {
      return false;
    }
  }

  /**
   * Get NVIDIA driver status.
   *
   * Probes, in order: the driver (via `nvidia-smi`), the CUDA toolkit (via
   * `nvcc`), the container toolkit (via `nvidia-container-cli`) and Docker's
   * configured runtimes. If the driver probe fails the remaining probes are
   * skipped.
   *
   * @returns The collected status; problems found are listed in `issues`.
   */
  public async getStatus(): Promise<IDriverStatus> {
    const status: IDriverStatus = {
      vendor: 'nvidia',
      installed: false,
      containerSupport: false,
      issues: [],
    };

    // Check if nvidia-smi is available
    try {
      const { stdout: driverOutput } = await this.execCommand(
        'nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -1',
        { timeout: 5000 },
      );
      // The pipeline's exit status is head's, so a missing nvidia-smi shows up
      // as empty output rather than as an error; validate the text instead.
      const driverVersion = parseDriverVersion(driverOutput);
      if (driverVersion === undefined) {
        status.issues.push('NVIDIA driver not installed or nvidia-smi not available');
        return status;
      }
      status.installed = true;
      status.version = driverVersion;
    } catch {
      status.issues.push('NVIDIA driver not installed or nvidia-smi not available');
      return status;
    }

    // Check CUDA toolkit
    try {
      const { stdout: cudaVersion } = await this.execCommand(
        'nvcc --version 2>/dev/null | grep "release" | sed "s/.*release \\([0-9.]*\\).*/\\1/"',
        { timeout: 5000, ignoreErrors: true },
      );
      const toolkitVersion = cudaVersion.trim();
      if (toolkitVersion) {
        status.toolkitVersion = toolkitVersion;
      }
    } catch {
      // CUDA toolkit not installed; it is optional, so this is not reported as an issue
    }

    // Check nvidia-container-toolkit
    try {
      const { stdout: containerVersion } = await this.execCommand(
        'nvidia-container-cli --version 2>&1 | head -1',
        { timeout: 5000, ignoreErrors: true },
      );
      if (containerVersion.includes('version')) {
        status.containerSupport = true;
        const match = containerVersion.match(/version (\d+\.\d+\.\d+)/);
        if (match) {
          status.containerRuntimeVersion = match[1];
        }
      } else {
        // Errors are ignored and piped through head, so a missing CLI lands
        // here (no version banner) instead of in the catch below.
        status.issues.push('NVIDIA Container Toolkit not installed');
      }
    } catch {
      status.issues.push('NVIDIA Container Toolkit not installed');
    }

    // Check if Docker has nvidia runtime
    try {
      const { stdout: dockerInfo } = await this.execCommand(
        'docker info --format "{{.Runtimes}}" 2>/dev/null',
        { timeout: 5000, ignoreErrors: true },
      );
      if (!dockerInfo.includes('nvidia')) {
        status.issues.push('Docker nvidia runtime not configured');
      }
    } catch {
      // Docker check failed
    }

    return status;
  }

  /**
   * Install NVIDIA driver and optionally CUDA toolkit.
   *
   * Requires root. Dispatches to the apt or dnf path based on the detected
   * distribution id.
   *
   * @param options Driver/toolkit versions and which optional components to install.
   * @returns `true` when everything requested was installed; `false` when not
   *          root, the distribution is unsupported, a step threw, or requested
   *          container support could not be set up.
   */
  public async install(options: IDriverInstallOptions): Promise<boolean> {
    if (!await this.isRoot()) {
      logger.error('Root privileges required to install NVIDIA drivers');
      return false;
    }

    const distro = await this.getLinuxDistro();
    logger.info(`Detected Linux distribution: ${distro.id} ${distro.version}`);

    try {
      if (distro.id === 'ubuntu' || distro.id === 'debian') {
        return await this.installOnDebian(options);
      }
      if (RHEL_FAMILY.includes(distro.id)) {
        return await this.installOnRhel(options);
      }
      logger.error(`Unsupported distribution: ${distro.id}`);
      logger.info('Please install NVIDIA drivers manually');
      return false;
    } catch (error) {
      logger.error(`Failed to install NVIDIA drivers: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Install on Debian/Ubuntu.
   *
   * @param options Install options; `driverVersion` selects `nvidia-driver-<version>`.
   * @returns `false` only when requested container support failed to install.
   */
  private async installOnDebian(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing NVIDIA drivers on Debian/Ubuntu...');

    await this.aptUpdate();

    // Install prerequisites
    await this.aptInstall(['software-properties-common', 'build-essential', 'dkms']);

    // The graphics-drivers PPA is an Ubuntu archive; adding it on Debian would
    // register an apt source for a foreign distribution.
    const distro = await this.getLinuxDistro();
    if (distro.id === 'ubuntu') {
      try {
        await this.execCommand('add-apt-repository -y ppa:graphics-drivers/ppa 2>/dev/null || true');
        await this.aptUpdate();
      } catch {
        // PPA might not be available on all systems
      }
    }

    // Install NVIDIA driver
    const driverPackage = options.driverVersion
      ? `nvidia-driver-${options.driverVersion}`
      : DEFAULT_DEBIAN_DRIVER_PACKAGE;
    await this.aptInstall(driverPackage);

    // Install CUDA toolkit if requested
    if (options.installToolkit) {
      await this.installCudaToolkit(options);
    }

    return await this.completeInstall(options);
  }

  /**
   * Install on RHEL/Fedora.
   *
   * @param options Install options; `installToolkit` adds the `cuda` package.
   * @returns `false` only when requested container support failed to install.
   */
  private async installOnRhel(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing NVIDIA drivers on RHEL/Fedora...');

    // Install prerequisites
    await this.dnfInstall(['kernel-devel', 'kernel-headers', 'gcc', 'make', 'dkms', 'acpid']);

    // Add NVIDIA CUDA repository. Fedora has its own `fedoraNN` repositories;
    // the other supported ids are RHEL rebuilds and use `rhelN`.
    const distro = await this.getLinuxDistro();
    const majorVersion = distro.version.split('.')[0] ?? distro.version;
    const repoName = `${distro.id === 'fedora' ? 'fedora' : 'rhel'}${majorVersion}`;
    const repoUrl = `https://developer.download.nvidia.com/compute/cuda/repos/${repoName}/x86_64/cuda-${repoName}.repo`;
    await this.execCommand(`dnf config-manager --add-repo ${repoUrl}`);

    // Install NVIDIA driver
    await this.dnfInstall('nvidia-driver-latest-dkms');

    // Install CUDA toolkit if requested
    if (options.installToolkit) {
      await this.dnfInstall('cuda');
    }

    return await this.completeInstall(options);
  }

  /**
   * Shared tail of both install paths: optional container support, then the
   * completion/reboot messages.
   *
   * @returns The outcome of the container toolkit install, or `true` when it
   *          was not requested.
   */
  private async completeInstall(options: IDriverInstallOptions): Promise<boolean> {
    let containerSupportOk = true;
    if (options.installContainerSupport) {
      // installContainerSupport reports failure through its return value, not by throwing.
      containerSupportOk = await this.installContainerSupport();
    }

    logger.success('NVIDIA driver installation completed');
    if (!containerSupportOk) {
      logger.warn('NVIDIA Container Toolkit could not be installed; see the errors above');
    }
    logger.warn('A system reboot is required to load the new driver');

    return containerSupportOk;
  }

  /**
   * Install CUDA toolkit.
   *
   * Only acts on Debian/Ubuntu; the dnf path installs CUDA itself.
   *
   * @param options `toolkitVersion` (e.g. `12.4`) selects `cuda-toolkit-12-4`;
   *                without it the unversioned `cuda-toolkit` package is used.
   */
  private async installCudaToolkit(options: IDriverInstallOptions): Promise<void> {
    logger.info('Installing CUDA toolkit...');

    const distro = await this.getLinuxDistro();

    if (distro.id === 'ubuntu' || distro.id === 'debian') {
      // Add CUDA repository
      const repoSlug = cudaDebRepoSlug(distro.id, distro.version);
      const cudaKeyUrl =
        `https://developer.download.nvidia.com/compute/cuda/repos/${repoSlug}/x86_64/cuda-keyring_1.1-1_all.deb`;
      await this.execCommand(`wget -q ${cudaKeyUrl} -O /tmp/cuda-keyring.deb && dpkg -i /tmp/cuda-keyring.deb`);
      await this.aptUpdate();

      // Package names use dashes for every version separator, so replace all dots.
      const cudaPackage = options.toolkitVersion
        ? `cuda-toolkit-${options.toolkitVersion.replace(/\./g, '-')}`
        : 'cuda-toolkit';
      await this.aptInstall(cudaPackage);
    }
  }

  /**
   * Install NVIDIA Container Toolkit.
   *
   * Requires root. Registers NVIDIA's stable package repository, installs
   * `nvidia-container-toolkit`, then tries to configure Docker.
   *
   * @returns `true` on success, `false` when not root or when a step threw.
   */
  public async installContainerSupport(): Promise<boolean> {
    if (!await this.isRoot()) {
      logger.error('Root privileges required to install NVIDIA Container Toolkit');
      return false;
    }

    const distro = await this.getLinuxDistro();
    logger.info('Installing NVIDIA Container Toolkit...');

    try {
      if (distro.id === 'ubuntu' || distro.id === 'debian') {
        // Add repository. --yes lets gpg overwrite a keyring left by an earlier run.
        await this.execCommand(
          'curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor --yes -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg',
        );

        // -f keeps an HTTP error page from being written into sources.list.d.
        await this.execCommand(
          'curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | ' +
            'sed "s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g" | ' +
            'tee /etc/apt/sources.list.d/nvidia-container-toolkit.list',
        );

        await this.aptUpdate();
        await this.aptInstall('nvidia-container-toolkit');
      } else {
        // RHEL/Fedora
        await this.execCommand(
          'curl -fsSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | ' +
            'tee /etc/yum.repos.d/nvidia-container-toolkit.repo',
        );
        await this.dnfInstall('nvidia-container-toolkit');
      }

      // Configure Docker runtime
      await this.configureDockerRuntime();

      logger.success('NVIDIA Container Toolkit installed successfully');
      return true;
    } catch (error) {
      logger.error(`Failed to install NVIDIA Container Toolkit: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Configure Docker to use NVIDIA runtime.
   *
   * Best effort: a failure is logged as a warning together with the manual
   * command, and is not propagated to the caller.
   */
  private async configureDockerRuntime(): Promise<void> {
    logger.info('Configuring Docker to use NVIDIA runtime...');

    try {
      // Run nvidia-ctk to configure Docker
      await this.execCommand('nvidia-ctk runtime configure --runtime=docker');

      // Restart Docker
      await this.execCommand('systemctl restart docker');

      logger.success('Docker configured to use NVIDIA runtime');
    } catch (error) {
      logger.warn(`Could not configure Docker runtime automatically: ${error instanceof Error ? error.message : String(error)}`);
      logger.info('Please run: nvidia-ctk runtime configure --runtime=docker');
    }
  }

  /**
   * Get available driver versions.
   *
   * @returns Driver branch numbers found in the apt cache, in the order the
   *          shell pipeline emits them; empty on non-Debian/Ubuntu systems or
   *          when the lookup fails.
   */
  public async getAvailableVersions(): Promise<string[]> {
    const versions: string[] = [];

    try {
      const distro = await this.getLinuxDistro();

      if (distro.id === 'ubuntu' || distro.id === 'debian') {
        const { stdout } = await this.execCommand(
          'apt-cache search nvidia-driver | grep "^nvidia-driver-[0-9]" | sed "s/nvidia-driver-\\([0-9]*\\).*/\\1/" | sort -rn | uniq',
          { ignoreErrors: true },
        );
        const found = stdout
          .split('\n')
          .map((line: string) => line.trim())
          .filter((line: string) => line.length > 0);
        versions.push(...found);
      }
    } catch {
      // Failed to get versions
    }

    return versions;
  }
}