initial
This commit is contained in:
318
ts/drivers/nvidia.ts
Normal file
318
ts/drivers/nvidia.ts
Normal file
@@ -0,0 +1,318 @@
|
||||
/**
|
||||
* NVIDIA Driver Management
|
||||
*
|
||||
* Handles NVIDIA driver detection, installation, and container toolkit setup.
|
||||
*/
|
||||
|
||||
import type { IDriverStatus } from '../interfaces/gpu.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
import { BaseDriver, type IDriverInstallOptions } from './base-driver.ts';
|
||||
|
||||
/** Distribution IDs served by the apt-based (Debian/Ubuntu) install path. */
const APT_DISTROS = new Set<string>(['ubuntu', 'debian']);

/** Distribution IDs served by the dnf-based (RHEL/Fedora) install path. */
const DNF_DISTROS = new Set<string>(['fedora', 'rhel', 'centos', 'rocky', 'almalinux']);

/** Matches the start of a dotted numeric driver version such as "535.129.03". */
const DRIVER_VERSION_PATTERN = /^\d+(?:\.\d+)*/;

/**
 * NVIDIA Driver Manager
 *
 * Detects the NVIDIA driver, CUDA toolkit and NVIDIA Container Toolkit by
 * shelling out to `nvidia-smi`, `nvcc`, `nvidia-container-cli` and `docker`,
 * and installs them through apt or dnf depending on the detected distribution.
 */
export class NvidiaDriver extends BaseDriver {
  public readonly vendor = 'nvidia' as const;
  public readonly displayName = 'NVIDIA';

  /**
   * Check if the NVIDIA driver is installed.
   *
   * @returns `true` only when `nvidia-smi` reports something that looks like a
   *          driver version; any failure resolves to `false`.
   */
  public async isInstalled(): Promise<boolean> {
    return (await this.queryDriverVersion()) !== undefined;
  }

  /**
   * Ask `nvidia-smi` for the driver version.
   *
   * The output is validated instead of merely checked for non-emptiness because
   * `nvidia-smi` prints its failure messages (e.g. "NVIDIA-SMI has failed ...")
   * on stdout, and no shell pipe is used so a missing binary cannot be masked
   * by the exit status of a downstream command.
   *
   * @returns The version reported for the first GPU, or `undefined` when the
   *          command fails or prints anything other than a version.
   */
  private async queryDriverVersion(): Promise<string | undefined> {
    try {
      const { stdout } = await this.execCommand(
        'nvidia-smi --query-gpu=driver_version --format=csv,noheader',
        { timeout: 5000, ignoreErrors: true },
      );
      // One line is printed per GPU; the driver version is the same for all of them.
      const firstLine = stdout
        .split('\n')
        .map((line: string) => line.trim())
        .find((line: string) => line.length > 0);
      if (firstLine !== undefined && DRIVER_VERSION_PATTERN.test(firstLine)) {
        return firstLine;
      }
      return undefined;
    } catch {
      return undefined;
    }
  }

  /**
   * Get NVIDIA driver status.
   *
   * Probes, in order: the driver (`nvidia-smi`), the CUDA toolkit (`nvcc`),
   * the container toolkit (`nvidia-container-cli`) and Docker's runtime list.
   * When the driver is missing the remaining probes are skipped.
   *
   * @returns A status object; human-readable problems are collected in `issues`.
   */
  public async getStatus(): Promise<IDriverStatus> {
    const status: IDriverStatus = {
      vendor: 'nvidia',
      installed: false,
      containerSupport: false,
      issues: [],
    };

    // Check if nvidia-smi is available and reports a real version
    const driverVersion = await this.queryDriverVersion();
    if (driverVersion === undefined) {
      status.issues.push('NVIDIA driver not installed or nvidia-smi not available');
      return status;
    }
    status.installed = true;
    status.version = driverVersion;

    // Check CUDA toolkit (optional, so absence is not reported as an issue)
    try {
      const { stdout: cudaVersion } = await this.execCommand(
        'nvcc --version 2>/dev/null | grep "release" | sed "s/.*release \\([0-9.]*\\).*/\\1/"',
        { timeout: 5000, ignoreErrors: true },
      );
      const toolkitVersion = cudaVersion.trim();
      if (toolkitVersion) {
        status.toolkitVersion = toolkitVersion;
      }
    } catch {
      // CUDA toolkit not installed
    }

    // Check nvidia-container-toolkit
    let containerVersion = '';
    try {
      const result = await this.execCommand(
        'nvidia-container-cli --version 2>&1 | head -1',
        { timeout: 5000, ignoreErrors: true },
      );
      containerVersion = result.stdout;
    } catch {
      // Treated as "not installed" below
    }
    if (containerVersion.includes('version')) {
      status.containerSupport = true;
      // The CLI prints "cli-version: X.Y.Z"; also accept a plain "version X.Y.Z".
      const match = containerVersion.match(/version:?\s*(\d+\.\d+\.\d+)/);
      if (match) {
        status.containerRuntimeVersion = match[1];
      }
    } else {
      // Reported here rather than only in the catch: with ignoreErrors set and a
      // trailing pipe, a missing binary shows up as unrecognised output.
      status.issues.push('NVIDIA Container Toolkit not installed');
    }

    // Check if Docker has nvidia runtime
    try {
      const { stdout: dockerInfo } = await this.execCommand(
        'docker info --format "{{.Runtimes}}" 2>/dev/null',
        { timeout: 5000, ignoreErrors: true },
      );
      // Empty output means Docker itself could not be queried; only flag a
      // missing runtime when Docker actually listed its runtimes.
      if (dockerInfo.trim() && !dockerInfo.includes('nvidia')) {
        status.issues.push('Docker nvidia runtime not configured');
      }
    } catch {
      // Docker check failed
    }

    return status;
  }

  /**
   * Install NVIDIA driver and optionally CUDA toolkit.
   *
   * @param options Driver/toolkit versions and which optional parts to install.
   * @returns `true` on success; `false` when not root, when the distribution is
   *          unsupported, or when any installation step throws.
   */
  public async install(options: IDriverInstallOptions): Promise<boolean> {
    if (!(await this.isRoot())) {
      logger.error('Root privileges required to install NVIDIA drivers');
      return false;
    }

    const distro = await this.getLinuxDistro();
    logger.info(`Detected Linux distribution: ${distro.id} ${distro.version}`);

    try {
      if (APT_DISTROS.has(distro.id)) {
        return await this.installOnDebian(options);
      }
      if (DNF_DISTROS.has(distro.id)) {
        return await this.installOnRhel(options);
      }

      logger.error(`Unsupported distribution: ${distro.id}`);
      logger.info('Please install NVIDIA drivers manually');
      return false;
    } catch (error) {
      logger.error(`Failed to install NVIDIA drivers: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Install on Debian/Ubuntu.
   *
   * Errors from the package helpers propagate to the caller (`install`).
   *
   * @param options Driver/toolkit versions and which optional parts to install.
   * @returns `true` once every requested step has run.
   */
  private async installOnDebian(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing NVIDIA drivers on Debian/Ubuntu...');

    // Refresh the package index before installing anything
    await this.aptUpdate();

    // Install prerequisites
    await this.aptInstall(['software-properties-common', 'build-essential', 'dkms']);

    // Add NVIDIA PPA (for Ubuntu); best effort, the command itself never fails
    try {
      await this.execCommand('add-apt-repository -y ppa:graphics-drivers/ppa 2>/dev/null || true');
      await this.aptUpdate();
    } catch {
      // PPA might not be available on all systems
    }

    // Install NVIDIA driver, defaulting to a stable version
    const driverPackage = options.driverVersion
      ? `nvidia-driver-${options.driverVersion}`
      : 'nvidia-driver-535';
    await this.aptInstall(driverPackage);

    // Install CUDA toolkit if requested
    if (options.installToolkit) {
      await this.installCudaToolkit(options);
    }

    // Install container support if requested
    if (options.installContainerSupport) {
      await this.installContainerSupport();
    }

    logger.success('NVIDIA driver installation completed');
    logger.warn('A system reboot is required to load the new driver');
    return true;
  }

  /**
   * Install on RHEL/Fedora.
   *
   * Errors from the package helpers propagate to the caller (`install`).
   *
   * @param options Which optional parts (toolkit, container support) to install.
   * @returns `true` once every requested step has run.
   */
  private async installOnRhel(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing NVIDIA drivers on RHEL/Fedora...');

    // Install prerequisites
    await this.dnfInstall(['kernel-devel', 'kernel-headers', 'gcc', 'make', 'dkms', 'acpid']);

    // Add NVIDIA CUDA repository. Fedora has its own repo tree ("fedoraNN");
    // RHEL and its rebuilds (CentOS, Rocky, AlmaLinux) share the "rhelN" tree.
    const distro = await this.getLinuxDistro();
    const majorVersion = distro.version.split('.')[0];
    const repoSlug = distro.id === 'fedora' ? `fedora${majorVersion}` : `rhel${majorVersion}`;
    const repoUrl = `https://developer.download.nvidia.com/compute/cuda/repos/${repoSlug}/x86_64/cuda-${repoSlug}.repo`;

    await this.execCommand(`dnf config-manager --add-repo ${repoUrl}`);

    // Install NVIDIA driver
    await this.dnfInstall('nvidia-driver-latest-dkms');

    // Install CUDA toolkit if requested
    if (options.installToolkit) {
      await this.dnfInstall('cuda');
    }

    // Install container support if requested
    if (options.installContainerSupport) {
      await this.installContainerSupport();
    }

    logger.success('NVIDIA driver installation completed');
    logger.warn('A system reboot is required to load the new driver');
    return true;
  }

  /**
   * Install CUDA toolkit.
   *
   * Only Debian/Ubuntu is handled here; on any other distribution this is a
   * no-op (the dnf path installs the `cuda` package directly).
   *
   * @param options `toolkitVersion` selects a versioned package; only its
   *                major and minor components are used.
   */
  private async installCudaToolkit(options: IDriverInstallOptions): Promise<void> {
    logger.info('Installing CUDA toolkit...');

    const distro = await this.getLinuxDistro();
    if (!APT_DISTROS.has(distro.id)) {
      return;
    }

    // Add CUDA repository matching the running release: NVIDIA names the trees
    // "ubuntuYYMM" (dots removed) and "debianN" (major version only).
    // Assumes distro.version is the os-release VERSION_ID form, e.g. "22.04" or "12".
    const repoSlug = distro.id === 'ubuntu'
      ? `ubuntu${distro.version.replace(/\./g, '')}`
      : `debian${distro.version.split('.')[0]}`;
    const cudaKeyUrl = `https://developer.download.nvidia.com/compute/cuda/repos/${repoSlug}/x86_64/cuda-keyring_1.1-1_all.deb`;
    await this.execCommand(`wget -q ${cudaKeyUrl} -O /tmp/cuda-keyring.deb && dpkg -i /tmp/cuda-keyring.deb`);
    await this.aptUpdate();

    // Versioned packages are named cuda-toolkit-<major>-<minor>, so "12.4" and
    // "12.4.1" both map to cuda-toolkit-12-4.
    const cudaPackage = options.toolkitVersion
      ? `cuda-toolkit-${options.toolkitVersion.split('.').slice(0, 2).join('-')}`
      : 'cuda-toolkit';

    await this.aptInstall(cudaPackage);
  }

  /**
   * Install NVIDIA Container Toolkit.
   *
   * Registers NVIDIA's package repository, installs `nvidia-container-toolkit`
   * and then tries to configure Docker. Distributions outside the apt family
   * take the dnf path.
   *
   * @returns `true` on success; `false` when not root or when a step throws.
   */
  public async installContainerSupport(): Promise<boolean> {
    if (!(await this.isRoot())) {
      logger.error('Root privileges required to install NVIDIA Container Toolkit');
      return false;
    }

    const distro = await this.getLinuxDistro();
    logger.info('Installing NVIDIA Container Toolkit...');

    try {
      if (APT_DISTROS.has(distro.id)) {
        // Add repository key; --batch --yes lets a re-run overwrite an existing keyring
        await this.execCommand(
          'curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --batch --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg',
        );

        // Use the distribution-independent "stable" list; -f keeps an HTTP error
        // page from being written into the apt sources file.
        await this.execCommand(
          'curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | ' +
            'sed "s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g" | ' +
            'tee /etc/apt/sources.list.d/nvidia-container-toolkit.list',
        );

        await this.aptUpdate();
        await this.aptInstall('nvidia-container-toolkit');
      } else {
        // RHEL/Fedora
        await this.execCommand(
          'curl -fsSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | ' +
            'tee /etc/yum.repos.d/nvidia-container-toolkit.repo',
        );
        await this.dnfInstall('nvidia-container-toolkit');
      }

      // Configure Docker runtime (logs a warning instead of throwing on failure)
      await this.configureDockerRuntime();

      logger.success('NVIDIA Container Toolkit installed successfully');
      return true;
    } catch (error) {
      logger.error(`Failed to install NVIDIA Container Toolkit: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Configure Docker to use NVIDIA runtime.
   *
   * Best effort: a failure is logged together with the manual command and is
   * not propagated to the caller.
   */
  private async configureDockerRuntime(): Promise<void> {
    logger.info('Configuring Docker to use NVIDIA runtime...');

    try {
      // Run nvidia-ctk to configure Docker
      await this.execCommand('nvidia-ctk runtime configure --runtime=docker');

      // Restart Docker so the new runtime configuration is picked up
      await this.execCommand('systemctl restart docker');

      logger.success('Docker configured to use NVIDIA runtime');
    } catch (error) {
      logger.warn(`Could not configure Docker runtime automatically: ${error instanceof Error ? error.message : String(error)}`);
      logger.info('Please run: nvidia-ctk runtime configure --runtime=docker');
    }
  }

  /**
   * Get available driver versions.
   *
   * Only implemented for Debian/Ubuntu, where the apt cache is searched for
   * `nvidia-driver-<N>` packages.
   *
   * @returns Driver major versions, newest first; empty on other distributions
   *          or when the lookup fails.
   */
  public async getAvailableVersions(): Promise<string[]> {
    const versions: string[] = [];

    try {
      const distro = await this.getLinuxDistro();

      if (APT_DISTROS.has(distro.id)) {
        const { stdout } = await this.execCommand(
          'apt-cache search nvidia-driver | grep "^nvidia-driver-[0-9]" | sed "s/nvidia-driver-\\([0-9]*\\).*/\\1/" | sort -rn | uniq',
          { ignoreErrors: true },
        );
        versions.push(...stdout.trim().split('\n').filter((v: string) => v.trim()));
      }
    } catch {
      // Failed to get versions
    }

    return versions;
  }
}
|
||||
Reference in New Issue
Block a user