initial
This commit is contained in:
281
ts/drivers/amd.ts
Normal file
281
ts/drivers/amd.ts
Normal file
@@ -0,0 +1,281 @@
|
||||
/**
|
||||
* AMD Driver Management
|
||||
*
|
||||
* Handles AMD ROCm driver detection, installation, and container setup.
|
||||
*/
|
||||
|
||||
import type { IDriverStatus } from '../interfaces/gpu.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
import { BaseDriver, type IDriverInstallOptions } from './base-driver.ts';
|
||||
|
||||
/**
 * AMD ROCm Driver Manager
 *
 * Detects the ROCm driver and toolkit via `rocm-smi` / `/opt/rocm`, installs the
 * AMDGPU kernel driver and ROCm packages from repo.radeon.com on Ubuntu and
 * RHEL-compatible distributions, and prepares the GPU device nodes for use
 * inside containers.
 */
export class AmdDriver extends BaseDriver {
  public readonly vendor = 'amd' as const;
  public readonly displayName = 'AMD ROCm';

  /** ROCm release used when the caller does not request a toolkit version. */
  private static readonly DEFAULT_ROCM_VERSION = '6.0';

  /** Ubuntu VERSION_ID -> apt suite (release codename) used in the repository lines. */
  private static readonly UBUNTU_CODENAMES: ReadonlyMap<string, string> = new Map([
    ['20.04', 'focal'],
    ['22.04', 'jammy'],
    ['24.04', 'noble'],
  ]);

  /** os-release IDs that are installed through the RHEL (dnf) code path. */
  private static readonly RHEL_FAMILY: ReadonlySet<string> = new Set([
    'rhel',
    'centos',
    'rocky',
    'almalinux',
  ]);

  /**
   * Check if AMD ROCm driver is installed
   *
   * @returns true when `rocm-smi --showdriverversion` output mentions "Driver"
   */
  public async isInstalled(): Promise<boolean> {
    try {
      // Scan the whole output (not just the first line) so this agrees with getStatus().
      const driverInfo = await this.readDriverInfo();
      return driverInfo.includes('Driver');
    } catch {
      return false;
    }
  }

  /**
   * Get AMD ROCm driver status
   *
   * Collects the driver version, the ROCm toolkit version and whether the
   * device nodes needed for container passthrough are present. Every problem
   * found is appended to `issues`; the method itself does not throw for
   * missing tools because all probes run with `ignoreErrors`.
   */
  public async getStatus(): Promise<IDriverStatus> {
    const status: IDriverStatus = {
      vendor: 'amd',
      installed: false,
      containerSupport: false,
      issues: [],
    };

    // Driver: rocm-smi prints a "Driver version: X" line when the driver is loaded
    const driverInfo = await this.readDriverInfo();
    if (driverInfo.includes('Driver')) {
      status.installed = true;
      const match = driverInfo.match(/Driver version:\s*(\S+)/i);
      if (match) {
        status.version = match[1];
      }
    } else {
      // Empty output means rocm-smi is missing or failed; surface that to the caller.
      status.issues.push('ROCm driver not installed or rocm-smi not available');
    }

    // Check ROCm toolkit version
    try {
      const { stdout: rocmVersion } = await this.execCommand(
        'cat /opt/rocm/.info/version 2>/dev/null || rocminfo 2>/dev/null | grep "ROCm" | head -1',
        { timeout: 5000, ignoreErrors: true },
      );
      const match = rocmVersion.match(/(\d+\.\d+(?:\.\d+)?)/);
      if (match) {
        status.toolkitVersion = match[1];
      }
    } catch {
      // ROCm toolkit version not available
    }

    // Check Docker ROCm support
    try {
      const { stdout: dockerInfo } = await this.execCommand(
        'docker info 2>/dev/null | grep -i "rocm\\|amd"',
        { timeout: 5000, ignoreErrors: true },
      );

      // `|| true` keeps the listing of the nodes that do exist: without it a
      // non-zero exit from ls makes execCommand replace stdout with ''.
      const { stdout: deviceCheck } = await this.execCommand(
        'ls /dev/kfd /dev/dri/render* 2>/dev/null || true',
        { timeout: 5000, ignoreErrors: true },
      );

      // grep above is case-insensitive, so compare case-insensitively here too.
      if (deviceCheck.includes('/dev/kfd') || dockerInfo.toLowerCase().includes('rocm')) {
        status.containerSupport = true;
      } else {
        status.issues.push('ROCm device files not available for container access');
      }
    } catch {
      status.issues.push('Could not verify Docker ROCm support');
    }

    return status;
  }

  /**
   * Install AMD ROCm driver
   *
   * Requires root. Dispatches to the Ubuntu or RHEL-family installer based on
   * the detected distribution.
   *
   * @param options Installation options (toolkit, container support, versions)
   * @returns true on success; false when not root, the distribution is
   *          unsupported, or any installation step fails (the error is logged)
   */
  public async install(options: IDriverInstallOptions): Promise<boolean> {
    if (!await this.isRoot()) {
      logger.error('Root privileges required to install AMD ROCm drivers');
      return false;
    }

    const distro = await this.getLinuxDistro();
    logger.info(`Detected Linux distribution: ${distro.id} ${distro.version}`);

    try {
      if (distro.id === 'ubuntu') {
        return await this.installOnUbuntu(options);
      }
      if (AmdDriver.RHEL_FAMILY.has(distro.id)) {
        return await this.installOnRhel(options);
      }

      logger.error(`Unsupported distribution: ${distro.id}`);
      logger.info('Please install ROCm drivers manually from https://rocm.docs.amd.com/');
      return false;
    } catch (error) {
      logger.error(`Failed to install AMD ROCm drivers: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Install on Ubuntu
   *
   * Registers the ROCm and AMDGPU apt repositories (signed with a dedicated
   * keyring instead of the deprecated `apt-key`), then installs the kernel
   * driver and either the HIP SDK or only the SMI library.
   *
   * @throws Error when the requested ROCm version is malformed or the Ubuntu
   *         release has no known codename; install() turns this into `false`.
   */
  private async installOnUbuntu(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing AMD ROCm on Ubuntu...');

    // Validate everything that ends up in a shell command before touching the system.
    const rocmVersion = this.resolveRocmVersion(options);
    const { version: ubuntuVersion } = await this.getLinuxDistro();
    const codename = AmdDriver.UBUNTU_CODENAMES.get(ubuntuVersion);
    if (!codename) {
      throw new Error(`Unsupported Ubuntu release for ROCm: ${ubuntuVersion || 'unknown'}`);
    }

    // Install prerequisites
    await this.aptUpdate();
    await this.aptInstall(['wget', 'gnupg2']);

    // Import the repository signing key into its own keyring.
    // --batch --yes lets a re-run overwrite an existing keyring without prompting.
    const keyring = '/etc/apt/keyrings/rocm.gpg';
    await this.execCommand('mkdir -p -m 0755 /etc/apt/keyrings');
    await this.execCommand(
      `wget -q https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --batch --yes --dearmor -o ${keyring}`,
    );

    // Add ROCm repository (suite is the Ubuntu codename)
    const repoOptions = `arch=amd64 signed-by=${keyring}`;
    await this.execCommand(
      `echo "deb [${repoOptions}] https://repo.radeon.com/rocm/apt/${rocmVersion} ${codename} main" > /etc/apt/sources.list.d/rocm.list`,
    );

    // Add AMDGPU repository
    await this.execCommand(
      `echo "deb [${repoOptions}] https://repo.radeon.com/amdgpu/${rocmVersion}/ubuntu ${codename} main" > /etc/apt/sources.list.d/amdgpu.list`,
    );

    await this.aptUpdate();

    // Install AMDGPU driver and ROCm
    await this.aptInstall('amdgpu-dkms');
    await this.aptInstall(options.installToolkit ? 'rocm-hip-sdk' : 'rocm-smi-lib');

    await this.finishInstall(options);
    logger.info('After reboot, verify with: rocm-smi');
    return true;
  }

  /**
   * Install on RHEL
   *
   * Writes yum repository definitions for ROCm and AMDGPU, then installs the
   * kernel driver and either the HIP SDK or only the SMI library via dnf.
   *
   * @throws Error when the requested ROCm version is malformed.
   */
  private async installOnRhel(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing AMD ROCm on RHEL/CentOS...');

    const rocmVersion = this.resolveRocmVersion(options);
    const distro = await this.getLinuxDistro();
    const rhelVersion = distro.version.split('.')[0];

    // Add EPEL repository
    await this.dnfInstall('epel-release');

    // Add ROCm repository
    await this.writeRepoFile('/etc/yum.repos.d/rocm.repo', [
      '[ROCm]',
      'name=ROCm',
      `baseurl=https://repo.radeon.com/rocm/yum/${rocmVersion}/main`,
      'enabled=1',
      'gpgcheck=1',
      'gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key',
    ]);

    // Add AMDGPU repository
    await this.writeRepoFile('/etc/yum.repos.d/amdgpu.repo', [
      '[amdgpu]',
      'name=amdgpu',
      `baseurl=https://repo.radeon.com/amdgpu/${rocmVersion}/rhel/${rhelVersion}/main/x86_64/`,
      'enabled=1',
      'gpgcheck=1',
      'gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key',
    ]);

    // Install AMDGPU driver
    await this.dnfInstall('amdgpu-dkms');
    await this.dnfInstall(options.installToolkit ? 'rocm-hip-sdk' : 'rocm-smi-lib');

    await this.finishInstall(options);
    return true;
  }

  /**
   * Install container support for AMD GPUs
   *
   * ROCm containers only need the device nodes passed through, so this checks
   * that `/dev/kfd` exists, relaxes the node permissions and logs the Docker
   * flags to use.
   *
   * @returns false when `/dev/kfd` is missing or a command fails
   */
  public async installContainerSupport(): Promise<boolean> {
    logger.info('Configuring Docker for AMD ROCm...');

    try {
      // Verify device files exist
      const { stdout: devices } = await this.execCommand('ls -la /dev/kfd /dev/dri/render* 2>/dev/null || true');

      if (!devices.includes('/dev/kfd')) {
        logger.warn('/dev/kfd not found. ROCm driver may not be properly loaded.');
        logger.info('Try rebooting the system after driver installation.');
        return false;
      }

      // Set permissions
      await this.execCommand('chmod 666 /dev/kfd /dev/dri/render* || true');

      logger.success('AMD ROCm container support configured');
      logger.info('Use the following Docker flags for ROCm containers:');
      logger.info('  --device=/dev/kfd --device=/dev/dri --group-add video');
      return true;
    } catch (error) {
      logger.error(`Failed to configure ROCm container support: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Get available ROCm versions
   *
   * @returns a fixed list of ROCm release series, newest first
   */
  public async getAvailableVersions(): Promise<string[]> {
    return ['6.0', '5.7', '5.6', '5.5', '5.4'];
  }

  /**
   * Run `rocm-smi --showdriverversion` and return its complete stdout.
   * Returns '' when the tool is missing or fails (errors are ignored).
   */
  private async readDriverInfo(): Promise<string> {
    const { stdout } = await this.execCommand(
      'rocm-smi --showdriverversion 2>/dev/null',
      { timeout: 5000, ignoreErrors: true },
    );
    return stdout;
  }

  /**
   * Pick the ROCm version to install and make sure it is safe to embed in a
   * shell command and a repository URL.
   *
   * @throws Error when the version contains anything but letters, digits, '.', '_' or '-'
   */
  private resolveRocmVersion(options: IDriverInstallOptions): string {
    const version = options.toolkitVersion || AmdDriver.DEFAULT_ROCM_VERSION;
    if (!/^[A-Za-z0-9][A-Za-z0-9._-]*$/.test(version)) {
      throw new Error(`Invalid ROCm version: ${version}`);
    }
    return version;
  }

  /**
   * Write a yum repository file through a shell heredoc.
   * The quoted delimiter stops the shell from expanding anything in the content,
   * and building the text here keeps it independent of source indentation.
   */
  private async writeRepoFile(path: string, lines: readonly string[]): Promise<void> {
    await this.execCommand(`cat <<'EOF' > ${path}\n${lines.join('\n')}\nEOF`);
  }

  /**
   * Steps shared by every distribution after the packages are installed:
   * group membership, optional container setup and the closing log messages.
   */
  private async finishInstall(options: IDriverInstallOptions): Promise<void> {
    // Add user to video and render groups (best effort: SUDO_USER may be unset)
    await this.execCommand('usermod -a -G video,render $SUDO_USER || true');

    // Install container support if requested
    if (options.installContainerSupport) {
      await this.installContainerSupport();
    }

    logger.success('AMD ROCm installation completed');
    logger.warn('A system reboot is required to load the new driver');
  }
}
|
||||
217
ts/drivers/base-driver.ts
Normal file
217
ts/drivers/base-driver.ts
Normal file
@@ -0,0 +1,217 @@
|
||||
/**
|
||||
* Base Driver Class
|
||||
*
|
||||
* Abstract base class for GPU driver management.
|
||||
*/
|
||||
|
||||
import { exec } from 'node:child_process';
|
||||
import { promisify } from 'node:util';
|
||||
import type { IDriverStatus, TGpuVendor } from '../interfaces/gpu.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
|
||||
// Promise-based form of child_process.exec; the resolved value exposes `stdout` and `stderr`.
const execAsync = promisify(exec);
|
||||
|
||||
/**
 * Settings that control a driver installation run.
 *
 * The three boolean flags are mandatory; the version pins are optional.
 */
export interface IDriverInstallOptions {
  /** Install the vendor compute toolkit (CUDA, ROCm, oneAPI) alongside the driver. */
  installToolkit: boolean;

  /** Also set up GPU access for containers (nvidia-docker and equivalents). */
  installContainerSupport: boolean;

  /** Driver version to install, when a specific one is wanted. */
  driverVersion?: string;

  /** Toolkit version to install, when a specific one is wanted. */
  toolkitVersion?: string;

  /** Run without interactive prompts. */
  nonInteractive: boolean;
}
|
||||
|
||||
/**
 * Abstract base class for GPU drivers
 *
 * Declares the per-vendor contract (detection, status, installation, container
 * support) and provides the shared shell/package-manager helpers that the
 * vendor implementations build on.
 */
export abstract class BaseDriver {
  /**
   * Upper bound for captured stdout/stderr of a command. exec() terminates the
   * child when its default 1 MiB buffer overflows, which verbose package
   * installations can exceed.
   */
  private static readonly MAX_OUTPUT_BYTES = 64 * 1024 * 1024;

  /** GPU vendor this driver supports */
  public abstract readonly vendor: TGpuVendor;

  /** Display name for this driver */
  public abstract readonly displayName: string;

  /**
   * Check if the driver is installed
   */
  public abstract isInstalled(): Promise<boolean>;

  /**
   * Get the current driver status
   */
  public abstract getStatus(): Promise<IDriverStatus>;

  /**
   * Install the driver
   */
  public abstract install(options: IDriverInstallOptions): Promise<boolean>;

  /**
   * Install container runtime support (e.g., nvidia-docker)
   */
  public abstract installContainerSupport(): Promise<boolean>;

  /**
   * Get available driver versions
   */
  public abstract getAvailableVersions(): Promise<string[]>;

  /**
   * Execute a shell command with error handling
   *
   * @param command Command line passed to the shell
   * @param options `timeout` in milliseconds (default 30000); with
   *                `ignoreErrors` a failure resolves to an empty `stdout` and
   *                the stringified error in `stderr` instead of rejecting
   * @returns captured stdout and stderr
   * @throws the underlying exec error when the command fails and `ignoreErrors` is false
   */
  protected async execCommand(
    command: string,
    options: { timeout?: number; ignoreErrors?: boolean } = {},
  ): Promise<{ stdout: string; stderr: string }> {
    const { timeout = 30000, ignoreErrors = false } = options;

    try {
      const result = await execAsync(command, { timeout, maxBuffer: BaseDriver.MAX_OUTPUT_BYTES });
      return { stdout: result.stdout, stderr: result.stderr };
    } catch (error) {
      if (ignoreErrors) {
        return { stdout: '', stderr: String(error) };
      }
      throw error;
    }
  }

  /**
   * Check if running as root
   *
   * @returns true when `id -u` prints 0; false otherwise or when the command fails
   */
  protected async isRoot(): Promise<boolean> {
    try {
      const { stdout } = await this.execCommand('id -u');
      return stdout.trim() === '0';
    } catch {
      return false;
    }
  }

  /**
   * Get the Linux distribution
   *
   * Parses `ID` and `VERSION_ID` from /etc/os-release.
   *
   * @returns lower-cased distribution id ('unknown' when absent) and the
   *          version string ('' when absent)
   */
  protected async getLinuxDistro(): Promise<{ id: string; version: string }> {
    try {
      const { stdout } = await this.execCommand('cat /etc/os-release', { ignoreErrors: true });

      // os-release IDs may contain '.', '_' and '-' (e.g. "opensuse-leap").
      const idMatch = stdout.match(/^ID=["']?([\w.-]+)["']?$/m);
      const versionMatch = stdout.match(/^VERSION_ID=["']?([\d.]+)["']?$/m);

      return {
        id: idMatch ? idMatch[1].toLowerCase() : 'unknown',
        version: versionMatch ? versionMatch[1] : '',
      };
    } catch {
      return { id: 'unknown', version: '' };
    }
  }

  /**
   * Check if a package is installed (apt-based)
   *
   * @returns true when `dpkg -l` lists the package in state "ii"
   */
  protected async isAptPackageInstalled(packageName: string): Promise<boolean> {
    try {
      const { stdout } = await this.execCommand(`dpkg -l ${packageName} 2>/dev/null | grep "^ii"`, {
        ignoreErrors: true,
      });
      return stdout.includes(packageName);
    } catch {
      return false;
    }
  }

  /**
   * Check if a package is installed (dnf/yum-based)
   *
   * Relies on the exit status of `rpm -q`, which is non-zero for a package
   * that is not installed. Inspecting stdout with `ignoreErrors` does not work
   * here, because execCommand discards stdout whenever the command fails.
   */
  protected async isDnfPackageInstalled(packageName: string): Promise<boolean> {
    try {
      await this.execCommand(`rpm -q ${packageName} 2>/dev/null`);
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Run apt-get update
   */
  protected async aptUpdate(): Promise<void> {
    logger.info('Updating package lists...');
    await this.execCommand('apt-get update', { timeout: 120000 });
  }

  /**
   * Install a package using apt
   *
   * @param packages One package name or a list of names
   */
  protected async aptInstall(packages: string | string[]): Promise<void> {
    const pkgList = Array.isArray(packages) ? packages.join(' ') : packages;
    logger.info(`Installing packages: ${pkgList}`);
    await this.execCommand(`DEBIAN_FRONTEND=noninteractive apt-get install -y ${pkgList}`, {
      timeout: 600000, // 10 minutes for large packages
    });
  }

  /**
   * Install a package using dnf
   *
   * @param packages One package name or a list of names
   */
  protected async dnfInstall(packages: string | string[]): Promise<void> {
    const pkgList = Array.isArray(packages) ? packages.join(' ') : packages;
    logger.info(`Installing packages: ${pkgList}`);
    await this.execCommand(`dnf install -y ${pkgList}`, {
      timeout: 600000,
    });
  }

  /**
   * Add an apt repository
   *
   * @param repo Repository specification handed to add-apt-repository
   * @param keyUrl Optional URL of the signing key; it is de-armored into
   *               /usr/share/keyrings/<basename>.gpg
   */
  protected async addAptRepository(repo: string, keyUrl?: string): Promise<void> {
    if (keyUrl) {
      // Add GPG key. --batch --yes overwrites an existing keyring instead of
      // failing on gpg's interactive overwrite prompt when run a second time.
      await this.execCommand(
        `curl -fsSL "${keyUrl}" | gpg --batch --yes --dearmor -o "/usr/share/keyrings/$(basename "${keyUrl}").gpg"`,
      );
    }
    await this.execCommand(`add-apt-repository -y "${repo}"`);
  }

  /**
   * Log driver status summary
   *
   * Prints a boxed report of getStatus(): installation state, versions,
   * container support and any issues.
   */
  public async logStatus(): Promise<void> {
    const status = await this.getStatus();

    logger.logBoxTitle(`${this.displayName} Driver Status`, 60, status.installed ? 'success' : 'warning');
    logger.logBoxLine(`Installed: ${status.installed ? 'Yes' : 'No'}`);

    if (status.installed) {
      if (status.version) {
        logger.logBoxLine(`Driver Version: ${status.version}`);
      }
      if (status.toolkitVersion) {
        logger.logBoxLine(`Toolkit Version: ${status.toolkitVersion}`);
      }
      logger.logBoxLine(`Container Support: ${status.containerSupport ? 'Yes' : 'No'}`);
      if (status.containerRuntimeVersion) {
        logger.logBoxLine(`Container Runtime: ${status.containerRuntimeVersion}`);
      }
    }

    if (status.issues.length > 0) {
      logger.logBoxLine('');
      logger.logBoxLine('Issues:');
      for (const issue of status.issues) {
        logger.logBoxLine(`  - ${issue}`);
      }
    }

    logger.logBoxEnd();
  }
}
|
||||
267
ts/drivers/driver-manager.ts
Normal file
267
ts/drivers/driver-manager.ts
Normal file
@@ -0,0 +1,267 @@
|
||||
/**
|
||||
* Driver Manager
|
||||
*
|
||||
* Coordinates detection and installation of GPU drivers across all vendors.
|
||||
*/
|
||||
|
||||
import type { IDriverStatus, TGpuVendor } from '../interfaces/gpu.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
import { GpuDetector } from '../hardware/gpu-detector.ts';
|
||||
import { BaseDriver, type IDriverInstallOptions } from './base-driver.ts';
|
||||
import { NvidiaDriver } from './nvidia.ts';
|
||||
import { AmdDriver } from './amd.ts';
|
||||
import { IntelDriver } from './intel.ts';
|
||||
|
||||
/**
 * Driver Manager - coordinates GPU driver management
 *
 * Detects the GPUs present, maps each vendor to its driver implementation and
 * offers aggregate operations (status, install, container setup, Docker flags).
 */
export class DriverManager {
  private gpuDetector: GpuDetector;
  private drivers: Map<TGpuVendor, BaseDriver>;

  constructor() {
    this.gpuDetector = new GpuDetector();
    this.drivers = new Map<TGpuVendor, BaseDriver>([
      ['nvidia', new NvidiaDriver()],
      ['amd', new AmdDriver()],
      ['intel', new IntelDriver()],
    ]);
  }

  /**
   * Get driver manager for a specific vendor
   *
   * @returns the registered driver, or undefined when none exists for the vendor
   */
  public getDriver(vendor: TGpuVendor): BaseDriver | undefined {
    return this.drivers.get(vendor);
  }

  /**
   * Get status of all GPU drivers
   *
   * Only vendors of detected GPUs are queried; GPUs with an unknown vendor are skipped.
   */
  public async getAllDriverStatus(): Promise<Map<TGpuVendor, IDriverStatus>> {
    const statuses = new Map<TGpuVendor, IDriverStatus>();

    for (const vendor of await this.detectSupportedVendors()) {
      const driver = this.drivers.get(vendor);
      if (driver) {
        statuses.set(vendor, await driver.getStatus());
      }
    }

    return statuses;
  }

  /**
   * Check drivers for all detected GPUs
   *
   * @returns whether every detected vendor has its driver installed and its
   *          container support ready, plus a flat list of issues
   */
  public async checkAllDrivers(): Promise<{
    allInstalled: boolean;
    allContainerReady: boolean;
    issues: string[];
  }> {
    const gpus = await this.gpuDetector.detectGpus();
    const issues: string[] = [];
    let allInstalled = true;
    let allContainerReady = true;

    if (gpus.length === 0) {
      issues.push('No GPUs detected');
      return { allInstalled: false, allContainerReady: false, issues };
    }

    // Group GPUs by vendor
    const vendorCounts = new Map<TGpuVendor, number>();
    for (const gpu of gpus) {
      vendorCounts.set(gpu.vendor, (vendorCounts.get(gpu.vendor) ?? 0) + 1);
    }

    // Check each vendor
    for (const [vendor, count] of vendorCounts) {
      if (vendor === 'unknown') {
        issues.push(`${count} GPU(s) with unknown vendor - cannot manage drivers`);
        continue;
      }

      const driver = this.drivers.get(vendor);
      if (!driver) {
        issues.push(`No driver manager for ${vendor}`);
        continue;
      }

      const status = await driver.getStatus();

      if (!status.installed) {
        allInstalled = false;
        issues.push(`${driver.displayName} driver not installed for ${count} GPU(s)`);
      }

      if (!status.containerSupport) {
        allContainerReady = false;
        issues.push(`${driver.displayName} container support not configured`);
      }

      // Add specific issues
      issues.push(...status.issues);
    }

    return { allInstalled, allContainerReady, issues };
  }

  /**
   * Install drivers for all detected GPUs
   *
   * @param options Partial options; toolkit and container support default to
   *                true, nonInteractive defaults to false
   * @returns true only when every vendor installation succeeded; false when
   *          no supported GPU was detected
   */
  public async installAllDrivers(options: Partial<IDriverInstallOptions> = {}): Promise<boolean> {
    const fullOptions: IDriverInstallOptions = {
      installToolkit: options.installToolkit ?? true,
      installContainerSupport: options.installContainerSupport ?? true,
      nonInteractive: options.nonInteractive ?? false,
      driverVersion: options.driverVersion,
      toolkitVersion: options.toolkitVersion,
    };

    const vendors = await this.detectSupportedVendors();

    if (vendors.size === 0) {
      logger.error('No supported GPUs detected');
      return false;
    }

    let allSuccess = true;

    for (const vendor of vendors) {
      const driver = this.drivers.get(vendor);
      if (!driver) continue;

      logger.info(`Installing ${driver.displayName} drivers...`);

      const success = await driver.install(fullOptions);
      if (!success) {
        allSuccess = false;
        logger.error(`Failed to install ${driver.displayName} drivers`);
      }
    }

    return allSuccess;
  }

  /**
   * Install container support for all GPUs
   *
   * @returns true when every vendor's container setup succeeded (also true
   *          when no supported GPU was detected)
   */
  public async installContainerSupport(): Promise<boolean> {
    let allSuccess = true;

    for (const vendor of await this.detectSupportedVendors()) {
      const driver = this.drivers.get(vendor);
      if (!driver) continue;

      const success = await driver.installContainerSupport();
      if (!success) {
        allSuccess = false;
      }
    }

    return allSuccess;
  }

  /**
   * Print driver status summary
   *
   * Logs one status box per detected vendor and a warning box for GPUs whose
   * vendor is unknown.
   */
  public async printDriverStatus(): Promise<void> {
    const gpus = await this.gpuDetector.detectGpus();

    if (gpus.length === 0) {
      logger.logBox('Driver Status', ['No GPUs detected'], 50, 'warning');
      return;
    }

    // Group by vendor
    const vendorGpus = new Map<TGpuVendor, typeof gpus>();
    for (const gpu of gpus) {
      const group = vendorGpus.get(gpu.vendor);
      if (group) {
        group.push(gpu);
      } else {
        vendorGpus.set(gpu.vendor, [gpu]);
      }
    }

    // Print status for each vendor
    for (const [vendor, gpuList] of vendorGpus) {
      if (vendor === 'unknown') {
        logger.logBox('Unknown GPUs', [
          `${gpuList.length} GPU(s) with unknown vendor`,
          'Manual driver installation may be required',
        ], 50, 'warning');
        continue;
      }

      const driver = this.drivers.get(vendor);
      if (driver) {
        await driver.logStatus();
      }
    }
  }

  /**
   * Get Docker run arguments for GPU support
   *
   * The vendor of the first selected GPU decides which flag style is used, and
   * only GPUs of that vendor contribute devices/indices, so a mixed-vendor
   * host never leaks another vendor's indices into the arguments.
   *
   * @param gpuIds Optional GPU ids to restrict the selection to
   * @returns Docker CLI arguments; empty when no GPU matches or the vendor is unknown
   */
  public async getDockerGpuArgs(gpuIds?: string[]): Promise<string[]> {
    const gpus = await this.gpuDetector.detectGpus();
    const args: string[] = [];

    // Filter to specific GPUs if provided
    const targetGpus = gpuIds
      ? gpus.filter((g) => gpuIds.includes(g.id))
      : gpus;

    const [firstGpu] = targetGpus;
    if (!firstGpu) {
      return args;
    }

    // Flags are vendor-specific: keep only the GPUs of the chosen vendor.
    const vendor = firstGpu.vendor;
    const vendorGpus = targetGpus.filter((g) => g.vendor === vendor);

    switch (vendor) {
      case 'nvidia':
        // NVIDIA uses nvidia-docker runtime
        args.push('--runtime=nvidia');
        if (gpuIds && gpuIds.length > 0) {
          // Use specific GPU indices
          const indices = vendorGpus.map((g) => g.index).join(',');
          args.push(`--gpus="device=${indices}"`);
        } else {
          args.push('--gpus=all');
        }
        break;

      case 'amd':
        // AMD uses device passthrough
        args.push('--device=/dev/kfd');
        for (const gpu of vendorGpus) {
          args.push(`--device=/dev/dri/renderD${128 + gpu.index}`);
        }
        args.push('--group-add=video');
        args.push('--security-opt=seccomp=unconfined');
        break;

      case 'intel':
        // Intel uses device passthrough
        for (const gpu of vendorGpus) {
          args.push(`--device=/dev/dri/renderD${128 + gpu.index}`);
        }
        args.push('--group-add=render');
        break;
    }

    return args;
  }

  /**
   * Detect GPUs and return the distinct vendors that have a known type
   * (everything except 'unknown').
   */
  private async detectSupportedVendors(): Promise<Set<TGpuVendor>> {
    const gpus = await this.gpuDetector.detectGpus();
    return new Set(gpus.map((g) => g.vendor).filter((v) => v !== 'unknown'));
  }
}
|
||||
11
ts/drivers/index.ts
Normal file
11
ts/drivers/index.ts
Normal file
@@ -0,0 +1,11 @@
|
||||
/**
|
||||
* Driver Management Module
|
||||
*
|
||||
* Exports all driver detection and installation functionality.
|
||||
*/
|
||||
|
||||
export { BaseDriver, type IDriverInstallOptions } from './base-driver.ts';
|
||||
export { NvidiaDriver } from './nvidia.ts';
|
||||
export { AmdDriver } from './amd.ts';
|
||||
export { IntelDriver } from './intel.ts';
|
||||
export { DriverManager } from './driver-manager.ts';
|
||||
339
ts/drivers/intel.ts
Normal file
339
ts/drivers/intel.ts
Normal file
@@ -0,0 +1,339 @@
|
||||
/**
|
||||
* Intel Driver Management
|
||||
*
|
||||
* Handles Intel Arc GPU driver detection, installation, and oneAPI setup.
|
||||
*/
|
||||
|
||||
import type { IDriverStatus } from '../interfaces/gpu.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
import { BaseDriver, type IDriverInstallOptions } from './base-driver.ts';
|
||||
|
||||
/**
|
||||
* Intel Arc/oneAPI Driver Manager
|
||||
*/
|
||||
export class IntelDriver extends BaseDriver {
|
||||
public readonly vendor = 'intel' as const;
|
||||
public readonly displayName = 'Intel Arc';
|
||||
|
||||
/**
 * Report whether an Intel GPU driver appears to be present.
 *
 * Runs a fallback chain (xpu-smi, then intel_gpu_top, then a count of
 * /dev/dri render nodes) and treats any non-blank output that does not
 * contain "not found" as a positive result.
 */
public async isInstalled(): Promise<boolean> {
  const probe =
    'xpu-smi discovery 2>/dev/null || intel_gpu_top -l 2>/dev/null || ls /dev/dri/renderD* 2>/dev/null | grep -c renderD';

  let output: string;
  try {
    const result = await this.execCommand(probe, { timeout: 5000, ignoreErrors: true });
    output = result.stdout;
  } catch {
    return false;
  }

  if (output.trim().length === 0) {
    return false;
  }
  return !output.includes('not found');
}
|
||||
|
||||
/**
 * Build the status report for the Intel GPU stack.
 *
 * Probes, in order: the i915 kernel module version, xpu-smi (whose version
 * replaces the i915 one when found), the oneAPI compiler directory and the
 * /dev/dri render nodes used for container passthrough.
 */
public async getStatus(): Promise<IDriverStatus> {
  const probeOptions = { timeout: 5000, ignoreErrors: true };
  const report: IDriverStatus = {
    vendor: 'intel',
    installed: false,
    containerSupport: false,
    issues: [],
  };

  // Kernel module (Intel integrated/Arc)
  try {
    const moduleInfo = (await this.execCommand('modinfo i915 2>/dev/null | grep "^version:"', probeOptions)).stdout;
    if (moduleInfo.includes('version')) {
      report.installed = true;
      const found = /version:\s*(\S+)/i.exec(moduleInfo);
      if (found) {
        report.version = found[1];
      }
    }
  } catch {
    // i915 module info not available
  }

  // xpu-smi (Intel Arc specific)
  try {
    const smiOutput = (await this.execCommand('xpu-smi --version 2>/dev/null', probeOptions)).stdout;
    if (smiOutput.includes('xpu-smi')) {
      report.installed = true;
      const found = /(\d+\.\d+(?:\.\d+)?)/.exec(smiOutput);
      if (found) {
        report.version = found[1];
      }
    }
  } catch {
    // xpu-smi not available
  }

  // oneAPI toolkit: the version is the directory name under compiler/
  try {
    const toolkit = (await this.execCommand(
      'ls /opt/intel/oneapi/compiler/*/env/vars.sh 2>/dev/null | head -1 | xargs dirname | xargs dirname | xargs basename',
      probeOptions,
    )).stdout.trim();
    if (toolkit) {
      report.toolkitVersion = toolkit;
    }
  } catch {
    // oneAPI not installed
  }

  // Container support depends on render nodes being present
  try {
    const nodes = (await this.execCommand('ls /dev/dri/renderD* 2>/dev/null', probeOptions)).stdout;
    if (!nodes.includes('renderD')) {
      report.issues.push('Intel GPU render devices not available');
    } else {
      report.containerSupport = true;
    }
  } catch {
    report.issues.push('Could not check Intel GPU device availability');
  }

  if (!report.installed) {
    report.issues.push('Intel GPU driver not detected');
  }

  return report;
}
|
||||
|
||||
/**
 * Install the Intel GPU driver stack, optionally with oneAPI.
 *
 * Needs root. Ubuntu and Fedora are handled; any other distribution is
 * rejected with a pointer to Intel's manual instructions. Failures inside
 * the distribution-specific installer are logged and reported as `false`.
 */
public async install(options: IDriverInstallOptions): Promise<boolean> {
  const hasRoot = await this.isRoot();
  if (!hasRoot) {
    logger.error('Root privileges required to install Intel GPU drivers');
    return false;
  }

  const distro = await this.getLinuxDistro();
  logger.info(`Detected Linux distribution: ${distro.id} ${distro.version}`);

  try {
    switch (distro.id) {
      case 'ubuntu':
        return await this.installOnUbuntu(options);
      case 'fedora':
        return await this.installOnFedora(options);
      default:
        logger.error(`Unsupported distribution for Intel Arc: ${distro.id}`);
        logger.info('Please install Intel drivers manually from https://dgpu-docs.intel.com/');
        return false;
    }
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.error(`Failed to install Intel drivers: ${reason}`);
    return false;
  }
}
|
||||
|
||||
/**
 * Install on Ubuntu
 *
 * Registers Intel's graphics apt repository (key stored in a dedicated
 * keyring), installs the compute/media/Mesa user-space packages and xpu-smi,
 * then optionally oneAPI and container support.
 *
 * @param options Installation options (toolkit and container support flags)
 * @returns true once all steps have completed
 * @throws when any command or package installation fails
 */
private async installOnUbuntu(options: IDriverInstallOptions): Promise<boolean> {
  logger.info('Installing Intel GPU drivers on Ubuntu...');

  // Install prerequisites
  await this.aptUpdate();
  await this.aptInstall(['wget', 'gpg']);

  // Add Intel graphics repository.
  // --yes overwrites a keyring left by a previous run; without it gpg stops
  // at its overwrite prompt and the installation cannot be repeated.
  await this.execCommand(
    'wget -qO - https://repositories.intel.com/graphics/intel-graphics.key | gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg',
  );

  const distro = await this.getLinuxDistro();
  const knownCodenames: ReadonlyMap<string, string> = new Map([
    ['22.04', 'jammy'],
    ['24.04', 'noble'],
  ]);
  let ubuntuCodename = knownCodenames.get(distro.version);
  if (!ubuntuCodename) {
    // Same fallback as before, but no longer silent.
    ubuntuCodename = 'jammy';
    logger.warn(`Unrecognized Ubuntu release "${distro.version}", using the ${ubuntuCodename} repository`);
  }

  await this.execCommand(
    `echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu ${ubuntuCodename} arc" > /etc/apt/sources.list.d/intel-graphics.list`,
  );

  await this.aptUpdate();

  // Install Intel GPU packages
  await this.aptInstall([
    'intel-opencl-icd',
    'intel-level-zero-gpu',
    'level-zero',
    'intel-media-va-driver-non-free',
    'libmfx1',
    'libmfxgen1',
    'libvpl2',
    'libegl-mesa0',
    'libegl1-mesa',
    'libegl1-mesa-dev',
    'libgbm1',
    'libgl1-mesa-dev',
    'libgl1-mesa-dri',
    'libglapi-mesa',
    'libgles2-mesa-dev',
    'libglx-mesa0',
    'libigdgmm12',
    'libxatracker2',
    'mesa-va-drivers',
    'mesa-vdpau-drivers',
    'mesa-vulkan-drivers',
    'va-driver-all',
  ]);

  // Install xpu-smi for monitoring
  await this.aptInstall('xpu-smi');

  // Install oneAPI toolkit if requested
  if (options.installToolkit) {
    await this.installOneApi();
  }

  // Add user to video and render groups (best effort: SUDO_USER may be unset)
  await this.execCommand('usermod -a -G video,render $SUDO_USER || true');

  // Install container support if requested
  if (options.installContainerSupport) {
    await this.installContainerSupport();
  }

  logger.success('Intel GPU driver installation completed');
  logger.info('Verify installation with: xpu-smi discovery');
  return true;
}
|
||||
|
||||
/**
 * Install the Intel GPU user-space stack on Fedora.
 *
 * Installs the media/compute/Level Zero packages through `dnfInstall`, makes
 * a best-effort attempt to add `xpu-smi`, adds the invoking sudo user to the
 * `video` and `render` groups, then optionally installs oneAPI and container
 * support.
 *
 * @param options - Install options. `installToolkit` and
 *   `installContainerSupport` enable the corresponding optional steps.
 * @returns `true` once all steps have run. Only the xpu-smi step is guarded;
 *   other rejections propagate to the caller.
 */
private async installOnFedora(options: IDriverInstallOptions): Promise<boolean> {
  logger.info('Installing Intel GPU drivers on Fedora...');

  // Newer Fedora kernels already carry Intel GPU support, so only the
  // user-space components have to be added.
  const userSpacePackages = [
    'intel-media-driver',
    'libva-intel-driver',
    'intel-compute-runtime',
    'level-zero',
    'oneapi-level-zero',
  ];
  await this.dnfInstall(userSpacePackages);

  // xpu-smi is optional: a failure here only produces a warning.
  try {
    const enableCopr = 'dnf copr enable -y intel/oneapi || true';
    await this.execCommand(enableCopr);
    await this.dnfInstall('xpu-smi');
  } catch {
    logger.warn('Could not install xpu-smi. Intel Arc monitoring may be limited.');
  }

  // Group membership for the invoking user (best-effort via `|| true`).
  const addToGroups = 'usermod -a -G video,render $SUDO_USER || true';
  await this.execCommand(addToGroups);

  // Optional oneAPI toolkit
  if (options.installToolkit) {
    await this.installOneApi();
  }

  // Optional container support
  if (options.installContainerSupport) {
    await this.installContainerSupport();
  }

  logger.success('Intel GPU driver installation completed');
  return true;
}
|
||||
|
||||
/**
 * Install the Intel oneAPI base toolkit (`intel-basekit`).
 *
 * On Ubuntu/Debian the oneAPI APT repository and its signing key are
 * registered before installing; on Fedora a `oneAPI.repo` file is written to
 * `/etc/yum.repos.d` before installing. On any other distribution nothing is
 * installed: a warning is logged and the method returns without reporting
 * success.
 *
 * @returns Resolves when the install finished or was skipped. Rejections
 *   from the awaited helpers are not caught here.
 */
private async installOneApi(): Promise<void> {
  logger.info('Installing Intel oneAPI toolkit...');

  const distro = await this.getLinuxDistro();

  if (distro.id === 'ubuntu' || distro.id === 'debian') {
    // Add Intel oneAPI repository key (tee overwrites any existing keyring)
    await this.execCommand(
      'wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null',
    );

    await this.execCommand(
      'echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list',
    );

    await this.aptUpdate();
    await this.aptInstall('intel-basekit');
  } else if (distro.id === 'fedora') {
    // Add Intel oneAPI repository. The heredoc is assembled from joined
    // lines so the terminating `EOF` is guaranteed to start at column 0
    // regardless of how this source file is indented.
    const repoFile = [
      '[oneAPI]',
      'name=Intel oneAPI repository',
      'baseurl=https://yum.repos.intel.com/oneapi',
      'enabled=1',
      'gpgcheck=1',
      'repo_gpgcheck=1',
      'gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB',
    ].join('\n');
    await this.execCommand(`cat <<EOF > /etc/yum.repos.d/oneAPI.repo\n${repoFile}\nEOF`);

    await this.dnfInstall('intel-basekit');
  } else {
    // Nothing was installed, so do not fall through to the success message.
    logger.warn(`Skipping Intel oneAPI toolkit: unsupported distribution ${distro.id}`);
    return;
  }

  logger.success('Intel oneAPI toolkit installed');
  logger.info('Source the environment with: source /opt/intel/oneapi/setvars.sh');
}
|
||||
|
||||
/**
 * Prepare the host so containers can use Intel GPUs.
 *
 * Intel GPUs are exposed to containers by passing device files through, so
 * this only checks that `/dev/dri/renderD*` nodes exist, relaxes their
 * permissions, and logs the Docker flags to use.
 *
 * @returns `true` when render nodes were found and the steps ran; `false`
 *   when no render node is listed or when any step throws.
 */
public async installContainerSupport(): Promise<boolean> {
  logger.info('Configuring Docker for Intel GPUs...');

  try {
    // List render nodes; `|| true` keeps a missing glob from failing the command.
    const listing = await this.execCommand('ls -la /dev/dri/renderD* 2>/dev/null || true');
    const hasRenderNode = listing.stdout.includes('renderD');

    if (!hasRenderNode) {
      logger.warn('/dev/dri/renderD* not found. Intel GPU driver may not be properly loaded.');
      return false;
    }

    // Open up the render nodes (best-effort).
    await this.execCommand('chmod 666 /dev/dri/renderD* || true');

    logger.success('Intel GPU container support configured');
    logger.info('Use the following Docker flags for Intel GPU containers:');
    logger.info('  --device=/dev/dri --group-add render');
    return true;
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.error(`Failed to configure Intel container support: ${reason}`);
    return false;
  }
}
|
||||
|
||||
/**
 * List version identifiers offered for Intel.
 *
 * Intel Arc drivers are typically tied to the kernel version, so a fixed
 * list of oneAPI releases is returned as a reference instead.
 *
 * @returns A hard-coded list of oneAPI release identifiers, newest first.
 */
public async getAvailableVersions(): Promise<string[]> {
  const oneApiReleases: string[] = ['2024.0', '2023.2', '2023.1', '2023.0'];
  return oneApiReleases;
}
|
||||
}
|
||||
318
ts/drivers/nvidia.ts
Normal file
318
ts/drivers/nvidia.ts
Normal file
@@ -0,0 +1,318 @@
|
||||
/**
|
||||
* NVIDIA Driver Management
|
||||
*
|
||||
* Handles NVIDIA driver detection, installation, and container toolkit setup.
|
||||
*/
|
||||
|
||||
import type { IDriverStatus } from '../interfaces/gpu.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
import { BaseDriver, type IDriverInstallOptions } from './base-driver.ts';
|
||||
|
||||
/**
|
||||
* NVIDIA Driver Manager
|
||||
*/
|
||||
export class NvidiaDriver extends BaseDriver {
|
||||
public readonly vendor = 'nvidia' as const;
|
||||
public readonly displayName = 'NVIDIA';
|
||||
|
||||
/**
 * Report whether an NVIDIA driver appears to be installed.
 *
 * Queries `nvidia-smi` for the driver version (5 s timeout, errors ignored)
 * and treats any non-blank output as "installed".
 *
 * @returns `true` when the query printed something; `false` when the output
 *   is blank or the command throws.
 */
public async isInstalled(): Promise<boolean> {
  const versionQuery = 'nvidia-smi --query-gpu=driver_version --format=csv,noheader';

  try {
    const result = await this.execCommand(versionQuery, {
      timeout: 5000,
      ignoreErrors: true,
    });
    return result.stdout.trim() !== '';
  } catch {
    return false;
  }
}
|
||||
|
||||
/**
 * Collect the NVIDIA driver status for this host.
 *
 * Probes, in order: the driver version via `nvidia-smi`, the CUDA toolkit
 * version via `nvcc`, the NVIDIA Container Toolkit via
 * `nvidia-container-cli`, and whether Docker lists an `nvidia` runtime.
 * Problems are appended to `issues` instead of being thrown.
 *
 * @returns The populated status. When no driver version can be read, the
 *   status is returned early with `installed: false` and one issue entry;
 *   the remaining probes are skipped.
 */
public async getStatus(): Promise<IDriverStatus> {
  const status: IDriverStatus = {
    vendor: 'nvidia',
    installed: false,
    containerSupport: false,
    issues: [],
  };

  // Check the driver through nvidia-smi. The command is deliberately NOT
  // piped into `head`: a pipeline reports the exit status of its last
  // command, which would hide a missing or failing nvidia-smi. The first
  // line is picked here instead (there is one line per GPU).
  let driverVersion = '';
  try {
    const { stdout } = await this.execCommand(
      'nvidia-smi --query-gpu=driver_version --format=csv,noheader',
      { timeout: 5000 },
    );
    const [firstLine = ''] = stdout.trim().split('\n');
    driverVersion = firstLine.trim();
  } catch {
    // Leave driverVersion empty; the check below records the issue.
  }

  // Only accept output that starts like a version number, so an error
  // message printed by nvidia-smi is not mistaken for a driver version.
  if (!/^\d+(\.\d+)*/.test(driverVersion)) {
    status.issues.push('NVIDIA driver not installed or nvidia-smi not available');
    return status;
  }
  status.installed = true;
  status.version = driverVersion;

  // Check CUDA toolkit
  try {
    const { stdout: cudaVersion } = await this.execCommand(
      'nvcc --version 2>/dev/null | grep "release" | sed "s/.*release \\([0-9.]*\\).*/\\1/"',
      { timeout: 5000, ignoreErrors: true },
    );
    if (cudaVersion.trim()) {
      status.toolkitVersion = cudaVersion.trim();
    }
  } catch {
    // CUDA toolkit not installed
  }

  // Check nvidia-container-toolkit
  try {
    const { stdout: containerVersion } = await this.execCommand(
      'nvidia-container-cli --version 2>&1 | head -1',
      { timeout: 5000, ignoreErrors: true },
    );
    if (containerVersion.includes('version')) {
      status.containerSupport = true;
      // Accept both "version 1.2.3" and "cli-version: 1.2.3" style output.
      const match = containerVersion.match(/version:?\s*(\d+\.\d+\.\d+)/);
      if (match) {
        status.containerRuntimeVersion = match[1];
      }
    } else {
      // The probe ignores errors, so a missing toolkit shows up as output
      // without a version rather than as an exception.
      status.issues.push('NVIDIA Container Toolkit not installed');
    }
  } catch {
    status.issues.push('NVIDIA Container Toolkit not installed');
  }

  // Check if Docker has nvidia runtime
  try {
    const { stdout: dockerInfo } = await this.execCommand(
      'docker info --format "{{.Runtimes}}" 2>/dev/null',
      { timeout: 5000, ignoreErrors: true },
    );
    if (!dockerInfo.includes('nvidia')) {
      status.issues.push('Docker nvidia runtime not configured');
    }
  } catch {
    // Docker check failed
  }

  return status;
}
|
||||
|
||||
/**
 * Install the NVIDIA driver, dispatching on the detected distribution.
 *
 * Requires root. Ubuntu/Debian go through `installOnDebian`; Fedora, RHEL,
 * CentOS, Rocky and AlmaLinux go through `installOnRhel`; anything else is
 * rejected with a log message.
 *
 * @param options - Install options forwarded unchanged to the
 *   distribution-specific installer.
 * @returns The installer's result, or `false` when not running as root, the
 *   distribution is unsupported, or the installer throws.
 */
public async install(options: IDriverInstallOptions): Promise<boolean> {
  const hasRoot = await this.isRoot();
  if (!hasRoot) {
    logger.error('Root privileges required to install NVIDIA drivers');
    return false;
  }

  const distro = await this.getLinuxDistro();
  logger.info(`Detected Linux distribution: ${distro.id} ${distro.version}`);

  const debianFamily: string[] = ['ubuntu', 'debian'];
  const rhelFamily: string[] = ['fedora', 'rhel', 'centos', 'rocky', 'almalinux'];

  try {
    if (debianFamily.includes(distro.id)) {
      return await this.installOnDebian(options);
    }
    if (rhelFamily.includes(distro.id)) {
      return await this.installOnRhel(options);
    }

    logger.error(`Unsupported distribution: ${distro.id}`);
    logger.info('Please install NVIDIA drivers manually');
    return false;
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.error(`Failed to install NVIDIA drivers: ${reason}`);
    return false;
  }
}
|
||||
|
||||
/**
 * Install the NVIDIA driver on Debian/Ubuntu.
 *
 * Refreshes the package index, installs build prerequisites, tries to add
 * the graphics-drivers PPA, installs the driver package, then optionally the
 * CUDA toolkit and container support.
 *
 * @param options - `driverVersion` selects `nvidia-driver-<version>`
 *   (default `nvidia-driver-535`); `installToolkit` and
 *   `installContainerSupport` enable the optional steps.
 * @returns `true` once all steps have run. Only the PPA step is guarded;
 *   other rejections propagate to the caller.
 */
private async installOnDebian(options: IDriverInstallOptions): Promise<boolean> {
  logger.info('Installing NVIDIA drivers on Debian/Ubuntu...');

  // Refresh the package index before installing anything.
  await this.aptUpdate();

  // Build prerequisites
  const prerequisites = ['software-properties-common', 'build-essential', 'dkms'];
  await this.aptInstall(prerequisites);

  // The graphics-drivers PPA is an Ubuntu facility; failures are tolerated.
  try {
    await this.execCommand('add-apt-repository -y ppa:graphics-drivers/ppa 2>/dev/null || true');
    await this.aptUpdate();
  } catch {
    // PPA might not be available on all systems
  }

  // Pick the driver package: explicit version if given, otherwise the
  // default stable version.
  let driverPackage = 'nvidia-driver-535';
  if (options.driverVersion) {
    driverPackage = `nvidia-driver-${options.driverVersion}`;
  }
  await this.aptInstall(driverPackage);

  // Optional CUDA toolkit
  if (options.installToolkit) {
    await this.installCudaToolkit(options);
  }

  // Optional container support
  if (options.installContainerSupport) {
    await this.installContainerSupport();
  }

  logger.success('NVIDIA driver installation completed');
  logger.warn('A system reboot is required to load the new driver');
  return true;
}
|
||||
|
||||
/**
 * Install the NVIDIA driver on RHEL-family distributions and Fedora.
 *
 * Installs build prerequisites, registers NVIDIA's CUDA repository for the
 * detected release, installs `nvidia-driver-latest-dkms`, then optionally
 * the `cuda` package and container support.
 *
 * @param options - `installToolkit` and `installContainerSupport` enable the
 *   optional steps. `driverVersion` and `toolkitVersion` are not consulted
 *   on this path.
 * @returns `true` once all steps have run. Nothing is caught here, so a
 *   rejection from any awaited helper propagates to the caller.
 */
private async installOnRhel(options: IDriverInstallOptions): Promise<boolean> {
  logger.info('Installing NVIDIA drivers on RHEL/Fedora...');

  // Install prerequisites
  await this.dnfInstall(['kernel-devel', 'kernel-headers', 'gcc', 'make', 'dkms', 'acpid']);

  // Add NVIDIA CUDA repository. NVIDIA keeps Fedora under `fedora<N>` and
  // the RHEL-compatible distributions under `rhel<major>`; using the RHEL
  // prefix for Fedora would point at a repository that does not exist.
  const distro = await this.getLinuxDistro();
  const majorVersion = distro.version.split('.')[0];
  const repoName = `${distro.id === 'fedora' ? 'fedora' : 'rhel'}${majorVersion}`;
  const repoUrl = `https://developer.download.nvidia.com/compute/cuda/repos/${repoName}/x86_64/cuda-${repoName}.repo`;

  await this.execCommand(`dnf config-manager --add-repo ${repoUrl}`);

  // Install NVIDIA driver
  await this.dnfInstall('nvidia-driver-latest-dkms');

  // Install CUDA toolkit if requested
  if (options.installToolkit) {
    await this.dnfInstall('cuda');
  }

  // Install container support if requested
  if (options.installContainerSupport) {
    await this.installContainerSupport();
  }

  logger.success('NVIDIA driver installation completed');
  logger.warn('A system reboot is required to load the new driver');
  return true;
}
|
||||
|
||||
/**
 * Install the CUDA toolkit from NVIDIA's APT repository.
 *
 * Only acts on Ubuntu/Debian: downloads and installs the `cuda-keyring`
 * package for the detected release, refreshes the package index, and
 * installs the toolkit. On other distributions the method does nothing.
 *
 * @param options - `toolkitVersion` (e.g. `12.2`) selects
 *   `cuda-toolkit-12-2`; when absent the unversioned `cuda-toolkit` package
 *   is installed.
 * @returns Resolves when done. Rejections from the awaited helpers are not
 *   caught here.
 */
private async installCudaToolkit(options: IDriverInstallOptions): Promise<void> {
  logger.info('Installing CUDA toolkit...');

  const distro = await this.getLinuxDistro();

  if (distro.id === 'ubuntu' || distro.id === 'debian') {
    // NVIDIA publishes one repository directory per release. Use the one
    // matching this host; unknown releases keep the previous behaviour of
    // using the Ubuntu 22.04 repository.
    const knownRepos: Record<string, string> = {
      'ubuntu20.04': 'ubuntu2004',
      'ubuntu22.04': 'ubuntu2204',
      'ubuntu24.04': 'ubuntu2404',
      'debian11': 'debian11',
      'debian12': 'debian12',
    };
    const repoName = knownRepos[`${distro.id}${distro.version}`] ?? 'ubuntu2204';

    // Add CUDA repository
    const cudaKeyUrl = `https://developer.download.nvidia.com/compute/cuda/repos/${repoName}/x86_64/cuda-keyring_1.1-1_all.deb`;
    await this.execCommand(`wget -q ${cudaKeyUrl} -O /tmp/cuda-keyring.deb && dpkg -i /tmp/cuda-keyring.deb`);
    await this.aptUpdate();

    // Package names use dashes instead of the dot, e.g. 12.2 -> cuda-toolkit-12-2.
    const cudaPackage = options.toolkitVersion
      ? `cuda-toolkit-${options.toolkitVersion.replace('.', '-')}`
      : 'cuda-toolkit';

    await this.aptInstall(cudaPackage);
  }
}
|
||||
|
||||
/**
 * Install the NVIDIA Container Toolkit and configure Docker to use it.
 *
 * Requires root. On Ubuntu/Debian the toolkit's APT repository and signing
 * key are registered first; every other distribution is handled through the
 * RPM repository and `dnfInstall`. Afterwards the Docker runtime is
 * configured via `configureDockerRuntime`.
 *
 * @returns `true` on success; `false` when not running as root or when any
 *   step throws (the error is logged).
 */
public async installContainerSupport(): Promise<boolean> {
  if (!await this.isRoot()) {
    logger.error('Root privileges required to install NVIDIA Container Toolkit');
    return false;
  }

  const distro = await this.getLinuxDistro();
  logger.info('Installing NVIDIA Container Toolkit...');

  try {
    if (distro.id === 'ubuntu' || distro.id === 'debian') {
      // Add repository key. `--yes` lets gpg overwrite a keyring left by a
      // previous run instead of failing on a non-interactive re-run.
      await this.execCommand(
        'curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg',
      );

      // Use the distribution-independent "stable/deb" list (the counterpart
      // of the "stable/rpm" repo used below). The per-distribution list
      // paths are not served for newer releases.
      await this.execCommand(
        'curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | ' +
          'sed "s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g" | ' +
          'tee /etc/apt/sources.list.d/nvidia-container-toolkit.list',
      );

      await this.aptUpdate();
      await this.aptInstall('nvidia-container-toolkit');
    } else {
      // RHEL/Fedora
      await this.execCommand(
        'curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | ' +
          'tee /etc/yum.repos.d/nvidia-container-toolkit.repo',
      );
      await this.dnfInstall('nvidia-container-toolkit');
    }

    // Configure Docker runtime
    await this.configureDockerRuntime();

    logger.success('NVIDIA Container Toolkit installed successfully');
    return true;
  } catch (error) {
    logger.error(`Failed to install NVIDIA Container Toolkit: ${error instanceof Error ? error.message : String(error)}`);
    return false;
  }
}
|
||||
|
||||
/**
 * Register the NVIDIA runtime with Docker and restart the Docker service.
 *
 * Best-effort: if either command throws, a warning and the manual command
 * are logged and the method still resolves.
 *
 * @returns Resolves in all cases; failures are logged, not thrown.
 */
private async configureDockerRuntime(): Promise<void> {
  logger.info('Configuring Docker to use NVIDIA runtime...');

  // Run in order: let nvidia-ctk configure Docker, then restart Docker.
  const steps = [
    'nvidia-ctk runtime configure --runtime=docker',
    'systemctl restart docker',
  ];

  try {
    for (const step of steps) {
      await this.execCommand(step);
    }
    logger.success('Docker configured to use NVIDIA runtime');
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.warn(`Could not configure Docker runtime automatically: ${reason}`);
    logger.info('Please run: nvidia-ctk runtime configure --runtime=docker');
  }
}
|
||||
|
||||
/**
 * List NVIDIA driver versions offered by the package manager.
 *
 * Only Ubuntu/Debian are queried (through `apt-cache search`); the shell
 * pipeline reduces the package names to their numeric suffix, sorted in
 * descending order without duplicates.
 *
 * @returns The non-blank output lines, or an empty array on other
 *   distributions or when the lookup throws.
 */
public async getAvailableVersions(): Promise<string[]> {
  const versions: string[] = [];

  try {
    const distro = await this.getLinuxDistro();
    const isDebianFamily = distro.id === 'ubuntu' || distro.id === 'debian';

    if (isDebianFamily) {
      const listing = await this.execCommand(
        'apt-cache search nvidia-driver | grep "^nvidia-driver-[0-9]" | sed "s/nvidia-driver-\\([0-9]*\\).*/\\1/" | sort -rn | uniq',
        { ignoreErrors: true },
      );

      // Keep every line that is not blank.
      for (const line of listing.stdout.trim().split('\n')) {
        if (line.trim()) {
          versions.push(line);
        }
      }
    }
  } catch {
    // Failed to get versions
  }

  return versions;
}
|
||||
}
|
||||
Reference in New Issue
Block a user