/**
 * AMD Driver Management
 *
 * Handles AMD ROCm driver detection, installation, and container setup.
 */

import type { IDriverStatus } from '../interfaces/gpu.ts';
import { logger } from '../logger.ts';
import { BaseDriver, type IDriverInstallOptions } from './base-driver.ts';

/**
 * AMD ROCm Driver Manager
 *
 * Detects the ROCm driver through `rocm-smi`, installs the AMDGPU/ROCm
 * packages on Ubuntu and RHEL-family distributions, and prepares the GPU
 * device files for use from Docker containers.
 */
export class AmdDriver extends BaseDriver {
  public readonly vendor = 'amd' as const;
  public readonly displayName = 'AMD ROCm';

  /**
   * Check if AMD ROCm driver is installed.
   *
   * @returns `true` when `rocm-smi --showdriverversion` prints a line
   *   containing "Driver"; `false` otherwise or when the command throws.
   */
  public async isInstalled(): Promise<boolean> {
    try {
      // Do not truncate the output to its first line: rocm-smi prints banner
      // lines before the "Driver version" line. This is the same probe that
      // getStatus() uses, so both methods agree on what "installed" means.
      const { stdout } = await this.execCommand('rocm-smi --showdriverversion 2>/dev/null', {
        timeout: 5000,
        ignoreErrors: true,
      });
      return stdout.includes('Driver');
    } catch {
      return false;
    }
  }

  /**
   * Get AMD ROCm driver status.
   *
   * Runs three independent probes: driver (rocm-smi), toolkit version
   * (/opt/rocm/.info/version or rocminfo) and container support (Docker info
   * plus the /dev/kfd and /dev/dri/render* device files). Problems found are
   * collected in `issues` instead of being thrown.
   *
   * @returns The collected driver status.
   */
  public async getStatus(): Promise<IDriverStatus> {
    const status: IDriverStatus = {
      vendor: 'amd',
      installed: false,
      containerSupport: false,
      issues: [],
    };

    // Check if rocm-smi is available
    try {
      const { stdout: driverInfo } = await this.execCommand(
        'rocm-smi --showdriverversion 2>/dev/null',
        { timeout: 5000, ignoreErrors: true },
      );

      if (driverInfo.includes('Driver')) {
        status.installed = true;
        const match = driverInfo.match(/Driver version:\s*(\S+)/i);
        if (match) {
          status.version = match[1];
        }
      } else {
        // With ignoreErrors set, a missing rocm-smi presumably yields empty
        // output rather than an exception (TODO confirm against BaseDriver),
        // so the problem has to be reported here as well as in the catch.
        status.issues.push('ROCm driver not installed or rocm-smi not available');
      }
    } catch {
      status.issues.push('ROCm driver not installed or rocm-smi not available');
      return status;
    }

    // Check ROCm toolkit version
    try {
      const { stdout: rocmVersion } = await this.execCommand(
        'cat /opt/rocm/.info/version 2>/dev/null || rocminfo 2>/dev/null | grep "ROCm" | head -1',
        { timeout: 5000, ignoreErrors: true },
      );
      const match = rocmVersion.match(/(\d+\.\d+(?:\.\d+)?)/);
      if (match) {
        status.toolkitVersion = match[1];
      }
    } catch {
      // ROCm toolkit version not available
    }

    // Check Docker ROCm support
    try {
      const { stdout: dockerInfo } = await this.execCommand(
        'docker info 2>/dev/null | grep -i "rocm\\|amd"',
        { timeout: 5000, ignoreErrors: true },
      );

      // ROCm containers only need the device files passed through, so the
      // presence of /dev/kfd is treated as sufficient for container support.
      const { stdout: deviceCheck } = await this.execCommand(
        'ls /dev/kfd /dev/dri/render* 2>/dev/null',
        { timeout: 5000, ignoreErrors: true },
      );

      if (deviceCheck.includes('/dev/kfd') || dockerInfo.includes('rocm')) {
        status.containerSupport = true;
      } else {
        status.issues.push('ROCm device files not available for container access');
      }
    } catch {
      status.issues.push('Could not verify Docker ROCm support');
    }

    return status;
  }

  /**
   * Install AMD ROCm driver.
   *
   * Requires root. Dispatches to the Ubuntu or RHEL-family installer based on
   * the detected distribution id. Failures are logged, never thrown.
   *
   * @param options Installation options (toolkit version, whether to install
   *   the toolkit and container support).
   * @returns `true` when installation completed, `false` otherwise.
   */
  public async install(options: IDriverInstallOptions): Promise<boolean> {
    if (!(await this.isRoot())) {
      logger.error('Root privileges required to install AMD ROCm drivers');
      return false;
    }

    const distro = await this.getLinuxDistro();
    logger.info(`Detected Linux distribution: ${distro.id} ${distro.version}`);

    try {
      if (distro.id === 'ubuntu') {
        return await this.installOnUbuntu(options);
      } else if (
        distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' ||
        distro.id === 'almalinux'
      ) {
        return await this.installOnRhel(options);
      } else {
        logger.error(`Unsupported distribution: ${distro.id}`);
        logger.info('Please install ROCm drivers manually from https://rocm.docs.amd.com/');
        return false;
      }
    } catch (error) {
      logger.error(
        `Failed to install AMD ROCm drivers: ${error instanceof Error ? error.message : String(error)}`,
      );
      return false;
    }
  }

  /**
   * Resolve the ROCm version to install, defaulting to '6.0'.
   *
   * The value is interpolated into shell commands that run as root, so it is
   * restricted to characters that can appear in a repository path segment.
   *
   * @throws Error when the requested version contains other characters.
   */
  private resolveRocmVersion(options: IDriverInstallOptions): string {
    const rocmVersion = options.toolkitVersion || '6.0';
    if (!/^[A-Za-z0-9][A-Za-z0-9._-]*$/.test(rocmVersion)) {
      throw new Error(`Invalid ROCm version: ${rocmVersion}`);
    }
    return rocmVersion;
  }

  /**
   * Install on Ubuntu.
   *
   * Registers the ROCm and AMDGPU apt repositories, installs `amdgpu-dkms`
   * plus either the HIP SDK or just the SMI library, and adds the invoking
   * sudo user to the video and render groups.
   */
  private async installOnUbuntu(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing AMD ROCm on Ubuntu...');

    // Install prerequisites
    await this.aptUpdate();
    await this.aptInstall(['wget', 'gnupg2']);

    // Add ROCm repository
    const rocmVersion = this.resolveRocmVersion(options);

    // Use only major+minor of the release ("22.04" or "22.04.3" -> "2204").
    const [major = '', minor = ''] = (await this.getLinuxDistro()).version.split('.');
    const ubuntuVersion = `${major}${minor}`;
    const codenames: Record<string, string> = {
      '2004': 'focal',
      '2204': 'jammy',
      '2404': 'noble',
    };
    // Unknown releases keep the previous fallback of 'focal'.
    const codename = codenames[ubuntuVersion] ?? 'focal';

    // Download and install ROCm repository
    await this.execCommand(
      `wget -q https://repo.radeon.com/rocm/rocm.gpg.key -O - | apt-key add -`,
    );

    // NOTE(review): newer ROCm apt repositories may be published per Ubuntu
    // codename rather than under the "ubuntu" suite - confirm for rocmVersion.
    await this.execCommand(
      `echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${rocmVersion} ubuntu main" > /etc/apt/sources.list.d/rocm.list`,
    );

    // Add AMDGPU repository
    await this.execCommand(
      `echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${rocmVersion}/ubuntu ${codename} main" > /etc/apt/sources.list.d/amdgpu.list`,
    );

    await this.aptUpdate();

    // Install AMDGPU driver and ROCm
    await this.aptInstall('amdgpu-dkms');

    if (options.installToolkit) {
      await this.aptInstall('rocm-hip-sdk');
    } else {
      await this.aptInstall('rocm-smi-lib');
    }

    // Add user to video and render groups (best effort: "|| true" keeps the
    // install going when SUDO_USER is unset or the group is missing).
    await this.execCommand('usermod -a -G video,render $SUDO_USER || true');

    // Install container support if requested
    if (options.installContainerSupport) {
      await this.installContainerSupport();
    }

    logger.success('AMD ROCm installation completed');
    logger.warn('A system reboot is required to load the new driver');
    logger.info('After reboot, verify with: rocm-smi');

    return true;
  }

  /**
   * Install on RHEL.
   *
   * Writes the ROCm and AMDGPU yum repository definitions, installs
   * `amdgpu-dkms` plus either the HIP SDK or just the SMI library, and adds
   * the invoking sudo user to the video and render groups.
   */
  private async installOnRhel(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing AMD ROCm on RHEL/CentOS...');

    const rocmVersion = this.resolveRocmVersion(options);
    const distro = await this.getLinuxDistro();
    const rhelVersion = distro.version.split('.')[0];

    // Add EPEL repository
    await this.dnfInstall('epel-release');

    // Add ROCm repository. The file is written through a quoted here-document
    // so the shell copies the content verbatim.
    const rocmRepo = [
      '[ROCm]',
      'name=ROCm',
      `baseurl=https://repo.radeon.com/rocm/yum/${rocmVersion}/main`,
      'enabled=1',
      'gpgcheck=1',
      'gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key',
    ].join('\n');
    await this.execCommand(`cat <<'EOF' > /etc/yum.repos.d/rocm.repo\n${rocmRepo}\nEOF`);

    // Add AMDGPU repository
    const amdgpuRepo = [
      '[amdgpu]',
      'name=amdgpu',
      `baseurl=https://repo.radeon.com/amdgpu/${rocmVersion}/rhel/${rhelVersion}/main/x86_64/`,
      'enabled=1',
      'gpgcheck=1',
      'gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key',
    ].join('\n');
    await this.execCommand(`cat <<'EOF' > /etc/yum.repos.d/amdgpu.repo\n${amdgpuRepo}\nEOF`);

    // Install AMDGPU driver
    await this.dnfInstall('amdgpu-dkms');

    if (options.installToolkit) {
      await this.dnfInstall('rocm-hip-sdk');
    } else {
      await this.dnfInstall('rocm-smi-lib');
    }

    // Add user to video and render groups (best effort, see "|| true")
    await this.execCommand('usermod -a -G video,render $SUDO_USER || true');

    // Install container support if requested
    if (options.installContainerSupport) {
      await this.installContainerSupport();
    }

    logger.success('AMD ROCm installation completed');
    logger.warn('A system reboot is required to load the new driver');

    return true;
  }

  /**
   * Install container support for AMD GPUs.
   *
   * No container runtime is installed: the method verifies that /dev/kfd
   * exists, relaxes permissions on the GPU device files and logs the Docker
   * flags to use.
   *
   * @returns `true` when the device files were found and configured, `false`
   *   when /dev/kfd is missing or a command failed.
   */
  public async installContainerSupport(): Promise<boolean> {
    logger.info('Configuring Docker for AMD ROCm...');

    try {
      // AMD ROCm containers work by passing through device files
      // No special runtime needed, just need to pass --device flags

      // Verify device files exist
      const { stdout: devices } = await this.execCommand(
        'ls -la /dev/kfd /dev/dri/render* 2>/dev/null || true',
      );

      if (!devices.includes('/dev/kfd')) {
        logger.warn('/dev/kfd not found. ROCm driver may not be properly loaded.');
        logger.info('Try rebooting the system after driver installation.');
        return false;
      }

      // Set permissions so any user (including container users) can open the
      // devices; failures are tolerated via "|| true".
      await this.execCommand('chmod 666 /dev/kfd /dev/dri/render* || true');

      logger.success('AMD ROCm container support configured');
      logger.info('Use the following Docker flags for ROCm containers:');
      logger.info('  --device=/dev/kfd --device=/dev/dri --group-add video');

      return true;
    } catch (error) {
      logger.error(
        `Failed to configure ROCm container support: ${error instanceof Error ? error.message : String(error)}`,
      );
      return false;
    }
  }

  /**
   * Get available ROCm versions.
   *
   * @returns A fixed list of ROCm versions, newest first.
   */
  public async getAvailableVersions(): Promise<string[]> {
    // ROCm has a standard set of supported versions
    return ['6.0', '5.7', '5.6', '5.5', '5.4'];
  }
}