Files
modelgrid/ts/drivers/amd.ts

282 lines
8.4 KiB
TypeScript
Raw Normal View History

2026-01-30 03:16:57 +00:00
/**
* AMD Driver Management
*
* Handles AMD ROCm driver detection, installation, and container setup.
*/
import type { IDriverStatus } from '../interfaces/gpu.ts';
import { logger } from '../logger.ts';
import { BaseDriver, type IDriverInstallOptions } from './base-driver.ts';
/**
 * AMD ROCm Driver Manager
 *
 * Detects the AMD ROCm driver via `rocm-smi`, installs the driver/toolkit
 * from AMD's package repositories on Ubuntu and RHEL-family distributions,
 * and prepares the GPU device nodes for use from Docker containers.
 */
export class AmdDriver extends BaseDriver {
  public readonly vendor = 'amd' as const;
  public readonly displayName = 'AMD ROCm';

  /** ROCm release used when the caller does not request a specific one. */
  private static readonly defaultRocmVersion = '6.0';

  /**
   * Characters accepted in a ROCm version string. The value is interpolated
   * into shell commands that run as root, so shell metacharacters are refused.
   * Non-numeric labels such as "latest" remain allowed.
   */
  private static readonly rocmVersionPattern = /^[A-Za-z0-9][A-Za-z0-9._-]*$/;

  /** Ubuntu "major.minor" release -> apt suite (codename) used by the AMD repositories. */
  private static readonly ubuntuCodenames = new Map<string, string>([
    ['20.04', 'focal'],
    ['22.04', 'jammy'],
    ['24.04', 'noble'],
  ]);

  /** Distribution ids handled by the RHEL installation path. */
  private static readonly rhelFamily: readonly string[] = ['rhel', 'centos', 'rocky', 'almalinux'];

  /**
   * Check if AMD ROCm driver is installed.
   *
   * @returns `true` when `rocm-smi --showdriverversion` prints a line
   *          containing "Driver"; `false` otherwise or when the command throws.
   */
  public async isInstalled(): Promise<boolean> {
    try {
      // The whole output is inspected (not just the first line): rocm-smi
      // prints banner lines before the "Driver version" line, and getStatus()
      // applies the same full-output test.
      const { stdout } = await this.execCommand('rocm-smi --showdriverversion 2>/dev/null', {
        timeout: 5000,
        ignoreErrors: true,
      });
      return stdout.includes('Driver');
    } catch {
      return false;
    }
  }

  /**
   * Get AMD ROCm driver status.
   *
   * Collects the driver version, the ROCm toolkit version and whether GPU
   * device nodes are available for containers. Problems found along the way
   * are appended to `issues` instead of being thrown.
   *
   * @returns The populated status object.
   */
  public async getStatus(): Promise<IDriverStatus> {
    const status: IDriverStatus = {
      vendor: 'amd',
      installed: false,
      containerSupport: false,
      issues: [],
    };
    const driverMissing = 'ROCm driver not installed or rocm-smi not available';

    // Check if rocm-smi is available
    try {
      const { stdout: driverInfo } = await this.execCommand(
        'rocm-smi --showdriverversion 2>/dev/null',
        { timeout: 5000, ignoreErrors: true },
      );
      if (driverInfo.includes('Driver')) {
        status.installed = true;
        const match = driverInfo.match(/Driver version:\s*(\S+)/i);
        if (match) {
          status.version = match[1];
        }
      }
    } catch {
      status.issues.push(driverMissing);
      return status;
    }
    if (!status.installed) {
      // With ignoreErrors set the command presumably does not throw when
      // rocm-smi is missing, so the problem has to be recorded here as well.
      // The remaining checks still run: device nodes can exist without rocm-smi.
      status.issues.push(driverMissing);
    }

    // Check ROCm toolkit version
    try {
      const { stdout: rocmVersion } = await this.execCommand(
        'cat /opt/rocm/.info/version 2>/dev/null || rocminfo 2>/dev/null | grep "ROCm" | head -1',
        { timeout: 5000, ignoreErrors: true },
      );
      const match = rocmVersion.match(/(\d+\.\d+(?:\.\d+)?)/);
      if (match) {
        status.toolkitVersion = match[1];
      }
    } catch {
      // ROCm toolkit version not available
    }

    // Check Docker ROCm support
    try {
      const { stdout: dockerInfo } = await this.execCommand(
        'docker info 2>/dev/null | grep -i "rocm\\|amd"',
        { timeout: 5000, ignoreErrors: true },
      );
      // ROCm containers only need the kernel device nodes passed through
      const { stdout: deviceCheck } = await this.execCommand(
        'ls /dev/kfd /dev/dri/render* 2>/dev/null',
        { timeout: 5000, ignoreErrors: true },
      );
      // grep above is case-insensitive, so compare case-insensitively too
      const dockerMentionsRocm = dockerInfo.toLowerCase().includes('rocm');
      if (deviceCheck.includes('/dev/kfd') || dockerMentionsRocm) {
        status.containerSupport = true;
      } else {
        status.issues.push('ROCm device files not available for container access');
      }
    } catch {
      status.issues.push('Could not verify Docker ROCm support');
    }

    return status;
  }

  /**
   * Install AMD ROCm driver.
   *
   * Requires root. Dispatches to the Ubuntu or RHEL-family installer based on
   * the detected distribution id; other distributions are rejected.
   *
   * @param options Installation options (toolkit version, toolkit and
   *                container-support flags).
   * @returns `true` on success, `false` when unsupported, not root, or when
   *          any installation step throws (the error is logged).
   */
  public async install(options: IDriverInstallOptions): Promise<boolean> {
    if (!await this.isRoot()) {
      logger.error('Root privileges required to install AMD ROCm drivers');
      return false;
    }

    const distro = await this.getLinuxDistro();
    logger.info(`Detected Linux distribution: ${distro.id} ${distro.version}`);

    try {
      if (distro.id === 'ubuntu') {
        return await this.installOnUbuntu(options);
      }
      if (AmdDriver.rhelFamily.includes(distro.id)) {
        return await this.installOnRhel(options);
      }
      logger.error(`Unsupported distribution: ${distro.id}`);
      logger.info('Please install ROCm drivers manually from https://rocm.docs.amd.com/');
      return false;
    } catch (error) {
      logger.error(`Failed to install AMD ROCm drivers: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Install on Ubuntu.
   *
   * Registers the ROCm and AMDGPU apt repositories for the detected Ubuntu
   * release and installs `amdgpu-dkms` plus either `rocm-hip-sdk` or
   * `rocm-smi-lib`.
   *
   * @returns `true` when all steps ran; `false` when the Ubuntu release has
   *          no known repository suite.
   * @throws Error when the requested ROCm version contains unsafe characters,
   *         or whatever the underlying command helpers throw.
   */
  private async installOnUbuntu(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing AMD ROCm on Ubuntu...');

    // Validate everything before touching the system
    const rocmVersion = this.resolveRocmVersion(options);
    const distro = await this.getLinuxDistro();
    const codename = AmdDriver.ubuntuCodenameFor(distro.version);
    if (!codename) {
      // Installing packages built for another release breaks dependencies,
      // so refuse instead of silently falling back to an older suite.
      logger.error(`Unsupported Ubuntu release for AMD ROCm: ${distro.version}`);
      logger.info('Please install ROCm drivers manually from https://rocm.docs.amd.com/');
      return false;
    }

    // Install prerequisites
    await this.aptUpdate();
    await this.aptInstall(['wget', 'gnupg2']);

    // Trust the ROCm repository signing key
    // NOTE(review): apt-key is deprecated; consider a keyring under
    // /etc/apt/keyrings referenced via signed-by once this path is reworked.
    await this.execCommand(
      `wget -q https://repo.radeon.com/rocm/rocm.gpg.key -O - | apt-key add -`,
    );

    // Add ROCm repository (suite is the Ubuntu codename)
    await this.execCommand(
      `echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${rocmVersion} ${codename} main" > /etc/apt/sources.list.d/rocm.list`,
    );

    // Add AMDGPU repository
    await this.execCommand(
      `echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${rocmVersion}/ubuntu ${codename} main" > /etc/apt/sources.list.d/amdgpu.list`,
    );
    await this.aptUpdate();

    // Install AMDGPU driver and ROCm
    await this.aptInstall('amdgpu-dkms');
    if (options.installToolkit) {
      await this.aptInstall('rocm-hip-sdk');
    } else {
      await this.aptInstall('rocm-smi-lib');
    }

    await this.addInvokingUserToGpuGroups();

    // Install container support if requested
    if (options.installContainerSupport) {
      await this.installContainerSupport();
    }

    logger.success('AMD ROCm installation completed');
    logger.warn('A system reboot is required to load the new driver');
    logger.info('After reboot, verify with: rocm-smi');
    return true;
  }

  /**
   * Install on RHEL-family distributions.
   *
   * Writes the ROCm and AMDGPU yum repository files and installs
   * `amdgpu-dkms` plus either `rocm-hip-sdk` or `rocm-smi-lib`.
   *
   * @returns `true` when all steps ran.
   * @throws Error when the requested ROCm version contains unsafe characters,
   *         or whatever the underlying command helpers throw.
   */
  private async installOnRhel(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing AMD ROCm on RHEL/CentOS...');

    const rocmVersion = this.resolveRocmVersion(options);
    const distro = await this.getLinuxDistro();
    // Only the major release number is used in the AMDGPU repository path.
    // NOTE(review): confirm this path layout against the target ROCm release.
    const rhelVersion = distro.version.split('.')[0];

    // Add EPEL repository
    await this.dnfInstall('epel-release');

    // Add ROCm repository
    await this.writeYumRepo(
      'rocm.repo',
      'ROCm',
      `https://repo.radeon.com/rocm/yum/${rocmVersion}/main`,
    );

    // Add AMDGPU repository
    await this.writeYumRepo(
      'amdgpu.repo',
      'amdgpu',
      `https://repo.radeon.com/amdgpu/${rocmVersion}/rhel/${rhelVersion}/main/x86_64/`,
    );

    // Install AMDGPU driver
    await this.dnfInstall('amdgpu-dkms');
    if (options.installToolkit) {
      await this.dnfInstall('rocm-hip-sdk');
    } else {
      await this.dnfInstall('rocm-smi-lib');
    }

    await this.addInvokingUserToGpuGroups();

    // Install container support if requested
    if (options.installContainerSupport) {
      await this.installContainerSupport();
    }

    logger.success('AMD ROCm installation completed');
    logger.warn('A system reboot is required to load the new driver');
    return true;
  }

  /**
   * Install container support for AMD GPUs.
   *
   * No container runtime is installed: the method verifies that `/dev/kfd`
   * exists, relaxes permissions on the device nodes and logs the Docker
   * flags to use.
   *
   * @returns `true` when the device nodes were found and configured,
   *          `false` when `/dev/kfd` is missing or a command throws.
   */
  public async installContainerSupport(): Promise<boolean> {
    logger.info('Configuring Docker for AMD ROCm...');
    try {
      // AMD ROCm containers work by passing through device files
      // No special runtime needed, just need to pass --device flags

      // Verify device files exist
      const { stdout: devices } = await this.execCommand('ls -la /dev/kfd /dev/dri/render* 2>/dev/null || true');
      if (!devices.includes('/dev/kfd')) {
        logger.warn('/dev/kfd not found. ROCm driver may not be properly loaded.');
        logger.info('Try rebooting the system after driver installation.');
        return false;
      }

      // Set permissions
      // NOTE(review): mode 666 makes the GPU nodes world-writable; consider
      // relying on video/render group membership instead.
      await this.execCommand('chmod 666 /dev/kfd /dev/dri/render* || true');

      logger.success('AMD ROCm container support configured');
      logger.info('Use the following Docker flags for ROCm containers:');
      logger.info(' --device=/dev/kfd --device=/dev/dri --group-add video');
      return true;
    } catch (error) {
      logger.error(`Failed to configure ROCm container support: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Get available ROCm versions.
   *
   * @returns A fixed list of ROCm release numbers, newest first.
   */
  public async getAvailableVersions(): Promise<string[]> {
    // ROCm has a standard set of supported versions
    return ['6.0', '5.7', '5.6', '5.5', '5.4'];
  }

  /**
   * Pick the ROCm version to install and make sure it is safe to embed in a
   * shell command and a repository URL.
   *
   * @throws Error when the version contains characters outside `[A-Za-z0-9._-]`.
   */
  private resolveRocmVersion(options: IDriverInstallOptions): string {
    // `||` on purpose: an empty string falls back to the default as well
    const version = options.toolkitVersion || AmdDriver.defaultRocmVersion;
    if (!AmdDriver.rocmVersionPattern.test(version)) {
      throw new Error(`Invalid ROCm version: ${version}`);
    }
    return version;
  }

  /**
   * Map an Ubuntu version string (e.g. "22.04" or "22.04.3") to its codename.
   *
   * @returns The codename, or `undefined` for releases without a known suite.
   */
  private static ubuntuCodenameFor(version: string): string | undefined {
    const parts = /^(\d+)\.(\d+)/.exec(version.trim());
    if (!parts) {
      return undefined;
    }
    return AmdDriver.ubuntuCodenames.get(`${parts[1]}.${parts[2]}`);
  }

  /**
   * Add the user who invoked sudo to the `video` and `render` groups.
   * Best effort: skipped when SUDO_USER is unset, and failures are ignored.
   */
  private async addInvokingUserToGpuGroups(): Promise<void> {
    await this.execCommand(
      '[ -n "$SUDO_USER" ] && usermod -a -G video,render "$SUDO_USER" || true',
    );
  }

  /**
   * Write a yum repository definition to `/etc/yum.repos.d/<fileName>`.
   * The section id doubles as the repository name; packages are verified
   * against the ROCm GPG key.
   */
  private async writeYumRepo(fileName: string, repoId: string, baseUrl: string): Promise<void> {
    const repoFile = [
      `[${repoId}]`,
      `name=${repoId}`,
      `baseurl=${baseUrl}`,
      'enabled=1',
      'gpgcheck=1',
      'gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key',
    ].join('\n');
    // Quoted delimiter: the shell writes the body verbatim, without expansion
    await this.execCommand(`cat <<'EOF' > /etc/yum.repos.d/${fileName}\n${repoFile}\nEOF`);
  }
}