/**
 * AMD Driver Management
 *
 * Handles AMD ROCm driver detection, installation, and container setup.
 */

import type { IDriverStatus } from '../interfaces/gpu.ts';
|
||
|
|
import { logger } from '../logger.ts';
|
||
|
|
import { BaseDriver, type IDriverInstallOptions } from './base-driver.ts';
|
||
|
|
|
||
|
|
/**
 * AMD ROCm Driver Manager
 *
 * Detects, installs and configures the AMD ROCm stack by running shell
 * commands through the helpers inherited from BaseDriver (`execCommand`,
 * `aptInstall`, `dnfInstall`, ...).
 */
export class AmdDriver extends BaseDriver {
  public readonly vendor = 'amd' as const;
  public readonly displayName = 'AMD ROCm';

  /** ROCm release used when the caller does not request a specific one. */
  private static readonly DEFAULT_ROCM_VERSION = '6.0';

  /**
   * Characters accepted in a requested ROCm version. The version is spliced
   * into shell commands that run as root, so anything that could be
   * interpreted by the shell is rejected.
   */
  private static readonly SAFE_VERSION_PATTERN = /^[A-Za-z0-9][A-Za-z0-9._-]*$/;

  /** Ubuntu release (major.minor) to the codename used by the AMDGPU apt repository. */
  private static readonly UBUNTU_CODENAMES: Readonly<Record<string, string>> = {
    '20.04': 'focal',
    '22.04': 'jammy',
    '24.04': 'noble',
  };

  /** Issue text reported when rocm-smi yields no driver information. */
  private static readonly DRIVER_MISSING_ISSUE = 'ROCm driver not installed or rocm-smi not available';

  /**
   * Check if AMD ROCm driver is installed
   *
   * @returns true when `rocm-smi --showdriverversion` prints a line containing
   *          "Driver"; false otherwise, including when the command throws.
   */
  public async isInstalled(): Promise<boolean> {
    try {
      const { stdout } = await this.execCommand('rocm-smi --showdriverversion 2>/dev/null | head -1', {
        timeout: 5000,
        ignoreErrors: true,
      });
      return stdout.includes('Driver');
    } catch {
      return false;
    }
  }

  /**
   * Get AMD ROCm driver status
   *
   * Runs three independent probes: driver version via rocm-smi, toolkit
   * version via /opt/rocm or rocminfo, and container readiness via the GPU
   * device nodes / `docker info`. Problems are collected in `issues`.
   *
   * @returns the collected status; this method does not throw for probe failures.
   */
  public async getStatus(): Promise<IDriverStatus> {
    const status: IDriverStatus = {
      vendor: 'amd',
      installed: false,
      containerSupport: false,
      issues: [],
    };

    // Check if rocm-smi is available
    try {
      const { stdout: driverInfo } = await this.execCommand(
        'rocm-smi --showdriverversion 2>/dev/null',
        { timeout: 5000, ignoreErrors: true },
      );

      if (driverInfo.includes('Driver')) {
        status.installed = true;
        const match = driverInfo.match(/Driver version:\s*(\S+)/i);
        if (match) {
          status.version = match[1];
        }
      } else {
        // NOTE(review): with `ignoreErrors` the command presumably does not
        // throw when rocm-smi is missing, so the catch below would never
        // record the problem. Report it here, but keep probing: /dev/kfd can
        // exist without the ROCm userspace tools.
        status.issues.push(AmdDriver.DRIVER_MISSING_ISSUE);
      }
    } catch {
      status.issues.push(AmdDriver.DRIVER_MISSING_ISSUE);
      return status;
    }

    // Check ROCm toolkit version
    try {
      const { stdout: rocmVersion } = await this.execCommand(
        'cat /opt/rocm/.info/version 2>/dev/null || rocminfo 2>/dev/null | grep "ROCm" | head -1',
        { timeout: 5000, ignoreErrors: true },
      );
      // First dotted number in the output (e.g. "6.0" or "6.0.2").
      const match = rocmVersion.match(/(\d+\.\d+(?:\.\d+)?)/);
      if (match) {
        status.toolkitVersion = match[1];
      }
    } catch {
      // ROCm toolkit version not available
    }

    // Check Docker ROCm support
    try {
      const { stdout: dockerInfo } = await this.execCommand(
        'docker info 2>/dev/null | grep -i "rocm\\|amd"',
        { timeout: 5000, ignoreErrors: true },
      );

      // ROCm containers only need the GPU device nodes passed through.
      const { stdout: deviceCheck } = await this.execCommand(
        'ls /dev/kfd /dev/dri/render* 2>/dev/null',
        { timeout: 5000, ignoreErrors: true },
      );

      // grep above is case-insensitive, so compare case-insensitively here too.
      const dockerMentionsRocm = dockerInfo.toLowerCase().includes('rocm');

      if (deviceCheck.includes('/dev/kfd') || dockerMentionsRocm) {
        status.containerSupport = true;
      } else {
        status.issues.push('ROCm device files not available for container access');
      }
    } catch {
      status.issues.push('Could not verify Docker ROCm support');
    }

    return status;
  }

  /**
   * Install AMD ROCm driver
   *
   * Requires root. Dispatches to the Ubuntu or RHEL-family installer based on
   * the detected distribution id.
   *
   * @param options install options (toolkit version, toolkit / container flags)
   * @returns true on success; false when not root, the distribution is
   *          unsupported, or any installation step throws.
   */
  public async install(options: IDriverInstallOptions): Promise<boolean> {
    if (!await this.isRoot()) {
      logger.error('Root privileges required to install AMD ROCm drivers');
      return false;
    }

    const distro = await this.getLinuxDistro();
    logger.info(`Detected Linux distribution: ${distro.id} ${distro.version}`);

    try {
      if (distro.id === 'ubuntu') {
        return await this.installOnUbuntu(options);
      } else if (distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' || distro.id === 'almalinux') {
        return await this.installOnRhel(options);
      } else {
        logger.error(`Unsupported distribution: ${distro.id}`);
        logger.info('Please install ROCm drivers manually from https://rocm.docs.amd.com/');
        return false;
      }
    } catch (error) {
      logger.error(`Failed to install AMD ROCm drivers: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Pick the ROCm version to install and make sure it is safe to embed in a
   * shell command.
   *
   * @throws Error when the requested version contains characters outside
   *         letters, digits, '.', '_' and '-'.
   */
  private resolveRocmVersion(options: IDriverInstallOptions): string {
    // `||` (not `??`) on purpose: an empty string also falls back to the default.
    const version = options.toolkitVersion || AmdDriver.DEFAULT_ROCM_VERSION;
    if (!AmdDriver.SAFE_VERSION_PATTERN.test(version)) {
      throw new Error(`Invalid ROCm version: ${JSON.stringify(version)}`);
    }
    return version;
  }

  /**
   * Map an Ubuntu version string (e.g. "22.04" or "22.04.3") to the codename
   * used in the AMDGPU repository path. Unknown releases fall back to 'focal'
   * with a warning.
   */
  private resolveUbuntuCodename(version: string): string {
    const release = version.match(/^(\d+\.\d+)/);
    const codename = release ? AmdDriver.UBUNTU_CODENAMES[release[1]] : undefined;
    if (codename === undefined) {
      logger.warn(`Unrecognized Ubuntu version "${version}", falling back to the focal AMDGPU repository`);
      return 'focal';
    }
    return codename;
  }

  /** Add the invoking sudo user to the video and render groups (best effort). */
  private async addUserToGpuGroups(): Promise<void> {
    // `|| true`: do not fail the installation when SUDO_USER is unset or usermod fails.
    await this.execCommand('usermod -a -G video,render $SUDO_USER || true');
  }

  /** Build the text of a yum/dnf .repo file signed with the ROCm GPG key. */
  private buildYumRepo(id: string, baseUrl: string): string {
    return [
      `[${id}]`,
      `name=${id}`,
      `baseurl=${baseUrl}`,
      'enabled=1',
      'gpgcheck=1',
      'gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key',
    ].join('\n');
  }

  /**
   * Install on Ubuntu
   *
   * Registers the ROCm and AMDGPU apt repositories, installs `amdgpu-dkms`
   * plus either the HIP SDK or just the SMI library, and optionally configures
   * container support.
   *
   * @throws whatever the underlying commands throw; `install()` catches it.
   */
  private async installOnUbuntu(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing AMD ROCm on Ubuntu...');

    // Validate inputs before touching the system.
    const rocmVersion = this.resolveRocmVersion(options);
    const { version: ubuntuVersion } = await this.getLinuxDistro();
    const codename = this.resolveUbuntuCodename(ubuntuVersion);

    // Install prerequisites
    await this.aptUpdate();
    await this.aptInstall(['wget', 'gnupg2']);

    // Store the repository key in a dedicated keyring referenced via
    // `signed-by` instead of the deprecated `apt-key add`, which would trust
    // the key for every configured repository.
    const keyring = '/etc/apt/keyrings/rocm.gpg';
    await this.execCommand('mkdir -p -m 0755 /etc/apt/keyrings');
    await this.execCommand(
      `wget -q https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --batch --yes --dearmor -o ${keyring}`,
    );

    // Add ROCm repository
    // NOTE(review): the suite name "ubuntu" is kept from the previous
    // implementation; AMD's current instructions use the release codename
    // here as well. Confirm against repo.radeon.com.
    await this.execCommand(
      `echo "deb [arch=amd64 signed-by=${keyring}] https://repo.radeon.com/rocm/apt/${rocmVersion} ubuntu main" > /etc/apt/sources.list.d/rocm.list`,
    );

    // Add AMDGPU repository
    await this.execCommand(
      `echo "deb [arch=amd64 signed-by=${keyring}] https://repo.radeon.com/amdgpu/${rocmVersion}/ubuntu ${codename} main" > /etc/apt/sources.list.d/amdgpu.list`,
    );

    await this.aptUpdate();

    // Install AMDGPU driver and ROCm
    await this.aptInstall('amdgpu-dkms');

    if (options.installToolkit) {
      await this.aptInstall('rocm-hip-sdk');
    } else {
      await this.aptInstall('rocm-smi-lib');
    }

    // Add user to video and render groups
    await this.addUserToGpuGroups();

    // Install container support if requested
    if (options.installContainerSupport) {
      await this.installContainerSupport();
    }

    logger.success('AMD ROCm installation completed');
    logger.warn('A system reboot is required to load the new driver');
    logger.info('After reboot, verify with: rocm-smi');
    return true;
  }

  /**
   * Install on RHEL
   *
   * Used for rhel, centos, rocky and almalinux. Writes the ROCm and AMDGPU
   * repository definitions, installs `amdgpu-dkms` plus either the HIP SDK or
   * just the SMI library, and optionally configures container support.
   *
   * @throws whatever the underlying commands throw; `install()` catches it.
   */
  private async installOnRhel(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing AMD ROCm on RHEL/CentOS...');

    // Validate inputs before touching the system.
    const rocmVersion = this.resolveRocmVersion(options);
    const distro = await this.getLinuxDistro();
    const rhelVersion = distro.version.split('.')[0] ?? '';
    if (!/^\d+$/.test(rhelVersion)) {
      // The major version becomes part of a repository URL inside a shell heredoc.
      throw new Error(`Cannot determine RHEL major version from "${distro.version}"`);
    }

    // Add EPEL repository
    await this.dnfInstall('epel-release');

    // Add ROCm repository
    const rocmRepo = this.buildYumRepo('ROCm', `https://repo.radeon.com/rocm/yum/${rocmVersion}/main`);
    await this.execCommand(`cat <<EOF > /etc/yum.repos.d/rocm.repo\n${rocmRepo}\nEOF`);

    // Add AMDGPU repository
    const amdgpuRepo = this.buildYumRepo(
      'amdgpu',
      `https://repo.radeon.com/amdgpu/${rocmVersion}/rhel/${rhelVersion}/main/x86_64/`,
    );
    await this.execCommand(`cat <<EOF > /etc/yum.repos.d/amdgpu.repo\n${amdgpuRepo}\nEOF`);

    // Install AMDGPU driver
    await this.dnfInstall('amdgpu-dkms');

    if (options.installToolkit) {
      await this.dnfInstall('rocm-hip-sdk');
    } else {
      await this.dnfInstall('rocm-smi-lib');
    }

    // Add user to video and render groups
    await this.addUserToGpuGroups();

    // Install container support if requested
    if (options.installContainerSupport) {
      await this.installContainerSupport();
    }

    logger.success('AMD ROCm installation completed');
    logger.warn('A system reboot is required to load the new driver');
    return true;
  }

  /**
   * Install container support for AMD GPUs
   *
   * No container runtime is installed: this verifies that /dev/kfd exists,
   * relaxes permissions on the GPU device nodes and logs the Docker flags to
   * use.
   *
   * @returns true when the device nodes were found and configured; false when
   *          /dev/kfd is missing or a command throws.
   */
  public async installContainerSupport(): Promise<boolean> {
    logger.info('Configuring Docker for AMD ROCm...');

    try {
      // AMD ROCm containers work by passing through device files
      // No special runtime needed, just need to pass --device flags

      // Verify device files exist
      const { stdout: devices } = await this.execCommand('ls -la /dev/kfd /dev/dri/render* 2>/dev/null || true');

      if (!devices.includes('/dev/kfd')) {
        logger.warn('/dev/kfd not found. ROCm driver may not be properly loaded.');
        logger.info('Try rebooting the system after driver installation.');
        return false;
      }

      // Set permissions
      // NOTE(review): mode 666 makes the GPU device nodes writable by every
      // local user; group-based access (video/render) would be tighter.
      await this.execCommand('chmod 666 /dev/kfd /dev/dri/render* || true');

      logger.success('AMD ROCm container support configured');
      logger.info('Use the following Docker flags for ROCm containers:');
      logger.info('  --device=/dev/kfd --device=/dev/dri --group-add video');
      return true;
    } catch (error) {
      logger.error(`Failed to configure ROCm container support: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Get available ROCm versions
   *
   * @returns a fixed list of ROCm release numbers, newest first; nothing is
   *          queried at runtime.
   */
  public async getAvailableVersions(): Promise<string[]> {
    // ROCm has a standard set of supported versions
    return ['6.0', '5.7', '5.6', '5.5', '5.4'];
  }
}
|