initial
This commit is contained in:
281
ts/drivers/amd.ts
Normal file
281
ts/drivers/amd.ts
Normal file
@@ -0,0 +1,281 @@
|
||||
/**
|
||||
* AMD Driver Management
|
||||
*
|
||||
* Handles AMD ROCm driver detection, installation, and container setup.
|
||||
*/
|
||||
|
||||
import type { IDriverStatus } from '../interfaces/gpu.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
import { BaseDriver, type IDriverInstallOptions } from './base-driver.ts';
|
||||
|
||||
/**
 * AMD ROCm Driver Manager
 *
 * Detects an AMD ROCm installation through `rocm-smi`, reports its status,
 * and installs the AMDGPU kernel driver plus ROCm packages on Ubuntu and on
 * RHEL-family distributions. Container support is provided by passing the
 * `/dev/kfd` and `/dev/dri` device nodes to Docker; no special runtime is
 * installed.
 */
export class AmdDriver extends BaseDriver {
  public readonly vendor = 'amd' as const;
  public readonly displayName = 'AMD ROCm';

  /** ROCm release used when the caller does not request a specific one. */
  private static readonly DEFAULT_ROCM_VERSION = '6.0';

  /** Distribution ids that are handled by the RHEL installer. */
  private static readonly RHEL_FAMILY_IDS: readonly string[] = ['rhel', 'centos', 'rocky', 'almalinux'];

  /** Ubuntu release (major.minor) mapped to the APT suite codename of the AMD repositories. */
  private static readonly UBUNTU_CODENAMES: Readonly<Record<string, string>> = {
    '20.04': 'focal',
    '22.04': 'jammy',
    '24.04': 'noble',
  };

  /**
   * Characters accepted in a ROCm version before it is interpolated into
   * shell commands and repository URLs. Deliberately permits non-numeric
   * release names (e.g. `latest`) while excluding shell metacharacters.
   */
  private static readonly SAFE_VERSION_PATTERN = /^[A-Za-z0-9][A-Za-z0-9._-]*$/;

  /** Location of the dearmored AMD repository signing key on Ubuntu. */
  private static readonly APT_KEYRING_PATH = '/etc/apt/keyrings/rocm.gpg';

  /**
   * Check if AMD ROCm driver is installed
   *
   * @returns `true` when the first line printed by
   *          `rocm-smi --showdriverversion` contains `Driver`; `false`
   *          otherwise, including when running the command throws.
   */
  public async isInstalled(): Promise<boolean> {
    try {
      const { stdout } = await this.execCommand('rocm-smi --showdriverversion 2>/dev/null | head -1', {
        timeout: 5000,
        ignoreErrors: true,
      });
      return stdout.includes('Driver');
    } catch {
      return false;
    }
  }

  /**
   * Get AMD ROCm driver status
   *
   * Runs three independent probes: the driver version via `rocm-smi`, the
   * ROCm toolkit version via `/opt/rocm/.info/version` or `rocminfo`, and
   * container readiness via `docker info` and the `/dev/kfd` device node.
   *
   * @returns The collected status. `issues` lists every problem found; it is
   *          never left empty when the driver could not be detected.
   */
  public async getStatus(): Promise<IDriverStatus> {
    const status: IDriverStatus = {
      vendor: 'amd',
      installed: false,
      containerSupport: false,
      issues: [],
    };

    // Check if rocm-smi is available
    try {
      const { stdout: driverInfo } = await this.execCommand(
        'rocm-smi --showdriverversion 2>/dev/null',
        { timeout: 5000, ignoreErrors: true },
      );

      if (driverInfo.includes('Driver')) {
        status.installed = true;
        const match = driverInfo.match(/Driver version:\s*(\S+)/i);
        if (match) {
          status.version = match[1];
        }
      } else {
        // NOTE(review): with `ignoreErrors: true` a missing rocm-smi presumably
        // yields empty output instead of throwing (execCommand is defined
        // elsewhere - confirm). Record the problem here as well so the caller
        // is not handed `installed: false` without any explanation.
        status.issues.push('ROCm driver not installed or rocm-smi not available');
      }
    } catch {
      status.issues.push('ROCm driver not installed or rocm-smi not available');
      return status;
    }

    // Check ROCm toolkit version
    try {
      const { stdout: rocmVersion } = await this.execCommand(
        'cat /opt/rocm/.info/version 2>/dev/null || rocminfo 2>/dev/null | grep "ROCm" | head -1',
        { timeout: 5000, ignoreErrors: true },
      );
      const match = rocmVersion.match(/(\d+\.\d+(?:\.\d+)?)/);
      if (match) {
        status.toolkitVersion = match[1];
      }
    } catch {
      // ROCm toolkit version not available
    }

    // Check Docker ROCm support
    try {
      const { stdout: dockerInfo } = await this.execCommand(
        'docker info 2>/dev/null | grep -i "rocm\\|amd"',
        { timeout: 5000, ignoreErrors: true },
      );

      // Check if rocm/pytorch or similar images can run
      const { stdout: deviceCheck } = await this.execCommand(
        'ls /dev/kfd /dev/dri/render* 2>/dev/null',
        { timeout: 5000, ignoreErrors: true },
      );

      // grep above is case-insensitive, so the match on its output must be too
      // (otherwise "ROCm" would be filtered in by grep and then missed here).
      if (deviceCheck.includes('/dev/kfd') || /rocm/i.test(dockerInfo)) {
        status.containerSupport = true;
      } else {
        status.issues.push('ROCm device files not available for container access');
      }
    } catch {
      status.issues.push('Could not verify Docker ROCm support');
    }

    return status;
  }

  /**
   * Install AMD ROCm driver
   *
   * Requires root. Dispatches to the Ubuntu or RHEL-family installer based
   * on the detected distribution id.
   *
   * @param options Installation options (ROCm version, toolkit and container
   *                support switches).
   * @returns `true` when installation completed; `false` when not running as
   *          root, when the distribution is unsupported, or when any
   *          installation step throws (the error is logged, not rethrown).
   */
  public async install(options: IDriverInstallOptions): Promise<boolean> {
    if (!await this.isRoot()) {
      logger.error('Root privileges required to install AMD ROCm drivers');
      return false;
    }

    const distro = await this.getLinuxDistro();
    logger.info(`Detected Linux distribution: ${distro.id} ${distro.version}`);

    try {
      if (distro.id === 'ubuntu') {
        return await this.installOnUbuntu(options);
      }
      if (AmdDriver.RHEL_FAMILY_IDS.includes(distro.id)) {
        return await this.installOnRhel(options);
      }

      logger.error(`Unsupported distribution: ${distro.id}`);
      logger.info('Please install ROCm drivers manually from https://rocm.docs.amd.com/');
      return false;
    } catch (error) {
      logger.error(`Failed to install AMD ROCm drivers: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Install on Ubuntu
   *
   * Registers the AMD signing key and the ROCm and AMDGPU APT repositories,
   * then installs `amdgpu-dkms` and either `rocm-hip-sdk` (toolkit requested)
   * or `rocm-smi-lib`.
   *
   * @param options Installation options.
   * @returns `true` on completion.
   * @throws Error when the requested ROCm version contains unsafe characters
   *         or the Ubuntu release has no known repository codename. Both are
   *         checked before the system is modified.
   */
  private async installOnUbuntu(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing AMD ROCm on Ubuntu...');

    // Validate all inputs first so a bad value cannot leave half-written repo files behind.
    const rocmVersion = this.resolveRocmVersion(options);
    const codename = this.resolveUbuntuCodename((await this.getLinuxDistro()).version);
    const keyring = AmdDriver.APT_KEYRING_PATH;

    // Install prerequisites
    await this.aptUpdate();
    await this.aptInstall(['wget', 'gnupg2']);

    // Register the signing key in a dedicated keyring referenced via `signed-by`.
    // `apt-key add` is deprecated and trusts the key for every repository.
    await this.execCommand('mkdir -p -m 0755 /etc/apt/keyrings');
    await this.execCommand(
      `wget -q https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > ${keyring}`,
    );

    // Add ROCm repository
    await this.execCommand(
      `echo "deb [arch=amd64 signed-by=${keyring}] https://repo.radeon.com/rocm/apt/${rocmVersion} ${codename} main" > /etc/apt/sources.list.d/rocm.list`,
    );

    // Add AMDGPU repository
    await this.execCommand(
      `echo "deb [arch=amd64 signed-by=${keyring}] https://repo.radeon.com/amdgpu/${rocmVersion}/ubuntu ${codename} main" > /etc/apt/sources.list.d/amdgpu.list`,
    );

    await this.aptUpdate();

    // Install AMDGPU driver and ROCm
    await this.aptInstall('amdgpu-dkms');
    await this.aptInstall(options.installToolkit ? 'rocm-hip-sdk' : 'rocm-smi-lib');

    await this.finishInstallation(options);
    logger.info('After reboot, verify with: rocm-smi');
    return true;
  }

  /**
   * Install on RHEL
   *
   * Enables EPEL, writes the ROCm and AMDGPU yum repository files, then
   * installs `amdgpu-dkms` and either `rocm-hip-sdk` (toolkit requested) or
   * `rocm-smi-lib`.
   *
   * @param options Installation options.
   * @returns `true` on completion.
   * @throws Error when the requested ROCm version contains unsafe characters
   *         (checked before the system is modified).
   */
  private async installOnRhel(options: IDriverInstallOptions): Promise<boolean> {
    logger.info('Installing AMD ROCm on RHEL/CentOS...');

    const rocmVersion = this.resolveRocmVersion(options);
    const distro = await this.getLinuxDistro();
    // Only the major release number appears in the AMDGPU repository path.
    const rhelVersion = distro.version.split('.')[0];

    // Add EPEL repository
    await this.dnfInstall('epel-release');

    // Add ROCm repository
    await this.writeYumRepoFile(
      'rocm.repo',
      'ROCm',
      `https://repo.radeon.com/rocm/yum/${rocmVersion}/main`,
    );

    // Add AMDGPU repository
    await this.writeYumRepoFile(
      'amdgpu.repo',
      'amdgpu',
      `https://repo.radeon.com/amdgpu/${rocmVersion}/rhel/${rhelVersion}/main/x86_64/`,
    );

    // Install AMDGPU driver
    await this.dnfInstall('amdgpu-dkms');
    await this.dnfInstall(options.installToolkit ? 'rocm-hip-sdk' : 'rocm-smi-lib');

    await this.finishInstallation(options);
    return true;
  }

  /**
   * Install container support for AMD GPUs
   *
   * Verifies that `/dev/kfd` exists, opens up permissions on the device
   * nodes, and logs the Docker flags needed to use them.
   *
   * @returns `true` when the device nodes were found and configured; `false`
   *          when `/dev/kfd` is missing or a command throws.
   */
  public async installContainerSupport(): Promise<boolean> {
    logger.info('Configuring Docker for AMD ROCm...');

    try {
      // AMD ROCm containers work by passing through device files
      // No special runtime needed, just need to pass --device flags

      // Verify device files exist
      const { stdout: devices } = await this.execCommand('ls -la /dev/kfd /dev/dri/render* 2>/dev/null || true');

      if (!devices.includes('/dev/kfd')) {
        logger.warn('/dev/kfd not found. ROCm driver may not be properly loaded.');
        logger.info('Try rebooting the system after driver installation.');
        return false;
      }

      // Set permissions
      // NOTE(review): mode 666 makes the GPU device nodes writable by every
      // local user, and the change is presumably lost when the nodes are
      // recreated (e.g. on reboot). Group membership (video/render) may be the
      // safer long-term mechanism - confirm before relying on this.
      await this.execCommand('chmod 666 /dev/kfd /dev/dri/render* || true');

      logger.success('AMD ROCm container support configured');
      logger.info('Use the following Docker flags for ROCm containers:');
      logger.info('  --device=/dev/kfd --device=/dev/dri --group-add video');
      return true;
    } catch (error) {
      logger.error(`Failed to configure ROCm container support: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Get available ROCm versions
   *
   * @returns A fixed list of ROCm releases, newest first. The list is
   *          hard-coded; nothing is queried at runtime.
   */
  public async getAvailableVersions(): Promise<string[]> {
    // ROCm has a standard set of supported versions
    return ['6.0', '5.7', '5.6', '5.5', '5.4'];
  }

  /**
   * Pick the ROCm version to install and make sure it is safe to embed in a
   * shell command.
   *
   * @param options Installation options; an empty or missing
   *                `toolkitVersion` selects the default release.
   * @returns The validated version string.
   * @throws Error when the version contains characters outside
   *         `SAFE_VERSION_PATTERN`.
   */
  private resolveRocmVersion(options: IDriverInstallOptions): string {
    // `||` rather than `??` on purpose: an empty string must fall back to the default too.
    const version = options.toolkitVersion || AmdDriver.DEFAULT_ROCM_VERSION;
    if (!AmdDriver.SAFE_VERSION_PATTERN.test(version)) {
      throw new Error(`Invalid ROCm version: ${version}`);
    }
    return version;
  }

  /**
   * Translate an Ubuntu version string into its APT suite codename.
   *
   * @param version Version as reported for the distribution, e.g. `22.04`
   *                or `22.04.3`; only the major.minor part is used.
   * @returns The codename (`focal`, `jammy`, `noble`).
   * @throws Error when the release is not in `UBUNTU_CODENAMES`, instead of
   *         silently configuring the repository of a different release.
   */
  private resolveUbuntuCodename(version: string): string {
    const release = version.match(/^\d+\.\d+/)?.[0] ?? version;
    const codename = AmdDriver.UBUNTU_CODENAMES[release];
    if (!codename) {
      throw new Error(`Unsupported Ubuntu release for AMD ROCm: ${version}`);
    }
    return codename;
  }

  /**
   * Write a yum repository definition signed with the AMD ROCm key.
   *
   * The file body is assembled here so that every line starts in column 0
   * regardless of how this source file is indented (leading whitespace would
   * break both the heredoc terminator and the `.repo` syntax).
   *
   * @param fileName File name inside `/etc/yum.repos.d/`.
   * @param repoId   Repository id, also used as its display name.
   * @param baseUrl  Repository base URL.
   */
  private async writeYumRepoFile(fileName: string, repoId: string, baseUrl: string): Promise<void> {
    const contents = [
      `[${repoId}]`,
      `name=${repoId}`,
      `baseurl=${baseUrl}`,
      'enabled=1',
      'gpgcheck=1',
      'gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key',
    ].join('\n');

    // Quoted delimiter: the shell copies the text verbatim without expansion.
    await this.execCommand(`cat <<'EOF' > /etc/yum.repos.d/${fileName}\n${contents}\nEOF`);
  }

  /**
   * Post-install steps shared by all distributions: grant the invoking user
   * access to the GPU groups, optionally configure container support, and
   * tell the operator that a reboot is needed.
   *
   * @param options Installation options; `installContainerSupport` controls
   *                whether container setup is attempted.
   */
  private async finishInstallation(options: IDriverInstallOptions): Promise<void> {
    // Add user to video and render groups
    // Best effort: `$SUDO_USER` is unset when not invoked through sudo, in
    // which case usermod fails and `|| true` swallows the error.
    await this.execCommand('usermod -a -G video,render $SUDO_USER || true');

    // Install container support if requested
    if (options.installContainerSupport) {
      await this.installContainerSupport();
    }

    logger.success('AMD ROCm installation completed');
    logger.warn('A system reboot is required to load the new driver');
  }
}
|
||||
Reference in New Issue
Block a user