feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
+27
-9
@@ -20,10 +20,13 @@ export class AmdDriver extends BaseDriver {
|
||||
*/
|
||||
public async isInstalled(): Promise<boolean> {
|
||||
try {
|
||||
const { stdout } = await this.execCommand('rocm-smi --showdriverversion 2>/dev/null | head -1', {
|
||||
timeout: 5000,
|
||||
ignoreErrors: true,
|
||||
});
|
||||
const { stdout } = await this.execCommand(
|
||||
'rocm-smi --showdriverversion 2>/dev/null | head -1',
|
||||
{
|
||||
timeout: 5000,
|
||||
ignoreErrors: true,
|
||||
},
|
||||
);
|
||||
return stdout.includes('Driver');
|
||||
} catch {
|
||||
return false;
|
||||
@@ -114,7 +117,10 @@ export class AmdDriver extends BaseDriver {
|
||||
try {
|
||||
if (distro.id === 'ubuntu') {
|
||||
return await this.installOnUbuntu(options);
|
||||
} else if (distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' || distro.id === 'almalinux') {
|
||||
} else if (
|
||||
distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' ||
|
||||
distro.id === 'almalinux'
|
||||
) {
|
||||
return await this.installOnRhel(options);
|
||||
} else {
|
||||
logger.error(`Unsupported distribution: ${distro.id}`);
|
||||
@@ -122,7 +128,11 @@ export class AmdDriver extends BaseDriver {
|
||||
return false;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Failed to install AMD ROCm drivers: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.error(
|
||||
`Failed to install AMD ROCm drivers: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -152,7 +162,9 @@ export class AmdDriver extends BaseDriver {
|
||||
|
||||
// Add AMDGPU repository
|
||||
await this.execCommand(
|
||||
`echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${rocmVersion}/ubuntu ${ubuntuVersion === '2204' ? 'jammy' : 'focal'} main" > /etc/apt/sources.list.d/amdgpu.list`,
|
||||
`echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${rocmVersion}/ubuntu ${
|
||||
ubuntuVersion === '2204' ? 'jammy' : 'focal'
|
||||
} main" > /etc/apt/sources.list.d/amdgpu.list`,
|
||||
);
|
||||
|
||||
await this.aptUpdate();
|
||||
@@ -250,7 +262,9 @@ EOF`,
|
||||
// No special runtime needed, just need to pass --device flags
|
||||
|
||||
// Verify device files exist
|
||||
const { stdout: devices } = await this.execCommand('ls -la /dev/kfd /dev/dri/render* 2>/dev/null || true');
|
||||
const { stdout: devices } = await this.execCommand(
|
||||
'ls -la /dev/kfd /dev/dri/render* 2>/dev/null || true',
|
||||
);
|
||||
|
||||
if (!devices.includes('/dev/kfd')) {
|
||||
logger.warn('/dev/kfd not found. ROCm driver may not be properly loaded.');
|
||||
@@ -266,7 +280,11 @@ EOF`,
|
||||
logger.info(' --device=/dev/kfd --device=/dev/dri --group-add video');
|
||||
return true;
|
||||
} catch (error) {
|
||||
logger.error(`Failed to configure ROCm container support: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.error(
|
||||
`Failed to configure ROCm container support: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user