feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
+27
-9
@@ -20,10 +20,13 @@ export class AmdDriver extends BaseDriver {
|
||||
*/
|
||||
public async isInstalled(): Promise<boolean> {
|
||||
try {
|
||||
const { stdout } = await this.execCommand('rocm-smi --showdriverversion 2>/dev/null | head -1', {
|
||||
timeout: 5000,
|
||||
ignoreErrors: true,
|
||||
});
|
||||
const { stdout } = await this.execCommand(
|
||||
'rocm-smi --showdriverversion 2>/dev/null | head -1',
|
||||
{
|
||||
timeout: 5000,
|
||||
ignoreErrors: true,
|
||||
},
|
||||
);
|
||||
return stdout.includes('Driver');
|
||||
} catch {
|
||||
return false;
|
||||
@@ -114,7 +117,10 @@ export class AmdDriver extends BaseDriver {
|
||||
try {
|
||||
if (distro.id === 'ubuntu') {
|
||||
return await this.installOnUbuntu(options);
|
||||
} else if (distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' || distro.id === 'almalinux') {
|
||||
} else if (
|
||||
distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' ||
|
||||
distro.id === 'almalinux'
|
||||
) {
|
||||
return await this.installOnRhel(options);
|
||||
} else {
|
||||
logger.error(`Unsupported distribution: ${distro.id}`);
|
||||
@@ -122,7 +128,11 @@ export class AmdDriver extends BaseDriver {
|
||||
return false;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Failed to install AMD ROCm drivers: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.error(
|
||||
`Failed to install AMD ROCm drivers: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -152,7 +162,9 @@ export class AmdDriver extends BaseDriver {
|
||||
|
||||
// Add AMDGPU repository
|
||||
await this.execCommand(
|
||||
`echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${rocmVersion}/ubuntu ${ubuntuVersion === '2204' ? 'jammy' : 'focal'} main" > /etc/apt/sources.list.d/amdgpu.list`,
|
||||
`echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${rocmVersion}/ubuntu ${
|
||||
ubuntuVersion === '2204' ? 'jammy' : 'focal'
|
||||
} main" > /etc/apt/sources.list.d/amdgpu.list`,
|
||||
);
|
||||
|
||||
await this.aptUpdate();
|
||||
@@ -250,7 +262,9 @@ EOF`,
|
||||
// No special runtime needed, just need to pass --device flags
|
||||
|
||||
// Verify device files exist
|
||||
const { stdout: devices } = await this.execCommand('ls -la /dev/kfd /dev/dri/render* 2>/dev/null || true');
|
||||
const { stdout: devices } = await this.execCommand(
|
||||
'ls -la /dev/kfd /dev/dri/render* 2>/dev/null || true',
|
||||
);
|
||||
|
||||
if (!devices.includes('/dev/kfd')) {
|
||||
logger.warn('/dev/kfd not found. ROCm driver may not be properly loaded.');
|
||||
@@ -266,7 +280,11 @@ EOF`,
|
||||
logger.info(' --device=/dev/kfd --device=/dev/dri --group-add video');
|
||||
return true;
|
||||
} catch (error) {
|
||||
logger.error(`Failed to configure ROCm container support: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.error(
|
||||
`Failed to configure ROCm container support: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -177,7 +177,9 @@ export abstract class BaseDriver {
|
||||
protected async addAptRepository(repo: string, keyUrl?: string): Promise<void> {
|
||||
if (keyUrl) {
|
||||
// Add GPG key
|
||||
await this.execCommand(`curl -fsSL ${keyUrl} | gpg --dearmor -o /usr/share/keyrings/$(basename ${keyUrl}).gpg`);
|
||||
await this.execCommand(
|
||||
`curl -fsSL ${keyUrl} | gpg --dearmor -o /usr/share/keyrings/$(basename ${keyUrl}).gpg`,
|
||||
);
|
||||
}
|
||||
await this.execCommand(`add-apt-repository -y "${repo}"`);
|
||||
}
|
||||
@@ -188,7 +190,11 @@ export abstract class BaseDriver {
|
||||
public async logStatus(): Promise<void> {
|
||||
const status = await this.getStatus();
|
||||
|
||||
logger.logBoxTitle(`${this.displayName} Driver Status`, 60, status.installed ? 'success' : 'warning');
|
||||
logger.logBoxTitle(
|
||||
`${this.displayName} Driver Status`,
|
||||
60,
|
||||
status.installed ? 'success' : 'warning',
|
||||
);
|
||||
logger.logBoxLine(`Installed: ${status.installed ? 'Yes' : 'No'}`);
|
||||
|
||||
if (status.installed) {
|
||||
|
||||
@@ -21,7 +21,7 @@ export class DriverManager {
|
||||
|
||||
constructor() {
|
||||
this.gpuDetector = new GpuDetector();
|
||||
this.drivers = new Map([
|
||||
this.drivers = new Map<TGpuVendor, BaseDriver>([
|
||||
['nvidia', new NvidiaDriver()],
|
||||
['amd', new AmdDriver()],
|
||||
['intel', new IntelDriver()],
|
||||
@@ -197,10 +197,15 @@ export class DriverManager {
|
||||
// Print status for each vendor
|
||||
for (const [vendor, gpuList] of vendorGpus) {
|
||||
if (vendor === 'unknown') {
|
||||
logger.logBox('Unknown GPUs', [
|
||||
`${gpuList.length} GPU(s) with unknown vendor`,
|
||||
'Manual driver installation may be required',
|
||||
], 50, 'warning');
|
||||
logger.logBox(
|
||||
'Unknown GPUs',
|
||||
[
|
||||
`${gpuList.length} GPU(s) with unknown vendor`,
|
||||
'Manual driver installation may be required',
|
||||
],
|
||||
50,
|
||||
'warning',
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -219,9 +224,7 @@ export class DriverManager {
|
||||
const args: string[] = [];
|
||||
|
||||
// Filter to specific GPUs if provided
|
||||
const targetGpus = gpuIds
|
||||
? gpus.filter((g) => gpuIds.includes(g.id))
|
||||
: gpus;
|
||||
const targetGpus = gpuIds ? gpus.filter((g) => gpuIds.includes(g.id)) : gpus;
|
||||
|
||||
if (targetGpus.length === 0) {
|
||||
return args;
|
||||
|
||||
+18
-4
@@ -138,7 +138,11 @@ export class IntelDriver extends BaseDriver {
|
||||
return false;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Failed to install Intel drivers: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.error(
|
||||
`Failed to install Intel drivers: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -159,7 +163,11 @@ export class IntelDriver extends BaseDriver {
|
||||
);
|
||||
|
||||
const distro = await this.getLinuxDistro();
|
||||
const ubuntuCodename = distro.version === '22.04' ? 'jammy' : distro.version === '24.04' ? 'noble' : 'jammy';
|
||||
const ubuntuCodename = distro.version === '22.04'
|
||||
? 'jammy'
|
||||
: distro.version === '24.04'
|
||||
? 'noble'
|
||||
: 'jammy';
|
||||
|
||||
await this.execCommand(
|
||||
`echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu ${ubuntuCodename} arc" > /etc/apt/sources.list.d/intel-graphics.list`,
|
||||
@@ -308,7 +316,9 @@ EOF`,
|
||||
try {
|
||||
// Intel GPUs work by passing through device files
|
||||
// Verify render devices exist
|
||||
const { stdout: devices } = await this.execCommand('ls -la /dev/dri/renderD* 2>/dev/null || true');
|
||||
const { stdout: devices } = await this.execCommand(
|
||||
'ls -la /dev/dri/renderD* 2>/dev/null || true',
|
||||
);
|
||||
|
||||
if (!devices.includes('renderD')) {
|
||||
logger.warn('/dev/dri/renderD* not found. Intel GPU driver may not be properly loaded.');
|
||||
@@ -323,7 +333,11 @@ EOF`,
|
||||
logger.info(' --device=/dev/dri --group-add render');
|
||||
return true;
|
||||
} catch (error) {
|
||||
logger.error(`Failed to configure Intel container support: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.error(
|
||||
`Failed to configure Intel container support: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
+37
-14
@@ -20,10 +20,13 @@ export class NvidiaDriver extends BaseDriver {
|
||||
*/
|
||||
public async isInstalled(): Promise<boolean> {
|
||||
try {
|
||||
const { stdout } = await this.execCommand('nvidia-smi --query-gpu=driver_version --format=csv,noheader', {
|
||||
timeout: 5000,
|
||||
ignoreErrors: true,
|
||||
});
|
||||
const { stdout } = await this.execCommand(
|
||||
'nvidia-smi --query-gpu=driver_version --format=csv,noheader',
|
||||
{
|
||||
timeout: 5000,
|
||||
ignoreErrors: true,
|
||||
},
|
||||
);
|
||||
return stdout.trim().length > 0;
|
||||
} catch {
|
||||
return false;
|
||||
@@ -115,7 +118,10 @@ export class NvidiaDriver extends BaseDriver {
|
||||
try {
|
||||
if (distro.id === 'ubuntu' || distro.id === 'debian') {
|
||||
return await this.installOnDebian(options);
|
||||
} else if (distro.id === 'fedora' || distro.id === 'rhel' || distro.id === 'centos' || distro.id === 'rocky' || distro.id === 'almalinux') {
|
||||
} else if (
|
||||
distro.id === 'fedora' || distro.id === 'rhel' || distro.id === 'centos' ||
|
||||
distro.id === 'rocky' || distro.id === 'almalinux'
|
||||
) {
|
||||
return await this.installOnRhel(options);
|
||||
} else {
|
||||
logger.error(`Unsupported distribution: ${distro.id}`);
|
||||
@@ -123,7 +129,11 @@ export class NvidiaDriver extends BaseDriver {
|
||||
return false;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Failed to install NVIDIA drivers: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.error(
|
||||
`Failed to install NVIDIA drivers: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -181,7 +191,9 @@ export class NvidiaDriver extends BaseDriver {
|
||||
|
||||
// Add NVIDIA CUDA repository
|
||||
const distro = await this.getLinuxDistro();
|
||||
const repoUrl = `https://developer.download.nvidia.com/compute/cuda/repos/rhel${distro.version.split('.')[0]}/x86_64/cuda-rhel${distro.version.split('.')[0]}.repo`;
|
||||
const repoUrl = `https://developer.download.nvidia.com/compute/cuda/repos/rhel${
|
||||
distro.version.split('.')[0]
|
||||
}/x86_64/cuda-rhel${distro.version.split('.')[0]}.repo`;
|
||||
|
||||
await this.execCommand(`dnf config-manager --add-repo ${repoUrl}`);
|
||||
|
||||
@@ -213,8 +225,11 @@ export class NvidiaDriver extends BaseDriver {
|
||||
|
||||
if (distro.id === 'ubuntu' || distro.id === 'debian') {
|
||||
// Add CUDA repository
|
||||
const cudaKeyUrl = 'https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb';
|
||||
await this.execCommand(`wget -q ${cudaKeyUrl} -O /tmp/cuda-keyring.deb && dpkg -i /tmp/cuda-keyring.deb`);
|
||||
const cudaKeyUrl =
|
||||
'https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb';
|
||||
await this.execCommand(
|
||||
`wget -q ${cudaKeyUrl} -O /tmp/cuda-keyring.deb && dpkg -i /tmp/cuda-keyring.deb`,
|
||||
);
|
||||
await this.aptUpdate();
|
||||
|
||||
const cudaPackage = options.toolkitVersion
|
||||
@@ -247,8 +262,8 @@ export class NvidiaDriver extends BaseDriver {
|
||||
const distribution = `${distro.id}${distro.version}`;
|
||||
await this.execCommand(
|
||||
`curl -s -L https://nvidia.github.io/libnvidia-container/${distribution}/libnvidia-container.list | ` +
|
||||
'sed "s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g" | ' +
|
||||
'tee /etc/apt/sources.list.d/nvidia-container-toolkit.list',
|
||||
'sed "s#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g" | ' +
|
||||
'tee /etc/apt/sources.list.d/nvidia-container-toolkit.list',
|
||||
);
|
||||
|
||||
await this.aptUpdate();
|
||||
@@ -257,7 +272,7 @@ export class NvidiaDriver extends BaseDriver {
|
||||
// RHEL/Fedora
|
||||
await this.execCommand(
|
||||
'curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | ' +
|
||||
'tee /etc/yum.repos.d/nvidia-container-toolkit.repo',
|
||||
'tee /etc/yum.repos.d/nvidia-container-toolkit.repo',
|
||||
);
|
||||
await this.dnfInstall('nvidia-container-toolkit');
|
||||
}
|
||||
@@ -268,7 +283,11 @@ export class NvidiaDriver extends BaseDriver {
|
||||
logger.success('NVIDIA Container Toolkit installed successfully');
|
||||
return true;
|
||||
} catch (error) {
|
||||
logger.error(`Failed to install NVIDIA Container Toolkit: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.error(
|
||||
`Failed to install NVIDIA Container Toolkit: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -288,7 +307,11 @@ export class NvidiaDriver extends BaseDriver {
|
||||
|
||||
logger.success('Docker configured to use NVIDIA runtime');
|
||||
} catch (error) {
|
||||
logger.warn(`Could not configure Docker runtime automatically: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.warn(
|
||||
`Could not configure Docker runtime automatically: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`,
|
||||
);
|
||||
logger.info('Please run: nvidia-ctk runtime configure --runtime=docker');
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user