feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
+38
-16
@@ -86,22 +86,30 @@ export class GpuHandler {
|
||||
logger.info('GPU Status');
|
||||
logger.log('');
|
||||
|
||||
const gpuStatus = await this.gpuDetector.getGpuStatus();
|
||||
const gpuInfo = await this.gpuDetector.detectGpus();
|
||||
const gpuStatus = await this.gpuDetector.getAllGpuStatus();
|
||||
|
||||
if (gpuStatus.length === 0) {
|
||||
if (gpuStatus.size === 0) {
|
||||
logger.warn('No GPUs detected');
|
||||
return;
|
||||
}
|
||||
|
||||
for (const gpu of gpuStatus) {
|
||||
const utilizationBar = this.createProgressBar(gpu.utilization, 30);
|
||||
const memoryBar = this.createProgressBar(gpu.memoryUsed / gpu.memoryTotal * 100, 30);
|
||||
for (const [gpuId, status] of gpuStatus) {
|
||||
const info = gpuInfo.find((gpu) => gpu.id === gpuId);
|
||||
const utilizationBar = this.createProgressBar(status.utilization, 30);
|
||||
const memoryBar = this.createProgressBar(status.memoryUsed / status.memoryTotal * 100, 30);
|
||||
|
||||
logger.logBoxTitle(`GPU ${gpu.id}: ${gpu.name}`, 70, 'info');
|
||||
logger.logBoxLine(`Utilization: ${utilizationBar} ${gpu.utilization.toFixed(1)}%`);
|
||||
logger.logBoxLine(`Memory: ${memoryBar} ${Math.round(gpu.memoryUsed)}/${Math.round(gpu.memoryTotal)} MB`);
|
||||
logger.logBoxLine(`Temperature: ${this.formatTemperature(gpu.temperature)}`);
|
||||
logger.logBoxLine(`Power: ${gpu.powerDraw.toFixed(0)}W / ${gpu.powerLimit.toFixed(0)}W`);
|
||||
logger.logBoxTitle(`GPU ${status.id}: ${info?.model || 'Unknown GPU'}`, 70, 'info');
|
||||
logger.logBoxLine(`Utilization: ${utilizationBar} ${status.utilization.toFixed(1)}%`);
|
||||
logger.logBoxLine(
|
||||
`Memory: ${memoryBar} ${Math.round(status.memoryUsed)}/${
|
||||
Math.round(status.memoryTotal)
|
||||
} MB`,
|
||||
);
|
||||
logger.logBoxLine(`Temperature: ${this.formatTemperature(status.temperature)}`);
|
||||
logger.logBoxLine(
|
||||
`Power: ${status.powerUsage.toFixed(0)}W / ${status.powerLimit.toFixed(0)}W`,
|
||||
);
|
||||
logger.logBoxEnd();
|
||||
logger.log('');
|
||||
}
|
||||
@@ -138,13 +146,23 @@ export class GpuHandler {
|
||||
|
||||
const status = await driver.getStatus();
|
||||
|
||||
logger.logBoxTitle(`${this.formatVendor(vendor)} Driver`, 60, status.installed ? 'success' : 'warning');
|
||||
logger.logBoxLine(`Installed: ${status.installed ? theme.success('Yes') : theme.error('No')}`);
|
||||
logger.logBoxTitle(
|
||||
`${this.formatVendor(vendor)} Driver`,
|
||||
60,
|
||||
status.installed ? 'success' : 'warning',
|
||||
);
|
||||
logger.logBoxLine(
|
||||
`Installed: ${status.installed ? theme.success('Yes') : theme.error('No')}`,
|
||||
);
|
||||
|
||||
if (status.installed) {
|
||||
logger.logBoxLine(`Version: ${status.version || 'Unknown'}`);
|
||||
logger.logBoxLine(`Runtime: ${status.runtimeVersion || 'Unknown'}`);
|
||||
logger.logBoxLine(`Container Support: ${status.containerSupport ? theme.success('Yes') : theme.warning('No')}`);
|
||||
logger.logBoxLine(`Runtime: ${status.containerRuntimeVersion || 'Unknown'}`);
|
||||
logger.logBoxLine(
|
||||
`Container Support: ${
|
||||
status.containerSupport ? theme.success('Yes') : theme.warning('No')
|
||||
}`,
|
||||
);
|
||||
} else {
|
||||
logger.logBoxLine('');
|
||||
logger.logBoxLine(theme.dim('Run `modelgrid gpu install` to install drivers'));
|
||||
@@ -183,14 +201,18 @@ export class GpuHandler {
|
||||
|
||||
logger.info(`Installing ${this.formatVendor(vendor)} drivers...`);
|
||||
|
||||
const success = await driver.install();
|
||||
const success = await driver.install({
|
||||
installToolkit: true,
|
||||
installContainerSupport: true,
|
||||
nonInteractive: false,
|
||||
});
|
||||
|
||||
if (success) {
|
||||
logger.success(`${this.formatVendor(vendor)} drivers installed successfully`);
|
||||
|
||||
// Setup container support
|
||||
logger.info('Setting up container support...');
|
||||
const containerSuccess = await driver.setupContainer();
|
||||
const containerSuccess = await driver.installContainerSupport();
|
||||
|
||||
if (containerSuccess) {
|
||||
logger.success('Container support configured');
|
||||
|
||||
Reference in New Issue
Block a user