feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

This commit is contained in:
2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
+107 -23
View File
@@ -58,6 +58,7 @@ export class ModelGridCli {
const serviceHandler = this.modelgrid.getServiceHandler();
const gpuHandler = this.modelgrid.getGpuHandler();
const containerHandler = this.modelgrid.getContainerHandler();
const clusterHandler = this.modelgrid.getClusterHandler();
const modelHandler = this.modelgrid.getModelHandler();
const configHandler = this.modelgrid.getConfigHandler();
@@ -99,6 +100,51 @@ export class ModelGridCli {
return;
}
// Cluster commands: `modelgrid cluster <subcommand> [arguments]`.
if (command === 'cluster') {
  // A missing (or empty) subcommand falls back to `status`.
  const clusterAction = commandArgs[0] || 'status';
  const [, ...clusterArgs] = commandArgs;
  const [target, replicaText] = clusterArgs;

  // Lookup table from subcommand name to the handler call it triggers.
  // Each entry is lazy, so arguments are only read/parsed for the
  // subcommand that actually runs.
  const clusterActions = new Map([
    ['status', async () => {
      await clusterHandler.status();
    }],
    ['nodes', async () => {
      await clusterHandler.nodes();
    }],
    ['models', async () => {
      await clusterHandler.models();
    }],
    ['desired', async () => {
      await clusterHandler.desired();
    }],
    ['ensure', async () => {
      await clusterHandler.ensure(target);
    }],
    ['scale', async () => {
      // The replica count is parsed base-10; a missing or non-numeric
      // value is forwarded to the handler as NaN.
      await clusterHandler.scale(target, parseInt(replicaText || '', 10));
    }],
    ['clear', async () => {
      await clusterHandler.clear(target);
    }],
    ['cordon', async () => {
      await clusterHandler.cordon(target);
    }],
    ['uncordon', async () => {
      await clusterHandler.uncordon(target);
    }],
    ['drain', async () => {
      await clusterHandler.drain(target);
    }],
    ['activate', async () => {
      await clusterHandler.activate(target);
    }],
  ]);

  const runClusterAction = clusterActions.get(clusterAction);
  if (runClusterAction) {
    await runClusterAction();
  } else {
    // Unknown subcommand: print the cluster usage text instead.
    this.showClusterHelp();
  }
  return;
}
// GPU commands
if (command === 'gpu') {
const subcommand = commandArgs[0] || 'list';
@@ -226,6 +272,12 @@ export class ModelGridCli {
// Top-level commands
switch (command) {
case 'run':
await modelHandler.pull(commandArgs[0]);
break;
case 'ps':
await containerHandler.list();
break;
case 'update':
await serviceHandler.update();
break;
@@ -267,10 +319,13 @@ export class ModelGridCli {
console.log('');
logger.log(theme.info('Commands:'));
this.printCommand('run <model>', 'Deploy a vLLM model');
this.printCommand('ps', 'List active deployments');
this.printCommand('service <subcommand>', 'Manage systemd service');
this.printCommand('gpu <subcommand>', 'Manage GPU hardware');
this.printCommand('container <subcommand>', 'Manage AI containers');
this.printCommand('model <subcommand>', 'Manage AI models');
this.printCommand('container <subcommand>', 'Manage deployments directly');
this.printCommand('model <subcommand>', 'Browse and deploy catalog models');
this.printCommand('cluster <subcommand>', 'Inspect cluster control plane');
this.printCommand('config <subcommand>', 'Manage configuration');
this.printCommand('update', 'Update ModelGrid', theme.dim('(requires root)'));
this.printCommand('uninstall', 'Remove ModelGrid', theme.dim('(requires root)'));
@@ -280,9 +335,9 @@ export class ModelGridCli {
logger.log(theme.info('Quick Start:'));
logger.dim(' modelgrid gpu list # Detect GPUs');
logger.dim(' modelgrid container add # Add an Ollama/vLLM container');
logger.dim(' modelgrid container start # Start containers');
logger.dim(' modelgrid model pull llama3 # Pull a model');
logger.dim(' modelgrid model list # Browse catalog');
logger.dim(' modelgrid run <model> # Deploy a vLLM model');
logger.dim(' modelgrid ps # List active deployments');
logger.dim(' modelgrid service enable # Install as service');
console.log('');
@@ -290,7 +345,9 @@ export class ModelGridCli {
logger.dim(' curl -X POST http://localhost:8080/v1/chat/completions \\');
logger.dim(' -H "Authorization: Bearer YOUR_API_KEY" \\');
logger.dim(' -H "Content-Type: application/json" \\');
logger.dim(' -d \'{"model": "llama3", "messages": [{"role": "user", "content": "Hello"}]}\'');
logger.dim(
' -d \'{"model": "llama3", "messages": [{"role": "user", "content": "Hello"}]}\'',
);
console.log('');
}
@@ -360,17 +417,17 @@ Usage:
modelgrid container <subcommand> [arguments]
Subcommands:
list List all configured containers
add Add a new container interactively
remove <id> Remove a container by ID
start [id] Start a container (or all if no ID)
stop [id] Stop a container (or all if no ID)
logs <id> Show container logs
list List all configured deployments
add Add a vLLM deployment interactively
remove <id> Remove a deployment by ID
start [id] Start a deployment (or all if no ID)
stop [id] Stop a deployment (or all if no ID)
logs <id> Show deployment logs
Examples:
modelgrid container add # Add new container
modelgrid container start ollama # Start specific container
modelgrid container logs ollama # View container logs
modelgrid container add # Add new deployment
modelgrid container start qwen2 # Start specific deployment
modelgrid container logs qwen2 # View deployment logs
`);
}
@@ -385,16 +442,43 @@ Usage:
modelgrid model <subcommand> [arguments]
Subcommands:
list List all available models
pull <name> Pull a model (must be greenlit)
remove <name> Remove a model
status Show model loading recommendations
refresh Refresh greenlist cache
list List all catalog models
pull <name> Deploy a model from the registry
remove <name> Remove a deployed model
status Show deployment recommendations
refresh Refresh the model catalog cache
Examples:
modelgrid model list # Show all models
modelgrid model pull llama3:8b # Pull a model
modelgrid model status # Show VRAM recommendations
modelgrid model list # Show all models
modelgrid model pull meta-llama/Llama-3.1-8B-Instruct
modelgrid model status # Show GPU-fit recommendations
`);
}
/**
 * Print the usage text for the `cluster` command group: the usage line,
 * every subcommand with a one-line description, and a few examples.
 * The text is emitted through `logger.log` as a single multi-line string
 * that starts and ends with a newline.
 */
private showClusterHelp(): void {
  // One entry per output line; the empty first and last entries produce
  // the leading and trailing newline of the printed text.
  const helpLines = [
    '',
    'ModelGrid - Cluster Commands',
    'Usage:',
    'modelgrid cluster <subcommand> [arguments]',
    'Subcommands:',
    'status Show cluster status',
    'nodes List registered nodes',
    'models List clustered model locations',
    'desired Show desired deployment targets',
    'ensure <name> Ask the control plane to schedule a model',
    'scale <name> <replicas> Set desired replica count',
    'clear <name> Remove desired deployment target',
    'cordon <node> Prevent new placements on a node',
    'uncordon <node> Re-enable placements on a node',
    'drain <node> Mark a node for evacuation',
    'activate <node> Mark a node active again',
    'Examples:',
    'modelgrid cluster status',
    'modelgrid cluster ensure meta-llama/Llama-3.1-8B-Instruct',
    'modelgrid cluster cordon worker-a',
    '',
  ];
  logger.log(helpLines.join('\n'));
}