feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
@@ -58,6 +58,7 @@ export class ModelGridCli {
|
||||
const serviceHandler = this.modelgrid.getServiceHandler();
|
||||
const gpuHandler = this.modelgrid.getGpuHandler();
|
||||
const containerHandler = this.modelgrid.getContainerHandler();
|
||||
const clusterHandler = this.modelgrid.getClusterHandler();
|
||||
const modelHandler = this.modelgrid.getModelHandler();
|
||||
const configHandler = this.modelgrid.getConfigHandler();
|
||||
|
||||
@@ -99,6 +100,51 @@ export class ModelGridCli {
|
||||
return;
|
||||
}
|
||||
|
||||
// Cluster commands
// Dispatches `modelgrid cluster <subcommand> [arguments]` to the cluster handler.
if (command === 'cluster') {
|
||||
// With no subcommand given, fall back to 'status'.
const subcommand = commandArgs[0] || 'status';
|
||||
// Everything after the subcommand is forwarded positionally to the handler.
const subcommandArgs = commandArgs.slice(1);
|
||||
|
||||
switch (subcommand) {
|
||||
// Subcommands that take no arguments.
case 'status':
|
||||
await clusterHandler.status();
|
||||
break;
|
||||
case 'nodes':
|
||||
await clusterHandler.nodes();
|
||||
break;
|
||||
case 'models':
|
||||
await clusterHandler.models();
|
||||
break;
|
||||
case 'desired':
|
||||
await clusterHandler.desired();
|
||||
break;
|
||||
// Subcommands that take one positional argument. subcommandArgs[0] is
// undefined when the argument is omitted and is passed through unchanged.
case 'ensure':
|
||||
await clusterHandler.ensure(subcommandArgs[0]);
|
||||
break;
|
||||
case 'scale':
|
||||
// The second argument is parsed as a base-10 integer. parseInt returns NaN
// when the replica count is missing or not numeric.
// NOTE(review): confirm clusterHandler.scale() rejects a NaN replica count.
await clusterHandler.scale(subcommandArgs[0], parseInt(subcommandArgs[1] || '', 10));
|
||||
break;
|
||||
case 'clear':
|
||||
await clusterHandler.clear(subcommandArgs[0]);
|
||||
break;
|
||||
case 'cordon':
|
||||
await clusterHandler.cordon(subcommandArgs[0]);
|
||||
break;
|
||||
case 'uncordon':
|
||||
await clusterHandler.uncordon(subcommandArgs[0]);
|
||||
break;
|
||||
case 'drain':
|
||||
await clusterHandler.drain(subcommandArgs[0]);
|
||||
break;
|
||||
case 'activate':
|
||||
await clusterHandler.activate(subcommandArgs[0]);
|
||||
break;
|
||||
// Any unrecognized subcommand prints the cluster help text.
default:
|
||||
this.showClusterHelp();
|
||||
break;
|
||||
}
|
||||
// The cluster command has been handled; return before the command handling below.
return;
|
||||
}
|
||||
|
||||
// GPU commands
|
||||
if (command === 'gpu') {
|
||||
const subcommand = commandArgs[0] || 'list';
|
||||
@@ -226,6 +272,12 @@ export class ModelGridCli {
|
||||
|
||||
// Top-level commands
|
||||
switch (command) {
|
||||
case 'run':
|
||||
await modelHandler.pull(commandArgs[0]);
|
||||
break;
|
||||
case 'ps':
|
||||
await containerHandler.list();
|
||||
break;
|
||||
case 'update':
|
||||
await serviceHandler.update();
|
||||
break;
|
||||
@@ -267,10 +319,13 @@ export class ModelGridCli {
|
||||
console.log('');
|
||||
|
||||
logger.log(theme.info('Commands:'));
|
||||
this.printCommand('run <model>', 'Deploy a vLLM model');
|
||||
this.printCommand('ps', 'List active deployments');
|
||||
this.printCommand('service <subcommand>', 'Manage systemd service');
|
||||
this.printCommand('gpu <subcommand>', 'Manage GPU hardware');
|
||||
this.printCommand('container <subcommand>', 'Manage AI containers');
|
||||
this.printCommand('model <subcommand>', 'Manage AI models');
|
||||
this.printCommand('container <subcommand>', 'Manage deployments directly');
|
||||
this.printCommand('model <subcommand>', 'Browse and deploy catalog models');
|
||||
this.printCommand('cluster <subcommand>', 'Inspect cluster control plane');
|
||||
this.printCommand('config <subcommand>', 'Manage configuration');
|
||||
this.printCommand('update', 'Update ModelGrid', theme.dim('(requires root)'));
|
||||
this.printCommand('uninstall', 'Remove ModelGrid', theme.dim('(requires root)'));
|
||||
@@ -280,9 +335,9 @@ export class ModelGridCli {
|
||||
|
||||
logger.log(theme.info('Quick Start:'));
|
||||
logger.dim(' modelgrid gpu list # Detect GPUs');
|
||||
logger.dim(' modelgrid container add # Add an Ollama/vLLM container');
|
||||
logger.dim(' modelgrid container start # Start containers');
|
||||
logger.dim(' modelgrid model pull llama3 # Pull a model');
|
||||
logger.dim(' modelgrid model list # Browse catalog');
|
||||
logger.dim(' modelgrid run <model> # Deploy a vLLM model');
|
||||
logger.dim(' modelgrid ps # List active deployments');
|
||||
logger.dim(' modelgrid service enable # Install as service');
|
||||
console.log('');
|
||||
|
||||
@@ -290,7 +345,9 @@ export class ModelGridCli {
|
||||
logger.dim(' curl -X POST http://localhost:8080/v1/chat/completions \\');
|
||||
logger.dim(' -H "Authorization: Bearer YOUR_API_KEY" \\');
|
||||
logger.dim(' -H "Content-Type: application/json" \\');
|
||||
logger.dim(' -d \'{"model": "llama3", "messages": [{"role": "user", "content": "Hello"}]}\'');
|
||||
logger.dim(
|
||||
' -d \'{"model": "llama3", "messages": [{"role": "user", "content": "Hello"}]}\'',
|
||||
);
|
||||
console.log('');
|
||||
}
|
||||
|
||||
@@ -360,17 +417,17 @@ Usage:
|
||||
modelgrid container <subcommand> [arguments]
|
||||
|
||||
Subcommands:
|
||||
list List all configured containers
|
||||
add Add a new container interactively
|
||||
remove <id> Remove a container by ID
|
||||
start [id] Start a container (or all if no ID)
|
||||
stop [id] Stop a container (or all if no ID)
|
||||
logs <id> Show container logs
|
||||
list List all configured deployments
|
||||
add Add a vLLM deployment interactively
|
||||
remove <id> Remove a deployment by ID
|
||||
start [id] Start a deployment (or all if no ID)
|
||||
stop [id] Stop a deployment (or all if no ID)
|
||||
logs <id> Show deployment logs
|
||||
|
||||
Examples:
|
||||
modelgrid container add # Add new container
|
||||
modelgrid container start ollama # Start specific container
|
||||
modelgrid container logs ollama # View container logs
|
||||
modelgrid container add # Add new deployment
|
||||
modelgrid container start qwen2 # Start specific deployment
|
||||
modelgrid container logs qwen2 # View deployment logs
|
||||
`);
|
||||
}
|
||||
|
||||
@@ -385,16 +442,43 @@ Usage:
|
||||
modelgrid model <subcommand> [arguments]
|
||||
|
||||
Subcommands:
|
||||
list List all available models
|
||||
pull <name> Pull a model (must be greenlit)
|
||||
remove <name> Remove a model
|
||||
status Show model loading recommendations
|
||||
refresh Refresh greenlist cache
|
||||
list List all catalog models
|
||||
pull <name> Deploy a model from the registry
|
||||
remove <name> Remove a deployed model
|
||||
status Show deployment recommendations
|
||||
refresh Refresh the model catalog cache
|
||||
|
||||
Examples:
|
||||
modelgrid model list # Show all models
|
||||
modelgrid model pull llama3:8b # Pull a model
|
||||
modelgrid model status # Show VRAM recommendations
|
||||
modelgrid model list # Show all models
|
||||
modelgrid model pull meta-llama/Llama-3.1-8B-Instruct
|
||||
modelgrid model status # Show GPU-fit recommendations
|
||||
`);
|
||||
}
|
||||
|
||||
/**
 * Prints the help text for the `cluster` command group.
 *
 * Emits a single multi-line message through `logger.log` containing the
 * usage line, the supported subcommands (status, nodes, models, desired,
 * ensure, scale, clear, cordon, uncordon, drain, activate) and a few
 * example invocations. Takes no arguments and returns nothing.
 */
private showClusterHelp(): void {
|
||||
logger.log(`
|
||||
ModelGrid - Cluster Commands
|
||||
|
||||
Usage:
|
||||
modelgrid cluster <subcommand> [arguments]
|
||||
|
||||
Subcommands:
|
||||
status Show cluster status
|
||||
nodes List registered nodes
|
||||
models List clustered model locations
|
||||
desired Show desired deployment targets
|
||||
ensure <name> Ask the control plane to schedule a model
|
||||
scale <name> <replicas> Set desired replica count
|
||||
clear <name> Remove desired deployment target
|
||||
cordon <node> Prevent new placements on a node
|
||||
uncordon <node> Re-enable placements on a node
|
||||
drain <node> Mark a node for evacuation
|
||||
activate <node> Mark a node active again
|
||||
|
||||
Examples:
|
||||
modelgrid cluster status
|
||||
modelgrid cluster ensure meta-llama/Llama-3.1-8B-Instruct
|
||||
modelgrid cluster cordon worker-a
|
||||
`);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user