feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
@@ -58,6 +58,7 @@ export class ModelGridCli {
    const serviceHandler = this.modelgrid.getServiceHandler();
    const gpuHandler = this.modelgrid.getGpuHandler();
    const containerHandler = this.modelgrid.getContainerHandler();
+    const clusterHandler = this.modelgrid.getClusterHandler();
    const modelHandler = this.modelgrid.getModelHandler();
    const configHandler = this.modelgrid.getConfigHandler();

@@ -99,6 +100,51 @@ export class ModelGridCli {
      return;
    }

+    if (command === 'cluster') {
+      const subcommand = commandArgs[0] || 'status';
+      const subcommandArgs = commandArgs.slice(1);
+
+      switch (subcommand) {
+        case 'status':
+          await clusterHandler.status();
+          break;
+        case 'nodes':
+          await clusterHandler.nodes();
+          break;
+        case 'models':
+          await clusterHandler.models();
+          break;
+        case 'desired':
+          await clusterHandler.desired();
+          break;
+        case 'ensure':
+          await clusterHandler.ensure(subcommandArgs[0]);
+          break;
+        case 'scale':
+          await clusterHandler.scale(subcommandArgs[0], parseInt(subcommandArgs[1] || '', 10));
+          break;
+        case 'clear':
+          await clusterHandler.clear(subcommandArgs[0]);
+          break;
+        case 'cordon':
+          await clusterHandler.cordon(subcommandArgs[0]);
+          break;
+        case 'uncordon':
+          await clusterHandler.uncordon(subcommandArgs[0]);
+          break;
+        case 'drain':
+          await clusterHandler.drain(subcommandArgs[0]);
+          break;
+        case 'activate':
+          await clusterHandler.activate(subcommandArgs[0]);
+          break;
+        default:
+          this.showClusterHelp();
+          break;
+      }
+      return;
+    }
+
    // GPU commands
    if (command === 'gpu') {
      const subcommand = commandArgs[0] || 'list';
@@ -226,6 +272,12 @@ export class ModelGridCli {

    // Top-level commands
    switch (command) {
+      case 'run':
+        await modelHandler.pull(commandArgs[0]);
+        break;
+      case 'ps':
+        await containerHandler.list();
+        break;
      case 'update':
        await serviceHandler.update();
        break;
@@ -267,10 +319,13 @@ export class ModelGridCli {
    console.log('');

    logger.log(theme.info('Commands:'));
+    this.printCommand('run <model>', 'Deploy a vLLM model');
+    this.printCommand('ps', 'List active deployments');
    this.printCommand('service <subcommand>', 'Manage systemd service');
    this.printCommand('gpu <subcommand>', 'Manage GPU hardware');
-    this.printCommand('container <subcommand>', 'Manage AI containers');
-    this.printCommand('model <subcommand>', 'Manage AI models');
+    this.printCommand('container <subcommand>', 'Manage deployments directly');
+    this.printCommand('model <subcommand>', 'Browse and deploy catalog models');
+    this.printCommand('cluster <subcommand>', 'Inspect cluster control plane');
    this.printCommand('config <subcommand>', 'Manage configuration');
    this.printCommand('update', 'Update ModelGrid', theme.dim('(requires root)'));
    this.printCommand('uninstall', 'Remove ModelGrid', theme.dim('(requires root)'));
@@ -280,9 +335,9 @@ export class ModelGridCli {

    logger.log(theme.info('Quick Start:'));
    logger.dim('  modelgrid gpu list           # Detect GPUs');
-    logger.dim('  modelgrid container add      # Add an Ollama/vLLM container');
-    logger.dim('  modelgrid container start    # Start containers');
-    logger.dim('  modelgrid model pull llama3  # Pull a model');
+    logger.dim('  modelgrid model list         # Browse catalog');
+    logger.dim('  modelgrid run <model>        # Deploy a vLLM model');
+    logger.dim('  modelgrid ps                 # List active deployments');
    logger.dim('  modelgrid service enable     # Install as service');
    console.log('');

@@ -290,7 +345,9 @@ export class ModelGridCli {
    logger.dim('  curl -X POST http://localhost:8080/v1/chat/completions \\');
    logger.dim('    -H "Authorization: Bearer YOUR_API_KEY" \\');
    logger.dim('    -H "Content-Type: application/json" \\');
-    logger.dim('    -d \'{"model": "llama3", "messages": [{"role": "user", "content": "Hello"}]}\'');
+    logger.dim(
+      '    -d \'{"model": "llama3", "messages": [{"role": "user", "content": "Hello"}]}\'',
+    );
    console.log('');
  }

@@ -360,17 +417,17 @@ Usage:
  modelgrid container <subcommand> [arguments]

 Subcommands:
-  list         List all configured containers
-  add          Add a new container interactively
-  remove <id>  Remove a container by ID
-  start [id]   Start a container (or all if no ID)
-  stop [id]    Stop a container (or all if no ID)
-  logs <id>    Show container logs
+  list         List all configured deployments
+  add          Add a vLLM deployment interactively
+  remove <id>  Remove a deployment by ID
+  start [id]   Start a deployment (or all if no ID)
+  stop [id]    Stop a deployment (or all if no ID)
+  logs <id>    Show deployment logs

 Examples:
-  modelgrid container add           # Add new container
-  modelgrid container start ollama  # Start specific container
-  modelgrid container logs ollama   # View container logs
+  modelgrid container add           # Add new deployment
+  modelgrid container start qwen2   # Start specific deployment
+  modelgrid container logs qwen2    # View deployment logs
 `);
  }

@@ -385,16 +442,43 @@ Usage:
  modelgrid model <subcommand> [arguments]

 Subcommands:
-  list         List all available models
-  pull <name>  Pull a model (must be greenlit)
-  remove <name> Remove a model
-  status       Show model loading recommendations
-  refresh      Refresh greenlist cache
+  list          List all catalog models
+  pull <name>   Deploy a model from the registry
+  remove <name> Remove a deployed model
+  status        Show deployment recommendations
+  refresh       Refresh the model catalog cache

 Examples:
-  modelgrid model list               # Show all models
-  modelgrid model pull llama3:8b     # Pull a model
-  modelgrid model status             # Show VRAM recommendations
+  modelgrid model list                           # Show all models
+  modelgrid model pull meta-llama/Llama-3.1-8B-Instruct
+  modelgrid model status                         # Show GPU-fit recommendations
+`);
+  }
+
+  private showClusterHelp(): void {
+    logger.log(`
+ModelGrid - Cluster Commands
+
+Usage:
+  modelgrid cluster <subcommand> [arguments]
+
+Subcommands:
+  status         Show cluster status
+  nodes          List registered nodes
+  models         List clustered model locations
+  desired        Show desired deployment targets
+  ensure <name>  Ask the control plane to schedule a model
+  scale <name> <replicas> Set desired replica count
+  clear <name>   Remove desired deployment target
+  cordon <node>  Prevent new placements on a node
+  uncordon <node> Re-enable placements on a node
+  drain <node>   Mark a node for evacuation
+  activate <node> Mark a node active again
+
+Examples:
+  modelgrid cluster status
+  modelgrid cluster ensure meta-llama/Llama-3.1-8B-Instruct
+  modelgrid cluster cordon worker-a
 `);
  }