feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
@@ -141,8 +141,8 @@ const containerColumns: ITableColumn[] = [
 ];

 const containerData = [ // sample rows for the 'AI Containers' table printed below
-  { id: 'ollama-1', type: 'ollama', status: 'Running', gpu: 'gpu-0', models: '3' },
-  { id: 'vllm-1', type: 'vllm', status: 'Running', gpu: 'gpu-1', models: '1' },
+  { id: 'vllm-qwen', type: 'vllm', status: 'Running', gpu: 'gpu-0', models: '1' }, // each row now reports a single model; 'models' is a string count
+  { id: 'vllm-llama', type: 'vllm', status: 'Running', gpu: 'gpu-1', models: '1' },
 ];

 logger.logTable(containerColumns, containerData, 'AI Containers');
@@ -183,9 +183,14 @@ const modelColumns: ITableColumn[] = [
 ];

 const modelData = [ // sample rows for the 'Loaded Models' table printed below
-  { name: 'llama3:8b', container: 'ollama-1', size: '4.7 GB', status: 'Loaded' },
-  { name: 'mistral:7b', container: 'ollama-1', size: '4.1 GB', status: 'Loaded' },
-  { name: 'llama3:70b', container: 'vllm-1', size: '40 GB', status: 'Loaded' },
+  { name: 'Qwen/Qwen2.5-7B-Instruct', container: 'vllm-qwen', size: '15 GB', status: 'Loaded' },
+  {
+    name: 'meta-llama/Llama-3.1-8B-Instruct',
+    container: 'vllm-llama',
+    size: '16 GB',
+    status: 'Loaded',
+  },
+  { name: 'BAAI/bge-m3', container: 'vllm-embed', size: '5 GB', status: 'Loaded' }, // NOTE(review): 'vllm-embed' has no row in the containerData sample (only 'vllm-qwen' and 'vllm-llama'), assuming both hunks belong to the same file — confirm intended
 ];

 logger.logTable(modelColumns, modelData, 'Loaded Models');
@@ -1,5 +1,7 @@
 import { assert, assertEquals, assertExists } from 'jsr:@std/assert@^1.0.0';
 import { shortId } from '../ts/helpers/shortid.ts';
+import { ClusterManager } from '../ts/cluster/cluster-manager.ts';
+import { buildGpuTopologyGroups } from '../ts/cluster/placement.ts';

 // =============================================================================
 // UNIT TESTS - ModelGrid Core Components
@@ -54,11 +56,22 @@ Deno.test('IModelGridConfig: valid config structure', () => {
    },
    containers: [],
    models: {
-      greenlistUrl: 'https://example.com/greenlit.json',
-      autoPull: true,
-      defaultContainer: 'ollama',
+      registryUrl: 'https://example.com/models.json',
+      autoDeploy: true,
+      defaultEngine: 'vllm' as const,
      autoLoad: [],
    },
+    cluster: {
+      enabled: false,
+      nodeName: 'modelgrid-local',
+      role: 'standalone' as const,
+      bindHost: '0.0.0.0',
+      gossipPort: 7946,
+      sharedSecret: '',
+      advertiseUrl: 'http://127.0.0.1:8080',
+      heartbeatIntervalMs: 5000,
+      seedNodes: [],
+    },
    checkInterval: 30000,
  };

@@ -92,13 +105,13 @@ Deno.test('IGpuInfo: valid GPU info structure', () => {

 Deno.test('IContainerConfig: valid container config structure', () => {
  const container = {
-    id: 'ollama-1',
-    type: 'ollama' as const,
-    name: 'Ollama Container',
-    image: 'ollama/ollama:latest',
+    id: 'vllm-1',
+    type: 'vllm' as const,
+    name: 'vLLM Deployment',
+    image: 'vllm/vllm-openai:latest',
    gpuIds: ['gpu-0'],
-    port: 11434,
-    models: ['llama3:8b'],
+    port: 8000,
+    models: ['meta-llama/Llama-3.1-8B-Instruct'],
  };

  assertExists(container.id);
@@ -110,53 +123,53 @@ Deno.test('IContainerConfig: valid container config structure', () => {
 });

 // -----------------------------------------------------------------------------
-// Greenlit Model Tests
+// Model Catalog Tests
 // -----------------------------------------------------------------------------

-Deno.test('Greenlit model validation: valid model passes', () => {
-  const greenlist = {
+Deno.test('Catalog model validation: valid model passes', () => { // checks only the inline literal below; no project catalog code is called
+  const catalog = {
    version: '1.0',
    models: [
-      { name: 'llama3:8b', container: 'ollama', minVram: 8 },
-      { name: 'mistral:7b', container: 'ollama', minVram: 8 },
+      { id: 'meta-llama/Llama-3.1-8B-Instruct', engine: 'vllm', requirements: { minVramGb: 18 } },
+      { id: 'Qwen/Qwen2.5-7B-Instruct', engine: 'vllm', requirements: { minVramGb: 16 } },
    ],
  };

-  const requestedModel = 'llama3:8b';
+  const requestedModel = 'meta-llama/Llama-3.1-8B-Instruct';
  const availableVram = 24; // GB

-  const model = greenlist.models.find((m) => m.name === requestedModel);
-  assertExists(model, 'Model should be in greenlist');
-  assert(availableVram >= model.minVram, 'Should have enough VRAM');
+  const model = catalog.models.find((m) => m.id === requestedModel); // entries are matched on `id` (previously `name`)
+  assertExists(model, 'Model should be in catalog');
+  assert(availableVram >= model.requirements.minVramGb, 'Should have enough VRAM'); // 24 GB available vs 18 GB required
 });

-Deno.test('Greenlit model validation: insufficient VRAM fails', () => {
-  const greenlist = {
+Deno.test('Catalog model validation: insufficient VRAM fails', () => { // checks only the inline literal below; no project catalog code is called
+  const catalog = {
    version: '1.0',
    models: [
-      { name: 'llama3:70b', container: 'vllm', minVram: 48 },
+      { id: 'meta-llama/Llama-3.1-70B-Instruct', engine: 'vllm', requirements: { minVramGb: 48 } },
    ],
  };

-  const requestedModel = 'llama3:70b';
+  const requestedModel = 'meta-llama/Llama-3.1-70B-Instruct';
  const availableVram = 24; // GB

-  const model = greenlist.models.find((m) => m.name === requestedModel);
-  assertExists(model, 'Model should be in greenlist');
-  assert(availableVram < model.minVram, 'Should NOT have enough VRAM');
+  const model = catalog.models.find((m) => m.id === requestedModel); // the entry exists; only the VRAM comparison is expected to fail
+  assertExists(model, 'Model should be in catalog');
+  assert(availableVram < model.requirements.minVramGb, 'Should NOT have enough VRAM'); // 24 GB available vs 48 GB required
 });

-Deno.test('Greenlit model validation: unlisted model rejected', () => {
-  const greenlist = {
+Deno.test('Catalog model validation: unlisted model rejected', () => { // checks only the inline literal below; no project catalog code is called
+  const catalog = {
    version: '1.0',
    models: [
-      { name: 'llama3:8b', container: 'ollama', minVram: 8 },
+      { id: 'meta-llama/Llama-3.1-8B-Instruct', engine: 'vllm', requirements: { minVramGb: 18 } },
    ],
  };

  const requestedModel = 'some-random-model:latest';
-  const model = greenlist.models.find((m) => m.name === requestedModel);
-  assertEquals(model, undefined, 'Model should NOT be in greenlist');
+  const model = catalog.models.find((m) => m.id === requestedModel); // find() yields undefined when no id matches
+  assertEquals(model, undefined, 'Model should NOT be in catalog');
 });

 // -----------------------------------------------------------------------------
@@ -223,16 +236,16 @@ Deno.test('Embedding request: array input passes', () => {
 // Container Type Tests
 // -----------------------------------------------------------------------------

-Deno.test('Container types: ollama configuration', () => {
-  const ollamaConfig = {
-    type: 'ollama' as const,
-    image: 'ollama/ollama:latest',
-    defaultPort: 11434,
-    apiPath: '/api',
+Deno.test('Container types: vllm base configuration', () => { // NOTE(review): a 'Container types: vllm configuration' test follows this one — confirm the two do not duplicate each other
+  const vllmConfig = {
+    type: 'vllm' as const,
+    image: 'vllm/vllm-openai:latest',
+    defaultPort: 8000,
+    apiPath: '/v1',
  };

-  assertEquals(ollamaConfig.type, 'ollama');
-  assertEquals(ollamaConfig.defaultPort, 11434);
+  assertEquals(vllmConfig.type, 'vllm'); // only type and defaultPort are asserted; image and apiPath are not checked
+  assertEquals(vllmConfig.defaultPort, 8000);
 });

 Deno.test('Container types: vllm configuration', () => {
@@ -321,3 +334,367 @@ Deno.test('VRAM calculation: multiple models VRAM sum', () => {
  const totalVram = models.reduce((sum, m) => sum + m.vram, 0);
  assertEquals(totalVram, 16);
 });
+
+// -----------------------------------------------------------------------------
+// Cluster Scheduling Tests
+// -----------------------------------------------------------------------------
+
+Deno.test('Cluster manager resolves local model first', () => { // 'control' (local) and 'worker-a' both report a healthy deployment of the same model
+  const clusterManager = new ClusterManager();
+  clusterManager.configure({ // NOTE(review): sharedSecret is omitted here although the IModelGridConfig fixture in this file sets it — presumably optional; confirm
+    enabled: true,
+    nodeName: 'control',
+    role: 'control-plane',
+    bindHost: '0.0.0.0',
+    gossipPort: 7946,
+    advertiseUrl: 'http://control:8080',
+    heartbeatIntervalMs: 5000,
+    seedNodes: [],
+  });
+
+  clusterManager.updateLocalNode({ // local node state; nodeName matches the configured 'control'
+    nodeName: 'control',
+    role: 'control-plane',
+    endpoint: 'http://control:8080',
+    healthy: true,
+    resources: {
+      gpuCount: 2,
+      totalVramGb: 48,
+      availableVramGb: 48,
+      maxSingleGpuVramGb: 24,
+      largestGpuGroupCount: 2,
+      largestGpuGroupVramGb: 48,
+      deploymentCount: 1,
+      topologyGroups: [
+        {
+          id: 'nvidia-1',
+          vendor: 'nvidia',
+          gpuIds: ['gpu-0', 'gpu-1'],
+          gpuCount: 2,
+          totalVramGb: 48,
+          maxSingleGpuVramGb: 24,
+          busNumbers: [1, 2],
+        },
+      ],
+    },
+    deployments: [ // healthy local deployment of the model resolved below
+      {
+        modelId: 'meta-llama/Llama-3.1-8B-Instruct',
+        engine: 'vllm',
+        endpoint: 'http://control:8080',
+        healthy: true,
+        containerId: 'vllm-llama',
+      },
+    ],
+    lastSeenAt: Date.now(),
+  });
+
+  clusterManager.upsertNode({ // second node 'worker-a' with more GPUs, serving the same model
+    nodeName: 'worker-a',
+    role: 'worker',
+    endpoint: 'http://worker-a:8080',
+    healthy: true,
+    resources: {
+      gpuCount: 4,
+      totalVramGb: 96,
+      availableVramGb: 72,
+      maxSingleGpuVramGb: 24,
+      largestGpuGroupCount: 4,
+      largestGpuGroupVramGb: 96,
+      deploymentCount: 2,
+      topologyGroups: [
+        {
+          id: 'nvidia-1',
+          vendor: 'nvidia',
+          gpuIds: ['gpu-0', 'gpu-1', 'gpu-2', 'gpu-3'],
+          gpuCount: 4,
+          totalVramGb: 96,
+          maxSingleGpuVramGb: 24,
+          busNumbers: [1, 2, 3, 4],
+        },
+      ],
+    },
+    deployments: [
+      {
+        modelId: 'meta-llama/Llama-3.1-8B-Instruct',
+        engine: 'vllm',
+        endpoint: 'http://worker-a:8080',
+        healthy: true,
+        containerId: 'vllm-llama-worker',
+      },
+    ],
+    lastSeenAt: Date.now(),
+  });
+
+  const resolved = clusterManager.resolveModel('meta-llama/Llama-3.1-8B-Instruct');
+  assertExists(resolved);
+  assertEquals(resolved.nodeName, 'control'); // the local deployment is expected to win over the one on worker-a
+});
+
+Deno.test('Cluster manager stores desired deployments', () => { // uses a ClusterManager without configure(); only the desired-deployment store is exercised
+  const clusterManager = new ClusterManager();
+  const desired = clusterManager.upsertDesiredDeployment('meta-llama/Llama-3.1-8B-Instruct', 3); // second argument is read back as desiredReplicas below
+
+  assertEquals(desired.modelId, 'meta-llama/Llama-3.1-8B-Instruct');
+  assertEquals(desired.desiredReplicas, 3);
+  assertEquals(clusterManager.getDesiredDeployments().length, 1); // exactly one entry after a single upsert
+});
+
+Deno.test('Cluster manager picks the node with enough VRAM', () => { // the model needs 18 GB; no figure reported by 'control' reaches that, while 'worker-a' reports 32 GB available
+  const clusterManager = new ClusterManager();
+  clusterManager.configure({
+    enabled: true,
+    nodeName: 'control',
+    role: 'control-plane',
+    bindHost: '0.0.0.0',
+    gossipPort: 7946,
+    advertiseUrl: 'http://control:8080',
+    heartbeatIntervalMs: 5000,
+    seedNodes: [],
+  });
+
+  clusterManager.updateLocalNode({ // local node with a single 8 GB GPU
+    nodeName: 'control',
+    role: 'control-plane',
+    endpoint: 'http://control:8080',
+    healthy: true,
+    resources: {
+      gpuCount: 1,
+      totalVramGb: 16, // NOTE(review): the only topology group below totals 8 GB — confirm 16 is intended
+      availableVramGb: 8,
+      maxSingleGpuVramGb: 8,
+      largestGpuGroupCount: 1,
+      largestGpuGroupVramGb: 8,
+      deploymentCount: 1,
+      topologyGroups: [
+        {
+          id: 'nvidia-1',
+          vendor: 'nvidia',
+          gpuIds: ['gpu-0'],
+          gpuCount: 1,
+          totalVramGb: 8,
+          maxSingleGpuVramGb: 8,
+          busNumbers: [1],
+        },
+      ],
+    },
+    deployments: [],
+    lastSeenAt: Date.now(),
+  });
+
+  clusterManager.upsertNode({ // second node with two 16 GB GPUs in one topology group
+    nodeName: 'worker-a',
+    role: 'worker',
+    endpoint: 'http://worker-a:8080',
+    healthy: true,
+    resources: {
+      gpuCount: 2,
+      totalVramGb: 48, // NOTE(review): the only topology group below totals 32 GB — confirm 48 is intended
+      availableVramGb: 32,
+      maxSingleGpuVramGb: 16,
+      largestGpuGroupCount: 2,
+      largestGpuGroupVramGb: 32,
+      deploymentCount: 0,
+      topologyGroups: [
+        {
+          id: 'nvidia-1',
+          vendor: 'nvidia',
+          gpuIds: ['gpu-0', 'gpu-1'],
+          gpuCount: 2,
+          totalVramGb: 32,
+          maxSingleGpuVramGb: 16,
+          busNumbers: [1, 2],
+        },
+      ],
+    },
+    deployments: [],
+    lastSeenAt: Date.now(),
+  });
+
+  const selected = clusterManager.pickNodeForModel({ // catalog-style model entry used as the placement request
+    id: 'meta-llama/Llama-3.1-8B-Instruct',
+    engine: 'vllm',
+    source: {
+      repo: 'meta-llama/Llama-3.1-8B-Instruct',
+    },
+    capabilities: {
+      chat: true,
+    },
+    requirements: {
+      minVramGb: 18, // exceeds worker-a's maxSingleGpuVramGb (16) but not its group total (32)
+      minGpuCount: 1,
+    },
+  });
+
+  assertExists(selected);
+  assertEquals(selected.nodeName, 'worker-a');
+});
+
+Deno.test('Cluster manager excludes cordoned nodes from placement', () => { // worker-a reports more available VRAM (48 vs 32 GB) but is cordoned before placement
+  const clusterManager = new ClusterManager();
+  clusterManager.configure({
+    enabled: true,
+    nodeName: 'control',
+    role: 'control-plane',
+    bindHost: '0.0.0.0',
+    gossipPort: 7946,
+    advertiseUrl: 'http://control:8080',
+    heartbeatIntervalMs: 5000,
+    seedNodes: [],
+  });
+
+  clusterManager.updateLocalNode({ // local node: 2 GPUs, 32 GB of 48 GB reported available
+    nodeName: 'control',
+    role: 'control-plane',
+    endpoint: 'http://control:8080',
+    healthy: true,
+    resources: {
+      gpuCount: 2,
+      totalVramGb: 48,
+      availableVramGb: 32,
+      maxSingleGpuVramGb: 24,
+      largestGpuGroupCount: 2,
+      largestGpuGroupVramGb: 48,
+      deploymentCount: 0,
+      topologyGroups: [
+        {
+          id: 'nvidia-1',
+          vendor: 'nvidia',
+          gpuIds: ['gpu-0', 'gpu-1'],
+          gpuCount: 2,
+          totalVramGb: 48,
+          maxSingleGpuVramGb: 24,
+          busNumbers: [1, 2],
+        },
+      ],
+    },
+    deployments: [],
+    lastSeenAt: Date.now(),
+  });
+
+  clusterManager.upsertNode({ // second node: same hardware shape, all 48 GB reported available
+    nodeName: 'worker-a',
+    role: 'worker',
+    endpoint: 'http://worker-a:8080',
+    healthy: true,
+    resources: {
+      gpuCount: 2,
+      totalVramGb: 48,
+      availableVramGb: 48,
+      maxSingleGpuVramGb: 24,
+      largestGpuGroupCount: 2,
+      largestGpuGroupVramGb: 48,
+      deploymentCount: 0,
+      topologyGroups: [
+        {
+          id: 'nvidia-1',
+          vendor: 'nvidia',
+          gpuIds: ['gpu-0', 'gpu-1'],
+          gpuCount: 2,
+          totalVramGb: 48,
+          maxSingleGpuVramGb: 24,
+          busNumbers: [1, 2],
+        },
+      ],
+    },
+    deployments: [],
+    lastSeenAt: Date.now(),
+  });
+
+  clusterManager.setNodeSchedulerState('worker-a', 'cordoned'); // mark the worker as cordoned before asking for a placement
+
+  const selected = clusterManager.pickNodeForModel({
+    id: 'meta-llama/Llama-3.1-8B-Instruct',
+    engine: 'vllm',
+    source: { repo: 'meta-llama/Llama-3.1-8B-Instruct' },
+    capabilities: { chat: true },
+    requirements: {
+      minVramGb: 18,
+      minGpuCount: 1,
+    },
+  });
+
+  assertExists(selected);
+  assertEquals(selected.nodeName, 'control'); // the cordoned worker must not be chosen
+});
+
+Deno.test('Topology grouping keeps distant PCI buses separate', () => { // four GPUs whose pciSlot bus fields are 01, 02, 41 and 42 are expected to form two groups of two
+  const groups = buildGpuTopologyGroups([
+    { id: 'gpu-0', vendor: 'nvidia', model: 'A', vram: 24576, pciSlot: '0000:01:00.0', index: 0 }, // NOTE(review): vram is presumably MiB (24576 = 24 x 1024) — confirm against the GPU info type
+    { id: 'gpu-1', vendor: 'nvidia', model: 'A', vram: 24576, pciSlot: '0000:02:00.0', index: 1 },
+    { id: 'gpu-2', vendor: 'nvidia', model: 'A', vram: 24576, pciSlot: '0000:41:00.0', index: 2 },
+    { id: 'gpu-3', vendor: 'nvidia', model: 'A', vram: 24576, pciSlot: '0000:42:00.0', index: 3 },
+  ]);
+
+  assertEquals(groups.length, 2);
+  assertEquals(groups[0].gpuCount, 2); // group membership by id is not asserted, only the sizes
+  assertEquals(groups[1].gpuCount, 2);
+});
+
+Deno.test('Cluster manager rejects node without suitable topology group', () => { // the only node has 4 GPUs split into two groups of 2; a model needing 4 GPUs should not be placed
+  const clusterManager = new ClusterManager();
+  clusterManager.configure({
+    enabled: true,
+    nodeName: 'control',
+    role: 'control-plane',
+    bindHost: '0.0.0.0',
+    gossipPort: 7946,
+    advertiseUrl: 'http://control:8080',
+    heartbeatIntervalMs: 5000,
+    seedNodes: [],
+  });
+
+  clusterManager.updateLocalNode({ // no other node is registered in this test
+    nodeName: 'control',
+    role: 'control-plane',
+    endpoint: 'http://control:8080',
+    healthy: true,
+    resources: {
+      gpuCount: 4,
+      totalVramGb: 96,
+      availableVramGb: 96,
+      maxSingleGpuVramGb: 24,
+      largestGpuGroupCount: 2, // smaller than the minGpuCount: 4 requested below
+      largestGpuGroupVramGb: 48,
+      deploymentCount: 0,
+      topologyGroups: [
+        {
+          id: 'nvidia-1',
+          vendor: 'nvidia',
+          gpuIds: ['gpu-0', 'gpu-1'],
+          gpuCount: 2,
+          totalVramGb: 48,
+          maxSingleGpuVramGb: 24,
+          busNumbers: [1, 2],
+        },
+        {
+          id: 'nvidia-2',
+          vendor: 'nvidia',
+          gpuIds: ['gpu-2', 'gpu-3'],
+          gpuCount: 2,
+          totalVramGb: 48,
+          maxSingleGpuVramGb: 24,
+          busNumbers: [65, 66],
+        },
+      ],
+    },
+    deployments: [],
+    lastSeenAt: Date.now(),
+  });
+
+  const selected = clusterManager.pickNodeForModel({
+    id: 'meta-llama/Llama-3.1-70B-Instruct',
+    engine: 'vllm',
+    source: { repo: 'meta-llama/Llama-3.1-70B-Instruct' },
+    capabilities: { chat: true },
+    requirements: {
+      minVramGb: 72, // below the node's 96 GB total but above either group's 48 GB
+      minGpuCount: 4,
+    },
+    launchDefaults: {
+      tensorParallelSize: 4,
+    },
+  });
+
+  assertEquals(selected, null); // no placement is expected
+});