feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
@@ -5,6 +5,8 @@
 * This makes configuration easier and code more self-documenting.
 */

+export const VERSION = '1.0.1'; // NOTE(review): presumably the application release version — confirm it stays in sync with the package manifest
+
 /**
 * Default timing values in milliseconds
 */
@@ -106,9 +108,6 @@ export const CONTAINER_PORTS = {
 * Container image defaults
 */
 export const CONTAINER_IMAGES = {
-  /** Ollama official image */
-  OLLAMA: 'ollama/ollama:latest',
-
  /** vLLM official image */
  VLLM: 'vllm/vllm-openai:latest',

@@ -120,20 +119,96 @@ export const CONTAINER_IMAGES = {
 * Model registry constants
 */
 export const MODEL_REGISTRY = {
-  /** Default greenlit models URL */
-  DEFAULT_GREENLIST_URL:
-    'https://code.foss.global/modelgrid.com/model_lists/raw/branch/main/greenlit.json',
+  /** Default public catalog URL; points at a models.json document. */
+  DEFAULT_CATALOG_URL: 'https://list.modelgrid.com/catalog/models.json',

-  /** Fallback greenlist if remote fetch fails */
-  FALLBACK_GREENLIST: [
-    { name: 'llama3.2:1b', container: 'ollama', minVram: 4 },
-    { name: 'llama3.2:3b', container: 'ollama', minVram: 6 },
-    { name: 'llama3:8b', container: 'ollama', minVram: 8 },
-    { name: 'mistral:7b', container: 'ollama', minVram: 8 },
-    { name: 'codellama:7b', container: 'ollama', minVram: 8 },
+  /** Fallback catalog if remote fetch fails: two chat/tools instruct entries and one embeddings entry, all with engine 'vllm'. */
+  FALLBACK_CATALOG: [
+    { // entry 1 of 3: chat + completions + tools
+      id: 'Qwen/Qwen2.5-7B-Instruct', // same string as source.repo below
+      aliases: ['qwen2.5-7b-instruct'],
+      engine: 'vllm',
+      source: {
+        repo: 'Qwen/Qwen2.5-7B-Instruct',
+        license: 'apache-2.0',
+      },
+      capabilities: {
+        chat: true,
+        completions: true,
+        tools: true,
+      },
+      requirements: {
+        minVramGb: 16, // gigabytes, per the field name
+        recommendedVramGb: 24,
+        minGpuCount: 1,
+      },
+      metadata: {
+        family: 'Qwen2.5',
+        parameterCount: '7B',
+        contextWindow: 131072, // 128 * 1024; NOTE(review): presumably measured in tokens — confirm
+        summary: 'General purpose instruct model for chat and tool use.',
+        tags: ['chat', 'tool-use', 'instruct'],
+      },
+    },
+    { // entry 2 of 3: chat + completions + tools
+      id: 'meta-llama/Llama-3.1-8B-Instruct', // same string as source.repo below
+      aliases: ['llama-3.1-8b-instruct'],
+      engine: 'vllm',
+      source: {
+        repo: 'meta-llama/Llama-3.1-8B-Instruct',
+        license: 'llama3.1',
+      },
+      capabilities: {
+        chat: true,
+        completions: true,
+        tools: true,
+      },
+      requirements: {
+        minVramGb: 18, // gigabytes, per the field name
+        recommendedVramGb: 24,
+        minGpuCount: 1,
+      },
+      metadata: {
+        family: 'Llama 3.1',
+        parameterCount: '8B',
+        contextWindow: 131072, // 128 * 1024; NOTE(review): presumably measured in tokens — confirm
+        summary: 'High quality instruct model with good ecosystem support.',
+        tags: ['chat', 'tool-use', 'instruct'],
+      },
+    },
+    { // entry 3 of 3: embeddings only
+      id: 'BAAI/bge-m3', // same string as source.repo below
+      aliases: ['bge-m3'],
+      engine: 'vllm',
+      source: {
+        repo: 'BAAI/bge-m3',
+        license: 'mit',
+      },
+      capabilities: { // only the embeddings flag is set; chat/completions/tools are absent
+        embeddings: true,
+      },
+      requirements: {
+        minVramGb: 8, // gigabytes, per the field name
+        recommendedVramGb: 12,
+        minGpuCount: 1,
+      },
+      metadata: { // unlike the other two entries, no parameterCount or contextWindow is given
+        family: 'BGE',
+        summary: 'Multilingual embedding model for retrieval workloads.',
+        tags: ['embeddings', 'retrieval', 'multilingual'],
+      },
+    },
  ],
 } as const;

+export const CLUSTER = { // cluster defaults; `as const` below keeps every value a readonly literal
+  DEFAULT_BIND_HOST: '0.0.0.0', // IPv4 wildcard address, i.e. all interfaces
+  DEFAULT_GOSSIP_PORT: 7946, // NOTE(review): presumably the port used for node-to-node gossip — confirm against the cluster code
+  DEFAULT_HEARTBEAT_INTERVAL_MS: 5000, // milliseconds per the _MS suffix (5 s)
+  NODE_STALE_AFTER_MS: 20000, // milliseconds per the _MS suffix (20 s) — four times the default heartbeat interval
+  AUTH_HEADER_NAME: 'x-modelgrid-cluster-secret', // NOTE(review): presumably the request header carrying the shared cluster secret — confirm
+} as const;
+
 /**
 * Configuration paths
 */