feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

commit 4f2266e1b7
parent 83cacd0cf1
Date: 2026-04-20 23:00:50 +00:00
55 changed files with 3970 additions and 1630 deletions
container-manager.ts  +36 -13
@@ -6,14 +6,13 @@
import type {
IContainerConfig,
- IContainerStatus,
IContainerEndpoint,
+ IContainerStatus,
TContainerType,
} from '../interfaces/container.ts';
import { logger } from '../logger.ts';
import { DockerManager } from '../docker/docker-manager.ts';
import { BaseContainer } from './base-container.ts';
- import { OllamaContainer } from './ollama.ts';
import { VllmContainer } from './vllm.ts';
import { TgiContainer } from './tgi.ts';
@@ -47,8 +46,6 @@ export class ContainerManager {
*/
private createContainerInstance(config: IContainerConfig): BaseContainer {
switch (config.type) {
- case 'ollama':
- return new OllamaContainer(config);
case 'vllm':
return new VllmContainer(config);
case 'tgi':
@@ -108,7 +105,11 @@ export class ContainerManager {
try {
this.addContainer(config);
} catch (error) {
- logger.warn(`Failed to load container ${config.id}: ${error instanceof Error ? error.message : String(error)}`);
+ logger.warn(
+ `Failed to load container ${config.id}: ${
+ error instanceof Error ? error.message : String(error)
+ }`,
+ );
}
}
}
@@ -128,7 +129,11 @@ export class ContainerManager {
const success = await container.start();
results.set(id, success);
} catch (error) {
- logger.error(`Failed to start container ${id}: ${error instanceof Error ? error.message : String(error)}`);
+ logger.error(
+ `Failed to start container ${id}: ${
+ error instanceof Error ? error.message : String(error)
+ }`,
+ );
results.set(id, false);
}
}
@@ -147,7 +152,11 @@ export class ContainerManager {
const success = await container.stop();
results.set(id, success);
} catch (error) {
- logger.error(`Failed to stop container ${id}: ${error instanceof Error ? error.message : String(error)}`);
+ logger.error(
+ `Failed to stop container ${id}: ${
+ error instanceof Error ? error.message : String(error)
+ }`,
+ );
results.set(id, false);
}
}
@@ -166,7 +175,11 @@ export class ContainerManager {
const status = await container.getStatus();
statuses.set(id, status);
} catch (error) {
- logger.warn(`Failed to get status for container ${id}: ${error instanceof Error ? error.message : String(error)}`);
+ logger.warn(
+ `Failed to get status for container ${id}: ${
+ error instanceof Error ? error.message : String(error)
+ }`,
+ );
}
}
@@ -266,7 +279,7 @@ export class ContainerManager {
*/
public async pullModel(
modelName: string,
- containerType: TContainerType = 'ollama',
+ containerType: TContainerType = 'vllm',
containerId?: string,
): Promise<boolean> {
// Find or create appropriate container
@@ -313,6 +326,16 @@ export class ContainerManager {
return results;
}
+ public async checkAllHealth(): Promise<boolean> {
+ const results = await this.healthCheck();
+ if (results.size === 0) {
+ return true;
+ }
+ return Array.from(results.values()).every((healthy) => healthy);
+ }
/**
* Print container status summary
*/
@@ -329,9 +352,7 @@ export class ContainerManager {
for (const [id, status] of statuses) {
const runningStr = status.running ? 'Running' : 'Stopped';
const healthStr = status.health;
- const modelsStr = status.loadedModels.length > 0
- ? status.loadedModels.join(', ')
- : 'None';
+ const modelsStr = status.loadedModels.length > 0 ? status.loadedModels.join(', ') : 'None';
logger.logBoxLine(`${status.name} (${id})`);
logger.logBoxLine(` Type: ${status.type} | Status: ${runningStr} | Health: ${healthStr}`);
@@ -339,7 +360,9 @@ export class ContainerManager {
logger.logBoxLine(` Endpoint: ${status.endpoint}`);
if (status.gpuUtilization !== undefined) {
- logger.logBoxLine(` GPU: ${status.gpuUtilization}% | Memory: ${status.memoryUsage || 0}MB`);
+ logger.logBoxLine(
+ ` GPU: ${status.gpuUtilization}% | Memory: ${status.memoryUsage || 0}MB`,
+ );
}
logger.logBoxLine('');
}
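As a reading aid (not part of the commit), here is a minimal usage sketch of the ContainerManager surface touched above. Only the 'vllm' default for pullModel and the new checkAllHealth() come from the diff; the import path, function wiring, and model name are assumptions.

```ts
// Illustrative wiring only: the import path and model name are assumptions,
// not taken from the repository.
import { ContainerManager } from './classes/containers/container-manager.ts';

async function ensureClusterReady(manager: ContainerManager): Promise<boolean> {
  // With the Ollama backend removed, pullModel now targets a vLLM container
  // when no containerType argument is passed.
  const pulled = await manager.pullModel('mistralai/Mistral-7B-Instruct-v0.2');

  // checkAllHealth() aggregates healthCheck(): every container must report
  // healthy, and an empty cluster counts as healthy.
  const healthy = await manager.checkAllHealth();

  return pulled && healthy;
}
```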
containers index (module exports)  -1
@@ -5,7 +5,6 @@
*/
export { BaseContainer } from './base-container.ts';
- export { OllamaContainer } from './ollama.ts';
export { VllmContainer } from './vllm.ts';
export { TgiContainer } from './tgi.ts';
export { ContainerManager } from './container-manager.ts';
ollama.ts  -387 (file deleted)
@@ -1,387 +0,0 @@
/**
* Ollama Container
*
* Manages Ollama containers for running local LLMs.
*/
import type {
IContainerConfig,
ILoadedModel,
TContainerType,
} from '../interfaces/container.ts';
import type {
IChatCompletionRequest,
IChatCompletionResponse,
IChatCompletionChoice,
IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';
/**
* Ollama API response types
*/
interface IOllamaTagsResponse {
models: Array<{
name: string;
size: number;
digest: string;
modified_at: string;
}>;
}
interface IOllamaChatRequest {
model: string;
messages: Array<{
role: string;
content: string;
}>;
stream?: boolean;
options?: {
temperature?: number;
top_p?: number;
num_predict?: number;
stop?: string[];
};
}
interface IOllamaChatResponse {
model: string;
created_at: string;
message: {
role: string;
content: string;
};
done: boolean;
total_duration?: number;
load_duration?: number;
prompt_eval_count?: number;
eval_count?: number;
}
interface IOllamaPullResponse {
status: string;
digest?: string;
total?: number;
completed?: number;
}
/**
* Ollama container implementation
*/
export class OllamaContainer extends BaseContainer {
public readonly type: TContainerType = 'ollama';
public readonly displayName = 'Ollama';
public readonly defaultImage = CONTAINER_IMAGES.OLLAMA;
public readonly defaultPort = CONTAINER_PORTS.OLLAMA;
constructor(config: IContainerConfig) {
super(config);
// Set defaults if not provided
if (!config.image) {
config.image = this.defaultImage;
}
if (!config.port) {
config.port = this.defaultPort;
}
// Add default volume for model storage
if (!config.volumes || config.volumes.length === 0) {
config.volumes = [`modelgrid-ollama-${config.id}:/root/.ollama`];
}
}
/**
* Create Ollama container configuration
*/
public static createConfig(
id: string,
name: string,
gpuIds: string[],
options: Partial<IContainerConfig> = {},
): IContainerConfig {
return {
id,
name,
type: 'ollama',
image: options.image || CONTAINER_IMAGES.OLLAMA,
gpuIds,
port: options.port || CONTAINER_PORTS.OLLAMA,
externalPort: options.externalPort,
models: options.models || [],
env: options.env,
volumes: options.volumes || [`modelgrid-ollama-${id}:/root/.ollama`],
autoStart: options.autoStart ?? true,
restartPolicy: options.restartPolicy || 'unless-stopped',
memoryLimit: options.memoryLimit,
cpuLimit: options.cpuLimit,
command: options.command,
};
}
/**
* Check if Ollama is healthy
*/
public async isHealthy(): Promise<boolean> {
try {
const response = await this.fetch('/api/tags', { timeout: 5000 });
return response.ok;
} catch {
return false;
}
}
/**
* List available models
*/
public async listModels(): Promise<string[]> {
try {
const data = await this.fetchJson<IOllamaTagsResponse>('/api/tags');
return (data.models || []).map((m) => m.name);
} catch (error) {
logger.warn(`Failed to list Ollama models: ${error instanceof Error ? error.message : String(error)}`);
return [];
}
}
/**
* Get loaded models with details
*/
public async getLoadedModels(): Promise<ILoadedModel[]> {
try {
const data = await this.fetchJson<IOllamaTagsResponse>('/api/tags');
return (data.models || []).map((m) => ({
name: m.name,
size: m.size,
format: m.digest.substring(0, 12),
loaded: true, // Ollama doesn't distinguish loaded vs available
requestCount: 0,
}));
} catch {
return [];
}
}
/**
* Pull a model
*/
public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
try {
logger.info(`Pulling model: ${modelName}`);
const response = await this.fetch('/api/pull', {
method: 'POST',
body: { name: modelName },
timeout: 3600000, // 1 hour for large models
});
if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}
// Read streaming response
const reader = response.body?.getReader();
if (!reader) {
throw new Error('No response body');
}
const decoder = new TextDecoder();
let lastStatus = '';
while (true) {
const { done, value } = await reader.read();
if (done) break;
const text = decoder.decode(value);
const lines = text.split('\n').filter((l) => l.trim());
for (const line of lines) {
try {
const data = JSON.parse(line) as IOllamaPullResponse;
const status = data.status;
if (status !== lastStatus) {
lastStatus = status;
let percent: number | undefined;
if (data.total && data.completed) {
percent = Math.round((data.completed / data.total) * 100);
}
if (onProgress) {
onProgress({ model: modelName, status, percent });
} else {
const progressStr = percent !== undefined ? ` (${percent}%)` : '';
logger.dim(` ${status}${progressStr}`);
}
}
} catch {
// Invalid JSON line, skip
}
}
}
logger.success(`Model ${modelName} pulled successfully`);
return true;
} catch (error) {
logger.error(`Failed to pull model ${modelName}: ${error instanceof Error ? error.message : String(error)}`);
return false;
}
}
/**
* Remove a model
*/
public async removeModel(modelName: string): Promise<boolean> {
try {
const response = await this.fetch('/api/delete', {
method: 'DELETE',
body: { name: modelName },
});
if (response.ok) {
logger.success(`Model ${modelName} removed`);
return true;
}
throw new Error(`HTTP ${response.status}`);
} catch (error) {
logger.error(`Failed to remove model ${modelName}: ${error instanceof Error ? error.message : String(error)}`);
return false;
}
}
/**
* Send a chat completion request
*/
public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
const ollamaRequest: IOllamaChatRequest = {
model: request.model,
messages: request.messages.map((m) => ({
role: m.role,
content: m.content,
})),
stream: false,
options: {
temperature: request.temperature,
top_p: request.top_p,
num_predict: request.max_tokens,
stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
},
};
const response = await this.fetchJson<IOllamaChatResponse>('/api/chat', {
method: 'POST',
body: ollamaRequest,
timeout: 300000, // 5 minutes
});
// Convert to OpenAI format
const created = Math.floor(Date.now() / 1000);
const choice: IChatCompletionChoice = {
index: 0,
message: {
role: 'assistant',
content: response.message.content,
},
finish_reason: response.done ? 'stop' : null,
};
return {
id: this.generateRequestId(),
object: 'chat.completion',
created,
model: request.model,
choices: [choice],
usage: {
prompt_tokens: response.prompt_eval_count || 0,
completion_tokens: response.eval_count || 0,
total_tokens: (response.prompt_eval_count || 0) + (response.eval_count || 0),
},
};
}
/**
* Stream a chat completion request
*/
public async chatCompletionStream(
request: IChatCompletionRequest,
onChunk: (chunk: string) => void,
): Promise<void> {
const ollamaRequest: IOllamaChatRequest = {
model: request.model,
messages: request.messages.map((m) => ({
role: m.role,
content: m.content,
})),
stream: true,
options: {
temperature: request.temperature,
top_p: request.top_p,
num_predict: request.max_tokens,
stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
},
};
const response = await this.fetch('/api/chat', {
method: 'POST',
body: ollamaRequest,
timeout: 300000,
});
if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}
const reader = response.body?.getReader();
if (!reader) {
throw new Error('No response body');
}
const decoder = new TextDecoder();
const requestId = this.generateRequestId();
const created = Math.floor(Date.now() / 1000);
while (true) {
const { done, value } = await reader.read();
if (done) break;
const text = decoder.decode(value);
const lines = text.split('\n').filter((l) => l.trim());
for (const line of lines) {
try {
const data = JSON.parse(line) as IOllamaChatResponse;
// Convert to OpenAI streaming format
const chunk = {
id: requestId,
object: 'chat.completion.chunk',
created,
model: request.model,
choices: [
{
index: 0,
delta: {
content: data.message.content,
} as Partial<IChatMessage>,
finish_reason: data.done ? 'stop' : null,
},
],
};
onChunk(`data: ${JSON.stringify(chunk)}\n\n`);
if (data.done) {
onChunk('data: [DONE]\n\n');
}
} catch {
// Invalid JSON, skip
}
}
}
}
}
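For reference, the deleted adapter's streaming path converted Ollama responses into OpenAI-style SSE frames, a contract the remaining backends generally still follow. A condensed sketch of that framing, distilled from the chatCompletionStream code above; the helper name is hypothetical, not an identifier from the repository.

```ts
// Condensed from the deleted chatCompletionStream; toSseChunk is a hypothetical
// helper name, not an identifier from the repository.
function toSseChunk(
  requestId: string,
  model: string,
  created: number,
  content: string,
  done: boolean,
): string {
  const chunk = {
    id: requestId,
    object: 'chat.completion.chunk',
    created,
    model,
    choices: [{ index: 0, delta: { content }, finish_reason: done ? 'stop' : null }],
  };
  // Each frame is a `data:` line followed by a blank line; the stream is
  // terminated with a literal [DONE] frame.
  return `data: ${JSON.stringify(chunk)}\n\n` + (done ? 'data: [DONE]\n\n' : '');
}
```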
tgi.ts  +15 -9
@@ -4,15 +4,11 @@
* Manages HuggingFace Text Generation Inference containers.
*/
+ import type { IContainerConfig, ILoadedModel, TContainerType } from '../interfaces/container.ts';
- import type {
- IContainerConfig,
- ILoadedModel,
- TContainerType,
- } from '../interfaces/container.ts';
import type {
+ IChatCompletionChoice,
IChatCompletionRequest,
IChatCompletionResponse,
- IChatCompletionChoice,
IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
@@ -161,7 +157,9 @@ export class TgiContainer extends BaseContainer {
const info = await this.fetchJson<ITgiInfoResponse>('/info');
return [info.model_id];
} catch (error) {
- logger.warn(`Failed to get TGI info: ${error instanceof Error ? error.message : String(error)}`);
+ logger.warn(
+ `Failed to get TGI info: ${error instanceof Error ? error.message : String(error)}`,
+ );
return this.config.models || [];
}
}
@@ -232,7 +230,11 @@ export class TgiContainer extends BaseContainer {
temperature: request.temperature,
top_p: request.top_p,
max_new_tokens: request.max_tokens || 1024,
- stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
+ stop: Array.isArray(request.stop)
+ ? request.stop
+ : request.stop
+ ? [request.stop]
+ : undefined,
do_sample: (request.temperature || 0) > 0,
return_full_text: false,
},
@@ -288,7 +290,11 @@ export class TgiContainer extends BaseContainer {
temperature: request.temperature,
top_p: request.top_p,
max_new_tokens: request.max_tokens || 1024,
- stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
+ stop: Array.isArray(request.stop)
+ ? request.stop
+ : request.stop
+ ? [request.stop]
+ : undefined,
do_sample: (request.temperature || 0) > 0,
},
},
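Both TGI request builders above repeat the same stop normalization. Read as a standalone helper (an illustrative refactor, not part of the commit), the wrapped ternary maps OpenAI-style stop (a string, an array, or absent) to the string[] | undefined that TGI expects:

```ts
// Illustrative helper, equivalent to the wrapped ternary used in the diff above.
function normalizeStop(stop?: string | string[]): string[] | undefined {
  if (Array.isArray(stop)) return stop;
  return stop ? [stop] : undefined;
}

// normalizeStop('###')      -> ['###']
// normalizeStop(['a', 'b']) -> ['a', 'b']
// normalizeStop(undefined)  -> undefined
```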
vllm.ts  +30 -13
@@ -4,11 +4,7 @@
* Manages vLLM containers for high-performance LLM inference.
*/
- import type {
- IContainerConfig,
- ILoadedModel,
- TContainerType,
- } from '../interfaces/container.ts';
+ import type { IContainerConfig, ILoadedModel, TContainerType } from '../interfaces/container.ts';
import type {
IChatCompletionRequest,
IChatCompletionResponse,
@@ -72,20 +68,26 @@ export class VllmContainer extends BaseContainer {
gpuIds: string[],
options: Partial<IContainerConfig> = {},
): IContainerConfig {
// vLLM requires model to be specified at startup
- const command = [
- '--model', modelName,
- '--host', '0.0.0.0',
- '--port', String(options.port || CONTAINER_PORTS.VLLM),
+ const command = options.command ? [...options.command] : [
+ '--model',
+ modelName,
];
+ if (!command.includes('--host')) {
+ command.push('--host', '0.0.0.0');
+ }
+ if (!command.includes('--port')) {
+ command.push('--port', String(options.port || CONTAINER_PORTS.VLLM));
+ }
// Add tensor parallelism if multiple GPUs
- if (gpuIds.length > 1) {
+ if (gpuIds.length > 1 && !command.includes('--tensor-parallel-size')) {
command.push('--tensor-parallel-size', String(gpuIds.length));
}
// Add additional options
- if (options.env?.VLLM_MAX_MODEL_LEN) {
+ if (options.env?.VLLM_MAX_MODEL_LEN && !command.includes('--max-model-len')) {
command.push('--max-model-len', options.env.VLLM_MAX_MODEL_LEN);
}
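A usage sketch for the reworked createConfig. The parameters ahead of gpuIds (id, name, modelName) are inferred from the hunk above and from the removed Ollama variant's signature, so treat the call shape as an assumption; the model name and port values are illustrative.

```ts
// Sketch only: parameter order and values are assumptions based on the diff.
const config = VllmContainer.createConfig(
  'vllm-0',                             // container id (illustrative)
  'vllm-primary',                       // display name (illustrative)
  'mistralai/Mistral-7B-Instruct-v0.2', // model served at startup (illustrative)
  ['0', '1'],                           // two GPUs, so --tensor-parallel-size 2 is appended
  {
    // A caller-supplied command is now copied verbatim; --host and --port are
    // only pushed when those flags are not already present in it.
    command: ['--model', 'mistralai/Mistral-7B-Instruct-v0.2', '--port', '8000'],
  },
);
```

The two hunks that follow make listModels() and getLoadedModels() answer from config.models first, so a container that is not yet running can still report the model it is configured to serve.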
@@ -128,11 +130,17 @@ export class VllmContainer extends BaseContainer {
* vLLM serves a single model per instance
*/
public async listModels(): Promise<string[]> {
+ if (this.config.models.length > 0) {
+ return this.config.models;
+ }
try {
const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
return (data.data || []).map((m) => m.id);
} catch (error) {
- logger.warn(`Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`);
+ logger.warn(
+ `Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`,
+ );
return this.config.models || [];
}
}
@@ -141,6 +149,15 @@ export class VllmContainer extends BaseContainer {
* Get loaded models with details
*/
public async getLoadedModels(): Promise<ILoadedModel[]> {
+ if (this.config.models.length > 0) {
+ return this.config.models.map((name) => ({
+ name,
+ size: 0,
+ loaded: true,
+ requestCount: 0,
+ }));
+ }
try {
const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
return (data.data || []).map((m) => ({