initial
ts/containers/base-container.ts (new file, 216 lines)
@@ -0,0 +1,216 @@
/**
 * Base Container
 *
 * Abstract base class for AI model containers.
 */

import type {
  IContainerConfig,
  IContainerStatus,
  ILoadedModel,
  TContainerType,
} from '../interfaces/container.ts';
import type { IChatCompletionRequest, IChatCompletionResponse } from '../interfaces/api.ts';
import { ContainerRuntime } from '../docker/container-runtime.ts';
import { logger } from '../logger.ts';

/**
 * Model pull progress callback
 */
export type TModelPullProgress = (progress: {
  model: string;
  status: string;
  percent?: number;
}) => void;

/**
 * Abstract base class for AI model containers
 */
export abstract class BaseContainer {
  /** Container type */
  public abstract readonly type: TContainerType;

  /** Display name */
  public abstract readonly displayName: string;

  /** Default Docker image */
  public abstract readonly defaultImage: string;

  /** Default internal port */
  public abstract readonly defaultPort: number;

  /** Container configuration */
  protected config: IContainerConfig;

  /** Container runtime */
  protected runtime: ContainerRuntime;

  constructor(config: IContainerConfig) {
    this.config = config;
    this.runtime = new ContainerRuntime();
  }

  /**
   * Get the container configuration
   */
  public getConfig(): IContainerConfig {
    return this.config;
  }

  /**
   * Get the endpoint URL for this container
   */
  public getEndpoint(): string {
    const port = this.config.externalPort || this.config.port;
    return `http://localhost:${port}`;
  }

  /**
   * Start the container
   */
  public async start(): Promise<boolean> {
    logger.info(`Starting ${this.displayName} container: ${this.config.name}`);
    return this.runtime.startContainer(this.config);
  }

  /**
   * Stop the container
   */
  public async stop(): Promise<boolean> {
    logger.info(`Stopping ${this.displayName} container: ${this.config.name}`);
    return this.runtime.stopContainer(this.config.id);
  }

  /**
   * Restart the container
   */
  public async restart(): Promise<boolean> {
    logger.info(`Restarting ${this.displayName} container: ${this.config.name}`);
    return this.runtime.restartContainer(this.config.id);
  }

  /**
   * Remove the container
   */
  public async remove(): Promise<boolean> {
    logger.info(`Removing ${this.displayName} container: ${this.config.name}`);
    return this.runtime.removeContainer(this.config.id);
  }

  /**
   * Get container status
   */
  public async getStatus(): Promise<IContainerStatus> {
    return this.runtime.getContainerStatus(this.config);
  }

  /**
   * Get container logs
   */
  public async getLogs(lines: number = 100): Promise<string> {
    return this.runtime.getLogs(this.config.id, { lines });
  }

  /**
   * Check if the container is healthy
   */
  public abstract isHealthy(): Promise<boolean>;

  /**
   * Get list of available models
   */
  public abstract listModels(): Promise<string[]>;

  /**
   * Get list of loaded models with details
   */
  public abstract getLoadedModels(): Promise<ILoadedModel[]>;

  /**
   * Pull a model
   */
  public abstract pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean>;

  /**
   * Remove a model
   */
  public abstract removeModel(modelName: string): Promise<boolean>;

  /**
   * Send a chat completion request
   */
  public abstract chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse>;

  /**
   * Stream a chat completion request
   */
  public abstract chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void>;

  /**
   * Make HTTP request to container
   */
  protected async fetch(
    path: string,
    options: {
      method?: string;
      headers?: Record<string, string>;
      body?: unknown;
      timeout?: number;
    } = {},
  ): Promise<Response> {
    const endpoint = this.getEndpoint();
    const url = `${endpoint}${path}`;

    const controller = new AbortController();
    const timeout = options.timeout || 30000;
    const timeoutId = setTimeout(() => controller.abort(), timeout);

    try {
      const response = await fetch(url, {
        method: options.method || 'GET',
        headers: {
          'Content-Type': 'application/json',
          ...options.headers,
        },
        body: options.body ? JSON.stringify(options.body) : undefined,
        signal: controller.signal,
      });

      return response;
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * Make HTTP request and parse JSON response
   */
  protected async fetchJson<T>(
    path: string,
    options: {
      method?: string;
      headers?: Record<string, string>;
      body?: unknown;
      timeout?: number;
    } = {},
  ): Promise<T> {
    const response = await this.fetch(path, options);

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`HTTP ${response.status}: ${errorText}`);
    }

    return response.json();
  }

  /**
   * Generate a unique request ID
   */
  protected generateRequestId(): string {
    return `chatcmpl-${Date.now().toString(36)}-${Math.random().toString(36).substring(2, 8)}`;
  }
}
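A minimal caller-side sketch (not part of this commit): the protected fetch/fetchJson helpers above wrap the global fetch with a JSON Content-Type, optional body serialization, and an AbortController-based timeout (30 s by default), while callers only see the public surface. Here `container` is assumed to be an instance of one of the concrete subclasses added later in this commit, and the model name is a placeholder.

const healthy = await container.isHealthy();
if (healthy) {
  const reply = await container.chatCompletion({
    model: 'llama3', // placeholder model name
    messages: [{ role: 'user', content: 'Hello' }],
    temperature: 0.7,
  });
  console.log(reply.choices[0]?.message.content);
}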
ts/containers/container-manager.ts (new file, 349 lines)
@@ -0,0 +1,349 @@
/**
 * Container Manager
 *
 * Orchestrates multiple AI model containers.
 */

import type {
  IContainerConfig,
  IContainerStatus,
  IContainerEndpoint,
  TContainerType,
} from '../interfaces/container.ts';
import { logger } from '../logger.ts';
import { DockerManager } from '../docker/docker-manager.ts';
import { BaseContainer } from './base-container.ts';
import { OllamaContainer } from './ollama.ts';
import { VllmContainer } from './vllm.ts';
import { TgiContainer } from './tgi.ts';

/**
 * Container Manager - orchestrates all containers
 */
export class ContainerManager {
  private containers: Map<string, BaseContainer>;
  private dockerManager: DockerManager;

  constructor() {
    this.containers = new Map();
    this.dockerManager = new DockerManager();
  }

  /**
   * Initialize container manager
   */
  public async initialize(): Promise<void> {
    // Ensure Docker is running
    if (!await this.dockerManager.isRunning()) {
      throw new Error('Docker is not running');
    }

    // Create network if it doesn't exist
    await this.dockerManager.createNetwork();
  }

  /**
   * Create a container instance from config
   */
  private createContainerInstance(config: IContainerConfig): BaseContainer {
    switch (config.type) {
      case 'ollama':
        return new OllamaContainer(config);
      case 'vllm':
        return new VllmContainer(config);
      case 'tgi':
        return new TgiContainer(config);
      default:
        throw new Error(`Unknown container type: ${config.type}`);
    }
  }

  /**
   * Add a container
   */
  public addContainer(config: IContainerConfig): BaseContainer {
    if (this.containers.has(config.id)) {
      throw new Error(`Container with ID ${config.id} already exists`);
    }

    const container = this.createContainerInstance(config);
    this.containers.set(config.id, container);
    return container;
  }

  /**
   * Remove a container
   */
  public async removeContainer(containerId: string): Promise<boolean> {
    const container = this.containers.get(containerId);
    if (!container) {
      return false;
    }

    await container.remove();
    this.containers.delete(containerId);
    return true;
  }

  /**
   * Get a container by ID
   */
  public getContainer(containerId: string): BaseContainer | undefined {
    return this.containers.get(containerId);
  }

  /**
   * Get all containers
   */
  public getAllContainers(): BaseContainer[] {
    return Array.from(this.containers.values());
  }

  /**
   * Load containers from configuration
   */
  public loadFromConfig(configs: IContainerConfig[]): void {
    this.containers.clear();
    for (const config of configs) {
      try {
        this.addContainer(config);
      } catch (error) {
        logger.warn(`Failed to load container ${config.id}: ${error instanceof Error ? error.message : String(error)}`);
      }
    }
  }

  /**
   * Start all containers
   */
  public async startAll(): Promise<Map<string, boolean>> {
    const results = new Map<string, boolean>();

    for (const [id, container] of this.containers) {
      if (!container.getConfig().autoStart) {
        continue;
      }

      try {
        const success = await container.start();
        results.set(id, success);
      } catch (error) {
        logger.error(`Failed to start container ${id}: ${error instanceof Error ? error.message : String(error)}`);
        results.set(id, false);
      }
    }

    return results;
  }

  /**
   * Stop all containers
   */
  public async stopAll(): Promise<Map<string, boolean>> {
    const results = new Map<string, boolean>();

    for (const [id, container] of this.containers) {
      try {
        const success = await container.stop();
        results.set(id, success);
      } catch (error) {
        logger.error(`Failed to stop container ${id}: ${error instanceof Error ? error.message : String(error)}`);
        results.set(id, false);
      }
    }

    return results;
  }

  /**
   * Get status of all containers
   */
  public async getAllStatus(): Promise<Map<string, IContainerStatus>> {
    const statuses = new Map<string, IContainerStatus>();

    for (const [id, container] of this.containers) {
      try {
        const status = await container.getStatus();
        statuses.set(id, status);
      } catch (error) {
        logger.warn(`Failed to get status for container ${id}: ${error instanceof Error ? error.message : String(error)}`);
      }
    }

    return statuses;
  }

  /**
   * Get available endpoints for a model
   */
  public async getEndpointsForModel(modelName: string): Promise<IContainerEndpoint[]> {
    const endpoints: IContainerEndpoint[] = [];

    for (const [_id, container] of this.containers) {
      try {
        const status = await container.getStatus();

        if (!status.running) {
          continue;
        }

        // Check if container has this model
        const models = await container.listModels();
        if (!models.includes(modelName)) {
          continue;
        }

        endpoints.push({
          containerId: container.getConfig().id,
          type: container.type,
          url: container.getEndpoint(),
          models,
          healthy: status.health === 'healthy',
          priority: 0, // Could be based on load
        });
      } catch {
        // Skip containers that fail to respond
      }
    }

    return endpoints;
  }

  /**
   * Find best container for a model
   */
  public async findContainerForModel(modelName: string): Promise<BaseContainer | null> {
    const endpoints = await this.getEndpointsForModel(modelName);

    // Filter to healthy endpoints
    const healthy = endpoints.filter((e) => e.healthy);
    if (healthy.length === 0) {
      return null;
    }

    // Return first healthy endpoint (could add load balancing)
    const endpoint = healthy[0];
    return this.containers.get(endpoint.containerId) || null;
  }

  /**
   * Get all available models across all containers
   */
  public async getAllAvailableModels(): Promise<Map<string, IContainerEndpoint[]>> {
    const modelMap = new Map<string, IContainerEndpoint[]>();

    for (const container of this.containers.values()) {
      try {
        const status = await container.getStatus();
        if (!status.running) continue;

        const models = await container.listModels();

        for (const model of models) {
          if (!modelMap.has(model)) {
            modelMap.set(model, []);
          }

          modelMap.get(model)!.push({
            containerId: container.getConfig().id,
            type: container.type,
            url: container.getEndpoint(),
            models,
            healthy: status.health === 'healthy',
            priority: 0,
          });
        }
      } catch {
        // Skip failed containers
      }
    }

    return modelMap;
  }

  /**
   * Pull a model to a specific container type
   */
  public async pullModel(
    modelName: string,
    containerType: TContainerType = 'ollama',
    containerId?: string,
  ): Promise<boolean> {
    // Find or create appropriate container
    let container: BaseContainer | undefined;

    if (containerId) {
      container = this.containers.get(containerId);
    } else {
      // Find first container of the specified type
      for (const c of this.containers.values()) {
        if (c.type === containerType) {
          container = c;
          break;
        }
      }
    }

    if (!container) {
      logger.error(`No ${containerType} container available to pull model`);
      return false;
    }

    return container.pullModel(modelName, (progress) => {
      const percent = progress.percent !== undefined ? ` (${progress.percent}%)` : '';
      logger.dim(`  ${progress.status}${percent}`);
    });
  }

  /**
   * Health check all containers
   */
  public async healthCheck(): Promise<Map<string, boolean>> {
    const results = new Map<string, boolean>();

    for (const [id, container] of this.containers) {
      try {
        const healthy = await container.isHealthy();
        results.set(id, healthy);
      } catch {
        results.set(id, false);
      }
    }

    return results;
  }

  /**
   * Print container status summary
   */
  public async printStatus(): Promise<void> {
    const statuses = await this.getAllStatus();

    if (statuses.size === 0) {
      logger.logBox('Containers', ['No containers configured'], 50, 'warning');
      return;
    }

    logger.logBoxTitle('Container Status', 70, 'info');

    for (const [id, status] of statuses) {
      const runningStr = status.running ? 'Running' : 'Stopped';
      const healthStr = status.health;
      const modelsStr = status.loadedModels.length > 0
        ? status.loadedModels.join(', ')
        : 'None';

      logger.logBoxLine(`${status.name} (${id})`);
      logger.logBoxLine(`  Type: ${status.type} | Status: ${runningStr} | Health: ${healthStr}`);
      logger.logBoxLine(`  Models: ${modelsStr}`);
      logger.logBoxLine(`  Endpoint: ${status.endpoint}`);

      if (status.gpuUtilization !== undefined) {
        logger.logBoxLine(`  GPU: ${status.gpuUtilization}% | Memory: ${status.memoryUsage || 0}MB`);
      }
      logger.logBoxLine('');
    }

    logger.logBoxEnd();
  }
}
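A short wiring sketch of how ContainerManager is meant to be driven (not part of this commit; the config values and model name are illustrative):

import { ContainerManager } from './container-manager.ts';
import { OllamaContainer } from './ollama.ts';

const manager = new ContainerManager();
await manager.initialize(); // throws if Docker is not running

manager.loadFromConfig([
  OllamaContainer.createConfig('ollama-0', 'ollama-main', ['0']),
]);
await manager.startAll(); // only containers with autoStart: true are started

const target = await manager.findContainerForModel('llama3'); // placeholder model name
if (target) {
  const reply = await target.chatCompletion({
    model: 'llama3',
    messages: [{ role: 'user', content: 'ping' }],
  });
  console.log(reply.choices[0]?.message.content);
}
await manager.printStatus();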
ts/containers/index.ts (new file, 11 lines)
@@ -0,0 +1,11 @@
/**
 * Container Management Module
 *
 * Exports all AI container implementations.
 */

export { BaseContainer } from './base-container.ts';
export { OllamaContainer } from './ollama.ts';
export { VllmContainer } from './vllm.ts';
export { TgiContainer } from './tgi.ts';
export { ContainerManager } from './container-manager.ts';
ts/containers/ollama.ts (new file, 387 lines)
@@ -0,0 +1,387 @@
/**
 * Ollama Container
 *
 * Manages Ollama containers for running local LLMs.
 */

import type {
  IContainerConfig,
  ILoadedModel,
  TContainerType,
} from '../interfaces/container.ts';
import type {
  IChatCompletionRequest,
  IChatCompletionResponse,
  IChatCompletionChoice,
  IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';

/**
 * Ollama API response types
 */
interface IOllamaTagsResponse {
  models: Array<{
    name: string;
    size: number;
    digest: string;
    modified_at: string;
  }>;
}

interface IOllamaChatRequest {
  model: string;
  messages: Array<{
    role: string;
    content: string;
  }>;
  stream?: boolean;
  options?: {
    temperature?: number;
    top_p?: number;
    num_predict?: number;
    stop?: string[];
  };
}

interface IOllamaChatResponse {
  model: string;
  created_at: string;
  message: {
    role: string;
    content: string;
  };
  done: boolean;
  total_duration?: number;
  load_duration?: number;
  prompt_eval_count?: number;
  eval_count?: number;
}

interface IOllamaPullResponse {
  status: string;
  digest?: string;
  total?: number;
  completed?: number;
}

/**
 * Ollama container implementation
 */
export class OllamaContainer extends BaseContainer {
  public readonly type: TContainerType = 'ollama';
  public readonly displayName = 'Ollama';
  public readonly defaultImage = CONTAINER_IMAGES.OLLAMA;
  public readonly defaultPort = CONTAINER_PORTS.OLLAMA;

  constructor(config: IContainerConfig) {
    super(config);

    // Set defaults if not provided
    if (!config.image) {
      config.image = this.defaultImage;
    }
    if (!config.port) {
      config.port = this.defaultPort;
    }

    // Add default volume for model storage
    if (!config.volumes || config.volumes.length === 0) {
      config.volumes = [`modelgrid-ollama-${config.id}:/root/.ollama`];
    }
  }

  /**
   * Create Ollama container configuration
   */
  public static createConfig(
    id: string,
    name: string,
    gpuIds: string[],
    options: Partial<IContainerConfig> = {},
  ): IContainerConfig {
    return {
      id,
      name,
      type: 'ollama',
      image: options.image || CONTAINER_IMAGES.OLLAMA,
      gpuIds,
      port: options.port || CONTAINER_PORTS.OLLAMA,
      externalPort: options.externalPort,
      models: options.models || [],
      env: options.env,
      volumes: options.volumes || [`modelgrid-ollama-${id}:/root/.ollama`],
      autoStart: options.autoStart ?? true,
      restartPolicy: options.restartPolicy || 'unless-stopped',
      memoryLimit: options.memoryLimit,
      cpuLimit: options.cpuLimit,
      command: options.command,
    };
  }

  /**
   * Check if Ollama is healthy
   */
  public async isHealthy(): Promise<boolean> {
    try {
      const response = await this.fetch('/api/tags', { timeout: 5000 });
      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * List available models
   */
  public async listModels(): Promise<string[]> {
    try {
      const data = await this.fetchJson<IOllamaTagsResponse>('/api/tags');
      return (data.models || []).map((m) => m.name);
    } catch (error) {
      logger.warn(`Failed to list Ollama models: ${error instanceof Error ? error.message : String(error)}`);
      return [];
    }
  }

  /**
   * Get loaded models with details
   */
  public async getLoadedModels(): Promise<ILoadedModel[]> {
    try {
      const data = await this.fetchJson<IOllamaTagsResponse>('/api/tags');
      return (data.models || []).map((m) => ({
        name: m.name,
        size: m.size,
        format: m.digest.substring(0, 12),
        loaded: true, // Ollama doesn't distinguish loaded vs available
        requestCount: 0,
      }));
    } catch {
      return [];
    }
  }

  /**
   * Pull a model
   */
  public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
    try {
      logger.info(`Pulling model: ${modelName}`);

      const response = await this.fetch('/api/pull', {
        method: 'POST',
        body: { name: modelName },
        timeout: 3600000, // 1 hour for large models
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }

      // Read streaming response
      const reader = response.body?.getReader();
      if (!reader) {
        throw new Error('No response body');
      }

      const decoder = new TextDecoder();
      let lastStatus = '';

      while (true) {
        const { done, value } = await reader.read();
        if (done) break;

        const text = decoder.decode(value);
        const lines = text.split('\n').filter((l) => l.trim());

        for (const line of lines) {
          try {
            const data = JSON.parse(line) as IOllamaPullResponse;
            const status = data.status;

            if (status !== lastStatus) {
              lastStatus = status;
              let percent: number | undefined;

              if (data.total && data.completed) {
                percent = Math.round((data.completed / data.total) * 100);
              }

              if (onProgress) {
                onProgress({ model: modelName, status, percent });
              } else {
                const progressStr = percent !== undefined ? ` (${percent}%)` : '';
                logger.dim(`  ${status}${progressStr}`);
              }
            }
          } catch {
            // Invalid JSON line, skip
          }
        }
      }

      logger.success(`Model ${modelName} pulled successfully`);
      return true;
    } catch (error) {
      logger.error(`Failed to pull model ${modelName}: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Remove a model
   */
  public async removeModel(modelName: string): Promise<boolean> {
    try {
      const response = await this.fetch('/api/delete', {
        method: 'DELETE',
        body: { name: modelName },
      });

      if (response.ok) {
        logger.success(`Model ${modelName} removed`);
        return true;
      }

      throw new Error(`HTTP ${response.status}`);
    } catch (error) {
      logger.error(`Failed to remove model ${modelName}: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Send a chat completion request
   */
  public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
    const ollamaRequest: IOllamaChatRequest = {
      model: request.model,
      messages: request.messages.map((m) => ({
        role: m.role,
        content: m.content,
      })),
      stream: false,
      options: {
        temperature: request.temperature,
        top_p: request.top_p,
        num_predict: request.max_tokens,
        stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
      },
    };

    const response = await this.fetchJson<IOllamaChatResponse>('/api/chat', {
      method: 'POST',
      body: ollamaRequest,
      timeout: 300000, // 5 minutes
    });

    // Convert to OpenAI format
    const created = Math.floor(Date.now() / 1000);

    const choice: IChatCompletionChoice = {
      index: 0,
      message: {
        role: 'assistant',
        content: response.message.content,
      },
      finish_reason: response.done ? 'stop' : null,
    };

    return {
      id: this.generateRequestId(),
      object: 'chat.completion',
      created,
      model: request.model,
      choices: [choice],
      usage: {
        prompt_tokens: response.prompt_eval_count || 0,
        completion_tokens: response.eval_count || 0,
        total_tokens: (response.prompt_eval_count || 0) + (response.eval_count || 0),
      },
    };
  }

  /**
   * Stream a chat completion request
   */
  public async chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void> {
    const ollamaRequest: IOllamaChatRequest = {
      model: request.model,
      messages: request.messages.map((m) => ({
        role: m.role,
        content: m.content,
      })),
      stream: true,
      options: {
        temperature: request.temperature,
        top_p: request.top_p,
        num_predict: request.max_tokens,
        stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
      },
    };

    const response = await this.fetch('/api/chat', {
      method: 'POST',
      body: ollamaRequest,
      timeout: 300000,
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('No response body');
    }

    const decoder = new TextDecoder();
    const requestId = this.generateRequestId();
    const created = Math.floor(Date.now() / 1000);

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      const text = decoder.decode(value);
      const lines = text.split('\n').filter((l) => l.trim());

      for (const line of lines) {
        try {
          const data = JSON.parse(line) as IOllamaChatResponse;

          // Convert to OpenAI streaming format
          const chunk = {
            id: requestId,
            object: 'chat.completion.chunk',
            created,
            model: request.model,
            choices: [
              {
                index: 0,
                delta: {
                  content: data.message.content,
                } as Partial<IChatMessage>,
                finish_reason: data.done ? 'stop' : null,
              },
            ],
          };

          onChunk(`data: ${JSON.stringify(chunk)}\n\n`);

          if (data.done) {
            onChunk('data: [DONE]\n\n');
          }
        } catch {
          // Invalid JSON, skip
        }
      }
    }
  }
}
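chatCompletionStream re-emits Ollama's newline-delimited JSON as OpenAI-style SSE lines ("data: {...}\n\n", terminated by "data: [DONE]\n\n"). A consumer sketch (not part of this commit), assuming `ollama` is a started OllamaContainer and the model name is a placeholder:

await ollama.pullModel('llama3'); // logs progress via logger.dim when no callback is given

let answer = '';
await ollama.chatCompletionStream(
  { model: 'llama3', messages: [{ role: 'user', content: 'Write a haiku' }] },
  (chunk) => {
    for (const line of chunk.split('\n')) {
      if (!line.startsWith('data: ') || line === 'data: [DONE]') continue;
      const parsed = JSON.parse(line.slice('data: '.length));
      answer += parsed.choices[0]?.delta?.content ?? '';
    }
  },
);
console.log(answer);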
ts/containers/tgi.ts (new file, 417 lines)
@@ -0,0 +1,417 @@
/**
 * TGI Container (Text Generation Inference)
 *
 * Manages HuggingFace Text Generation Inference containers.
 */

import type {
  IContainerConfig,
  ILoadedModel,
  TContainerType,
} from '../interfaces/container.ts';
import type {
  IChatCompletionRequest,
  IChatCompletionResponse,
  IChatCompletionChoice,
  IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';

/**
 * TGI info response
 */
interface ITgiInfoResponse {
  model_id: string;
  model_sha: string;
  model_dtype: string;
  model_device_type: string;
  max_concurrent_requests: number;
  max_best_of: number;
  max_stop_sequences: number;
  max_input_length: number;
  max_total_tokens: number;
  version: string;
}

/**
 * TGI generate request
 */
interface ITgiGenerateRequest {
  inputs: string;
  parameters?: {
    temperature?: number;
    top_p?: number;
    max_new_tokens?: number;
    stop?: string[];
    do_sample?: boolean;
    return_full_text?: boolean;
  };
}

/**
 * TGI generate response
 */
interface ITgiGenerateResponse {
  generated_text: string;
  details?: {
    finish_reason: string;
    generated_tokens: number;
    seed?: number;
  };
}

/**
 * TGI container implementation
 *
 * TGI is optimized for:
 * - Production deployments
 * - Flash Attention support
 * - Quantization (bitsandbytes, GPTQ, AWQ)
 * - Multiple GPU support with tensor parallelism
 */
export class TgiContainer extends BaseContainer {
  public readonly type: TContainerType = 'tgi';
  public readonly displayName = 'TGI';
  public readonly defaultImage = CONTAINER_IMAGES.TGI;
  public readonly defaultPort = CONTAINER_PORTS.TGI;

  constructor(config: IContainerConfig) {
    super(config);

    // Set defaults if not provided
    if (!config.image) {
      config.image = this.defaultImage;
    }
    if (!config.port) {
      config.port = this.defaultPort;
    }

    // Add default volume for model cache
    if (!config.volumes || config.volumes.length === 0) {
      config.volumes = [`modelgrid-tgi-${config.id}:/data`];
    }
  }

  /**
   * Create TGI container configuration
   */
  public static createConfig(
    id: string,
    name: string,
    modelName: string,
    gpuIds: string[],
    options: Partial<IContainerConfig> = {},
  ): IContainerConfig {
    const env: Record<string, string> = {
      MODEL_ID: modelName,
      PORT: String(options.port || CONTAINER_PORTS.TGI),
      HUGGING_FACE_HUB_TOKEN: options.env?.HF_TOKEN || options.env?.HUGGING_FACE_HUB_TOKEN || '',
      ...options.env,
    };

    // Add GPU configuration
    if (gpuIds.length > 1) {
      env.NUM_SHARD = String(gpuIds.length);
    }

    // Add quantization if specified
    if (options.env?.QUANTIZE) {
      env.QUANTIZE = options.env.QUANTIZE;
    }

    return {
      id,
      name,
      type: 'tgi',
      image: options.image || CONTAINER_IMAGES.TGI,
      gpuIds,
      port: options.port || CONTAINER_PORTS.TGI,
      externalPort: options.externalPort,
      models: [modelName],
      env,
      volumes: options.volumes || [`modelgrid-tgi-${id}:/data`],
      autoStart: options.autoStart ?? true,
      restartPolicy: options.restartPolicy || 'unless-stopped',
      memoryLimit: options.memoryLimit,
      cpuLimit: options.cpuLimit,
      command: options.command,
    };
  }

  /**
   * Check if TGI is healthy
   */
  public async isHealthy(): Promise<boolean> {
    try {
      const response = await this.fetch('/health', { timeout: 5000 });
      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * List available models
   * TGI serves a single model per instance
   */
  public async listModels(): Promise<string[]> {
    try {
      const info = await this.fetchJson<ITgiInfoResponse>('/info');
      return [info.model_id];
    } catch (error) {
      logger.warn(`Failed to get TGI info: ${error instanceof Error ? error.message : String(error)}`);
      return this.config.models || [];
    }
  }

  /**
   * Get loaded models with details
   */
  public async getLoadedModels(): Promise<ILoadedModel[]> {
    try {
      const info = await this.fetchJson<ITgiInfoResponse>('/info');
      return [{
        name: info.model_id,
        size: 0, // TGI doesn't expose model size
        format: info.model_dtype,
        loaded: true,
        requestCount: 0,
      }];
    } catch {
      return this.config.models.map((name) => ({
        name,
        size: 0,
        loaded: true,
        requestCount: 0,
      }));
    }
  }

  /**
   * Pull a model
   * TGI downloads models automatically at startup
   */
  public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
    logger.info(`TGI downloads models at startup. Model: ${modelName}`);
    logger.info('To use a different model, create a new TGI container.');

    if (onProgress) {
      onProgress({
        model: modelName,
        status: 'TGI models are loaded at container startup',
        percent: 100,
      });
    }

    return true;
  }

  /**
   * Remove a model
   * TGI serves a single model per instance
   */
  public async removeModel(modelName: string): Promise<boolean> {
    logger.info(`TGI serves a single model per instance.`);
    logger.info(`To remove model ${modelName}, stop and remove this container.`);
    return true;
  }

  /**
   * Send a chat completion request
   * Convert OpenAI format to TGI format
   */
  public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
    // Convert messages to TGI prompt format
    const prompt = this.messagesToPrompt(request.messages);

    const tgiRequest: ITgiGenerateRequest = {
      inputs: prompt,
      parameters: {
        temperature: request.temperature,
        top_p: request.top_p,
        max_new_tokens: request.max_tokens || 1024,
        stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
        do_sample: (request.temperature || 0) > 0,
        return_full_text: false,
      },
    };

    const response = await this.fetchJson<ITgiGenerateResponse>('/generate', {
      method: 'POST',
      body: tgiRequest,
      timeout: 300000, // 5 minutes
    });

    // Convert to OpenAI format
    const created = Math.floor(Date.now() / 1000);

    const choice: IChatCompletionChoice = {
      index: 0,
      message: {
        role: 'assistant',
        content: response.generated_text,
      },
      finish_reason: response.details?.finish_reason === 'eos_token' ? 'stop' : 'length',
    };

    return {
      id: this.generateRequestId(),
      object: 'chat.completion',
      created,
      model: this.config.models[0] || 'unknown',
      choices: [choice],
      usage: {
        prompt_tokens: 0, // TGI doesn't always report this
        completion_tokens: response.details?.generated_tokens || 0,
        total_tokens: response.details?.generated_tokens || 0,
      },
    };
  }

  /**
   * Stream a chat completion request
   */
  public async chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void> {
    // Convert messages to TGI prompt format
    const prompt = this.messagesToPrompt(request.messages);

    const response = await this.fetch('/generate_stream', {
      method: 'POST',
      body: {
        inputs: prompt,
        parameters: {
          temperature: request.temperature,
          top_p: request.top_p,
          max_new_tokens: request.max_tokens || 1024,
          stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
          do_sample: (request.temperature || 0) > 0,
        },
      },
      timeout: 300000,
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`HTTP ${response.status}: ${error}`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('No response body');
    }

    const decoder = new TextDecoder();
    const requestId = this.generateRequestId();
    const created = Math.floor(Date.now() / 1000);
    const model = this.config.models[0] || 'unknown';

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      const text = decoder.decode(value);
      const lines = text.split('\n').filter((l) => l.startsWith('data:'));

      for (const line of lines) {
        try {
          const jsonStr = line.substring(5).trim();
          if (jsonStr === '[DONE]') {
            onChunk('data: [DONE]\n\n');
            continue;
          }

          const data = JSON.parse(jsonStr);

          // Convert to OpenAI streaming format
          const chunk = {
            id: requestId,
            object: 'chat.completion.chunk',
            created,
            model,
            choices: [
              {
                index: 0,
                delta: {
                  content: data.token?.text || '',
                } as Partial<IChatMessage>,
                finish_reason: data.details?.finish_reason ? 'stop' : null,
              },
            ],
          };

          onChunk(`data: ${JSON.stringify(chunk)}\n\n`);
        } catch {
          // Invalid JSON, skip
        }
      }
    }
  }

  /**
   * Convert chat messages to TGI prompt format
   */
  private messagesToPrompt(messages: IChatMessage[]): string {
    // Use a simple chat template
    // TGI can use model-specific templates via the Messages API
    let prompt = '';

    for (const message of messages) {
      switch (message.role) {
        case 'system':
          prompt += `System: ${message.content}\n\n`;
          break;
        case 'user':
          prompt += `User: ${message.content}\n\n`;
          break;
        case 'assistant':
          prompt += `Assistant: ${message.content}\n\n`;
          break;
      }
    }

    prompt += 'Assistant:';
    return prompt;
  }

  /**
   * Get TGI server info
   */
  public async getInfo(): Promise<ITgiInfoResponse | null> {
    try {
      return await this.fetchJson<ITgiInfoResponse>('/info');
    } catch {
      return null;
    }
  }

  /**
   * Get TGI metrics
   */
  public async getMetrics(): Promise<Record<string, unknown>> {
    try {
      const response = await this.fetch('/metrics', { timeout: 5000 });
      if (response.ok) {
        const text = await response.text();
        // Parse Prometheus metrics
        const metrics: Record<string, unknown> = {};
        const lines = text.split('\n');
        for (const line of lines) {
          if (line.startsWith('#') || !line.trim()) continue;
          const match = line.match(/^(\w+)(?:\{[^}]*\})?\s+([\d.e+-]+)/);
          if (match) {
            metrics[match[1]] = parseFloat(match[2]);
          }
        }
        return metrics;
      }
    } catch {
      // Metrics endpoint may not be available
    }
    return {};
  }
}
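TgiContainer.createConfig drives TGI entirely through environment variables (MODEL_ID, PORT, NUM_SHARD for multi-GPU sharding, optional QUANTIZE), while chat requests are flattened into a plain "System:/User:/Assistant:" transcript by messagesToPrompt. A configuration sketch (not part of this commit); the model id and token are placeholders:

import { TgiContainer } from './tgi.ts';

const tgiConfig = TgiContainer.createConfig(
  'tgi-0',
  'tgi-mistral',
  'mistralai/Mistral-7B-Instruct-v0.2', // placeholder MODEL_ID
  ['0', '1'],                           // two GPUs, so NUM_SHARD = '2'
  { env: { HF_TOKEN: '<hf-token>', QUANTIZE: 'awq' } },
);
// tgiConfig.env now carries MODEL_ID, PORT, HUGGING_FACE_HUB_TOKEN, NUM_SHARD and QUANTIZE,
// and tgiConfig.models is ['mistralai/Mistral-7B-Instruct-v0.2'].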
ts/containers/vllm.ts (new file, 272 lines)
@@ -0,0 +1,272 @@
/**
 * vLLM Container
 *
 * Manages vLLM containers for high-performance LLM inference.
 */

import type {
  IContainerConfig,
  ILoadedModel,
  TContainerType,
} from '../interfaces/container.ts';
import type {
  IChatCompletionRequest,
  IChatCompletionResponse,
  IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';

/**
 * vLLM model info response
 */
interface IVllmModelsResponse {
  object: 'list';
  data: Array<{
    id: string;
    object: 'model';
    created: number;
    owned_by: string;
  }>;
}

/**
 * vLLM container implementation
 *
 * vLLM serves a single model per instance and is optimized for:
 * - High throughput with PagedAttention
 * - Continuous batching
 * - OpenAI-compatible API
 */
export class VllmContainer extends BaseContainer {
  public readonly type: TContainerType = 'vllm';
  public readonly displayName = 'vLLM';
  public readonly defaultImage = CONTAINER_IMAGES.VLLM;
  public readonly defaultPort = CONTAINER_PORTS.VLLM;

  constructor(config: IContainerConfig) {
    super(config);

    // Set defaults if not provided
    if (!config.image) {
      config.image = this.defaultImage;
    }
    if (!config.port) {
      config.port = this.defaultPort;
    }

    // Add default volume for model cache
    if (!config.volumes || config.volumes.length === 0) {
      config.volumes = [`modelgrid-vllm-${config.id}:/root/.cache/huggingface`];
    }
  }

  /**
   * Create vLLM container configuration
   */
  public static createConfig(
    id: string,
    name: string,
    modelName: string,
    gpuIds: string[],
    options: Partial<IContainerConfig> = {},
  ): IContainerConfig {
    // vLLM requires model to be specified at startup
    const command = [
      '--model', modelName,
      '--host', '0.0.0.0',
      '--port', String(options.port || CONTAINER_PORTS.VLLM),
    ];

    // Add tensor parallelism if multiple GPUs
    if (gpuIds.length > 1) {
      command.push('--tensor-parallel-size', String(gpuIds.length));
    }

    // Add additional options
    if (options.env?.VLLM_MAX_MODEL_LEN) {
      command.push('--max-model-len', options.env.VLLM_MAX_MODEL_LEN);
    }

    return {
      id,
      name,
      type: 'vllm',
      image: options.image || CONTAINER_IMAGES.VLLM,
      gpuIds,
      port: options.port || CONTAINER_PORTS.VLLM,
      externalPort: options.externalPort,
      models: [modelName],
      env: {
        HF_TOKEN: options.env?.HF_TOKEN || '',
        ...options.env,
      },
      volumes: options.volumes || [`modelgrid-vllm-${id}:/root/.cache/huggingface`],
      autoStart: options.autoStart ?? true,
      restartPolicy: options.restartPolicy || 'unless-stopped',
      memoryLimit: options.memoryLimit,
      cpuLimit: options.cpuLimit,
      command,
    };
  }

  /**
   * Check if vLLM is healthy
   */
  public async isHealthy(): Promise<boolean> {
    try {
      const response = await this.fetch('/health', { timeout: 5000 });
      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * List available models
   * vLLM serves a single model per instance
   */
  public async listModels(): Promise<string[]> {
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => m.id);
    } catch (error) {
      logger.warn(`Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`);
      return this.config.models || [];
    }
  }

  /**
   * Get loaded models with details
   */
  public async getLoadedModels(): Promise<ILoadedModel[]> {
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => ({
        name: m.id,
        size: 0, // vLLM doesn't expose size
        loaded: true,
        requestCount: 0,
      }));
    } catch {
      // Return configured model as fallback
      return this.config.models.map((name) => ({
        name,
        size: 0,
        loaded: true,
        requestCount: 0,
      }));
    }
  }

  /**
   * Pull a model
   * vLLM downloads models automatically at startup
   * This method is a no-op - models are configured at container creation
   */
  public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
    logger.info(`vLLM downloads models at startup. Model: ${modelName}`);
    logger.info('To use a different model, create a new vLLM container.');

    if (onProgress) {
      onProgress({
        model: modelName,
        status: 'vLLM models are loaded at container startup',
        percent: 100,
      });
    }

    return true;
  }

  /**
   * Remove a model
   * vLLM serves a single model per instance
   */
  public async removeModel(modelName: string): Promise<boolean> {
    logger.info(`vLLM serves a single model per instance.`);
    logger.info(`To remove model ${modelName}, stop and remove this container.`);
    return true;
  }

  /**
   * Send a chat completion request
   * vLLM is OpenAI-compatible
   */
  public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
    return this.fetchJson<IChatCompletionResponse>('/v1/chat/completions', {
      method: 'POST',
      body: {
        ...request,
        stream: false,
      },
      timeout: 300000, // 5 minutes
    });
  }

  /**
   * Stream a chat completion request
   * vLLM is OpenAI-compatible
   */
  public async chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void> {
    const response = await this.fetch('/v1/chat/completions', {
      method: 'POST',
      body: {
        ...request,
        stream: true,
      },
      timeout: 300000,
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`HTTP ${response.status}: ${error}`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('No response body');
    }

    const decoder = new TextDecoder();

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      const text = decoder.decode(value);
      // vLLM already sends data in SSE format
      onChunk(text);
    }
  }

  /**
   * Get vLLM-specific metrics
   */
  public async getMetrics(): Promise<Record<string, unknown>> {
    try {
      const response = await this.fetch('/metrics', { timeout: 5000 });
      if (response.ok) {
        const text = await response.text();
        // Parse Prometheus metrics
        const metrics: Record<string, unknown> = {};
        const lines = text.split('\n');
        for (const line of lines) {
          if (line.startsWith('#') || !line.trim()) continue;
          const match = line.match(/^(\w+)(?:\{[^}]*\})?\s+([\d.e+-]+)/);
          if (match) {
            metrics[match[1]] = parseFloat(match[2]);
          }
        }
        return metrics;
      }
    } catch {
      // Metrics endpoint may not be enabled
    }
    return {};
  }
}
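Unlike TGI, the vLLM container is configured through CLI flags that createConfig assembles into command (--model, --host, --port, plus --tensor-parallel-size when more than one GPU is given and --max-model-len when VLLM_MAX_MODEL_LEN is set). A sketch (not part of this commit); the model id is a placeholder:

import { VllmContainer } from './vllm.ts';

const vllmConfig = VllmContainer.createConfig(
  'vllm-0',
  'vllm-qwen',
  'Qwen/Qwen2-7B-Instruct', // placeholder model id
  ['0', '1'],
  { env: { VLLM_MAX_MODEL_LEN: '8192' } },
);
// vllmConfig.command holds the server flags:
// --model Qwen/Qwen2-7B-Instruct --host 0.0.0.0 --port <CONTAINER_PORTS.VLLM>
// --tensor-parallel-size 2 --max-model-len 8192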