initial

ts/containers/vllm.ts (new file, 272 lines)

@@ -0,0 +1,272 @@
/**
 * vLLM Container
 *
 * Manages vLLM containers for high-performance LLM inference.
 */

import type {
  IContainerConfig,
  ILoadedModel,
  TContainerType,
} from '../interfaces/container.ts';
import type {
  IChatCompletionRequest,
  IChatCompletionResponse,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';

/**
 * vLLM model info response
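 *
 * A plausible `/v1/models` payload, following the OpenAI-compatible list
 * shape this interface models (the model id and timestamp are illustrative):
 *
 *   {
 *     "object": "list",
 *     "data": [
 *       { "id": "Qwen/Qwen2-7B-Instruct", "object": "model",
 *         "created": 1700000000, "owned_by": "vllm" }
 *     ]
 *   }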
 */
interface IVllmModelsResponse {
  object: 'list';
  data: Array<{
    id: string;
    object: 'model';
    created: number;
    owned_by: string;
  }>;
}

/**
 * vLLM container implementation
 *
 * vLLM serves a single model per instance and is optimized for:
 * - High throughput with PagedAttention
 * - Continuous batching
 * - OpenAI-compatible API
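 *
 * @example
 * // A usage sketch; the container id, name, model, and GPU ids are
 * // illustrative, not values mandated by this module.
 * const config = VllmContainer.createConfig(
 *   'vllm-1',
 *   'my-vllm',
 *   'Qwen/Qwen2-7B-Instruct',
 *   ['0'],
 * );
 * const vllm = new VllmContainer(config);
 * const healthy = await vllm.isHealthy();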
 */
export class VllmContainer extends BaseContainer {
  public readonly type: TContainerType = 'vllm';
  public readonly displayName = 'vLLM';
  public readonly defaultImage = CONTAINER_IMAGES.VLLM;
  public readonly defaultPort = CONTAINER_PORTS.VLLM;

  constructor(config: IContainerConfig) {
    super(config);

    // Set defaults if not provided
    if (!config.image) {
      config.image = this.defaultImage;
    }
    if (!config.port) {
      config.port = this.defaultPort;
    }

    // Add default volume for model cache
    if (!config.volumes || config.volumes.length === 0) {
      config.volumes = [`modelgrid-vllm-${config.id}:/root/.cache/huggingface`];
    }
  }

  /**
   * Create vLLM container configuration
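   *
   * @example
   * // For two GPUs, the generated vLLM arguments would look like
   * // (assuming CONTAINER_PORTS.VLLM is 8000; see constants.ts):
   * //   ['--model', 'Qwen/Qwen2-7B-Instruct', '--host', '0.0.0.0',
   * //    '--port', '8000', '--tensor-parallel-size', '2']
   * const config = VllmContainer.createConfig(
   *   'vllm-2', 'dual-gpu', 'Qwen/Qwen2-7B-Instruct', ['0', '1'],
   * );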
   */
  public static createConfig(
    id: string,
    name: string,
    modelName: string,
    gpuIds: string[],
    options: Partial<IContainerConfig> = {},
  ): IContainerConfig {
    // vLLM requires the model to be specified at startup
    const command = [
      '--model', modelName,
      '--host', '0.0.0.0',
      '--port', String(options.port || CONTAINER_PORTS.VLLM),
    ];

    // Add tensor parallelism if multiple GPUs are assigned
    if (gpuIds.length > 1) {
      command.push('--tensor-parallel-size', String(gpuIds.length));
    }

    // Add additional options
    if (options.env?.VLLM_MAX_MODEL_LEN) {
      command.push('--max-model-len', options.env.VLLM_MAX_MODEL_LEN);
    }

    return {
      id,
      name,
      type: 'vllm',
      image: options.image || CONTAINER_IMAGES.VLLM,
      gpuIds,
      port: options.port || CONTAINER_PORTS.VLLM,
      externalPort: options.externalPort,
      models: [modelName],
      env: {
        HF_TOKEN: options.env?.HF_TOKEN || '',
        ...options.env,
      },
      volumes: options.volumes || [`modelgrid-vllm-${id}:/root/.cache/huggingface`],
      autoStart: options.autoStart ?? true,
      restartPolicy: options.restartPolicy || 'unless-stopped',
      memoryLimit: options.memoryLimit,
      cpuLimit: options.cpuLimit,
      command,
    };
  }

  /**
   * Check if vLLM is healthy
   */
  public async isHealthy(): Promise<boolean> {
    try {
      const response = await this.fetch('/health', { timeout: 5000 });
      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * List available models
   * vLLM serves a single model per instance
   */
  public async listModels(): Promise<string[]> {
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => m.id);
    } catch (error) {
      logger.warn(`Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`);
      return this.config.models || [];
    }
  }

  /**
   * Get loaded models with details
   */
  public async getLoadedModels(): Promise<ILoadedModel[]> {
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => ({
        name: m.id,
        size: 0, // vLLM doesn't expose model size
        loaded: true,
        requestCount: 0,
      }));
    } catch {
      // Fall back to the configured model list (guarded, as in listModels)
      return (this.config.models || []).map((name) => ({
        name,
        size: 0,
        loaded: true,
        requestCount: 0,
      }));
    }
  }

  /**
   * Pull a model
   *
   * vLLM downloads models automatically at startup, so this method is a
   * no-op; models are configured at container creation.
   */
  public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
    logger.info(`vLLM downloads models at startup. Model: ${modelName}`);
    logger.info('To use a different model, create a new vLLM container.');

    if (onProgress) {
      onProgress({
        model: modelName,
        status: 'vLLM models are loaded at container startup',
        percent: 100,
      });
    }

    return true;
  }

  /**
   * Remove a model
   * vLLM serves a single model per instance
   */
  public async removeModel(modelName: string): Promise<boolean> {
    logger.info('vLLM serves a single model per instance.');
    logger.info(`To remove model ${modelName}, stop and remove this container.`);
    return true;
  }

  /**
   * Send a chat completion request
   * vLLM is OpenAI-compatible
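   *
   * @example
   * // A request sketch; the fields follow the OpenAI chat format that
   * // IChatCompletionRequest is assumed to model, and the model id must
   * // match the one this container serves.
   * const reply = await vllm.chatCompletion({
   *   model: 'Qwen/Qwen2-7B-Instruct',
   *   messages: [{ role: 'user', content: 'Hello!' }],
   * });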
   */
  public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
    return this.fetchJson<IChatCompletionResponse>('/v1/chat/completions', {
      method: 'POST',
      body: {
        ...request,
        stream: false,
      },
      timeout: 300000, // 5 minutes
    });
  }

  /**
   * Stream a chat completion request
   * vLLM is OpenAI-compatible
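   *
   * @example
   * // A consumption sketch: chunks arrive as raw SSE text ("data: {...}"
   * // lines), so callers that need parsed deltas must buffer and split.
   * let raw = '';
   * await vllm.chatCompletionStream(
   *   { model: 'Qwen/Qwen2-7B-Instruct', messages: [{ role: 'user', content: 'Hi' }] },
   *   (chunk) => { raw += chunk; },
   * );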
   */
  public async chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void> {
    const response = await this.fetch('/v1/chat/completions', {
      method: 'POST',
      body: {
        ...request,
        stream: true,
      },
      timeout: 300000,
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`HTTP ${response.status}: ${error}`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('No response body');
    }

    const decoder = new TextDecoder();

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      // Streaming decode, so multi-byte characters split across network
      // chunks are not mangled
      const text = decoder.decode(value, { stream: true });
      // vLLM already sends data in SSE format; chunks are forwarded as-is
      // and may split an SSE event across two onChunk calls
      onChunk(text);
    }
  }

  /**
   * Get vLLM-specific metrics
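   *
   * Sample `/metrics` lines in Prometheus exposition format (the names
   * follow vLLM's `vllm:` prefix convention; label and values are
   * illustrative):
   *
   *   vllm:num_requests_running{model_name="Qwen/Qwen2-7B-Instruct"} 2.0
   *   vllm:gpu_cache_usage_perc{model_name="Qwen/Qwen2-7B-Instruct"} 0.41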
   */
  public async getMetrics(): Promise<Record<string, unknown>> {
    try {
      const response = await this.fetch('/metrics', { timeout: 5000 });
      if (response.ok) {
        const text = await response.text();
        // Parse Prometheus exposition format into a flat name -> value map.
        // Note: vLLM metric names contain colons (e.g. vllm:num_requests_running),
        // so the name pattern allows ':' in addition to \w characters.
        const metrics: Record<string, unknown> = {};
        const lines = text.split('\n');
        for (const line of lines) {
          if (line.startsWith('#') || !line.trim()) continue;
          const match = line.match(/^([\w:]+)(?:\{[^}]*\})?\s+([\d.eE+-]+)/);
          if (match) {
            metrics[match[1]] = parseFloat(match[2]);
          }
        }
        return metrics;
      }
    } catch {
      // Metrics endpoint may not be enabled
    }
    return {};
  }
}