/**
 * vLLM Container
 *
 * Manages vLLM containers for high-performance LLM inference.
 */

import type {
  IContainerConfig,
  ILoadedModel,
  TContainerType,
} from '../interfaces/container.ts';
import type {
  IChatCompletionRequest,
  IChatCompletionResponse,
  IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';

/**
 * vLLM model info response
 */
interface IVllmModelsResponse {
  object: 'list';
  data: Array<{
    id: string;
    object: 'model';
    created: number;
    owned_by: string;
  }>;
}

/**
 * vLLM container implementation
 *
 * vLLM serves a single model per instance and is optimized for:
 * - High throughput with PagedAttention
 * - Continuous batching
 * - OpenAI-compatible API
 */
export class VllmContainer extends BaseContainer {
  public readonly type: TContainerType = 'vllm';
  public readonly displayName = 'vLLM';
  public readonly defaultImage = CONTAINER_IMAGES.VLLM;
  public readonly defaultPort = CONTAINER_PORTS.VLLM;

  constructor(config: IContainerConfig) {
    super(config);

    // Set defaults if not provided
    if (!config.image) {
      config.image = this.defaultImage;
    }
    if (!config.port) {
      config.port = this.defaultPort;
    }

    // Add default volume for model cache
    if (!config.volumes || config.volumes.length === 0) {
      config.volumes = [`modelgrid-vllm-${config.id}:/root/.cache/huggingface`];
    }
  }

  /**
   * Create vLLM container configuration
   */
  public static createConfig(
    id: string,
    name: string,
    modelName: string,
    gpuIds: string[],
    options: Partial<IContainerConfig> = {},
  ): IContainerConfig {
    // vLLM requires the model to be specified at startup
    const command = [
      '--model',
      modelName,
      '--host',
      '0.0.0.0',
      '--port',
      String(options.port || CONTAINER_PORTS.VLLM),
    ];

    // Add tensor parallelism if multiple GPUs
    if (gpuIds.length > 1) {
      command.push('--tensor-parallel-size', String(gpuIds.length));
    }

    // Add additional options
    if (options.env?.VLLM_MAX_MODEL_LEN) {
      command.push('--max-model-len', options.env.VLLM_MAX_MODEL_LEN);
    }

    return {
      id,
      name,
      type: 'vllm',
      image: options.image || CONTAINER_IMAGES.VLLM,
      gpuIds,
      port: options.port || CONTAINER_PORTS.VLLM,
      externalPort: options.externalPort,
      models: [modelName],
      env: {
        HF_TOKEN: options.env?.HF_TOKEN || '',
        ...options.env,
      },
      volumes: options.volumes || [`modelgrid-vllm-${id}:/root/.cache/huggingface`],
      autoStart: options.autoStart ?? true,
      restartPolicy: options.restartPolicy || 'unless-stopped',
      memoryLimit: options.memoryLimit,
      cpuLimit: options.cpuLimit,
      command,
    };
  }

  /**
   * Check if vLLM is healthy
   */
  public async isHealthy(): Promise<boolean> {
    try {
      const response = await this.fetch('/health', { timeout: 5000 });
      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * List available models
   * vLLM serves a single model per instance
   */
  public async listModels(): Promise<string[]> {
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => m.id);
    } catch (error) {
      logger.warn(
        `Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`,
      );
      return this.config.models || [];
    }
  }

  /**
   * Get loaded models with details
   */
  public async getLoadedModels(): Promise<ILoadedModel[]> {
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => ({
        name: m.id,
        size: 0, // vLLM doesn't expose size
        loaded: true,
        requestCount: 0,
      }));
    } catch {
      // Return configured model as fallback
      return this.config.models.map((name) => ({
        name,
        size: 0,
        loaded: true,
        requestCount: 0,
      }));
    }
  }

  /**
   * Pull a model
   * vLLM downloads models automatically at startup.
   * This method is a no-op; models are configured at container creation.
   */
  public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
    logger.info(`vLLM downloads models at startup. Model: ${modelName}`);
    logger.info('To use a different model, create a new vLLM container.');
    if (onProgress) {
      onProgress({
        model: modelName,
        status: 'vLLM models are loaded at container startup',
        percent: 100,
      });
    }
    return true;
  }

  /**
   * Remove a model
   * vLLM serves a single model per instance
   */
  public async removeModel(modelName: string): Promise<boolean> {
    logger.info('vLLM serves a single model per instance.');
    logger.info(`To remove model ${modelName}, stop and remove this container.`);
    return true;
  }

  /**
   * Send a chat completion request
   * vLLM is OpenAI-compatible
   */
  public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
    return this.fetchJson<IChatCompletionResponse>('/v1/chat/completions', {
      method: 'POST',
      body: {
        ...request,
        stream: false,
      },
      timeout: 300000, // 5 minutes
    });
  }

  /**
   * Stream a chat completion request
   * vLLM is OpenAI-compatible
   */
  public async chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void> {
    const response = await this.fetch('/v1/chat/completions', {
      method: 'POST',
      body: {
        ...request,
        stream: true,
      },
      timeout: 300000,
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`HTTP ${response.status}: ${error}`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('No response body');
    }

    const decoder = new TextDecoder();
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      // Decode with stream: true so multi-byte characters split across chunks are not garbled
      const text = decoder.decode(value, { stream: true });
      // vLLM already sends data in SSE format
      onChunk(text);
    }
  }

  /**
   * Get vLLM-specific metrics
   */
  public async getMetrics(): Promise<Record<string, number>> {
    try {
      const response = await this.fetch('/metrics', { timeout: 5000 });
      if (response.ok) {
        const text = await response.text();

        // Parse Prometheus metrics
        const metrics: Record<string, number> = {};
        const lines = text.split('\n');
        for (const line of lines) {
          if (line.startsWith('#') || !line.trim()) continue;
          const match = line.match(/^(\w+)(?:\{[^}]*\})?\s+([\d.e+-]+)/);
          if (match) {
            metrics[match[1]] = parseFloat(match[2]);
          }
        }
        return metrics;
      }
    } catch {
      // Metrics endpoint may not be enabled
    }
    return {};
  }
}
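/*
 * Example usage (illustrative sketch only, not executed as part of this module).
 * The container id, name, model id, GPU ids, token, and request fields below are
 * placeholder assumptions; the request shape follows the OpenAI-compatible
 * interfaces imported above, and the resulting IContainerConfig is assumed to be
 * handed to the container runtime elsewhere in the application.
 *
 *   const config = VllmContainer.createConfig(
 *     'vllm-1',                            // container id (placeholder)
 *     'vllm-llama',                        // container name (placeholder)
 *     'meta-llama/Llama-3.1-8B-Instruct',  // Hugging Face model id (placeholder)
 *     ['0', '1'],                          // two GPUs -> --tensor-parallel-size 2
 *     { env: { HF_TOKEN: '<token>', VLLM_MAX_MODEL_LEN: '8192' } },
 *   );
 *   const vllm = new VllmContainer(config);
 *
 *   if (await vllm.isHealthy()) {
 *     const reply = await vllm.chatCompletion({
 *       model: config.models[0],
 *       messages: [{ role: 'user', content: 'Hello' }],
 *     });
 *   }
 */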