/**
 * vLLM Container
 *
 * Manages vLLM containers for high-performance LLM inference.
 */

import type {
  IContainerConfig,
  ILoadedModel,
  TContainerType,
} from '../interfaces/container.ts';
import type {
  IChatCompletionRequest,
  IChatCompletionResponse,
  IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';

/**
 * vLLM model info response
 */
interface IVllmModelsResponse {
  object: 'list';
  data: Array<{
    id: string;
    object: 'model';
    created: number;
    owned_by: string;
  }>;
}

/**
 * vLLM container implementation
 *
 * vLLM serves a single model per instance and is optimized for:
 * - High throughput with PagedAttention
 * - Continuous batching
 * - OpenAI-compatible API
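 *
 * @example Illustrative lifecycle
 * A minimal sketch only; `config` would come from {@link VllmContainer.createConfig}
 * and the surrounding orchestration (actually starting the Docker container) is omitted.
 * ```ts
 * const container = new VllmContainer(config);
 * if (await container.isHealthy()) {
 *   console.log('Serving:', await container.listModels());
 * }
 * ```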
 */
export class VllmContainer extends BaseContainer {
  public readonly type: TContainerType = 'vllm';
  public readonly displayName = 'vLLM';
  public readonly defaultImage = CONTAINER_IMAGES.VLLM;
  public readonly defaultPort = CONTAINER_PORTS.VLLM;

  constructor(config: IContainerConfig) {
    super(config);

    // Set defaults if not provided
    if (!config.image) {
      config.image = this.defaultImage;
    }
    if (!config.port) {
      config.port = this.defaultPort;
    }

    // Add default volume for model cache
    if (!config.volumes || config.volumes.length === 0) {
      config.volumes = [`modelgrid-vllm-${config.id}:/root/.cache/huggingface`];
    }
  }

  /**
   * Create vLLM container configuration
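   *
   * @example Illustrative configuration
   * A sketch only; the id, name, model, token, and GPU indices below are
   * placeholders. Two GPU ids enable tensor parallelism, and
   * `VLLM_MAX_MODEL_LEN` is only applied if the caller sets it in `env`.
   * ```ts
   * const config = VllmContainer.createConfig(
   *   'vllm-70b',
   *   'llama-70b-vllm',
   *   'meta-llama/Llama-3.1-70B-Instruct',
   *   ['0', '1'], // two GPUs -> --tensor-parallel-size 2
   *   { env: { HF_TOKEN: '<hf token>', VLLM_MAX_MODEL_LEN: '8192' } },
   * );
   * ```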
   */
  public static createConfig(
    id: string,
    name: string,
    modelName: string,
    gpuIds: string[],
    options: Partial<IContainerConfig> = {},
  ): IContainerConfig {
    // vLLM requires the model to be specified at startup
    const command = [
      '--model', modelName,
      '--host', '0.0.0.0',
      '--port', String(options.port || CONTAINER_PORTS.VLLM),
    ];

    // Add tensor parallelism if multiple GPUs
    if (gpuIds.length > 1) {
      command.push('--tensor-parallel-size', String(gpuIds.length));
    }

    // Add additional options
    if (options.env?.VLLM_MAX_MODEL_LEN) {
      command.push('--max-model-len', options.env.VLLM_MAX_MODEL_LEN);
    }

    return {
      id,
      name,
      type: 'vllm',
      image: options.image || CONTAINER_IMAGES.VLLM,
      gpuIds,
      port: options.port || CONTAINER_PORTS.VLLM,
      externalPort: options.externalPort,
      models: [modelName],
      env: {
        HF_TOKEN: options.env?.HF_TOKEN || '',
        ...options.env,
      },
      volumes: options.volumes || [`modelgrid-vllm-${id}:/root/.cache/huggingface`],
      autoStart: options.autoStart ?? true,
      restartPolicy: options.restartPolicy || 'unless-stopped',
      memoryLimit: options.memoryLimit,
      cpuLimit: options.cpuLimit,
      command,
    };
  }
  /**
   * Check if vLLM is healthy
   */
  public async isHealthy(): Promise<boolean> {
    try {
      const response = await this.fetch('/health', { timeout: 5000 });
      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * List available models
   * vLLM serves a single model per instance
   */
  public async listModels(): Promise<string[]> {
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => m.id);
    } catch (error) {
      logger.warn(
        `Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`,
      );
      return this.config.models || [];
    }
  }

  /**
   * Get loaded models with details
   */
  public async getLoadedModels(): Promise<ILoadedModel[]> {
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => ({
        name: m.id,
        size: 0, // vLLM doesn't expose size
        loaded: true,
        requestCount: 0,
      }));
    } catch {
      // Return configured model as fallback
      return this.config.models.map((name) => ({
        name,
        size: 0,
        loaded: true,
        requestCount: 0,
      }));
    }
  }
  /**
   * Pull a model
   *
   * vLLM downloads its model automatically at startup, so this method is a
   * no-op; the model is fixed when the container is created.
   */
  public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
    logger.info(`vLLM downloads models at startup. Model: ${modelName}`);
    logger.info('To use a different model, create a new vLLM container.');
    if (onProgress) {
      onProgress({
        model: modelName,
        status: 'vLLM models are loaded at container startup',
        percent: 100,
      });
    }
    return true;
  }

  /**
   * Remove a model
   * vLLM serves a single model per instance
   */
  public async removeModel(modelName: string): Promise<boolean> {
    logger.info('vLLM serves a single model per instance.');
    logger.info(`To remove model ${modelName}, stop and remove this container.`);
    return true;
  }
  /**
   * Send a chat completion request
   * vLLM is OpenAI-compatible
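   *
   * @example Illustrative request
   * A sketch assuming `IChatCompletionRequest` and `IChatCompletionResponse`
   * mirror the OpenAI chat schema (`model`, `messages`, `choices`); the model
   * name is a placeholder.
   * ```ts
   * const response = await container.chatCompletion({
   *   model: 'meta-llama/Llama-3.1-8B-Instruct',
   *   messages: [{ role: 'user', content: 'Hello!' }],
   * });
   * console.log(response.choices[0]?.message?.content);
   * ```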
   */
  public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
    return this.fetchJson<IChatCompletionResponse>('/v1/chat/completions', {
      method: 'POST',
      body: {
        ...request,
        stream: false,
      },
      timeout: 300000, // 5 minutes
    });
  }

  /**
   * Stream a chat completion request
   * vLLM is OpenAI-compatible
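   *
   * @example Illustrative streaming consumer
   * Chunks are forwarded verbatim as SSE text (`data: {...}` lines, ending with
   * `data: [DONE]`). The callback below is one possible way to extract the
   * incremental tokens; field names assume the OpenAI streaming schema, and a
   * robust consumer would buffer lines split across chunk boundaries.
   * ```ts
   * let text = '';
   * await container.chatCompletionStream(request, (chunk) => {
   *   for (const line of chunk.split('\n')) {
   *     if (!line.startsWith('data: ') || line.startsWith('data: [DONE]')) continue;
   *     text += JSON.parse(line.slice('data: '.length)).choices?.[0]?.delta?.content ?? '';
   *   }
   * });
   * ```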
   */
  public async chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void> {
    const response = await this.fetch('/v1/chat/completions', {
      method: 'POST',
      body: {
        ...request,
        stream: true,
      },
      timeout: 300000,
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`HTTP ${response.status}: ${error}`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('No response body');
    }

    const decoder = new TextDecoder();
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      // Stream-decode so multi-byte characters split across chunks are handled
      const text = decoder.decode(value, { stream: true });
      // vLLM already sends data in SSE format
      onChunk(text);
    }
  }
  /**
   * Get vLLM-specific metrics
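   *
   * @example Illustrative result
   * Metric names and values below are examples only and vary by vLLM version;
   * a scrape line such as `vllm:num_requests_running{model_name="..."} 2`
   * is flattened into a name-to-value map:
   * ```ts
   * const metrics = await container.getMetrics();
   * // e.g. { "vllm:num_requests_running": 2, "vllm:gpu_cache_usage_perc": 0.41 }
   * ```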
   */
  public async getMetrics(): Promise<Record<string, unknown>> {
    try {
      const response = await this.fetch('/metrics', { timeout: 5000 });
      if (response.ok) {
        const text = await response.text();

        // Parse Prometheus metrics
        const metrics: Record<string, unknown> = {};
        const lines = text.split('\n');
        for (const line of lines) {
          if (line.startsWith('#') || !line.trim()) continue;
          // Prometheus metric names may contain colons (e.g. vLLM's "vllm:" prefix)
          const match = line.match(/^([\w:]+)(?:\{[^}]*\})?\s+([-+\d.eE]+)/);
          if (match) {
            metrics[match[1]] = parseFloat(match[2]);
          }
        }
        return metrics;
      }
    } catch {
      // Metrics endpoint may not be enabled
    }
    return {};
  }
}