/**
 * Model loader for vLLM deployments.
 */
import type { IModelCatalogEntry } from '../interfaces/catalog.ts';
|
|
import type { IGpuInfo } from '../interfaces/gpu.ts';
|
|
import { filterOutUsedGpus, selectPlacementForModel } from '../cluster/placement.ts';
|
|
import { VllmContainer } from '../containers/vllm.ts';
|
|
import { logger } from '../logger.ts';
|
|
import { ModelRegistry } from './registry.ts';
|
|
import { ContainerManager } from '../containers/container-manager.ts';
|
|
import { GpuDetector } from '../hardware/gpu-detector.ts';
|
|
|
|
/**
 * Outcome of a single model load / deployment attempt.
 */
export interface IModelLoadResult {
  // True when the model is being served (whether newly started or pre-existing).
  success: boolean;
  // Canonical model id when the registry resolves it, otherwise the requested name.
  model: string;
  // Id of the container serving the model, when one exists.
  container?: string;
  // Human-readable failure reason; only set when success is false.
  error?: string;
  // True when an existing container already served the model and was reused.
  alreadyLoaded?: boolean;
}
|
|
|
|
/**
 * Options controlling how a model is loaded.
 */
export interface IModelLoadOptions {
  // When true, start a new replica even if a container already serves the model.
  forceNewReplica?: boolean;
  // Explicit replica index; defaults to the number of replicas already running.
  replicaOrdinal?: number;
}
|
|
|
|
export class ModelLoader {
|
|
private registry: ModelRegistry;
|
|
private containerManager: ContainerManager;
|
|
private gpuDetector: GpuDetector;
|
|
private autoDeploy: boolean;
|
|
|
|
constructor(
|
|
registry: ModelRegistry,
|
|
containerManager: ContainerManager,
|
|
autoDeploy: boolean = true,
|
|
) {
|
|
this.registry = registry;
|
|
this.containerManager = containerManager;
|
|
this.gpuDetector = new GpuDetector();
|
|
this.autoDeploy = autoDeploy;
|
|
}
|
|
|
|
public async loadModel(
|
|
modelName: string,
|
|
options: IModelLoadOptions = {},
|
|
): Promise<IModelLoadResult> {
|
|
logger.info(`Loading model: ${modelName}`);
|
|
|
|
const modelInfo = await this.registry.getModel(modelName);
|
|
const resolvedModelName = modelInfo?.id || modelName;
|
|
|
|
const existing = await this.containerManager.findContainerForModel(resolvedModelName);
|
|
if (existing && !options.forceNewReplica) {
|
|
return {
|
|
success: true,
|
|
model: resolvedModelName,
|
|
container: existing.getConfig().id,
|
|
alreadyLoaded: true,
|
|
};
|
|
}
|
|
|
|
if (!modelInfo) {
|
|
return {
|
|
success: false,
|
|
model: resolvedModelName,
|
|
error: `Model "${modelName}" is not listed in the registry`,
|
|
};
|
|
}
|
|
|
|
const placement = this.planPlacement(modelInfo, await this.gpuDetector.detectGpus());
|
|
if (!placement) {
|
|
return {
|
|
success: false,
|
|
model: resolvedModelName,
|
|
error: 'Insufficient GPU capacity for deployment',
|
|
};
|
|
}
|
|
|
|
if (!this.autoDeploy) {
|
|
return {
|
|
success: false,
|
|
model: resolvedModelName,
|
|
error: 'Automatic deployments are disabled',
|
|
};
|
|
}
|
|
|
|
const deploymentId = this.createDeploymentId(
|
|
modelInfo.id,
|
|
options.replicaOrdinal ?? this.getExistingReplicaCount(modelInfo.id),
|
|
);
|
|
const deploymentName = this.createDeploymentName(
|
|
modelInfo.id,
|
|
options.replicaOrdinal ?? this.getExistingReplicaCount(modelInfo.id),
|
|
);
|
|
const config = VllmContainer.createConfig(
|
|
deploymentId,
|
|
deploymentName,
|
|
modelInfo.source.repo,
|
|
placement.gpuIds,
|
|
{
|
|
env: {
|
|
...(modelInfo.launchDefaults?.env || {}),
|
|
},
|
|
command: this.buildVllmCommand(modelInfo, placement.tensorParallelSize),
|
|
},
|
|
);
|
|
config.models = [modelInfo.id];
|
|
|
|
const container = this.containerManager.addContainer(config);
|
|
const started = await container.start();
|
|
if (!started) {
|
|
await this.containerManager.removeContainer(config.id);
|
|
return {
|
|
success: false,
|
|
model: resolvedModelName,
|
|
error: 'Failed to start vLLM deployment',
|
|
};
|
|
}
|
|
|
|
return {
|
|
success: true,
|
|
model: modelInfo.id,
|
|
container: config.id,
|
|
};
|
|
}
|
|
|
|
public async preloadModels(modelNames: string[]): Promise<Map<string, IModelLoadResult>> {
|
|
const results = new Map<string, IModelLoadResult>();
|
|
|
|
for (const modelName of modelNames) {
|
|
const result = await this.loadModel(modelName);
|
|
results.set(modelName, result);
|
|
|
|
if (!result.success) {
|
|
logger.warn(`Failed to preload model: ${modelName}`);
|
|
}
|
|
}
|
|
|
|
return results;
|
|
}
|
|
|
|
public async unloadModel(modelName: string): Promise<boolean> {
|
|
const modelInfo = await this.registry.getModel(modelName);
|
|
const canonicalModel = modelInfo?.id || modelName;
|
|
const containers = this.containerManager.getAllContainers().filter((container) =>
|
|
container.getConfig().models.includes(canonicalModel)
|
|
);
|
|
|
|
if (containers.length === 0) {
|
|
logger.warn(`Model ${modelName} not found in any container`);
|
|
return false;
|
|
}
|
|
|
|
let allRemoved = true;
|
|
for (const container of containers) {
|
|
const removed = await this.containerManager.removeContainer(container.getConfig().id);
|
|
allRemoved = allRemoved && removed;
|
|
}
|
|
|
|
return allRemoved;
|
|
}
|
|
|
|
public async deployReplica(
|
|
modelName: string,
|
|
replicaOrdinal?: number,
|
|
): Promise<IModelLoadResult> {
|
|
return this.loadModel(modelName, {
|
|
forceNewReplica: true,
|
|
replicaOrdinal,
|
|
});
|
|
}
|
|
|
|
public isAutoPullEnabled(): boolean {
|
|
return this.autoDeploy;
|
|
}
|
|
|
|
public setAutoPull(enabled: boolean): void {
|
|
this.autoDeploy = enabled;
|
|
}
|
|
|
|
public async getRecommendations(): Promise<{
|
|
canLoad: string[];
|
|
cannotLoad: string[];
|
|
loaded: string[];
|
|
}> {
|
|
const gpus = await this.gpuDetector.detectGpus();
|
|
const totalVramGb = Math.round(gpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024);
|
|
|
|
const allModels = await this.registry.getAllModels();
|
|
const availableModels = await this.containerManager.getAllAvailableModels();
|
|
const loadedNames = new Set(availableModels.keys());
|
|
|
|
const canLoad: string[] = [];
|
|
const cannotLoad: string[] = [];
|
|
const loaded: string[] = [];
|
|
|
|
for (const model of allModels) {
|
|
if (loadedNames.has(model.id)) {
|
|
loaded.push(model.id);
|
|
} else if (model.requirements.minVramGb <= totalVramGb) {
|
|
canLoad.push(model.id);
|
|
} else {
|
|
cannotLoad.push(model.id);
|
|
}
|
|
}
|
|
|
|
return { canLoad, cannotLoad, loaded };
|
|
}
|
|
|
|
public async printStatus(): Promise<void> {
|
|
const recommendations = await this.getRecommendations();
|
|
|
|
logger.logBoxTitle('Model Deployment Status', 70, 'info');
|
|
|
|
logger.logBoxLine(`Running Deployments (${recommendations.loaded.length}):`);
|
|
if (recommendations.loaded.length > 0) {
|
|
for (const model of recommendations.loaded) {
|
|
logger.logBoxLine(` - ${model}`);
|
|
}
|
|
} else {
|
|
logger.logBoxLine(' None');
|
|
}
|
|
|
|
logger.logBoxLine('');
|
|
logger.logBoxLine(`Ready To Deploy (${recommendations.canLoad.length}):`);
|
|
for (const model of recommendations.canLoad.slice(0, 5)) {
|
|
logger.logBoxLine(` - ${model}`);
|
|
}
|
|
if (recommendations.canLoad.length > 5) {
|
|
logger.logBoxLine(` ... and ${recommendations.canLoad.length - 5} more`);
|
|
}
|
|
|
|
logger.logBoxLine('');
|
|
logger.logBoxLine(`Needs Larger GPUs (${recommendations.cannotLoad.length}):`);
|
|
for (const model of recommendations.cannotLoad.slice(0, 3)) {
|
|
const info = await this.registry.getModel(model);
|
|
logger.logBoxLine(` - ${model} (needs ${info?.requirements.minVramGb || '?'}GB)`);
|
|
}
|
|
if (recommendations.cannotLoad.length > 3) {
|
|
logger.logBoxLine(` ... and ${recommendations.cannotLoad.length - 3} more`);
|
|
}
|
|
|
|
logger.logBoxEnd();
|
|
}
|
|
|
|
private planPlacement(
|
|
modelInfo: IModelCatalogEntry,
|
|
gpus: IGpuInfo[],
|
|
): { gpuIds: string[]; tensorParallelSize: number } | null {
|
|
const usedGpuIds = this.containerManager.getAllContainers().flatMap((container) =>
|
|
container.getConfig().gpuIds
|
|
);
|
|
const freeGpus = filterOutUsedGpus(gpus, usedGpuIds);
|
|
|
|
const preferredPlacement = selectPlacementForModel(modelInfo, freeGpus);
|
|
if (preferredPlacement) {
|
|
return {
|
|
gpuIds: preferredPlacement.gpuIds,
|
|
tensorParallelSize: preferredPlacement.tensorParallelSize,
|
|
};
|
|
}
|
|
|
|
const fallbackPlacement = selectPlacementForModel(modelInfo, gpus);
|
|
if (!fallbackPlacement) {
|
|
return null;
|
|
}
|
|
|
|
return {
|
|
gpuIds: fallbackPlacement.gpuIds,
|
|
tensorParallelSize: fallbackPlacement.tensorParallelSize,
|
|
};
|
|
}
|
|
|
|
private buildVllmCommand(
|
|
modelInfo: IModelCatalogEntry,
|
|
tensorParallelSize: number,
|
|
): string[] {
|
|
const command = ['--model', modelInfo.source.repo];
|
|
|
|
if (tensorParallelSize > 1) {
|
|
command.push('--tensor-parallel-size', String(tensorParallelSize));
|
|
}
|
|
|
|
if (modelInfo.launchDefaults?.maxModelLen) {
|
|
command.push('--max-model-len', String(modelInfo.launchDefaults.maxModelLen));
|
|
}
|
|
|
|
if (modelInfo.launchDefaults?.gpuMemoryUtilization) {
|
|
command.push(
|
|
'--gpu-memory-utilization',
|
|
String(modelInfo.launchDefaults.gpuMemoryUtilization),
|
|
);
|
|
}
|
|
|
|
if (modelInfo.launchDefaults?.quantization) {
|
|
command.push('--quantization', modelInfo.launchDefaults.quantization);
|
|
}
|
|
|
|
if (modelInfo.launchDefaults?.dtype) {
|
|
command.push('--dtype', modelInfo.launchDefaults.dtype);
|
|
}
|
|
|
|
if (modelInfo.launchDefaults?.generationConfig) {
|
|
command.push('--generation-config', modelInfo.launchDefaults.generationConfig);
|
|
}
|
|
|
|
if (modelInfo.launchDefaults?.extraArgs) {
|
|
command.push(...modelInfo.launchDefaults.extraArgs);
|
|
}
|
|
|
|
return command;
|
|
}
|
|
|
|
private getExistingReplicaCount(modelId: string): number {
|
|
return this.containerManager.getAllContainers().filter((container) =>
|
|
container.getConfig().models.includes(modelId)
|
|
).length;
|
|
}
|
|
|
|
private createDeploymentId(modelId: string, replicaOrdinal: number): string {
|
|
const baseId = modelId.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '').slice(
|
|
0,
|
|
32,
|
|
);
|
|
const suffix = replicaOrdinal > 0 ? `-r${replicaOrdinal + 1}` : '';
|
|
return `vllm-${baseId}${suffix}`;
|
|
}
|
|
|
|
private createDeploymentName(modelId: string, replicaOrdinal: number): string {
|
|
const baseName = modelId.split('/').pop() || modelId;
|
|
if (replicaOrdinal === 0) {
|
|
return baseName;
|
|
}
|
|
|
|
return `${baseName} replica ${replicaOrdinal + 1}`;
|
|
}
|
|
}
|