/**
 * Model loader for vLLM deployments.
 */

import type { IModelCatalogEntry } from '../interfaces/catalog.ts';
import type { IGpuInfo } from '../interfaces/gpu.ts';
import { filterOutUsedGpus, selectPlacementForModel } from '../cluster/placement.ts';
import { VllmContainer } from '../containers/vllm.ts';
import { logger } from '../logger.ts';
import { ModelRegistry } from './registry.ts';
import { ContainerManager } from '../containers/container-manager.ts';
import { GpuDetector } from '../hardware/gpu-detector.ts';

/**
 * Outcome of a model load / deployment attempt.
 */
export interface IModelLoadResult {
  /** `true` when the model is served by a container after the call. */
  success: boolean;
  /** Canonical model id from the registry, or the requested name when the registry has no entry. */
  model: string;
  /** Id of the container serving the model (set on success). */
  container?: string;
  /** Human-readable failure reason (set on failure). */
  error?: string;
  /** `true` when an already-running container was reused instead of deploying a new one. */
  alreadyLoaded?: boolean;
}

/**
 * Options controlling how {@link ModelLoader.loadModel} deploys a model.
 */
export interface IModelLoadOptions {
  /** Deploy a new container even if one already serves the model. */
  forceNewReplica?: boolean;
  /**
   * Zero-based replica ordinal used to derive the deployment id and name.
   * Ordinal 0 gets no suffix; ordinal `n > 0` gets the suffix `-r{n + 1}`.
   * When omitted, an ordinal whose deployment id is not in use is chosen.
   */
  replicaOrdinal?: number;
}

/**
 * Deploys, replicates and removes vLLM containers for models listed in the registry.
 */
export class ModelLoader {
  private readonly registry: ModelRegistry;
  private readonly containerManager: ContainerManager;
  private readonly gpuDetector: GpuDetector;
  private autoDeploy: boolean;

  /**
   * @param registry Source of model catalog entries.
   * @param containerManager Manager that owns the deployment containers.
   * @param autoDeploy Whether `loadModel` may create new deployments.
   */
  constructor(
    registry: ModelRegistry,
    containerManager: ContainerManager,
    autoDeploy: boolean = true,
  ) {
    this.registry = registry;
    this.containerManager = containerManager;
    this.gpuDetector = new GpuDetector();
    this.autoDeploy = autoDeploy;
  }

  /**
   * Ensures a model is served by a vLLM container, deploying one if needed.
   *
   * Checks are performed in this order, and the first failing one determines
   * the returned error: existing container (reused unless `forceNewReplica`),
   * registry lookup, GPU placement, auto-deploy switch, container start.
   *
   * @param modelName Requested model name; resolved to the registry's canonical id when known.
   * @param options Replica options, see {@link IModelLoadOptions}.
   * @returns The load outcome; failures are reported via `success: false` and `error`.
   */
  public async loadModel(
    modelName: string,
    options: IModelLoadOptions = {},
  ): Promise<IModelLoadResult> {
    logger.info(`Loading model: ${modelName}`);

    const modelInfo = await this.registry.getModel(modelName);
    const resolvedModelName = modelInfo?.id || modelName;

    const existing = await this.containerManager.findContainerForModel(resolvedModelName);
    if (existing && !options.forceNewReplica) {
      return {
        success: true,
        model: resolvedModelName,
        container: existing.getConfig().id,
        alreadyLoaded: true,
      };
    }

    if (!modelInfo) {
      return {
        success: false,
        model: resolvedModelName,
        error: `Model "${modelName}" is not listed in the registry`,
      };
    }

    const placement = this.planPlacement(modelInfo, await this.gpuDetector.detectGpus());
    if (!placement) {
      return {
        success: false,
        model: resolvedModelName,
        error: 'Insufficient GPU capacity for deployment',
      };
    }

    if (!this.autoDeploy) {
      return {
        success: false,
        model: resolvedModelName,
        error: 'Automatic deployments are disabled',
      };
    }

    // Resolve the ordinal once so the deployment id and name always agree.
    const replicaOrdinal = options.replicaOrdinal ?? this.nextFreeReplicaOrdinal(modelInfo.id);
    const deploymentId = this.createDeploymentId(modelInfo.id, replicaOrdinal);
    const deploymentName = this.createDeploymentName(modelInfo.id, replicaOrdinal);

    const config = VllmContainer.createConfig(
      deploymentId,
      deploymentName,
      modelInfo.source.repo,
      placement.gpuIds,
      {
        env: {
          ...(modelInfo.launchDefaults?.env ?? {}),
        },
        command: this.buildVllmCommand(modelInfo, placement.tensorParallelSize),
      },
    );
    config.models = [modelInfo.id];

    const container = this.containerManager.addContainer(config);
    const started = await container.start();
    if (!started) {
      // Do not leave a registered-but-not-running container behind.
      await this.containerManager.removeContainer(config.id);
      return {
        success: false,
        model: resolvedModelName,
        error: 'Failed to start vLLM deployment',
      };
    }

    return {
      success: true,
      model: modelInfo.id,
      container: config.id,
    };
  }

  /**
   * Loads several models one after another.
   *
   * Loads are sequential because each placement decision reads the containers
   * registered by the previous loads.
   *
   * @param modelNames Names of the models to load.
   * @returns Map from each requested name to its load result.
   */
  public async preloadModels(modelNames: string[]): Promise<Map<string, IModelLoadResult>> {
    const results = new Map<string, IModelLoadResult>();

    for (const modelName of modelNames) {
      const result = await this.loadModel(modelName);
      results.set(modelName, result);

      if (!result.success) {
        logger.warn(`Failed to preload model: ${modelName}`);
      }
    }

    return results;
  }

  /**
   * Removes every container whose config lists the given model.
   *
   * @param modelName Requested model name; resolved to the registry's canonical id when known.
   * @returns `true` only if at least one container matched and all removals succeeded.
   */
  public async unloadModel(modelName: string): Promise<boolean> {
    const modelInfo = await this.registry.getModel(modelName);
    const canonicalModel = modelInfo?.id || modelName;

    const containers = this.containerManager.getAllContainers().filter((container) =>
      container.getConfig().models.includes(canonicalModel)
    );

    if (containers.length === 0) {
      logger.warn(`Model ${modelName} not found in any container`);
      return false;
    }

    let allRemoved = true;
    for (const container of containers) {
      const removed = await this.containerManager.removeContainer(container.getConfig().id);
      allRemoved = allRemoved && removed;
    }

    return allRemoved;
  }

  /**
   * Deploys an additional replica of a model even if one is already running.
   *
   * @param modelName Model to replicate.
   * @param replicaOrdinal Optional explicit zero-based replica ordinal.
   * @returns The load outcome of the new replica.
   */
  public async deployReplica(
    modelName: string,
    replicaOrdinal?: number,
  ): Promise<IModelLoadResult> {
    return this.loadModel(modelName, {
      forceNewReplica: true,
      replicaOrdinal,
    });
  }

  /** Returns whether `loadModel` is currently allowed to create new deployments. */
  public isAutoPullEnabled(): boolean {
    return this.autoDeploy;
  }

  /** Enables or disables creation of new deployments by `loadModel`. */
  public setAutoPull(enabled: boolean): void {
    this.autoDeploy = enabled;
  }

  /**
   * Classifies every registry model as loaded, deployable, or too large.
   *
   * A model counts as deployable when its `requirements.minVramGb` does not
   * exceed the summed VRAM of all detected GPUs; per-GPU limits and GPUs
   * already in use are not considered here.
   *
   * @returns Model ids grouped into `loaded`, `canLoad` and `cannotLoad`.
   */
  public async getRecommendations(): Promise<{
    canLoad: string[];
    cannotLoad: string[];
    loaded: string[];
  }> {
    const gpus = await this.gpuDetector.detectGpus();
    // NOTE(review): dividing by 1024 assumes `gpu.vram` is reported in MB — confirm.
    const totalVramGb = Math.round(gpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024);

    const allModels = await this.registry.getAllModels();
    const availableModels = await this.containerManager.getAllAvailableModels();
    const loadedNames = new Set(availableModels.keys());

    const canLoad: string[] = [];
    const cannotLoad: string[] = [];
    const loaded: string[] = [];

    for (const model of allModels) {
      if (loadedNames.has(model.id)) {
        loaded.push(model.id);
      } else if (model.requirements.minVramGb <= totalVramGb) {
        canLoad.push(model.id);
      } else {
        cannotLoad.push(model.id);
      }
    }

    return { canLoad, cannotLoad, loaded };
  }

  /**
   * Logs a boxed summary of running, deployable and oversized models.
   *
   * Lists all running deployments, at most five deployable models and at most
   * three oversized models, with a "... and N more" line for the remainder.
   */
  public async printStatus(): Promise<void> {
    const recommendations = await this.getRecommendations();

    logger.logBoxTitle('Model Deployment Status', 70, 'info');

    logger.logBoxLine(`Running Deployments (${recommendations.loaded.length}):`);
    if (recommendations.loaded.length > 0) {
      for (const model of recommendations.loaded) {
        logger.logBoxLine(` - ${model}`);
      }
    } else {
      logger.logBoxLine(' None');
    }

    logger.logBoxLine('');
    logger.logBoxLine(`Ready To Deploy (${recommendations.canLoad.length}):`);
    for (const model of recommendations.canLoad.slice(0, 5)) {
      logger.logBoxLine(` - ${model}`);
    }
    if (recommendations.canLoad.length > 5) {
      logger.logBoxLine(` ... and ${recommendations.canLoad.length - 5} more`);
    }

    logger.logBoxLine('');
    logger.logBoxLine(`Needs Larger GPUs (${recommendations.cannotLoad.length}):`);
    for (const model of recommendations.cannotLoad.slice(0, 3)) {
      const info = await this.registry.getModel(model);
      logger.logBoxLine(` - ${model} (needs ${info?.requirements.minVramGb ?? '?'}GB)`);
    }
    if (recommendations.cannotLoad.length > 3) {
      logger.logBoxLine(` ... and ${recommendations.cannotLoad.length - 3} more`);
    }

    logger.logBoxEnd();
  }

  /**
   * Chooses GPUs and a tensor-parallel size for a model.
   *
   * GPUs not referenced by any registered container are tried first. If no
   * placement fits on those, the selection is retried over all detected GPUs,
   * including ones already assigned to other containers.
   *
   * @param modelInfo Catalog entry of the model to place.
   * @param gpus All detected GPUs.
   * @returns The chosen placement, or `null` when neither attempt yields one.
   */
  private planPlacement(
    modelInfo: IModelCatalogEntry,
    gpus: IGpuInfo[],
  ): { gpuIds: string[]; tensorParallelSize: number } | null {
    const usedGpuIds = this.containerManager.getAllContainers().flatMap((container) =>
      container.getConfig().gpuIds
    );
    const freeGpus = filterOutUsedGpus(gpus, usedGpuIds);

    const placement = selectPlacementForModel(modelInfo, freeGpus) ??
      selectPlacementForModel(modelInfo, gpus);
    if (!placement) {
      return null;
    }

    return {
      gpuIds: placement.gpuIds,
      tensorParallelSize: placement.tensorParallelSize,
    };
  }

  /**
   * Builds the vLLM argument list from the model's launch defaults.
   *
   * Optional flags are emitted only when the corresponding default is set to a
   * truthy value; `extraArgs` are appended last, unchanged.
   *
   * @param modelInfo Catalog entry providing `source.repo` and `launchDefaults`.
   * @param tensorParallelSize Emitted as `--tensor-parallel-size` only when greater than 1.
   * @returns The argument list, starting with `--model <repo>`.
   */
  private buildVllmCommand(
    modelInfo: IModelCatalogEntry,
    tensorParallelSize: number,
  ): string[] {
    const defaults = modelInfo.launchDefaults;
    const command = ['--model', modelInfo.source.repo];

    if (tensorParallelSize > 1) {
      command.push('--tensor-parallel-size', String(tensorParallelSize));
    }
    if (defaults?.maxModelLen) {
      command.push('--max-model-len', String(defaults.maxModelLen));
    }
    if (defaults?.gpuMemoryUtilization) {
      command.push('--gpu-memory-utilization', String(defaults.gpuMemoryUtilization));
    }
    if (defaults?.quantization) {
      command.push('--quantization', defaults.quantization);
    }
    if (defaults?.dtype) {
      command.push('--dtype', defaults.dtype);
    }
    if (defaults?.generationConfig) {
      command.push('--generation-config', defaults.generationConfig);
    }
    if (defaults?.extraArgs) {
      command.push(...defaults.extraArgs);
    }

    return command;
  }

  /** Counts the registered containers whose config lists the given model id. */
  private getExistingReplicaCount(modelId: string): number {
    return this.containerManager.getAllContainers().filter((container) =>
      container.getConfig().models.includes(modelId)
    ).length;
  }

  /**
   * Picks a replica ordinal whose deployment id is not used by any registered container.
   *
   * Starts at the current replica count and moves upward. The count alone is
   * not safe: after a middle replica is removed (e.g. r1 and r3 remain), the
   * count maps to an id that is still taken.
   */
  private nextFreeReplicaOrdinal(modelId: string): number {
    const usedIds = new Set(
      this.containerManager.getAllContainers().map((container) => container.getConfig().id),
    );

    let ordinal = this.getExistingReplicaCount(modelId);
    while (usedIds.has(this.createDeploymentId(modelId, ordinal))) {
      ordinal++;
    }
    return ordinal;
  }

  /**
   * Derives a deployment id of the form `vllm-<slug>[-r<n>]`.
   *
   * The slug is the lower-cased model id with runs of non-alphanumerics
   * replaced by `-`, leading/trailing dashes trimmed, cut to 32 characters.
   * Ordinal 0 has no suffix; ordinal `n > 0` is suffixed with `-r{n + 1}`.
   */
  private createDeploymentId(modelId: string, replicaOrdinal: number): string {
    const baseId = modelId
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, '-')
      .replace(/^-+|-+$/g, '')
      .slice(0, 32);
    const suffix = replicaOrdinal > 0 ? `-r${replicaOrdinal + 1}` : '';

    return `vllm-${baseId}${suffix}`;
  }

  /**
   * Derives a display name from the last `/`-separated segment of the model id.
   *
   * Ordinal 0 yields the bare name; any other ordinal appends ` replica {n + 1}`.
   */
  private createDeploymentName(modelId: string, replicaOrdinal: number): string {
    const baseName = modelId.split('/').pop() || modelId;
    if (replicaOrdinal === 0) {
      return baseName;
    }

    return `${baseName} replica ${replicaOrdinal + 1}`;
  }
}