/**
 * Model loader for vLLM deployments.
 */
import type { IModelCatalogEntry } from '../interfaces/catalog.ts';
|
|
import type { IGpuInfo } from '../interfaces/gpu.ts';
|
|
import { filterOutUsedGpus, selectPlacementForModel } from '../cluster/placement.ts';
|
|
import { VllmContainer } from '../containers/vllm.ts';
|
|
import { logger } from '../logger.ts';
|
|
import { ModelRegistry } from './registry.ts';
|
|
import { ContainerManager } from '../containers/container-manager.ts';
|
|
import { GpuDetector } from '../hardware/gpu-detector.ts';
|
|
|
|
/**
 * Outcome of a single model load / deployment attempt.
 */
export interface IModelLoadResult {
  // True when the model is being served (whether newly started or pre-existing).
  success: boolean;
  // Canonical model id when the registry resolves it, otherwise the requested name.
  model: string;
  // Id of the container serving the model, when one exists.
  container?: string;
  // Human-readable failure reason; only set when success is false.
  error?: string;
  // True when an existing container already served the model and was reused.
  alreadyLoaded?: boolean;
}
|
|
|
|
/**
 * Options controlling how a model is loaded.
 */
export interface IModelLoadOptions {
  // When true, start a new replica even if a container already serves the model.
  forceNewReplica?: boolean;
  // Explicit replica index; defaults to the number of replicas already running.
  replicaOrdinal?: number;
}
|
|
|
|
export class ModelLoader {
|
|
private registry: ModelRegistry;
|
|
private containerManager: ContainerManager;
|
|
private gpuDetector: GpuDetector;
|
|
private autoDeploy: boolean;
|
|
|
|
constructor(
|
|
registry: ModelRegistry,
|
|
containerManager: ContainerManager,
|
|
autoDeploy: boolean = true,
|
|
) {
|
|
this.registry = registry;
|
|
this.containerManager = containerManager;
|
|
this.gpuDetector = new GpuDetector();
|
|
this.autoDeploy = autoDeploy;
|
|
}
|
|
|
|
public async loadModel(
|
|
modelName: string,
|
|
options: IModelLoadOptions = {},
|
|
): Promise<IModelLoadResult> {
|
|
logger.info(`Loading model: ${modelName}`);
|
|
|
|
const modelInfo = await this.registry.getModel(modelName);
|
|
const resolvedModelName = modelInfo?.id || modelName;
|
|
|
|
const existing = await this.containerManager.findContainerForModel(resolvedModelName);
|
|
if (existing && !options.forceNewReplica) {
|
|
return {
|
|
success: true,
|
|
model: resolvedModelName,
|
|
container: existing.getConfig().id,
|
|
alreadyLoaded: true,
|
|
};
|
|
}
|
|
|
|
if (!modelInfo) {
|
|
return {
|
|
success: false,
|
|
model: resolvedModelName,
|
|
error: `Model "${modelName}" is not listed in the registry`,
|
|
};
|
|
}
|
|
|
|
const placement = this.planPlacement(modelInfo, await this.gpuDetector.detectGpus());
|
|
if (!placement) {
|
|
return {
|
|
success: false,
|
|
model: resolvedModelName,
|
|
error: 'Insufficient GPU capacity for deployment',
|
|
};
|
|
}
|
|
|
|
if (!this.autoDeploy) {
|
|
return {
|
|
success: false,
|
|
model: resolvedModelName,
|
|
error: 'Automatic deployments are disabled',
|
|
};
|
|
}
|
|
|
|
const deploymentId = this.createDeploymentId(
|
|
modelInfo.id,
|
|
options.replicaOrdinal ?? this.getExistingReplicaCount(modelInfo.id),
|
|
);
|
|
const deploymentName = this.createDeploymentName(
|
|
modelInfo.id,
|
|
options.replicaOrdinal ?? this.getExistingReplicaCount(modelInfo.id),
|
|
);
|
|
const config = VllmContainer.createConfig(
|
|
deploymentId,
|
|
deploymentName,
|
|
modelInfo.source.repo,
|
|
placement.gpuIds,
|
|
{
|
|
env: {
|
|
...(modelInfo.launchDefaults?.env || {}),
|
|
},
|
|
command: this.buildVllmCommand(modelInfo, placement.tensorParallelSize),
|
|
},
|
|
);
|
|
config.models = [modelInfo.id];
|
|
|
|
const container = this.containerManager.addContainer(config);
|
|
const started = await container.start();
|
|
if (!started) {
|
|
await this.containerManager.removeContainer(config.id);
|
|
return {
|
|
success: false,
|
|
model: resolvedModelName,
|
|
error: 'Failed to start vLLM deployment',
|
|
};
|
|
}
|
|
|
|
return {
|
|
success: true,
|
|
model: modelInfo.id,
|
|
container: config.id,
|
|
};
|
|
}
|
|
|
|
public async preloadModels(modelNames: string[]): Promise<Map<string, IModelLoadResult>> {
|
|
const results = new Map<string, IModelLoadResult>();
|
|
|
|
for (const modelName of modelNames) {
|
|
const result = await this.loadModel(modelName);
|
|
results.set(modelName, result);
|
|
|
|
if (!result.success) {
|
|
logger.warn(`Failed to preload model: ${modelName}`);
|
|
}
|
|
}
|
|
|
|
return results;
|
|
}
|
|
|
|
public async unloadModel(modelName: string): Promise<boolean> {
|
|
const modelInfo = await this.registry.getModel(modelName);
|
|
const canonicalModel = modelInfo?.id || modelName;
|
|
const containers = this.containerManager.getAllContainers().filter((container) =>
|
|
container.getConfig().models.includes(canonicalModel)
|
|
);
|
|
|
|
if (containers.length === 0) {
|
|
logger.warn(`Model ${modelName} not found in any container`);
|
|
return false;
|
|
}
|
|
|
|
let allRemoved = true;
|
|
for (const container of containers) {
|
|
const removed = await this.containerManager.removeContainer(container.getConfig().id);
|
|
allRemoved = allRemoved && removed;
|
|
}
|
|
|
|
return allRemoved;
|
|
}
|
|
|
|
public async deployReplica(
|
|
modelName: string,
|
|
replicaOrdinal?: number,
|
|
): Promise<IModelLoadResult> {
|
|
return this.loadModel(modelName, {
|
|
forceNewReplica: true,
|
|
replicaOrdinal,
|
|
});
|
|
}
|
|
|
|
public isAutoPullEnabled(): boolean {
|
|
return this.autoDeploy;
|
|
}
|
|
|
|
public setAutoPull(enabled: boolean): void {
|
|
this.autoDeploy = enabled;
|
|
}
|
|
|
|
public async getRecommendations(): Promise<{
|
|
canLoad: string[];
|
|
cannotLoad: string[];
|
|
loaded: string[];
|
|
}> {
|
|
const gpus = await this.gpuDetector.detectGpus();
|
|
const totalVramGb = Math.round(gpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024);
|
|
|
|
const allModels = await this.registry.getAllModels();
|
|
const availableModels = await this.containerManager.getAllAvailableModels();
|
|
const loadedNames = new Set(availableModels.keys());
|
|
|
|
const canLoad: string[] = [];
|
|
const cannotLoad: string[] = [];
|
|
const loaded: string[] = [];
|
|
|
|
for (const model of allModels) {
|
|
if (loadedNames.has(model.id)) {
|
|
loaded.push(model.id);
|
|
} else if (model.requirements.minVramGb <= totalVramGb) {
|
|
canLoad.push(model.id);
|
|
} else {
|
|
cannotLoad.push(model.id);
|
|
}
|
|
}
|
|
|
|
return { canLoad, cannotLoad, loaded };
|
|
}
|
|
|
|
public async printStatus(): Promise<void> {
|
|
const recommendations = await this.getRecommendations();
|
|
|
|
logger.logBoxTitle('Model Deployment Status', 70, 'info');
|
|
|
|
logger.logBoxLine(`Running Deployments (${recommendations.loaded.length}):`);
|
|
if (recommendations.loaded.length > 0) {
|
|
for (const model of recommendations.loaded) {
|
|
logger.logBoxLine(` - ${model}`);
|
|
}
|
|
} else {
|
|
logger.logBoxLine(' None');
|
|
}
|
|
|
|
logger.logBoxLine('');
|
|
logger.logBoxLine(`Ready To Deploy (${recommendations.canLoad.length}):`);
|
|
for (const model of recommendations.canLoad.slice(0, 5)) {
|
|
logger.logBoxLine(` - ${model}`);
|
|
}
|
|
if (recommendations.canLoad.length > 5) {
|
|
logger.logBoxLine(` ... and ${recommendations.canLoad.length - 5} more`);
|
|
}
|
|
|
|
logger.logBoxLine('');
|
|
logger.logBoxLine(`Needs Larger GPUs (${recommendations.cannotLoad.length}):`);
|
|
for (const model of recommendations.cannotLoad.slice(0, 3)) {
|
|
const info = await this.registry.getModel(model);
|
|
logger.logBoxLine(` - ${model} (needs ${info?.requirements.minVramGb || '?'}GB)`);
|
|
}
|
|
if (recommendations.cannotLoad.length > 3) {
|
|
logger.logBoxLine(` ... and ${recommendations.cannotLoad.length - 3} more`);
|
|
}
|
|
|
|
logger.logBoxEnd();
|
|
}
|
|
|
|
private planPlacement(
|
|
modelInfo: IModelCatalogEntry,
|
|
gpus: IGpuInfo[],
|
|
): { gpuIds: string[]; tensorParallelSize: number } | null {
|
|
const usedGpuIds = this.containerManager.getAllContainers().flatMap((container) =>
|
|
container.getConfig().gpuIds
|
|
);
|
|
const freeGpus = filterOutUsedGpus(gpus, usedGpuIds);
|
|
|
|
const preferredPlacement = selectPlacementForModel(modelInfo, freeGpus);
|
|
if (preferredPlacement) {
|
|
return {
|
|
gpuIds: preferredPlacement.gpuIds,
|
|
tensorParallelSize: preferredPlacement.tensorParallelSize,
|
|
};
|
|
}
|
|
|
|
const fallbackPlacement = selectPlacementForModel(modelInfo, gpus);
|
|
if (!fallbackPlacement) {
|
|
return null;
|
|
}
|
|
|
|
return {
|
|
gpuIds: fallbackPlacement.gpuIds,
|
|
tensorParallelSize: fallbackPlacement.tensorParallelSize,
|
|
};
|
|
}
|
|
|
|
private buildVllmCommand(
|
|
modelInfo: IModelCatalogEntry,
|
|
tensorParallelSize: number,
|
|
): string[] {
|
|
const command = ['--model', modelInfo.source.repo];
|
|
|
|
if (tensorParallelSize > 1) {
|
|
command.push('--tensor-parallel-size', String(tensorParallelSize));
|
|
}
|
|
|
|
if (modelInfo.launchDefaults?.maxModelLen) {
|
|
command.push('--max-model-len', String(modelInfo.launchDefaults.maxModelLen));
|
|
}
|
|
|
|
if (modelInfo.launchDefaults?.gpuMemoryUtilization) {
|
|
command.push(
|
|
'--gpu-memory-utilization',
|
|
String(modelInfo.launchDefaults.gpuMemoryUtilization),
|
|
);
|
|
}
|
|
|
|
if (modelInfo.launchDefaults?.quantization) {
|
|
command.push('--quantization', modelInfo.launchDefaults.quantization);
|
|
}
|
|
|
|
if (modelInfo.launchDefaults?.dtype) {
|
|
command.push('--dtype', modelInfo.launchDefaults.dtype);
|
|
}
|
|
|
|
if (modelInfo.launchDefaults?.generationConfig) {
|
|
command.push('--generation-config', modelInfo.launchDefaults.generationConfig);
|
|
}
|
|
|
|
if (modelInfo.launchDefaults?.extraArgs) {
|
|
command.push(...modelInfo.launchDefaults.extraArgs);
|
|
}
|
|
|
|
return command;
|
|
}
|
|
|
|
private getExistingReplicaCount(modelId: string): number {
|
|
return this.containerManager.getAllContainers().filter((container) =>
|
|
container.getConfig().models.includes(modelId)
|
|
).length;
|
|
}
|
|
|
|
private createDeploymentId(modelId: string, replicaOrdinal: number): string {
|
|
const baseId = modelId.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '').slice(
|
|
0,
|
|
32,
|
|
);
|
|
const suffix = replicaOrdinal > 0 ? `-r${replicaOrdinal + 1}` : '';
|
|
return `vllm-${baseId}${suffix}`;
|
|
}
|
|
|
|
private createDeploymentName(modelId: string, replicaOrdinal: number): string {
|
|
const baseName = modelId.split('/').pop() || modelId;
|
|
if (replicaOrdinal === 0) {
|
|
return baseName;
|
|
}
|
|
|
|
return `${baseName} replica ${replicaOrdinal + 1}`;
|
|
}
|
|
}
|