feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

This commit is contained in:
2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
+195 -148
View File
@@ -1,18 +1,16 @@
/**
* Model Loader
*
* Handles automatic model loading with greenlist validation.
* Model loader for vLLM deployments.
*/
import type { TContainerType } from '../interfaces/container.ts';
import type { IModelCatalogEntry } from '../interfaces/catalog.ts';
import type { IGpuInfo } from '../interfaces/gpu.ts';
import { filterOutUsedGpus, selectPlacementForModel } from '../cluster/placement.ts';
import { VllmContainer } from '../containers/vllm.ts';
import { logger } from '../logger.ts';
import { ModelRegistry } from './registry.ts';
import { ContainerManager } from '../containers/container-manager.ts';
import { GpuDetector } from '../hardware/gpu-detector.ts';
/**
* Model load result
*/
export interface IModelLoadResult {
success: boolean;
model: string;
@@ -21,161 +19,112 @@ export interface IModelLoadResult {
alreadyLoaded?: boolean;
}
/**
* Model loader with greenlist validation
*/
/**
 * Options controlling how loadModel() provisions a deployment.
 */
export interface IModelLoadOptions {
  // When true, always provision a new replica even if the model is
  // already served by an existing container (see the early-return check
  // in loadModel()).
  forceNewReplica?: boolean;
  // Explicit ordinal for the new replica; when omitted, loadModel()
  // falls back to the current count of containers serving this model.
  replicaOrdinal?: number;
}
export class ModelLoader {
private registry: ModelRegistry;
private containerManager: ContainerManager;
private gpuDetector: GpuDetector;
private autoPull: boolean;
private autoDeploy: boolean;
constructor(
registry: ModelRegistry,
containerManager: ContainerManager,
autoPull: boolean = true,
autoDeploy: boolean = true,
) {
this.registry = registry;
this.containerManager = containerManager;
this.gpuDetector = new GpuDetector();
this.autoPull = autoPull;
this.autoDeploy = autoDeploy;
}
/**
* Load a model with greenlist validation
*/
public async loadModel(modelName: string): Promise<IModelLoadResult> {
public async loadModel(
modelName: string,
options: IModelLoadOptions = {},
): Promise<IModelLoadResult> {
logger.info(`Loading model: ${modelName}`);
// Step 1: Check if model is already loaded in any container
const container = await this.containerManager.findContainerForModel(modelName);
if (container) {
logger.dim(`Model ${modelName} is already available in container ${container.getConfig().id}`);
const modelInfo = await this.registry.getModel(modelName);
const resolvedModelName = modelInfo?.id || modelName;
const existing = await this.containerManager.findContainerForModel(resolvedModelName);
if (existing && !options.forceNewReplica) {
return {
success: true,
model: modelName,
container: container.getConfig().id,
model: resolvedModelName,
container: existing.getConfig().id,
alreadyLoaded: true,
};
}
// Step 2: Check if model is greenlit
const isGreenlit = await this.registry.isModelGreenlit(modelName);
if (!isGreenlit) {
logger.error(`Model ${modelName} is not in the greenlit list`);
logger.info('Only greenlit models can be auto-pulled for security reasons.');
logger.info('Contact your administrator to add this model to the greenlist.');
return {
success: false,
model: modelName,
error: `Model "${modelName}" is not greenlit. Request via admin or add to greenlist.`,
};
}
// Step 3: Get model info from greenlist
const modelInfo = await this.registry.getGreenlitModel(modelName);
if (!modelInfo) {
return {
success: false,
model: modelName,
error: 'Failed to get model info from greenlist',
model: resolvedModelName,
error: `Model "${modelName}" is not listed in the registry`,
};
}
// Step 4: Check VRAM requirements
const gpus = await this.gpuDetector.detectGpus();
const totalVram = gpus.reduce((sum, gpu) => sum + gpu.vram, 0);
const totalVramGb = Math.round(totalVram / 1024);
if (modelInfo.minVram > totalVramGb) {
logger.error(`Insufficient VRAM for model ${modelName}`);
logger.info(`Required: ${modelInfo.minVram}GB, Available: ${totalVramGb}GB`);
const placement = this.planPlacement(modelInfo, await this.gpuDetector.detectGpus());
if (!placement) {
return {
success: false,
model: modelName,
error: `Insufficient VRAM. Required: ${modelInfo.minVram}GB, Available: ${totalVramGb}GB`,
model: resolvedModelName,
error: 'Insufficient GPU capacity for deployment',
};
}
// Step 5: Find or create appropriate container
const containerType = modelInfo.container;
let targetContainer = await this.findAvailableContainer(containerType);
if (!targetContainer) {
logger.warn(`No ${containerType} container available`);
// Could auto-create container here if desired
if (!this.autoDeploy) {
return {
success: false,
model: modelName,
error: `No ${containerType} container available to load model`,
model: resolvedModelName,
error: 'Automatic deployments are disabled',
};
}
// Step 6: Pull the model if auto-pull is enabled
if (this.autoPull) {
logger.info(`Pulling model ${modelName} to ${containerType} container...`);
const deploymentId = this.createDeploymentId(
modelInfo.id,
options.replicaOrdinal ?? this.getExistingReplicaCount(modelInfo.id),
);
const deploymentName = this.createDeploymentName(
modelInfo.id,
options.replicaOrdinal ?? this.getExistingReplicaCount(modelInfo.id),
);
const config = VllmContainer.createConfig(
deploymentId,
deploymentName,
modelInfo.source.repo,
placement.gpuIds,
{
env: {
...(modelInfo.launchDefaults?.env || {}),
},
command: this.buildVllmCommand(modelInfo, placement.tensorParallelSize),
},
);
config.models = [modelInfo.id];
const pullSuccess = await targetContainer.pullModel(modelName, (progress) => {
const percent = progress.percent !== undefined ? ` (${progress.percent}%)` : '';
logger.dim(` ${progress.status}${percent}`);
});
if (!pullSuccess) {
return {
success: false,
model: modelName,
error: 'Failed to pull model',
};
}
const container = this.containerManager.addContainer(config);
const started = await container.start();
if (!started) {
await this.containerManager.removeContainer(config.id);
return {
success: false,
model: resolvedModelName,
error: 'Failed to start vLLM deployment',
};
}
logger.success(`Model ${modelName} loaded successfully`);
return {
success: true,
model: modelName,
container: targetContainer.getConfig().id,
model: modelInfo.id,
container: config.id,
};
}
/**
* Find an available container of the specified type
*/
private async findAvailableContainer(
containerType: TContainerType,
): Promise<import('../containers/base-container.ts').BaseContainer | null> {
const containers = this.containerManager.getAllContainers();
for (const container of containers) {
if (container.type !== containerType) {
continue;
}
const status = await container.getStatus();
if (status.running) {
return container;
}
}
// No running container found, try to start one
for (const container of containers) {
if (container.type !== containerType) {
continue;
}
logger.info(`Starting ${containerType} container: ${container.getConfig().name}`);
const started = await container.start();
if (started) {
return container;
}
}
return null;
}
/**
* Preload a list of models
*/
public async preloadModels(modelNames: string[]): Promise<Map<string, IModelLoadResult>> {
const results = new Map<string, IModelLoadResult>();
@@ -191,36 +140,45 @@ export class ModelLoader {
return results;
}
/**
* Unload a model from a container
*/
public async unloadModel(modelName: string): Promise<boolean> {
const container = await this.containerManager.findContainerForModel(modelName);
if (!container) {
const modelInfo = await this.registry.getModel(modelName);
const canonicalModel = modelInfo?.id || modelName;
const containers = this.containerManager.getAllContainers().filter((container) =>
container.getConfig().models.includes(canonicalModel)
);
if (containers.length === 0) {
logger.warn(`Model ${modelName} not found in any container`);
return false;
}
return container.removeModel(modelName);
let allRemoved = true;
for (const container of containers) {
const removed = await this.containerManager.removeContainer(container.getConfig().id);
allRemoved = allRemoved && removed;
}
return allRemoved;
}
/**
 * Convenience wrapper around loadModel() that always provisions a new
 * replica instead of reusing an existing deployment.
 *
 * @param modelName Catalog model name (or alias) to deploy.
 * @param replicaOrdinal Optional explicit ordinal for the new replica;
 *   when omitted, loadModel() derives it from the existing replica count.
 * @returns The result produced by loadModel().
 */
public async deployReplica(
  modelName: string,
  replicaOrdinal?: number,
): Promise<IModelLoadResult> {
  const options: IModelLoadOptions = {
    forceNewReplica: true,
    replicaOrdinal,
  };
  return this.loadModel(modelName, options);
}
/**
* Check if auto-pull is enabled
*/
public isAutoPullEnabled(): boolean {
return this.autoPull;
return this.autoDeploy;
}
/**
* Enable or disable auto-pull
*/
public setAutoPull(enabled: boolean): void {
this.autoPull = enabled;
this.autoDeploy = enabled;
}
/**
* Get loading recommendations for available VRAM
*/
public async getRecommendations(): Promise<{
canLoad: string[];
cannotLoad: string[];
@@ -229,7 +187,7 @@ export class ModelLoader {
const gpus = await this.gpuDetector.detectGpus();
const totalVramGb = Math.round(gpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024);
const allModels = await this.registry.getAllGreenlitModels();
const allModels = await this.registry.getAllModels();
const availableModels = await this.containerManager.getAllAvailableModels();
const loadedNames = new Set(availableModels.keys());
@@ -238,27 +196,24 @@ export class ModelLoader {
const loaded: string[] = [];
for (const model of allModels) {
if (loadedNames.has(model.name)) {
loaded.push(model.name);
} else if (model.minVram <= totalVramGb) {
canLoad.push(model.name);
if (loadedNames.has(model.id)) {
loaded.push(model.id);
} else if (model.requirements.minVramGb <= totalVramGb) {
canLoad.push(model.id);
} else {
cannotLoad.push(model.name);
cannotLoad.push(model.id);
}
}
return { canLoad, cannotLoad, loaded };
}
/**
* Print loading status
*/
public async printStatus(): Promise<void> {
const recommendations = await this.getRecommendations();
logger.logBoxTitle('Model Loading Status', 60, 'info');
logger.logBoxTitle('Model Deployment Status', 70, 'info');
logger.logBoxLine(`Loaded Models (${recommendations.loaded.length}):`);
logger.logBoxLine(`Running Deployments (${recommendations.loaded.length}):`);
if (recommendations.loaded.length > 0) {
for (const model of recommendations.loaded) {
logger.logBoxLine(` - ${model}`);
@@ -268,7 +223,7 @@ export class ModelLoader {
}
logger.logBoxLine('');
logger.logBoxLine(`Available to Load (${recommendations.canLoad.length}):`);
logger.logBoxLine(`Ready To Deploy (${recommendations.canLoad.length}):`);
for (const model of recommendations.canLoad.slice(0, 5)) {
logger.logBoxLine(` - ${model}`);
}
@@ -277,10 +232,10 @@ export class ModelLoader {
}
logger.logBoxLine('');
logger.logBoxLine(`Insufficient VRAM (${recommendations.cannotLoad.length}):`);
logger.logBoxLine(`Needs Larger GPUs (${recommendations.cannotLoad.length}):`);
for (const model of recommendations.cannotLoad.slice(0, 3)) {
const info = await this.registry.getGreenlitModel(model);
logger.logBoxLine(` - ${model} (needs ${info?.minVram || '?'}GB)`);
const info = await this.registry.getModel(model);
logger.logBoxLine(` - ${model} (needs ${info?.requirements.minVramGb || '?'}GB)`);
}
if (recommendations.cannotLoad.length > 3) {
logger.logBoxLine(` ... and ${recommendations.cannotLoad.length - 3} more`);
@@ -288,4 +243,96 @@ export class ModelLoader {
logger.logBoxEnd();
}
/**
 * Pick the GPUs (and tensor-parallel degree) for a new deployment.
 *
 * Prefers GPUs not already claimed by a managed container; if no
 * placement fits on the free GPUs alone, retries against the full set.
 * NOTE(review): the fallback can select GPUs already claimed by other
 * containers — presumably deliberate best-effort behavior; confirm.
 *
 * @param modelInfo Catalog entry describing the model's requirements.
 * @param gpus All GPUs detected on this host.
 * @returns Chosen GPU ids plus tensor-parallel size, or null when no
 *   placement satisfies the model at all.
 */
private planPlacement(
  modelInfo: IModelCatalogEntry,
  gpus: IGpuInfo[],
): { gpuIds: string[]; tensorParallelSize: number } | null {
  // Collect every GPU id already claimed by a managed container.
  const claimedIds = this.containerManager
    .getAllContainers()
    .flatMap((container) => container.getConfig().gpuIds);
  const unclaimedGpus = filterOutUsedGpus(gpus, claimedIds);
  // Try free GPUs first; fall back to the complete set on failure.
  const placement = selectPlacementForModel(modelInfo, unclaimedGpus) ||
    selectPlacementForModel(modelInfo, gpus);
  if (!placement) {
    return null;
  }
  return {
    gpuIds: placement.gpuIds,
    tensorParallelSize: placement.tensorParallelSize,
  };
}
/**
 * Translate a catalog entry's launch defaults into vLLM CLI arguments.
 *
 * Only options present (and truthy) in launchDefaults are emitted;
 * extraArgs are appended verbatim at the end.
 *
 * @param modelInfo Catalog entry providing the source repo and defaults.
 * @param tensorParallelSize GPU shard count; the flag is only emitted
 *   when sharding is actually in effect (> 1).
 * @returns Argument vector for the vLLM container.
 */
private buildVllmCommand(
  modelInfo: IModelCatalogEntry,
  tensorParallelSize: number,
): string[] {
  const args: string[] = ['--model', modelInfo.source.repo];
  if (tensorParallelSize > 1) {
    args.push('--tensor-parallel-size', String(tensorParallelSize));
  }
  const defaults = modelInfo.launchDefaults;
  if (defaults?.maxModelLen) {
    args.push('--max-model-len', String(defaults.maxModelLen));
  }
  if (defaults?.gpuMemoryUtilization) {
    args.push('--gpu-memory-utilization', String(defaults.gpuMemoryUtilization));
  }
  if (defaults?.quantization) {
    args.push('--quantization', defaults.quantization);
  }
  if (defaults?.dtype) {
    args.push('--dtype', defaults.dtype);
  }
  if (defaults?.generationConfig) {
    args.push('--generation-config', defaults.generationConfig);
  }
  if (defaults?.extraArgs) {
    args.push(...defaults.extraArgs);
  }
  return args;
}
/**
 * Count how many managed containers currently serve the given model id,
 * i.e. the number of replicas already deployed.
 */
private getExistingReplicaCount(modelId: string): number {
  let replicas = 0;
  for (const container of this.containerManager.getAllContainers()) {
    if (container.getConfig().models.includes(modelId)) {
      replicas += 1;
    }
  }
  return replicas;
}
/**
 * Derive a container id for a deployment of the given model.
 *
 * The model id is slugified — lowercased, runs of non-alphanumerics
 * collapsed to '-', leading/trailing dashes trimmed, capped at 32
 * characters — and prefixed with 'vllm-'. Replicas after the first get
 * an '-rN' suffix (N = ordinal + 1) so ids do not collide.
 */
private createDeploymentId(modelId: string, replicaOrdinal: number): string {
  const slug = modelId
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 32);
  if (replicaOrdinal > 0) {
    return `vllm-${slug}-r${replicaOrdinal + 1}`;
  }
  return `vllm-${slug}`;
}
/**
 * Human-readable deployment name: the final '/'-separated segment of the
 * model id, with a " replica N" suffix for every replica after the first.
 */
private createDeploymentName(modelId: string, replicaOrdinal: number): string {
  const segments = modelId.split('/');
  const baseName = segments[segments.length - 1] || modelId;
  return replicaOrdinal === 0 ? baseName : `${baseName} replica ${replicaOrdinal + 1}`;
}
}