feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
+195
-148
@@ -1,18 +1,16 @@
|
||||
/**
|
||||
* Model Loader
|
||||
*
|
||||
* Handles automatic model loading with greenlist validation.
|
||||
* Model loader for vLLM deployments.
|
||||
*/
|
||||
|
||||
import type { TContainerType } from '../interfaces/container.ts';
|
||||
import type { IModelCatalogEntry } from '../interfaces/catalog.ts';
|
||||
import type { IGpuInfo } from '../interfaces/gpu.ts';
|
||||
import { filterOutUsedGpus, selectPlacementForModel } from '../cluster/placement.ts';
|
||||
import { VllmContainer } from '../containers/vllm.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
import { ModelRegistry } from './registry.ts';
|
||||
import { ContainerManager } from '../containers/container-manager.ts';
|
||||
import { GpuDetector } from '../hardware/gpu-detector.ts';
|
||||
|
||||
/**
|
||||
* Model load result
|
||||
*/
|
||||
export interface IModelLoadResult {
|
||||
success: boolean;
|
||||
model: string;
|
||||
@@ -21,161 +19,112 @@ export interface IModelLoadResult {
|
||||
alreadyLoaded?: boolean;
|
||||
}
|
||||
|
||||
/**
 * Options accepted by `ModelLoader.loadModel`.
 */
export interface IModelLoadOptions {
  /**
   * When true, `loadModel` creates a new deployment even if a container
   * already serves the requested model.
   */
  forceNewReplica?: boolean;
  /**
   * Zero-based replica ordinal used to derive the deployment id and name.
   * When omitted, `loadModel` uses the number of containers that already
   * list the model.
   */
  replicaOrdinal?: number;
}
|
||||
|
||||
export class ModelLoader {
|
||||
private registry: ModelRegistry;
|
||||
private containerManager: ContainerManager;
|
||||
private gpuDetector: GpuDetector;
|
||||
private autoPull: boolean;
|
||||
private autoDeploy: boolean;
|
||||
|
||||
/**
 * Create a loader bound to a registry and a container manager.
 *
 * @param registry - Catalog used to resolve model names to catalog entries.
 * @param containerManager - Manager that tracks the deployed containers.
 * @param autoDeploy - When false, `loadModel` refuses to create new
 *   deployments. Defaults to true.
 */
constructor(
  registry: ModelRegistry,
  containerManager: ContainerManager,
  autoDeploy: boolean = true,
) {
  this.registry = registry;
  this.containerManager = containerManager;
  // The GPU detector is created here rather than injected by the caller.
  this.gpuDetector = new GpuDetector();
  this.autoDeploy = autoDeploy;
}
|
||||
|
||||
/**
 * Ensure a model is served by a vLLM deployment.
 *
 * The requested name is resolved through the registry. A container that
 * already lists the resolved model is reused unless
 * `options.forceNewReplica` is set. Otherwise a GPU placement is planned, a
 * new vLLM container config is registered with the container manager, and
 * the container is started.
 *
 * @param modelName - Model id or alias as requested by the caller.
 * @param options - Replica controls (force a new replica, explicit ordinal).
 * @returns A result carrying the resolved model id and container id on
 *   success, or an `error` message for the failure cases handled here
 *   (unknown model, no GPU placement, auto-deploy disabled, start failure).
 */
public async loadModel(
  modelName: string,
  options: IModelLoadOptions = {},
): Promise<IModelLoadResult> {
  logger.info(`Loading model: ${modelName}`);

  const modelInfo = await this.registry.getModel(modelName);
  // Fall back to the raw name so an already-running container for a model
  // that is missing from the catalog can still be found.
  const resolvedModelName = modelInfo?.id || modelName;

  const existing = await this.containerManager.findContainerForModel(resolvedModelName);
  if (existing && !options.forceNewReplica) {
    return {
      success: true,
      model: resolvedModelName,
      container: existing.getConfig().id,
      alreadyLoaded: true,
    };
  }

  if (!modelInfo) {
    return {
      success: false,
      model: resolvedModelName,
      error: `Model "${modelName}" is not listed in the registry`,
    };
  }

  const placement = this.planPlacement(modelInfo, await this.gpuDetector.detectGpus());
  if (!placement) {
    return {
      success: false,
      model: resolvedModelName,
      error: 'Insufficient GPU capacity for deployment',
    };
  }

  if (!this.autoDeploy) {
    return {
      success: false,
      model: resolvedModelName,
      error: 'Automatic deployments are disabled',
    };
  }

  // Resolve the ordinal once so the deployment id and name always agree.
  const replicaOrdinal = options.replicaOrdinal ?? this.getExistingReplicaCount(modelInfo.id);
  const deploymentId = this.createDeploymentId(modelInfo.id, replicaOrdinal);
  const deploymentName = this.createDeploymentName(modelInfo.id, replicaOrdinal);

  const config = VllmContainer.createConfig(
    deploymentId,
    deploymentName,
    modelInfo.source.repo,
    placement.gpuIds,
    {
      env: {
        ...(modelInfo.launchDefaults?.env || {}),
      },
      command: this.buildVllmCommand(modelInfo, placement.tensorParallelSize),
    },
  );
  // Record the canonical id so later lookups by model find this container.
  config.models = [modelInfo.id];

  const container = this.containerManager.addContainer(config);
  const started = await container.start();
  if (!started) {
    // Do not leave a registered container that never came up.
    await this.containerManager.removeContainer(config.id);
    return {
      success: false,
      model: resolvedModelName,
      error: 'Failed to start vLLM deployment',
    };
  }

  logger.success(`Model ${modelName} loaded successfully`);
  return {
    success: true,
    model: modelInfo.id,
    container: config.id,
  };
}
|
||||
|
||||
/**
|
||||
* Find an available container of the specified type
|
||||
*/
|
||||
private async findAvailableContainer(
|
||||
containerType: TContainerType,
|
||||
): Promise<import('../containers/base-container.ts').BaseContainer | null> {
|
||||
const containers = this.containerManager.getAllContainers();
|
||||
|
||||
for (const container of containers) {
|
||||
if (container.type !== containerType) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const status = await container.getStatus();
|
||||
if (status.running) {
|
||||
return container;
|
||||
}
|
||||
}
|
||||
|
||||
// No running container found, try to start one
|
||||
for (const container of containers) {
|
||||
if (container.type !== containerType) {
|
||||
continue;
|
||||
}
|
||||
|
||||
logger.info(`Starting ${containerType} container: ${container.getConfig().name}`);
|
||||
const started = await container.start();
|
||||
if (started) {
|
||||
return container;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Preload a list of models
|
||||
*/
|
||||
public async preloadModels(modelNames: string[]): Promise<Map<string, IModelLoadResult>> {
|
||||
const results = new Map<string, IModelLoadResult>();
|
||||
|
||||
@@ -191,36 +140,45 @@ export class ModelLoader {
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
 * Remove every container that lists the given model.
 *
 * The name is resolved to its catalog id when the registry knows it;
 * otherwise the name is used as given.
 *
 * @param modelName - Model id or alias to unload.
 * @returns False when no container lists the model or when any removal
 *   reports failure; true when every matching container was removed.
 */
public async unloadModel(modelName: string): Promise<boolean> {
  const modelInfo = await this.registry.getModel(modelName);
  const canonicalModel = modelInfo?.id || modelName;

  const containers = this.containerManager.getAllContainers().filter((container) =>
    container.getConfig().models.includes(canonicalModel)
  );

  if (containers.length === 0) {
    logger.warn(`Model ${modelName} not found in any container`);
    return false;
  }

  let allRemoved = true;
  for (const container of containers) {
    // Keep going after a failure so the remaining replicas are still removed.
    const removed = await this.containerManager.removeContainer(container.getConfig().id);
    allRemoved = allRemoved && removed;
  }

  return allRemoved;
}
|
||||
|
||||
/**
 * Deploy an additional replica of a model.
 *
 * Delegates to `loadModel` with `forceNewReplica` set, so an existing
 * container for the model does not short-circuit the deployment.
 *
 * @param modelName - Model id or alias to deploy.
 * @param replicaOrdinal - Optional explicit ordinal forwarded to `loadModel`.
 * @returns The result produced by `loadModel`.
 */
public async deployReplica(
  modelName: string,
  replicaOrdinal?: number,
): Promise<IModelLoadResult> {
  return this.loadModel(modelName, {
    forceNewReplica: true,
    replicaOrdinal,
  });
}
|
||||
|
||||
/**
 * Report whether automatic deployment is enabled.
 *
 * Despite the method name, the value returned is the `autoDeploy` flag.
 *
 * @returns The current `autoDeploy` setting.
 */
public isAutoPullEnabled(): boolean {
  return this.autoDeploy;
}
|
||||
|
||||
/**
 * Enable or disable automatic deployment.
 *
 * Despite the method name, the flag written is `autoDeploy`.
 *
 * @param enabled - New value for the `autoDeploy` flag.
 */
public setAutoPull(enabled: boolean): void {
  this.autoDeploy = enabled;
}
|
||||
|
||||
/**
|
||||
* Get loading recommendations for available VRAM
|
||||
*/
|
||||
public async getRecommendations(): Promise<{
|
||||
canLoad: string[];
|
||||
cannotLoad: string[];
|
||||
@@ -229,7 +187,7 @@ export class ModelLoader {
|
||||
const gpus = await this.gpuDetector.detectGpus();
|
||||
const totalVramGb = Math.round(gpus.reduce((sum, gpu) => sum + gpu.vram, 0) / 1024);
|
||||
|
||||
const allModels = await this.registry.getAllGreenlitModels();
|
||||
const allModels = await this.registry.getAllModels();
|
||||
const availableModels = await this.containerManager.getAllAvailableModels();
|
||||
const loadedNames = new Set(availableModels.keys());
|
||||
|
||||
@@ -238,27 +196,24 @@ export class ModelLoader {
|
||||
const loaded: string[] = [];
|
||||
|
||||
for (const model of allModels) {
|
||||
if (loadedNames.has(model.name)) {
|
||||
loaded.push(model.name);
|
||||
} else if (model.minVram <= totalVramGb) {
|
||||
canLoad.push(model.name);
|
||||
if (loadedNames.has(model.id)) {
|
||||
loaded.push(model.id);
|
||||
} else if (model.requirements.minVramGb <= totalVramGb) {
|
||||
canLoad.push(model.id);
|
||||
} else {
|
||||
cannotLoad.push(model.name);
|
||||
cannotLoad.push(model.id);
|
||||
}
|
||||
}
|
||||
|
||||
return { canLoad, cannotLoad, loaded };
|
||||
}
|
||||
|
||||
/**
|
||||
* Print loading status
|
||||
*/
|
||||
public async printStatus(): Promise<void> {
|
||||
const recommendations = await this.getRecommendations();
|
||||
|
||||
logger.logBoxTitle('Model Loading Status', 60, 'info');
|
||||
logger.logBoxTitle('Model Deployment Status', 70, 'info');
|
||||
|
||||
logger.logBoxLine(`Loaded Models (${recommendations.loaded.length}):`);
|
||||
logger.logBoxLine(`Running Deployments (${recommendations.loaded.length}):`);
|
||||
if (recommendations.loaded.length > 0) {
|
||||
for (const model of recommendations.loaded) {
|
||||
logger.logBoxLine(` - ${model}`);
|
||||
@@ -268,7 +223,7 @@ export class ModelLoader {
|
||||
}
|
||||
|
||||
logger.logBoxLine('');
|
||||
logger.logBoxLine(`Available to Load (${recommendations.canLoad.length}):`);
|
||||
logger.logBoxLine(`Ready To Deploy (${recommendations.canLoad.length}):`);
|
||||
for (const model of recommendations.canLoad.slice(0, 5)) {
|
||||
logger.logBoxLine(` - ${model}`);
|
||||
}
|
||||
@@ -277,10 +232,10 @@ export class ModelLoader {
|
||||
}
|
||||
|
||||
logger.logBoxLine('');
|
||||
logger.logBoxLine(`Insufficient VRAM (${recommendations.cannotLoad.length}):`);
|
||||
logger.logBoxLine(`Needs Larger GPUs (${recommendations.cannotLoad.length}):`);
|
||||
for (const model of recommendations.cannotLoad.slice(0, 3)) {
|
||||
const info = await this.registry.getGreenlitModel(model);
|
||||
logger.logBoxLine(` - ${model} (needs ${info?.minVram || '?'}GB)`);
|
||||
const info = await this.registry.getModel(model);
|
||||
logger.logBoxLine(` - ${model} (needs ${info?.requirements.minVramGb || '?'}GB)`);
|
||||
}
|
||||
if (recommendations.cannotLoad.length > 3) {
|
||||
logger.logBoxLine(` ... and ${recommendations.cannotLoad.length - 3} more`);
|
||||
@@ -288,4 +243,96 @@ export class ModelLoader {
|
||||
|
||||
logger.logBoxEnd();
|
||||
}
|
||||
|
||||
/**
 * Choose the GPUs for a new deployment of a model.
 *
 * GPUs not referenced by any registered container are tried first. If no
 * placement fits on those, selection is retried across all detected GPUs.
 *
 * @param modelInfo - Catalog entry of the model to place.
 * @param gpus - GPUs detected on this host.
 * @returns The chosen GPU ids and tensor-parallel size, or null when no
 *   placement is possible.
 */
private planPlacement(
  modelInfo: IModelCatalogEntry,
  gpus: IGpuInfo[],
): { gpuIds: string[]; tensorParallelSize: number } | null {
  const usedGpuIds = this.containerManager.getAllContainers().flatMap((container) =>
    container.getConfig().gpuIds
  );
  const freeGpus = filterOutUsedGpus(gpus, usedGpuIds);

  // The fallback over all GPUs is evaluated only when the free GPUs cannot host the model.
  const placement = selectPlacementForModel(modelInfo, freeGpus) ||
    selectPlacementForModel(modelInfo, gpus);
  if (!placement) {
    return null;
  }

  return {
    gpuIds: placement.gpuIds,
    tensorParallelSize: placement.tensorParallelSize,
  };
}
|
||||
|
||||
/**
 * Build the argument list passed as the vLLM container command.
 *
 * Always emits `--model <repo>`. Each remaining flag is added only when the
 * corresponding value is present and truthy in the entry's `launchDefaults`,
 * and `--tensor-parallel-size` only when more than one GPU is used.
 *
 * @param modelInfo - Catalog entry providing the repo and launch defaults.
 * @param tensorParallelSize - Tensor-parallel size chosen for the placement.
 * @returns The command arguments in the order they are appended below.
 */
private buildVllmCommand(
  modelInfo: IModelCatalogEntry,
  tensorParallelSize: number,
): string[] {
  const defaults = modelInfo.launchDefaults;
  const command = ['--model', modelInfo.source.repo];

  if (tensorParallelSize > 1) {
    command.push('--tensor-parallel-size', String(tensorParallelSize));
  }

  if (defaults?.maxModelLen) {
    command.push('--max-model-len', String(defaults.maxModelLen));
  }

  if (defaults?.gpuMemoryUtilization) {
    command.push('--gpu-memory-utilization', String(defaults.gpuMemoryUtilization));
  }

  if (defaults?.quantization) {
    command.push('--quantization', defaults.quantization);
  }

  if (defaults?.dtype) {
    command.push('--dtype', defaults.dtype);
  }

  if (defaults?.generationConfig) {
    command.push('--generation-config', defaults.generationConfig);
  }

  // Extra arguments go last, after every flag generated above.
  if (defaults?.extraArgs) {
    command.push(...defaults.extraArgs);
  }

  return command;
}
|
||||
|
||||
/**
 * Count the registered containers whose config lists the given model id.
 *
 * @param modelId - Canonical model id to look for.
 * @returns Number of matching containers.
 */
private getExistingReplicaCount(modelId: string): number {
  const replicas = this.containerManager
    .getAllContainers()
    .filter((container) => container.getConfig().models.includes(modelId));
  return replicas.length;
}
|
||||
|
||||
/**
 * Derive a deployment id from a model id and replica ordinal.
 *
 * The model id is lowercased, every run of characters outside `a-z0-9`
 * becomes a single `-`, leading and trailing dashes are removed, and the
 * result is limited to 32 characters. Ordinal 0 yields `vllm-<base>`; a
 * higher ordinal yields `vllm-<base>-r<ordinal + 1>`.
 *
 * @param modelId - Canonical model id.
 * @param replicaOrdinal - Zero-based replica ordinal.
 * @returns The deployment id.
 */
private createDeploymentId(modelId: string, replicaOrdinal: number): string {
  const baseId = modelId
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 32)
    // Truncation can cut right after a separator; trim again so the id
    // never ends in "-" or produces "--r2" once the suffix is appended.
    .replace(/-+$/, '');
  const suffix = replicaOrdinal > 0 ? `-r${replicaOrdinal + 1}` : '';
  return `vllm-${baseId}${suffix}`;
}
|
||||
|
||||
/**
 * Derive a display name for a deployment.
 *
 * Uses the last `/`-separated segment of the model id, or the whole id when
 * that segment is empty. Replicas after the first get a
 * ` replica <ordinal + 1>` suffix.
 *
 * @param modelId - Canonical model id.
 * @param replicaOrdinal - Zero-based replica ordinal.
 * @returns The deployment name.
 */
private createDeploymentName(modelId: string, replicaOrdinal: number): string {
  const baseName = modelId.split('/').pop() || modelId;
  return replicaOrdinal === 0 ? baseName : `${baseName} replica ${replicaOrdinal + 1}`;
}
|
||||
}
|
||||
|
||||
+158
-205
@@ -1,252 +1,205 @@
|
||||
/**
|
||||
* Model Registry
|
||||
*
|
||||
* Manages the greenlit model list and model availability.
|
||||
* Model registry backed by list.modelgrid.com.
|
||||
*/
|
||||
|
||||
import type { IGreenlitModel, IGreenlitModelsList } from '../interfaces/config.ts';
|
||||
import type { TContainerType } from '../interfaces/container.ts';
|
||||
import * as fs from 'node:fs/promises';
|
||||
import type { IModelCatalog, IModelCatalogEntry } from '../interfaces/catalog.ts';
|
||||
import { MODEL_REGISTRY, TIMING } from '../constants.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
|
||||
/**
|
||||
* Model registry for managing greenlit models
|
||||
*/
|
||||
export class ModelRegistry {
|
||||
private greenlistUrl: string;
|
||||
private cachedGreenlist: IGreenlitModelsList | null = null;
|
||||
private catalogUrl: string;
|
||||
private cachedCatalog: IModelCatalog | null = null;
|
||||
private cacheTime: number = 0;
|
||||
|
||||
/**
 * Create a registry that reads its catalog from the given source.
 *
 * @param catalogUrl - Catalog location; defaults to
 *   `MODEL_REGISTRY.DEFAULT_CATALOG_URL`.
 */
constructor(catalogUrl: string = MODEL_REGISTRY.DEFAULT_CATALOG_URL) {
  this.catalogUrl = catalogUrl;
}
|
||||
|
||||
/**
 * Point the registry at a different catalog source.
 *
 * The cached catalog and its timestamp are reset so the next lookup reads
 * from the new source.
 *
 * @param url - New catalog location.
 */
public setCatalogUrl(url: string): void {
  this.catalogUrl = url;
  this.cachedCatalog = null;
  this.cacheTime = 0;
}
|
||||
|
||||
/**
 * Return the model catalog, reading it from the configured source when the
 * cache is missing, expired, or bypassed.
 *
 * On a read or validation failure a warning is logged and the previously
 * cached catalog is returned if there is one; otherwise the built-in
 * fallback catalog is returned. The fallback is not stored in the cache.
 *
 * @param forceRefresh - When true, skip the cache and read the source again.
 * @returns The catalog to use.
 */
public async fetchCatalog(forceRefresh: boolean = false): Promise<IModelCatalog> {
  const cacheIsFresh = Date.now() - this.cacheTime < TIMING.GREENLIST_CACHE_DURATION_MS;
  if (!forceRefresh && this.cachedCatalog && cacheIsFresh) {
    return this.cachedCatalog;
  }

  try {
    logger.dim(`Fetching model catalog from: ${this.catalogUrl}`);
    const catalog = await this.readCatalogSource(this.catalogUrl);

    // The payload is unvalidated JSON: it may be null or a non-object, so
    // check the value itself before reading `models`.
    if (!catalog || !Array.isArray(catalog.models)) {
      throw new Error('Invalid catalog format: missing models array');
    }

    this.cachedCatalog = catalog;
    this.cacheTime = Date.now();

    logger.dim(`Loaded ${catalog.models.length} catalog models`);
    return catalog;
  } catch (error) {
    logger.warn(
      `Failed to fetch model catalog: ${error instanceof Error ? error.message : String(error)}`,
    );

    if (!this.cachedCatalog) {
      logger.dim('Using fallback catalog');
      return this.getFallbackCatalog();
    }

    // A stale catalog is preferred over the fallback.
    return this.cachedCatalog;
  }
}
|
||||
|
||||
/**
 * Check whether the catalog contains a model with the given id or alias.
 *
 * @param modelName - Model id or alias.
 * @returns True when `getModel` finds an entry.
 */
public async isModelListed(modelName: string): Promise<boolean> {
  const entry = await this.getModel(modelName);
  return entry !== null;
}
|
||||
|
||||
/**
 * Look up a catalog entry by id or alias.
 *
 * Names are compared after `normalizeModelName`, so the match is
 * case-insensitive.
 *
 * @param modelName - Model id or alias.
 * @returns The first matching entry, or null when none matches.
 */
public async getModel(modelName: string): Promise<IModelCatalogEntry | null> {
  const catalog = await this.fetchCatalog();
  const normalized = this.normalizeModelName(modelName);

  const match = catalog.models.find((model) => {
    const candidates = [model.id, ...(model.aliases || [])];
    return candidates.some((candidate) => this.normalizeModelName(candidate) === normalized);
  });

  return match || null;
}
|
||||
|
||||
/**
 * Return every entry of the current catalog.
 *
 * @returns The catalog's `models` array (not a copy).
 */
public async getAllModels(): Promise<IModelCatalogEntry[]> {
  const { models } = await this.fetchCatalog();
  return models;
}
|
||||
|
||||
/**
 * Return the catalog entries that use the given engine.
 *
 * @param engine - Engine identifier; only `'vllm'` is accepted by the type.
 * @returns Entries whose `engine` equals the argument.
 */
public async getModelsByEngine(engine: 'vllm'): Promise<IModelCatalogEntry[]> {
  const { models } = await this.fetchCatalog();
  return models.filter((model) => model.engine === engine);
}
|
||||
|
||||
/**
 * Return the catalog entries whose minimum VRAM requirement does not exceed
 * the given limit.
 *
 * @param maxVramGb - VRAM limit, compared against `requirements.minVramGb`.
 * @returns Entries with `requirements.minVramGb <= maxVramGb`.
 */
public async getModelsWithinVram(maxVramGb: number): Promise<IModelCatalogEntry[]> {
  const { models } = await this.fetchCatalog();
  return models.filter((model) => model.requirements.minVramGb <= maxVramGb);
}
|
||||
|
||||
/**
 * Return the engine recorded in the catalog for a model.
 *
 * @param modelName - Model id or alias.
 * @returns The entry's `engine`, or null when the model is not in the catalog.
 */
public async getRecommendedEngine(modelName: string): Promise<'vllm' | null> {
  const model = await this.getModel(modelName);
  if (!model) {
    return null;
  }
  return model.engine;
}
|
||||
|
||||
/**
 * Return the minimum VRAM requirement recorded for a model.
 *
 * @param modelName - Model id or alias.
 * @returns The entry's `requirements.minVramGb`, or null when the model is
 *   not in the catalog.
 */
public async getMinVram(modelName: string): Promise<number | null> {
  const model = await this.getModel(modelName);
  if (!model) {
    return null;
  }
  return model.requirements.minVramGb;
}
|
||||
|
||||
/**
 * Check whether a model's minimum VRAM requirement is met.
 *
 * @param modelName - Model id or alias.
 * @param availableVramGb - VRAM available, compared against the value
 *   returned by `getMinVram`.
 * @returns False when the model is not in the catalog; otherwise whether
 *   `availableVramGb` is at least the model's minimum.
 */
public async modelFitsInVram(modelName: string, availableVramGb: number): Promise<boolean> {
  const minVram = await this.getMinVram(modelName);
  // An unknown model is treated as not fitting.
  return minVram !== null && availableVramGb >= minVram;
}
|
||||
|
||||
/**
 * Find catalog entries containing a substring, case-insensitively.
 *
 * The pattern is matched against each entry's id, aliases,
 * `metadata.summary` and `metadata.tags`.
 *
 * @param pattern - Substring to look for.
 * @returns Entries where at least one of those fields contains the pattern.
 */
public async searchModels(pattern: string): Promise<IModelCatalogEntry[]> {
  const catalog = await this.fetchCatalog();
  const needle = pattern.toLowerCase();
  const contains = (text: string): boolean => text.toLowerCase().includes(needle);

  return catalog.models.filter((model) =>
    contains(model.id) ||
    model.aliases?.some(contains) ||
    (model.metadata?.summary !== undefined && model.metadata?.summary !== null
      ? contains(model.metadata.summary)
      : undefined) ||
    model.metadata?.tags?.some(contains)
  );
}
|
||||
|
||||
/**
 * Return the catalog entries carrying at least one of the given tags.
 *
 * Tags are compared case-insensitively against `metadata.tags`.
 *
 * @param tags - Tags to look for.
 * @returns Entries that have any of the tags.
 */
public async getModelsByTags(tags: string[]): Promise<IModelCatalogEntry[]> {
  const catalog = await this.fetchCatalog();
  const wanted = tags.map((tag) => tag.toLowerCase());

  return catalog.models.filter((model) =>
    model.metadata?.tags?.some((tag) => wanted.includes(tag.toLowerCase()))
  );
}
|
||||
|
||||
/**
 * Drop the cached catalog and reset its timestamp, so the next lookup reads
 * the catalog source again.
 */
public clearCache(): void {
  this.cachedCatalog = null;
  this.cacheTime = 0;
}
|
||||
|
||||
/**
 * Log a boxed summary of the catalog: version, generation time, model count
 * and the first ten models with their VRAM requirement and engine.
 */
public async printSummary(): Promise<void> {
  const catalog = await this.fetchCatalog();
  const previewLimit = 10;

  logger.logBoxTitle('Model Catalog', 70, 'info');
  logger.logBoxLine(`Version: ${catalog.version}`);
  logger.logBoxLine(`Generated: ${catalog.generatedAt}`);
  logger.logBoxLine(`Total Models: ${catalog.models.length}`);
  logger.logBoxLine('');

  for (const model of catalog.models.slice(0, previewLimit)) {
    logger.logBoxLine(
      `- ${model.id} (${model.requirements.minVramGb}GB, ${model.engine})`,
    );
  }

  // Mention how many entries were left out of the preview.
  if (catalog.models.length > previewLimit) {
    logger.logBoxLine(`... and ${catalog.models.length - previewLimit} more`);
  }

  logger.logBoxEnd();
}
|
||||
|
||||
/**
 * Read and parse the catalog JSON from a source location.
 *
 * `file://` URLs and absolute paths starting with `/` are read from disk.
 * Anything else is fetched over the network, with the request aborted after
 * 30 seconds. The parsed JSON is returned without validation; callers must
 * check its shape.
 *
 * @param source - `file://` URL, absolute path, or remote URL.
 * @returns The parsed payload, typed as a catalog.
 * @throws When the file cannot be read, the JSON cannot be parsed, the
 *   request fails or is aborted, or the response status is not OK.
 */
private async readCatalogSource(source: string): Promise<IModelCatalog> {
  if (source.startsWith('file://')) {
    const filePath = new URL(source);
    const content = await fs.readFile(filePath, 'utf-8');
    return JSON.parse(content) as IModelCatalog;
  }

  if (source.startsWith('/')) {
    const content = await fs.readFile(source, 'utf-8');
    return JSON.parse(content) as IModelCatalog;
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 30000);

  try {
    const response = await fetch(source, {
      signal: controller.signal,
      headers: {
        Accept: 'application/json',
        'User-Agent': 'ModelGrid/1.0',
      },
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    return await response.json() as IModelCatalog;
  } finally {
    // Always release the timer, whether the request succeeded or threw.
    clearTimeout(timeout);
  }
}
|
||||
|
||||
/**
 * Build the catalog used when the configured source cannot be read and
 * nothing is cached.
 *
 * @returns A catalog with version `'1.0'`, the current time as
 *   `generatedAt`, and the entries of `MODEL_REGISTRY.FALLBACK_CATALOG`.
 */
private getFallbackCatalog(): IModelCatalog {
  return {
    version: '1.0',
    generatedAt: new Date().toISOString(),
    // NOTE(review): the double cast bypasses type checking of the constant;
    // confirm FALLBACK_CATALOG really matches IModelCatalogEntry.
    models: MODEL_REGISTRY.FALLBACK_CATALOG as unknown as IModelCatalogEntry[],
  };
}
|
||||
|
||||
/**
|
||||
* Check if a model is greenlit
|
||||
*/
|
||||
public async isModelGreenlit(modelName: string): Promise<boolean> {
|
||||
const greenlist = await this.fetchGreenlist();
|
||||
return greenlist.models.some((m) => this.normalizeModelName(m.name) === this.normalizeModelName(modelName));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get greenlit model info
|
||||
*/
|
||||
public async getGreenlitModel(modelName: string): Promise<IGreenlitModel | null> {
|
||||
const greenlist = await this.fetchGreenlist();
|
||||
const normalized = this.normalizeModelName(modelName);
|
||||
return greenlist.models.find((m) => this.normalizeModelName(m.name) === normalized) || null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all greenlit models
|
||||
*/
|
||||
public async getAllGreenlitModels(): Promise<IGreenlitModel[]> {
|
||||
const greenlist = await this.fetchGreenlist();
|
||||
return greenlist.models;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get greenlit models by container type
|
||||
*/
|
||||
public async getModelsByContainer(containerType: TContainerType): Promise<IGreenlitModel[]> {
|
||||
const greenlist = await this.fetchGreenlist();
|
||||
return greenlist.models.filter((m) => m.container === containerType);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get greenlit models that fit within VRAM limit
|
||||
*/
|
||||
public async getModelsWithinVram(maxVramGb: number): Promise<IGreenlitModel[]> {
|
||||
const greenlist = await this.fetchGreenlist();
|
||||
return greenlist.models.filter((m) => m.minVram <= maxVramGb);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get recommended container type for a model
|
||||
*/
|
||||
public async getRecommendedContainer(modelName: string): Promise<TContainerType | null> {
|
||||
const model = await this.getGreenlitModel(modelName);
|
||||
return model ? model.container : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get minimum VRAM required for a model
|
||||
*/
|
||||
public async getMinVram(modelName: string): Promise<number | null> {
|
||||
const model = await this.getGreenlitModel(modelName);
|
||||
return model ? model.minVram : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if model fits in available VRAM
|
||||
*/
|
||||
public async modelFitsInVram(modelName: string, availableVramGb: number): Promise<boolean> {
|
||||
const minVram = await this.getMinVram(modelName);
|
||||
if (minVram === null) {
|
||||
// Model not in greenlist, assume it might fit
|
||||
return true;
|
||||
}
|
||||
return availableVramGb >= minVram;
|
||||
}
|
||||
|
||||
/**
 * Normalize a model name for comparison.
 *
 * The name is lowercased and every character other than `a-z`, `0-9`, `:`,
 * `/`, `.`, `_` and `-` is removed. `/` and `_` are kept so that ids such as
 * `org/model_name` stay distinct from `orgmodelname`.
 *
 * @param name - Model id or alias.
 * @returns The normalized name.
 */
private normalizeModelName(name: string): string {
  // Whitespace is outside the allowed set, so no separate trim is needed.
  return name.toLowerCase().replace(/[^a-z0-9:/._-]/g, '');
}
|
||||
|
||||
/**
|
||||
* Search models by name pattern
|
||||
*/
|
||||
public async searchModels(pattern: string): Promise<IGreenlitModel[]> {
|
||||
const greenlist = await this.fetchGreenlist();
|
||||
const normalizedPattern = pattern.toLowerCase();
|
||||
|
||||
return greenlist.models.filter((m) =>
|
||||
m.name.toLowerCase().includes(normalizedPattern) ||
|
||||
m.description?.toLowerCase().includes(normalizedPattern) ||
|
||||
m.tags?.some((t) => t.toLowerCase().includes(normalizedPattern))
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get models by tags
|
||||
*/
|
||||
public async getModelsByTags(tags: string[]): Promise<IGreenlitModel[]> {
|
||||
const greenlist = await this.fetchGreenlist();
|
||||
const normalizedTags = tags.map((t) => t.toLowerCase());
|
||||
|
||||
return greenlist.models.filter((m) =>
|
||||
m.tags?.some((t) => normalizedTags.includes(t.toLowerCase()))
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear the cached greenlist
|
||||
*/
|
||||
public clearCache(): void {
|
||||
this.cachedGreenlist = null;
|
||||
this.cacheTime = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Print greenlist summary
|
||||
*/
|
||||
public async printSummary(): Promise<void> {
|
||||
const greenlist = await this.fetchGreenlist();
|
||||
|
||||
// Group by container type
|
||||
const byContainer = new Map<string, IGreenlitModel[]>();
|
||||
for (const model of greenlist.models) {
|
||||
if (!byContainer.has(model.container)) {
|
||||
byContainer.set(model.container, []);
|
||||
}
|
||||
byContainer.get(model.container)!.push(model);
|
||||
}
|
||||
|
||||
logger.logBoxTitle('Greenlit Models', 60, 'info');
|
||||
logger.logBoxLine(`Version: ${greenlist.version}`);
|
||||
logger.logBoxLine(`Last Updated: ${greenlist.lastUpdated}`);
|
||||
logger.logBoxLine(`Total Models: ${greenlist.models.length}`);
|
||||
logger.logBoxLine('');
|
||||
|
||||
for (const [container, models] of byContainer) {
|
||||
logger.logBoxLine(`${container.toUpperCase()} (${models.length}):`);
|
||||
for (const model of models.slice(0, 5)) {
|
||||
logger.logBoxLine(` - ${model.name} (${model.minVram}GB VRAM)`);
|
||||
}
|
||||
if (models.length > 5) {
|
||||
logger.logBoxLine(` ... and ${models.length - 5} more`);
|
||||
}
|
||||
logger.logBoxLine('');
|
||||
}
|
||||
|
||||
logger.logBoxEnd();
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user