feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

This commit is contained in:
2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
+30 -13
View File
@@ -4,11 +4,7 @@
* Manages vLLM containers for high-performance LLM inference.
*/
import type {
IContainerConfig,
ILoadedModel,
TContainerType,
} from '../interfaces/container.ts';
import type { IContainerConfig, ILoadedModel, TContainerType } from '../interfaces/container.ts';
import type {
IChatCompletionRequest,
IChatCompletionResponse,
@@ -72,20 +68,26 @@ export class VllmContainer extends BaseContainer {
gpuIds: string[],
options: Partial<IContainerConfig> = {},
): IContainerConfig {
// vLLM requires model to be specified at startup
const command = [
'--model', modelName,
'--host', '0.0.0.0',
'--port', String(options.port || CONTAINER_PORTS.VLLM),
const command = options.command ? [...options.command] : [
'--model',
modelName,
];
if (!command.includes('--host')) {
command.push('--host', '0.0.0.0');
}
if (!command.includes('--port')) {
command.push('--port', String(options.port || CONTAINER_PORTS.VLLM));
}
// Add tensor parallelism if multiple GPUs
if (gpuIds.length > 1) {
if (gpuIds.length > 1 && !command.includes('--tensor-parallel-size')) {
command.push('--tensor-parallel-size', String(gpuIds.length));
}
// Add additional options
if (options.env?.VLLM_MAX_MODEL_LEN) {
if (options.env?.VLLM_MAX_MODEL_LEN && !command.includes('--max-model-len')) {
command.push('--max-model-len', options.env.VLLM_MAX_MODEL_LEN);
}
@@ -128,11 +130,17 @@ export class VllmContainer extends BaseContainer {
* vLLM serves a single model per instance
*/
public async listModels(): Promise<string[]> {
  // Fast path: when the container config already declares its models,
  // answer from config without a network round-trip (vLLM serves the
  // model it was started with, so config is authoritative).
  if (this.config.models.length > 0) {
    return this.config.models;
  }
  try {
    // Fall back to the OpenAI-compatible /v1/models endpoint.
    const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
    return (data.data || []).map((m) => m.id);
  } catch (error) {
    // Fix: the diff rendering duplicated this warning (old single-line
    // form plus its reformatted replacement); log it exactly once.
    logger.warn(
      `Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`,
    );
    // Best-effort fallback — never throw from a listing call.
    return this.config.models || [];
  }
}
@@ -141,6 +149,15 @@ export class VllmContainer extends BaseContainer {
* Get loaded models with details
*/
public async getLoadedModels(): Promise<ILoadedModel[]> {
if (this.config.models.length > 0) {
return this.config.models.map((name) => ({
name,
size: 0,
loaded: true,
requestCount: 0,
}));
}
try {
const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
return (data.data || []).map((m) => ({