feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
@@ -4,11 +4,7 @@
 * Manages vLLM containers for high-performance LLM inference.
 */

-import type {
-  IContainerConfig,
-  ILoadedModel,
-  TContainerType,
-} from '../interfaces/container.ts';
+import type { IContainerConfig, ILoadedModel, TContainerType } from '../interfaces/container.ts';
 import type {
  IChatCompletionRequest,
  IChatCompletionResponse,
@@ -72,20 +68,26 @@ export class VllmContainer extends BaseContainer {
    gpuIds: string[],
    options: Partial<IContainerConfig> = {},
  ): IContainerConfig {
-    // vLLM requires model to be specified at startup
-    const command = [
-      '--model', modelName,
-      '--host', '0.0.0.0',
-      '--port', String(options.port || CONTAINER_PORTS.VLLM),
+    const command = options.command ? [...options.command] : [
+      '--model',
+      modelName,
    ];

+    if (!command.includes('--host')) {
+      command.push('--host', '0.0.0.0');
+    }
+
+    if (!command.includes('--port')) {
+      command.push('--port', String(options.port || CONTAINER_PORTS.VLLM));
+    }
+
    // Add tensor parallelism if multiple GPUs
-    if (gpuIds.length > 1) {
+    if (gpuIds.length > 1 && !command.includes('--tensor-parallel-size')) {
      command.push('--tensor-parallel-size', String(gpuIds.length));
    }

    // Add additional options
-    if (options.env?.VLLM_MAX_MODEL_LEN) {
+    if (options.env?.VLLM_MAX_MODEL_LEN && !command.includes('--max-model-len')) {
      command.push('--max-model-len', options.env.VLLM_MAX_MODEL_LEN);
    }

@@ -128,11 +130,17 @@ export class VllmContainer extends BaseContainer {
   * vLLM serves a single model per instance
   */
  public async listModels(): Promise<string[]> {
+    if (this.config.models.length > 0) {
+      return this.config.models;
+    }
+
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => m.id);
    } catch (error) {
-      logger.warn(`Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`);
+      logger.warn(
+        `Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`,
+      );
      return this.config.models || [];
    }
  }
@@ -141,6 +149,15 @@ export class VllmContainer extends BaseContainer {
   * Get loaded models with details
   */
  public async getLoadedModels(): Promise<ILoadedModel[]> {
+    if (this.config.models.length > 0) {
+      return this.config.models.map((name) => ({
+        name,
+        size: 0,
+        loaded: true,
+        requestCount: 0,
+      }));
+    }
+
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => ({