feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
@@ -1,58 +1,89 @@
 /**
- * Chat Completions Handler
- *
- * Handles /v1/chat/completions and /v1/completions endpoints.
+ * Chat completions handler.
 */

 import * as http from 'node:http';
-import type {
-  IChatCompletionRequest,
-  IChatCompletionResponse,
-  IApiError,
-} from '../../interfaces/api.ts';
-import { logger } from '../../logger.ts';
+import type { IApiError, IChatCompletionRequest } from '../../interfaces/api.ts';
+import { ClusterCoordinator } from '../../cluster/coordinator.ts';
 import { ContainerManager } from '../../containers/container-manager.ts';
+import { logger } from '../../logger.ts';
+import { ModelRegistry } from '../../models/registry.ts';
 import { ModelLoader } from '../../models/loader.ts';

-/**
- * Handler for chat completion requests
- */
 export class ChatHandler {
  private containerManager: ContainerManager;
+  private modelRegistry: ModelRegistry;
  private modelLoader: ModelLoader;
+  private clusterCoordinator: ClusterCoordinator;

-  constructor(containerManager: ContainerManager, modelLoader: ModelLoader) {
+  constructor(
+    containerManager: ContainerManager,
+    modelRegistry: ModelRegistry,
+    modelLoader: ModelLoader,
+    clusterCoordinator: ClusterCoordinator,
+  ) {
    this.containerManager = containerManager;
+    this.modelRegistry = modelRegistry;
    this.modelLoader = modelLoader;
+    this.clusterCoordinator = clusterCoordinator;
  }

-  /**
-   * Handle POST /v1/chat/completions
-   */
  public async handleChatCompletion(
    req: http.IncomingMessage,
    res: http.ServerResponse,
    body: IChatCompletionRequest,
  ): Promise<void> {
-    const modelName = body.model;
-    const isStream = body.stream === true;
+    const canonicalModel = await this.resolveCanonicalModel(body.model);
+    const requestBody: IChatCompletionRequest = {
+      ...body,
+      model: canonicalModel,
+    };

-    logger.dim(`Chat completion request for model: ${modelName}`);
+    logger.dim(`Chat completion request for model: ${canonicalModel}`);

    try {
-      // Find or load the model
-      const container = await this.findOrLoadModel(modelName);
-      if (!container) {
-        this.sendError(res, 404, `Model "${modelName}" not found or could not be loaded`, 'model_not_found');
+      const container = await this.findOrLoadLocalModel(canonicalModel);
+      if (container) {
+        if (requestBody.stream) {
+          await this.handleStreamingCompletion(res, container, requestBody);
+        } else {
+          await this.handleNonStreamingCompletion(res, container, requestBody);
+        }
        return;
      }

-      // Route to streaming or non-streaming handler
-      if (isStream) {
-        await this.handleStreamingCompletion(res, container, body);
-      } else {
-        await this.handleNonStreamingCompletion(res, container, body);
+      const ensured = await this.clusterCoordinator.ensureModelViaControlPlane(canonicalModel);
+      if (!ensured) {
+        this.sendError(
+          res,
+          404,
+          `Model "${canonicalModel}" not found or could not be deployed`,
+          'model_not_found',
+        );
+        return;
      }
+
+      if (ensured.location.nodeName === this.clusterCoordinator.getLocalNodeName()) {
+        const localContainer = await this.findLocalContainer(canonicalModel);
+        if (!localContainer) {
+          this.sendError(
+            res,
+            503,
+            `Model "${canonicalModel}" was scheduled locally but is not ready`,
+            'server_error',
+          );
+          return;
+        }
+
+        if (requestBody.stream) {
+          await this.handleStreamingCompletion(res, localContainer, requestBody);
+        } else {
+          await this.handleNonStreamingCompletion(res, localContainer, requestBody);
+        }
+        return;
+      }
+
+      await this.proxyChatRequest(req, res, ensured.location.endpoint, requestBody);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger.error(`Chat completion error: ${message}`);
@@ -60,34 +91,38 @@ export class ChatHandler {
    }
  }

-  /**
-   * Find container with model or attempt to load it
-   */
-  private async findOrLoadModel(
+  private async resolveCanonicalModel(modelName: string): Promise<string> {
+    const model = await this.modelRegistry.getModel(modelName);
+    return model?.id || modelName;
+  }
+
+  private async findLocalContainer(
    modelName: string,
  ): Promise<import('../../containers/base-container.ts').BaseContainer | null> {
-    // First, check if model is already loaded
-    const container = await this.containerManager.findContainerForModel(modelName);
-    if (container) {
-      return container;
-    }
-
-    // Try to load the model
-    logger.info(`Model ${modelName} not loaded, attempting to load...`);
-    const loadResult = await this.modelLoader.loadModel(modelName);
-
-    if (!loadResult.success) {
-      logger.error(`Failed to load model: ${loadResult.error}`);
-      return null;
-    }
-
-    // Find the container again after loading
    return this.containerManager.findContainerForModel(modelName);
  }

-  /**
-   * Handle non-streaming chat completion
-   */
+  private async findOrLoadLocalModel(
+    modelName: string,
+  ): Promise<import('../../containers/base-container.ts').BaseContainer | null> {
+    const existing = await this.findLocalContainer(modelName);
+    if (existing) {
+      return existing;
+    }
+
+    if (!this.clusterCoordinator.shouldDeployLocallyFirst()) {
+      return null;
+    }
+
+    logger.info(`Model ${modelName} not loaded, attempting local deploy...`);
+    const loadResult = await this.modelLoader.loadModel(modelName);
+    if (!loadResult.success) {
+      return null;
+    }
+
+    return this.findLocalContainer(loadResult.model);
+  }
+
  private async handleNonStreamingCompletion(
    res: http.ServerResponse,
    container: import('../../containers/base-container.ts').BaseContainer,
@@ -99,35 +134,85 @@ export class ChatHandler {
    res.end(JSON.stringify(response));
  }

-  /**
-   * Handle streaming chat completion
-   */
  private async handleStreamingCompletion(
    res: http.ServerResponse,
    container: import('../../containers/base-container.ts').BaseContainer,
    body: IChatCompletionRequest,
  ): Promise<void> {
-    // Set SSE headers
    res.writeHead(200, {
      'Content-Type': 'text/event-stream',
      'Cache-Control': 'no-cache',
-      'Connection': 'keep-alive',
+      Connection: 'keep-alive',
      'X-Accel-Buffering': 'no',
    });

-    // Stream chunks to client
    await container.chatCompletionStream(body, (chunk) => {
-      res.write(`data: ${chunk}\n\n`);
+      res.write(chunk);
    });
-
-    // Send final done message
-    res.write('data: [DONE]\n\n');
    res.end();
  }

-  /**
-   * Send error response
-   */
+  private async proxyChatRequest(
+    req: http.IncomingMessage,
+    res: http.ServerResponse,
+    targetEndpoint: string,
+    body: IChatCompletionRequest,
+  ): Promise<void> {
+    const response = await fetch(`${targetEndpoint}/v1/chat/completions`, {
+      method: 'POST',
+      headers: this.buildForwardHeaders(req),
+      body: JSON.stringify(body),
+    });
+
+    if (body.stream) {
+      res.writeHead(response.status, {
+        'Content-Type': response.headers.get('content-type') || 'text/event-stream',
+        'Cache-Control': 'no-cache',
+        Connection: 'keep-alive',
+      });
+
+      const reader = response.body?.getReader();
+      if (!reader) {
+        res.end();
+        return;
+      }
+
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) {
+          break;
+        }
+
+        res.write(value);
+      }
+
+      res.end();
+      return;
+    }
+
+    const text = await response.text();
+    res.writeHead(response.status, {
+      'Content-Type': response.headers.get('content-type') || 'application/json',
+    });
+    res.end(text);
+  }
+
+  private buildForwardHeaders(req: http.IncomingMessage): Record<string, string> {
+    const headers: Record<string, string> = {
+      'Content-Type': 'application/json',
+    };
+
+    if (typeof req.headers.authorization === 'string') {
+      headers.Authorization = req.headers.authorization;
+    }
+
+    if (typeof req.headers['x-request-id'] === 'string') {
+      headers['X-Request-Id'] = req.headers['x-request-id'];
+    }
+
+    return headers;
+  }
+
  private sendError(
    res: http.ServerResponse,
    statusCode: number,
@@ -140,7 +225,6 @@ export class ChatHandler {
        message,
        type,
        param,
-        code: null,
      },
    };