feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing

2026-04-20 23:00:50 +00:00
parent 83cacd0cf1
commit 4f2266e1b7
55 changed files with 3970 additions and 1630 deletions
@@ -1,53 +1,96 @@
 /**
- * Embeddings Handler
- *
- * Handles /v1/embeddings endpoint.
+ * Embeddings handler.
 */

 import * as http from 'node:http';
 import type {
+  IApiError,
+  IEmbeddingData,
  IEmbeddingsRequest,
  IEmbeddingsResponse,
-  IEmbeddingData,
-  IApiError,
 } from '../../interfaces/api.ts';
-import { logger } from '../../logger.ts';
+import { ClusterCoordinator } from '../../cluster/coordinator.ts';
 import { ContainerManager } from '../../containers/container-manager.ts';
+import { logger } from '../../logger.ts';
+import { ModelRegistry } from '../../models/registry.ts';

-/**
- * Handler for embeddings requests
- */
 export class EmbeddingsHandler {
  private containerManager: ContainerManager;
+  private modelRegistry: ModelRegistry;
+  private clusterCoordinator: ClusterCoordinator;

-  constructor(containerManager: ContainerManager) {
+  constructor(
+    containerManager: ContainerManager,
+    modelRegistry: ModelRegistry,
+    clusterCoordinator: ClusterCoordinator,
+  ) {
    this.containerManager = containerManager;
+    this.modelRegistry = modelRegistry;
+    this.clusterCoordinator = clusterCoordinator;
  }

-  /**
-   * Handle POST /v1/embeddings
-   */
  public async handleEmbeddings(
+    req: http.IncomingMessage,
    res: http.ServerResponse,
    body: IEmbeddingsRequest,
  ): Promise<void> {
-    const modelName = body.model;
+    const canonicalModel = await this.resolveCanonicalModel(body.model);
+    const requestBody: IEmbeddingsRequest = {
+      ...body,
+      model: canonicalModel,
+    };

-    logger.dim(`Embeddings request for model: ${modelName}`);
+    logger.dim(`Embeddings request for model: ${canonicalModel}`);

    try {
-      // Find container with the embedding model
-      const container = await this.containerManager.findContainerForModel(modelName);
-      if (!container) {
-        this.sendError(res, 404, `Embedding model "${modelName}" not found`, 'model_not_found');
+      const container = await this.containerManager.findContainerForModel(canonicalModel);
+      if (container) {
+        const response = await this.generateEmbeddings(container, requestBody);
+        res.writeHead(200, { 'Content-Type': 'application/json' });
+        res.end(JSON.stringify(response));
        return;
      }

-      // Generate embeddings
-      const response = await this.generateEmbeddings(container, body);
+      const ensured = await this.clusterCoordinator.ensureModelViaControlPlane(canonicalModel);
+      if (!ensured) {
+        this.sendError(
+          res,
+          404,
+          `Embedding model "${canonicalModel}" not found`,
+          'model_not_found',
+        );
+        return;
+      }

-      res.writeHead(200, { 'Content-Type': 'application/json' });
-      res.end(JSON.stringify(response));
+      if (ensured.location.nodeName === this.clusterCoordinator.getLocalNodeName()) {
+        const localContainer = await this.containerManager.findContainerForModel(canonicalModel);
+        if (!localContainer) {
+          this.sendError(
+            res,
+            503,
+            `Embedding model "${canonicalModel}" is not ready`,
+            'server_error',
+          );
+          return;
+        }
+
+        const response = await this.generateEmbeddings(localContainer, requestBody);
+        res.writeHead(200, { 'Content-Type': 'application/json' });
+        res.end(JSON.stringify(response));
+        return;
+      }
+
+      const response = await fetch(`${ensured.location.endpoint}/v1/embeddings`, {
+        method: 'POST',
+        headers: this.buildForwardHeaders(req),
+        body: JSON.stringify(requestBody),
+      });
+
+      const text = await response.text();
+      res.writeHead(response.status, {
+        'Content-Type': response.headers.get('content-type') || 'application/json',
+      });
+      res.end(text);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger.error(`Embeddings error: ${message}`);
@@ -55,9 +98,11 @@ export class EmbeddingsHandler {
    }
  }

-  /**
-   * Generate embeddings from container
-   */
+  private async resolveCanonicalModel(modelName: string): Promise<string> {
+    const model = await this.modelRegistry.getModel(modelName);
+    return model?.id || modelName;
+  }
+
  private async generateEmbeddings(
    container: import('../../containers/base-container.ts').BaseContainer,
    request: IEmbeddingsRequest,
@@ -66,7 +111,6 @@ export class EmbeddingsHandler {
    const embeddings: IEmbeddingData[] = [];
    let totalTokens = 0;

-    // Generate embeddings for each input
    for (let i = 0; i < inputs.length; i++) {
      const input = inputs[i];
      const embedding = await this.getEmbeddingFromContainer(container, request.model, input);
@@ -91,9 +135,6 @@ export class EmbeddingsHandler {
    };
  }

-  /**
-   * Get embedding from container (container-specific implementation)
-   */
  private async getEmbeddingFromContainer(
    container: import('../../containers/base-container.ts').BaseContainer,
    model: string,
@@ -102,54 +143,17 @@ export class EmbeddingsHandler {
    const endpoint = container.getEndpoint();
    const containerType = container.type;

-    // Route to container-specific embedding endpoint
-    if (containerType === 'ollama') {
-      return this.getOllamaEmbedding(endpoint, model, input);
-    } else if (containerType === 'vllm') {
+    if (containerType === 'vllm') {
      return this.getVllmEmbedding(endpoint, model, input);
-    } else if (containerType === 'tgi') {
+    }
+
+    if (containerType === 'tgi') {
      return this.getTgiEmbedding(endpoint, model, input);
    }

    throw new Error(`Container type ${containerType} does not support embeddings`);
  }

-  /**
-   * Get embedding from Ollama
-   */
-  private async getOllamaEmbedding(
-    endpoint: string,
-    model: string,
-    input: string,
-  ): Promise<{ vector: number[]; tokenCount: number }> {
-    const response = await fetch(`${endpoint}/api/embeddings`, {
-      method: 'POST',
-      headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({
-        model,
-        prompt: input,
-      }),
-    });
-
-    if (!response.ok) {
-      const errorText = await response.text();
-      throw new Error(`Ollama embedding error: ${errorText}`);
-    }
-
-    const result = await response.json() as { embedding: number[] };
-
-    // Estimate token count (rough approximation: ~4 chars per token)
-    const tokenCount = Math.ceil(input.length / 4);
-
-    return {
-      vector: result.embedding,
-      tokenCount,
-    };
-  }
-
-  /**
-   * Get embedding from vLLM (OpenAI-compatible)
-   */
  private async getVllmEmbedding(
    endpoint: string,
    model: string,
@@ -158,61 +162,58 @@ export class EmbeddingsHandler {
    const response = await fetch(`${endpoint}/v1/embeddings`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({
-        model,
-        input,
-      }),
+      body: JSON.stringify({ model, input }),
    });

    if (!response.ok) {
-      const errorText = await response.text();
-      throw new Error(`vLLM embedding error: ${errorText}`);
+      throw new Error(`vLLM embedding error: ${await response.text()}`);
    }

    const result = await response.json() as IEmbeddingsResponse;
-
    return {
      vector: result.data[0].embedding,
      tokenCount: result.usage.total_tokens,
    };
  }

-  /**
-   * Get embedding from TGI
-   */
  private async getTgiEmbedding(
    endpoint: string,
    _model: string,
    input: string,
  ): Promise<{ vector: number[]; tokenCount: number }> {
-    // TGI uses /embed endpoint
    const response = await fetch(`${endpoint}/embed`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({
-        inputs: input,
-      }),
+      body: JSON.stringify({ inputs: input }),
    });

    if (!response.ok) {
-      const errorText = await response.text();
-      throw new Error(`TGI embedding error: ${errorText}`);
+      throw new Error(`TGI embedding error: ${await response.text()}`);
    }

    const result = await response.json() as number[][];
-
-    // Estimate token count
-    const tokenCount = Math.ceil(input.length / 4);
-
    return {
      vector: result[0],
-      tokenCount,
+      tokenCount: Math.ceil(input.length / 4),
    };
  }

-  /**
-   * Send error response
-   */
+  private buildForwardHeaders(req: http.IncomingMessage): Record<string, string> {
+    const headers: Record<string, string> = {
+      'Content-Type': 'application/json',
+    };
+
+    if (typeof req.headers.authorization === 'string') {
+      headers.Authorization = req.headers.authorization;
+    }
+
+    if (typeof req.headers['x-request-id'] === 'string') {
+      headers['X-Request-Id'] = req.headers['x-request-id'];
+    }
+
+    return headers;
+  }
+
  private sendError(
    res: http.ServerResponse,
    statusCode: number,
@@ -225,7 +226,6 @@ export class EmbeddingsHandler {
        message,
        type,
        param,
-        code: null,
      },
    };