feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
@@ -1,53 +1,96 @@
|
||||
/**
|
||||
* Embeddings Handler
|
||||
*
|
||||
* Handles /v1/embeddings endpoint.
|
||||
* Embeddings handler.
|
||||
*/
|
||||
|
||||
import * as http from 'node:http';
|
||||
import type {
|
||||
IApiError,
|
||||
IEmbeddingData,
|
||||
IEmbeddingsRequest,
|
||||
IEmbeddingsResponse,
|
||||
IEmbeddingData,
|
||||
IApiError,
|
||||
} from '../../interfaces/api.ts';
|
||||
import { logger } from '../../logger.ts';
|
||||
import { ClusterCoordinator } from '../../cluster/coordinator.ts';
|
||||
import { ContainerManager } from '../../containers/container-manager.ts';
|
||||
import { logger } from '../../logger.ts';
|
||||
import { ModelRegistry } from '../../models/registry.ts';
|
||||
|
||||
/**
|
||||
* Handler for embeddings requests
|
||||
*/
|
||||
export class EmbeddingsHandler {
|
||||
private containerManager: ContainerManager;
|
||||
private modelRegistry: ModelRegistry;
|
||||
private clusterCoordinator: ClusterCoordinator;
|
||||
|
||||
/**
 * Create the embeddings handler.
 *
 * @param containerManager - Used to look up a local container that serves a model.
 * @param modelRegistry - Used to resolve a requested model name to its registry entry.
 * @param clusterCoordinator - Used to ensure a model through the control plane and to
 *   identify the local node.
 */
constructor(
  containerManager: ContainerManager,
  modelRegistry: ModelRegistry,
  clusterCoordinator: ClusterCoordinator,
) {
  this.containerManager = containerManager;
  this.modelRegistry = modelRegistry;
  this.clusterCoordinator = clusterCoordinator;
}
|
||||
|
||||
/**
|
||||
* Handle POST /v1/embeddings
|
||||
*/
|
||||
public async handleEmbeddings(
|
||||
req: http.IncomingMessage,
|
||||
res: http.ServerResponse,
|
||||
body: IEmbeddingsRequest,
|
||||
): Promise<void> {
|
||||
const modelName = body.model;
|
||||
const canonicalModel = await this.resolveCanonicalModel(body.model);
|
||||
const requestBody: IEmbeddingsRequest = {
|
||||
...body,
|
||||
model: canonicalModel,
|
||||
};
|
||||
|
||||
logger.dim(`Embeddings request for model: ${modelName}`);
|
||||
logger.dim(`Embeddings request for model: ${canonicalModel}`);
|
||||
|
||||
try {
|
||||
// Find container with the embedding model
|
||||
const container = await this.containerManager.findContainerForModel(modelName);
|
||||
if (!container) {
|
||||
this.sendError(res, 404, `Embedding model "${modelName}" not found`, 'model_not_found');
|
||||
const container = await this.containerManager.findContainerForModel(canonicalModel);
|
||||
if (container) {
|
||||
const response = await this.generateEmbeddings(container, requestBody);
|
||||
res.writeHead(200, { 'Content-Type': 'application/json' });
|
||||
res.end(JSON.stringify(response));
|
||||
return;
|
||||
}
|
||||
|
||||
// Generate embeddings
|
||||
const response = await this.generateEmbeddings(container, body);
|
||||
const ensured = await this.clusterCoordinator.ensureModelViaControlPlane(canonicalModel);
|
||||
if (!ensured) {
|
||||
this.sendError(
|
||||
res,
|
||||
404,
|
||||
`Embedding model "${canonicalModel}" not found`,
|
||||
'model_not_found',
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
res.writeHead(200, { 'Content-Type': 'application/json' });
|
||||
res.end(JSON.stringify(response));
|
||||
if (ensured.location.nodeName === this.clusterCoordinator.getLocalNodeName()) {
|
||||
const localContainer = await this.containerManager.findContainerForModel(canonicalModel);
|
||||
if (!localContainer) {
|
||||
this.sendError(
|
||||
res,
|
||||
503,
|
||||
`Embedding model "${canonicalModel}" is not ready`,
|
||||
'server_error',
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
const response = await this.generateEmbeddings(localContainer, requestBody);
|
||||
res.writeHead(200, { 'Content-Type': 'application/json' });
|
||||
res.end(JSON.stringify(response));
|
||||
return;
|
||||
}
|
||||
|
||||
const response = await fetch(`${ensured.location.endpoint}/v1/embeddings`, {
|
||||
method: 'POST',
|
||||
headers: this.buildForwardHeaders(req),
|
||||
body: JSON.stringify(requestBody),
|
||||
});
|
||||
|
||||
const text = await response.text();
|
||||
res.writeHead(response.status, {
|
||||
'Content-Type': response.headers.get('content-type') || 'application/json',
|
||||
});
|
||||
res.end(text);
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Embeddings error: ${message}`);
|
||||
@@ -55,9 +98,11 @@ export class EmbeddingsHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Resolve a requested model name to the id stored in the model registry.
 *
 * @param modelName - Model name as supplied in the request body.
 * @returns The registry entry's `id` when the lookup yields one that is truthy;
 *   otherwise `modelName` unchanged.
 */
private async resolveCanonicalModel(modelName: string): Promise<string> {
  const registryEntry = await this.modelRegistry.getModel(modelName);
  const canonicalId = registryEntry?.id;

  if (canonicalId) {
    return canonicalId;
  }

  // Unknown to the registry (or no usable id): keep the caller's name.
  return modelName;
}

/**
 * Generate embeddings from container
 */
private async generateEmbeddings(
|
||||
container: import('../../containers/base-container.ts').BaseContainer,
|
||||
request: IEmbeddingsRequest,
|
||||
@@ -66,7 +111,6 @@ export class EmbeddingsHandler {
|
||||
const embeddings: IEmbeddingData[] = [];
|
||||
let totalTokens = 0;
|
||||
|
||||
// Generate embeddings for each input
|
||||
for (let i = 0; i < inputs.length; i++) {
|
||||
const input = inputs[i];
|
||||
const embedding = await this.getEmbeddingFromContainer(container, request.model, input);
|
||||
@@ -91,9 +135,6 @@ export class EmbeddingsHandler {
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get embedding from container (container-specific implementation)
|
||||
*/
|
||||
private async getEmbeddingFromContainer(
|
||||
container: import('../../containers/base-container.ts').BaseContainer,
|
||||
model: string,
|
||||
@@ -102,54 +143,17 @@ export class EmbeddingsHandler {
|
||||
const endpoint = container.getEndpoint();
|
||||
const containerType = container.type;
|
||||
|
||||
// Route to container-specific embedding endpoint
|
||||
if (containerType === 'ollama') {
|
||||
return this.getOllamaEmbedding(endpoint, model, input);
|
||||
} else if (containerType === 'vllm') {
|
||||
if (containerType === 'vllm') {
|
||||
return this.getVllmEmbedding(endpoint, model, input);
|
||||
} else if (containerType === 'tgi') {
|
||||
}
|
||||
|
||||
if (containerType === 'tgi') {
|
||||
return this.getTgiEmbedding(endpoint, model, input);
|
||||
}
|
||||
|
||||
throw new Error(`Container type ${containerType} does not support embeddings`);
|
||||
}
|
||||
|
||||
/**
 * Get embedding from Ollama
 *
 * POSTs `{ model, prompt }` to `${endpoint}/api/embeddings` and returns the
 * `embedding` field of the JSON reply.
 *
 * @param endpoint - Base URL of the Ollama container.
 * @param model - Model name sent in the request body.
 * @param input - Text to embed; sent as `prompt`.
 * @returns The embedding vector and an estimated token count.
 * @throws Error with the response text when the HTTP status is not OK.
 */
private async getOllamaEmbedding(
  endpoint: string,
  model: string,
  input: string,
): Promise<{ vector: number[]; tokenCount: number }> {
  const payload = JSON.stringify({ model, prompt: input });

  const reply = await fetch(`${endpoint}/api/embeddings`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: payload,
  });

  if (!reply.ok) {
    throw new Error(`Ollama embedding error: ${await reply.text()}`);
  }

  const parsed = await reply.json() as { embedding: number[] };

  return {
    vector: parsed.embedding,
    // Token usage is not read from the reply; approximate at ~4 characters per token.
    tokenCount: Math.ceil(input.length / 4),
  };
}
|
||||
|
||||
/**
|
||||
* Get embedding from vLLM (OpenAI-compatible)
|
||||
*/
|
||||
private async getVllmEmbedding(
|
||||
endpoint: string,
|
||||
model: string,
|
||||
@@ -158,61 +162,58 @@ export class EmbeddingsHandler {
|
||||
const response = await fetch(`${endpoint}/v1/embeddings`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
model,
|
||||
input,
|
||||
}),
|
||||
body: JSON.stringify({ model, input }),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const errorText = await response.text();
|
||||
throw new Error(`vLLM embedding error: ${errorText}`);
|
||||
throw new Error(`vLLM embedding error: ${await response.text()}`);
|
||||
}
|
||||
|
||||
const result = await response.json() as IEmbeddingsResponse;
|
||||
|
||||
return {
|
||||
vector: result.data[0].embedding,
|
||||
tokenCount: result.usage.total_tokens,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Get embedding from TGI
 *
 * POSTs `{ inputs: input }` to `${endpoint}/embed` and returns the first vector
 * of the JSON reply. Token usage is not read from the reply, so the token count
 * is estimated at roughly four characters per token.
 *
 * @param endpoint - Base URL of the TGI container.
 * @param _model - Unused; the request body sent to `/embed` has no model field.
 * @param input - Text to embed.
 * @returns The embedding vector and an estimated token count.
 * @throws Error when the HTTP status is not OK, or when the reply does not
 *   contain an embedding vector.
 */
private async getTgiEmbedding(
  endpoint: string,
  _model: string,
  input: string,
): Promise<{ vector: number[]; tokenCount: number }> {
  const response = await fetch(`${endpoint}/embed`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ inputs: input }),
  });

  if (!response.ok) {
    throw new Error(`TGI embedding error: ${await response.text()}`);
  }

  const result = await response.json() as number[][];

  // Guard against an empty or malformed reply instead of returning `undefined`
  // under a `number[]` type.
  const vector = Array.isArray(result) ? result[0] : undefined;
  if (!Array.isArray(vector)) {
    throw new Error('TGI embedding error: response did not contain an embedding vector');
  }

  return {
    vector,
    tokenCount: Math.ceil(input.length / 4),
  };
}
|
||||
|
||||
/**
 * Build the headers for a request forwarded to another endpoint.
 *
 * Always sets `Content-Type: application/json`. Copies the incoming
 * `authorization` and `x-request-id` headers only when each is a single string.
 *
 * @param req - Incoming request whose headers are inspected.
 * @returns Header map for the outgoing request.
 */
private buildForwardHeaders(req: http.IncomingMessage): Record<string, string> {
  const { authorization } = req.headers;
  const requestId = req.headers['x-request-id'];

  return {
    'Content-Type': 'application/json',
    ...(typeof authorization === 'string' ? { Authorization: authorization } : {}),
    ...(typeof requestId === 'string' ? { 'X-Request-Id': requestId } : {}),
  };
}

/**
 * Send error response
 */
private sendError(
|
||||
res: http.ServerResponse,
|
||||
statusCode: number,
|
||||
@@ -225,7 +226,6 @@ export class EmbeddingsHandler {
|
||||
message,
|
||||
type,
|
||||
param,
|
||||
code: null,
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user