feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
+149
-65
@@ -1,58 +1,89 @@
|
||||
/**
|
||||
* Chat Completions Handler
|
||||
*
|
||||
* Handles /v1/chat/completions and /v1/completions endpoints.
|
||||
* Chat completions handler.
|
||||
*/
|
||||
|
||||
import * as http from 'node:http';
|
||||
import type {
|
||||
IChatCompletionRequest,
|
||||
IChatCompletionResponse,
|
||||
IApiError,
|
||||
} from '../../interfaces/api.ts';
|
||||
import { logger } from '../../logger.ts';
|
||||
import type { IApiError, IChatCompletionRequest } from '../../interfaces/api.ts';
|
||||
import { ClusterCoordinator } from '../../cluster/coordinator.ts';
|
||||
import { ContainerManager } from '../../containers/container-manager.ts';
|
||||
import { logger } from '../../logger.ts';
|
||||
import { ModelRegistry } from '../../models/registry.ts';
|
||||
import { ModelLoader } from '../../models/loader.ts';
|
||||
|
||||
/**
 * Handler for chat completion requests.
 *
 * Collaborators are injected so the handler can serve a request from a
 * local container, deploy via the model loader, or defer to the cluster
 * coordinator for placement on another node.
 */
export class ChatHandler {
  private containerManager: ContainerManager;
  private modelRegistry: ModelRegistry;
  private modelLoader: ModelLoader;
  private clusterCoordinator: ClusterCoordinator;

  constructor(
    containerManager: ContainerManager,
    modelRegistry: ModelRegistry,
    modelLoader: ModelLoader,
    clusterCoordinator: ClusterCoordinator,
  ) {
    this.containerManager = containerManager;
    this.modelRegistry = modelRegistry;
    this.modelLoader = modelLoader;
    this.clusterCoordinator = clusterCoordinator;
  }
|
||||
|
||||
/**
|
||||
* Handle POST /v1/chat/completions
|
||||
*/
|
||||
public async handleChatCompletion(
|
||||
req: http.IncomingMessage,
|
||||
res: http.ServerResponse,
|
||||
body: IChatCompletionRequest,
|
||||
): Promise<void> {
|
||||
const modelName = body.model;
|
||||
const isStream = body.stream === true;
|
||||
const canonicalModel = await this.resolveCanonicalModel(body.model);
|
||||
const requestBody: IChatCompletionRequest = {
|
||||
...body,
|
||||
model: canonicalModel,
|
||||
};
|
||||
|
||||
logger.dim(`Chat completion request for model: ${modelName}`);
|
||||
logger.dim(`Chat completion request for model: ${canonicalModel}`);
|
||||
|
||||
try {
|
||||
// Find or load the model
|
||||
const container = await this.findOrLoadModel(modelName);
|
||||
if (!container) {
|
||||
this.sendError(res, 404, `Model "${modelName}" not found or could not be loaded`, 'model_not_found');
|
||||
const container = await this.findOrLoadLocalModel(canonicalModel);
|
||||
if (container) {
|
||||
if (requestBody.stream) {
|
||||
await this.handleStreamingCompletion(res, container, requestBody);
|
||||
} else {
|
||||
await this.handleNonStreamingCompletion(res, container, requestBody);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Route to streaming or non-streaming handler
|
||||
if (isStream) {
|
||||
await this.handleStreamingCompletion(res, container, body);
|
||||
} else {
|
||||
await this.handleNonStreamingCompletion(res, container, body);
|
||||
const ensured = await this.clusterCoordinator.ensureModelViaControlPlane(canonicalModel);
|
||||
if (!ensured) {
|
||||
this.sendError(
|
||||
res,
|
||||
404,
|
||||
`Model "${canonicalModel}" not found or could not be deployed`,
|
||||
'model_not_found',
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ensured.location.nodeName === this.clusterCoordinator.getLocalNodeName()) {
|
||||
const localContainer = await this.findLocalContainer(canonicalModel);
|
||||
if (!localContainer) {
|
||||
this.sendError(
|
||||
res,
|
||||
503,
|
||||
`Model "${canonicalModel}" was scheduled locally but is not ready`,
|
||||
'server_error',
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
if (requestBody.stream) {
|
||||
await this.handleStreamingCompletion(res, localContainer, requestBody);
|
||||
} else {
|
||||
await this.handleNonStreamingCompletion(res, localContainer, requestBody);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
await this.proxyChatRequest(req, res, ensured.location.endpoint, requestBody);
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
logger.error(`Chat completion error: ${message}`);
|
||||
@@ -60,34 +91,38 @@ export class ChatHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Find container with model or attempt to load it
|
||||
*/
|
||||
private async findOrLoadModel(
|
||||
private async resolveCanonicalModel(modelName: string): Promise<string> {
|
||||
const model = await this.modelRegistry.getModel(modelName);
|
||||
return model?.id || modelName;
|
||||
}
|
||||
|
||||
private async findLocalContainer(
|
||||
modelName: string,
|
||||
): Promise<import('../../containers/base-container.ts').BaseContainer | null> {
|
||||
// First, check if model is already loaded
|
||||
const container = await this.containerManager.findContainerForModel(modelName);
|
||||
if (container) {
|
||||
return container;
|
||||
}
|
||||
|
||||
// Try to load the model
|
||||
logger.info(`Model ${modelName} not loaded, attempting to load...`);
|
||||
const loadResult = await this.modelLoader.loadModel(modelName);
|
||||
|
||||
if (!loadResult.success) {
|
||||
logger.error(`Failed to load model: ${loadResult.error}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Find the container again after loading
|
||||
return this.containerManager.findContainerForModel(modelName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle non-streaming chat completion
|
||||
*/
|
||||
private async findOrLoadLocalModel(
|
||||
modelName: string,
|
||||
): Promise<import('../../containers/base-container.ts').BaseContainer | null> {
|
||||
const existing = await this.findLocalContainer(modelName);
|
||||
if (existing) {
|
||||
return existing;
|
||||
}
|
||||
|
||||
if (!this.clusterCoordinator.shouldDeployLocallyFirst()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
logger.info(`Model ${modelName} not loaded, attempting local deploy...`);
|
||||
const loadResult = await this.modelLoader.loadModel(modelName);
|
||||
if (!loadResult.success) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return this.findLocalContainer(loadResult.model);
|
||||
}
|
||||
|
||||
private async handleNonStreamingCompletion(
|
||||
res: http.ServerResponse,
|
||||
container: import('../../containers/base-container.ts').BaseContainer,
|
||||
@@ -99,35 +134,85 @@ export class ChatHandler {
|
||||
res.end(JSON.stringify(response));
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle streaming chat completion
|
||||
*/
|
||||
private async handleStreamingCompletion(
|
||||
res: http.ServerResponse,
|
||||
container: import('../../containers/base-container.ts').BaseContainer,
|
||||
body: IChatCompletionRequest,
|
||||
): Promise<void> {
|
||||
// Set SSE headers
|
||||
res.writeHead(200, {
|
||||
'Content-Type': 'text/event-stream',
|
||||
'Cache-Control': 'no-cache',
|
||||
'Connection': 'keep-alive',
|
||||
Connection: 'keep-alive',
|
||||
'X-Accel-Buffering': 'no',
|
||||
});
|
||||
|
||||
// Stream chunks to client
|
||||
await container.chatCompletionStream(body, (chunk) => {
|
||||
res.write(`data: ${chunk}\n\n`);
|
||||
res.write(chunk);
|
||||
});
|
||||
|
||||
// Send final done message
|
||||
res.write('data: [DONE]\n\n');
|
||||
res.end();
|
||||
}
|
||||
|
||||
/**
|
||||
* Send error response
|
||||
*/
|
||||
private async proxyChatRequest(
|
||||
req: http.IncomingMessage,
|
||||
res: http.ServerResponse,
|
||||
targetEndpoint: string,
|
||||
body: IChatCompletionRequest,
|
||||
): Promise<void> {
|
||||
const response = await fetch(`${targetEndpoint}/v1/chat/completions`, {
|
||||
method: 'POST',
|
||||
headers: this.buildForwardHeaders(req),
|
||||
body: JSON.stringify(body),
|
||||
});
|
||||
|
||||
if (body.stream) {
|
||||
res.writeHead(response.status, {
|
||||
'Content-Type': response.headers.get('content-type') || 'text/event-stream',
|
||||
'Cache-Control': 'no-cache',
|
||||
Connection: 'keep-alive',
|
||||
});
|
||||
|
||||
const reader = response.body?.getReader();
|
||||
if (!reader) {
|
||||
res.end();
|
||||
return;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
|
||||
res.write(value);
|
||||
}
|
||||
|
||||
res.end();
|
||||
return;
|
||||
}
|
||||
|
||||
const text = await response.text();
|
||||
res.writeHead(response.status, {
|
||||
'Content-Type': response.headers.get('content-type') || 'application/json',
|
||||
});
|
||||
res.end(text);
|
||||
}
|
||||
|
||||
private buildForwardHeaders(req: http.IncomingMessage): Record<string, string> {
|
||||
const headers: Record<string, string> = {
|
||||
'Content-Type': 'application/json',
|
||||
};
|
||||
|
||||
if (typeof req.headers.authorization === 'string') {
|
||||
headers.Authorization = req.headers.authorization;
|
||||
}
|
||||
|
||||
if (typeof req.headers['x-request-id'] === 'string') {
|
||||
headers['X-Request-Id'] = req.headers['x-request-id'];
|
||||
}
|
||||
|
||||
return headers;
|
||||
}
|
||||
|
||||
private sendError(
|
||||
res: http.ServerResponse,
|
||||
statusCode: number,
|
||||
@@ -140,7 +225,6 @@ export class ChatHandler {
|
||||
message,
|
||||
type,
|
||||
param,
|
||||
code: null,
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user