feat(cluster,api,models,cli): add cluster-aware model catalog deployments and request routing
This commit is contained in:
+30
-13
@@ -4,11 +4,7 @@
|
||||
* Manages vLLM containers for high-performance LLM inference.
|
||||
*/
|
||||
|
||||
import type {
|
||||
IContainerConfig,
|
||||
ILoadedModel,
|
||||
TContainerType,
|
||||
} from '../interfaces/container.ts';
|
||||
import type { IContainerConfig, ILoadedModel, TContainerType } from '../interfaces/container.ts';
|
||||
import type {
|
||||
IChatCompletionRequest,
|
||||
IChatCompletionResponse,
|
||||
@@ -72,20 +68,26 @@ export class VllmContainer extends BaseContainer {
|
||||
gpuIds: string[],
|
||||
options: Partial<IContainerConfig> = {},
|
||||
): IContainerConfig {
|
||||
// vLLM requires model to be specified at startup
|
||||
const command = [
|
||||
'--model', modelName,
|
||||
'--host', '0.0.0.0',
|
||||
'--port', String(options.port || CONTAINER_PORTS.VLLM),
|
||||
const command = options.command ? [...options.command] : [
|
||||
'--model',
|
||||
modelName,
|
||||
];
|
||||
|
||||
if (!command.includes('--host')) {
|
||||
command.push('--host', '0.0.0.0');
|
||||
}
|
||||
|
||||
if (!command.includes('--port')) {
|
||||
command.push('--port', String(options.port || CONTAINER_PORTS.VLLM));
|
||||
}
|
||||
|
||||
// Add tensor parallelism if multiple GPUs
|
||||
if (gpuIds.length > 1) {
|
||||
if (gpuIds.length > 1 && !command.includes('--tensor-parallel-size')) {
|
||||
command.push('--tensor-parallel-size', String(gpuIds.length));
|
||||
}
|
||||
|
||||
// Add additional options
|
||||
if (options.env?.VLLM_MAX_MODEL_LEN) {
|
||||
if (options.env?.VLLM_MAX_MODEL_LEN && !command.includes('--max-model-len')) {
|
||||
command.push('--max-model-len', options.env.VLLM_MAX_MODEL_LEN);
|
||||
}
|
||||
|
||||
@@ -128,11 +130,17 @@ export class VllmContainer extends BaseContainer {
|
||||
* vLLM serves a single model per instance
|
||||
*/
|
||||
public async listModels(): Promise<string[]> {
|
||||
if (this.config.models.length > 0) {
|
||||
return this.config.models;
|
||||
}
|
||||
|
||||
try {
|
||||
const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
|
||||
return (data.data || []).map((m) => m.id);
|
||||
} catch (error) {
|
||||
logger.warn(`Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`);
|
||||
logger.warn(
|
||||
`Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`,
|
||||
);
|
||||
return this.config.models || [];
|
||||
}
|
||||
}
|
||||
@@ -141,6 +149,15 @@ export class VllmContainer extends BaseContainer {
|
||||
* Get loaded models with details
|
||||
*/
|
||||
public async getLoadedModels(): Promise<ILoadedModel[]> {
|
||||
if (this.config.models.length > 0) {
|
||||
return this.config.models.map((name) => ({
|
||||
name,
|
||||
size: 0,
|
||||
loaded: true,
|
||||
requestCount: 0,
|
||||
}));
|
||||
}
|
||||
|
||||
try {
|
||||
const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
|
||||
return (data.data || []).map((m) => ({
|
||||
|
||||
Reference in New Issue
Block a user