/**
 * vLLM Container
 *
 * Manages vLLM containers for high-performance LLM inference.
 */

import type {
  IContainerConfig,
  ILoadedModel,
  TContainerType,
} from '../interfaces/container.ts';
import type {
  IChatCompletionRequest,
  IChatCompletionResponse,
  IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';

/**
 * vLLM model info response
 */
interface IVllmModelsResponse {
  object: 'list';
  data: Array<{
    id: string;
    object: 'model';
    created: number;
    owned_by: string;
  }>;
}

/**
 * vLLM container implementation
 *
 * vLLM serves a single model per instance and is optimized for:
 * - High throughput with PagedAttention
 * - Continuous batching
 * - OpenAI-compatible API
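 *
 * @example Illustrative lifecycle
 * A minimal sketch only; `config` would come from {@link VllmContainer.createConfig}
 * and the surrounding orchestration (actually starting the Docker container) is omitted.
 * ```ts
 * const container = new VllmContainer(config);
 * if (await container.isHealthy()) {
 *   console.log('Serving:', await container.listModels());
 * }
 * ```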
 */
export class VllmContainer extends BaseContainer {
  public readonly type: TContainerType = 'vllm';
  public readonly displayName = 'vLLM';
  public readonly defaultImage = CONTAINER_IMAGES.VLLM;
  public readonly defaultPort = CONTAINER_PORTS.VLLM;

  constructor(config: IContainerConfig) {
    super(config);

    // Set defaults if not provided
    if (!config.image) {
      config.image = this.defaultImage;
    }
    if (!config.port) {
      config.port = this.defaultPort;
    }

    // Add default volume for model cache
    if (!config.volumes || config.volumes.length === 0) {
      config.volumes = [`modelgrid-vllm-${config.id}:/root/.cache/huggingface`];
    }
  }

  /**
   * Create vLLM container configuration
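   *
   * @example Illustrative configuration
   * A sketch only; the id, name, model, token, and GPU indices below are
   * placeholders. Two GPU ids enable tensor parallelism, and
   * `VLLM_MAX_MODEL_LEN` is only applied if the caller sets it in `env`.
   * ```ts
   * const config = VllmContainer.createConfig(
   *   'vllm-70b',
   *   'llama-70b-vllm',
   *   'meta-llama/Llama-3.1-70B-Instruct',
   *   ['0', '1'], // two GPUs -> --tensor-parallel-size 2
   *   { env: { HF_TOKEN: '<hf token>', VLLM_MAX_MODEL_LEN: '8192' } },
   * );
   * ```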
   */
  public static createConfig(
    id: string,
    name: string,
    modelName: string,
    gpuIds: string[],
    options: Partial<IContainerConfig> = {},
  ): IContainerConfig {
    // vLLM requires the model to be specified at startup
    const command = [
      '--model', modelName,
      '--host', '0.0.0.0',
      '--port', String(options.port || CONTAINER_PORTS.VLLM),
    ];

    // Add tensor parallelism if multiple GPUs
    if (gpuIds.length > 1) {
      command.push('--tensor-parallel-size', String(gpuIds.length));
    }

    // Add additional options
    if (options.env?.VLLM_MAX_MODEL_LEN) {
      command.push('--max-model-len', options.env.VLLM_MAX_MODEL_LEN);
    }

    return {
      id,
      name,
      type: 'vllm',
      image: options.image || CONTAINER_IMAGES.VLLM,
      gpuIds,
      port: options.port || CONTAINER_PORTS.VLLM,
      externalPort: options.externalPort,
      models: [modelName],
      env: {
        HF_TOKEN: options.env?.HF_TOKEN || '',
        ...options.env,
      },
      volumes: options.volumes || [`modelgrid-vllm-${id}:/root/.cache/huggingface`],
      autoStart: options.autoStart ?? true,
      restartPolicy: options.restartPolicy || 'unless-stopped',
      memoryLimit: options.memoryLimit,
      cpuLimit: options.cpuLimit,
      command,
    };
  }
  /**
   * Check if vLLM is healthy
   */
  public async isHealthy(): Promise<boolean> {
    try {
      const response = await this.fetch('/health', { timeout: 5000 });
      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * List available models
   * vLLM serves a single model per instance
   */
  public async listModels(): Promise<string[]> {
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => m.id);
    } catch (error) {
      logger.warn(
        `Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`,
      );
      return this.config.models || [];
    }
  }

  /**
   * Get loaded models with details
   */
  public async getLoadedModels(): Promise<ILoadedModel[]> {
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => ({
        name: m.id,
        size: 0, // vLLM doesn't expose size
        loaded: true,
        requestCount: 0,
      }));
    } catch {
      // Return configured model as fallback
      return this.config.models.map((name) => ({
        name,
        size: 0,
        loaded: true,
        requestCount: 0,
      }));
    }
  }
  /**
   * Pull a model
   *
   * vLLM downloads its model automatically at startup, so this method is a
   * no-op; the model is fixed when the container is created.
   */
  public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
    logger.info(`vLLM downloads models at startup. Model: ${modelName}`);
    logger.info('To use a different model, create a new vLLM container.');
    if (onProgress) {
      onProgress({
        model: modelName,
        status: 'vLLM models are loaded at container startup',
        percent: 100,
      });
    }
    return true;
  }

  /**
   * Remove a model
   * vLLM serves a single model per instance
   */
  public async removeModel(modelName: string): Promise<boolean> {
    logger.info('vLLM serves a single model per instance.');
    logger.info(`To remove model ${modelName}, stop and remove this container.`);
    return true;
  }
  /**
   * Send a chat completion request
   * vLLM is OpenAI-compatible
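   *
   * @example Illustrative request
   * A sketch assuming `IChatCompletionRequest` and `IChatCompletionResponse`
   * mirror the OpenAI chat schema (`model`, `messages`, `choices`); the model
   * name is a placeholder.
   * ```ts
   * const response = await container.chatCompletion({
   *   model: 'meta-llama/Llama-3.1-8B-Instruct',
   *   messages: [{ role: 'user', content: 'Hello!' }],
   * });
   * console.log(response.choices[0]?.message?.content);
   * ```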
   */
  public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
    return this.fetchJson<IChatCompletionResponse>('/v1/chat/completions', {
      method: 'POST',
      body: {
        ...request,
        stream: false,
      },
      timeout: 300000, // 5 minutes
    });
  }

  /**
   * Stream a chat completion request
   * vLLM is OpenAI-compatible
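   *
   * @example Illustrative streaming consumer
   * Chunks are forwarded verbatim as SSE text (`data: {...}` lines, ending with
   * `data: [DONE]`). The callback below is one possible way to extract the
   * incremental tokens; field names assume the OpenAI streaming schema, and a
   * robust consumer would buffer lines split across chunk boundaries.
   * ```ts
   * let text = '';
   * await container.chatCompletionStream(request, (chunk) => {
   *   for (const line of chunk.split('\n')) {
   *     if (!line.startsWith('data: ') || line.startsWith('data: [DONE]')) continue;
   *     text += JSON.parse(line.slice('data: '.length)).choices?.[0]?.delta?.content ?? '';
   *   }
   * });
   * ```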
   */
  public async chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void> {
    const response = await this.fetch('/v1/chat/completions', {
      method: 'POST',
      body: {
        ...request,
        stream: true,
      },
      timeout: 300000,
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`HTTP ${response.status}: ${error}`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('No response body');
    }

    const decoder = new TextDecoder();
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      // Stream-decode so multi-byte characters split across chunks are handled
      const text = decoder.decode(value, { stream: true });
      // vLLM already sends data in SSE format
      onChunk(text);
    }
  }
  /**
   * Get vLLM-specific metrics
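   *
   * @example Illustrative result
   * Metric names and values below are examples only and vary by vLLM version;
   * a scrape line such as `vllm:num_requests_running{model_name="..."} 2`
   * is flattened into a name-to-value map:
   * ```ts
   * const metrics = await container.getMetrics();
   * // e.g. { "vllm:num_requests_running": 2, "vllm:gpu_cache_usage_perc": 0.41 }
   * ```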
   */
  public async getMetrics(): Promise<Record<string, unknown>> {
    try {
      const response = await this.fetch('/metrics', { timeout: 5000 });
      if (response.ok) {
        const text = await response.text();

        // Parse Prometheus metrics
        const metrics: Record<string, unknown> = {};
        const lines = text.split('\n');
        for (const line of lines) {
          if (line.startsWith('#') || !line.trim()) continue;
          // Prometheus metric names may contain colons (e.g. vLLM's "vllm:" prefix)
          const match = line.match(/^([\w:]+)(?:\{[^}]*\})?\s+([-+\d.eE]+)/);
          if (match) {
            metrics[match[1]] = parseFloat(match[2]);
          }
        }
        return metrics;
      }
    } catch {
      // Metrics endpoint may not be enabled
    }
    return {};
  }
}