initial
Some checks failed
CI / Type Check & Lint (push) Failing after 5s
CI / Build Test (Current Platform) (push) Failing after 5s
CI / Build All Platforms (push) Successful in 49s

2026-01-30 03:16:57 +00:00
commit daaf6559e3
80 changed files with 14430 additions and 0 deletions

216
ts/containers/base-container.ts Normal file

@@ -0,0 +1,216 @@
/**
* Base Container
*
* Abstract base class for AI model containers.
*/
import type {
IContainerConfig,
IContainerStatus,
ILoadedModel,
TContainerType,
} from '../interfaces/container.ts';
import type { IChatCompletionRequest, IChatCompletionResponse } from '../interfaces/api.ts';
import { ContainerRuntime } from '../docker/container-runtime.ts';
import { logger } from '../logger.ts';
/**
* Model pull progress callback
*/
export type TModelPullProgress = (progress: {
model: string;
status: string;
percent?: number;
}) => void;
/**
* Abstract base class for AI model containers
*/
export abstract class BaseContainer {
/** Container type */
public abstract readonly type: TContainerType;
/** Display name */
public abstract readonly displayName: string;
/** Default Docker image */
public abstract readonly defaultImage: string;
/** Default internal port */
public abstract readonly defaultPort: number;
/** Container configuration */
protected config: IContainerConfig;
/** Container runtime */
protected runtime: ContainerRuntime;
constructor(config: IContainerConfig) {
this.config = config;
this.runtime = new ContainerRuntime();
}
/**
* Get the container configuration
*/
public getConfig(): IContainerConfig {
return this.config;
}
/**
* Get the endpoint URL for this container
*/
public getEndpoint(): string {
const port = this.config.externalPort || this.config.port;
return `http://localhost:${port}`;
}
/**
* Start the container
*/
public async start(): Promise<boolean> {
logger.info(`Starting ${this.displayName} container: ${this.config.name}`);
return this.runtime.startContainer(this.config);
}
/**
* Stop the container
*/
public async stop(): Promise<boolean> {
logger.info(`Stopping ${this.displayName} container: ${this.config.name}`);
return this.runtime.stopContainer(this.config.id);
}
/**
* Restart the container
*/
public async restart(): Promise<boolean> {
logger.info(`Restarting ${this.displayName} container: ${this.config.name}`);
return this.runtime.restartContainer(this.config.id);
}
/**
* Remove the container
*/
public async remove(): Promise<boolean> {
logger.info(`Removing ${this.displayName} container: ${this.config.name}`);
return this.runtime.removeContainer(this.config.id);
}
/**
* Get container status
*/
public async getStatus(): Promise<IContainerStatus> {
return this.runtime.getContainerStatus(this.config);
}
/**
* Get container logs
*/
public async getLogs(lines: number = 100): Promise<string> {
return this.runtime.getLogs(this.config.id, { lines });
}
/**
* Check if the container is healthy
*/
public abstract isHealthy(): Promise<boolean>;
/**
* Get list of available models
*/
public abstract listModels(): Promise<string[]>;
/**
* Get list of loaded models with details
*/
public abstract getLoadedModels(): Promise<ILoadedModel[]>;
/**
* Pull a model
*/
public abstract pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean>;
/**
* Remove a model
*/
public abstract removeModel(modelName: string): Promise<boolean>;
/**
* Send a chat completion request
*/
public abstract chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse>;
/**
* Stream a chat completion request
*/
public abstract chatCompletionStream(
request: IChatCompletionRequest,
onChunk: (chunk: string) => void,
): Promise<void>;
/**
* Make HTTP request to container
*/
protected async fetch(
path: string,
options: {
method?: string;
headers?: Record<string, string>;
body?: unknown;
timeout?: number;
} = {},
): Promise<Response> {
const endpoint = this.getEndpoint();
const url = `${endpoint}${path}`;
const controller = new AbortController();
const timeout = options.timeout || 30000;
const timeoutId = setTimeout(() => controller.abort(), timeout);
try {
const response = await fetch(url, {
method: options.method || 'GET',
headers: {
'Content-Type': 'application/json',
...options.headers,
},
body: options.body ? JSON.stringify(options.body) : undefined,
signal: controller.signal,
});
return response;
} finally {
clearTimeout(timeoutId);
}
}
/**
* Make HTTP request and parse JSON response
*/
protected async fetchJson<T>(
path: string,
options: {
method?: string;
headers?: Record<string, string>;
body?: unknown;
timeout?: number;
} = {},
): Promise<T> {
const response = await this.fetch(path, options);
if (!response.ok) {
const errorText = await response.text();
throw new Error(`HTTP ${response.status}: ${errorText}`);
}
return response.json();
}
/**
* Generate a unique request ID
*/
protected generateRequestId(): string {
return `chatcmpl-${Date.now().toString(36)}-${Math.random().toString(36).substring(2, 8)}`;
}
}
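
The TModelPullProgress callback defined above is the main hook this base class surfaces to callers during long downloads. An illustrative sketch, not taken from the committed files, assuming a Deno-style runtime with top-level await, a caller sitting next to these files, the OllamaContainer added later in this commit, and an arbitrary model name:

import { OllamaContainer } from './ollama.ts';
import type { TModelPullProgress } from './base-container.ts';

const onProgress: TModelPullProgress = ({ model, status, percent }) => {
  const pct = percent !== undefined ? ` ${percent}%` : '';
  console.log(`[${model}] ${status}${pct}`);
};

const ollama = new OllamaContainer(
  OllamaContainer.createConfig('ollama-0', 'local-ollama', ['0']),
);

if (await ollama.start()) {
  await ollama.pullModel('llama3.2', onProgress); // progress reported via the callback
  console.log(await ollama.getLogs(20));          // last 20 container log lines
}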

349
ts/containers/container-manager.ts Normal file

@@ -0,0 +1,349 @@
/**
* Container Manager
*
* Orchestrates multiple AI model containers.
*/
import type {
IContainerConfig,
IContainerStatus,
IContainerEndpoint,
TContainerType,
} from '../interfaces/container.ts';
import { logger } from '../logger.ts';
import { DockerManager } from '../docker/docker-manager.ts';
import { BaseContainer } from './base-container.ts';
import { OllamaContainer } from './ollama.ts';
import { VllmContainer } from './vllm.ts';
import { TgiContainer } from './tgi.ts';
/**
* Container Manager - orchestrates all containers
*/
export class ContainerManager {
private containers: Map<string, BaseContainer>;
private dockerManager: DockerManager;
constructor() {
this.containers = new Map();
this.dockerManager = new DockerManager();
}
/**
* Initialize container manager
*/
public async initialize(): Promise<void> {
// Ensure Docker is running
if (!await this.dockerManager.isRunning()) {
throw new Error('Docker is not running');
}
// Create network if it doesn't exist
await this.dockerManager.createNetwork();
}
/**
* Create a container instance from config
*/
private createContainerInstance(config: IContainerConfig): BaseContainer {
switch (config.type) {
case 'ollama':
return new OllamaContainer(config);
case 'vllm':
return new VllmContainer(config);
case 'tgi':
return new TgiContainer(config);
default:
throw new Error(`Unknown container type: ${config.type}`);
}
}
/**
* Add a container
*/
public addContainer(config: IContainerConfig): BaseContainer {
if (this.containers.has(config.id)) {
throw new Error(`Container with ID ${config.id} already exists`);
}
const container = this.createContainerInstance(config);
this.containers.set(config.id, container);
return container;
}
/**
* Remove a container
*/
public async removeContainer(containerId: string): Promise<boolean> {
const container = this.containers.get(containerId);
if (!container) {
return false;
}
await container.remove();
this.containers.delete(containerId);
return true;
}
/**
* Get a container by ID
*/
public getContainer(containerId: string): BaseContainer | undefined {
return this.containers.get(containerId);
}
/**
* Get all containers
*/
public getAllContainers(): BaseContainer[] {
return Array.from(this.containers.values());
}
/**
* Load containers from configuration
*/
public loadFromConfig(configs: IContainerConfig[]): void {
this.containers.clear();
for (const config of configs) {
try {
this.addContainer(config);
} catch (error) {
logger.warn(`Failed to load container ${config.id}: ${error instanceof Error ? error.message : String(error)}`);
}
}
}
/**
* Start all containers
*/
public async startAll(): Promise<Map<string, boolean>> {
const results = new Map<string, boolean>();
for (const [id, container] of this.containers) {
if (!container.getConfig().autoStart) {
continue;
}
try {
const success = await container.start();
results.set(id, success);
} catch (error) {
logger.error(`Failed to start container ${id}: ${error instanceof Error ? error.message : String(error)}`);
results.set(id, false);
}
}
return results;
}
/**
* Stop all containers
*/
public async stopAll(): Promise<Map<string, boolean>> {
const results = new Map<string, boolean>();
for (const [id, container] of this.containers) {
try {
const success = await container.stop();
results.set(id, success);
} catch (error) {
logger.error(`Failed to stop container ${id}: ${error instanceof Error ? error.message : String(error)}`);
results.set(id, false);
}
}
return results;
}
/**
* Get status of all containers
*/
public async getAllStatus(): Promise<Map<string, IContainerStatus>> {
const statuses = new Map<string, IContainerStatus>();
for (const [id, container] of this.containers) {
try {
const status = await container.getStatus();
statuses.set(id, status);
} catch (error) {
logger.warn(`Failed to get status for container ${id}: ${error instanceof Error ? error.message : String(error)}`);
}
}
return statuses;
}
/**
* Get available endpoints for a model
*/
public async getEndpointsForModel(modelName: string): Promise<IContainerEndpoint[]> {
const endpoints: IContainerEndpoint[] = [];
for (const [_id, container] of this.containers) {
try {
const status = await container.getStatus();
if (!status.running) {
continue;
}
// Check if container has this model
const models = await container.listModels();
if (!models.includes(modelName)) {
continue;
}
endpoints.push({
containerId: container.getConfig().id,
type: container.type,
url: container.getEndpoint(),
models,
healthy: status.health === 'healthy',
priority: 0, // Could be based on load
});
} catch {
// Skip containers that fail to respond
}
}
return endpoints;
}
/**
* Find best container for a model
*/
public async findContainerForModel(modelName: string): Promise<BaseContainer | null> {
const endpoints = await this.getEndpointsForModel(modelName);
// Filter to healthy endpoints
const healthy = endpoints.filter((e) => e.healthy);
if (healthy.length === 0) {
return null;
}
// Return first healthy endpoint (could add load balancing)
const endpoint = healthy[0];
return this.containers.get(endpoint.containerId) || null;
}
/**
* Get all available models across all containers
*/
public async getAllAvailableModels(): Promise<Map<string, IContainerEndpoint[]>> {
const modelMap = new Map<string, IContainerEndpoint[]>();
for (const container of this.containers.values()) {
try {
const status = await container.getStatus();
if (!status.running) continue;
const models = await container.listModels();
for (const model of models) {
if (!modelMap.has(model)) {
modelMap.set(model, []);
}
modelMap.get(model)!.push({
containerId: container.getConfig().id,
type: container.type,
url: container.getEndpoint(),
models,
healthy: status.health === 'healthy',
priority: 0,
});
}
} catch {
// Skip failed containers
}
}
return modelMap;
}
/**
* Pull a model to a specific container type
*/
public async pullModel(
modelName: string,
containerType: TContainerType = 'ollama',
containerId?: string,
): Promise<boolean> {
// Find or create appropriate container
let container: BaseContainer | undefined;
if (containerId) {
container = this.containers.get(containerId);
} else {
// Find first container of the specified type
for (const c of this.containers.values()) {
if (c.type === containerType) {
container = c;
break;
}
}
}
if (!container) {
logger.error(`No ${containerType} container available to pull model`);
return false;
}
return container.pullModel(modelName, (progress) => {
const percent = progress.percent !== undefined ? ` (${progress.percent}%)` : '';
logger.dim(` ${progress.status}${percent}`);
});
}
/**
* Health check all containers
*/
public async healthCheck(): Promise<Map<string, boolean>> {
const results = new Map<string, boolean>();
for (const [id, container] of this.containers) {
try {
const healthy = await container.isHealthy();
results.set(id, healthy);
} catch {
results.set(id, false);
}
}
return results;
}
/**
* Print container status summary
*/
public async printStatus(): Promise<void> {
const statuses = await this.getAllStatus();
if (statuses.size === 0) {
logger.logBox('Containers', ['No containers configured'], 50, 'warning');
return;
}
logger.logBoxTitle('Container Status', 70, 'info');
for (const [id, status] of statuses) {
const runningStr = status.running ? 'Running' : 'Stopped';
const healthStr = status.health;
const modelsStr = status.loadedModels.length > 0
? status.loadedModels.join(', ')
: 'None';
logger.logBoxLine(`${status.name} (${id})`);
logger.logBoxLine(` Type: ${status.type} | Status: ${runningStr} | Health: ${healthStr}`);
logger.logBoxLine(` Models: ${modelsStr}`);
logger.logBoxLine(` Endpoint: ${status.endpoint}`);
if (status.gpuUtilization !== undefined) {
logger.logBoxLine(` GPU: ${status.gpuUtilization}% | Memory: ${status.memoryUsage || 0}MB`);
}
logger.logBoxLine('');
}
logger.logBoxEnd();
}
}
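
Taken together, the manager gives a small end-to-end flow: verify Docker, register configs, start whatever is marked autoStart, then route a request to a healthy container that already lists the model. An illustrative sketch, assuming the OpenAI-style request shape used throughout these classes and placeholder config and model values:

import { ContainerManager } from './container-manager.ts';
import { OllamaContainer } from './ollama.ts';

const manager = new ContainerManager();
await manager.initialize(); // throws if Docker is not running

manager.loadFromConfig([
  OllamaContainer.createConfig('ollama-0', 'local-ollama', ['0']),
]);
await manager.startAll(); // only containers with autoStart: true

const target = await manager.findContainerForModel('llama3.2');
if (target) {
  const reply = await target.chatCompletion({
    model: 'llama3.2',
    messages: [{ role: 'user' as const, content: 'Hello!' }],
  });
  console.log(reply.choices[0].message.content);
}
await manager.printStatus();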

11
ts/containers/index.ts Normal file

@@ -0,0 +1,11 @@
/**
* Container Management Module
*
* Exports all AI container implementations.
*/
export { BaseContainer } from './base-container.ts';
export { OllamaContainer } from './ollama.ts';
export { VllmContainer } from './vllm.ts';
export { TgiContainer } from './tgi.ts';
export { ContainerManager } from './container-manager.ts';
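
Callers outside this directory would normally import through this barrel; a one-line illustrative sketch, assuming a hypothetical caller sitting in ts/:

import { ContainerManager, OllamaContainer } from './containers/index.ts';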

387
ts/containers/ollama.ts Normal file

@@ -0,0 +1,387 @@
/**
* Ollama Container
*
* Manages Ollama containers for running local LLMs.
*/
import type {
IContainerConfig,
ILoadedModel,
TContainerType,
} from '../interfaces/container.ts';
import type {
IChatCompletionRequest,
IChatCompletionResponse,
IChatCompletionChoice,
IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';
/**
* Ollama API response types
*/
interface IOllamaTagsResponse {
models: Array<{
name: string;
size: number;
digest: string;
modified_at: string;
}>;
}
interface IOllamaChatRequest {
model: string;
messages: Array<{
role: string;
content: string;
}>;
stream?: boolean;
options?: {
temperature?: number;
top_p?: number;
num_predict?: number;
stop?: string[];
};
}
interface IOllamaChatResponse {
model: string;
created_at: string;
message: {
role: string;
content: string;
};
done: boolean;
total_duration?: number;
load_duration?: number;
prompt_eval_count?: number;
eval_count?: number;
}
interface IOllamaPullResponse {
status: string;
digest?: string;
total?: number;
completed?: number;
}
/**
* Ollama container implementation
*/
export class OllamaContainer extends BaseContainer {
public readonly type: TContainerType = 'ollama';
public readonly displayName = 'Ollama';
public readonly defaultImage = CONTAINER_IMAGES.OLLAMA;
public readonly defaultPort = CONTAINER_PORTS.OLLAMA;
constructor(config: IContainerConfig) {
super(config);
// Set defaults if not provided
if (!config.image) {
config.image = this.defaultImage;
}
if (!config.port) {
config.port = this.defaultPort;
}
// Add default volume for model storage
if (!config.volumes || config.volumes.length === 0) {
config.volumes = [`modelgrid-ollama-${config.id}:/root/.ollama`];
}
}
/**
* Create Ollama container configuration
*/
public static createConfig(
id: string,
name: string,
gpuIds: string[],
options: Partial<IContainerConfig> = {},
): IContainerConfig {
return {
id,
name,
type: 'ollama',
image: options.image || CONTAINER_IMAGES.OLLAMA,
gpuIds,
port: options.port || CONTAINER_PORTS.OLLAMA,
externalPort: options.externalPort,
models: options.models || [],
env: options.env,
volumes: options.volumes || [`modelgrid-ollama-${id}:/root/.ollama`],
autoStart: options.autoStart ?? true,
restartPolicy: options.restartPolicy || 'unless-stopped',
memoryLimit: options.memoryLimit,
cpuLimit: options.cpuLimit,
command: options.command,
};
}
/**
* Check if Ollama is healthy
*/
public async isHealthy(): Promise<boolean> {
try {
const response = await this.fetch('/api/tags', { timeout: 5000 });
return response.ok;
} catch {
return false;
}
}
/**
* List available models
*/
public async listModels(): Promise<string[]> {
try {
const data = await this.fetchJson<IOllamaTagsResponse>('/api/tags');
return (data.models || []).map((m) => m.name);
} catch (error) {
logger.warn(`Failed to list Ollama models: ${error instanceof Error ? error.message : String(error)}`);
return [];
}
}
/**
* Get loaded models with details
*/
public async getLoadedModels(): Promise<ILoadedModel[]> {
try {
const data = await this.fetchJson<IOllamaTagsResponse>('/api/tags');
return (data.models || []).map((m) => ({
name: m.name,
size: m.size,
format: m.digest.substring(0, 12),
loaded: true, // /api/tags lists locally available models; actual load state isn't tracked here
requestCount: 0,
}));
} catch {
return [];
}
}
/**
* Pull a model
*/
public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
try {
logger.info(`Pulling model: ${modelName}`);
const response = await this.fetch('/api/pull', {
method: 'POST',
body: { name: modelName },
timeout: 3600000, // 1 hour for large models
});
if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}
// Read streaming response
const reader = response.body?.getReader();
if (!reader) {
throw new Error('No response body');
}
const decoder = new TextDecoder();
let lastStatus = '';
while (true) {
const { done, value } = await reader.read();
if (done) break;
const text = decoder.decode(value);
const lines = text.split('\n').filter((l) => l.trim());
for (const line of lines) {
try {
const data = JSON.parse(line) as IOllamaPullResponse;
const status = data.status;
if (status !== lastStatus) {
lastStatus = status;
let percent: number | undefined;
if (data.total && data.completed) {
percent = Math.round((data.completed / data.total) * 100);
}
if (onProgress) {
onProgress({ model: modelName, status, percent });
} else {
const progressStr = percent !== undefined ? ` (${percent}%)` : '';
logger.dim(` ${status}${progressStr}`);
}
}
} catch {
// Invalid JSON line, skip
}
}
}
logger.success(`Model ${modelName} pulled successfully`);
return true;
} catch (error) {
logger.error(`Failed to pull model ${modelName}: ${error instanceof Error ? error.message : String(error)}`);
return false;
}
}
/**
* Remove a model
*/
public async removeModel(modelName: string): Promise<boolean> {
try {
const response = await this.fetch('/api/delete', {
method: 'DELETE',
body: { name: modelName },
});
if (response.ok) {
logger.success(`Model ${modelName} removed`);
return true;
}
throw new Error(`HTTP ${response.status}`);
} catch (error) {
logger.error(`Failed to remove model ${modelName}: ${error instanceof Error ? error.message : String(error)}`);
return false;
}
}
/**
* Send a chat completion request
*/
public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
const ollamaRequest: IOllamaChatRequest = {
model: request.model,
messages: request.messages.map((m) => ({
role: m.role,
content: m.content,
})),
stream: false,
options: {
temperature: request.temperature,
top_p: request.top_p,
num_predict: request.max_tokens,
stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
},
};
const response = await this.fetchJson<IOllamaChatResponse>('/api/chat', {
method: 'POST',
body: ollamaRequest,
timeout: 300000, // 5 minutes
});
// Convert to OpenAI format
const created = Math.floor(Date.now() / 1000);
const choice: IChatCompletionChoice = {
index: 0,
message: {
role: 'assistant',
content: response.message.content,
},
finish_reason: response.done ? 'stop' : null,
};
return {
id: this.generateRequestId(),
object: 'chat.completion',
created,
model: request.model,
choices: [choice],
usage: {
prompt_tokens: response.prompt_eval_count || 0,
completion_tokens: response.eval_count || 0,
total_tokens: (response.prompt_eval_count || 0) + (response.eval_count || 0),
},
};
}
/**
* Stream a chat completion request
*/
public async chatCompletionStream(
request: IChatCompletionRequest,
onChunk: (chunk: string) => void,
): Promise<void> {
const ollamaRequest: IOllamaChatRequest = {
model: request.model,
messages: request.messages.map((m) => ({
role: m.role,
content: m.content,
})),
stream: true,
options: {
temperature: request.temperature,
top_p: request.top_p,
num_predict: request.max_tokens,
stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
},
};
const response = await this.fetch('/api/chat', {
method: 'POST',
body: ollamaRequest,
timeout: 300000,
});
if (!response.ok) {
throw new Error(`HTTP ${response.status}`);
}
const reader = response.body?.getReader();
if (!reader) {
throw new Error('No response body');
}
const decoder = new TextDecoder();
const requestId = this.generateRequestId();
const created = Math.floor(Date.now() / 1000);
while (true) {
const { done, value } = await reader.read();
if (done) break;
const text = decoder.decode(value);
const lines = text.split('\n').filter((l) => l.trim());
for (const line of lines) {
try {
const data = JSON.parse(line) as IOllamaChatResponse;
// Convert to OpenAI streaming format
const chunk = {
id: requestId,
object: 'chat.completion.chunk',
created,
model: request.model,
choices: [
{
index: 0,
delta: {
content: data.message.content,
} as Partial<IChatMessage>,
finish_reason: data.done ? 'stop' : null,
},
],
};
onChunk(`data: ${JSON.stringify(chunk)}\n\n`);
if (data.done) {
onChunk('data: [DONE]\n\n');
}
} catch {
// Invalid JSON, skip
}
}
}
}
}
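
chatCompletionStream() re-emits Ollama's NDJSON stream as OpenAI-style "data:" events, so a caller only needs to split on SSE lines and read choices[0].delta.content. An illustrative sketch with a placeholder model and prompt, accumulating the streamed text:

import { OllamaContainer } from './ollama.ts';

const ollama = new OllamaContainer(
  OllamaContainer.createConfig('ollama-0', 'local-ollama', ['0']),
);

let answer = '';
await ollama.chatCompletionStream(
  { model: 'llama3.2', messages: [{ role: 'user' as const, content: 'Tell me a joke.' }] },
  (chunk) => {
    // Each chunk is one "data: {...}\n\n" event (or the final "data: [DONE]").
    for (const line of chunk.split('\n')) {
      if (!line.startsWith('data: ') || line.includes('[DONE]')) continue;
      const delta = JSON.parse(line.slice(6)).choices?.[0]?.delta?.content;
      if (delta) answer += delta;
    }
  },
);
console.log(answer);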

417
ts/containers/tgi.ts Normal file

@@ -0,0 +1,417 @@
/**
* TGI Container (Text Generation Inference)
*
* Manages HuggingFace Text Generation Inference containers.
*/
import type {
IContainerConfig,
ILoadedModel,
TContainerType,
} from '../interfaces/container.ts';
import type {
IChatCompletionRequest,
IChatCompletionResponse,
IChatCompletionChoice,
IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';
/**
* TGI info response
*/
interface ITgiInfoResponse {
model_id: string;
model_sha: string;
model_dtype: string;
model_device_type: string;
max_concurrent_requests: number;
max_best_of: number;
max_stop_sequences: number;
max_input_length: number;
max_total_tokens: number;
version: string;
}
/**
* TGI generate request
*/
interface ITgiGenerateRequest {
inputs: string;
parameters?: {
temperature?: number;
top_p?: number;
max_new_tokens?: number;
stop?: string[];
do_sample?: boolean;
return_full_text?: boolean;
};
}
/**
* TGI generate response
*/
interface ITgiGenerateResponse {
generated_text: string;
details?: {
finish_reason: string;
generated_tokens: number;
seed?: number;
};
}
/**
* TGI container implementation
*
* TGI is optimized for:
* - Production deployments
* - Flash Attention support
* - Quantization (bitsandbytes, GPTQ, AWQ)
* - Multiple GPU support with tensor parallelism
*/
export class TgiContainer extends BaseContainer {
public readonly type: TContainerType = 'tgi';
public readonly displayName = 'TGI';
public readonly defaultImage = CONTAINER_IMAGES.TGI;
public readonly defaultPort = CONTAINER_PORTS.TGI;
constructor(config: IContainerConfig) {
super(config);
// Set defaults if not provided
if (!config.image) {
config.image = this.defaultImage;
}
if (!config.port) {
config.port = this.defaultPort;
}
// Add default volume for model cache
if (!config.volumes || config.volumes.length === 0) {
config.volumes = [`modelgrid-tgi-${config.id}:/data`];
}
}
/**
* Create TGI container configuration
*/
public static createConfig(
id: string,
name: string,
modelName: string,
gpuIds: string[],
options: Partial<IContainerConfig> = {},
): IContainerConfig {
const env: Record<string, string> = {
MODEL_ID: modelName,
PORT: String(options.port || CONTAINER_PORTS.TGI),
HUGGING_FACE_HUB_TOKEN: options.env?.HF_TOKEN || options.env?.HUGGING_FACE_HUB_TOKEN || '',
...options.env,
};
// Add GPU configuration
if (gpuIds.length > 1) {
env.NUM_SHARD = String(gpuIds.length);
}
// Add quantization if specified
if (options.env?.QUANTIZE) {
env.QUANTIZE = options.env.QUANTIZE;
}
return {
id,
name,
type: 'tgi',
image: options.image || CONTAINER_IMAGES.TGI,
gpuIds,
port: options.port || CONTAINER_PORTS.TGI,
externalPort: options.externalPort,
models: [modelName],
env,
volumes: options.volumes || [`modelgrid-tgi-${id}:/data`],
autoStart: options.autoStart ?? true,
restartPolicy: options.restartPolicy || 'unless-stopped',
memoryLimit: options.memoryLimit,
cpuLimit: options.cpuLimit,
command: options.command,
};
}
/**
* Check if TGI is healthy
*/
public async isHealthy(): Promise<boolean> {
try {
const response = await this.fetch('/health', { timeout: 5000 });
return response.ok;
} catch {
return false;
}
}
/**
* List available models
* TGI serves a single model per instance
*/
public async listModels(): Promise<string[]> {
try {
const info = await this.fetchJson<ITgiInfoResponse>('/info');
return [info.model_id];
} catch (error) {
logger.warn(`Failed to get TGI info: ${error instanceof Error ? error.message : String(error)}`);
return this.config.models || [];
}
}
/**
* Get loaded models with details
*/
public async getLoadedModels(): Promise<ILoadedModel[]> {
try {
const info = await this.fetchJson<ITgiInfoResponse>('/info');
return [{
name: info.model_id,
size: 0, // TGI doesn't expose model size
format: info.model_dtype,
loaded: true,
requestCount: 0,
}];
} catch {
return this.config.models.map((name) => ({
name,
size: 0,
loaded: true,
requestCount: 0,
}));
}
}
/**
* Pull a model
* TGI downloads models automatically at startup
*/
public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
logger.info(`TGI downloads models at startup. Model: ${modelName}`);
logger.info('To use a different model, create a new TGI container.');
if (onProgress) {
onProgress({
model: modelName,
status: 'TGI models are loaded at container startup',
percent: 100,
});
}
return true;
}
/**
* Remove a model
* TGI serves a single model per instance
*/
public async removeModel(modelName: string): Promise<boolean> {
logger.info(`TGI serves a single model per instance.`);
logger.info(`To remove model ${modelName}, stop and remove this container.`);
return true;
}
/**
* Send a chat completion request
* Convert OpenAI format to TGI format
*/
public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
// Convert messages to TGI prompt format
const prompt = this.messagesToPrompt(request.messages);
const tgiRequest: ITgiGenerateRequest = {
inputs: prompt,
parameters: {
temperature: request.temperature,
top_p: request.top_p,
max_new_tokens: request.max_tokens || 1024,
stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
do_sample: (request.temperature || 0) > 0,
return_full_text: false,
},
};
const response = await this.fetchJson<ITgiGenerateResponse>('/generate', {
method: 'POST',
body: tgiRequest,
timeout: 300000, // 5 minutes
});
// Convert to OpenAI format
const created = Math.floor(Date.now() / 1000);
const choice: IChatCompletionChoice = {
index: 0,
message: {
role: 'assistant',
content: response.generated_text,
},
finish_reason: response.details?.finish_reason === 'eos_token' ? 'stop' : 'length',
};
return {
id: this.generateRequestId(),
object: 'chat.completion',
created,
model: this.config.models[0] || 'unknown',
choices: [choice],
usage: {
prompt_tokens: 0, // TGI doesn't always report this
completion_tokens: response.details?.generated_tokens || 0,
total_tokens: response.details?.generated_tokens || 0,
},
};
}
/**
* Stream a chat completion request
*/
public async chatCompletionStream(
request: IChatCompletionRequest,
onChunk: (chunk: string) => void,
): Promise<void> {
// Convert messages to TGI prompt format
const prompt = this.messagesToPrompt(request.messages);
const response = await this.fetch('/generate_stream', {
method: 'POST',
body: {
inputs: prompt,
parameters: {
temperature: request.temperature,
top_p: request.top_p,
max_new_tokens: request.max_tokens || 1024,
stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
do_sample: (request.temperature || 0) > 0,
},
},
timeout: 300000,
});
if (!response.ok) {
const error = await response.text();
throw new Error(`HTTP ${response.status}: ${error}`);
}
const reader = response.body?.getReader();
if (!reader) {
throw new Error('No response body');
}
const decoder = new TextDecoder();
const requestId = this.generateRequestId();
const created = Math.floor(Date.now() / 1000);
const model = this.config.models[0] || 'unknown';
while (true) {
const { done, value } = await reader.read();
if (done) break;
const text = decoder.decode(value);
const lines = text.split('\n').filter((l) => l.startsWith('data:'));
for (const line of lines) {
try {
const jsonStr = line.substring(5).trim();
if (jsonStr === '[DONE]') {
onChunk('data: [DONE]\n\n');
continue;
}
const data = JSON.parse(jsonStr);
// Convert to OpenAI streaming format
const chunk = {
id: requestId,
object: 'chat.completion.chunk',
created,
model,
choices: [
{
index: 0,
delta: {
content: data.token?.text || '',
} as Partial<IChatMessage>,
finish_reason: data.details?.finish_reason ? 'stop' : null,
},
],
};
onChunk(`data: ${JSON.stringify(chunk)}\n\n`);
} catch {
// Invalid JSON, skip
}
}
}
}
/**
* Convert chat messages to TGI prompt format
*/
private messagesToPrompt(messages: IChatMessage[]): string {
// Use a simple chat template
// TGI can use model-specific templates via the Messages API
let prompt = '';
for (const message of messages) {
switch (message.role) {
case 'system':
prompt += `System: ${message.content}\n\n`;
break;
case 'user':
prompt += `User: ${message.content}\n\n`;
break;
case 'assistant':
prompt += `Assistant: ${message.content}\n\n`;
break;
}
}
prompt += 'Assistant:';
return prompt;
}
/**
* Get TGI server info
*/
public async getInfo(): Promise<ITgiInfoResponse | null> {
try {
return await this.fetchJson<ITgiInfoResponse>('/info');
} catch {
return null;
}
}
/**
* Get TGI metrics
*/
public async getMetrics(): Promise<Record<string, unknown>> {
try {
const response = await this.fetch('/metrics', { timeout: 5000 });
if (response.ok) {
const text = await response.text();
// Parse Prometheus metrics
const metrics: Record<string, unknown> = {};
const lines = text.split('\n');
for (const line of lines) {
if (line.startsWith('#') || !line.trim()) continue;
const match = line.match(/^(\w+)(?:\{[^}]*\})?\s+([\d.e+-]+)/);
if (match) {
metrics[match[1]] = parseFloat(match[2]);
}
}
return metrics;
}
} catch {
// Metrics endpoint may not be available
}
return {};
}
}
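
createConfig() derives NUM_SHARD from the number of GPUs and passes QUANTIZE straight through to the container environment. An illustrative sketch of a two-GPU, AWQ-quantized TGI instance; the model id and token are placeholders:

import { TgiContainer } from './tgi.ts';

const config = TgiContainer.createConfig(
  'tgi-0',
  'tgi-llama',
  'meta-llama/Llama-3.1-8B-Instruct', // placeholder model id
  ['0', '1'],                         // two GPUs -> NUM_SHARD=2
  { env: { HF_TOKEN: '<your-hf-token>', QUANTIZE: 'awq' } },
);

const tgi = new TgiContainer(config);
if (await tgi.start()) {
  console.log(await tgi.getInfo()); // model_id, dtype, max_total_tokens, ...
}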

272
ts/containers/vllm.ts Normal file

@@ -0,0 +1,272 @@
/**
* vLLM Container
*
* Manages vLLM containers for high-performance LLM inference.
*/
import type {
IContainerConfig,
ILoadedModel,
TContainerType,
} from '../interfaces/container.ts';
import type {
IChatCompletionRequest,
IChatCompletionResponse,
IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';
/**
* vLLM model info response
*/
interface IVllmModelsResponse {
object: 'list';
data: Array<{
id: string;
object: 'model';
created: number;
owned_by: string;
}>;
}
/**
* vLLM container implementation
*
* vLLM serves a single model per instance and is optimized for:
* - High throughput with PagedAttention
* - Continuous batching
* - OpenAI-compatible API
*/
export class VllmContainer extends BaseContainer {
public readonly type: TContainerType = 'vllm';
public readonly displayName = 'vLLM';
public readonly defaultImage = CONTAINER_IMAGES.VLLM;
public readonly defaultPort = CONTAINER_PORTS.VLLM;
constructor(config: IContainerConfig) {
super(config);
// Set defaults if not provided
if (!config.image) {
config.image = this.defaultImage;
}
if (!config.port) {
config.port = this.defaultPort;
}
// Add default volume for model cache
if (!config.volumes || config.volumes.length === 0) {
config.volumes = [`modelgrid-vllm-${config.id}:/root/.cache/huggingface`];
}
}
/**
* Create vLLM container configuration
*/
public static createConfig(
id: string,
name: string,
modelName: string,
gpuIds: string[],
options: Partial<IContainerConfig> = {},
): IContainerConfig {
// vLLM requires model to be specified at startup
const command = [
'--model', modelName,
'--host', '0.0.0.0',
'--port', String(options.port || CONTAINER_PORTS.VLLM),
];
// Add tensor parallelism if multiple GPUs
if (gpuIds.length > 1) {
command.push('--tensor-parallel-size', String(gpuIds.length));
}
// Add additional options
if (options.env?.VLLM_MAX_MODEL_LEN) {
command.push('--max-model-len', options.env.VLLM_MAX_MODEL_LEN);
}
return {
id,
name,
type: 'vllm',
image: options.image || CONTAINER_IMAGES.VLLM,
gpuIds,
port: options.port || CONTAINER_PORTS.VLLM,
externalPort: options.externalPort,
models: [modelName],
env: {
HF_TOKEN: options.env?.HF_TOKEN || '',
...options.env,
},
volumes: options.volumes || [`modelgrid-vllm-${id}:/root/.cache/huggingface`],
autoStart: options.autoStart ?? true,
restartPolicy: options.restartPolicy || 'unless-stopped',
memoryLimit: options.memoryLimit,
cpuLimit: options.cpuLimit,
command,
};
}
/**
* Check if vLLM is healthy
*/
public async isHealthy(): Promise<boolean> {
try {
const response = await this.fetch('/health', { timeout: 5000 });
return response.ok;
} catch {
return false;
}
}
/**
* List available models
* vLLM serves a single model per instance
*/
public async listModels(): Promise<string[]> {
try {
const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
return (data.data || []).map((m) => m.id);
} catch (error) {
logger.warn(`Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`);
return this.config.models || [];
}
}
/**
* Get loaded models with details
*/
public async getLoadedModels(): Promise<ILoadedModel[]> {
try {
const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
return (data.data || []).map((m) => ({
name: m.id,
size: 0, // vLLM doesn't expose size
loaded: true,
requestCount: 0,
}));
} catch {
// Return configured model as fallback
return this.config.models.map((name) => ({
name,
size: 0,
loaded: true,
requestCount: 0,
}));
}
}
/**
* Pull a model
* vLLM downloads models automatically at startup
* This method is a no-op - models are configured at container creation
*/
public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
logger.info(`vLLM downloads models at startup. Model: ${modelName}`);
logger.info('To use a different model, create a new vLLM container.');
if (onProgress) {
onProgress({
model: modelName,
status: 'vLLM models are loaded at container startup',
percent: 100,
});
}
return true;
}
/**
* Remove a model
* vLLM serves a single model per instance
*/
public async removeModel(modelName: string): Promise<boolean> {
logger.info(`vLLM serves a single model per instance.`);
logger.info(`To remove model ${modelName}, stop and remove this container.`);
return true;
}
/**
* Send a chat completion request
* vLLM is OpenAI-compatible
*/
public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
return this.fetchJson<IChatCompletionResponse>('/v1/chat/completions', {
method: 'POST',
body: {
...request,
stream: false,
},
timeout: 300000, // 5 minutes
});
}
/**
* Stream a chat completion request
* vLLM is OpenAI-compatible
*/
public async chatCompletionStream(
request: IChatCompletionRequest,
onChunk: (chunk: string) => void,
): Promise<void> {
const response = await this.fetch('/v1/chat/completions', {
method: 'POST',
body: {
...request,
stream: true,
},
timeout: 300000,
});
if (!response.ok) {
const error = await response.text();
throw new Error(`HTTP ${response.status}: ${error}`);
}
const reader = response.body?.getReader();
if (!reader) {
throw new Error('No response body');
}
const decoder = new TextDecoder();
while (true) {
const { done, value } = await reader.read();
if (done) break;
const text = decoder.decode(value);
// vLLM already sends data in SSE format
onChunk(text);
}
}
/**
* Get vLLM-specific metrics
*/
public async getMetrics(): Promise<Record<string, unknown>> {
try {
const response = await this.fetch('/metrics', { timeout: 5000 });
if (response.ok) {
const text = await response.text();
// Parse Prometheus metrics
const metrics: Record<string, unknown> = {};
const lines = text.split('\n');
for (const line of lines) {
if (line.startsWith('#') || !line.trim()) continue;
const match = line.match(/^(\w+)(?:\{[^}]*\})?\s+([\d.e+-]+)/);
if (match) {
metrics[match[1]] = parseFloat(match[2]);
}
}
return metrics;
}
} catch {
// Metrics endpoint may not be enabled
}
return {};
}
}
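
Because vLLM exposes an OpenAI-compatible server, chatCompletion() and chatCompletionStream() forward requests without translation. An illustrative single-GPU sketch; the model id, token, and host port are placeholders:

import { VllmContainer } from './vllm.ts';

const vllm = new VllmContainer(
  VllmContainer.createConfig('vllm-0', 'vllm-qwen', 'Qwen/Qwen2.5-7B-Instruct', ['0'], {
    externalPort: 8000, // host port, assumed free
    env: { HF_TOKEN: '<your-hf-token>', VLLM_MAX_MODEL_LEN: '8192' },
  }),
);

if (await vllm.start()) {
  const reply = await vllm.chatCompletion({
    model: 'Qwen/Qwen2.5-7B-Instruct',
    messages: [{ role: 'user' as const, content: 'Summarize PagedAttention in one line.' }],
  });
  console.log(reply.choices[0].message.content);
}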