initial
ts/containers/base-container.ts (new file, 216 lines)
@@ -0,0 +1,216 @@
/**
 * Base Container
 *
 * Abstract base class for AI model containers.
 */

import type {
  IContainerConfig,
  IContainerStatus,
  ILoadedModel,
  TContainerType,
} from '../interfaces/container.ts';
import type { IChatCompletionRequest, IChatCompletionResponse } from '../interfaces/api.ts';
import { ContainerRuntime } from '../docker/container-runtime.ts';
import { logger } from '../logger.ts';

/**
 * Model pull progress callback
 */
export type TModelPullProgress = (progress: {
  model: string;
  status: string;
  percent?: number;
}) => void;

/**
 * Abstract base class for AI model containers
 */
export abstract class BaseContainer {
  /** Container type */
  public abstract readonly type: TContainerType;

  /** Display name */
  public abstract readonly displayName: string;

  /** Default Docker image */
  public abstract readonly defaultImage: string;

  /** Default internal port */
  public abstract readonly defaultPort: number;

  /** Container configuration */
  protected config: IContainerConfig;

  /** Container runtime */
  protected runtime: ContainerRuntime;

  constructor(config: IContainerConfig) {
    this.config = config;
    this.runtime = new ContainerRuntime();
  }

  /**
   * Get the container configuration
   */
  public getConfig(): IContainerConfig {
    return this.config;
  }

  /**
   * Get the endpoint URL for this container
   */
  public getEndpoint(): string {
    const port = this.config.externalPort || this.config.port;
    return `http://localhost:${port}`;
  }

  /**
   * Start the container
   */
  public async start(): Promise<boolean> {
    logger.info(`Starting ${this.displayName} container: ${this.config.name}`);
    return this.runtime.startContainer(this.config);
  }

  /**
   * Stop the container
   */
  public async stop(): Promise<boolean> {
    logger.info(`Stopping ${this.displayName} container: ${this.config.name}`);
    return this.runtime.stopContainer(this.config.id);
  }

  /**
   * Restart the container
   */
  public async restart(): Promise<boolean> {
    logger.info(`Restarting ${this.displayName} container: ${this.config.name}`);
    return this.runtime.restartContainer(this.config.id);
  }

  /**
   * Remove the container
   */
  public async remove(): Promise<boolean> {
    logger.info(`Removing ${this.displayName} container: ${this.config.name}`);
    return this.runtime.removeContainer(this.config.id);
  }

  /**
   * Get container status
   */
  public async getStatus(): Promise<IContainerStatus> {
    return this.runtime.getContainerStatus(this.config);
  }

  /**
   * Get container logs
   */
  public async getLogs(lines: number = 100): Promise<string> {
    return this.runtime.getLogs(this.config.id, { lines });
  }

  /**
   * Check if the container is healthy
   */
  public abstract isHealthy(): Promise<boolean>;

  /**
   * Get list of available models
   */
  public abstract listModels(): Promise<string[]>;

  /**
   * Get list of loaded models with details
   */
  public abstract getLoadedModels(): Promise<ILoadedModel[]>;

  /**
   * Pull a model
   */
  public abstract pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean>;

  /**
   * Remove a model
   */
  public abstract removeModel(modelName: string): Promise<boolean>;

  /**
   * Send a chat completion request
   */
  public abstract chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse>;

  /**
   * Stream a chat completion request
   */
  public abstract chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void>;

  /**
   * Make HTTP request to container
   */
  protected async fetch(
    path: string,
    options: {
      method?: string;
      headers?: Record<string, string>;
      body?: unknown;
      timeout?: number;
    } = {},
  ): Promise<Response> {
    const endpoint = this.getEndpoint();
    const url = `${endpoint}${path}`;

    const controller = new AbortController();
    const timeout = options.timeout || 30000;
    const timeoutId = setTimeout(() => controller.abort(), timeout);

    try {
      const response = await fetch(url, {
        method: options.method || 'GET',
        headers: {
          'Content-Type': 'application/json',
          ...options.headers,
        },
        body: options.body ? JSON.stringify(options.body) : undefined,
        signal: controller.signal,
      });

      return response;
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * Make HTTP request and parse JSON response
   */
  protected async fetchJson<T>(
    path: string,
    options: {
      method?: string;
      headers?: Record<string, string>;
      body?: unknown;
      timeout?: number;
    } = {},
  ): Promise<T> {
    const response = await this.fetch(path, options);

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`HTTP ${response.status}: ${errorText}`);
    }

    return response.json();
  }

  /**
   * Generate a unique request ID
   */
  protected generateRequestId(): string {
    return `chatcmpl-${Date.now().toString(36)}-${Math.random().toString(36).substring(2, 8)}`;
  }
}
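A minimal caller-side sketch (not part of this commit): the protected fetch/fetchJson helpers above wrap the global fetch with a JSON Content-Type, optional body serialization, and an AbortController-based timeout (30 s by default), while callers only see the public surface. Here `container` is assumed to be an instance of one of the concrete subclasses added later in this commit, and the model name is a placeholder.

const healthy = await container.isHealthy();
if (healthy) {
  const reply = await container.chatCompletion({
    model: 'llama3', // placeholder model name
    messages: [{ role: 'user', content: 'Hello' }],
    temperature: 0.7,
  });
  console.log(reply.choices[0]?.message.content);
}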
ts/containers/container-manager.ts (new file, 349 lines)
@@ -0,0 +1,349 @@
/**
 * Container Manager
 *
 * Orchestrates multiple AI model containers.
 */

import type {
  IContainerConfig,
  IContainerStatus,
  IContainerEndpoint,
  TContainerType,
} from '../interfaces/container.ts';
import { logger } from '../logger.ts';
import { DockerManager } from '../docker/docker-manager.ts';
import { BaseContainer } from './base-container.ts';
import { OllamaContainer } from './ollama.ts';
import { VllmContainer } from './vllm.ts';
import { TgiContainer } from './tgi.ts';

/**
 * Container Manager - orchestrates all containers
 */
export class ContainerManager {
  private containers: Map<string, BaseContainer>;
  private dockerManager: DockerManager;

  constructor() {
    this.containers = new Map();
    this.dockerManager = new DockerManager();
  }

  /**
   * Initialize container manager
   */
  public async initialize(): Promise<void> {
    // Ensure Docker is running
    if (!await this.dockerManager.isRunning()) {
      throw new Error('Docker is not running');
    }

    // Create network if it doesn't exist
    await this.dockerManager.createNetwork();
  }

  /**
   * Create a container instance from config
   */
  private createContainerInstance(config: IContainerConfig): BaseContainer {
    switch (config.type) {
      case 'ollama':
        return new OllamaContainer(config);
      case 'vllm':
        return new VllmContainer(config);
      case 'tgi':
        return new TgiContainer(config);
      default:
        throw new Error(`Unknown container type: ${config.type}`);
    }
  }

  /**
   * Add a container
   */
  public addContainer(config: IContainerConfig): BaseContainer {
    if (this.containers.has(config.id)) {
      throw new Error(`Container with ID ${config.id} already exists`);
    }

    const container = this.createContainerInstance(config);
    this.containers.set(config.id, container);
    return container;
  }

  /**
   * Remove a container
   */
  public async removeContainer(containerId: string): Promise<boolean> {
    const container = this.containers.get(containerId);
    if (!container) {
      return false;
    }

    await container.remove();
    this.containers.delete(containerId);
    return true;
  }

  /**
   * Get a container by ID
   */
  public getContainer(containerId: string): BaseContainer | undefined {
    return this.containers.get(containerId);
  }

  /**
   * Get all containers
   */
  public getAllContainers(): BaseContainer[] {
    return Array.from(this.containers.values());
  }

  /**
   * Load containers from configuration
   */
  public loadFromConfig(configs: IContainerConfig[]): void {
    this.containers.clear();
    for (const config of configs) {
      try {
        this.addContainer(config);
      } catch (error) {
        logger.warn(`Failed to load container ${config.id}: ${error instanceof Error ? error.message : String(error)}`);
      }
    }
  }

  /**
   * Start all containers
   */
  public async startAll(): Promise<Map<string, boolean>> {
    const results = new Map<string, boolean>();

    for (const [id, container] of this.containers) {
      if (!container.getConfig().autoStart) {
        continue;
      }

      try {
        const success = await container.start();
        results.set(id, success);
      } catch (error) {
        logger.error(`Failed to start container ${id}: ${error instanceof Error ? error.message : String(error)}`);
        results.set(id, false);
      }
    }

    return results;
  }

  /**
   * Stop all containers
   */
  public async stopAll(): Promise<Map<string, boolean>> {
    const results = new Map<string, boolean>();

    for (const [id, container] of this.containers) {
      try {
        const success = await container.stop();
        results.set(id, success);
      } catch (error) {
        logger.error(`Failed to stop container ${id}: ${error instanceof Error ? error.message : String(error)}`);
        results.set(id, false);
      }
    }

    return results;
  }

  /**
   * Get status of all containers
   */
  public async getAllStatus(): Promise<Map<string, IContainerStatus>> {
    const statuses = new Map<string, IContainerStatus>();

    for (const [id, container] of this.containers) {
      try {
        const status = await container.getStatus();
        statuses.set(id, status);
      } catch (error) {
        logger.warn(`Failed to get status for container ${id}: ${error instanceof Error ? error.message : String(error)}`);
      }
    }

    return statuses;
  }

  /**
   * Get available endpoints for a model
   */
  public async getEndpointsForModel(modelName: string): Promise<IContainerEndpoint[]> {
    const endpoints: IContainerEndpoint[] = [];

    for (const [_id, container] of this.containers) {
      try {
        const status = await container.getStatus();

        if (!status.running) {
          continue;
        }

        // Check if container has this model
        const models = await container.listModels();
        if (!models.includes(modelName)) {
          continue;
        }

        endpoints.push({
          containerId: container.getConfig().id,
          type: container.type,
          url: container.getEndpoint(),
          models,
          healthy: status.health === 'healthy',
          priority: 0, // Could be based on load
        });
      } catch {
        // Skip containers that fail to respond
      }
    }

    return endpoints;
  }

  /**
   * Find best container for a model
   */
  public async findContainerForModel(modelName: string): Promise<BaseContainer | null> {
    const endpoints = await this.getEndpointsForModel(modelName);

    // Filter to healthy endpoints
    const healthy = endpoints.filter((e) => e.healthy);
    if (healthy.length === 0) {
      return null;
    }

    // Return first healthy endpoint (could add load balancing)
    const endpoint = healthy[0];
    return this.containers.get(endpoint.containerId) || null;
  }

  /**
   * Get all available models across all containers
   */
  public async getAllAvailableModels(): Promise<Map<string, IContainerEndpoint[]>> {
    const modelMap = new Map<string, IContainerEndpoint[]>();

    for (const container of this.containers.values()) {
      try {
        const status = await container.getStatus();
        if (!status.running) continue;

        const models = await container.listModels();

        for (const model of models) {
          if (!modelMap.has(model)) {
            modelMap.set(model, []);
          }

          modelMap.get(model)!.push({
            containerId: container.getConfig().id,
            type: container.type,
            url: container.getEndpoint(),
            models,
            healthy: status.health === 'healthy',
            priority: 0,
          });
        }
      } catch {
        // Skip failed containers
      }
    }

    return modelMap;
  }

  /**
   * Pull a model to a specific container type
   */
  public async pullModel(
    modelName: string,
    containerType: TContainerType = 'ollama',
    containerId?: string,
  ): Promise<boolean> {
    // Find or create appropriate container
    let container: BaseContainer | undefined;

    if (containerId) {
      container = this.containers.get(containerId);
    } else {
      // Find first container of the specified type
      for (const c of this.containers.values()) {
        if (c.type === containerType) {
          container = c;
          break;
        }
      }
    }

    if (!container) {
      logger.error(`No ${containerType} container available to pull model`);
      return false;
    }

    return container.pullModel(modelName, (progress) => {
      const percent = progress.percent !== undefined ? ` (${progress.percent}%)` : '';
      logger.dim(`  ${progress.status}${percent}`);
    });
  }

  /**
   * Health check all containers
   */
  public async healthCheck(): Promise<Map<string, boolean>> {
    const results = new Map<string, boolean>();

    for (const [id, container] of this.containers) {
      try {
        const healthy = await container.isHealthy();
        results.set(id, healthy);
      } catch {
        results.set(id, false);
      }
    }

    return results;
  }

  /**
   * Print container status summary
   */
  public async printStatus(): Promise<void> {
    const statuses = await this.getAllStatus();

    if (statuses.size === 0) {
      logger.logBox('Containers', ['No containers configured'], 50, 'warning');
      return;
    }

    logger.logBoxTitle('Container Status', 70, 'info');

    for (const [id, status] of statuses) {
      const runningStr = status.running ? 'Running' : 'Stopped';
      const healthStr = status.health;
      const modelsStr = status.loadedModels.length > 0
        ? status.loadedModels.join(', ')
        : 'None';

      logger.logBoxLine(`${status.name} (${id})`);
      logger.logBoxLine(`  Type: ${status.type} | Status: ${runningStr} | Health: ${healthStr}`);
      logger.logBoxLine(`  Models: ${modelsStr}`);
      logger.logBoxLine(`  Endpoint: ${status.endpoint}`);

      if (status.gpuUtilization !== undefined) {
        logger.logBoxLine(`  GPU: ${status.gpuUtilization}% | Memory: ${status.memoryUsage || 0}MB`);
      }
      logger.logBoxLine('');
    }

    logger.logBoxEnd();
  }
}
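A short wiring sketch of how ContainerManager is meant to be driven (not part of this commit; the config values and model name are illustrative):

import { ContainerManager } from './container-manager.ts';
import { OllamaContainer } from './ollama.ts';

const manager = new ContainerManager();
await manager.initialize(); // throws if Docker is not running

manager.loadFromConfig([
  OllamaContainer.createConfig('ollama-0', 'ollama-main', ['0']),
]);
await manager.startAll(); // only containers with autoStart: true are started

const target = await manager.findContainerForModel('llama3'); // placeholder model name
if (target) {
  const reply = await target.chatCompletion({
    model: 'llama3',
    messages: [{ role: 'user', content: 'ping' }],
  });
  console.log(reply.choices[0]?.message.content);
}
await manager.printStatus();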
ts/containers/index.ts (new file, 11 lines)
@@ -0,0 +1,11 @@
/**
 * Container Management Module
 *
 * Exports all AI container implementations.
 */

export { BaseContainer } from './base-container.ts';
export { OllamaContainer } from './ollama.ts';
export { VllmContainer } from './vllm.ts';
export { TgiContainer } from './tgi.ts';
export { ContainerManager } from './container-manager.ts';
ts/containers/ollama.ts (new file, 387 lines)
@@ -0,0 +1,387 @@
/**
 * Ollama Container
 *
 * Manages Ollama containers for running local LLMs.
 */

import type {
  IContainerConfig,
  ILoadedModel,
  TContainerType,
} from '../interfaces/container.ts';
import type {
  IChatCompletionRequest,
  IChatCompletionResponse,
  IChatCompletionChoice,
  IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';

/**
 * Ollama API response types
 */
interface IOllamaTagsResponse {
  models: Array<{
    name: string;
    size: number;
    digest: string;
    modified_at: string;
  }>;
}

interface IOllamaChatRequest {
  model: string;
  messages: Array<{
    role: string;
    content: string;
  }>;
  stream?: boolean;
  options?: {
    temperature?: number;
    top_p?: number;
    num_predict?: number;
    stop?: string[];
  };
}

interface IOllamaChatResponse {
  model: string;
  created_at: string;
  message: {
    role: string;
    content: string;
  };
  done: boolean;
  total_duration?: number;
  load_duration?: number;
  prompt_eval_count?: number;
  eval_count?: number;
}

interface IOllamaPullResponse {
  status: string;
  digest?: string;
  total?: number;
  completed?: number;
}

/**
 * Ollama container implementation
 */
export class OllamaContainer extends BaseContainer {
  public readonly type: TContainerType = 'ollama';
  public readonly displayName = 'Ollama';
  public readonly defaultImage = CONTAINER_IMAGES.OLLAMA;
  public readonly defaultPort = CONTAINER_PORTS.OLLAMA;

  constructor(config: IContainerConfig) {
    super(config);

    // Set defaults if not provided
    if (!config.image) {
      config.image = this.defaultImage;
    }
    if (!config.port) {
      config.port = this.defaultPort;
    }

    // Add default volume for model storage
    if (!config.volumes || config.volumes.length === 0) {
      config.volumes = [`modelgrid-ollama-${config.id}:/root/.ollama`];
    }
  }

  /**
   * Create Ollama container configuration
   */
  public static createConfig(
    id: string,
    name: string,
    gpuIds: string[],
    options: Partial<IContainerConfig> = {},
  ): IContainerConfig {
    return {
      id,
      name,
      type: 'ollama',
      image: options.image || CONTAINER_IMAGES.OLLAMA,
      gpuIds,
      port: options.port || CONTAINER_PORTS.OLLAMA,
      externalPort: options.externalPort,
      models: options.models || [],
      env: options.env,
      volumes: options.volumes || [`modelgrid-ollama-${id}:/root/.ollama`],
      autoStart: options.autoStart ?? true,
      restartPolicy: options.restartPolicy || 'unless-stopped',
      memoryLimit: options.memoryLimit,
      cpuLimit: options.cpuLimit,
      command: options.command,
    };
  }

  /**
   * Check if Ollama is healthy
   */
  public async isHealthy(): Promise<boolean> {
    try {
      const response = await this.fetch('/api/tags', { timeout: 5000 });
      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * List available models
   */
  public async listModels(): Promise<string[]> {
    try {
      const data = await this.fetchJson<IOllamaTagsResponse>('/api/tags');
      return (data.models || []).map((m) => m.name);
    } catch (error) {
      logger.warn(`Failed to list Ollama models: ${error instanceof Error ? error.message : String(error)}`);
      return [];
    }
  }

  /**
   * Get loaded models with details
   */
  public async getLoadedModels(): Promise<ILoadedModel[]> {
    try {
      const data = await this.fetchJson<IOllamaTagsResponse>('/api/tags');
      return (data.models || []).map((m) => ({
        name: m.name,
        size: m.size,
        format: m.digest.substring(0, 12),
        loaded: true, // Ollama doesn't distinguish loaded vs available
        requestCount: 0,
      }));
    } catch {
      return [];
    }
  }

  /**
   * Pull a model
   */
  public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
    try {
      logger.info(`Pulling model: ${modelName}`);

      const response = await this.fetch('/api/pull', {
        method: 'POST',
        body: { name: modelName },
        timeout: 3600000, // 1 hour for large models
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }

      // Read streaming response
      const reader = response.body?.getReader();
      if (!reader) {
        throw new Error('No response body');
      }

      const decoder = new TextDecoder();
      let lastStatus = '';

      while (true) {
        const { done, value } = await reader.read();
        if (done) break;

        const text = decoder.decode(value);
        const lines = text.split('\n').filter((l) => l.trim());

        for (const line of lines) {
          try {
            const data = JSON.parse(line) as IOllamaPullResponse;
            const status = data.status;

            if (status !== lastStatus) {
              lastStatus = status;
              let percent: number | undefined;

              if (data.total && data.completed) {
                percent = Math.round((data.completed / data.total) * 100);
              }

              if (onProgress) {
                onProgress({ model: modelName, status, percent });
              } else {
                const progressStr = percent !== undefined ? ` (${percent}%)` : '';
                logger.dim(`  ${status}${progressStr}`);
              }
            }
          } catch {
            // Invalid JSON line, skip
          }
        }
      }

      logger.success(`Model ${modelName} pulled successfully`);
      return true;
    } catch (error) {
      logger.error(`Failed to pull model ${modelName}: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Remove a model
   */
  public async removeModel(modelName: string): Promise<boolean> {
    try {
      const response = await this.fetch('/api/delete', {
        method: 'DELETE',
        body: { name: modelName },
      });

      if (response.ok) {
        logger.success(`Model ${modelName} removed`);
        return true;
      }

      throw new Error(`HTTP ${response.status}`);
    } catch (error) {
      logger.error(`Failed to remove model ${modelName}: ${error instanceof Error ? error.message : String(error)}`);
      return false;
    }
  }

  /**
   * Send a chat completion request
   */
  public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
    const ollamaRequest: IOllamaChatRequest = {
      model: request.model,
      messages: request.messages.map((m) => ({
        role: m.role,
        content: m.content,
      })),
      stream: false,
      options: {
        temperature: request.temperature,
        top_p: request.top_p,
        num_predict: request.max_tokens,
        stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
      },
    };

    const response = await this.fetchJson<IOllamaChatResponse>('/api/chat', {
      method: 'POST',
      body: ollamaRequest,
      timeout: 300000, // 5 minutes
    });

    // Convert to OpenAI format
    const created = Math.floor(Date.now() / 1000);

    const choice: IChatCompletionChoice = {
      index: 0,
      message: {
        role: 'assistant',
        content: response.message.content,
      },
      finish_reason: response.done ? 'stop' : null,
    };

    return {
      id: this.generateRequestId(),
      object: 'chat.completion',
      created,
      model: request.model,
      choices: [choice],
      usage: {
        prompt_tokens: response.prompt_eval_count || 0,
        completion_tokens: response.eval_count || 0,
        total_tokens: (response.prompt_eval_count || 0) + (response.eval_count || 0),
      },
    };
  }

  /**
   * Stream a chat completion request
   */
  public async chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void> {
    const ollamaRequest: IOllamaChatRequest = {
      model: request.model,
      messages: request.messages.map((m) => ({
        role: m.role,
        content: m.content,
      })),
      stream: true,
      options: {
        temperature: request.temperature,
        top_p: request.top_p,
        num_predict: request.max_tokens,
        stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
      },
    };

    const response = await this.fetch('/api/chat', {
      method: 'POST',
      body: ollamaRequest,
      timeout: 300000,
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('No response body');
    }

    const decoder = new TextDecoder();
    const requestId = this.generateRequestId();
    const created = Math.floor(Date.now() / 1000);

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      const text = decoder.decode(value);
      const lines = text.split('\n').filter((l) => l.trim());

      for (const line of lines) {
        try {
          const data = JSON.parse(line) as IOllamaChatResponse;

          // Convert to OpenAI streaming format
          const chunk = {
            id: requestId,
            object: 'chat.completion.chunk',
            created,
            model: request.model,
            choices: [
              {
                index: 0,
                delta: {
                  content: data.message.content,
                } as Partial<IChatMessage>,
                finish_reason: data.done ? 'stop' : null,
              },
            ],
          };

          onChunk(`data: ${JSON.stringify(chunk)}\n\n`);

          if (data.done) {
            onChunk('data: [DONE]\n\n');
          }
        } catch {
          // Invalid JSON, skip
        }
      }
    }
  }
}
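chatCompletionStream re-emits Ollama's newline-delimited JSON as OpenAI-style SSE lines ("data: {...}\n\n", terminated by "data: [DONE]\n\n"). A consumer sketch (not part of this commit), assuming `ollama` is a started OllamaContainer and the model name is a placeholder:

await ollama.pullModel('llama3'); // logs progress via logger.dim when no callback is given

let answer = '';
await ollama.chatCompletionStream(
  { model: 'llama3', messages: [{ role: 'user', content: 'Write a haiku' }] },
  (chunk) => {
    for (const line of chunk.split('\n')) {
      if (!line.startsWith('data: ') || line === 'data: [DONE]') continue;
      const parsed = JSON.parse(line.slice('data: '.length));
      answer += parsed.choices[0]?.delta?.content ?? '';
    }
  },
);
console.log(answer);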
ts/containers/tgi.ts (new file, 417 lines)
@@ -0,0 +1,417 @@
/**
 * TGI Container (Text Generation Inference)
 *
 * Manages HuggingFace Text Generation Inference containers.
 */

import type {
  IContainerConfig,
  ILoadedModel,
  TContainerType,
} from '../interfaces/container.ts';
import type {
  IChatCompletionRequest,
  IChatCompletionResponse,
  IChatCompletionChoice,
  IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';

/**
 * TGI info response
 */
interface ITgiInfoResponse {
  model_id: string;
  model_sha: string;
  model_dtype: string;
  model_device_type: string;
  max_concurrent_requests: number;
  max_best_of: number;
  max_stop_sequences: number;
  max_input_length: number;
  max_total_tokens: number;
  version: string;
}

/**
 * TGI generate request
 */
interface ITgiGenerateRequest {
  inputs: string;
  parameters?: {
    temperature?: number;
    top_p?: number;
    max_new_tokens?: number;
    stop?: string[];
    do_sample?: boolean;
    return_full_text?: boolean;
  };
}

/**
 * TGI generate response
 */
interface ITgiGenerateResponse {
  generated_text: string;
  details?: {
    finish_reason: string;
    generated_tokens: number;
    seed?: number;
  };
}

/**
 * TGI container implementation
 *
 * TGI is optimized for:
 * - Production deployments
 * - Flash Attention support
 * - Quantization (bitsandbytes, GPTQ, AWQ)
 * - Multiple GPU support with tensor parallelism
 */
export class TgiContainer extends BaseContainer {
  public readonly type: TContainerType = 'tgi';
  public readonly displayName = 'TGI';
  public readonly defaultImage = CONTAINER_IMAGES.TGI;
  public readonly defaultPort = CONTAINER_PORTS.TGI;

  constructor(config: IContainerConfig) {
    super(config);

    // Set defaults if not provided
    if (!config.image) {
      config.image = this.defaultImage;
    }
    if (!config.port) {
      config.port = this.defaultPort;
    }

    // Add default volume for model cache
    if (!config.volumes || config.volumes.length === 0) {
      config.volumes = [`modelgrid-tgi-${config.id}:/data`];
    }
  }

  /**
   * Create TGI container configuration
   */
  public static createConfig(
    id: string,
    name: string,
    modelName: string,
    gpuIds: string[],
    options: Partial<IContainerConfig> = {},
  ): IContainerConfig {
    const env: Record<string, string> = {
      MODEL_ID: modelName,
      PORT: String(options.port || CONTAINER_PORTS.TGI),
      HUGGING_FACE_HUB_TOKEN: options.env?.HF_TOKEN || options.env?.HUGGING_FACE_HUB_TOKEN || '',
      ...options.env,
    };

    // Add GPU configuration
    if (gpuIds.length > 1) {
      env.NUM_SHARD = String(gpuIds.length);
    }

    // Add quantization if specified
    if (options.env?.QUANTIZE) {
      env.QUANTIZE = options.env.QUANTIZE;
    }

    return {
      id,
      name,
      type: 'tgi',
      image: options.image || CONTAINER_IMAGES.TGI,
      gpuIds,
      port: options.port || CONTAINER_PORTS.TGI,
      externalPort: options.externalPort,
      models: [modelName],
      env,
      volumes: options.volumes || [`modelgrid-tgi-${id}:/data`],
      autoStart: options.autoStart ?? true,
      restartPolicy: options.restartPolicy || 'unless-stopped',
      memoryLimit: options.memoryLimit,
      cpuLimit: options.cpuLimit,
      command: options.command,
    };
  }

  /**
   * Check if TGI is healthy
   */
  public async isHealthy(): Promise<boolean> {
    try {
      const response = await this.fetch('/health', { timeout: 5000 });
      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * List available models
   * TGI serves a single model per instance
   */
  public async listModels(): Promise<string[]> {
    try {
      const info = await this.fetchJson<ITgiInfoResponse>('/info');
      return [info.model_id];
    } catch (error) {
      logger.warn(`Failed to get TGI info: ${error instanceof Error ? error.message : String(error)}`);
      return this.config.models || [];
    }
  }

  /**
   * Get loaded models with details
   */
  public async getLoadedModels(): Promise<ILoadedModel[]> {
    try {
      const info = await this.fetchJson<ITgiInfoResponse>('/info');
      return [{
        name: info.model_id,
        size: 0, // TGI doesn't expose model size
        format: info.model_dtype,
        loaded: true,
        requestCount: 0,
      }];
    } catch {
      return this.config.models.map((name) => ({
        name,
        size: 0,
        loaded: true,
        requestCount: 0,
      }));
    }
  }

  /**
   * Pull a model
   * TGI downloads models automatically at startup
   */
  public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
    logger.info(`TGI downloads models at startup. Model: ${modelName}`);
    logger.info('To use a different model, create a new TGI container.');

    if (onProgress) {
      onProgress({
        model: modelName,
        status: 'TGI models are loaded at container startup',
        percent: 100,
      });
    }

    return true;
  }

  /**
   * Remove a model
   * TGI serves a single model per instance
   */
  public async removeModel(modelName: string): Promise<boolean> {
    logger.info(`TGI serves a single model per instance.`);
    logger.info(`To remove model ${modelName}, stop and remove this container.`);
    return true;
  }

  /**
   * Send a chat completion request
   * Convert OpenAI format to TGI format
   */
  public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
    // Convert messages to TGI prompt format
    const prompt = this.messagesToPrompt(request.messages);

    const tgiRequest: ITgiGenerateRequest = {
      inputs: prompt,
      parameters: {
        temperature: request.temperature,
        top_p: request.top_p,
        max_new_tokens: request.max_tokens || 1024,
        stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
        do_sample: (request.temperature || 0) > 0,
        return_full_text: false,
      },
    };

    const response = await this.fetchJson<ITgiGenerateResponse>('/generate', {
      method: 'POST',
      body: tgiRequest,
      timeout: 300000, // 5 minutes
    });

    // Convert to OpenAI format
    const created = Math.floor(Date.now() / 1000);

    const choice: IChatCompletionChoice = {
      index: 0,
      message: {
        role: 'assistant',
        content: response.generated_text,
      },
      finish_reason: response.details?.finish_reason === 'eos_token' ? 'stop' : 'length',
    };

    return {
      id: this.generateRequestId(),
      object: 'chat.completion',
      created,
      model: this.config.models[0] || 'unknown',
      choices: [choice],
      usage: {
        prompt_tokens: 0, // TGI doesn't always report this
        completion_tokens: response.details?.generated_tokens || 0,
        total_tokens: response.details?.generated_tokens || 0,
      },
    };
  }

  /**
   * Stream a chat completion request
   */
  public async chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void> {
    // Convert messages to TGI prompt format
    const prompt = this.messagesToPrompt(request.messages);

    const response = await this.fetch('/generate_stream', {
      method: 'POST',
      body: {
        inputs: prompt,
        parameters: {
          temperature: request.temperature,
          top_p: request.top_p,
          max_new_tokens: request.max_tokens || 1024,
          stop: Array.isArray(request.stop) ? request.stop : request.stop ? [request.stop] : undefined,
          do_sample: (request.temperature || 0) > 0,
        },
      },
      timeout: 300000,
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`HTTP ${response.status}: ${error}`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('No response body');
    }

    const decoder = new TextDecoder();
    const requestId = this.generateRequestId();
    const created = Math.floor(Date.now() / 1000);
    const model = this.config.models[0] || 'unknown';

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      const text = decoder.decode(value);
      const lines = text.split('\n').filter((l) => l.startsWith('data:'));

      for (const line of lines) {
        try {
          const jsonStr = line.substring(5).trim();
          if (jsonStr === '[DONE]') {
            onChunk('data: [DONE]\n\n');
            continue;
          }

          const data = JSON.parse(jsonStr);

          // Convert to OpenAI streaming format
          const chunk = {
            id: requestId,
            object: 'chat.completion.chunk',
            created,
            model,
            choices: [
              {
                index: 0,
                delta: {
                  content: data.token?.text || '',
                } as Partial<IChatMessage>,
                finish_reason: data.details?.finish_reason ? 'stop' : null,
              },
            ],
          };

          onChunk(`data: ${JSON.stringify(chunk)}\n\n`);
        } catch {
          // Invalid JSON, skip
        }
      }
    }
  }

  /**
   * Convert chat messages to TGI prompt format
   */
  private messagesToPrompt(messages: IChatMessage[]): string {
    // Use a simple chat template
    // TGI can use model-specific templates via the Messages API
    let prompt = '';

    for (const message of messages) {
      switch (message.role) {
        case 'system':
          prompt += `System: ${message.content}\n\n`;
          break;
        case 'user':
          prompt += `User: ${message.content}\n\n`;
          break;
        case 'assistant':
          prompt += `Assistant: ${message.content}\n\n`;
          break;
      }
    }

    prompt += 'Assistant:';
    return prompt;
  }

  /**
   * Get TGI server info
   */
  public async getInfo(): Promise<ITgiInfoResponse | null> {
    try {
      return await this.fetchJson<ITgiInfoResponse>('/info');
    } catch {
      return null;
    }
  }

  /**
   * Get TGI metrics
   */
  public async getMetrics(): Promise<Record<string, unknown>> {
    try {
      const response = await this.fetch('/metrics', { timeout: 5000 });
      if (response.ok) {
        const text = await response.text();
        // Parse Prometheus metrics
        const metrics: Record<string, unknown> = {};
        const lines = text.split('\n');
        for (const line of lines) {
          if (line.startsWith('#') || !line.trim()) continue;
          const match = line.match(/^(\w+)(?:\{[^}]*\})?\s+([\d.e+-]+)/);
          if (match) {
            metrics[match[1]] = parseFloat(match[2]);
          }
        }
        return metrics;
      }
    } catch {
      // Metrics endpoint may not be available
    }
    return {};
  }
}
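TgiContainer.createConfig drives TGI entirely through environment variables (MODEL_ID, PORT, NUM_SHARD for multi-GPU sharding, optional QUANTIZE), while chat requests are flattened into a plain "System:/User:/Assistant:" transcript by messagesToPrompt. A configuration sketch (not part of this commit); the model id and token are placeholders:

import { TgiContainer } from './tgi.ts';

const tgiConfig = TgiContainer.createConfig(
  'tgi-0',
  'tgi-mistral',
  'mistralai/Mistral-7B-Instruct-v0.2', // placeholder MODEL_ID
  ['0', '1'],                           // two GPUs, so NUM_SHARD = '2'
  { env: { HF_TOKEN: '<hf-token>', QUANTIZE: 'awq' } },
);
// tgiConfig.env now carries MODEL_ID, PORT, HUGGING_FACE_HUB_TOKEN, NUM_SHARD and QUANTIZE,
// and tgiConfig.models is ['mistralai/Mistral-7B-Instruct-v0.2'].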
ts/containers/vllm.ts (new file, 272 lines)
@@ -0,0 +1,272 @@
/**
 * vLLM Container
 *
 * Manages vLLM containers for high-performance LLM inference.
 */

import type {
  IContainerConfig,
  ILoadedModel,
  TContainerType,
} from '../interfaces/container.ts';
import type {
  IChatCompletionRequest,
  IChatCompletionResponse,
  IChatMessage,
} from '../interfaces/api.ts';
import { CONTAINER_IMAGES, CONTAINER_PORTS } from '../constants.ts';
import { logger } from '../logger.ts';
import { BaseContainer, type TModelPullProgress } from './base-container.ts';

/**
 * vLLM model info response
 */
interface IVllmModelsResponse {
  object: 'list';
  data: Array<{
    id: string;
    object: 'model';
    created: number;
    owned_by: string;
  }>;
}

/**
 * vLLM container implementation
 *
 * vLLM serves a single model per instance and is optimized for:
 * - High throughput with PagedAttention
 * - Continuous batching
 * - OpenAI-compatible API
 */
export class VllmContainer extends BaseContainer {
  public readonly type: TContainerType = 'vllm';
  public readonly displayName = 'vLLM';
  public readonly defaultImage = CONTAINER_IMAGES.VLLM;
  public readonly defaultPort = CONTAINER_PORTS.VLLM;

  constructor(config: IContainerConfig) {
    super(config);

    // Set defaults if not provided
    if (!config.image) {
      config.image = this.defaultImage;
    }
    if (!config.port) {
      config.port = this.defaultPort;
    }

    // Add default volume for model cache
    if (!config.volumes || config.volumes.length === 0) {
      config.volumes = [`modelgrid-vllm-${config.id}:/root/.cache/huggingface`];
    }
  }

  /**
   * Create vLLM container configuration
   */
  public static createConfig(
    id: string,
    name: string,
    modelName: string,
    gpuIds: string[],
    options: Partial<IContainerConfig> = {},
  ): IContainerConfig {
    // vLLM requires model to be specified at startup
    const command = [
      '--model', modelName,
      '--host', '0.0.0.0',
      '--port', String(options.port || CONTAINER_PORTS.VLLM),
    ];

    // Add tensor parallelism if multiple GPUs
    if (gpuIds.length > 1) {
      command.push('--tensor-parallel-size', String(gpuIds.length));
    }

    // Add additional options
    if (options.env?.VLLM_MAX_MODEL_LEN) {
      command.push('--max-model-len', options.env.VLLM_MAX_MODEL_LEN);
    }

    return {
      id,
      name,
      type: 'vllm',
      image: options.image || CONTAINER_IMAGES.VLLM,
      gpuIds,
      port: options.port || CONTAINER_PORTS.VLLM,
      externalPort: options.externalPort,
      models: [modelName],
      env: {
        HF_TOKEN: options.env?.HF_TOKEN || '',
        ...options.env,
      },
      volumes: options.volumes || [`modelgrid-vllm-${id}:/root/.cache/huggingface`],
      autoStart: options.autoStart ?? true,
      restartPolicy: options.restartPolicy || 'unless-stopped',
      memoryLimit: options.memoryLimit,
      cpuLimit: options.cpuLimit,
      command,
    };
  }

  /**
   * Check if vLLM is healthy
   */
  public async isHealthy(): Promise<boolean> {
    try {
      const response = await this.fetch('/health', { timeout: 5000 });
      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * List available models
   * vLLM serves a single model per instance
   */
  public async listModels(): Promise<string[]> {
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => m.id);
    } catch (error) {
      logger.warn(`Failed to list vLLM models: ${error instanceof Error ? error.message : String(error)}`);
      return this.config.models || [];
    }
  }

  /**
   * Get loaded models with details
   */
  public async getLoadedModels(): Promise<ILoadedModel[]> {
    try {
      const data = await this.fetchJson<IVllmModelsResponse>('/v1/models');
      return (data.data || []).map((m) => ({
        name: m.id,
        size: 0, // vLLM doesn't expose size
        loaded: true,
        requestCount: 0,
      }));
    } catch {
      // Return configured model as fallback
      return this.config.models.map((name) => ({
        name,
        size: 0,
        loaded: true,
        requestCount: 0,
      }));
    }
  }

  /**
   * Pull a model
   * vLLM downloads models automatically at startup
   * This method is a no-op - models are configured at container creation
   */
  public async pullModel(modelName: string, onProgress?: TModelPullProgress): Promise<boolean> {
    logger.info(`vLLM downloads models at startup. Model: ${modelName}`);
    logger.info('To use a different model, create a new vLLM container.');

    if (onProgress) {
      onProgress({
        model: modelName,
        status: 'vLLM models are loaded at container startup',
        percent: 100,
      });
    }

    return true;
  }

  /**
   * Remove a model
   * vLLM serves a single model per instance
   */
  public async removeModel(modelName: string): Promise<boolean> {
    logger.info(`vLLM serves a single model per instance.`);
    logger.info(`To remove model ${modelName}, stop and remove this container.`);
    return true;
  }

  /**
   * Send a chat completion request
   * vLLM is OpenAI-compatible
   */
  public async chatCompletion(request: IChatCompletionRequest): Promise<IChatCompletionResponse> {
    return this.fetchJson<IChatCompletionResponse>('/v1/chat/completions', {
      method: 'POST',
      body: {
        ...request,
        stream: false,
      },
      timeout: 300000, // 5 minutes
    });
  }

  /**
   * Stream a chat completion request
   * vLLM is OpenAI-compatible
   */
  public async chatCompletionStream(
    request: IChatCompletionRequest,
    onChunk: (chunk: string) => void,
  ): Promise<void> {
    const response = await this.fetch('/v1/chat/completions', {
      method: 'POST',
      body: {
        ...request,
        stream: true,
      },
      timeout: 300000,
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`HTTP ${response.status}: ${error}`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
      throw new Error('No response body');
    }

    const decoder = new TextDecoder();

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      const text = decoder.decode(value);
      // vLLM already sends data in SSE format
      onChunk(text);
    }
  }

  /**
   * Get vLLM-specific metrics
   */
  public async getMetrics(): Promise<Record<string, unknown>> {
    try {
      const response = await this.fetch('/metrics', { timeout: 5000 });
      if (response.ok) {
        const text = await response.text();
        // Parse Prometheus metrics
        const metrics: Record<string, unknown> = {};
        const lines = text.split('\n');
        for (const line of lines) {
          if (line.startsWith('#') || !line.trim()) continue;
          const match = line.match(/^(\w+)(?:\{[^}]*\})?\s+([\d.e+-]+)/);
          if (match) {
            metrics[match[1]] = parseFloat(match[2]);
          }
        }
        return metrics;
      }
    } catch {
      // Metrics endpoint may not be enabled
    }
    return {};
  }
}
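Unlike TGI, the vLLM container is configured through CLI flags that createConfig assembles into command (--model, --host, --port, plus --tensor-parallel-size when more than one GPU is given and --max-model-len when VLLM_MAX_MODEL_LEN is set). A sketch (not part of this commit); the model id is a placeholder:

import { VllmContainer } from './vllm.ts';

const vllmConfig = VllmContainer.createConfig(
  'vllm-0',
  'vllm-qwen',
  'Qwen/Qwen2-7B-Instruct', // placeholder model id
  ['0', '1'],
  { env: { VLLM_MAX_MODEL_LEN: '8192' } },
);
// vllmConfig.command holds the server flags:
// --model Qwen/Qwen2-7B-Instruct --host 0.0.0.0 --port <CONTAINER_PORTS.VLLM>
// --tensor-parallel-size 2 --max-model-len 8192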