559 lines
15 KiB
TypeScript
559 lines
15 KiB
TypeScript
/**
 * Container Runtime
 *
 * Manages individual Docker containers for AI model serving.
 */
|
|
|
|
import { exec, spawn } from 'node:child_process';
|
|
import { promisify } from 'node:util';
|
|
import type {
|
|
IContainerConfig,
|
|
IContainerStatus,
|
|
TContainerHealth,
|
|
TContainerRunStatus,
|
|
} from '../interfaces/container.ts';
|
|
import { logger } from '../logger.ts';
|
|
import { DOCKER, TIMING } from '../constants.ts';
|
|
import { DriverManager } from '../drivers/driver-manager.ts';
|
|
|
|
// Promise-returning wrapper around child_process.exec; resolves with { stdout, stderr }.
const execAsync = promisify(exec);
|
|
|
|
/**
 * Container runtime execution result
 *
 * Outcome of running a command inside a container.
 */
export interface IContainerExecResult {
  /** Whether the command completed without the underlying call rejecting */
  success: boolean;
  /** Captured standard output, when available */
  output?: string;
  /** Captured standard error, or a failure description */
  error?: string;
  /** Exit code reported for a failed command, when available */
  exitCode?: number;
}
|
|
|
|
/**
 * Container logs options
 */
export interface ILogsOptions {
  /** Number of trailing log lines to request */
  lines?: number;
  /** Whether the log stream should be followed */
  follow?: boolean;
  /** Whether each log line should carry a timestamp */
  timestamps?: boolean;
}
|
|
|
|
/**
|
|
* Container Runtime class - manages individual containers
|
|
*/
|
|
export class ContainerRuntime {
|
|
private driverManager: DriverManager;
|
|
|
|
/**
 * Create a runtime with its own DriverManager instance.
 */
constructor() {
  const manager = new DriverManager();
  this.driverManager = manager;
}
|
|
|
|
/**
 * Start a container with the given configuration
 *
 * If a container named `modelgrid-<config.id>` already exists it is reused:
 * a running one is left alone, a stopped one is started with `docker start`.
 * When starting the existing container fails it is removed and a fresh one
 * is created with `docker run`.
 *
 * @param config - Container configuration; `config.id` determines the Docker name
 * @returns true when the container is running or was started, false when `docker run` failed
 */
public async startContainer(config: IContainerConfig): Promise<boolean> {
  const name = `modelgrid-${config.id}`;
  const describe = (failure: unknown): string =>
    failure instanceof Error ? failure.message : String(failure);

  const priorId = await this.getContainerIdByName(name);
  if (priorId) {
    if (await this.isContainerRunning(priorId)) {
      logger.dim(`Container ${name} is already running`);
      return true;
    }

    // The container exists but is stopped: try to bring it back up first
    try {
      await execAsync(`docker start ${priorId}`, {
        timeout: TIMING.CONTAINER_STARTUP_TIMEOUT_MS,
      });
      logger.success(`Started existing container: ${name}`);
      return true;
    } catch (failure) {
      logger.error(`Failed to start existing container: ${describe(failure)}`);
      // Fall through to a clean recreate
      await this.removeContainer(config.id);
    }
  }

  // Assemble the full `docker run` command line
  const runArgs = await this.buildRunArgs(config);
  const cmd = `docker run ${runArgs.join(' ')}`;

  logger.info(`Starting container: ${name}`);
  logger.dim(`Command: ${cmd}`);

  try {
    await execAsync(cmd, { timeout: TIMING.CONTAINER_STARTUP_TIMEOUT_MS });
    logger.success(`Container ${name} started`);

    // NOTE(review): the health wait result is not used for the return value —
    // confirm whether an unhealthy/timeout outcome should be reported as failure.
    await this.waitForHealth(name);

    return true;
  } catch (failure) {
    logger.error(`Failed to start container: ${describe(failure)}`);
    return false;
  }
}
|
|
|
|
/**
 * Stop a container
 *
 * @param containerId - Container id; the Docker name is `modelgrid-<containerId>`
 * @param timeout - Grace period in seconds passed to `docker stop -t`
 * @returns true when the container was stopped or no Docker ID could be resolved
 *          for it, false when `docker stop` failed
 */
public async stopContainer(containerId: string, timeout: number = 30): Promise<boolean> {
  const name = `modelgrid-${containerId}`;

  try {
    const id = await this.getContainerIdByName(name);
    if (id) {
      logger.info(`Stopping container: ${name}`);
      // Allow the exec call 10 s more than the grace period given to Docker
      const execTimeoutMs = (timeout + 10) * 1000;
      await execAsync(`docker stop -t ${timeout} ${id}`, {
        timeout: execTimeoutMs,
      });
      logger.success(`Container ${name} stopped`);
    } else {
      logger.dim(`Container ${name} not found`);
    }
    return true;
  } catch (failure) {
    const reason = failure instanceof Error ? failure.message : String(failure);
    logger.error(`Failed to stop container: ${reason}`);
    return false;
  }
}
|
|
|
|
/**
 * Remove a container
 *
 * @param containerId - Container id; the Docker name is `modelgrid-<containerId>`
 * @param force - When true (default) `-f` is passed to `docker rm`
 * @returns true when the container was removed or no Docker ID could be resolved
 *          for it, false when `docker rm` failed
 */
public async removeContainer(containerId: string, force: boolean = true): Promise<boolean> {
  const name = `modelgrid-${containerId}`;

  try {
    const id = await this.getContainerIdByName(name);
    if (id) {
      const flag = force ? '-f' : '';
      await execAsync(`docker rm ${flag} ${id}`, { timeout: 30000 });
      logger.success(`Container ${name} removed`);
    }
    return true;
  } catch (failure) {
    const reason = failure instanceof Error ? failure.message : String(failure);
    logger.error(`Failed to remove container: ${reason}`);
    return false;
  }
}
|
|
|
|
/**
 * Restart a container
 *
 * @param containerId - Container id; the Docker name is `modelgrid-<containerId>`
 * @returns true when `docker restart` succeeded, false when no Docker ID could
 *          be resolved for the container or the restart failed
 */
public async restartContainer(containerId: string): Promise<boolean> {
  const name = `modelgrid-${containerId}`;

  try {
    const id = await this.getContainerIdByName(name);
    if (id) {
      await execAsync(`docker restart ${id}`, {
        timeout: TIMING.CONTAINER_STARTUP_TIMEOUT_MS,
      });
      logger.success(`Container ${name} restarted`);
      return true;
    }

    logger.error(`Container ${name} not found`);
    return false;
  } catch (failure) {
    const reason = failure instanceof Error ? failure.message : String(failure);
    logger.error(`Failed to restart container: ${reason}`);
    return false;
  }
}
|
|
|
|
/**
 * Get container status
 *
 * Starts from a "stopped / unknown health" status derived from the config and
 * enriches it with `docker inspect` data, loaded models and resource usage.
 * Any error while querying Docker is logged and the status gathered so far
 * is returned.
 *
 * @param config - Container configuration; `config.id` determines the Docker name
 * @returns The status object; only the config-derived defaults when no Docker
 *          ID could be resolved for the container
 */
public async getContainerStatus(config: IContainerConfig): Promise<IContainerStatus> {
  const containerName = `modelgrid-${config.id}`;

  const status: IContainerStatus = {
    id: config.id,
    name: config.name,
    type: config.type,
    running: false,
    runStatus: 'stopped',
    health: 'unknown',
    loadedModels: [],
    assignedGpus: config.gpuIds,
    endpoint: `http://localhost:${config.externalPort || config.port}`,
  };

  try {
    const dockerId = await this.getContainerIdByName(containerName);
    if (!dockerId) {
      return status;
    }

    status.dockerId = dockerId;

    // Get container info
    const { stdout } = await execAsync(
      `docker inspect --format='{{json .}}' ${dockerId}`,
      { timeout: 5000 },
    );

    const info = JSON.parse(stdout);
    const state = info.State;

    // Get run status.
    // Docker reports Running=true while a container is in its restart loop,
    // so Restarting has to be checked first or it would never be observed.
    status.running = state.Running === true;
    if (state.Restarting) {
      status.runStatus = 'starting';
    } else if (state.Running) {
      status.runStatus = 'running';
    } else if (state.ExitCode !== 0) {
      status.runStatus = 'error';
      status.lastError = state.Error || `Exit code: ${state.ExitCode}`;
    } else {
      status.runStatus = 'stopped';
    }

    // Get health status
    if (state.Health) {
      status.health = state.Health.Status as TContainerHealth;
      if (state.Health.Log && state.Health.Log.length > 0) {
        const lastLog = state.Health.Log[state.Health.Log.length - 1];
        if (lastLog.Output) {
          // Keep the message short: only the first 200 characters
          status.healthMessage = lastLog.Output.substring(0, 200);
        }
      }
    }

    // Get uptime
    if (state.StartedAt) {
      const startTime = new Date(state.StartedAt).getTime();
      // Containers that never started carry the zero time (year 0001), which
      // parses to a negative epoch value; skip it (and unparsable dates).
      if (Number.isFinite(startTime) && startTime > 0) {
        status.startTime = startTime;
        if (status.running) {
          status.uptime = Math.floor((Date.now() - startTime) / 1000);
        }
      }
    }

    // Try to get loaded models from container
    if (status.running) {
      status.loadedModels = await this.getLoadedModels(config);
    }

    // Get resource usage
    const stats = await this.getContainerStats(dockerId);
    if (stats) {
      status.memoryUsage = stats.memoryUsage;
      status.cpuUsage = stats.cpuUsage;
    }
  } catch (error) {
    logger.dim(`Error getting container status: ${error instanceof Error ? error.message : String(error)}`);
  }

  return status;
}
|
|
|
|
/**
 * Get container resource stats
 *
 * Runs a single `docker stats --no-stream` sample and parses it.
 *
 * @param dockerId - Docker container ID (or name) to sample
 * @returns Memory usage in MiB (rounded) and CPU usage in percent, or null
 *          when the command fails or its output cannot be split into both fields
 */
private async getContainerStats(
  dockerId: string,
): Promise<{ memoryUsage: number; cpuUsage: number } | null> {
  try {
    const { stdout } = await execAsync(
      `docker stats ${dockerId} --no-stream --format "{{.MemUsage}},{{.CPUPerc}}"`,
      { timeout: 5000 },
    );

    const [memStr, cpuStr] = stdout.trim().split(',');
    if (memStr === undefined || cpuStr === undefined) {
      return null;
    }

    // Parse memory (e.g., "1.5GiB / 16GiB"). Only the part before the slash is
    // the usage; matching against the whole string would pick up the limit
    // whenever the usage is reported in B, KiB or TiB.
    const usedStr = memStr.split('/')[0];
    const memMatch = /([\d.]+)\s*([KMGTP]?)i?B/i.exec(usedStr);
    let memoryUsage = 0;
    if (memMatch) {
      // Conversion factors from the reported unit to MiB
      const toMiB: Record<string, number> = {
        '': 1 / (1024 * 1024),
        k: 1 / 1024,
        m: 1,
        g: 1024,
        t: 1024 * 1024,
        p: 1024 * 1024 * 1024,
      };
      const value = parseFloat(memMatch[1]);
      const factor = toMiB[memMatch[2].toLowerCase()];
      if (Number.isFinite(value) && factor !== undefined) {
        memoryUsage = value * factor;
      }
    }

    // Parse CPU (e.g., "25.50%")
    const cpuUsage = parseFloat(cpuStr.replace('%', '')) || 0;

    return { memoryUsage: Math.round(memoryUsage), cpuUsage };
  } catch {
    // Stats are best-effort: callers treat null as "not available"
    return null;
  }
}
|
|
|
|
/**
 * Get loaded models from a container
 *
 * For `ollama` containers the model list is fetched by running `curl` inside
 * the container against the Ollama tags endpoint. For `vllm` and `tgi` the
 * configured models are returned. Any failure yields an empty list.
 *
 * @param config - Container configuration (`type`, `id`, `models` are read)
 * @returns Model names, or an empty array when unknown or on error
 */
private async getLoadedModels(config: IContainerConfig): Promise<string[]> {
  const name = `modelgrid-${config.id}`;

  try {
    const kind = config.type;

    if (kind === 'ollama') {
      // Query Ollama API for loaded models
      const response = await execAsync(
        `docker exec ${name} curl -s http://localhost:11434/api/tags`,
        { timeout: 5000 },
      );
      const payload = JSON.parse(response.stdout);
      const entries = payload.models || [];
      return entries.map((entry: { name: string }) => entry.name);
    }

    if (kind === 'vllm' || kind === 'tgi') {
      // These typically serve a single model
      return config.models || [];
    }

    return [];
  } catch {
    return [];
  }
}
|
|
|
|
/**
 * Execute a command inside a container
 *
 * The command string is appended verbatim to `docker exec <id>` and run
 * through the host shell, so callers must pass trusted input only.
 *
 * @param containerId - Container id; the Docker name is `modelgrid-<containerId>`
 * @param command - Command line to run inside the container
 * @param timeout - Timeout for the call in milliseconds
 * @returns Result with captured stdout/stderr; on failure `success` is false and
 *          `exitCode` is set only when the process reported a numeric exit code
 */
public async exec(
  containerId: string,
  command: string,
  timeout: number = 30000,
): Promise<IContainerExecResult> {
  const containerName = `modelgrid-${containerId}`;

  try {
    const dockerId = await this.getContainerIdByName(containerName);
    if (!dockerId) {
      return { success: false, error: 'Container not found' };
    }

    const { stdout, stderr } = await execAsync(
      `docker exec ${dockerId} ${command}`,
      { timeout },
    );

    return {
      success: true,
      output: stdout,
      error: stderr || undefined,
    };
  } catch (error) {
    const err = error as { code?: unknown; stdout?: string; stderr?: string };
    return {
      success: false,
      output: err.stdout,
      error: err.stderr || (error instanceof Error ? error.message : String(error)),
      // `code` is a string (e.g. 'ENOENT') or null for spawn failures and
      // timeouts; only a numeric value is a real process exit code.
      exitCode: typeof err.code === 'number' ? err.code : undefined,
    };
  }
}
|
|
|
|
/**
 * Get container logs
 *
 * @param containerId - Container id; the Docker name is `modelgrid-<containerId>`
 * @param options - `lines` (default 100) maps to `--tail`, `timestamps`
 *                  (default false) to `--timestamps`
 * @returns Combined stdout and stderr of `docker logs`, an empty string when no
 *          Docker ID could be resolved, or the error message when the call fails
 */
public async getLogs(
  containerId: string,
  options: ILogsOptions = {},
): Promise<string> {
  const name = `modelgrid-${containerId}`;
  const tail = options.lines === undefined ? 100 : options.lines;
  const withTimestamps = options.timestamps === undefined ? false : options.timestamps;

  try {
    const id = await this.getContainerIdByName(name);
    if (!id) {
      return '';
    }

    let cmd = 'docker logs';
    if (tail) {
      cmd += ` --tail=${tail}`;
    }
    if (withTimestamps) {
      cmd += ' --timestamps';
    }
    cmd += ` ${id}`;

    const result = await execAsync(cmd, { timeout: 10000 });

    // Both streams are returned, stdout first
    return result.stdout + result.stderr;
  } catch (failure) {
    return failure instanceof Error ? failure.message : String(failure);
  }
}
|
|
|
|
/**
 * Follow container logs (returns a way to stop following)
 *
 * Spawns `docker logs -f modelgrid-<containerId>` and forwards every chunk
 * from both stdout and stderr to `onData` as a string.
 *
 * @param containerId - Container id; the Docker name is `modelgrid-<containerId>`
 * @param onData - Callback invoked with each chunk of log output
 * @returns Handle whose `stop()` kills the spawned `docker logs` process
 */
public followLogs(
  containerId: string,
  onData: (data: string) => void,
): { stop: () => void } {
  const containerName = `modelgrid-${containerId}`;

  const child = spawn('docker', ['logs', '-f', containerName], {
    stdio: ['ignore', 'pipe', 'pipe'],
  });

  // A spawn failure (e.g. the docker binary is missing) is delivered as an
  // 'error' event; without a listener it would be thrown as an uncaught
  // exception and take the whole process down.
  child.on('error', (error) => {
    logger.error(`Failed to follow logs for ${containerName}: ${error.message}`);
  });

  child.stdout.on('data', (data) => onData(data.toString()));
  child.stderr.on('data', (data) => onData(data.toString()));

  return {
    stop: () => {
      child.kill();
    },
  };
}
|
|
|
|
/**
 * Build docker run arguments
 *
 * Produces the argument list that follows `docker run`, in this order:
 * detached/name/network, port mapping, restart policy, optional memory and
 * CPU limits, GPU arguments from the driver manager, environment variables,
 * volume mounts, the image, and finally any custom command.
 *
 * NOTE(review): values are inserted unquoted; the caller visible in this file
 * joins the list with spaces into a shell command, so values containing
 * spaces or shell metacharacters are interpreted by the host shell.
 *
 * @param config - Container configuration to translate
 * @returns Argument strings for `docker run`
 */
private async buildRunArgs(config: IContainerConfig): Promise<string[]> {
  // The host port falls back to the container port when none is configured
  const hostPort = config.externalPort || config.port;

  const runArgs: string[] = [
    '-d', // Detached mode
    `--name=modelgrid-${config.id}`,
    `--network=${DOCKER.DEFAULT_NETWORK}`,
    `-p ${hostPort}:${config.port}`,
    `--restart=${config.restartPolicy}`,
  ];

  // Optional resource limits
  if (config.memoryLimit) {
    runArgs.push(`--memory=${config.memoryLimit}`);
  }
  if (config.cpuLimit) {
    runArgs.push(`--cpus=${config.cpuLimit}`);
  }

  // GPU support
  if (config.gpuIds && config.gpuIds.length > 0) {
    const gpuFlags = await this.driverManager.getDockerGpuArgs(config.gpuIds);
    runArgs.push(...gpuFlags);
  }

  // Environment variables
  if (config.env) {
    const envFlags = Object.entries(config.env).map(([key, value]) => `-e ${key}=${value}`);
    runArgs.push(...envFlags);
  }

  // Volume mounts
  if (config.volumes) {
    for (const mount of config.volumes) {
      runArgs.push(`-v ${mount}`);
    }
  }

  // Image, followed by the custom command when one is provided
  runArgs.push(config.image);
  if (config.command && config.command.length > 0) {
    runArgs.push(...config.command);
  }

  return runArgs;
}
|
|
|
|
/**
 * Get Docker container ID by name
 *
 * Lists all containers (running or not) filtered by an anchored name pattern.
 *
 * @param name - Docker container name to look up
 * @returns The trimmed `docker ps` output, or null when it is empty or the
 *          command fails
 */
private async getContainerIdByName(name: string): Promise<string | null> {
  try {
    const listing = await execAsync(
      `docker ps -a --filter "name=^${name}$" --format "{{.ID}}"`,
      { timeout: 5000 },
    );
    const id = listing.stdout.trim();
    return id ? id : null;
  } catch {
    return null;
  }
}
|
|
|
|
/**
 * Check if a container is running
 *
 * @param dockerId - Docker container ID to inspect
 * @returns true only when `docker inspect` prints `true` for `.State.Running`;
 *          false otherwise, including when the command fails
 */
private async isContainerRunning(dockerId: string): Promise<boolean> {
  try {
    const inspection = await execAsync(
      `docker inspect --format='{{.State.Running}}' ${dockerId}`,
      { timeout: 5000 },
    );
    const flag = inspection.stdout.trim();
    return flag === 'true';
  } catch {
    return false;
  }
}
|
|
|
|
/**
 * Wait for container to be healthy
 *
 * Polls `docker inspect` every two seconds until the health status is
 * `healthy`, the status is `unhealthy`, or the timeout elapses. When the
 * health query yields `none`, a running container counts as healthy.
 *
 * @param containerName - Docker container name to poll
 * @param timeout - Maximum time to wait in milliseconds
 * @returns true when healthy (or running with a `none` health status),
 *          false when unhealthy or on timeout
 */
private async waitForHealth(
  containerName: string,
  timeout: number = TIMING.CONTAINER_STARTUP_TIMEOUT_MS,
): Promise<boolean> {
  const pollEveryMs = 2000;
  const began = Date.now();

  while (Date.now() - began < timeout) {
    try {
      // The shell fallback prints "none" when the health query itself fails
      const probe = await execAsync(
        `docker inspect --format='{{.State.Health.Status}}' ${containerName} 2>/dev/null || echo "none"`,
        { timeout: 5000 },
      );
      const health = probe.stdout.trim();

      switch (health) {
        case 'healthy':
          return true;

        case 'unhealthy':
          logger.warn(`Container ${containerName} is unhealthy`);
          return false;

        case 'none': {
          // Container has no health check, assume healthy if running
          const runState = await execAsync(
            `docker inspect --format='{{.State.Running}}' ${containerName}`,
            { timeout: 5000 },
          );
          if (runState.stdout.trim() === 'true') {
            return true;
          }
          break;
        }

        default:
          // Any other status: keep polling
          break;
      }
    } catch {
      // Container might not be ready yet
    }

    await this.sleep(pollEveryMs);
  }

  logger.warn(`Timeout waiting for container ${containerName} to be healthy`);
  return false;
}
|
|
|
|
/**
 * Sleep helper
 *
 * @param ms - Delay in milliseconds
 * @returns Promise that resolves once the timer fires
 */
private async sleep(ms: number): Promise<void> {
  await new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
|
}
|