/**
 * Container Runtime
 *
 * Manages individual Docker containers for AI model serving.
 */

import { exec, spawn } from 'node:child_process';
import { promisify } from 'node:util';
import type {
  IContainerConfig,
  IContainerStatus,
  TContainerHealth,
  TContainerRunStatus,
} from '../interfaces/container.ts';
import { logger } from '../logger.ts';
import { DOCKER, TIMING } from '../constants.ts';
import { DriverManager } from '../drivers/driver-manager.ts';

const execAsync = promisify(exec);

/**
 * Conversion factors from the binary size units handled here to MiB.
 * Keys are lower-cased unit suffixes.
 */
const MEMORY_UNIT_TO_MIB: Record<string, number> = {
  b: 1 / (1024 * 1024),
  kib: 1 / 1024,
  mib: 1,
  gib: 1024,
  tib: 1024 * 1024,
};

/**
 * Render a caught value as a human-readable message.
 */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Parse the usage part of a memory string such as "1.5GiB / 16GiB" into MiB.
 *
 * The match is anchored to the start of the string so that only the first
 * figure (the usage) is read; an unanchored search would fall through to the
 * figure after the slash whenever the usage is expressed in an unhandled unit.
 *
 * @param usage - Raw memory column, e.g. "1.5GiB / 16GiB" or "500KiB / 16GiB"
 * @returns Usage in MiB (unrounded), or 0 when the string cannot be parsed
 */
function parseMemoryUsageMiB(usage: string): number {
  const match = usage.trim().match(/^([\d.]+)\s*(B|KiB|MiB|GiB|TiB)/i);
  if (!match) {
    return 0;
  }

  const amount = parseFloat(match[1]);
  const factor = MEMORY_UNIT_TO_MIB[match[2].toLowerCase()];
  if (Number.isNaN(amount) || factor === undefined) {
    return 0;
  }

  return amount * factor;
}

/**
 * Container runtime execution result
 */
export interface IContainerExecResult {
  success: boolean;
  output?: string;
  error?: string;
  exitCode?: number;
}

/**
 * Container logs options
 */
export interface ILogsOptions {
  lines?: number;
  follow?: boolean;
  timestamps?: boolean;
}

/**
 * Container Runtime class - manages individual containers
 *
 * Containers are addressed by the Docker name `modelgrid-<id>`, where `<id>`
 * is the container id from the configuration.
 */
export class ContainerRuntime {
  private driverManager: DriverManager;

  constructor() {
    this.driverManager = new DriverManager();
  }

  /**
   * Start a container with the given configuration.
   *
   * If a container with the expected name already exists it is reused: an
   * already running container is left alone, a stopped one is started. When
   * starting the existing container fails, it is removed and a new one is
   * created with `docker run`.
   *
   * @param config - Container configuration
   * @returns `true` when the container is running or was started, `false`
   *          when `docker run` failed. The outcome of the health wait is not
   *          reflected in the return value.
   */
  public async startContainer(config: IContainerConfig): Promise<boolean> {
    const containerName = `modelgrid-${config.id}`;

    // Check if container already exists
    const existingId = await this.getContainerIdByName(containerName);
    if (existingId) {
      // Check if it's running
      const isRunning = await this.isContainerRunning(existingId);
      if (isRunning) {
        logger.dim(`Container ${containerName} is already running`);
        return true;
      }

      // Start existing container
      try {
        await execAsync(`docker start ${existingId}`, {
          timeout: TIMING.CONTAINER_STARTUP_TIMEOUT_MS,
        });
        logger.success(`Started existing container: ${containerName}`);
        return true;
      } catch (error) {
        logger.error(`Failed to start existing container: ${errorMessage(error)}`);
        // Try to remove and recreate
        await this.removeContainer(config.id);
      }
    }

    // Build docker run command
    const args = await this.buildRunArgs(config);
    const cmd = `docker run ${args.join(' ')}`;

    logger.info(`Starting container: ${containerName}`);
    logger.dim(`Command: ${cmd}`);

    try {
      await execAsync(cmd, { timeout: TIMING.CONTAINER_STARTUP_TIMEOUT_MS });
      logger.success(`Container ${containerName} started`);

      // Wait for container to be healthy. waitForHealth logs its own warning
      // on failure; the start itself is still reported as successful.
      await this.waitForHealth(containerName);

      return true;
    } catch (error) {
      logger.error(`Failed to start container: ${errorMessage(error)}`);
      return false;
    }
  }

  /**
   * Stop a container.
   *
   * @param containerId - Container id (without the `modelgrid-` prefix)
   * @param timeout - Value passed to `docker stop -t`; the exec timeout is
   *                  this value plus 10, in seconds
   * @returns `true` when the container was stopped or does not exist,
   *          `false` when the stop command failed
   */
  public async stopContainer(containerId: string, timeout: number = 30): Promise<boolean> {
    const containerName = `modelgrid-${containerId}`;

    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        logger.dim(`Container ${containerName} not found`);
        return true;
      }

      logger.info(`Stopping container: ${containerName}`);

      await execAsync(`docker stop -t ${timeout} ${dockerId}`, {
        timeout: (timeout + 10) * 1000,
      });

      logger.success(`Container ${containerName} stopped`);
      return true;
    } catch (error) {
      logger.error(`Failed to stop container: ${errorMessage(error)}`);
      return false;
    }
  }

  /**
   * Remove a container.
   *
   * @param containerId - Container id (without the `modelgrid-` prefix)
   * @param force - When `true`, `-f` is passed to `docker rm`
   * @returns `true` when the container was removed or does not exist,
   *          `false` when the remove command failed
   */
  public async removeContainer(containerId: string, force: boolean = true): Promise<boolean> {
    const containerName = `modelgrid-${containerId}`;

    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        return true;
      }

      const forceFlag = force ? '-f' : '';
      await execAsync(`docker rm ${forceFlag} ${dockerId}`, { timeout: 30000 });

      logger.success(`Container ${containerName} removed`);
      return true;
    } catch (error) {
      logger.error(`Failed to remove container: ${errorMessage(error)}`);
      return false;
    }
  }

  /**
   * Restart a container.
   *
   * @param containerId - Container id (without the `modelgrid-` prefix)
   * @returns `true` when the restart succeeded, `false` when the container
   *          does not exist or the restart command failed
   */
  public async restartContainer(containerId: string): Promise<boolean> {
    const containerName = `modelgrid-${containerId}`;

    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        logger.error(`Container ${containerName} not found`);
        return false;
      }

      await execAsync(`docker restart ${dockerId}`, {
        timeout: TIMING.CONTAINER_STARTUP_TIMEOUT_MS,
      });

      logger.success(`Container ${containerName} restarted`);
      return true;
    } catch (error) {
      logger.error(`Failed to restart container: ${errorMessage(error)}`);
      return false;
    }
  }

  /**
   * Get container status.
   *
   * Starts from a "stopped / unknown health" status derived from the
   * configuration and fills in whatever `docker inspect` and `docker stats`
   * report. Any error while querying Docker is logged and the status gathered
   * so far is returned.
   *
   * @param config - Container configuration
   * @returns The container status
   */
  public async getContainerStatus(config: IContainerConfig): Promise<IContainerStatus> {
    const containerName = `modelgrid-${config.id}`;

    const status: IContainerStatus = {
      id: config.id,
      name: config.name,
      type: config.type,
      running: false,
      runStatus: 'stopped',
      health: 'unknown',
      loadedModels: [],
      assignedGpus: config.gpuIds,
      endpoint: `http://localhost:${config.externalPort || config.port}`,
    };

    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        return status;
      }

      status.dockerId = dockerId;

      // Get container info
      const { stdout } = await execAsync(
        `docker inspect --format='{{json .}}' ${dockerId}`,
        { timeout: 5000 },
      );

      const info = JSON.parse(stdout);

      // Get run status
      status.running = info.State.Running === true;

      if (info.State.Running) {
        status.runStatus = 'running';
      } else if (info.State.Restarting) {
        status.runStatus = 'starting';
      } else if (info.State.ExitCode !== 0) {
        status.runStatus = 'error';
        status.lastError = info.State.Error || `Exit code: ${info.State.ExitCode}`;
      } else {
        status.runStatus = 'stopped';
      }

      // Get health status
      if (info.State.Health) {
        status.health = info.State.Health.Status as TContainerHealth;

        if (info.State.Health.Log && info.State.Health.Log.length > 0) {
          const lastLog = info.State.Health.Log[info.State.Health.Log.length - 1];
          if (lastLog.Output) {
            // Keep only the first 200 characters of the last health check output
            status.healthMessage = lastLog.Output.substring(0, 200);
          }
        }
      }

      // Get uptime
      if (info.State.StartedAt) {
        const startTime = new Date(info.State.StartedAt).getTime();
        status.startTime = startTime;
        if (status.running) {
          status.uptime = Math.floor((Date.now() - startTime) / 1000);
        }
      }

      // Try to get loaded models from container
      if (status.running) {
        status.loadedModels = await this.getLoadedModels(config);
      }

      // Get resource usage
      const stats = await this.getContainerStats(dockerId);
      if (stats) {
        status.memoryUsage = stats.memoryUsage;
        status.cpuUsage = stats.cpuUsage;
      }
    } catch (error) {
      logger.dim(`Error getting container status: ${errorMessage(error)}`);
    }

    return status;
  }

  /**
   * Get container resource stats.
   *
   * @param dockerId - Docker container id
   * @returns Memory usage in MiB (rounded) and CPU usage in percent, or
   *          `null` when the stats could not be read
   */
  private async getContainerStats(
    dockerId: string,
  ): Promise<{ memoryUsage: number; cpuUsage: number } | null> {
    try {
      const { stdout } = await execAsync(
        `docker stats ${dockerId} --no-stream --format "{{.MemUsage}},{{.CPUPerc}}"`,
        { timeout: 5000 },
      );

      const [memStr, cpuStr] = stdout.trim().split(',');
      if (memStr === undefined || cpuStr === undefined) {
        // Output did not contain both comma-separated fields
        return null;
      }

      // Parse memory (e.g., "1.5GiB / 16GiB"); only the usage figure is read
      const memoryUsage = parseMemoryUsageMiB(memStr);

      // Parse CPU (e.g., "25.50%")
      const cpuUsage = parseFloat(cpuStr.replace('%', '')) || 0;

      return { memoryUsage: Math.round(memoryUsage), cpuUsage };
    } catch {
      return null;
    }
  }

  /**
   * Get loaded models from a container.
   *
   * For `ollama` containers the model list is fetched by running `curl`
   * inside the container; for `vllm` and `tgi` the configured models are
   * returned. Any failure yields an empty list.
   *
   * @param config - Container configuration
   * @returns Names of the loaded models
   */
  private async getLoadedModels(config: IContainerConfig): Promise<string[]> {
    const containerName = `modelgrid-${config.id}`;

    try {
      switch (config.type) {
        case 'ollama': {
          // Query Ollama API for loaded models
          const { stdout } = await execAsync(
            `docker exec ${containerName} curl -s http://localhost:11434/api/tags`,
            { timeout: 5000 },
          );
          const data = JSON.parse(stdout);
          return (data.models || []).map((m: { name: string }) => m.name);
        }

        case 'vllm':
        case 'tgi': {
          // These typically serve a single model
          return config.models || [];
        }

        default:
          return [];
      }
    } catch {
      return [];
    }
  }

  /**
   * Execute a command inside a container.
   *
   * The command string is appended verbatim to `docker exec <id>` and run
   * through the host shell.
   *
   * @param containerId - Container id (without the `modelgrid-` prefix)
   * @param command - Command line to run inside the container
   * @param timeout - Exec timeout in milliseconds
   * @returns Result with captured output; `success` is `false` when the
   *          container is missing or the command failed
   */
  public async exec(
    containerId: string,
    command: string,
    timeout: number = 30000,
  ): Promise<IContainerExecResult> {
    const containerName = `modelgrid-${containerId}`;

    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        return { success: false, error: 'Container not found' };
      }

      const { stdout, stderr } = await execAsync(
        `docker exec ${dockerId} ${command}`,
        { timeout },
      );

      return {
        success: true,
        output: stdout,
        error: stderr || undefined,
      };
    } catch (error) {
      // Rejections from exec carry the exit code and any captured output
      const err = error as { code?: number; stdout?: string; stderr?: string };
      return {
        success: false,
        output: err.stdout,
        error: err.stderr || errorMessage(error),
        exitCode: err.code,
      };
    }
  }

  /**
   * Get container logs.
   *
   * Only `lines` and `timestamps` are read from the options; use
   * `followLogs` to stream logs.
   *
   * @param containerId - Container id (without the `modelgrid-` prefix)
   * @param options - Log options (`lines` defaults to 100)
   * @returns stdout followed by stderr of `docker logs`, an empty string when
   *          the container does not exist, or the error message on failure
   */
  public async getLogs(
    containerId: string,
    options: ILogsOptions = {},
  ): Promise<string> {
    const containerName = `modelgrid-${containerId}`;
    const { lines = 100, timestamps = false } = options;

    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        return '';
      }

      const args = ['logs'];
      if (lines) args.push(`--tail=${lines}`);
      if (timestamps) args.push('--timestamps');
      args.push(dockerId);

      const { stdout, stderr } = await execAsync(
        `docker ${args.join(' ')}`,
        { timeout: 10000 },
      );

      return stdout + stderr;
    } catch (error) {
      return errorMessage(error);
    }
  }

  /**
   * Follow container logs (returns a way to stop following).
   *
   * @param containerId - Container id (without the `modelgrid-` prefix)
   * @param onData - Called with each chunk written to stdout or stderr
   * @returns Handle whose `stop()` kills the underlying `docker logs -f` process
   */
  public followLogs(
    containerId: string,
    onData: (data: string) => void,
  ): { stop: () => void } {
    const containerName = `modelgrid-${containerId}`;

    const child = spawn('docker', ['logs', '-f', containerName], {
      stdio: ['ignore', 'pipe', 'pipe'],
    });

    // Without a listener, a failed spawn (e.g. docker binary missing) emits an
    // unhandled 'error' event, which would take the whole process down.
    child.on('error', (error) => {
      logger.error(`Failed to follow logs for ${containerName}: ${errorMessage(error)}`);
    });

    child.stdout.on('data', (data) => onData(data.toString()));
    child.stderr.on('data', (data) => onData(data.toString()));

    return {
      stop: () => {
        child.kill();
      },
    };
  }

  /**
   * Build docker run arguments.
   *
   * The returned entries are joined with spaces into a shell command by the
   * caller.
   *
   * NOTE(review): env values, volume specs and command entries are inserted
   * unquoted, so values containing whitespace or shell metacharacters will be
   * split or interpreted by the shell — confirm callers only pass shell-safe
   * values.
   *
   * @param config - Container configuration
   * @returns Arguments to append after `docker run`
   */
  private async buildRunArgs(config: IContainerConfig): Promise<string[]> {
    const containerName = `modelgrid-${config.id}`;
    const args: string[] = [
      '-d', // Detached mode
      `--name=${containerName}`,
      `--network=${DOCKER.DEFAULT_NETWORK}`,
    ];

    // Port mapping
    const externalPort = config.externalPort || config.port;
    args.push(`-p ${externalPort}:${config.port}`);

    // Restart policy
    args.push(`--restart=${config.restartPolicy}`);

    // Memory limit
    if (config.memoryLimit) {
      args.push(`--memory=${config.memoryLimit}`);
    }

    // CPU limit
    if (config.cpuLimit) {
      args.push(`--cpus=${config.cpuLimit}`);
    }

    // GPU support
    if (config.gpuIds && config.gpuIds.length > 0) {
      const gpuArgs = await this.driverManager.getDockerGpuArgs(config.gpuIds);
      args.push(...gpuArgs);
    }

    // Environment variables
    if (config.env) {
      for (const [key, value] of Object.entries(config.env)) {
        args.push(`-e ${key}=${value}`);
      }
    }

    // Volume mounts
    if (config.volumes) {
      for (const volume of config.volumes) {
        args.push(`-v ${volume}`);
      }
    }

    // Add image
    args.push(config.image);

    // Add custom command if provided
    if (config.command && config.command.length > 0) {
      args.push(...config.command);
    }

    return args;
  }

  /**
   * Get Docker container ID by name.
   *
   * @param name - Container name; the filter is anchored with `^` and `$`
   * @returns The trimmed output of `docker ps`, or `null` when it is empty or
   *          the command failed
   */
  private async getContainerIdByName(name: string): Promise<string | null> {
    try {
      const { stdout } = await execAsync(
        `docker ps -a --filter "name=^${name}$" --format "{{.ID}}"`,
        { timeout: 5000 },
      );
      return stdout.trim() || null;
    } catch {
      return null;
    }
  }

  /**
   * Check if a container is running.
   *
   * @param dockerId - Docker container id
   * @returns `true` when `docker inspect` reports the container as running;
   *          `false` otherwise, including when the command fails
   */
  private async isContainerRunning(dockerId: string): Promise<boolean> {
    try {
      const { stdout } = await execAsync(
        `docker inspect --format='{{.State.Running}}' ${dockerId}`,
        { timeout: 5000 },
      );
      return stdout.trim() === 'true';
    } catch {
      return false;
    }
  }

  /**
   * Wait for container to be healthy.
   *
   * Polls `docker inspect` every two seconds until the container reports
   * `healthy`, reports `unhealthy`, or the timeout elapses. When the health
   * status cannot be read (the command falls back to printing "none"), a
   * running container is treated as healthy.
   *
   * @param containerName - Docker container name
   * @param timeout - Maximum time to wait, in milliseconds
   * @returns `true` when the container is considered healthy, `false` when it
   *          is unhealthy or the wait timed out
   */
  private async waitForHealth(
    containerName: string,
    timeout: number = TIMING.CONTAINER_STARTUP_TIMEOUT_MS,
  ): Promise<boolean> {
    const startTime = Date.now();
    const checkInterval = 2000;

    while (Date.now() - startTime < timeout) {
      try {
        const { stdout } = await execAsync(
          `docker inspect --format='{{.State.Health.Status}}' ${containerName} 2>/dev/null || echo "none"`,
          { timeout: 5000 },
        );

        const status = stdout.trim();

        if (status === 'healthy') {
          return true;
        }

        if (status === 'none') {
          // Container has no health check, assume healthy if running
          const { stdout: running } = await execAsync(
            `docker inspect --format='{{.State.Running}}' ${containerName}`,
            { timeout: 5000 },
          );
          if (running.trim() === 'true') {
            return true;
          }
        }

        if (status === 'unhealthy') {
          logger.warn(`Container ${containerName} is unhealthy`);
          return false;
        }
      } catch {
        // Container might not be ready yet
      }

      await this.sleep(checkInterval);
    }

    logger.warn(`Timeout waiting for container ${containerName} to be healthy`);
    return false;
  }

  /**
   * Sleep helper.
   *
   * @param ms - Delay in milliseconds
   */
  private async sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}