Files
modelgrid/ts/docker/container-runtime.ts
Juergen Kunz daaf6559e3
Some checks failed
CI / Type Check & Lint (push) Failing after 5s
CI / Build Test (Current Platform) (push) Failing after 5s
CI / Build All Platforms (push) Successful in 49s
initial
2026-01-30 03:16:57 +00:00

559 lines
15 KiB
TypeScript

/**
* Container Runtime
*
* Manages individual Docker containers for AI model serving.
*/
import { exec, spawn } from 'node:child_process';
import { promisify } from 'node:util';
import type {
IContainerConfig,
IContainerStatus,
TContainerHealth,
TContainerRunStatus,
} from '../interfaces/container.ts';
import { logger } from '../logger.ts';
import { DOCKER, TIMING } from '../constants.ts';
import { DriverManager } from '../drivers/driver-manager.ts';
// Promise-based wrapper around child_process.exec; callers in this file await
// it and destructure `stdout` / `stderr` from the resolved value.
const execAsync = promisify(exec);
/**
 * Container runtime execution result
 *
 * Returned by `ContainerRuntime.exec` to describe the outcome of running a
 * command inside a container.
 */
export interface IContainerExecResult {
  /** True when the `docker exec` call resolved; false when the container was not found or the call rejected. */
  success: boolean;
  /** Captured stdout of the command (also filled from the rejection's `stdout` on failure, when present). */
  output?: string;
  /** Captured stderr, or the error message when the call failed. */
  error?: string;
  /** `code` of the rejected child-process error; only populated on failure. */
  exitCode?: number;
}
/**
 * Container logs options
 */
export interface ILogsOptions {
  /** Number of trailing log lines to request (`--tail`); `getLogs` defaults this to 100. */
  lines?: number;
  /** Not read by `getLogs` in this file; streaming is done through `followLogs` instead. */
  follow?: boolean;
  /** When true, `getLogs` passes `--timestamps` to `docker logs`. */
  timestamps?: boolean;
}
/**
 * Container Runtime class - manages individual containers
 *
 * Every operation shells out to the `docker` CLI. Containers are addressed by
 * the Docker name `modelgrid-<id>`, where `<id>` is the ModelGrid container id.
 *
 * NOTE(review): most commands are assembled by string interpolation and run
 * through a shell. Values taken from the container config (env values,
 * volumes, image, command) are inserted unquoted, so they must not contain
 * whitespace or shell metacharacters unless the caller has quoted them.
 */
export class ContainerRuntime {
  /**
   * Bytes per unit suffix (lower-cased) accepted when parsing the `MemUsage`
   * column of `docker stats`. Binary suffixes are what current Docker prints;
   * decimal ones are accepted defensively.
   */
  private static readonly MEMORY_UNIT_BYTES: Record<string, number> = {
    b: 1,
    kb: 1000,
    mb: 1000 ** 2,
    gb: 1000 ** 3,
    tb: 1000 ** 4,
    kib: 1024,
    mib: 1024 ** 2,
    gib: 1024 ** 3,
    tib: 1024 ** 4,
  };

  private driverManager: DriverManager;

  constructor() {
    this.driverManager = new DriverManager();
  }

  /**
   * Start a container with the given configuration
   *
   * If a container with the expected name already exists it is reused: a
   * running one is left alone, a stopped one is started. When starting the
   * existing container fails it is removed and a fresh one is created with
   * `docker run`.
   *
   * @param config Container configuration; `config.id` determines the Docker name.
   * @returns true when the container is running or was started, false when
   *   `docker run` failed. The outcome of the subsequent health wait does not
   *   influence the return value.
   */
  public async startContainer(config: IContainerConfig): Promise<boolean> {
    const containerName = `modelgrid-${config.id}`;

    // Check if container already exists
    const existingId = await this.getContainerIdByName(containerName);
    if (existingId) {
      // Check if it's running
      const isRunning = await this.isContainerRunning(existingId);
      if (isRunning) {
        logger.dim(`Container ${containerName} is already running`);
        return true;
      }

      // Start existing container
      try {
        await execAsync(`docker start ${existingId}`, {
          timeout: TIMING.CONTAINER_STARTUP_TIMEOUT_MS,
        });
        logger.success(`Started existing container: ${containerName}`);
        return true;
      } catch (error) {
        logger.error(
          `Failed to start existing container: ${ContainerRuntime.describeError(error)}`,
        );
        // Try to remove and recreate
        await this.removeContainer(config.id);
      }
    }

    // Build docker run command
    const args = await this.buildRunArgs(config);
    const cmd = `docker run ${args.join(' ')}`;
    logger.info(`Starting container: ${containerName}`);
    logger.dim(`Command: ${cmd}`);

    try {
      await execAsync(cmd, { timeout: TIMING.CONTAINER_STARTUP_TIMEOUT_MS });
      logger.success(`Container ${containerName} started`);
      // Wait for container to be healthy; waitForHealth logs a warning itself
      // when the container is unhealthy or the wait times out.
      await this.waitForHealth(containerName);
      return true;
    } catch (error) {
      logger.error(`Failed to start container: ${ContainerRuntime.describeError(error)}`);
      return false;
    }
  }

  /**
   * Stop a container
   *
   * @param containerId ModelGrid container id (without the `modelgrid-` prefix).
   * @param timeout Grace period in seconds passed to `docker stop -t`; the
   *   CLI call itself is given ten extra seconds before it is aborted.
   * @returns true when the container was stopped or does not exist, false on error.
   */
  public async stopContainer(containerId: string, timeout: number = 30): Promise<boolean> {
    const containerName = `modelgrid-${containerId}`;
    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        logger.dim(`Container ${containerName} not found`);
        return true;
      }

      logger.info(`Stopping container: ${containerName}`);
      await execAsync(`docker stop -t ${timeout} ${dockerId}`, {
        timeout: (timeout + 10) * 1000,
      });
      logger.success(`Container ${containerName} stopped`);
      return true;
    } catch (error) {
      logger.error(`Failed to stop container: ${ContainerRuntime.describeError(error)}`);
      return false;
    }
  }

  /**
   * Remove a container
   *
   * @param containerId ModelGrid container id (without the `modelgrid-` prefix).
   * @param force When true (default) `-f` is passed to `docker rm`.
   * @returns true when the container was removed or does not exist, false on error.
   */
  public async removeContainer(containerId: string, force: boolean = true): Promise<boolean> {
    const containerName = `modelgrid-${containerId}`;
    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        return true;
      }

      const forceFlag = force ? '-f' : '';
      await execAsync(`docker rm ${forceFlag} ${dockerId}`, { timeout: 30000 });
      logger.success(`Container ${containerName} removed`);
      return true;
    } catch (error) {
      logger.error(`Failed to remove container: ${ContainerRuntime.describeError(error)}`);
      return false;
    }
  }

  /**
   * Restart a container
   *
   * @param containerId ModelGrid container id (without the `modelgrid-` prefix).
   * @returns true on success; false when the container does not exist or
   *   `docker restart` failed.
   */
  public async restartContainer(containerId: string): Promise<boolean> {
    const containerName = `modelgrid-${containerId}`;
    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        logger.error(`Container ${containerName} not found`);
        return false;
      }

      await execAsync(`docker restart ${dockerId}`, {
        timeout: TIMING.CONTAINER_STARTUP_TIMEOUT_MS,
      });
      logger.success(`Container ${containerName} restarted`);
      return true;
    } catch (error) {
      logger.error(`Failed to restart container: ${ContainerRuntime.describeError(error)}`);
      return false;
    }
  }

  /**
   * Get container status
   *
   * Starts from a "stopped / unknown health" snapshot derived from the config
   * and fills in whatever `docker inspect` and `docker stats` report. Any
   * error while querying Docker is logged and the partially filled status is
   * returned; this method does not reject because of Docker failures.
   *
   * @param config Container configuration the status is reported for.
   * @returns The status snapshot.
   */
  public async getContainerStatus(config: IContainerConfig): Promise<IContainerStatus> {
    const containerName = `modelgrid-${config.id}`;
    const status: IContainerStatus = {
      id: config.id,
      name: config.name,
      type: config.type,
      running: false,
      runStatus: 'stopped',
      health: 'unknown',
      loadedModels: [],
      assignedGpus: config.gpuIds,
      endpoint: `http://localhost:${config.externalPort || config.port}`,
    };

    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        return status;
      }
      status.dockerId = dockerId;

      // Get container info
      const { stdout } = await execAsync(
        `docker inspect --format='{{json .}}' ${dockerId}`,
        { timeout: 5000 },
      );
      const info = JSON.parse(stdout);

      // Get run status
      status.running = info.State.Running === true;
      if (info.State.Running) {
        status.runStatus = 'running';
      } else if (info.State.Restarting) {
        status.runStatus = 'starting';
      } else if (info.State.ExitCode !== 0) {
        status.runStatus = 'error';
        status.lastError = info.State.Error || `Exit code: ${info.State.ExitCode}`;
      } else {
        status.runStatus = 'stopped';
      }

      // Get health status (only present when the container defines a health check)
      if (info.State.Health) {
        status.health = info.State.Health.Status as TContainerHealth;
        if (info.State.Health.Log && info.State.Health.Log.length > 0) {
          const lastLog = info.State.Health.Log[info.State.Health.Log.length - 1];
          if (lastLog.Output) {
            // Keep the message short: only the first 200 characters are kept.
            status.healthMessage = lastLog.Output.substring(0, 200);
          }
        }
      }

      // Get uptime (seconds since StartedAt, only while running)
      if (info.State.StartedAt) {
        const startTime = new Date(info.State.StartedAt).getTime();
        status.startTime = startTime;
        if (status.running) {
          status.uptime = Math.floor((Date.now() - startTime) / 1000);
        }
      }

      // Try to get loaded models from container
      if (status.running) {
        status.loadedModels = await this.getLoadedModels(config);
      }

      // Get resource usage
      const stats = await this.getContainerStats(dockerId);
      if (stats) {
        status.memoryUsage = stats.memoryUsage;
        status.cpuUsage = stats.cpuUsage;
      }
    } catch (error) {
      logger.dim(`Error getting container status: ${ContainerRuntime.describeError(error)}`);
    }

    return status;
  }

  /**
   * Get container resource stats
   *
   * @param dockerId Docker container id to query.
   * @returns Memory usage in MiB (rounded) and CPU usage in percent, or null
   *   when `docker stats` fails or prints something unparseable.
   */
  private async getContainerStats(
    dockerId: string,
  ): Promise<{ memoryUsage: number; cpuUsage: number } | null> {
    try {
      const { stdout } = await execAsync(
        `docker stats ${dockerId} --no-stream --format "{{.MemUsage}},{{.CPUPerc}}"`,
        { timeout: 5000 },
      );
      return ContainerRuntime.parseStatsLine(stdout);
    } catch {
      return null;
    }
  }

  /**
   * Parse one `"<MemUsage>,<CPUPerc>"` line, e.g. `"1.5GiB / 16GiB,25.50%"`.
   *
   * Only the usage part before the slash is read, so a usage printed in `B`
   * or `KiB` is not confused with the memory limit that follows it.
   *
   * @param line Raw line as printed by `docker stats`.
   * @returns Memory usage in MiB (rounded; 0 when unparseable) and CPU
   *   percentage (0 when unparseable), or null when the line has no comma.
   */
  private static parseStatsLine(
    line: string,
  ): { memoryUsage: number; cpuUsage: number } | null {
    const [memStr, cpuStr] = line.trim().split(',');
    if (memStr === undefined || cpuStr === undefined) {
      return null;
    }

    // Parse memory: take the usage token in front of " / <limit>".
    const usageToken = memStr.split('/')[0].trim();
    const memMatch = usageToken.match(/^([\d.]+)\s*([a-z]+)$/i);
    let memoryUsage = 0;
    if (memMatch) {
      const value = parseFloat(memMatch[1]);
      const unitBytes = ContainerRuntime.MEMORY_UNIT_BYTES[memMatch[2].toLowerCase()];
      if (unitBytes !== undefined && !Number.isNaN(value)) {
        memoryUsage = (value * unitBytes) / ContainerRuntime.MEMORY_UNIT_BYTES.mib;
      }
    }

    // Parse CPU (e.g., "25.50%")
    const cpuUsage = parseFloat(cpuStr.replace('%', '')) || 0;

    return { memoryUsage: Math.round(memoryUsage), cpuUsage };
  }

  /**
   * Get loaded models from a container
   *
   * For `ollama` containers the model list is read from the Ollama HTTP API
   * by running `curl` inside the container. For `vllm` and `tgi` the models
   * listed in the config are returned. Any failure yields an empty list.
   *
   * @param config Container configuration.
   * @returns Names of the models considered loaded.
   */
  private async getLoadedModels(config: IContainerConfig): Promise<string[]> {
    const containerName = `modelgrid-${config.id}`;
    try {
      switch (config.type) {
        case 'ollama': {
          // Query Ollama API for loaded models
          const { stdout } = await execAsync(
            `docker exec ${containerName} curl -s http://localhost:11434/api/tags`,
            { timeout: 5000 },
          );
          const data = JSON.parse(stdout);
          return (data.models || []).map((m: { name: string }) => m.name);
        }
        case 'vllm':
        case 'tgi': {
          // These typically serve a single model
          return config.models || [];
        }
        default:
          return [];
      }
    } catch {
      return [];
    }
  }

  /**
   * Execute a command inside a container
   *
   * The command string is appended verbatim to `docker exec <id>` and the
   * resulting line is run through the host shell, so quoting in `command` is
   * interpreted on the host.
   *
   * @param containerId ModelGrid container id (without the `modelgrid-` prefix).
   * @param command Command line to run inside the container.
   * @param timeout Timeout for the `docker exec` call in milliseconds.
   * @returns The execution result; failures are reported through
   *   `success: false` instead of a rejection.
   */
  public async exec(
    containerId: string,
    command: string,
    timeout: number = 30000,
  ): Promise<IContainerExecResult> {
    const containerName = `modelgrid-${containerId}`;
    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        return { success: false, error: 'Container not found' };
      }

      const { stdout, stderr } = await execAsync(
        `docker exec ${dockerId} ${command}`,
        { timeout },
      );
      return {
        success: true,
        output: stdout,
        error: stderr || undefined,
      };
    } catch (error) {
      // A rejected exec carries the partial output and exit code on the error object.
      const err = error as { code?: number; stdout?: string; stderr?: string };
      return {
        success: false,
        output: err.stdout,
        error: err.stderr || ContainerRuntime.describeError(error),
        exitCode: err.code,
      };
    }
  }

  /**
   * Get container logs
   *
   * @param containerId ModelGrid container id (without the `modelgrid-` prefix).
   * @param options `lines` (default 100) limits the tail, `timestamps` adds
   *   `--timestamps`; `follow` is not used here.
   * @returns stdout followed by stderr of `docker logs`, an empty string when
   *   the container does not exist, or the error message when the call failed.
   */
  public async getLogs(
    containerId: string,
    options: ILogsOptions = {},
  ): Promise<string> {
    const containerName = `modelgrid-${containerId}`;
    const { lines = 100, timestamps = false } = options;
    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        return '';
      }

      const args = ['logs'];
      // A falsy `lines` (e.g. 0) omits --tail, so the full log is requested.
      if (lines) args.push(`--tail=${lines}`);
      if (timestamps) args.push('--timestamps');
      args.push(dockerId);

      const { stdout, stderr } = await execAsync(
        `docker ${args.join(' ')}`,
        { timeout: 10000 },
      );
      return stdout + stderr;
    } catch (error) {
      return ContainerRuntime.describeError(error);
    }
  }

  /**
   * Follow container logs (returns a way to stop following)
   *
   * Spawns `docker logs -f` for the container and forwards every chunk from
   * both stdout and stderr to `onData`.
   *
   * @param containerId ModelGrid container id (without the `modelgrid-` prefix).
   * @param onData Callback invoked with each chunk of log text.
   * @returns Handle whose `stop()` kills the spawned `docker logs` process.
   */
  public followLogs(
    containerId: string,
    onData: (data: string) => void,
  ): { stop: () => void } {
    const containerName = `modelgrid-${containerId}`;
    const child = spawn('docker', ['logs', '-f', containerName], {
      stdio: ['ignore', 'pipe', 'pipe'],
    });

    // Without a listener a failed spawn (e.g. docker not installed) would
    // surface as an unhandled 'error' event and take the process down.
    child.on('error', (error) => {
      logger.error(`Failed to follow logs for ${containerName}: ${error.message}`);
    });
    child.stdout.on('data', (data) => onData(data.toString()));
    child.stderr.on('data', (data) => onData(data.toString()));

    return {
      stop: () => {
        child.kill();
      },
    };
  }

  /**
   * Build docker run arguments
   *
   * The returned entries are joined with spaces into a shell command line by
   * the caller, which is why a single entry such as `-p 8080:80` may contain
   * a space.
   *
   * @param config Container configuration.
   * @returns Arguments for `docker run`, ending with the image and the
   *   optional custom command.
   */
  private async buildRunArgs(config: IContainerConfig): Promise<string[]> {
    const containerName = `modelgrid-${config.id}`;
    const args: string[] = [
      '-d', // Detached mode
      `--name=${containerName}`,
      `--network=${DOCKER.DEFAULT_NETWORK}`,
    ];

    // Port mapping
    const externalPort = config.externalPort || config.port;
    args.push(`-p ${externalPort}:${config.port}`);

    // Restart policy; skipped when unset so Docker does not receive "--restart=undefined"
    if (config.restartPolicy) {
      args.push(`--restart=${config.restartPolicy}`);
    }

    // Memory limit
    if (config.memoryLimit) {
      args.push(`--memory=${config.memoryLimit}`);
    }

    // CPU limit
    if (config.cpuLimit) {
      args.push(`--cpus=${config.cpuLimit}`);
    }

    // GPU support
    if (config.gpuIds && config.gpuIds.length > 0) {
      const gpuArgs = await this.driverManager.getDockerGpuArgs(config.gpuIds);
      args.push(...gpuArgs);
    }

    // Environment variables
    if (config.env) {
      for (const [key, value] of Object.entries(config.env)) {
        args.push(`-e ${key}=${value}`);
      }
    }

    // Volume mounts
    if (config.volumes) {
      for (const volume of config.volumes) {
        args.push(`-v ${volume}`);
      }
    }

    // Add image
    args.push(config.image);

    // Add custom command if provided
    if (config.command && config.command.length > 0) {
      args.push(...config.command);
    }

    return args;
  }

  /**
   * Get Docker container ID by name
   *
   * @param name Full Docker container name; it is used inside an anchored
   *   name filter (`^name$`) over all containers, running or not.
   * @returns The container id, or null when nothing matched or the call failed.
   */
  private async getContainerIdByName(name: string): Promise<string | null> {
    try {
      const { stdout } = await execAsync(
        `docker ps -a --filter "name=^${name}$" --format "{{.ID}}"`,
        { timeout: 5000 },
      );
      return stdout.trim() || null;
    } catch {
      return null;
    }
  }

  /**
   * Check if a container is running
   *
   * @param dockerId Docker container id.
   * @returns true only when `docker inspect` reports `State.Running` as true;
   *   false otherwise, including when the call fails.
   */
  private async isContainerRunning(dockerId: string): Promise<boolean> {
    try {
      const { stdout } = await execAsync(
        `docker inspect --format='{{.State.Running}}' ${dockerId}`,
        { timeout: 5000 },
      );
      return stdout.trim() === 'true';
    } catch {
      return false;
    }
  }

  /**
   * Wait for container to be healthy
   *
   * Polls `docker inspect` every two seconds until the container reports
   * `healthy`, reports `unhealthy`, or the timeout elapses. When the health
   * status cannot be read (the inspect command falls back to printing
   * `none`), a running container is treated as healthy.
   *
   * @param containerName Full Docker container name.
   * @param timeout Maximum time to wait in milliseconds.
   * @returns true when healthy (or running without a readable health status),
   *   false when unhealthy or timed out.
   */
  private async waitForHealth(
    containerName: string,
    timeout: number = TIMING.CONTAINER_STARTUP_TIMEOUT_MS,
  ): Promise<boolean> {
    const startTime = Date.now();
    const checkInterval = 2000;

    while (Date.now() - startTime < timeout) {
      try {
        const { stdout } = await execAsync(
          `docker inspect --format='{{.State.Health.Status}}' ${containerName} 2>/dev/null || echo "none"`,
          { timeout: 5000 },
        );
        const status = stdout.trim();

        if (status === 'healthy') {
          return true;
        }

        if (status === 'none') {
          // Container has no health check, assume healthy if running
          const { stdout: running } = await execAsync(
            `docker inspect --format='{{.State.Running}}' ${containerName}`,
            { timeout: 5000 },
          );
          if (running.trim() === 'true') {
            return true;
          }
        }

        if (status === 'unhealthy') {
          logger.warn(`Container ${containerName} is unhealthy`);
          return false;
        }
      } catch {
        // Container might not be ready yet
      }

      await this.sleep(checkInterval);
    }

    logger.warn(`Timeout waiting for container ${containerName} to be healthy`);
    return false;
  }

  /**
   * Sleep helper
   *
   * @param ms Delay in milliseconds.
   */
  private async sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Turn a caught value into a loggable message.
   *
   * @param error Value caught in a `catch` clause.
   * @returns `error.message` for Error instances, `String(error)` otherwise.
   */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}