initial
This commit is contained in:
558
ts/docker/container-runtime.ts
Normal file
558
ts/docker/container-runtime.ts
Normal file
@@ -0,0 +1,558 @@
|
||||
/**
 * Container Runtime
 *
 * Manages individual Docker containers for AI model serving.
 */
|
||||
|
||||
import { exec, spawn } from 'node:child_process';
|
||||
import { promisify } from 'node:util';
|
||||
import type {
|
||||
IContainerConfig,
|
||||
IContainerStatus,
|
||||
TContainerHealth,
|
||||
TContainerRunStatus,
|
||||
} from '../interfaces/container.ts';
|
||||
import { logger } from '../logger.ts';
|
||||
import { DOCKER, TIMING } from '../constants.ts';
|
||||
import { DriverManager } from '../drivers/driver-manager.ts';
|
||||
|
||||
// Promise-returning form of child_process.exec: resolves with { stdout, stderr }
// and rejects when the command fails or exceeds the `timeout` option.
const execAsync = promisify(exec);
|
||||
|
||||
/**
 * Outcome of running a command inside a container.
 */
export interface IContainerExecResult {
  /** True when the command completed without the underlying call rejecting. */
  success: boolean;
  /** Captured standard output, when any was available. */
  output?: string;
  /** Captured standard error, or an error message when the call failed. */
  error?: string;
  /** Exit code reported for a failed command, when available. */
  exitCode?: number;
}
|
||||
|
||||
/**
 * Options accepted when reading container logs.
 */
export interface ILogsOptions {
  /** Number of trailing log lines to return (getLogs defaults this to 100). */
  lines?: number;
  /**
   * Whether to keep streaming new output.
   * NOTE(review): getLogs in this file does not read this flag; streaming is
   * done through followLogs instead — confirm whether other callers use it.
   */
  follow?: boolean;
  /** Whether each line should carry a timestamp (getLogs defaults this to false). */
  timestamps?: boolean;
}
|
||||
|
||||
/**
 * Container Runtime class - manages individual containers
 *
 * Every container handled here is named `modelgrid-<id>` and is driven through
 * the `docker` CLI. Most operations report success as a boolean and log
 * failures instead of throwing.
 */
export class ContainerRuntime {
  /**
   * Byte multipliers for the size suffixes that can appear in
   * `docker stats` memory output (binary and decimal variants).
   */
  private static readonly BYTES_PER_UNIT: Readonly<Record<string, number>> = {
    b: 1,
    kb: 1e3,
    mb: 1e6,
    gb: 1e9,
    tb: 1e12,
    kib: 1024,
    mib: 1024 ** 2,
    gib: 1024 ** 3,
    tib: 1024 ** 4,
  };

  private driverManager: DriverManager;

  constructor() {
    this.driverManager = new DriverManager();
  }

  /**
   * Derive the Docker container name used for a given container id.
   */
  private static containerNameFor(id: string): string {
    return `modelgrid-${id}`;
  }

  /**
   * Turn an arbitrary thrown value into a printable message.
   */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Quote a value for safe inclusion in a POSIX shell command line.
   *
   * Values made only of characters that are never special to the shell are
   * returned unchanged, so simple configurations produce the same command
   * line as before. Anything else is wrapped in single quotes, with embedded
   * single quotes escaped.
   */
  private static shellQuote(value: string): string {
    if (/^[A-Za-z0-9_@%+=:,./-]+$/.test(value)) {
      return value;
    }
    return `'${value.replace(/'/g, `'\\''`)}'`;
  }

  /**
   * Convert the memory column of `docker stats` (e.g. "1.5GiB / 16GiB") into
   * mebibytes.
   *
   * Only the usage part (left of the slash) is considered, so an unrecognised
   * usage unit can never cause the limit to be reported as the usage.
   *
   * @returns The usage in MiB (unrounded), or 0 when it cannot be parsed.
   */
  private static parseMemoryToMiB(memUsage: string): number {
    const [usagePart = ''] = memUsage.split('/');
    const match = /^([\d.]+)\s*([KMGT]?i?B)$/i.exec(usagePart.trim());
    if (!match) {
      return 0;
    }

    const [, rawValue = '', unit = ''] = match;
    const value = parseFloat(rawValue);
    const bytesPerUnit = ContainerRuntime.BYTES_PER_UNIT[unit.toLowerCase()];
    if (Number.isNaN(value) || bytesPerUnit === undefined) {
      return 0;
    }

    return (value * bytesPerUnit) / (1024 ** 2);
  }

  /**
   * Start a container with the given configuration
   *
   * If a container with the derived name already exists it is reused: a
   * running one is left alone, a stopped one is started. When starting the
   * existing container fails, it is removed and a new one is created with
   * `docker run`.
   *
   * @param config Container configuration to start.
   * @returns True when the container is running or was started; false when
   *   `docker run` failed.
   */
  public async startContainer(config: IContainerConfig): Promise<boolean> {
    const containerName = ContainerRuntime.containerNameFor(config.id);

    // Check if container already exists
    const existingId = await this.getContainerIdByName(containerName);
    if (existingId) {
      // Check if it's running
      if (await this.isContainerRunning(existingId)) {
        logger.dim(`Container ${containerName} is already running`);
        return true;
      }

      // Start existing container
      try {
        await execAsync(`docker start ${existingId}`, {
          timeout: TIMING.CONTAINER_STARTUP_TIMEOUT_MS,
        });
        logger.success(`Started existing container: ${containerName}`);
        return true;
      } catch (error) {
        logger.error(`Failed to start existing container: ${ContainerRuntime.describeError(error)}`);
        // Try to remove and recreate
        await this.removeContainer(config.id);
      }
    }

    // Build docker run command
    const args = await this.buildRunArgs(config);
    const cmd = `docker run ${args.join(' ')}`;

    logger.info(`Starting container: ${containerName}`);
    logger.dim(`Command: ${cmd}`);

    try {
      await execAsync(cmd, { timeout: TIMING.CONTAINER_STARTUP_TIMEOUT_MS });
      logger.success(`Container ${containerName} started`);

      // Wait for container to be healthy. The outcome is intentionally not
      // propagated: waitForHealth logs a warning itself, and the container
      // has been created and started at this point.
      await this.waitForHealth(containerName);

      return true;
    } catch (error) {
      logger.error(`Failed to start container: ${ContainerRuntime.describeError(error)}`);
      return false;
    }
  }

  /**
   * Stop a container
   *
   * @param containerId Container id (without the `modelgrid-` prefix).
   * @param timeout Grace period in seconds passed to `docker stop -t`.
   * @returns True when the container was stopped or does not exist.
   */
  public async stopContainer(containerId: string, timeout: number = 30): Promise<boolean> {
    const containerName = ContainerRuntime.containerNameFor(containerId);

    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        logger.dim(`Container ${containerName} not found`);
        return true;
      }

      logger.info(`Stopping container: ${containerName}`);
      await execAsync(`docker stop -t ${timeout} ${dockerId}`, {
        // Give the CLI call 10 extra seconds beyond docker's own grace period.
        timeout: (timeout + 10) * 1000,
      });
      logger.success(`Container ${containerName} stopped`);
      return true;
    } catch (error) {
      logger.error(`Failed to stop container: ${ContainerRuntime.describeError(error)}`);
      return false;
    }
  }

  /**
   * Remove a container
   *
   * @param containerId Container id (without the `modelgrid-` prefix).
   * @param force When true, `-f` is passed to `docker rm`.
   * @returns True when the container was removed or does not exist.
   */
  public async removeContainer(containerId: string, force: boolean = true): Promise<boolean> {
    const containerName = ContainerRuntime.containerNameFor(containerId);

    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        return true;
      }

      const forceFlag = force ? '-f' : '';
      await execAsync(`docker rm ${forceFlag} ${dockerId}`, { timeout: 30000 });
      logger.success(`Container ${containerName} removed`);
      return true;
    } catch (error) {
      logger.error(`Failed to remove container: ${ContainerRuntime.describeError(error)}`);
      return false;
    }
  }

  /**
   * Restart a container
   *
   * @param containerId Container id (without the `modelgrid-` prefix).
   * @returns True on success; false when the container is missing or the
   *   restart failed.
   */
  public async restartContainer(containerId: string): Promise<boolean> {
    const containerName = ContainerRuntime.containerNameFor(containerId);

    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        logger.error(`Container ${containerName} not found`);
        return false;
      }

      await execAsync(`docker restart ${dockerId}`, {
        timeout: TIMING.CONTAINER_STARTUP_TIMEOUT_MS,
      });
      logger.success(`Container ${containerName} restarted`);
      return true;
    } catch (error) {
      logger.error(`Failed to restart container: ${ContainerRuntime.describeError(error)}`);
      return false;
    }
  }

  /**
   * Get container status
   *
   * Starts from a "stopped / unknown health" baseline derived from the
   * configuration and enriches it with `docker inspect`, loaded models and
   * resource usage. Errors while querying Docker are logged and whatever was
   * collected up to that point is returned.
   *
   * @param config Container configuration to report on.
   * @returns The assembled status; never rejects because of Docker errors.
   */
  public async getContainerStatus(config: IContainerConfig): Promise<IContainerStatus> {
    const containerName = ContainerRuntime.containerNameFor(config.id);

    const status: IContainerStatus = {
      id: config.id,
      name: config.name,
      type: config.type,
      running: false,
      runStatus: 'stopped',
      health: 'unknown',
      loadedModels: [],
      assignedGpus: config.gpuIds,
      endpoint: `http://localhost:${config.externalPort || config.port}`,
    };

    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        return status;
      }

      status.dockerId = dockerId;

      // Get container info
      const { stdout } = await execAsync(
        `docker inspect --format='{{json .}}' ${dockerId}`,
        { timeout: 5000 },
      );

      const info = JSON.parse(stdout);
      const state = info.State;

      // Get run status
      status.running = state.Running === true;
      if (state.Running) {
        status.runStatus = 'running';
      } else if (state.Restarting) {
        status.runStatus = 'starting';
      } else if (state.ExitCode !== 0) {
        status.runStatus = 'error';
        status.lastError = state.Error || `Exit code: ${state.ExitCode}`;
      } else {
        status.runStatus = 'stopped';
      }

      // Get health status
      if (state.Health) {
        status.health = state.Health.Status as TContainerHealth;
        if (state.Health.Log && state.Health.Log.length > 0) {
          const lastLog = state.Health.Log[state.Health.Log.length - 1];
          if (lastLog.Output) {
            // Keep the message short; only the first 200 characters are kept.
            status.healthMessage = lastLog.Output.substring(0, 200);
          }
        }
      }

      // Get uptime
      if (state.StartedAt) {
        const startTime = new Date(state.StartedAt).getTime();
        status.startTime = startTime;
        if (status.running) {
          status.uptime = Math.floor((Date.now() - startTime) / 1000);
        }
      }

      // Try to get loaded models from container
      if (status.running) {
        status.loadedModels = await this.getLoadedModels(config);
      }

      // Get resource usage
      const stats = await this.getContainerStats(dockerId);
      if (stats) {
        status.memoryUsage = stats.memoryUsage;
        status.cpuUsage = stats.cpuUsage;
      }
    } catch (error) {
      logger.dim(`Error getting container status: ${ContainerRuntime.describeError(error)}`);
    }

    return status;
  }

  /**
   * Get container resource stats
   *
   * @param dockerId Docker container id to sample.
   * @returns Memory usage in whole MiB and CPU usage in percent, or null when
   *   the stats could not be read.
   */
  private async getContainerStats(
    dockerId: string,
  ): Promise<{ memoryUsage: number; cpuUsage: number } | null> {
    try {
      const { stdout } = await execAsync(
        `docker stats ${dockerId} --no-stream --format "{{.MemUsage}},{{.CPUPerc}}"`,
        { timeout: 5000 },
      );

      const [memStr, cpuStr] = stdout.trim().split(',');
      if (memStr === undefined || cpuStr === undefined) {
        // Output did not have the expected "<mem>,<cpu>" shape.
        return null;
      }

      // Parse memory (e.g., "1.5GiB / 16GiB")
      const memoryUsage = ContainerRuntime.parseMemoryToMiB(memStr);

      // Parse CPU (e.g., "25.50%")
      const cpuUsage = parseFloat(cpuStr.replace('%', '')) || 0;

      return { memoryUsage: Math.round(memoryUsage), cpuUsage };
    } catch {
      return null;
    }
  }

  /**
   * Get loaded models from a container
   *
   * For `ollama` containers the model list is fetched from the Ollama HTTP
   * API via `curl` executed inside the container; for `vllm` and `tgi` the
   * configured models are returned. Any failure yields an empty list.
   *
   * NOTE(review): `/api/tags` appears to list every locally available model
   * rather than only those loaded in memory — confirm against the Ollama API.
   */
  private async getLoadedModels(config: IContainerConfig): Promise<string[]> {
    const containerName = ContainerRuntime.containerNameFor(config.id);

    try {
      switch (config.type) {
        case 'ollama': {
          // Query Ollama API for loaded models
          const { stdout } = await execAsync(
            `docker exec ${containerName} curl -s http://localhost:11434/api/tags`,
            { timeout: 5000 },
          );
          const data = JSON.parse(stdout);
          return (data.models || []).map((m: { name: string }) => m.name);
        }

        case 'vllm':
        case 'tgi': {
          // These typically serve a single model
          return config.models || [];
        }

        default:
          return [];
      }
    } catch {
      return [];
    }
  }

  /**
   * Execute a command inside a container
   *
   * The command string is appended verbatim to `docker exec <id>` and run
   * through the host shell, so callers must only pass trusted input.
   *
   * @param containerId Container id (without the `modelgrid-` prefix).
   * @param command Command line to run inside the container.
   * @param timeout Maximum run time in milliseconds.
   * @returns The captured output. `exitCode` is only set when the failure
   *   carried a numeric exit code (not for spawn errors or signal kills).
   */
  public async exec(
    containerId: string,
    command: string,
    timeout: number = 30000,
  ): Promise<IContainerExecResult> {
    const containerName = ContainerRuntime.containerNameFor(containerId);

    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        return { success: false, error: 'Container not found' };
      }

      const { stdout, stderr } = await execAsync(
        `docker exec ${dockerId} ${command}`,
        { timeout },
      );

      return {
        success: true,
        output: stdout,
        error: stderr || undefined,
      };
    } catch (error) {
      const err = error as { code?: unknown; stdout?: string; stderr?: string };
      return {
        success: false,
        output: err.stdout,
        error: err.stderr || ContainerRuntime.describeError(error),
        // `code` is a string such as 'ENOENT' for spawn failures and null
        // when the process was killed; only forward real exit codes.
        exitCode: typeof err.code === 'number' ? err.code : undefined,
      };
    }
  }

  /**
   * Get container logs
   *
   * @param containerId Container id (without the `modelgrid-` prefix).
   * @param options `lines` (default 100) and `timestamps` (default false).
   * @returns Combined stdout and stderr of `docker logs`, an empty string
   *   when the container does not exist, or the error message on failure.
   */
  public async getLogs(
    containerId: string,
    options: ILogsOptions = {},
  ): Promise<string> {
    const containerName = ContainerRuntime.containerNameFor(containerId);
    const { lines = 100, timestamps = false } = options;

    try {
      const dockerId = await this.getContainerIdByName(containerName);
      if (!dockerId) {
        return '';
      }

      const args = ['logs'];
      if (lines) args.push(`--tail=${lines}`);
      if (timestamps) args.push('--timestamps');
      args.push(dockerId);

      const { stdout, stderr } = await execAsync(
        `docker ${args.join(' ')}`,
        { timeout: 10000 },
      );

      return stdout + stderr;
    } catch (error) {
      return ContainerRuntime.describeError(error);
    }
  }

  /**
   * Follow container logs (returns a way to stop following)
   *
   * @param containerId Container id (without the `modelgrid-` prefix).
   * @param onData Called with each chunk read from the container's stdout
   *   or stderr stream.
   * @returns A handle whose `stop()` kills the underlying `docker logs -f`
   *   process.
   */
  public followLogs(
    containerId: string,
    onData: (data: string) => void,
  ): { stop: () => void } {
    const containerName = ContainerRuntime.containerNameFor(containerId);

    const child = spawn('docker', ['logs', '-f', containerName], {
      stdio: ['ignore', 'pipe', 'pipe'],
    });

    // Without a listener, a spawn failure (e.g. docker not on PATH) would
    // surface as an unhandled 'error' event and take the process down.
    child.on('error', (error) => {
      logger.error(`Failed to follow logs for ${containerName}: ${ContainerRuntime.describeError(error)}`);
    });

    child.stdout.on('data', (data) => onData(data.toString()));
    child.stderr.on('data', (data) => onData(data.toString()));

    return {
      stop: () => {
        child.kill();
      },
    };
  }

  /**
   * Build docker run arguments
   *
   * The returned tokens are joined with spaces into a shell command line, so
   * configuration-supplied values (limits, environment, volumes, image,
   * command) are shell-quoted when they contain anything beyond plain
   * characters. GPU arguments come from the driver manager and are passed
   * through untouched.
   *
   * @param config Container configuration to translate.
   * @returns Argument tokens to place after `docker run`.
   */
  private async buildRunArgs(config: IContainerConfig): Promise<string[]> {
    const quote = ContainerRuntime.shellQuote;
    const containerName = ContainerRuntime.containerNameFor(config.id);
    const args: string[] = [
      '-d', // Detached mode
      `--name=${containerName}`,
      `--network=${DOCKER.DEFAULT_NETWORK}`,
    ];

    // Port mapping
    const externalPort = config.externalPort || config.port;
    args.push(`-p ${externalPort}:${config.port}`);

    // Restart policy (omitted when unset, to avoid emitting "--restart=undefined")
    if (config.restartPolicy) {
      args.push(`--restart=${quote(String(config.restartPolicy))}`);
    }

    // Memory limit
    if (config.memoryLimit) {
      args.push(`--memory=${quote(String(config.memoryLimit))}`);
    }

    // CPU limit
    if (config.cpuLimit) {
      args.push(`--cpus=${quote(String(config.cpuLimit))}`);
    }

    // GPU support
    if (config.gpuIds && config.gpuIds.length > 0) {
      const gpuArgs = await this.driverManager.getDockerGpuArgs(config.gpuIds);
      args.push(...gpuArgs);
    }

    // Environment variables
    if (config.env) {
      for (const [key, value] of Object.entries(config.env)) {
        args.push(`-e ${quote(`${key}=${value}`)}`);
      }
    }

    // Volume mounts
    if (config.volumes) {
      for (const volume of config.volumes) {
        args.push(`-v ${quote(String(volume))}`);
      }
    }

    // Add image
    args.push(quote(String(config.image)));

    // Add custom command if provided
    if (config.command && config.command.length > 0) {
      for (const token of config.command) {
        args.push(quote(String(token)));
      }
    }

    return args;
  }

  /**
   * Get Docker container ID by name
   *
   * @param name Exact container name to look up (running or stopped).
   * @returns The Docker id, or null when no container matches or the lookup
   *   failed.
   */
  private async getContainerIdByName(name: string): Promise<string | null> {
    try {
      const { stdout } = await execAsync(
        `docker ps -a --filter "name=^${name}$" --format "{{.ID}}"`,
        { timeout: 5000 },
      );
      return stdout.trim() || null;
    } catch {
      return null;
    }
  }

  /**
   * Check if a container is running
   *
   * @param dockerId Docker container id or name.
   * @returns True only when `docker inspect` reports the container as
   *   running; false otherwise, including when the inspect call fails.
   */
  private async isContainerRunning(dockerId: string): Promise<boolean> {
    try {
      const { stdout } = await execAsync(
        `docker inspect --format='{{.State.Running}}' ${dockerId}`,
        { timeout: 5000 },
      );
      return stdout.trim() === 'true';
    } catch {
      return false;
    }
  }

  /**
   * Wait for container to be healthy
   *
   * Polls every two seconds until the container reports `healthy`, reports
   * `unhealthy`, or the timeout elapses. Containers without a health check
   * are treated as healthy as soon as they are running.
   *
   * @param containerName Docker container name to poll.
   * @param timeout Maximum time to wait in milliseconds.
   * @returns True when healthy (or running without a health check); false on
   *   `unhealthy` or timeout.
   */
  private async waitForHealth(
    containerName: string,
    timeout: number = TIMING.CONTAINER_STARTUP_TIMEOUT_MS,
  ): Promise<boolean> {
    const startTime = Date.now();
    const checkInterval = 2000;

    while (Date.now() - startTime < timeout) {
      try {
        // When inspect fails (e.g. no health section), the shell fallback
        // prints "none" instead.
        const { stdout } = await execAsync(
          `docker inspect --format='{{.State.Health.Status}}' ${containerName} 2>/dev/null || echo "none"`,
          { timeout: 5000 },
        );

        const status = stdout.trim();

        if (status === 'healthy') {
          return true;
        }

        if (status === 'none') {
          // Container has no health check, assume healthy if running
          const { stdout: running } = await execAsync(
            `docker inspect --format='{{.State.Running}}' ${containerName}`,
            { timeout: 5000 },
          );
          if (running.trim() === 'true') {
            return true;
          }
        }

        if (status === 'unhealthy') {
          logger.warn(`Container ${containerName} is unhealthy`);
          return false;
        }
      } catch {
        // Container might not be ready yet
      }

      await this.sleep(checkInterval);
    }

    logger.warn(`Timeout waiting for container ${containerName} to be healthy`);
    return false;
  }

  /**
   * Sleep helper
   *
   * @param ms Delay in milliseconds.
   */
  private async sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}
|
||||
Reference in New Issue
Block a user