2026-01-30 03:16:57 +00:00
|
|
|
/**
|
|
|
|
|
* ModelGrid Daemon
|
|
|
|
|
*
|
|
|
|
|
* Background process for managing containers and serving the API.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
import process from 'node:process';
|
|
|
|
|
import { logger } from './logger.ts';
|
|
|
|
|
import { TIMING } from './constants.ts';
|
|
|
|
|
import type { ModelGrid } from './modelgrid.ts';
|
|
|
|
|
import { ApiServer } from './api/server.ts';
|
|
|
|
|
import type { IModelGridConfig } from './interfaces/config.ts';
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* ModelGrid Daemon
|
|
|
|
|
*/
|
|
|
|
|
export class Daemon {
|
|
|
|
|
/** ModelGrid instance whose services (config, containers, cluster) this daemon drives. */
private readonly modelgrid: ModelGrid;

/** Set once startup completes; cleared by stop() or a failed start. Gates the monitoring loop. */
private isRunning = false;

/** API server instance; assigned by startApiServer(). */
private apiServer?: ApiServer;

/**
 * Create a daemon bound to the given ModelGrid instance.
 *
 * @param modelgrid ModelGrid instance that start() initializes and stop() shuts down.
 */
constructor(modelgrid: ModelGrid) {
  this.modelgrid = modelgrid;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Start the daemon.
 *
 * Initializes ModelGrid, loads the configuration, then brings up the API
 * server, the containers, the preloaded models and the cluster state before
 * entering the monitoring loop. The returned promise stays pending for as
 * long as that loop runs.
 *
 * Calling this while the daemon is already running only logs a warning.
 * Any error thrown during the sequence is logged as a start failure and the
 * process exits with code 1.
 */
public async start(): Promise<void> {
  if (this.isRunning) {
    logger.warn('Daemon is already running');
    return;
  }

  logger.log('Starting ModelGrid daemon...');

  try {
    await this.modelgrid.initialize();

    const loadedConfig = this.modelgrid.getConfig();
    if (!loadedConfig) {
      throw new Error('Failed to load configuration');
    }
    this.logConfigLoaded(loadedConfig);

    // Bring-up order: API server, then containers, then model preloading.
    await this.startApiServer(loadedConfig);
    await this.startContainers();
    await this.preloadModels(loadedConfig);

    // Sync cluster state, reconcile desired replicas, then sync once more.
    await this.syncClusterState(loadedConfig);
    await this.modelgrid.getClusterCoordinator().reconcileDesiredReplicas();
    await this.syncClusterState(loadedConfig);

    this.setupSignalHandlers();
    this.isRunning = true;

    // Runs until isRunning is cleared by stop().
    await this.monitor();
  } catch (err) {
    this.isRunning = false;
    const reason = err instanceof Error ? err.message : String(err);
    logger.error(`Daemon failed to start: ${reason}`);
    process.exit(1);
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Stop the daemon.
 *
 * Does nothing when the daemon is not running. Otherwise clears the running
 * flag (which ends the monitoring loop), stops the API server if one was
 * created, and shuts ModelGrid down.
 *
 * ModelGrid shutdown is attempted even when stopping the API server throws,
 * so a failing API server cannot leave containers running. The error still
 * propagates to the caller; if both steps throw, the shutdown error wins.
 */
public async stop(): Promise<void> {
  if (!this.isRunning) {
    return;
  }

  logger.log('Stopping ModelGrid daemon...');
  this.isRunning = false;

  try {
    // Stop API server
    if (this.apiServer) {
      await this.apiServer.stop();
    }
  } finally {
    // Shutdown ModelGrid (stops containers) regardless of the API server outcome.
    await this.modelgrid.shutdown();
  }

  logger.success('ModelGrid daemon stopped');
}
|
|
|
|
|
|
|
|
|
|
/**
 * Create the API server from the API section of the configuration and the
 * ModelGrid services, store it on the daemon, and start it.
 *
 * @param config Loaded configuration; only `config.api` is read here.
 */
private async startApiServer(config: IModelGridConfig): Promise<void> {
  logger.info('Starting API server...');

  const grid = this.modelgrid;
  const server = new ApiServer(
    config.api,
    grid.getContainerManager(),
    grid.getModelRegistry(),
    grid.getModelLoader(),
    grid.getClusterCoordinator(),
  );

  // stop() and logPeriodicStatus() read this field.
  this.apiServer = server;
  await server.start();
}
|
|
|
|
|
|
|
|
|
|
/**
 * Start every container through the container manager, then wait (with the
 * default timeout) for them to report healthy.
 */
private async startContainers(): Promise<void> {
  logger.info('Starting containers...');
  await this.modelgrid.getContainerManager().startAll();

  logger.dim('Waiting for containers to become healthy...');
  await this.waitForContainersHealthy();
}
|
|
|
|
|
|
|
|
|
|
/**
 * Poll the container manager until all containers report healthy or the
 * timeout elapses.
 *
 * A timeout is not an error: it is logged as a warning and the method
 * returns normally.
 *
 * @param timeout Maximum time to keep polling, in milliseconds.
 */
private async waitForContainersHealthy(timeout: number = 60000): Promise<void> {
  const pollDelayMs = 5000;
  const deadline = Date.now() + timeout;
  const manager = this.modelgrid.getContainerManager();

  while (Date.now() < deadline) {
    if (await manager.checkAllHealth()) {
      logger.success('All containers are healthy');
      return;
    }

    await this.sleep(pollDelayMs);
  }

  logger.warn('Timeout waiting for containers to become healthy');
}
|
|
|
|
|
|
|
|
|
|
/**
 * Preload the models listed in `config.models.autoLoad`.
 *
 * Each model is requested from the cluster coordinator one at a time.
 * Preloading is best-effort: a falsy result or a thrown error for one model
 * is recorded as a failure and reported as a warning, and the remaining
 * models are still attempted. Nothing is thrown for per-model failures.
 *
 * @param config Loaded configuration; returns immediately when `autoLoad`
 *   is missing or empty.
 */
private async preloadModels(config: IModelGridConfig): Promise<void> {
  const autoLoad = config.models.autoLoad;
  if (!autoLoad || autoLoad.length === 0) {
    return;
  }

  logger.info(`Preloading ${autoLoad.length} model(s)...`);

  const clusterCoordinator = this.modelgrid.getClusterCoordinator();
  // Keyed by model name, so a name listed twice yields a single result entry.
  const results = new Map<string, { success: boolean; error?: string }>();

  for (const modelName of autoLoad) {
    try {
      const ensured = await clusterCoordinator.ensureModel(modelName);
      results.set(modelName, {
        success: !!ensured,
        error: ensured ? undefined : 'Failed to schedule preload',
      });
    } catch (error) {
      // A throwing preload must not abort daemon startup; record it like any other failure.
      results.set(modelName, {
        success: false,
        error: error instanceof Error ? error.message : String(error),
      });
    }
  }

  let loaded = 0;
  let failed = 0;

  for (const [name, result] of results) {
    if (result.success) {
      loaded++;
      logger.dim(`  ✓ ${name}`);
    } else {
      failed++;
      logger.warn(`  ✗ ${name}: ${result.error}`);
    }
  }

  if (failed > 0) {
    // Use the de-duplicated count so that loaded + failed equals the denominator.
    logger.warn(`Preloaded ${loaded}/${results.size} models (${failed} failed)`);
  } else {
    logger.success(`Preloaded ${loaded} model(s)`);
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Register SIGINT and SIGTERM handlers that stop the daemon and exit.
 *
 * The first signal triggers stop() and then exits with code 0, or with
 * code 1 when stop() fails. Signals received while that shutdown is still
 * in progress are ignored, so a repeated signal cannot terminate the
 * process before stop() has finished.
 */
private setupSignalHandlers(): void {
  let shuttingDown = false;

  const shutdown = async (): Promise<void> => {
    if (shuttingDown) {
      // Without this guard a second signal would find isRunning already
      // false, skip stop() and exit immediately, cutting off the first shutdown.
      logger.warn('Shutdown already in progress');
      return;
    }
    shuttingDown = true;

    logger.log('');
    logger.log('Received shutdown signal');

    let exitCode = 0;
    try {
      await this.stop();
    } catch (error) {
      logger.error(
        `Shutdown failed: ${error instanceof Error ? error.message : String(error)}`,
      );
      exitCode = 1;
    }
    process.exit(exitCode);
  };

  // Signal listeners ignore return values; `void` marks the promise as intentionally unawaited.
  const onSignal = (): void => {
    void shutdown();
  };

  process.on('SIGINT', onSignal);
  process.on('SIGTERM', onSignal);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Main monitoring loop.
 *
 * While the daemon is running, each pass checks container health, syncs
 * cluster state, reconciles desired replicas, syncs again and logs a status
 * line, then sleeps for the check interval. An error in a pass is logged and
 * the loop carries on after the same sleep.
 *
 * The interval is read once on entry: the configured `checkInterval`, or
 * `TIMING.CHECK_INTERVAL_MS` when that value is missing or falsy.
 */
private async monitor(): Promise<void> {
  logger.log('Starting monitoring loop...');

  const intervalMs = this.modelgrid.getConfig()?.checkInterval || TIMING.CHECK_INTERVAL_MS;

  while (this.isRunning) {
    try {
      await this.checkContainerHealth();

      await this.syncClusterState();
      await this.modelgrid.getClusterCoordinator().reconcileDesiredReplicas();
      await this.syncClusterState();

      this.logPeriodicStatus();
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      logger.error(`Monitor error: ${reason}`);
    }

    // Same pause whether the pass succeeded or failed.
    await this.sleep(intervalMs);
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Restart every container that is running but reports an `unhealthy` state.
 *
 * Statuses come from the container manager. Containers are handled one after
 * another, and an id the manager cannot resolve to a container is skipped.
 */
private async checkContainerHealth(): Promise<void> {
  const manager = this.modelgrid.getContainerManager();
  const statusById = await manager.getAllStatus();

  for (const [containerId, state] of statusById) {
    const needsRestart = state.running && state.health === 'unhealthy';
    if (!needsRestart) {
      continue;
    }

    logger.warn(`Container ${containerId} is unhealthy, attempting restart...`);

    const target = manager.getContainer(containerId);
    if (target) {
      await target.restart();
    }
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Log a one-line API server status (host, port, uptime).
 *
 * Logs nothing when no API server has been created or when the server
 * reports that it is not running.
 */
private logPeriodicStatus(): void {
  if (!this.apiServer) {
    return;
  }

  const serverInfo = this.apiServer.getInfo();
  if (!serverInfo.running) {
    return;
  }

  logger.dim(
    `API server running on ${serverInfo.host}:${serverInfo.port} (uptime: ${serverInfo.uptime}s)`,
  );
}
|
|
|
|
|
|
2026-04-20 23:00:50 +00:00
|
|
|
/**
 * Push this node's state to the cluster coordinator and send a heartbeat.
 *
 * The advertised URL is `cluster.advertiseUrl` when set, otherwise
 * `http://127.0.0.1:<api.port>`. Returns without doing anything when no
 * configuration is available.
 *
 * @param config Configuration to use; when omitted, the current ModelGrid
 *   configuration is used.
 */
private async syncClusterState(config?: IModelGridConfig): Promise<void> {
  const activeConfig = config || this.modelgrid.getConfig();
  if (!activeConfig) {
    return;
  }

  let advertiseUrl = activeConfig.cluster.advertiseUrl;
  if (!advertiseUrl) {
    // No explicit URL configured: advertise the local API port on loopback.
    advertiseUrl = `http://127.0.0.1:${activeConfig.api.port}`;
  }

  const clusterCoordinator = this.modelgrid.getClusterCoordinator();
  await clusterCoordinator.syncLocalState(advertiseUrl);
  await clusterCoordinator.sendHeartbeat();
}
|
|
|
|
|
|
2026-01-30 03:16:57 +00:00
|
|
|
/**
 * Log a boxed summary of the loaded configuration.
 *
 * The check interval shown is the effective one: the same fallback to
 * `TIMING.CHECK_INTERVAL_MS` that monitor() applies is used here, so the
 * banner matches the interval the monitoring loop actually sleeps for.
 *
 * @param config Loaded configuration to summarize.
 */
private logConfigLoaded(config: IModelGridConfig): void {
  // Mirror monitor(): a missing or falsy interval means the default is used.
  const checkIntervalMs = config.checkInterval || TIMING.CHECK_INTERVAL_MS;

  logger.log('');
  logger.logBoxTitle('Configuration Loaded', 60, 'success');
  logger.logBoxLine(`API Port: ${config.api.port}`);
  logger.logBoxLine(`Deployments: ${config.containers.length}`);
  logger.logBoxLine(`Auto-deploy: ${config.models.autoDeploy ? 'Enabled' : 'Disabled'}`);
  logger.logBoxLine(`Registry: ${config.models.registryUrl}`);
  logger.logBoxLine(`Cluster Mode: ${config.cluster.role}`);
  logger.logBoxLine(`Check Interval: ${checkIntervalMs / 1000}s`);
  logger.logBoxEnd();
  logger.log('');
}
|
|
|
|
|
|
|
|
|
|
/**
 * Return a promise that resolves after the given delay.
 *
 * @param ms Delay in milliseconds, passed straight to `setTimeout`.
 */
private sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
|
|
|
}
|