refactor(health): share health snapshot computation
This commit is contained in:
+8
-39
@@ -16,6 +16,7 @@ import { ModelRegistry } from '../models/registry.ts';
|
|||||||
import { ModelLoader } from '../models/loader.ts';
|
import { ModelLoader } from '../models/loader.ts';
|
||||||
import { GpuDetector } from '../hardware/gpu-detector.ts';
|
import { GpuDetector } from '../hardware/gpu-detector.ts';
|
||||||
import { ClusterHandler } from './handlers/cluster.ts';
|
import { ClusterHandler } from './handlers/cluster.ts';
|
||||||
|
import { buildHealthSnapshot } from '../helpers/health.ts';
|
||||||
|
|
||||||
interface IApiServerOptions {
|
interface IApiServerOptions {
|
||||||
gpuDetector?: GpuDetector;
|
gpuDetector?: GpuDetector;
|
||||||
@@ -209,47 +210,15 @@ export class ApiServer {
|
|||||||
const gpus = await this.gpuDetector.detectGpus();
|
const gpus = await this.gpuDetector.detectGpus();
|
||||||
const models = await this.containerManager.getAllAvailableModels();
|
const models = await this.containerManager.getAllAvailableModels();
|
||||||
|
|
||||||
let status: 'ok' | 'degraded' | 'error' = 'ok';
|
const response: IHealthResponse = buildHealthSnapshot({
|
||||||
const reasons = new Set<'unhealthy_container' | 'no_models_available' | 'gpu_detection_failed'>();
|
statuses,
|
||||||
const containerHealth: Record<string, 'healthy' | 'unhealthy'> = {};
|
modelCount: models.size,
|
||||||
const gpuStatus: Record<string, 'available' | 'in_use' | 'error'> = {};
|
gpus,
|
||||||
|
startTime: this.startTime,
|
||||||
// Check container health
|
|
||||||
for (const [id, containerStatus] of statuses) {
|
|
||||||
if (containerStatus.running && containerStatus.health === 'healthy') {
|
|
||||||
containerHealth[id] = 'healthy';
|
|
||||||
} else {
|
|
||||||
containerHealth[id] = 'unhealthy';
|
|
||||||
status = 'degraded';
|
|
||||||
reasons.add('unhealthy_container');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check GPU status
|
|
||||||
for (const gpu of gpus) {
|
|
||||||
gpuStatus[gpu.id] = 'available';
|
|
||||||
}
|
|
||||||
|
|
||||||
if (models.size === 0) {
|
|
||||||
status = 'degraded';
|
|
||||||
reasons.add('no_models_available');
|
|
||||||
}
|
|
||||||
|
|
||||||
const response: IHealthResponse = {
|
|
||||||
status,
|
|
||||||
reasons: Array.from(reasons),
|
|
||||||
version: VERSION,
|
version: VERSION,
|
||||||
uptime: Math.floor((Date.now() - this.startTime) / 1000),
|
});
|
||||||
containers: statuses.size,
|
|
||||||
models: models.size,
|
|
||||||
gpus: gpus.length,
|
|
||||||
details: {
|
|
||||||
containers: containerHealth,
|
|
||||||
gpus: gpuStatus,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
res.writeHead(status === 'ok' ? 200 : 503, { 'Content-Type': 'application/json' });
|
res.writeHead(response.status === 'ok' ? 200 : 503, { 'Content-Type': 'application/json' });
|
||||||
res.end(JSON.stringify(response, null, 2));
|
res.end(JSON.stringify(response, null, 2));
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
res.writeHead(500, { 'Content-Type': 'application/json' });
|
res.writeHead(500, { 'Content-Type': 'application/json' });
|
||||||
|
|||||||
@@ -0,0 +1,49 @@
|
|||||||
|
import type { IHealthResponse } from '../interfaces/api.ts';
|
||||||
|
import type { IContainerStatus } from '../interfaces/container.ts';
|
||||||
|
import type { IGpuInfo } from '../interfaces/gpu.ts';
|
||||||
|
|
||||||
|
/**
 * Builds a health report from container statuses, the number of available
 * models and the detected GPUs.
 *
 * The overall status starts as `'ok'` and becomes `'degraded'` when at least
 * one container is not both running and reporting `health === 'healthy'`, or
 * when `modelCount` is zero. Each cause is listed once in `reasons`, in the
 * order the checks run (containers first, then models).
 *
 * @param options.statuses Container statuses keyed by container id.
 * @param options.modelCount Number of models currently available.
 * @param options.gpus Detected GPUs; each one is reported as `'available'`.
 * @param options.startTime Start timestamp in milliseconds, compared against
 *   `Date.now()` to compute the uptime.
 * @param options.version Version string copied into the report unchanged.
 * @returns The health response, with `uptime` in whole seconds (floored).
 */
export function buildHealthSnapshot(options: {
  statuses: ReadonlyMap<string, IContainerStatus>;
  modelCount: number;
  gpus: readonly IGpuInfo[];
  startTime: number;
  version: string;
}): IHealthResponse {
  const { statuses, modelCount, gpus, startTime, version } = options;

  const reasons = new Set<'unhealthy_container' | 'no_models_available' | 'gpu_detection_failed'>();
  const containerHealth: Record<string, 'healthy' | 'unhealthy'> = {};
  const gpuStatus: Record<string, 'available' | 'in_use' | 'error'> = {};

  // A container counts as healthy only when it is running AND reports 'healthy'.
  for (const [id, containerStatus] of statuses) {
    if (containerStatus.running && containerStatus.health === 'healthy') {
      containerHealth[id] = 'healthy';
    } else {
      containerHealth[id] = 'unhealthy';
      reasons.add('unhealthy_container');
    }
  }

  // Every detected GPU is reported as available; no per-GPU probing happens here.
  for (const gpu of gpus) {
    gpuStatus[gpu.id] = 'available';
  }

  if (modelCount === 0) {
    reasons.add('no_models_available');
  }

  // Any recorded reason degrades the overall status; 'error' is never produced here.
  const status: 'ok' | 'degraded' | 'error' = reasons.size > 0 ? 'degraded' : 'ok';

  return {
    status,
    reasons: Array.from(reasons),
    version,
    uptime: Math.floor((Date.now() - startTime) / 1000),
    containers: statuses.size,
    models: modelCount,
    gpus: gpus.length,
    details: {
      containers: containerHealth,
      gpus: gpuStatus,
    },
  };
}
|
||||||
+7
-35
@@ -22,6 +22,7 @@ import { VERSION } from '../constants.ts';
|
|||||||
import type { ContainerManager } from '../containers/container-manager.ts';
|
import type { ContainerManager } from '../containers/container-manager.ts';
|
||||||
import type { ClusterManager } from '../cluster/cluster-manager.ts';
|
import type { ClusterManager } from '../cluster/cluster-manager.ts';
|
||||||
import { GpuDetector } from '../hardware/gpu-detector.ts';
|
import { GpuDetector } from '../hardware/gpu-detector.ts';
|
||||||
|
import { buildHealthSnapshot } from '../helpers/health.ts';
|
||||||
|
|
||||||
interface IBundledFile {
|
interface IBundledFile {
|
||||||
path: string;
|
path: string;
|
||||||
@@ -150,42 +151,13 @@ export class UiServer {
|
|||||||
const models = await this.containerManager.getAllAvailableModels();
|
const models = await this.containerManager.getAllAvailableModels();
|
||||||
const gpus = await this.gpuDetector.detectGpus();
|
const gpus = await this.gpuDetector.detectGpus();
|
||||||
|
|
||||||
let status: 'ok' | 'degraded' | 'error' = 'ok';
|
const health: IHealthResponse = buildHealthSnapshot({
|
||||||
const reasons = new Set<'unhealthy_container' | 'no_models_available' | 'gpu_detection_failed'>();
|
statuses,
|
||||||
const containerHealth: Record<string, 'healthy' | 'unhealthy'> = {};
|
modelCount: models.size,
|
||||||
const gpuStatus: Record<string, 'available' | 'in_use' | 'error'> = {};
|
gpus,
|
||||||
|
startTime: this.startTime,
|
||||||
for (const [id, s] of statuses) {
|
|
||||||
if (s.running && s.health === 'healthy') {
|
|
||||||
containerHealth[id] = 'healthy';
|
|
||||||
} else {
|
|
||||||
containerHealth[id] = 'unhealthy';
|
|
||||||
status = 'degraded';
|
|
||||||
reasons.add('unhealthy_container');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (const gpu of gpus) {
|
|
||||||
gpuStatus[gpu.id] = 'available';
|
|
||||||
}
|
|
||||||
|
|
||||||
if (models.size === 0) {
|
|
||||||
status = 'degraded';
|
|
||||||
reasons.add('no_models_available');
|
|
||||||
}
|
|
||||||
|
|
||||||
const health: IHealthResponse = {
|
|
||||||
status,
|
|
||||||
reasons: Array.from(reasons),
|
|
||||||
version: VERSION,
|
version: VERSION,
|
||||||
uptime: Math.floor((Date.now() - this.startTime) / 1000),
|
});
|
||||||
containers: statuses.size,
|
|
||||||
models: models.size,
|
|
||||||
gpus: gpus.length,
|
|
||||||
details: {
|
|
||||||
containers: containerHealth,
|
|
||||||
gpus: gpuStatus,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
const clusterConfig = this.clusterManager.getConfig();
|
const clusterConfig = this.clusterManager.getConfig();
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user