feat(health): include degraded reasons in responses

2026-04-21 13:34:58 +00:00
parent 703cceb512
commit 9022c8dbf3
4 changed files with 84 additions and 1 deletions
@@ -64,6 +64,8 @@ Deno.test('ApiServer serves health metrics and authenticated model listings', as
    assertEquals(healthResponse.status, 200);
    assertEquals(healthBody.status, 'ok');
    assertEquals(healthBody.models, 1);
+    assertEquals(Array.isArray(healthBody.reasons), true);
+    assertEquals(healthBody.reasons.length, 0);
    assertEquals(typeof healthResponse.headers.get('x-request-id'), 'string');

    const metricsResponse = await fetch(`http://127.0.0.1:${port}/metrics`);
@@ -175,6 +177,66 @@ Deno.test('ApiServer metrics expose 5xx counts for failing endpoints', async ()
  }
 });

+Deno.test('ApiServer health reports degraded reasons', async () => {
+  // Scenario: one container that is not running and unhealthy, plus an empty model
+  // map. GET /health must answer 503 / 'degraded' and list both reason codes.
+  const port = 19300 + Math.floor(Math.random() * 1000);
+
+  // Stand-in for the container manager: a single container flagged 'unhealthy'
+  // and an empty map of available models.
+  const containerManagerStub = {
+    getAllStatus: async () =>
+      new Map([
+        ['vllm-1', { running: false, health: 'unhealthy' }],
+      ]),
+    getAllAvailableModels: async () => new Map(),
+  };
+
+  // Stub exposing getAllModels(); resolves to an empty list.
+  const emptyModelListStub = {
+    getAllModels: async () => [],
+  };
+
+  // Stub exposing getStatus(); reports no local node, nodes, models or deployments.
+  const emptyStatusStub = {
+    getStatus: () => ({
+      localNode: null,
+      nodes: [],
+      models: {},
+      desiredDeployments: [],
+    }),
+  };
+
+  // GPU detector stub that reports a single GPU.
+  const gpuDetectorStub = {
+    detectGpus: async () => [{ id: 'nvidia-0' }],
+  };
+
+  const server = new ApiServer(
+    { host: '127.0.0.1', port, apiKeys: ['valid-key'], cors: false, corsOrigins: [] },
+    containerManagerStub as never,
+    emptyModelListStub as never,
+    {} as never,
+    emptyStatusStub as never,
+    { gpuDetector: gpuDetectorStub as never },
+  );
+
+  await server.start();
+  try {
+    const healthResponse = await fetch(`http://127.0.0.1:${port}/health`);
+    const healthBody = await healthResponse.json();
+
+    assertEquals(healthResponse.status, 503);
+    assertEquals(healthBody.status, 'degraded');
+    // Both causes must be listed; any additional reasons are tolerated.
+    for (const expectedReason of ['unhealthy_container', 'no_models_available']) {
+      assertEquals(healthBody.reasons.includes(expectedReason), true);
+    }
+  } finally {
+    await server.stop();
+  }
+});
+
 Deno.test('ApiServer enforces api rate limits while exempting health and metrics', async () => {
  const port = 19200 + Math.floor(Math.random() * 1000);
  const server = new ApiServer(
@@ -191,7 +253,9 @@ Deno.test('ApiServer enforces api rate limits while exempting health and metrics
        return new Map();
      },
      async getAllAvailableModels() {
-        return new Map();
+        return new Map([
+          ['meta-llama/Llama-3.1-8B-Instruct', [{ type: 'vllm' }]],
+        ]);
      },
    } as never,
    {
@@ -210,6 +210,7 @@ export class ApiServer {
      const models = await this.containerManager.getAllAvailableModels();

      let status: 'ok' | 'degraded' | 'error' = 'ok';
+      const reasons = new Set<'unhealthy_container' | 'no_models_available' | 'gpu_detection_failed'>();
      const containerHealth: Record<string, 'healthy' | 'unhealthy'> = {};
      const gpuStatus: Record<string, 'available' | 'in_use' | 'error'> = {};

@@ -220,6 +221,7 @@ export class ApiServer {
        } else {
          containerHealth[id] = 'unhealthy';
          status = 'degraded';
+          reasons.add('unhealthy_container');
        }
      }

@@ -228,8 +230,14 @@ export class ApiServer {
        gpuStatus[gpu.id] = 'available';
      }

+      if (models.size === 0) {
+        status = 'degraded';
+        reasons.add('no_models_available');
+      }
+
      const response: IHealthResponse = {
        status,
+        reasons: Array.from(reasons),
        version: VERSION,
        uptime: Math.floor((Date.now() - this.startTime) / 1000),
        containers: statuses.size,
@@ -247,6 +255,7 @@ export class ApiServer {
      res.writeHead(500, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify({
        status: 'error',
+        reasons: ['gpu_detection_failed'],
        error: error instanceof Error ? error.message : String(error),
      }));
    }
@@ -309,6 +309,8 @@ export interface IApiError {
 export interface IHealthResponse {
  /** Status */
  status: 'ok' | 'degraded' | 'error';
+  /** Machine-readable reasons for degraded or error states */
+  reasons?: Array<'unhealthy_container' | 'no_models_available' | 'gpu_detection_failed'>;
  /** Version */
  version: string;
  /** Uptime in seconds */
@@ -151,6 +151,7 @@ export class UiServer {
    const gpus = await this.gpuDetector.detectGpus();

    let status: 'ok' | 'degraded' | 'error' = 'ok';
+    const reasons = new Set<'unhealthy_container' | 'no_models_available' | 'gpu_detection_failed'>();
    const containerHealth: Record<string, 'healthy' | 'unhealthy'> = {};
    const gpuStatus: Record<string, 'available' | 'in_use' | 'error'> = {};

@@ -160,14 +161,21 @@ export class UiServer {
      } else {
        containerHealth[id] = 'unhealthy';
        status = 'degraded';
+        reasons.add('unhealthy_container');
      }
    }
    for (const gpu of gpus) {
      gpuStatus[gpu.id] = 'available';
    }

+    if (models.size === 0) {
+      status = 'degraded';
+      reasons.add('no_models_available');
+    }
+
    const health: IHealthResponse = {
      status,
+      reasons: Array.from(reasons),
      version: VERSION,
      uptime: Math.floor((Date.now() - this.startTime) / 1000),
      containers: statuses.size,