diff --git a/test/api-server_test.ts b/test/api-server_test.ts
--- a/test/api-server_test.ts
+++ b/test/api-server_test.ts
@@ -86,6 +86,94 @@ Deno.test('ApiServer serves health metrics and authenticated model listings', as
     assertEquals(authenticatedModels.status, 200);
     assertEquals(authenticatedBody.object, 'list');
     assertEquals(authenticatedBody.data[0].id, 'meta-llama/Llama-3.1-8B-Instruct');
+
+    const metricsAfterRequests = await fetch(`http://127.0.0.1:${port}/metrics`);
+    const metricsAfterRequestsBody = await metricsAfterRequests.text();
+    assertEquals(
+      metricsAfterRequestsBody.includes('modelgrid_api_requests_total{path="/v1/models"} 2'),
+      true,
+    );
+    assertEquals(
+      metricsAfterRequestsBody.includes('modelgrid_api_auth_failures_total{path="/v1/models"} 1'),
+      true,
+    );
+    // Prometheus permits one TYPE line per metric name, however many path labels exist.
+    assertEquals(
+      metricsAfterRequestsBody.split('# TYPE modelgrid_api_requests_total counter\n').length - 1,
+      1,
+    );
+  } finally {
+    await server.stop();
+  }
+});
+
+Deno.test('ApiServer metrics expose 5xx counts for failing endpoints', async () => {
+  const port = 19100 + Math.floor(Math.random() * 1000);
+  let failModelListing = true;
+  const server = new ApiServer(
+    {
+      host: '127.0.0.1',
+      port,
+      apiKeys: ['valid-key'],
+      cors: false,
+      corsOrigins: [],
+    },
+    {
+      async getAllStatus() {
+        return new Map();
+      },
+      async getAllAvailableModels() {
+        if (failModelListing) {
+          failModelListing = false;
+          throw new Error('models unavailable');
+        }
+
+        return new Map();
+      },
+    } as never,
+    {
+      async getAllModels() {
+        return [];
+      },
+    } as never,
+    {} as never,
+    {
+      getStatus() {
+        return {
+          localNode: null,
+          nodes: [],
+          models: {},
+          desiredDeployments: [],
+        };
+      },
+    } as never,
+  );
+
+  (server as unknown as {
+    gpuDetector: { detectGpus: () => Promise<unknown[]> };
+  }).gpuDetector = {
+    async detectGpus() {
+      return [];
+    },
+  };
+
+  await server.start();
+
+  try {
+    const failedModels = await fetch(`http://127.0.0.1:${port}/v1/models`, {
+      headers: {
+        Authorization: 'Bearer valid-key',
+      },
+    });
+    assertEquals(failedModels.status, 500);
+    await failedModels.text();
+
+    const metricsResponse = await fetch(`http://127.0.0.1:${port}/metrics`);
+    const metricsBody = await metricsResponse.text();
+    assertEquals(
+      metricsBody.includes('modelgrid_api_server_errors_total{path="/v1/models"} 1'),
+      true,
+    );
   } finally {
     await server.stop();
   }
diff --git a/ts/api/server.ts b/ts/api/server.ts
--- a/ts/api/server.ts
+++ b/ts/api/server.ts
@@ -31,6 +31,9 @@ export class ApiServer {
   private clusterCoordinator: ClusterCoordinator;
   private clusterHandler: ClusterHandler;
   private startTime: number = 0;
+  private readonly requestCounts = new Map<string, number>();
+  private readonly authFailureCounts = new Map<string, number>();
+  private readonly serverErrorCounts = new Map<string, number>();
 
   constructor(
     config: IApiConfig,
@@ -131,18 +134,21 @@ export class ApiServer {
 
     if (path.startsWith('/_cluster')) {
       await this.clusterHandler.handle(req, res, path, url);
+      this.recordRequest(path, res.statusCode);
       return;
     }
 
     // Health check endpoint (no auth required)
     if (path === '/health' || path === '/healthz') {
       await this.handleHealthCheck(res);
+      this.recordRequest(path, res.statusCode);
       return;
     }
 
     // Metrics endpoint (no auth required)
     if (path === '/metrics') {
       await this.handleMetrics(res);
+      this.recordRequest(path, res.statusCode);
       return;
     }
 
@@ -156,6 +162,7 @@ export class ApiServer {
 
     // Log request
     const duration = Date.now() - startTime;
+    this.recordRequest(path, res.statusCode);
     logger.dim(`${req.method} ${path} - ${res.statusCode} (${duration}ms)`);
   }
 
@@ -268,6 +275,26 @@ export class ApiServer {
       metrics.push(`# TYPE modelgrid_gpus_total gauge`);
       metrics.push(`modelgrid_gpus_total ${gpus.length}`);
 
+      // Per-path API counters; each family is written with a single HELP/TYPE header.
+      this.appendCounterFamily(
+        metrics,
+        'modelgrid_api_requests_total',
+        'Total API requests by path',
+        this.requestCounts,
+      );
+      this.appendCounterFamily(
+        metrics,
+        'modelgrid_api_auth_failures_total',
+        'Total authentication failures by path',
+        this.authFailureCounts,
+      );
+      this.appendCounterFamily(
+        metrics,
+        'modelgrid_api_server_errors_total',
+        'Total 5xx responses by path',
+        this.serverErrorCounts,
+      );
+
       res.writeHead(200, { 'Content-Type': 'text/plain; charset=utf-8' });
       res.end(metrics.join('\n') + '\n');
     } catch (error) {
@@ -310,4 +337,70 @@ export class ApiServer {
       uptime: this.startTime ? Math.floor((Date.now() - this.startTime) / 1000) : 0,
     };
   }
+
+  /**
+   * Counts one finished request under its path label and, when the status is 401
+   * or 5xx, also under the auth-failure or server-error counter.
+   */
+  private recordRequest(path: string, statusCode: number): void {
+    const label = this.metricPathLabel(path);
+    this.incrementMetric(this.requestCounts, label);
+
+    if (statusCode === 401) {
+      this.incrementMetric(this.authFailureCounts, label);
+    }
+
+    if (statusCode >= 500) {
+      this.incrementMetric(this.serverErrorCounts, label);
+    }
+  }
+
+  /**
+   * Picks the label a path is counted under. Request paths are chosen by the
+   * client, so after 1000 distinct paths any new path is folded into a shared
+   * overflow label; this keeps the counter maps and /metrics output bounded.
+   */
+  private metricPathLabel(path: string): string {
+    const maxTrackedPaths = 1000;
+    if (this.requestCounts.has(path) || this.requestCounts.size < maxTrackedPaths) {
+      return path;
+    }
+
+    return '__other__';
+  }
+
+  /** Adds one to the counter stored under `path`, starting from zero. */
+  private incrementMetric(metric: Map<string, number>, path: string): void {
+    metric.set(path, (metric.get(path) ?? 0) + 1);
+  }
+
+  /**
+   * Appends one counter family to `lines`: a single HELP/TYPE header followed by
+   * one sample per path. The text format allows each header once per metric
+   * name, so the header must stay outside the per-path loop.
+   */
+  private appendCounterFamily(
+    lines: string[],
+    name: string,
+    help: string,
+    counts: Map<string, number>,
+  ): void {
+    if (counts.size === 0) {
+      return;
+    }
+
+    lines.push(`# HELP ${name} ${help}`);
+    lines.push(`# TYPE ${name} counter`);
+    for (const [path, count] of counts) {
+      lines.push(`${name}{path="${this.escapeMetricLabel(path)}"} ${count}`);
+    }
+  }
+
+  /**
+   * Escapes a label value for the Prometheus text format: backslash, double
+   * quote and line feed become \\, \" and \n.
+   */
+  private escapeMetricLabel(value: string): string {
+    return value.replaceAll('\\', '\\\\').replaceAll('"', '\\"').replaceAll('\n', '\\n');
+  }
 }