feat(metrics): count API requests, auth failures, and 5xxs
This commit is contained in:
@@ -86,6 +86,89 @@ Deno.test('ApiServer serves health metrics and authenticated model listings', as
|
|||||||
assertEquals(authenticatedModels.status, 200);
|
assertEquals(authenticatedModels.status, 200);
|
||||||
assertEquals(authenticatedBody.object, 'list');
|
assertEquals(authenticatedBody.object, 'list');
|
||||||
assertEquals(authenticatedBody.data[0].id, 'meta-llama/Llama-3.1-8B-Instruct');
|
assertEquals(authenticatedBody.data[0].id, 'meta-llama/Llama-3.1-8B-Instruct');
|
||||||
|
|
||||||
|
const metricsAfterRequests = await fetch(`http://127.0.0.1:${port}/metrics`);
|
||||||
|
const metricsAfterRequestsBody = await metricsAfterRequests.text();
|
||||||
|
assertEquals(
|
||||||
|
metricsAfterRequestsBody.includes('modelgrid_api_requests_total{path="/v1/models"} 2'),
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
assertEquals(
|
||||||
|
metricsAfterRequestsBody.includes('modelgrid_api_auth_failures_total{path="/v1/models"} 1'),
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
} finally {
|
||||||
|
await server.stop();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
Deno.test('ApiServer metrics expose 5xx counts for failing endpoints', async () => {
|
||||||
|
const port = 19100 + Math.floor(Math.random() * 1000);
|
||||||
|
let failModelListing = true;
|
||||||
|
const server = new ApiServer(
|
||||||
|
{
|
||||||
|
host: '127.0.0.1',
|
||||||
|
port,
|
||||||
|
apiKeys: ['valid-key'],
|
||||||
|
cors: false,
|
||||||
|
corsOrigins: [],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
async getAllStatus() {
|
||||||
|
return new Map();
|
||||||
|
},
|
||||||
|
async getAllAvailableModels() {
|
||||||
|
if (failModelListing) {
|
||||||
|
failModelListing = false;
|
||||||
|
throw new Error('models unavailable');
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Map();
|
||||||
|
},
|
||||||
|
} as never,
|
||||||
|
{
|
||||||
|
async getAllModels() {
|
||||||
|
return [];
|
||||||
|
},
|
||||||
|
} as never,
|
||||||
|
{} as never,
|
||||||
|
{
|
||||||
|
getStatus() {
|
||||||
|
return {
|
||||||
|
localNode: null,
|
||||||
|
nodes: [],
|
||||||
|
models: {},
|
||||||
|
desiredDeployments: [],
|
||||||
|
};
|
||||||
|
},
|
||||||
|
} as never,
|
||||||
|
);
|
||||||
|
|
||||||
|
(server as unknown as {
|
||||||
|
gpuDetector: { detectGpus: () => Promise<unknown[]> };
|
||||||
|
}).gpuDetector = {
|
||||||
|
async detectGpus() {
|
||||||
|
return [];
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
await server.start();
|
||||||
|
|
||||||
|
try {
|
||||||
|
const failedModels = await fetch(`http://127.0.0.1:${port}/v1/models`, {
|
||||||
|
headers: {
|
||||||
|
Authorization: 'Bearer valid-key',
|
||||||
|
},
|
||||||
|
});
|
||||||
|
assertEquals(failedModels.status, 500);
|
||||||
|
await failedModels.text();
|
||||||
|
|
||||||
|
const metricsResponse = await fetch(`http://127.0.0.1:${port}/metrics`);
|
||||||
|
const metricsBody = await metricsResponse.text();
|
||||||
|
assertEquals(
|
||||||
|
metricsBody.includes('modelgrid_api_server_errors_total{path="/v1/models"} 1'),
|
||||||
|
true,
|
||||||
|
);
|
||||||
} finally {
|
} finally {
|
||||||
await server.stop();
|
await server.stop();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,6 +31,9 @@ export class ApiServer {
|
|||||||
private clusterCoordinator: ClusterCoordinator;
|
private clusterCoordinator: ClusterCoordinator;
|
||||||
private clusterHandler: ClusterHandler;
|
private clusterHandler: ClusterHandler;
|
||||||
private startTime: number = 0;
|
private startTime: number = 0;
|
||||||
|
private requestCounts = new Map<string, number>();
|
||||||
|
private authFailureCounts = new Map<string, number>();
|
||||||
|
private serverErrorCounts = new Map<string, number>();
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
config: IApiConfig,
|
config: IApiConfig,
|
||||||
@@ -131,18 +134,21 @@ export class ApiServer {
|
|||||||
|
|
||||||
if (path.startsWith('/_cluster')) {
|
if (path.startsWith('/_cluster')) {
|
||||||
await this.clusterHandler.handle(req, res, path, url);
|
await this.clusterHandler.handle(req, res, path, url);
|
||||||
|
this.recordRequest(path, res.statusCode);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Health check endpoint (no auth required)
|
// Health check endpoint (no auth required)
|
||||||
if (path === '/health' || path === '/healthz') {
|
if (path === '/health' || path === '/healthz') {
|
||||||
await this.handleHealthCheck(res);
|
await this.handleHealthCheck(res);
|
||||||
|
this.recordRequest(path, res.statusCode);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Metrics endpoint (no auth required)
|
// Metrics endpoint (no auth required)
|
||||||
if (path === '/metrics') {
|
if (path === '/metrics') {
|
||||||
await this.handleMetrics(res);
|
await this.handleMetrics(res);
|
||||||
|
this.recordRequest(path, res.statusCode);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -156,6 +162,7 @@ export class ApiServer {
|
|||||||
|
|
||||||
// Log request
|
// Log request
|
||||||
const duration = Date.now() - startTime;
|
const duration = Date.now() - startTime;
|
||||||
|
this.recordRequest(path, res.statusCode);
|
||||||
logger.dim(`${req.method} ${path} - ${res.statusCode} (${duration}ms)`);
|
logger.dim(`${req.method} ${path} - ${res.statusCode} (${duration}ms)`);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -268,6 +275,28 @@ export class ApiServer {
|
|||||||
metrics.push(`# TYPE modelgrid_gpus_total gauge`);
|
metrics.push(`# TYPE modelgrid_gpus_total gauge`);
|
||||||
metrics.push(`modelgrid_gpus_total ${gpus.length}`);
|
metrics.push(`modelgrid_gpus_total ${gpus.length}`);
|
||||||
|
|
||||||
|
// Per-path API counters.
//
// The Prometheus text exposition format permits only one HELP line and one
// TYPE line per metric name, placed ahead of that metric's samples. Emitting
// the pair inside the per-path loop repeats it for every tracked path, so
// each header pair is written once per family here and the samples follow
// as a single group.
const apiCounterFamilies: Array<[string, string, Map<string, number>]> = [
  ['modelgrid_api_requests_total', 'Total API requests by path', this.requestCounts],
  [
    'modelgrid_api_auth_failures_total',
    'Total authentication failures by path',
    this.authFailureCounts,
  ],
  ['modelgrid_api_server_errors_total', 'Total 5xx responses by path', this.serverErrorCounts],
];

for (const [metricName, helpText, counts] of apiCounterFamilies) {
  // A family with no recorded paths is omitted entirely (headers included),
  // matching the previous output for an empty counter map.
  if (counts.size === 0) {
    continue;
  }

  metrics.push(`# HELP ${metricName} ${helpText}`);
  metrics.push(`# TYPE ${metricName} counter`);

  for (const [path, count] of counts) {
    metrics.push(`${metricName}{path="${this.escapeMetricLabel(path)}"} ${count}`);
  }
}
|
||||||
|
|
||||||
res.writeHead(200, { 'Content-Type': 'text/plain; charset=utf-8' });
|
res.writeHead(200, { 'Content-Type': 'text/plain; charset=utf-8' });
|
||||||
res.end(metrics.join('\n') + '\n');
|
res.end(metrics.join('\n') + '\n');
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -310,4 +339,24 @@ export class ApiServer {
|
|||||||
uptime: this.startTime ? Math.floor((Date.now() - this.startTime) / 1000) : 0,
|
uptime: this.startTime ? Math.floor((Date.now() - this.startTime) / 1000) : 0,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Tallies one request against the per-path counters.
 *
 * Every call bumps the request total for `path`. A status of exactly 401
 * also bumps the auth-failure counter, and any status of 500 or above also
 * bumps the server-error counter.
 *
 * NOTE(review): `path` is used verbatim as the counter key, so the number of
 * tracked keys grows with the number of distinct paths passed in — confirm
 * that callers keep this bounded.
 *
 * @param path - Request path used as the counter key.
 * @param statusCode - HTTP status code of the response.
 */
private recordRequest(path: string, statusCode: number): void {
  // Collect the counters this request touches, then bump them in order.
  const touchedCounters: Array<Map<string, number>> = [this.requestCounts];

  if (statusCode === 401) {
    touchedCounters.push(this.authFailureCounts);
  }
  if (statusCode >= 500) {
    touchedCounters.push(this.serverErrorCounts);
  }

  for (const counter of touchedCounters) {
    this.incrementMetric(counter, path);
  }
}
|
||||||
|
|
||||||
|
/**
 * Adds one to the counter stored under `path` in `metric`.
 *
 * A path with no entry yet (or a falsy stored value) is treated as zero, so
 * its counter becomes 1.
 *
 * @param metric - Counter map to update in place.
 * @param path - Key whose counter is incremented.
 */
private incrementMetric(metric: Map<string, number>, path: string): void {
  const previous = metric.get(path);
  const next = previous ? previous + 1 : 1;
  metric.set(path, next);
}
|
||||||
|
|
||||||
|
/**
 * Escapes a string for use inside a double-quoted Prometheus label value.
 *
 * The text exposition format requires three characters to be escaped in
 * label values: backslash as `\\`, double quote as `\"`, and line feed as
 * `\n`. A raw line feed would otherwise split the sample line in two.
 *
 * @param value - Raw label value.
 * @returns The value with backslashes, double quotes and line feeds escaped.
 */
private escapeMetricLabel(value: string): string {
  // Single pass, so a backslash produced by one escape is never re-escaped.
  return value.replace(/[\\"\n]/g, (ch) => (ch === '\n' ? '\\n' : `\\${ch}`));
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user