305 lines
7.1 KiB
TypeScript
305 lines
7.1 KiB
TypeScript
|
|
import type { Client as ElasticClient } from '@elastic/elasticsearch';
|
||
|
|
|
||
|
|
/**
 * Overall health classification reported by a health check.
 */
export enum HealthStatus {
  /** The cluster is reachable and considered fully operational. */
  HEALTHY = 'healthy',

  /** The cluster is reachable but not fully operational. */
  DEGRADED = 'degraded',

  /** The cluster is considered down or unusable. */
  UNHEALTHY = 'unhealthy',

  /** No health check has produced a status yet. */
  UNKNOWN = 'unknown',
}
|
||
|
|
|
||
|
|
/**
 * Cluster health colour as reported by Elasticsearch.
 */
export enum ClusterHealth {
  /** Elasticsearch reports the cluster status as "green". */
  GREEN = 'green',

  /** Elasticsearch reports the cluster status as "yellow". */
  YELLOW = 'yellow',

  /** Elasticsearch reports the cluster status as "red". */
  RED = 'red',
}
|
||
|
|
|
||
|
|
/**
 * Outcome of a single health check.
 */
export interface HealthCheckResult {
  // --- Always present -----------------------------------------------------

  /** Overall health status derived from this check. */
  status: HealthStatus;

  /** Whether the cluster could be reached. */
  available: boolean;

  /** When this result was produced. */
  timestamp: Date;

  // --- Present only when the corresponding data was obtained --------------

  /** Cluster health colour reported by Elasticsearch, if it was retrieved. */
  clusterHealth?: ClusterHealth;

  /** Measured response time, in milliseconds. */
  responseTimeMs?: number;

  /** Number of active nodes, if it was retrieved. */
  activeNodes?: number;

  /** The error that caused the check (or part of it) to fail. */
  error?: Error;

  /** Free-form additional details. */
  details?: Record<string, unknown>;
}
|
||
|
|
|
||
|
|
/**
 * Settings that control how health checks are performed.
 */
export interface HealthCheckConfig {
  /** Delay between two periodic health checks, in milliseconds. */
  interval: number;

  /** Maximum time to wait for a health check request, in milliseconds. */
  timeout: number;

  /**
   * Number of consecutive failed checks after which the status becomes
   * unhealthy (below this count a failure is reported as degraded).
   */
  unhealthyThreshold: number;

  /**
   * Number of consecutive successful checks after which the status becomes
   * healthy. Consulted only when no cluster health colour is available.
   */
  healthyThreshold: number;

  /** Whether to query the cluster health API in addition to pinging. */
  checkClusterHealth: boolean;
}
|
||
|
|
|
||
|
|
/**
 * Settings used for every option the caller does not override.
 */
export const DEFAULT_HEALTH_CHECK_CONFIG: HealthCheckConfig = {
  // Timing, in milliseconds.
  interval: 30_000, // 30 s between periodic checks
  timeout: 5_000, // 5 s per request

  // Consecutive-result thresholds.
  unhealthyThreshold: 3,
  healthyThreshold: 2,

  // Query the cluster health API as well as pinging.
  checkClusterHealth: true,
};
|
||
|
|
|
||
|
|
/**
 * Health checker for an Elasticsearch cluster.
 *
 * A check pings the cluster and, when `checkClusterHealth` is enabled, also
 * queries the cluster health API. The outcome is mapped to a `HealthStatus`:
 *
 * - ping fails or times out: `DEGRADED`, or `UNHEALTHY` once
 *   `unhealthyThreshold` consecutive failures have accumulated;
 * - ping succeeds but the cluster health request fails: `DEGRADED`;
 * - cluster reports green / yellow / red: `HEALTHY` / `DEGRADED` / `UNHEALTHY`;
 * - no cluster colour available: `HEALTHY` once `healthyThreshold`
 *   consecutive successes have accumulated, otherwise `DEGRADED`.
 *
 * Checks can be run on demand with `check()` or on a timer with
 * `startPeriodicChecks()`.
 */
export class HealthChecker {
  private config: HealthCheckConfig;
  private consecutiveFailures = 0;
  private consecutiveSuccesses = 0;
  private currentStatus: HealthStatus = HealthStatus.UNKNOWN;
  private lastCheckResult?: HealthCheckResult;
  private checkInterval?: NodeJS.Timeout;
  private isChecking = false;

  /**
   * @param client - Elasticsearch client used to ping the cluster and query
   *   its health.
   * @param config - Overrides merged on top of `DEFAULT_HEALTH_CHECK_CONFIG`.
   */
  constructor(
    private client: ElasticClient,
    config: Partial<HealthCheckConfig> = {}
  ) {
    this.config = { ...DEFAULT_HEALTH_CHECK_CONFIG, ...config };
  }

  /**
   * Perform a single health check.
   *
   * Updates the consecutive success/failure counters, the current status and
   * the last recorded result.
   *
   * @returns The result of the check. Failures are reported through the
   *   result (`available`, `status`, `error`) rather than by rejecting.
   */
  async check(): Promise<HealthCheckResult> {
    const startTime = Date.now();

    try {
      // Ping the cluster, bounded by the configured timeout.
      const pingResponse: unknown = await this.withTimeout(
        this.client.ping(),
        this.config.timeout
      );
      const responseTime = Date.now() - startTime;

      if (!HealthChecker.isPingSuccessful(pingResponse)) {
        throw new Error('Cluster ping failed');
      }

      let clusterHealth: ClusterHealth | undefined;
      let activeNodes: number | undefined;

      if (this.config.checkClusterHealth) {
        try {
          // The `timeout` request parameter is handled by Elasticsearch; the
          // surrounding withTimeout() additionally bounds the call on the
          // client side so a hung request cannot stall the checker.
          const healthResponse = await this.withTimeout(
            this.client.cluster.health({
              timeout: `${this.config.timeout}ms`,
            }),
            this.config.timeout
          );

          clusterHealth = healthResponse.status as ClusterHealth;
          activeNodes = healthResponse.number_of_nodes;
        } catch (error) {
          // Ping succeeded but cluster health could not be retrieved:
          // the cluster is reachable, so report it as degraded.
          this.consecutiveSuccesses = 0;
          this.consecutiveFailures++;

          return this.record({
            status: HealthStatus.DEGRADED,
            available: true,
            responseTimeMs: responseTime,
            error: HealthChecker.toError(error),
            timestamp: new Date(),
          });
        }
      }

      // Success.
      this.consecutiveFailures = 0;
      this.consecutiveSuccesses++;

      return this.record({
        status: this.statusFor(clusterHealth),
        clusterHealth,
        available: true,
        responseTimeMs: responseTime,
        activeNodes,
        timestamp: new Date(),
      });
    } catch (error) {
      // Ping failed, timed out, or returned an unsuccessful response.
      this.consecutiveSuccesses = 0;
      this.consecutiveFailures++;

      const status =
        this.consecutiveFailures >= this.config.unhealthyThreshold
          ? HealthStatus.UNHEALTHY
          : HealthStatus.DEGRADED;

      return this.record({
        status,
        available: false,
        error: HealthChecker.toError(error),
        timestamp: new Date(),
      });
    }
  }

  /**
   * Start periodic health checks.
   *
   * Runs one check immediately and then one every `config.interval`
   * milliseconds. Does nothing if periodic checks are already running. A tick
   * is skipped while a previous periodic check is still in flight.
   *
   * @param onHealthChange - Invoked with the check result whenever its status
   *   differs from the status held before that check.
   */
  startPeriodicChecks(onHealthChange?: (result: HealthCheckResult) => void): void {
    if (this.checkInterval) {
      return; // Already running
    }

    const performCheck = async (): Promise<void> => {
      if (this.isChecking) return;

      this.isChecking = true;
      try {
        const previousStatus = this.currentStatus;
        const result = await this.check();

        if (onHealthChange && result.status !== previousStatus) {
          onHealthChange(result);
        }
      } catch {
        // check() reports failures through its result, so only a throwing
        // onHealthChange callback can end up here. It is swallowed on purpose
        // so that a faulty listener cannot stop the periodic checks.
      } finally {
        this.isChecking = false;
      }
    };

    // Perform initial check; performCheck never rejects.
    void performCheck();

    // Schedule periodic checks
    this.checkInterval = setInterval(() => {
      void performCheck();
    }, this.config.interval);
  }

  /**
   * Stop periodic health checks. Does nothing if they are not running.
   */
  stopPeriodicChecks(): void {
    if (this.checkInterval) {
      clearInterval(this.checkInterval);
      this.checkInterval = undefined;
    }
  }

  /**
   * Get the status produced by the most recent check
   * (`UNKNOWN` before the first check or after `reset()`).
   */
  getStatus(): HealthStatus {
    return this.currentStatus;
  }

  /**
   * Get the result of the most recent check, or `undefined` if none has been
   * recorded.
   */
  getLastCheckResult(): HealthCheckResult | undefined {
    return this.lastCheckResult;
  }

  /**
   * Whether the current status is `HEALTHY`.
   */
  isHealthy(): boolean {
    return this.currentStatus === HealthStatus.HEALTHY;
  }

  /**
   * Whether the most recent check reached the cluster
   * (`false` when no check has been recorded).
   */
  isAvailable(): boolean {
    return this.lastCheckResult?.available ?? false;
  }

  /**
   * Reset counters, status and last result to their initial values.
   * Does not stop periodic checks.
   */
  reset(): void {
    this.consecutiveFailures = 0;
    this.consecutiveSuccesses = 0;
    this.currentStatus = HealthStatus.UNKNOWN;
    this.lastCheckResult = undefined;
  }

  /**
   * Cleanup resources by stopping periodic checks.
   */
  destroy(): void {
    this.stopPeriodicChecks();
  }

  /**
   * Store a result as the latest one and make it the current status, so that
   * `getStatus()` always agrees with `getLastCheckResult()`.
   */
  private record(result: HealthCheckResult): HealthCheckResult {
    this.currentStatus = result.status;
    this.lastCheckResult = result;
    return result;
  }

  /**
   * Map a cluster health colour to an overall status. Without a recognised
   * colour the consecutive-success counter decides between healthy and
   * degraded.
   */
  private statusFor(clusterHealth: ClusterHealth | undefined): HealthStatus {
    switch (clusterHealth) {
      case ClusterHealth.GREEN:
        return HealthStatus.HEALTHY;
      case ClusterHealth.YELLOW:
        return HealthStatus.DEGRADED;
      case ClusterHealth.RED:
        return HealthStatus.UNHEALTHY;
      default:
        // No cluster health, but ping succeeded
        return this.consecutiveSuccesses >= this.config.healthyThreshold
          ? HealthStatus.HEALTHY
          : HealthStatus.DEGRADED;
    }
  }

  /**
   * Await `operation`, rejecting if it has not settled within `ms`
   * milliseconds. The timer is always cleared once the race is decided so it
   * never outlives the check.
   */
  private async withTimeout<T>(operation: Promise<T>, ms: number): Promise<T> {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const expiry = new Promise<never>((_, reject) => {
      timer = setTimeout(
        () => reject(new Error(`Health check timeout after ${ms}ms`)),
        ms
      );
    });

    try {
      return await Promise.race([operation, expiry]);
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * A ping counts as successful when it resolves to `true` or to an object
   * whose `statusCode` is 200.
   */
  private static isPingSuccessful(response: unknown): boolean {
    if (response === true) {
      return true;
    }
    return (
      typeof response === 'object' &&
      response !== null &&
      (response as { statusCode?: unknown }).statusCode === 200
    );
  }

  /**
   * Turn an arbitrary thrown value into an `Error` so that results always
   * carry a real `Error` instance.
   */
  private static toError(value: unknown): Error {
    return value instanceof Error ? value : new Error(String(value));
  }
}
|