feat(metrics): add per-backend connection, error, protocol, and pool metrics with stale backend pruning
This commit is contained in:
@@ -3,6 +3,7 @@ use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::throughput::{ThroughputSample, ThroughputTracker};
|
||||
|
||||
@@ -20,6 +21,7 @@ pub struct Metrics {
|
||||
pub throughput_recent_out_bytes_per_sec: u64,
|
||||
pub routes: std::collections::HashMap<String, RouteMetrics>,
|
||||
pub ips: std::collections::HashMap<String, IpMetrics>,
|
||||
pub backends: std::collections::HashMap<String, BackendMetrics>,
|
||||
pub throughput_history: Vec<ThroughputSample>,
|
||||
pub total_http_requests: u64,
|
||||
pub http_requests_per_sec: u64,
|
||||
@@ -52,6 +54,23 @@ pub struct IpMetrics {
|
||||
pub throughput_out_bytes_per_sec: u64,
|
||||
}
|
||||
|
||||
/// Per-backend metrics (keyed by "host:port").
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct BackendMetrics {
|
||||
pub active_connections: u64,
|
||||
pub total_connections: u64,
|
||||
pub protocol: String,
|
||||
pub connect_errors: u64,
|
||||
pub handshake_errors: u64,
|
||||
pub request_errors: u64,
|
||||
pub total_connect_time_us: u64,
|
||||
pub connect_count: u64,
|
||||
pub pool_hits: u64,
|
||||
pub pool_misses: u64,
|
||||
pub h2_failures: u64,
|
||||
}
|
||||
|
||||
/// Statistics snapshot.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
@@ -69,6 +88,9 @@ const DEFAULT_RETENTION_SECONDS: usize = 3600;
|
||||
/// Maximum number of IPs to include in a snapshot (top by active connections).
|
||||
const MAX_IPS_IN_SNAPSHOT: usize = 100;
|
||||
|
||||
/// Maximum number of backends to include in a snapshot (top by total connections).
|
||||
// Applied after sorting backends by total connections (descending), so only the busiest are kept.
const MAX_BACKENDS_IN_SNAPSHOT: usize = 100;
|
||||
|
||||
/// Metrics collector tracking connections and throughput.
|
||||
///
|
||||
/// Design: The hot path (`record_bytes`) is entirely lock-free — it only touches
|
||||
@@ -96,6 +118,19 @@ pub struct MetricsCollector {
|
||||
ip_pending_tp: DashMap<String, (AtomicU64, AtomicU64)>,
|
||||
ip_throughput: DashMap<String, Mutex<ThroughputTracker>>,
|
||||
|
||||
// ── Per-backend tracking (keyed by "host:port") ──
|
||||
backend_active: DashMap<String, AtomicU64>,
|
||||
backend_total: DashMap<String, AtomicU64>,
|
||||
backend_protocol: DashMap<String, String>,
|
||||
backend_connect_errors: DashMap<String, AtomicU64>,
|
||||
backend_handshake_errors: DashMap<String, AtomicU64>,
|
||||
backend_request_errors: DashMap<String, AtomicU64>,
|
||||
backend_connect_time_us: DashMap<String, AtomicU64>,
|
||||
backend_connect_count: DashMap<String, AtomicU64>,
|
||||
backend_pool_hits: DashMap<String, AtomicU64>,
|
||||
backend_pool_misses: DashMap<String, AtomicU64>,
|
||||
backend_h2_failures: DashMap<String, AtomicU64>,
|
||||
|
||||
// ── HTTP request tracking ──
|
||||
total_http_requests: AtomicU64,
|
||||
pending_http_requests: AtomicU64,
|
||||
@@ -134,6 +169,17 @@ impl MetricsCollector {
|
||||
ip_bytes_out: DashMap::new(),
|
||||
ip_pending_tp: DashMap::new(),
|
||||
ip_throughput: DashMap::new(),
|
||||
backend_active: DashMap::new(),
|
||||
backend_total: DashMap::new(),
|
||||
backend_protocol: DashMap::new(),
|
||||
backend_connect_errors: DashMap::new(),
|
||||
backend_handshake_errors: DashMap::new(),
|
||||
backend_request_errors: DashMap::new(),
|
||||
backend_connect_time_us: DashMap::new(),
|
||||
backend_connect_count: DashMap::new(),
|
||||
backend_pool_hits: DashMap::new(),
|
||||
backend_pool_misses: DashMap::new(),
|
||||
backend_h2_failures: DashMap::new(),
|
||||
total_http_requests: AtomicU64::new(0),
|
||||
pending_http_requests: AtomicU64::new(0),
|
||||
http_request_throughput: Mutex::new(ThroughputTracker::new(retention_seconds)),
|
||||
@@ -268,6 +314,113 @@ impl MetricsCollector {
|
||||
self.pending_http_requests.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
// ── Per-backend recording methods ──
|
||||
|
||||
/// Record a successful backend connection with its connect duration.
|
||||
pub fn backend_connection_opened(&self, key: &str, connect_time: Duration) {
|
||||
self.backend_active
|
||||
.entry(key.to_string())
|
||||
.or_insert_with(|| AtomicU64::new(0))
|
||||
.fetch_add(1, Ordering::Relaxed);
|
||||
self.backend_total
|
||||
.entry(key.to_string())
|
||||
.or_insert_with(|| AtomicU64::new(0))
|
||||
.fetch_add(1, Ordering::Relaxed);
|
||||
self.backend_connect_time_us
|
||||
.entry(key.to_string())
|
||||
.or_insert_with(|| AtomicU64::new(0))
|
||||
.fetch_add(connect_time.as_micros() as u64, Ordering::Relaxed);
|
||||
self.backend_connect_count
|
||||
.entry(key.to_string())
|
||||
.or_insert_with(|| AtomicU64::new(0))
|
||||
.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Record a backend connection closing.
|
||||
pub fn backend_connection_closed(&self, key: &str) {
|
||||
if let Some(counter) = self.backend_active.get(key) {
|
||||
let val = counter.load(Ordering::Relaxed);
|
||||
if val > 0 {
|
||||
counter.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Record a backend connect error (TCP or TLS connect failure/timeout).
|
||||
pub fn backend_connect_error(&self, key: &str) {
|
||||
self.backend_connect_errors
|
||||
.entry(key.to_string())
|
||||
.or_insert_with(|| AtomicU64::new(0))
|
||||
.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Record a backend handshake error (H1 or H2 handshake failure).
|
||||
pub fn backend_handshake_error(&self, key: &str) {
|
||||
self.backend_handshake_errors
|
||||
.entry(key.to_string())
|
||||
.or_insert_with(|| AtomicU64::new(0))
|
||||
.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Record a backend request error (send_request failure).
|
||||
pub fn backend_request_error(&self, key: &str) {
|
||||
self.backend_request_errors
|
||||
.entry(key.to_string())
|
||||
.or_insert_with(|| AtomicU64::new(0))
|
||||
.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Record a connection pool hit for a backend.
|
||||
pub fn backend_pool_hit(&self, key: &str) {
|
||||
self.backend_pool_hits
|
||||
.entry(key.to_string())
|
||||
.or_insert_with(|| AtomicU64::new(0))
|
||||
.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Record a connection pool miss for a backend.
|
||||
pub fn backend_pool_miss(&self, key: &str) {
|
||||
self.backend_pool_misses
|
||||
.entry(key.to_string())
|
||||
.or_insert_with(|| AtomicU64::new(0))
|
||||
.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Record an H2 failure (h2 attempted but fell back to h1).
|
||||
pub fn backend_h2_failure(&self, key: &str) {
|
||||
self.backend_h2_failures
|
||||
.entry(key.to_string())
|
||||
.or_insert_with(|| AtomicU64::new(0))
|
||||
.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Set the protocol in use for a backend ("h1" or "h2").
|
||||
pub fn set_backend_protocol(&self, key: &str, protocol: &str) {
|
||||
self.backend_protocol
|
||||
.entry(key.to_string())
|
||||
.and_modify(|v| {
|
||||
if v != protocol {
|
||||
*v = protocol.to_string();
|
||||
}
|
||||
})
|
||||
.or_insert_with(|| protocol.to_string());
|
||||
}
|
||||
|
||||
/// Remove per-backend metrics for backends no longer in any route target.
|
||||
pub fn retain_backends(&self, active_backends: &HashSet<String>) {
|
||||
self.backend_active.retain(|k, _| active_backends.contains(k));
|
||||
self.backend_total.retain(|k, _| active_backends.contains(k));
|
||||
self.backend_protocol.retain(|k, _| active_backends.contains(k));
|
||||
self.backend_connect_errors.retain(|k, _| active_backends.contains(k));
|
||||
self.backend_handshake_errors.retain(|k, _| active_backends.contains(k));
|
||||
self.backend_request_errors.retain(|k, _| active_backends.contains(k));
|
||||
self.backend_connect_time_us.retain(|k, _| active_backends.contains(k));
|
||||
self.backend_connect_count.retain(|k, _| active_backends.contains(k));
|
||||
self.backend_pool_hits.retain(|k, _| active_backends.contains(k));
|
||||
self.backend_pool_misses.retain(|k, _| active_backends.contains(k));
|
||||
self.backend_h2_failures.retain(|k, _| active_backends.contains(k));
|
||||
}
|
||||
|
||||
/// Take a throughput sample on all trackers (cold path, call at 1Hz or configured interval).
|
||||
///
|
||||
/// Drains the lock-free pending counters and feeds the accumulated bytes
|
||||
@@ -488,6 +641,72 @@ impl MetricsCollector {
|
||||
});
|
||||
}
|
||||
|
||||
// Collect per-backend metrics, capped at top MAX_BACKENDS_IN_SNAPSHOT by total connections
|
||||
let mut backend_entries: Vec<(String, BackendMetrics)> = Vec::new();
|
||||
for entry in self.backend_total.iter() {
|
||||
let key = entry.key().clone();
|
||||
let total = entry.value().load(Ordering::Relaxed);
|
||||
let active = self.backend_active
|
||||
.get(&key)
|
||||
.map(|c| c.load(Ordering::Relaxed))
|
||||
.unwrap_or(0);
|
||||
let protocol = self.backend_protocol
|
||||
.get(&key)
|
||||
.map(|v| v.value().clone())
|
||||
.unwrap_or_else(|| "unknown".to_string());
|
||||
let connect_errors = self.backend_connect_errors
|
||||
.get(&key)
|
||||
.map(|c| c.load(Ordering::Relaxed))
|
||||
.unwrap_or(0);
|
||||
let handshake_errors = self.backend_handshake_errors
|
||||
.get(&key)
|
||||
.map(|c| c.load(Ordering::Relaxed))
|
||||
.unwrap_or(0);
|
||||
let request_errors = self.backend_request_errors
|
||||
.get(&key)
|
||||
.map(|c| c.load(Ordering::Relaxed))
|
||||
.unwrap_or(0);
|
||||
let total_connect_time_us = self.backend_connect_time_us
|
||||
.get(&key)
|
||||
.map(|c| c.load(Ordering::Relaxed))
|
||||
.unwrap_or(0);
|
||||
let connect_count = self.backend_connect_count
|
||||
.get(&key)
|
||||
.map(|c| c.load(Ordering::Relaxed))
|
||||
.unwrap_or(0);
|
||||
let pool_hits = self.backend_pool_hits
|
||||
.get(&key)
|
||||
.map(|c| c.load(Ordering::Relaxed))
|
||||
.unwrap_or(0);
|
||||
let pool_misses = self.backend_pool_misses
|
||||
.get(&key)
|
||||
.map(|c| c.load(Ordering::Relaxed))
|
||||
.unwrap_or(0);
|
||||
let h2_failures = self.backend_h2_failures
|
||||
.get(&key)
|
||||
.map(|c| c.load(Ordering::Relaxed))
|
||||
.unwrap_or(0);
|
||||
|
||||
backend_entries.push((key, BackendMetrics {
|
||||
active_connections: active,
|
||||
total_connections: total,
|
||||
protocol,
|
||||
connect_errors,
|
||||
handshake_errors,
|
||||
request_errors,
|
||||
total_connect_time_us,
|
||||
connect_count,
|
||||
pool_hits,
|
||||
pool_misses,
|
||||
h2_failures,
|
||||
}));
|
||||
}
|
||||
// Sort by total connections descending, then cap
|
||||
backend_entries.sort_by(|a, b| b.1.total_connections.cmp(&a.1.total_connections));
|
||||
backend_entries.truncate(MAX_BACKENDS_IN_SNAPSHOT);
|
||||
|
||||
let backends: std::collections::HashMap<String, BackendMetrics> = backend_entries.into_iter().collect();
|
||||
|
||||
// HTTP request rates
|
||||
let (http_rps, http_rps_recent) = self.http_request_throughput
|
||||
.lock()
|
||||
@@ -509,6 +728,7 @@ impl MetricsCollector {
|
||||
throughput_recent_out_bytes_per_sec: global_recent_out,
|
||||
routes,
|
||||
ips,
|
||||
backends,
|
||||
throughput_history,
|
||||
total_http_requests: self.total_http_requests.load(Ordering::Relaxed),
|
||||
http_requests_per_sec: http_rps,
|
||||
@@ -805,4 +1025,120 @@ mod tests {
|
||||
assert_eq!(snapshot.throughput_history[0].bytes_in, 100);
|
||||
assert_eq!(snapshot.throughput_history[4].bytes_in, 500);
|
||||
}
|
||||
|
||||
#[test]
fn test_backend_metrics_basic() {
    let collector = MetricsCollector::new();
    let key = "backend1:8080";

    // Two successful connects: 15ms + 25ms = 40ms = 40_000us.
    collector.backend_connection_opened(key, Duration::from_millis(15));
    collector.backend_connection_opened(key, Duration::from_millis(25));
    for (counters, expected) in [
        (&collector.backend_active, 2_u64),
        (&collector.backend_total, 2),
        (&collector.backend_connect_count, 2),
        (&collector.backend_connect_time_us, 40_000),
    ] {
        assert_eq!(counters.get(key).unwrap().load(Ordering::Relaxed), expected);
    }

    // Closing one connection lowers the active gauge but not the lifetime total.
    collector.backend_connection_closed(key);
    for (counters, expected) in [
        (&collector.backend_active, 1_u64),
        (&collector.backend_total, 2),
    ] {
        assert_eq!(counters.get(key).unwrap().load(Ordering::Relaxed), expected);
    }

    // One of each error kind, two pool hits and one pool miss.
    collector.backend_connect_error(key);
    collector.backend_handshake_error(key);
    collector.backend_request_error(key);
    collector.backend_h2_failure(key);
    collector.backend_pool_hit(key);
    collector.backend_pool_hit(key);
    collector.backend_pool_miss(key);
    for (counters, expected) in [
        (&collector.backend_connect_errors, 1_u64),
        (&collector.backend_handshake_errors, 1),
        (&collector.backend_request_errors, 1),
        (&collector.backend_h2_failures, 1),
        (&collector.backend_pool_hits, 2),
        (&collector.backend_pool_misses, 1),
    ] {
        assert_eq!(counters.get(key).unwrap().load(Ordering::Relaxed), expected);
    }

    // The protocol is stored and then overwritten when it changes.
    for protocol in ["h1", "h2"] {
        collector.set_backend_protocol(key, protocol);
        assert_eq!(collector.backend_protocol.get(key).unwrap().value(), protocol);
    }
}
|
||||
|
||||
#[test]
fn test_backend_metrics_in_snapshot() {
    let collector = MetricsCollector::new();

    // Two backends; only b1 gets a connect error.
    collector.backend_connection_opened("b1:443", Duration::from_millis(10));
    collector.backend_connection_opened("b2:8080", Duration::from_millis(20));
    collector.set_backend_protocol("b1:443", "h2");
    collector.set_backend_protocol("b2:8080", "h1");
    collector.backend_connect_error("b1:443");

    let backends = collector.snapshot().backends;
    assert_eq!(backends.len(), 2);

    // Indexing panics on a missing key, just like `get(..).unwrap()`.
    let first = &backends["b1:443"];
    assert_eq!(first.active_connections, 1);
    assert_eq!(first.total_connections, 1);
    assert_eq!(first.protocol, "h2");
    assert_eq!(first.connect_errors, 1);
    assert_eq!(first.total_connect_time_us, 10_000);
    assert_eq!(first.connect_count, 1);

    let second = &backends["b2:8080"];
    assert_eq!(second.protocol, "h1");
    assert_eq!(second.connect_errors, 0);
}
|
||||
|
||||
#[test]
fn test_retain_backends_prunes_stale() {
    let collector = MetricsCollector::new();

    // Populate EVERY per-backend map for both backends. Without this, the
    // "removed" assertions for metrics that were never recorded would pass
    // even if `retain_backends` forgot to prune that map.
    for key in ["active:443", "stale:8080"] {
        collector.backend_connection_opened(key, Duration::from_millis(5));
        collector.set_backend_protocol(key, "h1");
        collector.backend_connect_error(key);
        collector.backend_handshake_error(key);
        collector.backend_request_error(key);
        collector.backend_pool_hit(key);
        collector.backend_pool_miss(key);
        collector.backend_h2_failure(key);
    }

    let active = HashSet::from(["active:443".to_string()]);
    collector.retain_backends(&active);

    // Each counter map must keep the active backend and drop the stale one.
    for counters in [
        &collector.backend_active,
        &collector.backend_total,
        &collector.backend_connect_errors,
        &collector.backend_handshake_errors,
        &collector.backend_request_errors,
        &collector.backend_connect_time_us,
        &collector.backend_connect_count,
        &collector.backend_pool_hits,
        &collector.backend_pool_misses,
        &collector.backend_h2_failures,
    ] {
        assert!(counters.get("active:443").is_some());
        assert!(counters.get("stale:8080").is_none());
    }

    // The protocol map has a different value type, so it is checked separately.
    assert!(collector.backend_protocol.get("active:443").is_some());
    assert!(collector.backend_protocol.get("stale:8080").is_none());
}
|
||||
|
||||
#[test]
fn test_backend_connection_closed_saturates() {
    let collector = MetricsCollector::new();
    let key = "b:80";

    // Closing a backend that was never opened must not underflow
    // and must not create an entry.
    collector.backend_connection_closed(key);
    assert!(collector.backend_active.get(key).is_none());

    // One open followed by two closes leaves the gauge at 0 rather than wrapping.
    collector.backend_connection_opened(key, Duration::from_millis(1));
    for _ in 0..2 {
        collector.backend_connection_closed(key);
    }
    let remaining = collector
        .backend_active
        .get(key)
        .unwrap()
        .load(Ordering::Relaxed);
    assert_eq!(remaining, 0);
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user