|
|
|
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
|
|
|
|
|
use std::collections::{HashMap, HashSet};
|
|
|
|
|
use std::sync::atomic::{AtomicU64, Ordering};
|
|
|
|
|
use std::sync::Mutex;
|
|
|
|
|
use std::time::Duration;
|
|
|
|
|
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
|
|
|
|
|
|
|
|
|
use crate::throughput::{RequestRateTracker, ThroughputSample, ThroughputTracker};
|
|
|
|
|
|
|
|
|
@@ -147,6 +147,12 @@ const DEFAULT_RETENTION_SECONDS: usize = 3600;
|
|
|
|
|
/// Maximum number of IPs to include in a snapshot.
|
|
|
|
|
const MAX_IPS_IN_SNAPSHOT: usize = 100;
|
|
|
|
|
|
|
|
|
|
/// How long to retain inactive IP metric buckets after the last connection closes.
|
|
|
|
|
const INACTIVE_IP_RETENTION_MS: u64 = 15_000;
|
|
|
|
|
|
|
|
|
|
/// Hard cap for inactive IP metric buckets retained between sampler passes.
|
|
|
|
|
const MAX_INACTIVE_IPS_RETAINED: usize = MAX_IPS_IN_SNAPSHOT * 10;
|
|
|
|
|
|
|
|
|
|
/// Maximum number of backends to include in a snapshot (top by total connections).
|
|
|
|
|
const MAX_BACKENDS_IN_SNAPSHOT: usize = 100;
|
|
|
|
|
|
|
|
|
@@ -165,6 +171,13 @@ fn canonicalize_domain_key(domain: &str) -> Option<String> {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn current_time_ms() -> u64 {
|
|
|
|
|
SystemTime::now()
|
|
|
|
|
.duration_since(UNIX_EPOCH)
|
|
|
|
|
.unwrap_or_default()
|
|
|
|
|
.as_millis() as u64
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Metrics collector tracking connections and throughput.
|
|
|
|
|
///
|
|
|
|
|
/// Design: The hot path (`record_bytes`) is entirely lock-free — it only touches
|
|
|
|
@@ -186,6 +199,7 @@ pub struct MetricsCollector {
|
|
|
|
|
|
|
|
|
|
// ── Per-IP tracking ──
|
|
|
|
|
ip_connections: DashMap<String, AtomicU64>,
|
|
|
|
|
ip_closed_at_ms: DashMap<String, AtomicU64>,
|
|
|
|
|
ip_total_connections: DashMap<String, AtomicU64>,
|
|
|
|
|
ip_bytes_in: DashMap<String, AtomicU64>,
|
|
|
|
|
ip_bytes_out: DashMap<String, AtomicU64>,
|
|
|
|
@@ -273,6 +287,7 @@ impl MetricsCollector {
|
|
|
|
|
route_bytes_in: DashMap::new(),
|
|
|
|
|
route_bytes_out: DashMap::new(),
|
|
|
|
|
ip_connections: DashMap::new(),
|
|
|
|
|
ip_closed_at_ms: DashMap::new(),
|
|
|
|
|
ip_total_connections: DashMap::new(),
|
|
|
|
|
ip_bytes_in: DashMap::new(),
|
|
|
|
|
ip_bytes_out: DashMap::new(),
|
|
|
|
@@ -348,6 +363,7 @@ impl MetricsCollector {
|
|
|
|
|
.entry(ip.to_string())
|
|
|
|
|
.or_insert_with(|| AtomicU64::new(0))
|
|
|
|
|
.fetch_add(1, Ordering::Relaxed);
|
|
|
|
|
self.ip_closed_at_ms.remove(ip);
|
|
|
|
|
self.ip_total_connections
|
|
|
|
|
.entry(ip.to_string())
|
|
|
|
|
.or_insert_with(|| AtomicU64::new(0))
|
|
|
|
@@ -392,11 +408,32 @@ impl MetricsCollector {
|
|
|
|
|
}
|
|
|
|
|
})
|
|
|
|
|
.ok();
|
|
|
|
|
// Clean up zero-count entries to prevent memory growth
|
|
|
|
|
// Keep inactive IP buckets briefly so pending bytes can still
|
|
|
|
|
// be sampled into per-IP throughput after short-lived transfers.
|
|
|
|
|
if matches!(prev, Some(v) if v <= 1) {
|
|
|
|
|
drop(counter);
|
|
|
|
|
self.ip_closed_at_ms
|
|
|
|
|
.entry(ip.to_string())
|
|
|
|
|
.or_insert_with(|| AtomicU64::new(0))
|
|
|
|
|
.store(current_time_ms(), Ordering::Relaxed);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn remove_inactive_ip_tracking(&self, ip: &str) {
|
|
|
|
|
if self
|
|
|
|
|
.ip_connections
|
|
|
|
|
.get(ip)
|
|
|
|
|
.map(|c| c.load(Ordering::Relaxed))
|
|
|
|
|
.unwrap_or(0)
|
|
|
|
|
> 0
|
|
|
|
|
{
|
|
|
|
|
self.ip_closed_at_ms.remove(ip);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
self.ip_connections.remove(ip);
|
|
|
|
|
// Evict all per-IP tracking data for this IP
|
|
|
|
|
self.ip_closed_at_ms.remove(ip);
|
|
|
|
|
self.ip_total_connections.remove(ip);
|
|
|
|
|
self.ip_bytes_in.remove(ip);
|
|
|
|
|
self.ip_bytes_out.remove(ip);
|
|
|
|
@@ -404,8 +441,53 @@ impl MetricsCollector {
|
|
|
|
|
self.ip_throughput.remove(ip);
|
|
|
|
|
self.ip_domain_requests.remove(ip);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn prune_inactive_ip_tracking(&self, now_ms: u64) {
|
|
|
|
|
let cutoff_ms = now_ms.saturating_sub(INACTIVE_IP_RETENTION_MS);
|
|
|
|
|
let mut inactive_ips: Vec<(String, u64)> = Vec::new();
|
|
|
|
|
let mut active_markers: Vec<String> = Vec::new();
|
|
|
|
|
|
|
|
|
|
for entry in self.ip_closed_at_ms.iter() {
|
|
|
|
|
let ip = entry.key().clone();
|
|
|
|
|
let active = self
|
|
|
|
|
.ip_connections
|
|
|
|
|
.get(&ip)
|
|
|
|
|
.map(|c| c.load(Ordering::Relaxed))
|
|
|
|
|
.unwrap_or(0);
|
|
|
|
|
if active > 0 {
|
|
|
|
|
active_markers.push(ip);
|
|
|
|
|
} else {
|
|
|
|
|
inactive_ips.push((ip, entry.value().load(Ordering::Relaxed)));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for ip in active_markers {
|
|
|
|
|
self.ip_closed_at_ms.remove(&ip);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let mut remove_ips: HashSet<String> = inactive_ips
|
|
|
|
|
.iter()
|
|
|
|
|
.filter(|(_, closed_at_ms)| *closed_at_ms <= cutoff_ms)
|
|
|
|
|
.map(|(ip, _)| ip.clone())
|
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
|
|
let retained_after_ttl = inactive_ips.len().saturating_sub(remove_ips.len());
|
|
|
|
|
if retained_after_ttl > MAX_INACTIVE_IPS_RETAINED {
|
|
|
|
|
inactive_ips.sort_by(|a, b| a.1.cmp(&b.1).then_with(|| a.0.cmp(&b.0)));
|
|
|
|
|
let mut overflow = retained_after_ttl - MAX_INACTIVE_IPS_RETAINED;
|
|
|
|
|
for (ip, closed_at_ms) in inactive_ips {
|
|
|
|
|
if overflow == 0 {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
if closed_at_ms > cutoff_ms && remove_ips.insert(ip) {
|
|
|
|
|
overflow -= 1;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for ip in remove_ips {
|
|
|
|
|
self.remove_inactive_ip_tracking(&ip);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Record bytes transferred (lock-free hot path).
|
|
|
|
@@ -481,9 +563,8 @@ impl MetricsCollector {
|
|
|
|
|
|
|
|
|
|
// Per-IP tracking: same get()-first pattern to avoid String allocation on hot path.
|
|
|
|
|
if let Some(ip) = source_ip {
|
|
|
|
|
// Only record per-IP stats if the IP still has active connections.
|
|
|
|
|
// This prevents orphaned entries when record_bytes races with
|
|
|
|
|
// connection_closed (which evicts all per-IP data on last close).
|
|
|
|
|
// Only record per-IP stats if the IP is active or still within the
|
|
|
|
|
// bounded inactive retention window after its last connection closed.
|
|
|
|
|
if self.ip_connections.contains_key(ip) {
|
|
|
|
|
if bytes_in > 0 {
|
|
|
|
|
if let Some(counter) = self.ip_bytes_in.get(ip) {
|
|
|
|
@@ -918,9 +999,13 @@ impl MetricsCollector {
|
|
|
|
|
self.http_domain_request_rates.remove(&domain);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Keep closed IP buckets only for a short, bounded window so the sampler
|
|
|
|
|
// can attribute short-lived transfers without leaking per-IP maps.
|
|
|
|
|
self.prune_inactive_ip_tracking(current_time_ms());
|
|
|
|
|
|
|
|
|
|
// Safety-net: prune orphaned per-IP entries that have no corresponding
|
|
|
|
|
// ip_connections entry. This catches any entries created by a race between
|
|
|
|
|
// record_bytes and connection_closed.
|
|
|
|
|
// ip_connections entry. This catches any entries created by older races or
|
|
|
|
|
// by code paths that manually inserted partial per-IP state.
|
|
|
|
|
self.ip_bytes_in
|
|
|
|
|
.retain(|k, _| self.ip_connections.contains_key(k));
|
|
|
|
|
self.ip_bytes_out
|
|
|
|
@@ -933,6 +1018,8 @@ impl MetricsCollector {
|
|
|
|
|
.retain(|k, _| self.ip_connections.contains_key(k));
|
|
|
|
|
self.ip_domain_requests
|
|
|
|
|
.retain(|k, _| self.ip_connections.contains_key(k));
|
|
|
|
|
self.ip_closed_at_ms
|
|
|
|
|
.retain(|k, _| self.ip_connections.contains_key(k));
|
|
|
|
|
|
|
|
|
|
// Safety-net: prune orphaned backend error/stats entries for backends
|
|
|
|
|
// that have no active or total connections (error-only backends).
|
|
|
|
@@ -1492,9 +1579,18 @@ mod tests {
|
|
|
|
|
1
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Close last connection for IP — should be cleaned up
|
|
|
|
|
// Close last connection for IP — active count should drop to zero,
|
|
|
|
|
// while the inactive bucket is retained briefly for final sampling.
|
|
|
|
|
collector.connection_closed(Some("route-a"), Some("1.2.3.4"));
|
|
|
|
|
assert!(collector.ip_connections.get("1.2.3.4").is_none());
|
|
|
|
|
assert_eq!(
|
|
|
|
|
collector
|
|
|
|
|
.ip_connections
|
|
|
|
|
.get("1.2.3.4")
|
|
|
|
|
.map(|c| c.load(Ordering::Relaxed))
|
|
|
|
|
.unwrap_or(0),
|
|
|
|
|
0
|
|
|
|
|
);
|
|
|
|
|
assert!(collector.ip_closed_at_ms.get("1.2.3.4").is_some());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
@@ -1544,13 +1640,36 @@ mod tests {
|
|
|
|
|
collector.connection_closed(Some("route-a"), Some("10.0.0.1"));
|
|
|
|
|
collector.connection_closed(Some("route-a"), Some("10.0.0.1"));
|
|
|
|
|
|
|
|
|
|
// All per-IP data for 10.0.0.1 should be evicted
|
|
|
|
|
// Per-IP data for 10.0.0.1 should be retained until the inactive TTL expires.
|
|
|
|
|
assert_eq!(
|
|
|
|
|
collector
|
|
|
|
|
.ip_connections
|
|
|
|
|
.get("10.0.0.1")
|
|
|
|
|
.map(|c| c.load(Ordering::Relaxed))
|
|
|
|
|
.unwrap_or(0),
|
|
|
|
|
0
|
|
|
|
|
);
|
|
|
|
|
assert!(collector.ip_total_connections.get("10.0.0.1").is_some());
|
|
|
|
|
assert!(collector.ip_bytes_in.get("10.0.0.1").is_some());
|
|
|
|
|
assert!(collector.ip_bytes_out.get("10.0.0.1").is_some());
|
|
|
|
|
assert!(collector.ip_pending_tp.get("10.0.0.1").is_some());
|
|
|
|
|
assert!(collector.ip_throughput.get("10.0.0.1").is_some());
|
|
|
|
|
|
|
|
|
|
collector
|
|
|
|
|
.ip_closed_at_ms
|
|
|
|
|
.get("10.0.0.1")
|
|
|
|
|
.unwrap()
|
|
|
|
|
.store(0, Ordering::Relaxed);
|
|
|
|
|
collector.sample_all();
|
|
|
|
|
|
|
|
|
|
// Expired inactive buckets are fully evicted to prevent leaks.
|
|
|
|
|
assert!(collector.ip_connections.get("10.0.0.1").is_none());
|
|
|
|
|
assert!(collector.ip_total_connections.get("10.0.0.1").is_none());
|
|
|
|
|
assert!(collector.ip_bytes_in.get("10.0.0.1").is_none());
|
|
|
|
|
assert!(collector.ip_bytes_out.get("10.0.0.1").is_none());
|
|
|
|
|
assert!(collector.ip_pending_tp.get("10.0.0.1").is_none());
|
|
|
|
|
assert!(collector.ip_throughput.get("10.0.0.1").is_none());
|
|
|
|
|
assert!(collector.ip_closed_at_ms.get("10.0.0.1").is_none());
|
|
|
|
|
|
|
|
|
|
// 10.0.0.2 should still have data
|
|
|
|
|
assert!(collector.ip_connections.get("10.0.0.2").is_some());
|
|
|
|
@@ -1577,7 +1696,14 @@ mod tests {
|
|
|
|
|
.unwrap_or(0),
|
|
|
|
|
0
|
|
|
|
|
);
|
|
|
|
|
assert!(collector.ip_connections.get("10.0.0.1").is_none());
|
|
|
|
|
assert_eq!(
|
|
|
|
|
collector
|
|
|
|
|
.ip_connections
|
|
|
|
|
.get("10.0.0.1")
|
|
|
|
|
.map(|c| c.load(Ordering::Relaxed))
|
|
|
|
|
.unwrap_or(0),
|
|
|
|
|
0
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
@@ -1718,26 +1844,52 @@ mod tests {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn test_record_bytes_after_close_no_orphan() {
|
|
|
|
|
fn test_closed_ip_keeps_pending_throughput_until_sample() {
|
|
|
|
|
let collector = MetricsCollector::with_retention(60);
|
|
|
|
|
|
|
|
|
|
// Open a connection, record bytes, then close
|
|
|
|
|
collector.connection_opened(Some("route-a"), Some("10.0.0.1"));
|
|
|
|
|
collector.record_bytes(100, 200, Some("route-a"), Some("10.0.0.1"));
|
|
|
|
|
collector.record_bytes(100, 2000, Some("route-a"), Some("10.0.0.1"));
|
|
|
|
|
collector.connection_closed(Some("route-a"), Some("10.0.0.1"));
|
|
|
|
|
|
|
|
|
|
// IP should be fully evicted
|
|
|
|
|
assert!(collector.ip_connections.get("10.0.0.1").is_none());
|
|
|
|
|
collector.sample_all();
|
|
|
|
|
let snapshot = collector.snapshot();
|
|
|
|
|
let ip_metrics = snapshot.ips.get("10.0.0.1").unwrap();
|
|
|
|
|
|
|
|
|
|
// Now record_bytes arrives late (simulates race) — should NOT re-create entries
|
|
|
|
|
collector.record_bytes(50, 75, Some("route-a"), Some("10.0.0.1"));
|
|
|
|
|
assert!(collector.ip_bytes_in.get("10.0.0.1").is_none());
|
|
|
|
|
assert!(collector.ip_bytes_out.get("10.0.0.1").is_none());
|
|
|
|
|
assert!(collector.ip_pending_tp.get("10.0.0.1").is_none());
|
|
|
|
|
assert_eq!(ip_metrics.active_connections, 0);
|
|
|
|
|
assert_eq!(ip_metrics.bytes_in, 100);
|
|
|
|
|
assert_eq!(ip_metrics.bytes_out, 2000);
|
|
|
|
|
assert_eq!(ip_metrics.throughput_in_bytes_per_sec, 100);
|
|
|
|
|
assert_eq!(ip_metrics.throughput_out_bytes_per_sec, 2000);
|
|
|
|
|
assert_eq!(snapshot.throughput_out_bytes_per_sec, 2000);
|
|
|
|
|
assert_eq!(
|
|
|
|
|
snapshot
|
|
|
|
|
.routes
|
|
|
|
|
.get("route-a")
|
|
|
|
|
.unwrap()
|
|
|
|
|
.throughput_out_bytes_per_sec,
|
|
|
|
|
2000
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Global bytes should still be counted
|
|
|
|
|
assert_eq!(collector.total_bytes_in.load(Ordering::Relaxed), 150);
|
|
|
|
|
assert_eq!(collector.total_bytes_out.load(Ordering::Relaxed), 275);
|
|
|
|
|
#[test]
|
|
|
|
|
fn test_inactive_ip_retention_hard_cap_prunes_oldest() {
|
|
|
|
|
let collector = MetricsCollector::with_retention(60);
|
|
|
|
|
|
|
|
|
|
for i in 0..(MAX_INACTIVE_IPS_RETAINED + 5) {
|
|
|
|
|
let ip = format!("198.51.{}.{}", i / 255, i % 255);
|
|
|
|
|
collector.connection_opened(Some("route-a"), Some(&ip));
|
|
|
|
|
collector.connection_closed(Some("route-a"), Some(&ip));
|
|
|
|
|
collector
|
|
|
|
|
.ip_closed_at_ms
|
|
|
|
|
.get(&ip)
|
|
|
|
|
.unwrap()
|
|
|
|
|
.store(current_time_ms().saturating_sub(1), Ordering::Relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
collector.sample_all();
|
|
|
|
|
|
|
|
|
|
assert!(collector.ip_closed_at_ms.len() <= MAX_INACTIVE_IPS_RETAINED);
|
|
|
|
|
assert!(collector.ip_connections.len() <= MAX_INACTIVE_IPS_RETAINED);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|