feat(metrics): add real-time throughput sampling and byte-counting metrics
This commit is contained in:
@@ -1,6 +1,9 @@
|
||||
use dashmap::DashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::throughput::ThroughputTracker;
|
||||
|
||||
/// Aggregated metrics snapshot.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -12,6 +15,8 @@ pub struct Metrics {
|
||||
pub bytes_out: u64,
|
||||
pub throughput_in_bytes_per_sec: u64,
|
||||
pub throughput_out_bytes_per_sec: u64,
|
||||
pub throughput_recent_in_bytes_per_sec: u64,
|
||||
pub throughput_recent_out_bytes_per_sec: u64,
|
||||
pub routes: std::collections::HashMap<String, RouteMetrics>,
|
||||
}
|
||||
|
||||
@@ -25,6 +30,8 @@ pub struct RouteMetrics {
|
||||
pub bytes_out: u64,
|
||||
pub throughput_in_bytes_per_sec: u64,
|
||||
pub throughput_out_bytes_per_sec: u64,
|
||||
pub throughput_recent_in_bytes_per_sec: u64,
|
||||
pub throughput_recent_out_bytes_per_sec: u64,
|
||||
}
|
||||
|
||||
/// Statistics snapshot.
|
||||
@@ -38,7 +45,15 @@ pub struct Statistics {
|
||||
pub uptime_seconds: u64,
|
||||
}
|
||||
|
||||
/// Default retention for throughput samples (1 hour).
|
||||
const DEFAULT_RETENTION_SECONDS: usize = 3600;
|
||||
|
||||
/// Metrics collector tracking connections and throughput.
|
||||
///
|
||||
/// Design: The hot path (`record_bytes`) is entirely lock-free — it only touches
|
||||
/// `AtomicU64` counters. The cold path (`sample_all`, called at 1Hz) drains
|
||||
/// those atomics and feeds the throughput trackers under a Mutex. This avoids
|
||||
/// contention when `record_bytes` is called per-chunk in the TCP copy loop.
|
||||
pub struct MetricsCollector {
|
||||
active_connections: AtomicU64,
|
||||
total_connections: AtomicU64,
|
||||
@@ -51,10 +66,25 @@ pub struct MetricsCollector {
|
||||
/// Per-route byte counters
|
||||
route_bytes_in: DashMap<String, AtomicU64>,
|
||||
route_bytes_out: DashMap<String, AtomicU64>,
|
||||
|
||||
// ── Lock-free pending throughput counters (hot path) ──
|
||||
global_pending_tp_in: AtomicU64,
|
||||
global_pending_tp_out: AtomicU64,
|
||||
route_pending_tp: DashMap<String, (AtomicU64, AtomicU64)>,
|
||||
|
||||
// ── Throughput history — only locked during sampling (cold path) ──
|
||||
global_throughput: Mutex<ThroughputTracker>,
|
||||
route_throughput: DashMap<String, Mutex<ThroughputTracker>>,
|
||||
retention_seconds: usize,
|
||||
}
|
||||
|
||||
impl MetricsCollector {
|
||||
pub fn new() -> Self {
|
||||
Self::with_retention(DEFAULT_RETENTION_SECONDS)
|
||||
}
|
||||
|
||||
/// Create a MetricsCollector with a custom retention period for throughput history.
|
||||
pub fn with_retention(retention_seconds: usize) -> Self {
|
||||
Self {
|
||||
active_connections: AtomicU64::new(0),
|
||||
total_connections: AtomicU64::new(0),
|
||||
@@ -64,6 +94,12 @@ impl MetricsCollector {
|
||||
route_total_connections: DashMap::new(),
|
||||
route_bytes_in: DashMap::new(),
|
||||
route_bytes_out: DashMap::new(),
|
||||
global_pending_tp_in: AtomicU64::new(0),
|
||||
global_pending_tp_out: AtomicU64::new(0),
|
||||
route_pending_tp: DashMap::new(),
|
||||
global_throughput: Mutex::new(ThroughputTracker::new(retention_seconds)),
|
||||
route_throughput: DashMap::new(),
|
||||
retention_seconds,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,11 +134,18 @@ impl MetricsCollector {
|
||||
}
|
||||
}
|
||||
|
||||
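/// Record bytes transferred (hot path).
///
/// Called per-chunk in the TCP copy loop. No `Mutex` is taken here: the global
/// totals and pending counters are plain `AtomicU64` additions. When `route_id`
/// is given, the per-route counters are reached through `DashMap::entry`, which
/// allocates the key and briefly locks one map shard. The throughput trackers
/// are fed later, during `sample_all()`.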
|
||||
pub fn record_bytes(&self, bytes_in: u64, bytes_out: u64, route_id: Option<&str>) {
|
||||
self.total_bytes_in.fetch_add(bytes_in, Ordering::Relaxed);
|
||||
self.total_bytes_out.fetch_add(bytes_out, Ordering::Relaxed);
|
||||
|
||||
// Accumulate into lock-free pending throughput counters
|
||||
self.global_pending_tp_in.fetch_add(bytes_in, Ordering::Relaxed);
|
||||
self.global_pending_tp_out.fetch_add(bytes_out, Ordering::Relaxed);
|
||||
|
||||
if let Some(route_id) = route_id {
|
||||
self.route_bytes_in
|
||||
.entry(route_id.to_string())
|
||||
@@ -112,6 +155,63 @@ impl MetricsCollector {
|
||||
.entry(route_id.to_string())
|
||||
.or_insert_with(|| AtomicU64::new(0))
|
||||
.fetch_add(bytes_out, Ordering::Relaxed);
|
||||
|
||||
// Accumulate into per-route pending throughput counters (lock-free)
|
||||
let entry = self.route_pending_tp
|
||||
.entry(route_id.to_string())
|
||||
.or_insert_with(|| (AtomicU64::new(0), AtomicU64::new(0)));
|
||||
entry.0.fetch_add(bytes_in, Ordering::Relaxed);
|
||||
entry.1.fetch_add(bytes_out, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Take a throughput sample on all trackers (cold path, call at 1Hz or configured interval).
|
||||
///
|
||||
/// Drains the lock-free pending counters and feeds the accumulated bytes
|
||||
/// into the throughput trackers (under Mutex). This is the only place
|
||||
/// the Mutex is locked.
|
||||
pub fn sample_all(&self) {
|
||||
// Drain global pending bytes and feed into the tracker
|
||||
let global_in = self.global_pending_tp_in.swap(0, Ordering::Relaxed);
|
||||
let global_out = self.global_pending_tp_out.swap(0, Ordering::Relaxed);
|
||||
if let Ok(mut tracker) = self.global_throughput.lock() {
|
||||
tracker.record_bytes(global_in, global_out);
|
||||
tracker.sample();
|
||||
}
|
||||
|
||||
// Drain per-route pending bytes; collect into a Vec to avoid holding DashMap shards
|
||||
let mut route_samples: Vec<(String, u64, u64)> = Vec::new();
|
||||
for entry in self.route_pending_tp.iter() {
|
||||
let route_id = entry.key().clone();
|
||||
let pending_in = entry.value().0.swap(0, Ordering::Relaxed);
|
||||
let pending_out = entry.value().1.swap(0, Ordering::Relaxed);
|
||||
route_samples.push((route_id, pending_in, pending_out));
|
||||
}
|
||||
|
||||
// Feed pending bytes into route trackers and sample
|
||||
let retention = self.retention_seconds;
|
||||
for (route_id, pending_in, pending_out) in &route_samples {
|
||||
// Ensure the tracker exists
|
||||
self.route_throughput
|
||||
.entry(route_id.clone())
|
||||
.or_insert_with(|| Mutex::new(ThroughputTracker::new(retention)));
|
||||
// Now get a separate ref and lock it
|
||||
if let Some(tracker_ref) = self.route_throughput.get(route_id) {
|
||||
if let Ok(mut tracker) = tracker_ref.value().lock() {
|
||||
tracker.record_bytes(*pending_in, *pending_out);
|
||||
tracker.sample();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Also sample any route trackers that had no new pending bytes
|
||||
// (to keep their sample window advancing)
|
||||
for entry in self.route_throughput.iter() {
|
||||
if !self.route_pending_tp.contains_key(entry.key()) {
|
||||
if let Ok(mut tracker) = entry.value().lock() {
|
||||
tracker.sample();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -139,6 +239,16 @@ impl MetricsCollector {
|
||||
pub fn snapshot(&self) -> Metrics {
|
||||
let mut routes = std::collections::HashMap::new();
|
||||
|
||||
// Get global throughput (instant = last 1 sample, recent = last 10 samples)
|
||||
let (global_tp_in, global_tp_out, global_recent_in, global_recent_out) = self.global_throughput
|
||||
.lock()
|
||||
.map(|t| {
|
||||
let (i_in, i_out) = t.instant();
|
||||
let (r_in, r_out) = t.recent();
|
||||
(i_in, i_out, r_in, r_out)
|
||||
})
|
||||
.unwrap_or((0, 0, 0, 0));
|
||||
|
||||
// Collect per-route metrics
|
||||
for entry in self.route_total_connections.iter() {
|
||||
let route_id = entry.key().clone();
|
||||
@@ -156,13 +266,24 @@ impl MetricsCollector {
|
||||
.map(|c| c.load(Ordering::Relaxed))
|
||||
.unwrap_or(0);
|
||||
|
||||
let (route_tp_in, route_tp_out, route_recent_in, route_recent_out) = self.route_throughput
|
||||
.get(&route_id)
|
||||
.and_then(|entry| entry.value().lock().ok().map(|t| {
|
||||
let (i_in, i_out) = t.instant();
|
||||
let (r_in, r_out) = t.recent();
|
||||
(i_in, i_out, r_in, r_out)
|
||||
}))
|
||||
.unwrap_or((0, 0, 0, 0));
|
||||
|
||||
routes.insert(route_id, RouteMetrics {
|
||||
active_connections: active,
|
||||
total_connections: total,
|
||||
bytes_in,
|
||||
bytes_out,
|
||||
throughput_in_bytes_per_sec: 0,
|
||||
throughput_out_bytes_per_sec: 0,
|
||||
throughput_in_bytes_per_sec: route_tp_in,
|
||||
throughput_out_bytes_per_sec: route_tp_out,
|
||||
throughput_recent_in_bytes_per_sec: route_recent_in,
|
||||
throughput_recent_out_bytes_per_sec: route_recent_out,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -171,8 +292,10 @@ impl MetricsCollector {
|
||||
total_connections: self.total_connections(),
|
||||
bytes_in: self.total_bytes_in(),
|
||||
bytes_out: self.total_bytes_out(),
|
||||
throughput_in_bytes_per_sec: 0,
|
||||
throughput_out_bytes_per_sec: 0,
|
||||
throughput_in_bytes_per_sec: global_tp_in,
|
||||
throughput_out_bytes_per_sec: global_tp_out,
|
||||
throughput_recent_in_bytes_per_sec: global_recent_in,
|
||||
throughput_recent_out_bytes_per_sec: global_recent_out,
|
||||
routes,
|
||||
}
|
||||
}
|
||||
@@ -248,4 +371,40 @@ mod tests {
|
||||
let route_in = collector.route_bytes_in.get("route-a").unwrap();
|
||||
assert_eq!(route_in.load(Ordering::Relaxed), 150);
|
||||
}
|
||||
|
||||
#[test]
fn test_throughput_tracking() {
    const ROUTE: &str = "route-a";
    let collector = MetricsCollector::with_retention(60);

    // Open a connection first so that the route is listed in the snapshot.
    collector.connection_opened(Some(ROUTE));

    // One transfer attributed to the route, one with no route attached.
    collector.record_bytes(1000, 2000, Some(ROUTE));
    collector.record_bytes(500, 750, None);

    // Stand in for the periodic (1Hz) sampling tick.
    collector.sample_all();

    let snapshot = collector.snapshot();

    // Global throughput covers both transfers.
    assert_eq!(
        (
            snapshot.throughput_in_bytes_per_sec,
            snapshot.throughput_out_bytes_per_sec,
        ),
        (1500, 2750),
    );

    // Per-route throughput covers only the routed transfer.
    let route_a = snapshot.routes.get(ROUTE).unwrap();
    assert_eq!(
        (
            route_a.throughput_in_bytes_per_sec,
            route_a.throughput_out_bytes_per_sec,
        ),
        (1000, 2000),
    );
}
|
||||
|
||||
#[test]
fn test_throughput_zero_before_sampling() {
    let collector = MetricsCollector::with_retention(60);
    collector.record_bytes(1000, 2000, None);

    // `sample_all()` has not run yet, so both reported rates must still be 0.
    let snapshot = collector.snapshot();
    assert_eq!(
        (
            snapshot.throughput_in_bytes_per_sec,
            snapshot.throughput_out_bytes_per_sec,
        ),
        (0, 0),
    );
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user