//! Bounded, sliding-TTL protocol detection cache with periodic re-probing and failure suppression.
//!
//! Caches the detected protocol (H1, H2, or H3) per backend endpoint and requested
//! domain (host:port + requested_host). This prevents cache oscillation when multiple
//! frontend domains share the same backend but differ in protocol support.
//!
//! ## Sliding TTL
//!
//! Each cache hit refreshes the entry's expiry timer (`last_accessed_at`). Entries
//! remain valid for up to 1 day of continuous use. Every 5 minutes, the next request
//! triggers an inline ALPN re-probe to verify the cached protocol is still correct.
//!
//! ## Upgrade signals
//!
//! - ALPN (TLS handshake) → detects H2 vs H1
//! - Alt-Svc (response header) → advertises H3
//!
//! ## Protocol transitions
//!
//! All protocol changes are logged at `info!()` level with structured fields for the
//! old protocol, the new protocol, and the reason, e.g.:
//! `Protocol transition old=H1 new=H2 reason="periodic ALPN re-probe"`
//!
//! ## Failure suppression
//!
//! When a protocol fails, `record_failure()` prevents upgrade signals from
//! re-introducing it until an escalating cooldown expires (5s → 10s → ... → 300s;
//! see the cooldown-schedule test sketch below). Within-request escalation is
//! allowed via `can_retry()` after a 5s minimum gap.
//!
//! ## Total failure eviction
//!
//! When all protocols (H3, H2, H1) fail for a backend, the cache entry is evicted
//! entirely via `evict()`, forcing a fresh probe on the next request.
//!
//! Cascading: when a lower protocol also fails, higher-protocol cooldowns are
//! reduced to 5s remaining (not cleared instantly), preventing tight retry loops.

use std::sync::Arc;
use std::time::{Duration, Instant};

use dashmap::DashMap;
use tracing::{debug, info};

/// Sliding TTL for cached protocol detection results.
/// Entries that haven't been accessed for this duration are evicted.
/// Each `get()` call refreshes the timer (sliding window).
const PROTOCOL_CACHE_TTL: Duration = Duration::from_secs(86400); // 1 day

/// Interval between inline ALPN re-probes for H1/H2 entries.
/// When a cached entry's `last_probed_at` is older than this, the next request
/// triggers an ALPN re-probe to verify the backend still speaks the same protocol.
const PROTOCOL_REPROBE_INTERVAL: Duration = Duration::from_secs(300); // 5 minutes

/// Maximum number of entries in the protocol cache.
const PROTOCOL_CACHE_MAX_ENTRIES: usize = 4096;

/// Background cleanup interval.
const PROTOCOL_CACHE_CLEANUP_INTERVAL: Duration = Duration::from_secs(60);

/// Minimum cooldown between retry attempts of a failed protocol.
const PROTOCOL_FAILURE_COOLDOWN: Duration = Duration::from_secs(5);

/// Maximum cooldown (escalation ceiling).
const PROTOCOL_FAILURE_MAX_COOLDOWN: Duration = Duration::from_secs(300);

/// Consecutive failure count at which the cooldown reaches its maximum:
/// 5s × 2^(7-1) = 320s → capped at 300s.
const PROTOCOL_FAILURE_ESCALATION_CAP: u32 = 7;

/// Detected backend protocol.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DetectedProtocol {
    H1,
    H2,
    H3,
}

impl std::fmt::Display for DetectedProtocol {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            DetectedProtocol::H1 => write!(f, "H1"),
            DetectedProtocol::H2 => write!(f, "H2"),
            DetectedProtocol::H3 => write!(f, "H3"),
        }
    }
}
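// A minimal sketch of the documented escalation schedule, expressed as a unit test.
// It exercises `escalate_cooldown` (defined further down in this file) against the
// 5s → 10s → ... → 300s progression described above; only the standard `cargo test`
// harness is assumed.
#[cfg(test)]
mod cooldown_schedule_tests {
    use super::*;
    use std::time::Duration;

    #[test]
    fn escalation_follows_documented_schedule() {
        // consecutive = 1..=8 → 5s, 10s, 20s, 40s, 80s, 160s, 300s (capped), 300s
        let expected = [5u64, 10, 20, 40, 80, 160, 300, 300];
        for (i, secs) in expected.iter().enumerate() {
            assert_eq!(
                escalate_cooldown(i as u32 + 1),
                Duration::from_secs(*secs),
                "consecutive failure #{}",
                i + 1
            );
        }
    }
}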
/// Result of a protocol cache lookup.
#[derive(Debug, Clone, Copy)]
pub struct CachedProtocol {
    pub protocol: DetectedProtocol,
    /// For H3: the port advertised by Alt-Svc (may differ from the TCP port).
    pub h3_port: Option<u16>,
    /// True if the time since the entry's `last_probed_at` exceeds `PROTOCOL_REPROBE_INTERVAL`.
    /// The caller should perform an inline ALPN re-probe and call `update_probe_result()`.
    /// Always `false` for H3 entries (H3 is discovered via Alt-Svc, not ALPN).
    pub needs_reprobe: bool,
}

/// Key for the protocol cache: (host, port, requested_host).
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct ProtocolCacheKey {
    pub host: String,
    pub port: u16,
    /// The incoming request's domain (Host header / :authority).
    /// Distinguishes protocol detection when multiple domains share the same backend.
    pub requested_host: Option<String>,
}

/// A cached protocol detection result with timestamps.
struct CachedEntry {
    protocol: DetectedProtocol,
    /// When this protocol was first detected (or last changed).
    detected_at: Instant,
    /// Last time any request used this entry (sliding-window TTL).
    last_accessed_at: Instant,
    /// Last time an ALPN re-probe was performed for this entry.
    last_probed_at: Instant,
    /// For H3: the port advertised by Alt-Svc (may differ from the TCP port).
    h3_port: Option<u16>,
}

/// Failure record for a single protocol level.
#[derive(Debug, Clone)]
struct FailureRecord {
    /// When the failure was last recorded.
    failed_at: Instant,
    /// Current cooldown duration. Escalates on consecutive failures.
    cooldown: Duration,
    /// Number of consecutive failures (for escalation).
    consecutive_failures: u32,
}

/// Per-key failure state. Tracks failures at each upgradeable protocol level.
/// H1 is never tracked (it's the protocol floor — nothing to fall back to).
#[derive(Debug, Clone, Default)]
struct FailureState {
    h2: Option<FailureRecord>,
    h3: Option<FailureRecord>,
}

impl FailureState {
    fn is_empty(&self) -> bool {
        self.h2.is_none() && self.h3.is_none()
    }

    fn all_expired(&self) -> bool {
        let h2_expired = self
            .h2
            .as_ref()
            .map(|r| r.failed_at.elapsed() >= r.cooldown)
            .unwrap_or(true);
        let h3_expired = self
            .h3
            .as_ref()
            .map(|r| r.failed_at.elapsed() >= r.cooldown)
            .unwrap_or(true);
        h2_expired && h3_expired
    }

    fn get(&self, protocol: DetectedProtocol) -> Option<&FailureRecord> {
        match protocol {
            DetectedProtocol::H2 => self.h2.as_ref(),
            DetectedProtocol::H3 => self.h3.as_ref(),
            DetectedProtocol::H1 => None,
        }
    }

    fn get_mut(&mut self, protocol: DetectedProtocol) -> &mut Option<FailureRecord> {
        match protocol {
            DetectedProtocol::H2 => &mut self.h2,
            DetectedProtocol::H3 => &mut self.h3,
            DetectedProtocol::H1 => unreachable!("H1 failures are never recorded"),
        }
    }
}

/// Snapshot of a single protocol cache entry, suitable for metrics/UI display.
#[derive(Debug, Clone)]
pub struct ProtocolCacheEntry {
    pub host: String,
    pub port: u16,
    pub domain: Option<String>,
    pub protocol: String,
    pub h3_port: Option<u16>,
    pub age_secs: u64,
    pub last_accessed_secs: u64,
    pub last_probed_secs: u64,
    pub h2_suppressed: bool,
    pub h3_suppressed: bool,
    pub h2_cooldown_remaining_secs: Option<u64>,
    pub h3_cooldown_remaining_secs: Option<u64>,
    pub h2_consecutive_failures: Option<u32>,
    pub h3_consecutive_failures: Option<u32>,
}

/// Exponential backoff: `PROTOCOL_FAILURE_COOLDOWN` × 2^(n-1), capped at
/// `PROTOCOL_FAILURE_MAX_COOLDOWN`.
fn escalate_cooldown(consecutive: u32) -> Duration {
    let base = PROTOCOL_FAILURE_COOLDOWN.as_secs();
    let exp = consecutive.saturating_sub(1).min(63) as u64;
    let secs = base.saturating_mul(1u64.checked_shl(exp as u32).unwrap_or(u64::MAX));
    Duration::from_secs(secs.min(PROTOCOL_FAILURE_MAX_COOLDOWN.as_secs()))
}
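// A small behavioral sketch of the per-key failure bookkeeping, written as a unit
// test; only the standard `cargo test` harness is assumed. It shows that a fresh H2
// failure keeps the state "not expired" while H1 is never tracked at all.
#[cfg(test)]
mod failure_state_tests {
    use super::*;
    use std::time::{Duration, Instant};

    #[test]
    fn fresh_h2_failure_is_tracked_and_not_expired() {
        let mut state = FailureState::default();
        assert!(state.is_empty());
        assert!(state.all_expired());

        state.h2 = Some(FailureRecord {
            failed_at: Instant::now(),
            cooldown: Duration::from_secs(5),
            consecutive_failures: 1,
        });

        assert!(!state.is_empty());
        assert!(!state.all_expired()); // H2 cooldown is still running
        assert!(state.get(DetectedProtocol::H2).is_some());
        assert!(state.get(DetectedProtocol::H1).is_none()); // H1 is the protocol floor
    }
}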
/// Bounded, sliding-TTL protocol detection cache with failure suppression.
///
/// Memory-bounding guarantees:
/// - Hard cap at `PROTOCOL_CACHE_MAX_ENTRIES` — cannot grow unboundedly.
/// - Sliding TTL expiry — entries age out after 1 day without access.
/// - Background cleanup task — proactively removes expired entries every 60s.
/// - `clear()` — called on route updates to discard stale detections.
/// - `Drop` — aborts the background task to prevent dangling tokio tasks.
///
/// Behavioral sketches for the public API are collected in the `#[cfg(test)]`
/// modules at the end of this file.
pub struct ProtocolCache {
    cache: Arc<DashMap<ProtocolCacheKey, CachedEntry>>,
    /// Generic protocol failure suppression map. Tracks per-protocol failure
    /// records (H2, H3) for each cache key. Used to prevent upgrade signals
    /// (ALPN, Alt-Svc) from re-introducing failed protocols.
    failures: Arc<DashMap<ProtocolCacheKey, FailureState>>,
    cleanup_handle: Option<tokio::task::JoinHandle<()>>,
}

impl ProtocolCache {
    /// Create a new protocol cache and start the background cleanup task.
    pub fn new() -> Self {
        let cache: Arc<DashMap<ProtocolCacheKey, CachedEntry>> = Arc::new(DashMap::new());
        let failures: Arc<DashMap<ProtocolCacheKey, FailureState>> = Arc::new(DashMap::new());

        let cache_clone = Arc::clone(&cache);
        let failures_clone = Arc::clone(&failures);
        let cleanup_handle = tokio::spawn(async move {
            Self::cleanup_loop(cache_clone, failures_clone).await;
        });

        Self {
            cache,
            failures,
            cleanup_handle: Some(cleanup_handle),
        }
    }

    /// Look up the cached protocol for a backend endpoint.
    ///
    /// Returns `None` if not cached or expired (the caller should probe via ALPN).
    /// On a hit, refreshes `last_accessed_at` (sliding TTL) and sets `needs_reprobe`
    /// if the entry hasn't been probed in over 5 minutes (H1/H2 only).
    pub fn get(&self, key: &ProtocolCacheKey) -> Option<CachedProtocol> {
        let mut entry = self.cache.get_mut(key)?;

        if entry.last_accessed_at.elapsed() < PROTOCOL_CACHE_TTL {
            // Refresh sliding TTL
            entry.last_accessed_at = Instant::now();

            // H3 is the ceiling — can't ALPN-probe for H3 (discovered via Alt-Svc).
            // Only H1/H2 entries trigger periodic re-probing.
            let needs_reprobe = entry.protocol != DetectedProtocol::H3
                && entry.last_probed_at.elapsed() >= PROTOCOL_REPROBE_INTERVAL;

            Some(CachedProtocol {
                protocol: entry.protocol,
                h3_port: entry.h3_port,
                needs_reprobe,
            })
        } else {
            // Expired — remove and return None to trigger a re-probe
            drop(entry); // release the DashMap ref before remove
            self.cache.remove(key);
            None
        }
    }

    /// Insert a detected protocol into the cache.
    /// Returns `false` if suppressed due to active failure suppression.
    ///
    /// **Key semantic**: only suppresses if the protocol being inserted matches
    /// a suppressed protocol. H1 inserts are NEVER suppressed — downgrades
    /// always succeed.
    pub fn insert(&self, key: ProtocolCacheKey, protocol: DetectedProtocol, reason: &str) -> bool {
        if self.is_suppressed(&key, protocol) {
            debug!(
                host = %key.host,
                port = %key.port,
                domain = ?key.requested_host,
                protocol = ?protocol,
                "Protocol cache insert suppressed — recent failure"
            );
            return false;
        }
        self.insert_internal(key, protocol, None, reason);
        true
    }

    /// Insert an H3 detection result with the Alt-Svc advertised port.
    /// Returns `false` if H3 is suppressed.
    pub fn insert_h3(&self, key: ProtocolCacheKey, h3_port: u16, reason: &str) -> bool {
        if self.is_suppressed(&key, DetectedProtocol::H3) {
            debug!(
                host = %key.host,
                port = %key.port,
                domain = ?key.requested_host,
                "H3 upgrade suppressed — recent failure"
            );
            return false;
        }
        self.insert_internal(key, DetectedProtocol::H3, Some(h3_port), reason);
        true
    }
    /// Update the cache after an inline ALPN re-probe completes.
    ///
    /// Always updates `last_probed_at`. If the protocol changed, logs the transition
    /// and updates the entry. Returns `Some(new_protocol)` if changed, `None` if unchanged.
    pub fn update_probe_result(
        &self,
        key: &ProtocolCacheKey,
        probed_protocol: DetectedProtocol,
        reason: &str,
    ) -> Option<DetectedProtocol> {
        if let Some(mut entry) = self.cache.get_mut(key) {
            let old_protocol = entry.protocol;
            entry.last_probed_at = Instant::now();
            entry.last_accessed_at = Instant::now();

            if old_protocol != probed_protocol {
                info!(
                    host = %key.host,
                    port = %key.port,
                    domain = ?key.requested_host,
                    old = %old_protocol,
                    new = %probed_protocol,
                    reason = %reason,
                    "Protocol transition"
                );
                entry.protocol = probed_protocol;
                entry.detected_at = Instant::now();
                // Clear h3_port if downgrading from H3
                if old_protocol == DetectedProtocol::H3 && probed_protocol != DetectedProtocol::H3 {
                    entry.h3_port = None;
                }
                return Some(probed_protocol);
            }

            debug!(
                host = %key.host,
                port = %key.port,
                domain = ?key.requested_host,
                protocol = %old_protocol,
                reason = %reason,
                "Re-probe confirmed — no protocol change"
            );
            None
        } else {
            // The entry was evicted between the get() and the probe completing.
            // Insert as a fresh entry.
            self.insert_internal(key.clone(), probed_protocol, None, reason);
            Some(probed_protocol)
        }
    }

    /// Record a protocol failure. Future `insert()` calls for this protocol
    /// will be suppressed until the escalating cooldown expires.
    ///
    /// Cooldown escalation: 5s → 10s → 20s → 40s → 80s → 160s → 300s.
    /// The consecutive counter resets if the previous failure is older than 2× its cooldown.
    ///
    /// Cascading: when H2 fails, the H3 cooldown is reduced to 5s remaining.
    /// H1 failures are ignored (H1 is the protocol floor).
    pub fn record_failure(&self, key: ProtocolCacheKey, protocol: DetectedProtocol) {
        if protocol == DetectedProtocol::H1 {
            return; // H1 is the floor — nothing to suppress
        }

        let mut entry = self.failures.entry(key.clone()).or_default();
        let record = entry.get_mut(protocol);

        let (consecutive, new_cooldown) = match record {
            Some(existing)
                if existing.failed_at.elapsed() < existing.cooldown.saturating_mul(2) =>
            {
                // Still within the "recent" window — escalate
                let c = existing
                    .consecutive_failures
                    .saturating_add(1)
                    .min(PROTOCOL_FAILURE_ESCALATION_CAP);
                (c, escalate_cooldown(c))
            }
            _ => {
                // First failure, or an old failure that expired long ago — reset
                (1, PROTOCOL_FAILURE_COOLDOWN)
            }
        };

        *record = Some(FailureRecord {
            failed_at: Instant::now(),
            cooldown: new_cooldown,
            consecutive_failures: consecutive,
        });

        // Cascading: when H2 fails, reduce the H3 cooldown to 5s remaining
        if protocol == DetectedProtocol::H2 {
            Self::reduce_cooldown_to(entry.h3.as_mut(), PROTOCOL_FAILURE_COOLDOWN);
        }

        info!(
            host = %key.host,
            port = %key.port,
            domain = ?key.requested_host,
            protocol = ?protocol,
            consecutive = consecutive,
            cooldown_secs = new_cooldown.as_secs(),
            "Protocol failure recorded — suppressing for {:?}",
            new_cooldown
        );
    }

    /// Check whether a protocol is currently suppressed for the given key.
    /// Returns `true` if the protocol failed within its cooldown period.
    /// H1 is never suppressed.
    pub fn is_suppressed(&self, key: &ProtocolCacheKey, protocol: DetectedProtocol) -> bool {
        if protocol == DetectedProtocol::H1 {
            return false;
        }
        self.failures
            .get(key)
            .and_then(|entry| {
                entry
                    .get(protocol)
                    .map(|r| r.failed_at.elapsed() < r.cooldown)
            })
            .unwrap_or(false)
    }
    /// Check whether a protocol can be retried (for within-request escalation).
    /// Returns `true` if there is no failure record OR if ≥5s have passed since
    /// the last attempt. More permissive than `is_suppressed`.
    pub fn can_retry(&self, key: &ProtocolCacheKey, protocol: DetectedProtocol) -> bool {
        if protocol == DetectedProtocol::H1 {
            return true;
        }
        match self.failures.get(key) {
            Some(entry) => match entry.get(protocol) {
                Some(r) => r.failed_at.elapsed() >= PROTOCOL_FAILURE_COOLDOWN,
                None => true, // no failure record
            },
            None => true,
        }
    }

    /// Record a retry attempt WITHOUT escalating the cooldown.
    /// Resets the `failed_at` timestamp to prevent rapid retries (5s gate).
    /// Called before an escalation attempt. If the attempt fails,
    /// `record_failure` should be called afterward with proper escalation.
    pub fn record_retry_attempt(&self, key: &ProtocolCacheKey, protocol: DetectedProtocol) {
        if protocol == DetectedProtocol::H1 {
            return;
        }
        if let Some(mut entry) = self.failures.get_mut(key) {
            if let Some(r) = entry.get_mut(protocol) {
                r.failed_at = Instant::now();
            }
        }
    }

    /// Clear the failure record for a protocol (it recovered).
    /// Called when an escalation retry succeeds.
    pub fn clear_failure(&self, key: &ProtocolCacheKey, protocol: DetectedProtocol) {
        if protocol == DetectedProtocol::H1 {
            return;
        }
        if let Some(mut entry) = self.failures.get_mut(key) {
            *entry.get_mut(protocol) = None;
            if entry.is_empty() {
                drop(entry);
                self.failures.remove(key);
            }
        }
    }

    /// Evict a cache entry entirely. Called when all protocol probes (H3, H2, H1)
    /// have failed for a backend.
    pub fn evict(&self, key: &ProtocolCacheKey) {
        self.cache.remove(key);
        self.failures.remove(key);
        info!(
            host = %key.host,
            port = %key.port,
            domain = ?key.requested_host,
            "Cache entry evicted — all protocols failed"
        );
    }

    /// Clear all entries. Called on route updates to discard stale detections.
    pub fn clear(&self) {
        self.cache.clear();
        self.failures.clear();
    }

    /// Snapshot all non-expired cache entries for metrics/UI display.
    pub fn snapshot(&self) -> Vec<ProtocolCacheEntry> {
        self.cache
            .iter()
            .filter(|entry| entry.value().last_accessed_at.elapsed() < PROTOCOL_CACHE_TTL)
            .map(|entry| {
                let key = entry.key();
                let val = entry.value();
                let failure_info = self.failures.get(key);
                let (h2_sup, h2_cd, h2_cons) =
                    Self::suppression_info(failure_info.as_deref().and_then(|f| f.h2.as_ref()));
                let (h3_sup, h3_cd, h3_cons) =
                    Self::suppression_info(failure_info.as_deref().and_then(|f| f.h3.as_ref()));

                ProtocolCacheEntry {
                    host: key.host.clone(),
                    port: key.port,
                    domain: key.requested_host.clone(),
                    protocol: match val.protocol {
                        DetectedProtocol::H1 => "h1".to_string(),
                        DetectedProtocol::H2 => "h2".to_string(),
                        DetectedProtocol::H3 => "h3".to_string(),
                    },
                    h3_port: val.h3_port,
                    age_secs: val.detected_at.elapsed().as_secs(),
                    last_accessed_secs: val.last_accessed_at.elapsed().as_secs(),
                    last_probed_secs: val.last_probed_at.elapsed().as_secs(),
                    h2_suppressed: h2_sup,
                    h3_suppressed: h3_sup,
                    h2_cooldown_remaining_secs: h2_cd,
                    h3_cooldown_remaining_secs: h3_cd,
                    h2_consecutive_failures: h2_cons,
                    h3_consecutive_failures: h3_cons,
                }
            })
            .collect()
    }

    // --- Internal helpers ---
    /// Insert a protocol detection result with an optional H3 port.
    /// Logs protocol transitions when overwriting an existing entry.
    /// No suppression check — callers must check before calling.
    fn insert_internal(
        &self,
        key: ProtocolCacheKey,
        protocol: DetectedProtocol,
        h3_port: Option<u16>,
        reason: &str,
    ) {
        // Check for an existing entry to log protocol transitions
        if let Some(existing) = self.cache.get(&key) {
            if existing.protocol != protocol {
                info!(
                    host = %key.host,
                    port = %key.port,
                    domain = ?key.requested_host,
                    old = %existing.protocol,
                    new = %protocol,
                    reason = %reason,
                    "Protocol transition"
                );
            }
            drop(existing);
        }

        // Evict the oldest entry if at capacity
        if self.cache.len() >= PROTOCOL_CACHE_MAX_ENTRIES && !self.cache.contains_key(&key) {
            let oldest = self
                .cache
                .iter()
                .min_by_key(|entry| entry.value().last_accessed_at)
                .map(|entry| entry.key().clone());
            if let Some(oldest_key) = oldest {
                self.cache.remove(&oldest_key);
            }
        }

        let now = Instant::now();
        self.cache.insert(
            key,
            CachedEntry {
                protocol,
                detected_at: now,
                last_accessed_at: now,
                last_probed_at: now,
                h3_port,
            },
        );
    }

    /// Reduce a failure record's remaining cooldown to `target`, if it currently
    /// has MORE than `target` remaining. Never increases the cooldown.
    fn reduce_cooldown_to(record: Option<&mut FailureRecord>, target: Duration) {
        if let Some(r) = record {
            let elapsed = r.failed_at.elapsed();
            if elapsed < r.cooldown {
                let remaining = r.cooldown - elapsed;
                if remaining > target {
                    // Shrink the cooldown so it expires `target` from now
                    r.cooldown = elapsed + target;
                }
            }
        }
    }

    /// Extract suppression info from a failure record for metrics.
    fn suppression_info(record: Option<&FailureRecord>) -> (bool, Option<u64>, Option<u32>) {
        match record {
            Some(r) => {
                let elapsed = r.failed_at.elapsed();
                let suppressed = elapsed < r.cooldown;
                let remaining = if suppressed {
                    Some((r.cooldown - elapsed).as_secs())
                } else {
                    None
                };
                (suppressed, remaining, Some(r.consecutive_failures))
            }
            None => (false, None, None),
        }
    }

    /// Background cleanup loop.
    async fn cleanup_loop(
        cache: Arc<DashMap<ProtocolCacheKey, CachedEntry>>,
        failures: Arc<DashMap<ProtocolCacheKey, FailureState>>,
    ) {
        let mut interval = tokio::time::interval(PROTOCOL_CACHE_CLEANUP_INTERVAL);
        loop {
            interval.tick().await;

            // Clean expired cache entries (sliding TTL based on last_accessed_at)
            let expired: Vec<ProtocolCacheKey> = cache
                .iter()
                .filter(|entry| entry.value().last_accessed_at.elapsed() >= PROTOCOL_CACHE_TTL)
                .map(|entry| entry.key().clone())
                .collect();
            if !expired.is_empty() {
                debug!(
                    "Protocol cache cleanup: removing {} expired entries",
                    expired.len()
                );
                for key in expired {
                    cache.remove(&key);
                }
            }

            // Clean fully-expired failure entries
            let expired_failures: Vec<ProtocolCacheKey> = failures
                .iter()
                .filter(|entry| entry.value().all_expired())
                .map(|entry| entry.key().clone())
                .collect();
            if !expired_failures.is_empty() {
                debug!(
                    "Protocol cache cleanup: removing {} expired failure entries",
                    expired_failures.len()
                );
                for key in expired_failures {
                    failures.remove(&key);
                }
            }

            // Safety net: cap the failures map at 2× the max entry count
            if failures.len() > PROTOCOL_CACHE_MAX_ENTRIES * 2 {
                let oldest: Vec<ProtocolCacheKey> = failures
                    .iter()
                    .filter(|e| e.value().all_expired())
                    .map(|e| e.key().clone())
                    .take(failures.len() - PROTOCOL_CACHE_MAX_ENTRIES)
                    .collect();
                for key in oldest {
                    failures.remove(&key);
                }
            }
        }
    }
}

impl Drop for ProtocolCache {
    fn drop(&mut self) {
        if let Some(handle) = self.cleanup_handle.take() {
            handle.abort();
        }
    }
}
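// A usage sketch for insert()/get(), written as a test. It assumes the crate's tokio
// dependency enables the `rt` and `macros` features needed by `#[tokio::test]`
// (a runtime is required because `ProtocolCache::new()` spawns the cleanup task).
// The host and domain names are hypothetical, for illustration only.
#[cfg(test)]
mod cache_lookup_tests {
    use super::*;

    #[tokio::test]
    async fn insert_then_get_is_a_fresh_hit_without_reprobe() {
        let cache = ProtocolCache::new();
        let key = ProtocolCacheKey {
            host: "backend.internal".to_string(),
            port: 8443,
            requested_host: Some("app.example.com".to_string()),
        };

        assert!(cache.insert(key.clone(), DetectedProtocol::H2, "alpn"));

        let hit = cache.get(&key).expect("fresh entry should be cached");
        assert_eq!(hit.protocol, DetectedProtocol::H2);
        assert!(hit.h3_port.is_none());
        assert!(!hit.needs_reprobe); // probed just now, well under the 5-minute interval
    }
}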
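// A sketch of the suppression semantics documented on `record_failure()` and
// `is_suppressed()`: a failed H2 is suppressed and cannot be re-inserted, while an
// H1 downgrade always succeeds. Same `#[tokio::test]` assumption as above.
#[cfg(test)]
mod failure_suppression_tests {
    use super::*;

    #[tokio::test]
    async fn failed_h2_is_suppressed_but_h1_downgrade_succeeds() {
        let cache = ProtocolCache::new();
        let key = ProtocolCacheKey {
            host: "backend.internal".to_string(),
            port: 8443,
            requested_host: None,
        };

        cache.record_failure(key.clone(), DetectedProtocol::H2);

        assert!(cache.is_suppressed(&key, DetectedProtocol::H2));
        assert!(!cache.is_suppressed(&key, DetectedProtocol::H1)); // never suppressed
        assert!(!cache.can_retry(&key, DetectedProtocol::H2)); // < 5s since the failure

        assert!(!cache.insert(key.clone(), DetectedProtocol::H2, "alpn")); // suppressed
        assert!(cache.insert(key, DetectedProtocol::H1, "fallback")); // downgrade allowed
    }
}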
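// A sketch of the recovery path: `clear_failure()` lifts suppression so an Alt-Svc
// upgrade can be cached again, and `evict()` drops the entry entirely.
// Same `#[tokio::test]` assumption as above.
#[cfg(test)]
mod recovery_tests {
    use super::*;

    #[tokio::test]
    async fn clear_failure_then_evict() {
        let cache = ProtocolCache::new();
        let key = ProtocolCacheKey {
            host: "backend.internal".to_string(),
            port: 443,
            requested_host: None,
        };

        cache.record_failure(key.clone(), DetectedProtocol::H3);
        assert!(!cache.insert_h3(key.clone(), 443, "alt-svc")); // suppressed

        cache.clear_failure(&key, DetectedProtocol::H3);
        assert!(!cache.is_suppressed(&key, DetectedProtocol::H3));
        assert!(cache.insert_h3(key.clone(), 443, "alt-svc")); // recovered

        cache.evict(&key);
        assert!(cache.get(&key).is_none());
    }
}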
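// A sketch of the private helpers: cascading reduction only ever shrinks the
// remaining cooldown, and `suppression_info` reports the remaining seconds.
// These are pure functions, so only the standard `cargo test` harness is assumed.
#[cfg(test)]
mod internal_helper_tests {
    use super::*;
    use std::time::{Duration, Instant};

    #[test]
    fn reduce_cooldown_only_shrinks_and_is_reported() {
        let mut record = Some(FailureRecord {
            failed_at: Instant::now(),
            cooldown: Duration::from_secs(300),
            consecutive_failures: 7,
        });

        ProtocolCache::reduce_cooldown_to(record.as_mut(), Duration::from_secs(5));
        assert!(record.as_ref().unwrap().cooldown <= Duration::from_secs(6)); // ~5s remaining now

        // Reducing again with a larger target must not increase the cooldown.
        ProtocolCache::reduce_cooldown_to(record.as_mut(), Duration::from_secs(60));
        assert!(record.as_ref().unwrap().cooldown <= Duration::from_secs(6));

        let (suppressed, remaining, consecutive) =
            ProtocolCache::suppression_info(record.as_ref());
        assert!(suppressed);
        assert!(remaining.unwrap_or(0) <= 5);
        assert_eq!(consecutive, Some(7));
    }
}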