fix(proxy): close connection buildup vectors in HTTP idle, WebSocket, socket relay, and TLS forwarding paths

- Add HTTP keep-alive idle timeout (60s default) with periodic watchdog that
  skips active requests (panic-safe via RAII ActiveRequestGuard)
- Make WebSocket inactivity/max-lifetime timeouts configurable from ConnectionConfig
  instead of hardcoded 1h/24h
- Replace bare copy_bidirectional in socket handler relay with timeout+cancel-aware
  split forwarding (inactivity, max lifetime, graceful shutdown)
- Add CancellationToken to forward_bidirectional_split_with_timeouts so TLS-terminated
  TCP connections respond to graceful shutdown
- Fix graceful_stop to actually abort listener tasks that exceed the shutdown deadline
  (previously they detached and ran forever)
- Add 10s metadata parsing timeout on TS socket-handler-server to prevent stuck sockets
This commit is contained in:
2026-02-26 21:29:19 +00:00
parent ef060d5e79
commit 8db621657f
3 changed files with 215 additions and 47 deletions

View File

@@ -34,12 +34,35 @@ use crate::upstream_selector::UpstreamSelector;
/// Default timeout for establishing an upstream connection: 30 seconds.
const DEFAULT_CONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);

/// Default HTTP keep-alive idle timeout: 60 seconds.
/// A connection on which no new request arrives within this window is closed.
const DEFAULT_HTTP_IDLE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(60);

/// Default WebSocket inactivity timeout: 1 hour, spelled out as minutes × seconds.
const DEFAULT_WS_INACTIVITY_TIMEOUT: std::time::Duration =
    std::time::Duration::from_secs(60 * 60);

/// Default WebSocket maximum lifetime: 24 hours, spelled out as hours × minutes × seconds.
const DEFAULT_WS_MAX_LIFETIME: std::time::Duration =
    std::time::Duration::from_secs(24 * 60 * 60);
/// RAII guard that accounts for one in-flight request on a shared counter.
///
/// `ActiveRequestGuard::new` increments the counter and the `Drop` impl
/// decrements it again, so the count is restored on every exit path,
/// including unwinding out of a panicking request handler.
///
/// The value must be bound to a variable for as long as the request is in
/// progress; an unbound guard is dropped at the end of the statement and the
/// increment is undone immediately.
#[must_use = "an unbound guard is dropped immediately, undoing the increment"]
struct ActiveRequestGuard {
    /// Shared tally of requests currently in progress.
    counter: Arc<AtomicU64>,
}

impl ActiveRequestGuard {
    /// Increments `counter` by one and returns a guard that decrements it on drop.
    fn new(counter: Arc<AtomicU64>) -> Self {
        counter.fetch_add(1, Ordering::Relaxed);
        Self { counter }
    }
}

impl Drop for ActiveRequestGuard {
    /// Undoes the increment performed in `new`.
    fn drop(&mut self) {
        self.counter.fetch_sub(1, Ordering::Relaxed);
    }
}
/// Backend stream that can be either plain TCP or TLS-wrapped.
/// Used for `terminate-and-reencrypt` mode where the backend requires TLS.
pub(crate) enum BackendStream {
@@ -125,6 +148,12 @@ pub struct HttpProxyService {
backend_tls_config: Arc<rustls::ClientConfig>,
/// Backend connection pool for reusing keep-alive connections.
connection_pool: Arc<crate::connection_pool::ConnectionPool>,
/// HTTP keep-alive idle timeout: close connection if no new request arrives within this duration.
http_idle_timeout: std::time::Duration,
/// WebSocket inactivity timeout (no data in either direction).
ws_inactivity_timeout: std::time::Duration,
/// WebSocket maximum connection lifetime.
ws_max_lifetime: std::time::Duration,
}
impl HttpProxyService {
@@ -139,6 +168,9 @@ impl HttpProxyService {
regex_cache: DashMap::new(),
backend_tls_config: Self::default_backend_tls_config(),
connection_pool: Arc::new(crate::connection_pool::ConnectionPool::new()),
http_idle_timeout: DEFAULT_HTTP_IDLE_TIMEOUT,
ws_inactivity_timeout: DEFAULT_WS_INACTIVITY_TIMEOUT,
ws_max_lifetime: DEFAULT_WS_MAX_LIFETIME,
}
}
@@ -158,9 +190,25 @@ impl HttpProxyService {
regex_cache: DashMap::new(),
backend_tls_config: Self::default_backend_tls_config(),
connection_pool: Arc::new(crate::connection_pool::ConnectionPool::new()),
http_idle_timeout: DEFAULT_HTTP_IDLE_TIMEOUT,
ws_inactivity_timeout: DEFAULT_WS_INACTIVITY_TIMEOUT,
ws_max_lifetime: DEFAULT_WS_MAX_LIFETIME,
}
}
/// Overrides all three connection timeouts in a single call.
///
/// * `http_idle_timeout` — HTTP keep-alive idle timeout.
/// * `ws_inactivity_timeout` — WebSocket inactivity timeout.
/// * `ws_max_lifetime` — WebSocket maximum connection lifetime.
pub fn set_connection_timeouts(
    &mut self,
    http_idle_timeout: std::time::Duration,
    ws_inactivity_timeout: std::time::Duration,
    ws_max_lifetime: std::time::Duration,
) {
    (
        self.http_idle_timeout,
        self.ws_inactivity_timeout,
        self.ws_max_lifetime,
    ) = (http_idle_timeout, ws_inactivity_timeout, ws_max_lifetime);
}
/// Set the shared backend TLS config (enables session resumption).
/// Call this after construction to inject the shared config from tls_handler.
pub fn set_backend_tls_config(&mut self, config: Arc<rustls::ClientConfig>) {
@@ -192,6 +240,10 @@ impl HttpProxyService {
/// based on ALPN negotiation (TLS) or connection preface (h2c).
/// Supports HTTP/1.1 upgrades (WebSocket) and HTTP/2 CONNECT.
/// Responds to graceful shutdown via the cancel token.
///
/// An idle watchdog closes the connection if no new HTTP request arrives
/// within `http_idle_timeout` (default 60s). This prevents keep-alive
/// connections from accumulating indefinitely.
pub async fn handle_io<I>(
self: Arc<Self>,
stream: I,
@@ -204,13 +256,34 @@ impl HttpProxyService {
{
let io = TokioIo::new(stream);
// Capture timeouts before `self` is moved into the service closure.
let idle_timeout = self.http_idle_timeout;
// Activity tracker: updated at the START and END of each request.
// The idle watchdog checks this to determine if the connection is idle
// (no request in progress and none started recently).
let last_activity = Arc::new(AtomicU64::new(0));
let active_requests = Arc::new(AtomicU64::new(0));
let start = std::time::Instant::now();
let la_inner = Arc::clone(&last_activity);
let ar_inner = Arc::clone(&active_requests);
let cancel_inner = cancel.clone();
let service = hyper::service::service_fn(move |req: Request<Incoming>| {
// Mark request start — RAII guard decrements on drop (panic-safe)
la_inner.store(start.elapsed().as_millis() as u64, Ordering::Relaxed);
let req_guard = ActiveRequestGuard::new(Arc::clone(&ar_inner));
let svc = Arc::clone(&self);
let peer = peer_addr;
let cn = cancel_inner.clone();
let la = Arc::clone(&la_inner);
let st = start;
async move {
svc.handle_request(req, peer, port, cn).await
let result = svc.handle_request(req, peer, port, cn).await;
// Mark request end — update activity timestamp before guard drops
la.store(st.elapsed().as_millis() as u64, Ordering::Relaxed);
drop(req_guard); // Explicitly drop to decrement active_requests
result
}
});
@@ -221,7 +294,7 @@ impl HttpProxyService {
// Pin on the heap — auto::UpgradeableConnection is !Unpin
let mut conn = Box::pin(conn);
// Use select to support graceful shutdown via cancellation token
// Use select to support graceful shutdown, cancellation, and idle timeout
tokio::select! {
result = conn.as_mut() => {
if let Err(e) = result {
@@ -235,6 +308,37 @@ impl HttpProxyService {
debug!("HTTP connection error during shutdown from {}: {}", peer_addr, e);
}
}
_ = async {
// Idle watchdog: check every 5s whether the connection has been idle
// (no active requests AND no activity for idle_timeout).
// This avoids killing long-running requests or upgraded connections.
let check_interval = std::time::Duration::from_secs(5);
let mut last_seen = 0u64;
loop {
tokio::time::sleep(check_interval).await;
// Never close while a request is in progress
if active_requests.load(Ordering::Relaxed) > 0 {
last_seen = last_activity.load(Ordering::Relaxed);
continue;
}
let current = last_activity.load(Ordering::Relaxed);
if current == last_seen {
// No new activity since last check
let elapsed_since_activity = start.elapsed().as_millis() as u64 - current;
if elapsed_since_activity >= idle_timeout.as_millis() as u64 {
return;
}
}
last_seen = current;
}
} => {
debug!("HTTP connection idle timeout ({}s) from {}", idle_timeout.as_secs(), peer_addr);
conn.as_mut().graceful_shutdown();
// Give any in-flight work 5s to drain after graceful shutdown
let _ = tokio::time::timeout(std::time::Duration::from_secs(5), conn).await;
}
}
}
@@ -1022,6 +1126,8 @@ impl HttpProxyService {
let source_ip_owned = source_ip.to_string();
let upstream_selector = self.upstream_selector.clone();
let upstream_key_owned = upstream_key.to_string();
let ws_inactivity_timeout = self.ws_inactivity_timeout;
let ws_max_lifetime = self.ws_max_lifetime;
tokio::spawn(async move {
let client_upgraded = match on_client_upgrade.await {
@@ -1084,8 +1190,8 @@ impl HttpProxyService {
let la_watch = Arc::clone(&last_activity);
let c2u_handle = c2u.abort_handle();
let u2c_handle = u2c.abort_handle();
let inactivity_timeout = DEFAULT_WS_INACTIVITY_TIMEOUT;
let max_lifetime = DEFAULT_WS_MAX_LIFETIME;
let inactivity_timeout = ws_inactivity_timeout;
let max_lifetime = ws_max_lifetime;
let watchdog = tokio::spawn(async move {
let check_interval = std::time::Duration::from_secs(5);
@@ -1391,6 +1497,9 @@ impl Default for HttpProxyService {
regex_cache: DashMap::new(),
backend_tls_config: Self::default_backend_tls_config(),
connection_pool: Arc::new(crate::connection_pool::ConnectionPool::new()),
http_idle_timeout: DEFAULT_HTTP_IDLE_TIMEOUT,
ws_inactivity_timeout: DEFAULT_WS_INACTIVITY_TIMEOUT,
ws_max_lifetime: DEFAULT_WS_MAX_LIFETIME,
}
}
}

View File

@@ -174,6 +174,11 @@ impl TcpListenerManager {
std::time::Duration::from_millis(conn_config.connection_timeout_ms),
);
http_proxy_svc.set_backend_tls_config(tls_handler::shared_backend_tls_config());
http_proxy_svc.set_connection_timeouts(
std::time::Duration::from_millis(conn_config.socket_timeout_ms),
std::time::Duration::from_millis(conn_config.socket_timeout_ms),
std::time::Duration::from_millis(conn_config.max_connection_lifetime_ms),
);
let http_proxy = Arc::new(http_proxy_svc);
let conn_tracker = Arc::new(ConnectionTracker::new(
conn_config.max_connections_per_ip,
@@ -204,6 +209,11 @@ impl TcpListenerManager {
std::time::Duration::from_millis(conn_config.connection_timeout_ms),
);
http_proxy_svc.set_backend_tls_config(tls_handler::shared_backend_tls_config());
http_proxy_svc.set_connection_timeouts(
std::time::Duration::from_millis(conn_config.socket_timeout_ms),
std::time::Duration::from_millis(conn_config.socket_timeout_ms),
std::time::Duration::from_millis(conn_config.max_connection_lifetime_ms),
);
let http_proxy = Arc::new(http_proxy_svc);
let conn_tracker = Arc::new(ConnectionTracker::new(
conn_config.max_connections_per_ip,
@@ -232,6 +242,22 @@ impl TcpListenerManager {
config.connection_rate_limit_per_minute,
));
self.conn_semaphore = Arc::new(tokio::sync::Semaphore::new(config.max_connections as usize));
// Rebuild http_proxy with updated timeouts
let rm = self.route_manager.load_full();
let mut http_proxy_svc = HttpProxyService::with_connect_timeout(
rm,
Arc::clone(&self.metrics),
std::time::Duration::from_millis(config.connection_timeout_ms),
);
http_proxy_svc.set_backend_tls_config(tls_handler::shared_backend_tls_config());
http_proxy_svc.set_connection_timeouts(
std::time::Duration::from_millis(config.socket_timeout_ms),
std::time::Duration::from_millis(config.socket_timeout_ms),
std::time::Duration::from_millis(config.max_connection_lifetime_ms),
);
self.http_proxy = Arc::new(http_proxy_svc);
self.conn_config = Arc::new(config);
}
@@ -336,13 +362,15 @@ impl TcpListenerManager {
for (port, handle) in self.listeners.drain() {
let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
let abort_handle = handle.abort_handle();
if remaining.is_zero() {
handle.abort();
abort_handle.abort();
warn!("Force-stopped listener on port {} (timeout exceeded)", port);
} else {
match tokio::time::timeout(remaining, handle).await {
Ok(_) => info!("Listener on port {} stopped gracefully", port),
Err(_) => {
abort_handle.abort();
warn!("Listener on port {} did not stop in time, aborting", port);
}
}
@@ -791,7 +819,8 @@ impl TcpListenerManager {
stream, n, port, peer_addr,
&route_match, domain.as_deref(), is_tls,
&relay_socket_path,
&metrics, route_id,
Arc::clone(&metrics), route_id,
&conn_config, cancel.clone(),
).await;
} else {
debug!("Socket-handler route matched but no relay path configured");
@@ -964,7 +993,7 @@ impl TcpListenerManager {
let (_bytes_in, _bytes_out) = Self::forward_bidirectional_split_with_timeouts(
tls_read, tls_write, backend_read, backend_write,
inactivity_timeout, max_lifetime,
inactivity_timeout, max_lifetime, cancel.clone(),
Some(forwarder::ForwardMetricsCtx {
collector: Arc::clone(&metrics),
route_id: route_id.map(|s| s.to_string()),
@@ -1023,7 +1052,7 @@ impl TcpListenerManager {
Self::handle_tls_reencrypt_tunnel(
buf_stream, &target_host, target_port,
peer_addr, Arc::clone(&metrics), route_id,
&conn_config,
&conn_config, cancel.clone(),
).await?;
}
Ok(())
@@ -1100,8 +1129,10 @@ impl TcpListenerManager {
domain: Option<&str>,
is_tls: bool,
relay_path: &str,
metrics: &MetricsCollector,
metrics: Arc<MetricsCollector>,
route_id: Option<&str>,
conn_config: &ConnectionConfig,
cancel: CancellationToken,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::UnixStream;
@@ -1141,27 +1172,34 @@ impl TcpListenerManager {
// Forward initial data to the Unix socket
unix_stream.write_all(&initial_buf).await?;
// Bidirectional relay between TCP client and Unix socket handler
// Bidirectional relay with inactivity timeout, max lifetime, and cancellation.
// Split both streams and use the same watchdog pattern as other forwarding paths.
let initial_len = initial_buf.len() as u64;
match tokio::io::copy_bidirectional(&mut stream, &mut unix_stream).await {
Ok((c2s, s2c)) => {
// Include initial data bytes that were forwarded before copy_bidirectional
let total_in = c2s + initial_len;
debug!("Socket handler relay complete for {}: {} bytes in, {} bytes out",
route_key, total_in, s2c);
let ip = peer_addr.ip().to_string();
metrics.record_bytes(total_in, s2c, route_id, Some(&ip));
}
Err(e) => {
// Still record the initial data even on error
if initial_len > 0 {
let ip = peer_addr.ip().to_string();
metrics.record_bytes(initial_len, 0, route_id, Some(&ip));
}
debug!("Socket handler relay ended for {}: {}", route_key, e);
}
let inactivity_timeout = std::time::Duration::from_millis(conn_config.socket_timeout_ms);
let max_lifetime = std::time::Duration::from_millis(conn_config.max_connection_lifetime_ms);
let (tcp_read, tcp_write) = stream.into_split();
let (unix_read, unix_write) = unix_stream.into_split();
let ip_str = peer_addr.ip().to_string();
let (_bytes_in, _bytes_out) = Self::forward_bidirectional_split_with_timeouts(
tcp_read, tcp_write, unix_read, unix_write,
inactivity_timeout, max_lifetime, cancel,
Some(forwarder::ForwardMetricsCtx {
collector: Arc::clone(&metrics),
route_id: route_id.map(|s| s.to_string()),
source_ip: Some(ip_str.clone()),
}),
).await;
// Include the initial data that was forwarded before the bidirectional relay
if initial_len > 0 {
metrics.record_bytes(initial_len, 0, route_id, Some(&ip_str));
}
debug!("Socket handler relay complete for {}: {} bytes in, {} bytes out",
route_key, _bytes_in + initial_len, _bytes_out);
Ok(())
}
@@ -1176,6 +1214,7 @@ impl TcpListenerManager {
metrics: Arc<MetricsCollector>,
route_id: Option<&str>,
conn_config: &ConnectionConfig,
cancel: CancellationToken,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
// Connect to backend over TLS with timeout
let backend_tls = match tokio::time::timeout(
@@ -1220,7 +1259,7 @@ impl TcpListenerManager {
let (_bytes_in, _bytes_out) = Self::forward_bidirectional_split_with_timeouts(
client_read, client_write, backend_read, backend_write,
inactivity_timeout, max_lifetime,
inactivity_timeout, max_lifetime, cancel,
Some(forwarder::ForwardMetricsCtx {
collector: metrics,
route_id: route_id.map(|s| s.to_string()),
@@ -1295,6 +1334,7 @@ impl TcpListenerManager {
mut backend_write: W2,
inactivity_timeout: std::time::Duration,
max_lifetime: std::time::Duration,
cancel: CancellationToken,
metrics: Option<forwarder::ForwardMetricsCtx>,
) -> (u64, u64)
where
@@ -1362,7 +1402,7 @@ impl TcpListenerManager {
total
});
// Watchdog task: check for inactivity and max lifetime
// Watchdog task: check for inactivity, max lifetime, and cancellation
let la_watch = Arc::clone(&last_activity);
let c2b_handle = c2b.abort_handle();
let b2c_handle = b2c.abort_handle();
@@ -1370,29 +1410,37 @@ impl TcpListenerManager {
let check_interval = std::time::Duration::from_secs(5);
let mut last_seen = 0u64;
loop {
tokio::time::sleep(check_interval).await;
// Check max lifetime
if start.elapsed() >= max_lifetime {
debug!("Connection exceeded max lifetime, closing");
c2b_handle.abort();
b2c_handle.abort();
break;
}
// Check inactivity
let current = la_watch.load(Ordering::Relaxed);
if current == last_seen {
// No activity since last check
let elapsed_since_activity = start.elapsed().as_millis() as u64 - current;
if elapsed_since_activity >= inactivity_timeout.as_millis() as u64 {
debug!("Connection inactive for {}ms, closing", elapsed_since_activity);
tokio::select! {
_ = cancel.cancelled() => {
debug!("Split-stream connection cancelled by shutdown");
c2b_handle.abort();
b2c_handle.abort();
break;
}
_ = tokio::time::sleep(check_interval) => {
// Check max lifetime
if start.elapsed() >= max_lifetime {
debug!("Connection exceeded max lifetime, closing");
c2b_handle.abort();
b2c_handle.abort();
break;
}
// Check inactivity
let current = la_watch.load(Ordering::Relaxed);
if current == last_seen {
// No activity since last check
let elapsed_since_activity = start.elapsed().as_millis() as u64 - current;
if elapsed_since_activity >= inactivity_timeout.as_millis() as u64 {
debug!("Connection inactive for {}ms, closing", elapsed_since_activity);
c2b_handle.abort();
b2c_handle.abort();
break;
}
}
last_seen = current;
}
}
last_seen = current;
}
});