fix(proxy): improve proxy robustness: add connect timeouts, graceful shutdown, WebSocket watchdog, and metrics guard

2026-02-13 16:57:46 +00:00
parent 07e464fdac
commit a8f8946a4d
6 changed files with 199 additions and 39 deletions
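A rough usage sketch of the new surface (the route_manager/metrics values and the Ctrl-C wiring are illustrative, not part of this change):

    use std::sync::Arc;
    use std::time::Duration;
    use tokio_util::sync::CancellationToken;

    // Override the 30 s default upstream connect timeout.
    let proxy = Arc::new(HttpProxyService::with_connect_timeout(
        route_manager,
        metrics,
        Duration::from_secs(10),
    ));

    // One token shared by every connection; cancelling it starts a graceful drain.
    let shutdown = CancellationToken::new();
    let trigger = shutdown.clone();
    tokio::spawn(async move {
        // Assumes tokio's "signal" feature is enabled.
        let _ = tokio::signal::ctrl_c().await;
        trigger.cancel();
    });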


@@ -22,3 +22,4 @@ thiserror = { workspace = true }
anyhow = { workspace = true }
arc-swap = { workspace = true }
dashmap = { workspace = true }
tokio-util = { workspace = true }


@@ -6,6 +6,7 @@
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use bytes::Bytes;
use http_body_util::{BodyExt, Full, combinators::BoxBody};
@@ -14,6 +15,7 @@ use hyper::{Request, Response, StatusCode};
use hyper_util::rt::TokioIo;
use regex::Regex;
use tokio::net::TcpStream;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, warn};
use rustproxy_routing::RouteManager;
@@ -23,11 +25,22 @@ use crate::request_filter::RequestFilter;
use crate::response_filter::ResponseFilter;
use crate::upstream_selector::UpstreamSelector;
/// Default upstream connect timeout (30 seconds).
const DEFAULT_CONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30);
/// Default WebSocket inactivity timeout (1 hour).
const DEFAULT_WS_INACTIVITY_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(3600);
/// Default WebSocket max lifetime (24 hours).
const DEFAULT_WS_MAX_LIFETIME: std::time::Duration = std::time::Duration::from_secs(86400);
/// HTTP proxy service that processes HTTP traffic.
pub struct HttpProxyService {
route_manager: Arc<RouteManager>,
metrics: Arc<MetricsCollector>,
upstream_selector: UpstreamSelector,
/// Timeout for connecting to upstream backends.
connect_timeout: std::time::Duration,
}
impl HttpProxyService {
@@ -36,6 +49,21 @@ impl HttpProxyService {
route_manager,
metrics,
upstream_selector: UpstreamSelector::new(),
connect_timeout: DEFAULT_CONNECT_TIMEOUT,
}
}
/// Create with a custom connect timeout.
pub fn with_connect_timeout(
route_manager: Arc<RouteManager>,
metrics: Arc<MetricsCollector>,
connect_timeout: std::time::Duration,
) -> Self {
Self {
route_manager,
metrics,
upstream_selector: UpstreamSelector::new(),
connect_timeout,
}
}
@@ -45,41 +73,59 @@ impl HttpProxyService {
stream: TcpStream,
peer_addr: std::net::SocketAddr,
port: u16,
cancel: CancellationToken,
) {
self.handle_io(stream, peer_addr, port).await;
self.handle_io(stream, peer_addr, port, cancel).await;
}
/// Handle an incoming HTTP connection on any IO type (plain TCP or TLS-terminated).
///
/// Uses HTTP/1.1 with upgrade support. For clients that negotiate HTTP/2,
/// use `handle_io_auto` instead.
/// Uses HTTP/1.1 with upgrade support. Responds to graceful shutdown via the
/// cancel token — in-flight requests complete, but no new requests are accepted.
pub async fn handle_io<I>(
self: Arc<Self>,
stream: I,
peer_addr: std::net::SocketAddr,
port: u16,
cancel: CancellationToken,
)
where
I: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin + Send + 'static,
{
let io = TokioIo::new(stream);
let cancel_inner = cancel.clone();
let service = hyper::service::service_fn(move |req: Request<Incoming>| {
let svc = Arc::clone(&self);
let peer = peer_addr;
let cn = cancel_inner.clone();
async move {
svc.handle_request(req, peer, port).await
svc.handle_request(req, peer, port, cn).await
}
});
// Use http1::Builder with upgrades for WebSocket support
let conn = hyper::server::conn::http1::Builder::new()
let mut conn = hyper::server::conn::http1::Builder::new()
.keep_alive(true)
.serve_connection(io, service)
.with_upgrades();
if let Err(e) = conn.await {
debug!("HTTP connection error from {}: {}", peer_addr, e);
// Use select to support graceful shutdown via cancellation token
let conn_pin = std::pin::Pin::new(&mut conn);
tokio::select! {
result = conn_pin => {
if let Err(e) = result {
debug!("HTTP connection error from {}: {}", peer_addr, e);
}
}
_ = cancel.cancelled() => {
// Graceful shutdown: let in-flight request finish, stop accepting new ones
let conn_pin = std::pin::Pin::new(&mut conn);
conn_pin.graceful_shutdown();
if let Err(e) = conn.await {
debug!("HTTP connection error during shutdown from {}: {}", peer_addr, e);
}
}
}
}
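The accept loop that hands each connection its token is not part of this hunk; it is expected to look roughly like the following (function name and bind address are illustrative):

    use std::sync::Arc;
    use tokio::net::TcpListener;
    use tokio_util::sync::CancellationToken;

    async fn accept_loop(
        proxy: Arc<HttpProxyService>,
        port: u16,
        cancel: CancellationToken,
    ) -> std::io::Result<()> {
        let listener = TcpListener::bind(("0.0.0.0", port)).await?;
        loop {
            tokio::select! {
                // Stop accepting new connections once shutdown is signalled.
                _ = cancel.cancelled() => break,
                accepted = listener.accept() => {
                    let (stream, peer_addr) = accepted?;
                    let svc = Arc::clone(&proxy);
                    let conn_cancel = cancel.clone();
                    // Every live connection sees the same token, so handle_io
                    // drains in-flight requests instead of dropping them.
                    tokio::spawn(async move {
                        svc.handle_connection(stream, peer_addr, port, conn_cancel).await;
                    });
                }
            }
        }
        Ok(())
    }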
@@ -89,6 +135,7 @@ impl HttpProxyService {
req: Request<Incoming>,
peer_addr: std::net::SocketAddr,
port: u16,
cancel: CancellationToken,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
let host = req.headers()
.get("host")
@@ -184,7 +231,7 @@ impl HttpProxyService {
if is_websocket {
let result = self.handle_websocket_upgrade(
req, peer_addr, &upstream, route_match.route, route_id, &upstream_key,
req, peer_addr, &upstream, route_match.route, route_id, &upstream_key, cancel,
).await;
// Note: for WebSocket, connection_ended is called inside
// the spawned tunnel task when the connection closes.
@@ -223,15 +270,24 @@ impl HttpProxyService {
}
}
// Connect to upstream
let upstream_stream = match TcpStream::connect(format!("{}:{}", upstream.host, upstream.port)).await {
Ok(s) => s,
Err(e) => {
// Connect to upstream with timeout
let upstream_stream = match tokio::time::timeout(
self.connect_timeout,
TcpStream::connect(format!("{}:{}", upstream.host, upstream.port)),
).await {
Ok(Ok(s)) => s,
Ok(Err(e)) => {
error!("Failed to connect to upstream {}:{}: {}", upstream.host, upstream.port, e);
self.upstream_selector.connection_ended(&upstream_key);
self.metrics.connection_closed(route_id);
return Ok(error_response(StatusCode::BAD_GATEWAY, "Backend unavailable"));
}
Err(_) => {
error!("Upstream connect timeout for {}:{}", upstream.host, upstream.port);
self.upstream_selector.connection_ended(&upstream_key);
self.metrics.connection_closed(route_id);
return Ok(error_response(StatusCode::GATEWAY_TIMEOUT, "Backend connect timeout"));
}
};
upstream_stream.set_nodelay(true).ok();
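For context, tokio::time::timeout wraps the connect future's own Result, which is why the match above has three arms; the same pattern in isolation (helper name and error type are illustrative):

    use std::time::Duration;
    use tokio::net::TcpStream;

    async fn connect_upstream(addr: &str, timeout: Duration) -> Result<TcpStream, String> {
        match tokio::time::timeout(timeout, TcpStream::connect(addr)).await {
            // Connected before the deadline.
            Ok(Ok(stream)) => Ok(stream),
            // The connect itself failed (refused, unreachable, ...): maps to 502.
            Ok(Err(e)) => Err(format!("connect to {addr} failed: {e}")),
            // The deadline elapsed first: maps to 504.
            Err(_) => Err(format!("connect to {addr} timed out after {timeout:?}")),
        }
    }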
@@ -394,6 +450,7 @@ impl HttpProxyService {
route: &rustproxy_config::RouteConfig,
route_id: Option<&str>,
upstream_key: &str,
cancel: CancellationToken,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
use tokio::io::{AsyncReadExt, AsyncWriteExt};
@@ -417,16 +474,24 @@ impl HttpProxyService {
info!("WebSocket upgrade from {} -> {}:{}", peer_addr, upstream.host, upstream.port);
let mut upstream_stream = match TcpStream::connect(
format!("{}:{}", upstream.host, upstream.port)
// Connect to upstream with timeout
let mut upstream_stream = match tokio::time::timeout(
self.connect_timeout,
TcpStream::connect(format!("{}:{}", upstream.host, upstream.port)),
).await {
Ok(s) => s,
Err(e) => {
Ok(Ok(s)) => s,
Ok(Err(e)) => {
error!("WebSocket: failed to connect upstream {}:{}: {}", upstream.host, upstream.port, e);
self.upstream_selector.connection_ended(upstream_key);
self.metrics.connection_closed(route_id);
return Ok(error_response(StatusCode::BAD_GATEWAY, "Backend unavailable"));
}
Err(_) => {
error!("WebSocket: upstream connect timeout for {}:{}", upstream.host, upstream.port);
self.upstream_selector.connection_ended(upstream_key);
self.metrics.connection_closed(route_id);
return Ok(error_response(StatusCode::GATEWAY_TIMEOUT, "Backend connect timeout"));
}
};
upstream_stream.set_nodelay(true).ok();
@@ -591,6 +656,11 @@ impl HttpProxyService {
let (mut cr, mut cw) = tokio::io::split(client_io);
let (mut ur, mut uw) = tokio::io::split(upstream_stream);
// Shared activity tracker for the watchdog
let last_activity = Arc::new(AtomicU64::new(0));
let start = std::time::Instant::now();
let la1 = Arc::clone(&last_activity);
let c2u = tokio::spawn(async move {
let mut buf = vec![0u8; 65536];
let mut total = 0u64;
@@ -603,11 +673,13 @@ impl HttpProxyService {
break;
}
total += n as u64;
la1.store(start.elapsed().as_millis() as u64, Ordering::Relaxed);
}
let _ = uw.shutdown().await;
total
});
let la2 = Arc::clone(&last_activity);
let u2c = tokio::spawn(async move {
let mut buf = vec![0u8; 65536];
let mut total = 0u64;
@@ -620,13 +692,59 @@ impl HttpProxyService {
break;
}
total += n as u64;
la2.store(start.elapsed().as_millis() as u64, Ordering::Relaxed);
}
let _ = cw.shutdown().await;
total
});
// Watchdog: monitors inactivity, max lifetime, and cancellation
let la_watch = Arc::clone(&last_activity);
let c2u_handle = c2u.abort_handle();
let u2c_handle = u2c.abort_handle();
let inactivity_timeout = DEFAULT_WS_INACTIVITY_TIMEOUT;
let max_lifetime = DEFAULT_WS_MAX_LIFETIME;
let watchdog = tokio::spawn(async move {
let check_interval = std::time::Duration::from_secs(5);
let mut last_seen = 0u64;
loop {
tokio::select! {
_ = tokio::time::sleep(check_interval) => {}
_ = cancel.cancelled() => {
debug!("WebSocket tunnel cancelled by shutdown");
c2u_handle.abort();
u2c_handle.abort();
break;
}
}
// Check max lifetime
if start.elapsed() >= max_lifetime {
debug!("WebSocket tunnel exceeded max lifetime, closing");
c2u_handle.abort();
u2c_handle.abort();
break;
}
// Check inactivity
let current = la_watch.load(Ordering::Relaxed);
if current == last_seen {
let elapsed_since_activity = start.elapsed().as_millis() as u64 - current;
if elapsed_since_activity >= inactivity_timeout.as_millis() as u64 {
debug!("WebSocket tunnel inactive for {}ms, closing", elapsed_since_activity);
c2u_handle.abort();
u2c_handle.abort();
break;
}
}
last_seen = current;
}
});
let bytes_in = c2u.await.unwrap_or(0);
let bytes_out = u2c.await.unwrap_or(0);
watchdog.abort();
debug!("WebSocket tunnel closed: {} bytes in, {} bytes out", bytes_in, bytes_out);
@@ -812,6 +930,7 @@ impl Default for HttpProxyService {
route_manager: Arc::new(RouteManager::new(vec![])),
metrics: Arc::new(MetricsCollector::new()),
upstream_selector: UpstreamSelector::new(),
connect_timeout: DEFAULT_CONNECT_TIMEOUT,
}
}
}