feat: add TCP keepalive options and connection pooling for improved performance

- Added `socket2` dependency for socket options.
- Introduced `keep_alive`, `keep_alive_initial_delay_ms`, and `max_connections` fields in `ConnectionConfig`.
- Implemented TCP keepalive settings in `TcpListenerManager` for both client and backend connections.
- Created a new `ConnectionPool` for managing idle HTTP/1.1 and HTTP/2 connections to reduce overhead.
- Enhanced TLS configuration to support ALPN for HTTP/2.
- Added performance tests for connection pooling, stability, and concurrent connections.
This commit is contained in:
2026-02-20 18:16:09 +00:00
parent 0f6752b9a7
commit 9521f2e044
14 changed files with 1058 additions and 101 deletions

View File

@@ -23,3 +23,4 @@ rustls-pemfile = { workspace = true }
tokio-util = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
socket2 = { workspace = true }

View File

@@ -11,6 +11,7 @@ pub mod tls_handler;
pub mod connection_record;
pub mod connection_tracker;
pub mod socket_relay;
pub mod socket_opts;
pub use tcp_listener::*;
pub use sni_parser::*;
@@ -20,3 +21,4 @@ pub use tls_handler::*;
pub use connection_record::*;
pub use connection_tracker::*;
pub use socket_relay::*;
pub use socket_opts::*;

View File

@@ -0,0 +1,19 @@
//! Socket-level options for TCP streams (keepalive, etc.).
//!
//! Uses `socket2::SockRef::from()` to borrow the raw fd without ownership transfer.
use std::io;
use std::time::Duration;
use tokio::net::TcpStream;
/// Apply TCP keepalive to a connected socket.
///
/// Enables SO_KEEPALIVE and sets the initial probe delay.
/// On Linux, also sets the interval between probes to the same value.
pub fn apply_keepalive(stream: &TcpStream, delay: Duration) -> io::Result<()> {
    // Build the keepalive parameters first, then apply them in a single call.
    let params = {
        let base = socket2::TcpKeepalive::new().with_time(delay);
        // Linux additionally exposes TCP_KEEPINTVL; reuse the same delay so the
        // probing cadence matches the initial wait before the first probe.
        #[cfg(target_os = "linux")]
        let base = base.with_interval(delay);
        base
    };
    // SockRef borrows the raw fd without taking ownership, so `stream` stays usable.
    socket2::SockRef::from(stream).set_tcp_keepalive(&params)
}

View File

@@ -15,6 +15,7 @@ use crate::sni_parser;
use crate::forwarder;
use crate::tls_handler;
use crate::connection_tracker::ConnectionTracker;
use crate::socket_opts;
/// RAII guard that decrements the active connection metric on drop.
/// Ensures connection_closed is called on ALL exit paths — normal, error, or panic.
@@ -87,6 +88,12 @@ pub struct ConnectionConfig {
/// Trusted IPs that may send PROXY protocol headers.
/// When non-empty, only connections from these IPs will have PROXY headers parsed.
pub proxy_ips: Vec<std::net::IpAddr>,
/// Enable TCP keepalive on sockets (default: true)
pub keep_alive: bool,
/// Initial delay before first keepalive probe (ms, default: 60000)
pub keep_alive_initial_delay_ms: u64,
/// Global maximum simultaneous connections (default: 100000)
pub max_connections: u64,
}
impl Default for ConnectionConfig {
@@ -105,6 +112,9 @@ impl Default for ConnectionConfig {
accept_proxy_protocol: false,
send_proxy_protocol: false,
proxy_ips: Vec::new(),
keep_alive: true,
keep_alive_initial_delay_ms: 60_000,
max_connections: 100_000,
}
}
}
@@ -131,21 +141,26 @@ pub struct TcpListenerManager {
cancel_token: CancellationToken,
/// Path to Unix domain socket for relaying socket-handler connections to TypeScript.
socket_handler_relay: Arc<std::sync::RwLock<Option<String>>>,
/// Global connection semaphore — limits total simultaneous connections.
conn_semaphore: Arc<tokio::sync::Semaphore>,
}
impl TcpListenerManager {
pub fn new(route_manager: Arc<RouteManager>) -> Self {
let metrics = Arc::new(MetricsCollector::new());
let conn_config = ConnectionConfig::default();
let http_proxy = Arc::new(HttpProxyService::with_connect_timeout(
let mut http_proxy_svc = HttpProxyService::with_connect_timeout(
Arc::clone(&route_manager),
Arc::clone(&metrics),
std::time::Duration::from_millis(conn_config.connection_timeout_ms),
));
);
http_proxy_svc.set_backend_tls_config(tls_handler::shared_backend_tls_config());
let http_proxy = Arc::new(http_proxy_svc);
let conn_tracker = Arc::new(ConnectionTracker::new(
conn_config.max_connections_per_ip,
conn_config.connection_rate_limit_per_minute,
));
let max_conns = conn_config.max_connections as usize;
Self {
listeners: HashMap::new(),
route_manager: Arc::new(ArcSwap::from(route_manager)),
@@ -157,21 +172,25 @@ impl TcpListenerManager {
conn_tracker,
cancel_token: CancellationToken::new(),
socket_handler_relay: Arc::new(std::sync::RwLock::new(None)),
conn_semaphore: Arc::new(tokio::sync::Semaphore::new(max_conns)),
}
}
/// Create with a metrics collector.
pub fn with_metrics(route_manager: Arc<RouteManager>, metrics: Arc<MetricsCollector>) -> Self {
let conn_config = ConnectionConfig::default();
let http_proxy = Arc::new(HttpProxyService::with_connect_timeout(
let mut http_proxy_svc = HttpProxyService::with_connect_timeout(
Arc::clone(&route_manager),
Arc::clone(&metrics),
std::time::Duration::from_millis(conn_config.connection_timeout_ms),
));
);
http_proxy_svc.set_backend_tls_config(tls_handler::shared_backend_tls_config());
let http_proxy = Arc::new(http_proxy_svc);
let conn_tracker = Arc::new(ConnectionTracker::new(
conn_config.max_connections_per_ip,
conn_config.connection_rate_limit_per_minute,
));
let max_conns = conn_config.max_connections as usize;
Self {
listeners: HashMap::new(),
route_manager: Arc::new(ArcSwap::from(route_manager)),
@@ -183,6 +202,7 @@ impl TcpListenerManager {
conn_tracker,
cancel_token: CancellationToken::new(),
socket_handler_relay: Arc::new(std::sync::RwLock::new(None)),
conn_semaphore: Arc::new(tokio::sync::Semaphore::new(max_conns)),
}
}
@@ -192,6 +212,7 @@ impl TcpListenerManager {
config.max_connections_per_ip,
config.connection_rate_limit_per_minute,
));
self.conn_semaphore = Arc::new(tokio::sync::Semaphore::new(config.max_connections as usize));
self.conn_config = Arc::new(config);
}
@@ -247,11 +268,13 @@ impl TcpListenerManager {
let conn_tracker = Arc::clone(&self.conn_tracker);
let cancel = self.cancel_token.clone();
let relay = Arc::clone(&self.socket_handler_relay);
let semaphore = Arc::clone(&self.conn_semaphore);
let handle = tokio::spawn(async move {
Self::accept_loop(
listener, port, route_manager_swap, metrics, tls_configs,
shared_tls_acceptor, http_proxy, conn_config, conn_tracker, cancel, relay,
semaphore,
).await;
});
@@ -346,6 +369,7 @@ impl TcpListenerManager {
conn_tracker: Arc<ConnectionTracker>,
cancel: CancellationToken,
socket_handler_relay: Arc<std::sync::RwLock<Option<String>>>,
conn_semaphore: Arc<tokio::sync::Semaphore>,
) {
loop {
tokio::select! {
@@ -358,10 +382,31 @@ impl TcpListenerManager {
Ok((stream, peer_addr)) => {
let ip = peer_addr.ip();
// Global connection limit — acquire semaphore permit with timeout
let permit = match tokio::time::timeout(
std::time::Duration::from_secs(5),
conn_semaphore.clone().acquire_owned(),
).await {
Ok(Ok(permit)) => permit,
Ok(Err(_)) => {
// Semaphore closed — shouldn't happen, but be safe
debug!("Connection semaphore closed, dropping connection from {}", peer_addr);
drop(stream);
continue;
}
Err(_) => {
// Timeout — global limit reached
debug!("Global connection limit reached, dropping connection from {}", peer_addr);
drop(stream);
continue;
}
};
// Check per-IP limits and rate limiting
if !conn_tracker.try_accept(&ip) {
debug!("Rejected connection from {} (per-IP limit or rate limit)", peer_addr);
drop(stream);
drop(permit);
continue;
}
@@ -382,6 +427,8 @@ impl TcpListenerManager {
debug!("Accepted connection from {} on port {}", peer_addr, port);
tokio::spawn(async move {
// Move permit into the task — auto-releases on drop
let _permit = permit;
let result = Self::handle_connection(
stream, port, peer_addr, rm, m, tc, sa, hp, cc, cn, sr,
).await;
@@ -418,6 +465,12 @@ impl TcpListenerManager {
use tokio::io::AsyncReadExt;
stream.set_nodelay(true)?;
if conn_config.keep_alive {
let delay = std::time::Duration::from_millis(conn_config.keep_alive_initial_delay_ms);
if let Err(e) = socket_opts::apply_keepalive(&stream, delay) {
debug!("Failed to set keepalive on client socket: {}", e);
}
}
// --- PROXY protocol: must happen BEFORE ip_str and fast path ---
// Only parse PROXY headers from trusted proxy IPs (security).
@@ -543,6 +596,12 @@ impl TcpListenerManager {
Err(_) => return Err("Backend connection timeout".into()),
};
backend.set_nodelay(true)?;
if conn_config.keep_alive {
let delay = std::time::Duration::from_millis(conn_config.keep_alive_initial_delay_ms);
if let Err(e) = socket_opts::apply_keepalive(&backend, delay) {
debug!("Failed to set keepalive on backend socket: {}", e);
}
}
// Send PROXY protocol header if configured
let should_send_proxy = conn_config.send_proxy_protocol
@@ -778,6 +837,12 @@ impl TcpListenerManager {
Err(_) => return Err("Backend connection timeout".into()),
};
backend.set_nodelay(true)?;
if conn_config.keep_alive {
let delay = std::time::Duration::from_millis(conn_config.keep_alive_initial_delay_ms);
if let Err(e) = socket_opts::apply_keepalive(&backend, delay) {
debug!("Failed to set keepalive on backend socket: {}", e);
}
}
// Send PROXY protocol header if configured
if let Some(ref header) = proxy_header {
@@ -857,6 +922,12 @@ impl TcpListenerManager {
Err(_) => return Err("Backend connection timeout".into()),
};
backend.set_nodelay(true)?;
if conn_config.keep_alive {
let delay = std::time::Duration::from_millis(conn_config.keep_alive_initial_delay_ms);
if let Err(e) = socket_opts::apply_keepalive(&backend, delay) {
debug!("Failed to set keepalive on backend socket: {}", e);
}
}
let (tls_read, tls_write) = tokio::io::split(buf_stream);
let (backend_read, backend_write) = tokio::io::split(backend);
@@ -944,6 +1015,12 @@ impl TcpListenerManager {
Err(_) => return Err("Backend connection timeout".into()),
};
backend.set_nodelay(true)?;
if conn_config.keep_alive {
let delay = std::time::Duration::from_millis(conn_config.keep_alive_initial_delay_ms);
if let Err(e) = socket_opts::apply_keepalive(&backend, delay) {
debug!("Failed to set keepalive on backend socket: {}", e);
}
}
// Send PROXY protocol header if configured
if let Some(ref header) = proxy_header {

View File

@@ -1,6 +1,6 @@
use std::collections::HashMap;
use std::io::BufReader;
use std::sync::Arc;
use std::sync::{Arc, OnceLock};
use rustls::pki_types::{CertificateDer, PrivateKeyDer};
use rustls::server::ResolvesServerCert;
@@ -84,13 +84,16 @@ pub fn build_shared_tls_acceptor(resolver: CertResolver) -> Result<TlsAcceptor,
.with_no_client_auth()
.with_cert_resolver(Arc::new(resolver));
// ALPN: advertise h2 and http/1.1 for client-facing HTTP/2 support
config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
// Shared session cache — enables session ID resumption across connections
config.session_storage = rustls::server::ServerSessionMemoryCache::new(4096);
// Session ticket resumption (12-hour lifetime, Chacha20Poly1305 encrypted)
config.ticketer = rustls::crypto::ring::Ticketer::new()
.map_err(|e| format!("Ticketer: {}", e))?;
info!("Built shared TLS config with session cache (4096) and ticket support");
info!("Built shared TLS config with session cache (4096), ticket support, and ALPN h2+http/1.1");
Ok(TlsAcceptor::from(Arc::new(config)))
}
@@ -122,6 +125,9 @@ pub fn build_tls_acceptor_with_config(
.with_single_cert(certs, key)?
};
// ALPN: advertise h2 and http/1.1 for client-facing HTTP/2 support
config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
// Apply session timeout if configured
if let Some(route_tls) = tls_config {
if let Some(timeout_secs) = route_tls.session_timeout {
@@ -179,21 +185,40 @@ pub async fn accept_tls(
Ok(tls_stream)
}
/// Get or create a shared backend TLS `ClientConfig`.
///
/// Uses `OnceLock` to ensure only one config is created across the entire process.
/// The built-in rustls `Resumption` (session tickets + session IDs) is enabled
/// by default, so all outbound backend connections share the same session cache.
static SHARED_CLIENT_CONFIG: OnceLock<Arc<rustls::ClientConfig>> = OnceLock::new();

pub fn shared_backend_tls_config() -> Arc<rustls::ClientConfig> {
    let shared = SHARED_CLIENT_CONFIG.get_or_init(|| {
        ensure_crypto_provider();
        // NOTE(review): InsecureVerifier disables backend certificate validation;
        // presumably backends are trusted/internal — confirm this is intentional.
        let cfg = rustls::ClientConfig::builder()
            .dangerous()
            .with_custom_certificate_verifier(Arc::new(InsecureVerifier))
            .with_no_client_auth();
        info!("Built shared backend TLS client config with session resumption");
        Arc::new(cfg)
    });
    Arc::clone(shared)
}
/// Connect to a backend with TLS (for terminate-and-reencrypt mode).
/// Uses the shared backend TLS config for session resumption.
pub async fn connect_tls(
host: &str,
port: u16,
) -> Result<tokio_rustls::client::TlsStream<TcpStream>, Box<dyn std::error::Error + Send + Sync>> {
ensure_crypto_provider();
let config = rustls::ClientConfig::builder()
.dangerous()
.with_custom_certificate_verifier(Arc::new(InsecureVerifier))
.with_no_client_auth();
let connector = TlsConnector::from(Arc::new(config));
let config = shared_backend_tls_config();
let connector = TlsConnector::from(config);
let stream = TcpStream::connect(format!("{}:{}", host, port)).await?;
stream.set_nodelay(true)?;
// Apply keepalive with 60s default (tls_handler doesn't have ConnectionConfig access)
if let Err(e) = crate::socket_opts::apply_keepalive(&stream, std::time::Duration::from_secs(60)) {
debug!("Failed to set keepalive on backend TLS socket: {}", e);
}
let server_name = rustls::pki_types::ServerName::try_from(host.to_string())?;
let tls_stream = connector.connect(server_name, stream).await?;