feat(metrics): add per-backend connection, error, protocol, and pool metrics with stale backend pruning

This commit is contained in:
2026-03-12 15:16:11 +00:00
parent 0380a957d0
commit 0d4399d7f1
7 changed files with 561 additions and 36 deletions

View File

@@ -618,6 +618,8 @@ impl HttpProxyService {
// H2 pool checkout (H2 senders are Clone and multiplexed)
if use_h2 {
if let Some(sender) = self.connection_pool.checkout_h2(&pool_key) {
self.metrics.backend_pool_hit(&upstream_key);
self.metrics.set_backend_protocol(&upstream_key, "h2");
let result = self.forward_h2_pooled(
sender, parts, body, upstream_headers, &upstream_path,
route_match.route, route_id, &ip_str, &pool_key,
@@ -629,6 +631,8 @@ impl HttpProxyService {
}
// --- Fresh connection path ---
self.metrics.backend_pool_miss(&upstream_key);
// Choose TLS config: use ALPN config for auto-detect probe, plain config otherwise
let tls_config = if needs_alpn_probe {
&self.backend_tls_config_alpn
@@ -637,6 +641,7 @@ impl HttpProxyService {
};
// Establish backend connection
let connect_start = std::time::Instant::now();
let (backend, detected_h2) = if upstream.use_tls {
match tokio::time::timeout(
self.connect_timeout,
@@ -661,25 +666,39 @@ impl HttpProxyService {
};
self.protocol_cache.insert(cache_key, detected);
debug!(
"Auto-detected {} for backend {}:{}",
if is_h2 { "HTTP/2" } else { "HTTP/1.1" },
upstream.host, upstream.port
info!(
backend = %upstream_key,
protocol = if is_h2 { "h2" } else { "h1" },
connect_time_ms = %connect_start.elapsed().as_millis(),
"Backend protocol detected via ALPN"
);
is_h2
} else {
use_h2
};
self.metrics.backend_connection_opened(&upstream_key, connect_start.elapsed());
self.metrics.set_backend_protocol(&upstream_key, if final_h2 { "h2" } else { "h1" });
(BackendStream::Tls(tls), final_h2)
}
Ok(Err(e)) => {
error!("Failed TLS connect to upstream {}:{}: {}", upstream.host, upstream.port, e);
error!(
backend = %upstream_key,
connect_time_ms = %connect_start.elapsed().as_millis(),
error = %e,
"Backend TLS connect failed"
);
self.metrics.backend_connect_error(&upstream_key);
self.upstream_selector.connection_ended(&upstream_key);
return Ok(error_response(StatusCode::BAD_GATEWAY, "Backend TLS unavailable"));
}
Err(_) => {
error!("Upstream TLS connect timeout for {}:{}", upstream.host, upstream.port);
error!(
backend = %upstream_key,
connect_time_ms = %connect_start.elapsed().as_millis(),
"Backend TLS connect timeout"
);
self.metrics.backend_connect_error(&upstream_key);
self.upstream_selector.connection_ended(&upstream_key);
return Ok(error_response(StatusCode::GATEWAY_TIMEOUT, "Backend TLS connect timeout"));
}
@@ -694,15 +713,28 @@ impl HttpProxyService {
let _ = socket2::SockRef::from(&s).set_tcp_keepalive(
&socket2::TcpKeepalive::new().with_time(std::time::Duration::from_secs(60))
);
self.metrics.backend_connection_opened(&upstream_key, connect_start.elapsed());
self.metrics.set_backend_protocol(&upstream_key, if use_h2 { "h2" } else { "h1" });
(BackendStream::Plain(s), use_h2)
}
Ok(Err(e)) => {
error!("Failed to connect to upstream {}:{}: {}", upstream.host, upstream.port, e);
error!(
backend = %upstream_key,
connect_time_ms = %connect_start.elapsed().as_millis(),
error = %e,
"Backend TCP connect failed"
);
self.metrics.backend_connect_error(&upstream_key);
self.upstream_selector.connection_ended(&upstream_key);
return Ok(error_response(StatusCode::BAD_GATEWAY, "Backend unavailable"));
}
Err(_) => {
error!("Upstream connect timeout for {}:{}", upstream.host, upstream.port);
error!(
backend = %upstream_key,
connect_time_ms = %connect_start.elapsed().as_millis(),
"Backend TCP connect timeout"
);
self.metrics.backend_connect_error(&upstream_key);
self.upstream_selector.connection_ended(&upstream_key);
return Ok(error_response(StatusCode::GATEWAY_TIMEOUT, "Backend connect timeout"));
}
@@ -740,6 +772,7 @@ impl HttpProxyService {
).await
};
self.upstream_selector.connection_ended(&upstream_key);
self.metrics.backend_connection_closed(&upstream_key);
result
}
@@ -758,8 +791,11 @@ impl HttpProxyService {
source_ip: &str,
pool_key: &crate::connection_pool::PoolKey,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
let backend_key = format!("{}:{}", pool_key.host, pool_key.port);
// Try pooled H1 connection first — avoids TCP+TLS handshake
if let Some(pooled_sender) = self.connection_pool.checkout_h1(pool_key) {
self.metrics.backend_pool_hit(&backend_key);
return self.forward_h1_with_sender(
pooled_sender, parts, body, upstream_headers, upstream_path,
route, route_id, source_ip, pool_key,
@@ -773,7 +809,8 @@ impl HttpProxyService {
) = match hyper::client::conn::http1::handshake(io).await {
Ok(h) => h,
Err(e) => {
error!("Upstream handshake failed: {}", e);
error!(backend = %backend_key, error = %e, "Backend H1 handshake failed");
self.metrics.backend_handshake_error(&backend_key);
return Ok(error_response(StatusCode::BAD_GATEWAY, "Backend handshake failed"));
}
};
@@ -825,7 +862,9 @@ impl HttpProxyService {
let upstream_response = match sender.send_request(upstream_req).await {
Ok(resp) => resp,
Err(e) => {
error!("Upstream request failed: {}", e);
let bk = format!("{}:{}", pool_key.host, pool_key.port);
error!(backend = %bk, error = %e, "Backend H1 request failed");
self.metrics.backend_request_error(&bk);
return Ok(error_response(StatusCode::BAD_GATEWAY, "Backend request failed"));
}
};
@@ -851,6 +890,7 @@ impl HttpProxyService {
source_ip: &str,
pool_key: &crate::connection_pool::PoolKey,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
let backend_key = format!("{}:{}", pool_key.host, pool_key.port);
let exec = hyper_util::rt::TokioExecutor::new();
// Explicitly type the handshake with BoxBody for uniform pool type
let (sender, conn): (
@@ -859,7 +899,8 @@ impl HttpProxyService {
) = match hyper::client::conn::http2::handshake(exec, io).await {
Ok(h) => h,
Err(e) => {
error!("HTTP/2 upstream handshake failed: {}", e);
error!(backend = %backend_key, error = %e, "Backend H2 handshake failed");
self.metrics.backend_handshake_error(&backend_key);
return Ok(error_response(StatusCode::BAD_GATEWAY, "Backend H2 handshake failed"));
}
};
@@ -930,7 +971,10 @@ impl HttpProxyService {
route_id: Option<&str>,
source_ip: &str,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
let backend_key = format!("{}:{}", pool_key.host, pool_key.port);
// Establish fresh backend connection
let retry_connect_start = std::time::Instant::now();
let backend = if pool_key.use_tls {
match tokio::time::timeout(
self.connect_timeout,
@@ -938,11 +982,13 @@ impl HttpProxyService {
).await {
Ok(Ok(tls)) => BackendStream::Tls(tls),
Ok(Err(e)) => {
error!("H2 retry: TLS connect failed for {}:{}: {}", pool_key.host, pool_key.port, e);
error!(backend = %backend_key, error = %e, "H2 retry: TLS connect failed");
self.metrics.backend_connect_error(&backend_key);
return Ok(error_response(StatusCode::BAD_GATEWAY, "Backend unavailable on H2 retry"));
}
Err(_) => {
error!("H2 retry: TLS connect timeout for {}:{}", pool_key.host, pool_key.port);
error!(backend = %backend_key, "H2 retry: TLS connect timeout");
self.metrics.backend_connect_error(&backend_key);
return Ok(error_response(StatusCode::GATEWAY_TIMEOUT, "Backend timeout on H2 retry"));
}
}
@@ -956,15 +1002,18 @@ impl HttpProxyService {
BackendStream::Plain(s)
}
Ok(Err(e)) => {
error!("H2 retry: connect failed for {}:{}: {}", pool_key.host, pool_key.port, e);
error!(backend = %backend_key, error = %e, "H2 retry: TCP connect failed");
self.metrics.backend_connect_error(&backend_key);
return Ok(error_response(StatusCode::BAD_GATEWAY, "Backend unavailable on H2 retry"));
}
Err(_) => {
error!("H2 retry: connect timeout for {}:{}", pool_key.host, pool_key.port);
error!(backend = %backend_key, "H2 retry: TCP connect timeout");
self.metrics.backend_connect_error(&backend_key);
return Ok(error_response(StatusCode::GATEWAY_TIMEOUT, "Backend timeout on H2 retry"));
}
}
};
self.metrics.backend_connection_opened(&backend_key, retry_connect_start.elapsed());
let io = TokioIo::new(backend);
let exec = hyper_util::rt::TokioExecutor::new();
@@ -974,7 +1023,8 @@ impl HttpProxyService {
) = match hyper::client::conn::http2::handshake(exec, io).await {
Ok(h) => h,
Err(e) => {
error!("H2 retry: handshake failed for {}:{}: {}", pool_key.host, pool_key.port, e);
error!(backend = %backend_key, error = %e, "H2 retry: handshake failed");
self.metrics.backend_handshake_error(&backend_key);
return Ok(error_response(StatusCode::BAD_GATEWAY, "Backend H2 retry handshake failed"));
}
};
@@ -1004,11 +1054,17 @@ impl HttpProxyService {
match sender.send_request(upstream_req).await {
Ok(resp) => {
self.build_streaming_response(resp, route, route_id, source_ip).await
let result = self.build_streaming_response(resp, route, route_id, source_ip).await;
// Close the fresh backend connection (pairs with the backend_connection_opened call made after the retry connect above)
self.metrics.backend_connection_closed(&backend_key);
result
}
Err(e) => {
error!("H2 retry: request failed for {}:{}: {}", pool_key.host, pool_key.port, e);
error!(backend = %backend_key, error = %e, "H2 retry: request failed");
self.metrics.backend_request_error(&backend_key);
self.connection_pool.remove_h2(pool_key);
// Close the fresh backend connection (pairs with the backend_connection_opened call made after the retry connect above)
self.metrics.backend_connection_closed(&backend_key);
Ok(error_response(StatusCode::BAD_GATEWAY, "Backend H2 request failed on retry"))
}
}
@@ -1086,10 +1142,13 @@ impl HttpProxyService {
Err(e) => {
// H2 request failed — backend advertises h2 via ALPN but doesn't
// actually speak it. Update cache so future requests use H1.
let bk = format!("{}:{}", upstream.host, upstream.port);
warn!(
"Auto-detect: H2 request failed for {}:{}, falling back to H1: {}",
upstream.host, upstream.port, e
backend = %bk,
error = %e,
"Auto-detect: H2 request failed, falling back to H1"
);
self.metrics.backend_h2_failure(&bk);
let cache_key = crate::protocol_cache::ProtocolCacheKey {
host: upstream.host.clone(),
port: upstream.port,
@@ -1108,10 +1167,13 @@ impl HttpProxyService {
h2: false,
};
let fallback_io = TokioIo::new(fallback_backend);
self.forward_h1_empty_body(
let result = self.forward_h1_empty_body(
fallback_io, method, headers, upstream_path,
route, route_id, source_ip, &h1_pool_key,
).await
).await;
// Close the reconnected backend connection (opened in reconnect_backend)
self.metrics.backend_connection_closed(&bk);
result
}
None => {
Ok(error_response(StatusCode::BAD_GATEWAY, "Backend unavailable after H2 fallback"))
@@ -1126,10 +1188,14 @@ impl HttpProxyService {
Err(e) => {
// H2 handshake truly failed — fall back to H1
// Body is NOT consumed yet, so we can retry the full request.
let bk = format!("{}:{}", upstream.host, upstream.port);
warn!(
"H2 handshake failed for {}:{}, falling back to H1: {}",
upstream.host, upstream.port, e
backend = %bk,
error = %e,
"H2 handshake failed, falling back to H1"
);
self.metrics.backend_h2_failure(&bk);
self.metrics.backend_handshake_error(&bk);
// Update cache to H1 so subsequent requests skip H2
let cache_key = crate::protocol_cache::ProtocolCacheKey {
@@ -1149,10 +1215,13 @@ impl HttpProxyService {
h2: false,
};
let fallback_io = TokioIo::new(fallback_backend);
self.forward_h1(
let result = self.forward_h1(
fallback_io, parts, body, upstream_headers, upstream_path,
upstream, route, route_id, source_ip, &h1_pool_key,
).await
).await;
// Close the reconnected backend connection (opened in reconnect_backend)
self.metrics.backend_connection_closed(&bk);
result
}
None => {
Ok(error_response(StatusCode::BAD_GATEWAY, "Backend unavailable after H2 fallback"))
@@ -1175,13 +1244,15 @@ impl HttpProxyService {
source_ip: &str,
pool_key: &crate::connection_pool::PoolKey,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, hyper::Error> {
let backend_key = format!("{}:{}", pool_key.host, pool_key.port);
let (mut sender, conn): (
hyper::client::conn::http1::SendRequest<BoxBody<Bytes, hyper::Error>>,
hyper::client::conn::http1::Connection<TokioIo<BackendStream>, BoxBody<Bytes, hyper::Error>>,
) = match hyper::client::conn::http1::handshake(io).await {
Ok(h) => h,
Err(e) => {
error!("H1 fallback: handshake failed: {}", e);
error!(backend = %backend_key, error = %e, "H1 fallback: handshake failed");
self.metrics.backend_handshake_error(&backend_key);
return Ok(error_response(StatusCode::BAD_GATEWAY, "Backend H1 fallback handshake failed"));
}
};
@@ -1209,7 +1280,8 @@ impl HttpProxyService {
let upstream_response = match sender.send_request(upstream_req).await {
Ok(resp) => resp,
Err(e) => {
error!("H1 fallback: request failed: {}", e);
error!(backend = %backend_key, error = %e, "H1 fallback: request failed");
self.metrics.backend_request_error(&backend_key);
return Ok(error_response(StatusCode::BAD_GATEWAY, "Backend H1 fallback request failed"));
}
};
@@ -1225,18 +1297,25 @@ impl HttpProxyService {
&self,
upstream: &crate::upstream_selector::UpstreamSelection,
) -> Option<BackendStream> {
let backend_key = format!("{}:{}", upstream.host, upstream.port);
let reconnect_start = std::time::Instant::now();
if upstream.use_tls {
match tokio::time::timeout(
self.connect_timeout,
connect_tls_backend(&self.backend_tls_config, &upstream.host, upstream.port),
).await {
Ok(Ok(tls)) => Some(BackendStream::Tls(tls)),
Ok(Ok(tls)) => {
self.metrics.backend_connection_opened(&backend_key, reconnect_start.elapsed());
Some(BackendStream::Tls(tls))
}
Ok(Err(e)) => {
error!("H1 fallback: TLS reconnect failed for {}:{}: {}", upstream.host, upstream.port, e);
error!(backend = %backend_key, error = %e, "H1 fallback: TLS reconnect failed");
self.metrics.backend_connect_error(&backend_key);
None
}
Err(_) => {
error!("H1 fallback: TLS reconnect timeout for {}:{}", upstream.host, upstream.port);
error!(backend = %backend_key, "H1 fallback: TLS reconnect timeout");
self.metrics.backend_connect_error(&backend_key);
None
}
}
@@ -1250,14 +1329,17 @@ impl HttpProxyService {
let _ = socket2::SockRef::from(&s).set_tcp_keepalive(
&socket2::TcpKeepalive::new().with_time(std::time::Duration::from_secs(60))
);
self.metrics.backend_connection_opened(&backend_key, reconnect_start.elapsed());
Some(BackendStream::Plain(s))
}
Ok(Err(e)) => {
error!("H1 fallback: reconnect failed for {}:{}: {}", upstream.host, upstream.port, e);
error!(backend = %backend_key, error = %e, "H1 fallback: TCP reconnect failed");
self.metrics.backend_connect_error(&backend_key);
None
}
Err(_) => {
error!("H1 fallback: reconnect timeout for {}:{}", upstream.host, upstream.port);
error!(backend = %backend_key, "H1 fallback: TCP reconnect timeout");
self.metrics.backend_connect_error(&backend_key);
None
}
}
@@ -1300,10 +1382,14 @@ impl HttpProxyService {
let upstream_response = match sender.send_request(upstream_req).await {
Ok(resp) => resp,
Err(e) => {
error!("HTTP/2 upstream request failed: {}", e);
// Evict the dead sender so subsequent requests get fresh connections
if let Some(key) = pool_key {
let bk = format!("{}:{}", key.host, key.port);
error!(backend = %bk, error = %e, "Backend H2 request failed");
self.metrics.backend_request_error(&bk);
self.connection_pool.remove_h2(key);
} else {
error!(error = %e, "Backend H2 request failed");
}
return Ok(error_response(StatusCode::BAD_GATEWAY, "Backend H2 request failed"));
}