fix(remoteingress-core): improve tunnel failure detection and reconnect handling

This commit is contained in:
2026-03-17 00:15:10 +00:00
parent 96e7ab00cf
commit 1afd0e5347
4 changed files with 64 additions and 10 deletions

View File

@@ -300,6 +300,13 @@ async fn handle_edge_connection(
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
// Disable Nagle's algorithm for low-latency control frames (PING/PONG, WINDOW_UPDATE)
stream.set_nodelay(true)?;
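// TCP keepalive lets the kernel detect silent network failures (NAT timeout,
// path change) independently of the 45s application-level liveness timeout.
// NOTE(review): only the idle time (30s) and, on Linux, the probe interval (10s)
// are configured here; the probe count is left at the OS default, so worst-case
// detection may take longer than 45s. Confirm whether `with_retries` should be
// set as well if keepalive is meant to fire before the liveness timeout.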
let ka = socket2::TcpKeepalive::new()
.with_time(Duration::from_secs(30));
#[cfg(target_os = "linux")]
let ka = ka.with_interval(Duration::from_secs(10));
let _ = socket2::SockRef::from(&stream).set_tcp_keepalive(&ka);
let tls_stream = acceptor.accept(stream).await?;
let (read_half, mut write_half) = tokio::io::split(tls_stream);
let mut buf_reader = BufReader::new(read_half);
@@ -383,18 +390,20 @@ async fn handle_edge_connection(
// Legacy alias for code that sends both control and data (will be migrated)
let frame_writer_tx = ctrl_tx.clone();
let writer_token = edge_token.clone();
let (writer_dead_tx, mut writer_dead_rx) = tokio::sync::oneshot::channel::<()>();
let writer_handle = tokio::spawn(async move {
// BufWriter coalesces small writes (frame headers, control frames) into fewer
// TLS records and syscalls. Flushed after each frame to avoid holding data.
let mut writer = tokio::io::BufWriter::with_capacity(65536, write_half);
let mut write_error = false;
loop {
tokio::select! {
biased; // control frames always take priority over data
ctrl = ctrl_rx.recv() => {
match ctrl {
Some(frame_data) => {
if writer.write_all(&frame_data).await.is_err() { break; }
if writer.flush().await.is_err() { break; }
if writer.write_all(&frame_data).await.is_err() { write_error = true; break; }
if writer.flush().await.is_err() { write_error = true; break; }
}
None => break,
}
@@ -402,8 +411,8 @@ async fn handle_edge_connection(
data = data_rx.recv() => {
match data {
Some(frame_data) => {
if writer.write_all(&frame_data).await.is_err() { break; }
if writer.flush().await.is_err() { break; }
if writer.write_all(&frame_data).await.is_err() { write_error = true; break; }
if writer.flush().await.is_err() { write_error = true; break; }
}
None => break,
}
@@ -411,6 +420,10 @@ async fn handle_edge_connection(
_ = writer_token.cancelled() => break,
}
}
if write_error {
log::error!("Tunnel writer to edge failed, signalling reader for fast cleanup");
let _ = writer_dead_tx.send(());
}
});
// Spawn task to forward config updates as FRAME_CONFIG frames
@@ -754,6 +767,10 @@ async fn handle_edge_connection(
edge_id, liveness_timeout_dur.as_secs());
break;
}
_ = &mut writer_dead_rx => {
log::error!("Tunnel writer to edge {} died, disconnecting immediately", edge_id);
break;
}
_ = edge_token.cancelled() => {
log::info!("Edge {} cancelled by hub", edge_id);
break;