feat(remoteingress): add heartbeat PING/PONG and liveness timeouts; implement fast-reconnect/backoff reset and JS crash-recovery auto-restart

This commit is contained in:
2026-03-03 11:47:50 +00:00
parent d6a07c28a0
commit 45a2811f3e
7 changed files with 241 additions and 5 deletions

View File

@@ -1,10 +1,12 @@
use std::collections::HashMap;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::Arc;
use std::time::Duration;
use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader};
use tokio::net::{TcpListener, TcpStream};
use tokio::sync::{mpsc, Mutex, RwLock};
use tokio::task::JoinHandle;
use tokio::time::{Instant, sleep_until};
use tokio_rustls::TlsConnector;
use tokio_util::sync::CancellationToken;
use serde::{Deserialize, Serialize};
@@ -202,6 +204,13 @@ async fn edge_main_loop(
// Cancel connection token to kill all orphaned tasks from this cycle
connection_token.cancel();
// Reset backoff after a successful connection for fast reconnect
let was_connected = *connected.read().await;
if was_connected {
backoff_ms = 1000;
log::info!("Was connected; resetting backoff to {}ms for fast reconnect", backoff_ms);
}
*connected.write().await = false;
let _ = event_tx.try_send(EdgeEvent::TunnelDisconnected);
active_streams.store(0, Ordering::Relaxed);
@@ -214,7 +223,7 @@ async fn edge_main_loop(
EdgeLoopResult::Reconnect => {
log::info!("Reconnecting in {}ms...", backoff_ms);
tokio::select! {
_ = tokio::time::sleep(std::time::Duration::from_millis(backoff_ms)) => {}
_ = tokio::time::sleep(Duration::from_millis(backoff_ms)) => {}
_ = cancel_token.cancelled() => break,
_ = shutdown_rx.recv() => break,
}
@@ -336,7 +345,7 @@ async fn connect_to_hub_and_run(
_ = stun_token.cancelled() => break,
}
tokio::select! {
_ = tokio::time::sleep(std::time::Duration::from_secs(stun_interval)) => {}
_ = tokio::time::sleep(Duration::from_secs(stun_interval)) => {}
_ = stun_token.cancelled() => break,
}
}
@@ -380,6 +389,11 @@ async fn connect_to_hub_and_run(
connection_token,
);
// Heartbeat: liveness timeout detects silent hub failures
let liveness_timeout_dur = Duration::from_secs(45);
let mut last_activity = Instant::now();
let mut liveness_deadline = Box::pin(sleep_until(last_activity + liveness_timeout_dur));
// Read frames from hub
let mut frame_reader = FrameReader::new(buf_reader);
let result = loop {
@@ -387,6 +401,10 @@ async fn connect_to_hub_and_run(
frame_result = frame_reader.next_frame() => {
match frame_result {
Ok(Some(frame)) => {
// Reset liveness on any received frame
last_activity = Instant::now();
liveness_deadline.as_mut().reset(last_activity + liveness_timeout_dur);
match frame.frame_type {
FRAME_DATA_BACK => {
// A1: Non-blocking send to prevent head-of-line blocking
@@ -420,6 +438,14 @@ async fn connect_to_hub_and_run(
);
}
}
FRAME_PING => {
let pong_frame = encode_frame(0, FRAME_PONG, &[]);
if tunnel_writer_tx.try_send(pong_frame).is_err() {
log::warn!("Failed to send PONG, writer channel full/closed");
break EdgeLoopResult::Reconnect;
}
log::trace!("Received PING from hub, sent PONG");
}
_ => {
log::warn!("Unexpected frame type {} from hub", frame.frame_type);
}
@@ -435,6 +461,11 @@ async fn connect_to_hub_and_run(
}
}
}
_ = &mut liveness_deadline => {
log::warn!("Hub liveness timeout (no frames for {}s), reconnecting",
liveness_timeout_dur.as_secs());
break EdgeLoopResult::Reconnect;
}
_ = connection_token.cancelled() => {
log::info!("Connection cancelled");
break EdgeLoopResult::Shutdown;

View File

@@ -1,8 +1,10 @@
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader};
use tokio::net::{TcpListener, TcpStream};
use tokio::sync::{mpsc, Mutex, RwLock, Semaphore};
use tokio::time::{interval, sleep_until, Instant};
use tokio_rustls::TlsAcceptor;
use tokio_util::sync::CancellationToken;
use serde::{Deserialize, Serialize};
@@ -407,6 +409,14 @@ async fn handle_edge_connection(
// A4: Semaphore to limit concurrent streams per edge
let stream_semaphore = Arc::new(Semaphore::new(MAX_STREAMS_PER_EDGE));
// Heartbeat: periodic PING and liveness timeout
let ping_interval_dur = Duration::from_secs(15);
let liveness_timeout_dur = Duration::from_secs(45);
let mut ping_ticker = interval(ping_interval_dur);
ping_ticker.tick().await; // consume the immediate first tick
let mut last_activity = Instant::now();
let mut liveness_deadline = Box::pin(sleep_until(last_activity + liveness_timeout_dur));
// Frame reading loop
let mut frame_reader = FrameReader::new(buf_reader);
@@ -415,6 +425,10 @@ async fn handle_edge_connection(
frame_result = frame_reader.next_frame() => {
match frame_result {
Ok(Some(frame)) => {
// Reset liveness on any received frame
last_activity = Instant::now();
liveness_deadline.as_mut().reset(last_activity + liveness_timeout_dur);
match frame.frame_type {
FRAME_OPEN => {
// A4: Check stream limit before processing
@@ -462,7 +476,7 @@ async fn handle_edge_connection(
let result = async {
// A2: Connect to SmartProxy with timeout
let mut upstream = tokio::time::timeout(
std::time::Duration::from_secs(10),
Duration::from_secs(10),
TcpStream::connect((target.as_str(), dest_port)),
)
.await
@@ -571,6 +585,9 @@ async fn handle_edge_connection(
});
}
}
FRAME_PONG => {
log::debug!("Received PONG from edge {}", edge_id);
}
_ => {
log::warn!("Unexpected frame type {} from edge", frame.frame_type);
}
@@ -586,6 +603,19 @@ async fn handle_edge_connection(
}
}
}
_ = ping_ticker.tick() => {
let ping_frame = encode_frame(0, FRAME_PING, &[]);
if frame_writer_tx.try_send(ping_frame).is_err() {
log::warn!("Failed to send PING to edge {}, writer channel full/closed", edge_id);
break;
}
log::trace!("Sent PING to edge {}", edge_id);
}
_ = &mut liveness_deadline => {
log::warn!("Edge {} liveness timeout (no frames for {}s), disconnecting",
edge_id, liveness_timeout_dur.as_secs());
break;
}
_ = edge_token.cancelled() => {
log::info!("Edge {} cancelled by hub", edge_id);
break;