fix(rust-edge): refactor tunnel I/O to preserve TLS state and prioritize control frames
This commit is contained in:
@@ -2,7 +2,7 @@ use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader};
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::net::{TcpListener, TcpStream};
|
||||
use tokio::sync::{mpsc, Mutex, Notify, RwLock};
|
||||
use tokio::task::JoinHandle;
|
||||
@@ -308,7 +308,7 @@ async fn connect_to_hub_and_run(
|
||||
let server_name = rustls::pki_types::ServerName::try_from(config.hub_host.clone())
|
||||
.unwrap_or_else(|_| rustls::pki_types::ServerName::try_from("remoteingress-hub".to_string()).unwrap());
|
||||
|
||||
let tls_stream = match connector.connect(server_name, tcp).await {
|
||||
let mut tls_stream = match connector.connect(server_name, tcp).await {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
log::error!("TLS handshake failed: {}", e);
|
||||
@@ -316,28 +316,38 @@ async fn connect_to_hub_and_run(
|
||||
}
|
||||
};
|
||||
|
||||
let (read_half, mut write_half) = tokio::io::split(tls_stream);
|
||||
|
||||
// Send auth line
|
||||
// Send auth line (we own the whole stream — no split)
|
||||
let auth_line = format!("EDGE {} {}\n", config.edge_id, config.secret);
|
||||
if write_half.write_all(auth_line.as_bytes()).await.is_err() {
|
||||
if tls_stream.write_all(auth_line.as_bytes()).await.is_err() {
|
||||
return EdgeLoopResult::Reconnect("auth_write_failed".to_string());
|
||||
}
|
||||
if tls_stream.flush().await.is_err() {
|
||||
return EdgeLoopResult::Reconnect("auth_flush_failed".to_string());
|
||||
}
|
||||
|
||||
// Read handshake response line from hub (JSON with initial config)
|
||||
let mut buf_reader = BufReader::new(read_half);
|
||||
let mut handshake_line = String::new();
|
||||
match buf_reader.read_line(&mut handshake_line).await {
|
||||
Ok(0) => {
|
||||
log::error!("Hub rejected connection (EOF before handshake)");
|
||||
return EdgeLoopResult::Reconnect("hub_rejected_eof".to_string());
|
||||
}
|
||||
Ok(_) => {}
|
||||
Err(e) => {
|
||||
log::error!("Failed to read handshake response: {}", e);
|
||||
return EdgeLoopResult::Reconnect(format!("handshake_read_failed: {}", e));
|
||||
// Read handshake line byte-by-byte (no BufReader — into_inner corrupts TLS state)
|
||||
let mut handshake_bytes = Vec::with_capacity(512);
|
||||
let mut byte = [0u8; 1];
|
||||
loop {
|
||||
match tls_stream.read_exact(&mut byte).await {
|
||||
Ok(_) => {
|
||||
handshake_bytes.push(byte[0]);
|
||||
if byte[0] == b'\n' { break; }
|
||||
if handshake_bytes.len() > 8192 {
|
||||
return EdgeLoopResult::Reconnect("handshake_too_long".to_string());
|
||||
}
|
||||
}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
|
||||
log::error!("Hub rejected connection (EOF before handshake)");
|
||||
return EdgeLoopResult::Reconnect("hub_rejected_eof".to_string());
|
||||
}
|
||||
Err(e) => {
|
||||
log::error!("Failed to read handshake response: {}", e);
|
||||
return EdgeLoopResult::Reconnect(format!("handshake_read_failed: {}", e));
|
||||
}
|
||||
}
|
||||
}
|
||||
let handshake_line = String::from_utf8_lossy(&handshake_bytes);
|
||||
|
||||
let handshake: HandshakeConfig = match serde_json::from_str(handshake_line.trim()) {
|
||||
Ok(h) => h,
|
||||
@@ -394,52 +404,13 @@ async fn connect_to_hub_and_run(
|
||||
let client_writers: Arc<Mutex<HashMap<u32, EdgeStreamState>>> =
|
||||
Arc::new(Mutex::new(HashMap::new()));
|
||||
|
||||
// QoS dual-channel tunnel writer: control frames (PONG/WINDOW_UPDATE/CLOSE/OPEN)
|
||||
// have priority over data frames (DATA). Prevents PING starvation under load.
|
||||
// QoS dual-channel: ctrl frames have priority over data frames.
|
||||
// Stream handlers send through these channels → TunnelIo drains them.
|
||||
let (tunnel_ctrl_tx, mut tunnel_ctrl_rx) = mpsc::channel::<Vec<u8>>(256);
|
||||
let (tunnel_data_tx, mut tunnel_data_rx) = mpsc::channel::<Vec<u8>>(4096);
|
||||
// Legacy alias — control channel for PONG, CLOSE, WINDOW_UPDATE, OPEN
|
||||
let tunnel_writer_tx = tunnel_ctrl_tx.clone();
|
||||
let tw_token = connection_token.clone();
|
||||
// Oneshot to signal the reader loop when the writer dies from a write error.
|
||||
// This avoids the 45s liveness timeout delay when the tunnel is already dead.
|
||||
let (writer_dead_tx, mut writer_dead_rx) = tokio::sync::oneshot::channel::<()>();
|
||||
let tunnel_writer_handle = tokio::spawn(async move {
|
||||
// BufWriter coalesces small writes (frame headers, control frames) into fewer
|
||||
// TLS records and syscalls. Flushed after each frame to avoid holding data.
|
||||
let mut writer = tokio::io::BufWriter::with_capacity(65536, write_half);
|
||||
let mut write_error = false;
|
||||
loop {
|
||||
tokio::select! {
|
||||
biased; // control frames always take priority over data
|
||||
ctrl = tunnel_ctrl_rx.recv() => {
|
||||
match ctrl {
|
||||
Some(frame_data) => {
|
||||
if writer.write_all(&frame_data).await.is_err() { write_error = true; break; }
|
||||
if writer.flush().await.is_err() { write_error = true; break; }
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
data = tunnel_data_rx.recv() => {
|
||||
match data {
|
||||
Some(frame_data) => {
|
||||
if writer.write_all(&frame_data).await.is_err() { write_error = true; break; }
|
||||
if writer.flush().await.is_err() { write_error = true; break; }
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
_ = tw_token.cancelled() => break,
|
||||
}
|
||||
}
|
||||
if write_error {
|
||||
log::error!("Tunnel writer failed, signalling reader for fast reconnect");
|
||||
let _ = writer_dead_tx.send(());
|
||||
}
|
||||
});
|
||||
|
||||
// Start TCP listeners for initial ports (hot-reloadable)
|
||||
// Start TCP listeners for initial ports
|
||||
let mut port_listeners: HashMap<u16, JoinHandle<()>> = HashMap::new();
|
||||
let bind_address = config.bind_address.as_deref().unwrap_or("0.0.0.0");
|
||||
apply_port_config(
|
||||
@@ -455,122 +426,180 @@ async fn connect_to_hub_and_run(
|
||||
bind_address,
|
||||
);
|
||||
|
||||
// Heartbeat: liveness timeout detects silent hub failures
|
||||
// Single-owner I/O engine — no tokio::io::split, no mutex
|
||||
let mut tunnel_io = remoteingress_protocol::TunnelIo::new(tls_stream, Vec::new());
|
||||
|
||||
let liveness_timeout_dur = Duration::from_secs(45);
|
||||
let mut last_activity = Instant::now();
|
||||
let mut liveness_deadline = Box::pin(sleep_until(last_activity + liveness_timeout_dur));
|
||||
|
||||
// Read frames from hub
|
||||
let mut frame_reader = FrameReader::new(buf_reader);
|
||||
let result = loop {
|
||||
tokio::select! {
|
||||
frame_result = frame_reader.next_frame() => {
|
||||
match frame_result {
|
||||
Ok(Some(frame)) => {
|
||||
// Reset liveness on any received frame
|
||||
last_activity = Instant::now();
|
||||
liveness_deadline.as_mut().reset(last_activity + liveness_timeout_dur);
|
||||
|
||||
match frame.frame_type {
|
||||
FRAME_DATA_BACK => {
|
||||
// Non-blocking dispatch to per-stream channel.
|
||||
// With flow control, the sender should rarely exceed the channel capacity.
|
||||
let mut writers = client_writers.lock().await;
|
||||
if let Some(state) = writers.get(&frame.stream_id) {
|
||||
if state.back_tx.try_send(frame.payload).is_err() {
|
||||
log::warn!("Stream {} back-channel full, closing stream", frame.stream_id);
|
||||
writers.remove(&frame.stream_id);
|
||||
}
|
||||
let result = 'io_loop: loop {
|
||||
// Drain any buffered frames
|
||||
loop {
|
||||
match tunnel_io.try_parse_frame() {
|
||||
Some(Ok(frame)) => {
|
||||
last_activity = Instant::now();
|
||||
liveness_deadline.as_mut().reset(last_activity + liveness_timeout_dur);
|
||||
match frame.frame_type {
|
||||
FRAME_DATA_BACK => {
|
||||
let mut writers = client_writers.lock().await;
|
||||
if let Some(state) = writers.get(&frame.stream_id) {
|
||||
if state.back_tx.try_send(frame.payload).is_err() {
|
||||
log::warn!("Stream {} back-channel full, closing", frame.stream_id);
|
||||
writers.remove(&frame.stream_id);
|
||||
}
|
||||
}
|
||||
FRAME_WINDOW_UPDATE_BACK => {
|
||||
// Hub consumed data — increase our send window for this stream (upload direction)
|
||||
if let Some(increment) = decode_window_update(&frame.payload) {
|
||||
if increment > 0 {
|
||||
let writers = client_writers.lock().await;
|
||||
if let Some(state) = writers.get(&frame.stream_id) {
|
||||
let prev = state.send_window.fetch_add(increment, Ordering::Release);
|
||||
if prev + increment > MAX_WINDOW_SIZE {
|
||||
state.send_window.store(MAX_WINDOW_SIZE, Ordering::Release);
|
||||
}
|
||||
state.window_notify.notify_one();
|
||||
}
|
||||
FRAME_WINDOW_UPDATE_BACK => {
|
||||
if let Some(increment) = decode_window_update(&frame.payload) {
|
||||
if increment > 0 {
|
||||
let writers = client_writers.lock().await;
|
||||
if let Some(state) = writers.get(&frame.stream_id) {
|
||||
let prev = state.send_window.fetch_add(increment, Ordering::Release);
|
||||
if prev + increment > MAX_WINDOW_SIZE {
|
||||
state.send_window.store(MAX_WINDOW_SIZE, Ordering::Release);
|
||||
}
|
||||
state.window_notify.notify_one();
|
||||
}
|
||||
}
|
||||
}
|
||||
FRAME_CLOSE_BACK => {
|
||||
let mut writers = client_writers.lock().await;
|
||||
}
|
||||
FRAME_CLOSE_BACK => {
|
||||
let mut writers = client_writers.lock().await;
|
||||
writers.remove(&frame.stream_id);
|
||||
}
|
||||
FRAME_CONFIG => {
|
||||
if let Ok(update) = serde_json::from_slice::<ConfigUpdate>(&frame.payload) {
|
||||
log::info!("Config update from hub: ports {:?}", update.listen_ports);
|
||||
*listen_ports.write().await = update.listen_ports.clone();
|
||||
let _ = event_tx.try_send(EdgeEvent::PortsUpdated {
|
||||
listen_ports: update.listen_ports.clone(),
|
||||
});
|
||||
apply_port_config(
|
||||
&update.listen_ports,
|
||||
&mut port_listeners,
|
||||
&tunnel_writer_tx,
|
||||
&tunnel_data_tx,
|
||||
&client_writers,
|
||||
active_streams,
|
||||
next_stream_id,
|
||||
&config.edge_id,
|
||||
connection_token,
|
||||
bind_address,
|
||||
);
|
||||
}
|
||||
}
|
||||
FRAME_PING => {
|
||||
// Queue PONG directly — no channel round-trip, guaranteed delivery
|
||||
tunnel_io.queue_ctrl(encode_frame(0, FRAME_PONG, &[]));
|
||||
}
|
||||
_ => {
|
||||
log::warn!("Unexpected frame type {} from hub", frame.frame_type);
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
log::error!("Hub frame error: {}", e);
|
||||
break 'io_loop EdgeLoopResult::Reconnect(format!("hub_frame_error: {}", e));
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
// Poll I/O: write(ctrl→data), flush, read, channels, timers
|
||||
let event = std::future::poll_fn(|cx| {
|
||||
tunnel_io.poll_step(cx, &mut tunnel_ctrl_rx, &mut tunnel_data_rx, &mut liveness_deadline, connection_token)
|
||||
}).await;
|
||||
|
||||
match event {
|
||||
remoteingress_protocol::TunnelEvent::Frame(frame) => {
|
||||
last_activity = Instant::now();
|
||||
liveness_deadline.as_mut().reset(last_activity + liveness_timeout_dur);
|
||||
match frame.frame_type {
|
||||
FRAME_DATA_BACK => {
|
||||
let mut writers = client_writers.lock().await;
|
||||
if let Some(state) = writers.get(&frame.stream_id) {
|
||||
if state.back_tx.try_send(frame.payload).is_err() {
|
||||
log::warn!("Stream {} back-channel full, closing", frame.stream_id);
|
||||
writers.remove(&frame.stream_id);
|
||||
}
|
||||
FRAME_CONFIG => {
|
||||
if let Ok(update) = serde_json::from_slice::<ConfigUpdate>(&frame.payload) {
|
||||
log::info!("Config update from hub: ports {:?}", update.listen_ports);
|
||||
*listen_ports.write().await = update.listen_ports.clone();
|
||||
let _ = event_tx.try_send(EdgeEvent::PortsUpdated {
|
||||
listen_ports: update.listen_ports.clone(),
|
||||
});
|
||||
apply_port_config(
|
||||
&update.listen_ports,
|
||||
&mut port_listeners,
|
||||
&tunnel_writer_tx,
|
||||
&tunnel_data_tx,
|
||||
&client_writers,
|
||||
active_streams,
|
||||
next_stream_id,
|
||||
&config.edge_id,
|
||||
connection_token,
|
||||
bind_address,
|
||||
);
|
||||
}
|
||||
}
|
||||
FRAME_PING => {
|
||||
let pong_frame = encode_frame(0, FRAME_PONG, &[]);
|
||||
if tunnel_writer_tx.try_send(pong_frame).is_err() {
|
||||
// Control channel full (WINDOW_UPDATE burst from many streams).
|
||||
// DON'T disconnect — the 45s liveness timeout gives margin
|
||||
// for the channel to drain and the next PONG to succeed.
|
||||
log::warn!("PONG send failed, control channel full — skipping this cycle");
|
||||
}
|
||||
log::trace!("Received PING from hub, sent PONG");
|
||||
}
|
||||
_ => {
|
||||
log::warn!("Unexpected frame type {} from hub", frame.frame_type);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None) => {
|
||||
log::info!("Hub disconnected (EOF)");
|
||||
break EdgeLoopResult::Reconnect("hub_eof".to_string());
|
||||
FRAME_WINDOW_UPDATE_BACK => {
|
||||
if let Some(increment) = decode_window_update(&frame.payload) {
|
||||
if increment > 0 {
|
||||
let writers = client_writers.lock().await;
|
||||
if let Some(state) = writers.get(&frame.stream_id) {
|
||||
let prev = state.send_window.fetch_add(increment, Ordering::Release);
|
||||
if prev + increment > MAX_WINDOW_SIZE {
|
||||
state.send_window.store(MAX_WINDOW_SIZE, Ordering::Release);
|
||||
}
|
||||
state.window_notify.notify_one();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
log::error!("Hub frame error: {}", e);
|
||||
break EdgeLoopResult::Reconnect(format!("hub_frame_error: {}", e));
|
||||
FRAME_CLOSE_BACK => {
|
||||
let mut writers = client_writers.lock().await;
|
||||
writers.remove(&frame.stream_id);
|
||||
}
|
||||
FRAME_CONFIG => {
|
||||
if let Ok(update) = serde_json::from_slice::<ConfigUpdate>(&frame.payload) {
|
||||
log::info!("Config update from hub: ports {:?}", update.listen_ports);
|
||||
*listen_ports.write().await = update.listen_ports.clone();
|
||||
let _ = event_tx.try_send(EdgeEvent::PortsUpdated {
|
||||
listen_ports: update.listen_ports.clone(),
|
||||
});
|
||||
apply_port_config(
|
||||
&update.listen_ports,
|
||||
&mut port_listeners,
|
||||
&tunnel_writer_tx,
|
||||
&tunnel_data_tx,
|
||||
&client_writers,
|
||||
active_streams,
|
||||
next_stream_id,
|
||||
&config.edge_id,
|
||||
connection_token,
|
||||
bind_address,
|
||||
);
|
||||
}
|
||||
}
|
||||
FRAME_PING => {
|
||||
tunnel_io.queue_ctrl(encode_frame(0, FRAME_PONG, &[]));
|
||||
}
|
||||
_ => {
|
||||
log::warn!("Unexpected frame type {} from hub", frame.frame_type);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ = &mut liveness_deadline => {
|
||||
log::warn!("Hub liveness timeout (no frames for {}s), reconnecting",
|
||||
liveness_timeout_dur.as_secs());
|
||||
remoteingress_protocol::TunnelEvent::Eof => {
|
||||
log::info!("Hub disconnected (EOF)");
|
||||
break EdgeLoopResult::Reconnect("hub_eof".to_string());
|
||||
}
|
||||
remoteingress_protocol::TunnelEvent::ReadError(e) => {
|
||||
log::error!("Hub frame read error: {}", e);
|
||||
break EdgeLoopResult::Reconnect(format!("hub_frame_error: {}", e));
|
||||
}
|
||||
remoteingress_protocol::TunnelEvent::WriteError(e) => {
|
||||
log::error!("Tunnel write error: {}", e);
|
||||
break EdgeLoopResult::Reconnect(format!("tunnel_write_error: {}", e));
|
||||
}
|
||||
remoteingress_protocol::TunnelEvent::LivenessTimeout => {
|
||||
log::warn!("Hub liveness timeout (no frames for {}s), reconnecting", liveness_timeout_dur.as_secs());
|
||||
break EdgeLoopResult::Reconnect("liveness_timeout".to_string());
|
||||
}
|
||||
_ = &mut writer_dead_rx => {
|
||||
log::error!("Tunnel writer died, reconnecting immediately");
|
||||
break EdgeLoopResult::Reconnect("writer_dead".to_string());
|
||||
}
|
||||
_ = connection_token.cancelled() => {
|
||||
log::info!("Connection cancelled");
|
||||
break EdgeLoopResult::Shutdown;
|
||||
}
|
||||
_ = shutdown_rx.recv() => {
|
||||
remoteingress_protocol::TunnelEvent::Cancelled => {
|
||||
if shutdown_rx.try_recv().is_ok() {
|
||||
break EdgeLoopResult::Shutdown;
|
||||
}
|
||||
break EdgeLoopResult::Shutdown;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Cancel connection token to propagate to all child tasks BEFORE aborting
|
||||
// Cleanup
|
||||
connection_token.cancel();
|
||||
stun_handle.abort();
|
||||
tunnel_writer_handle.abort();
|
||||
for (_, h) in port_listeners.drain() {
|
||||
h.abort();
|
||||
}
|
||||
@@ -717,7 +746,7 @@ async fn handle_client_connection(
|
||||
}
|
||||
|
||||
// Set up bounded channel for data coming back from hub (with flow control, the sender should rarely exceed its capacity)
|
||||
let (back_tx, mut back_rx) = mpsc::channel::<Vec<u8>>(256);
|
||||
let (back_tx, mut back_rx) = mpsc::channel::<Vec<u8>>(1024);
|
||||
// Adaptive initial window: scale with current stream count to keep total in-flight
|
||||
// data within the 32MB budget. Prevents burst flooding when many streams open.
|
||||
let initial_window = remoteingress_protocol::compute_window_for_stream_count(
|
||||
|
||||
Reference in New Issue
Block a user