update
This commit is contained in:
@@ -519,6 +519,7 @@ async fn connect_to_hub_and_run(
|
||||
// Single-owner I/O engine — no tokio::io::split, no mutex
|
||||
let mut tunnel_io = remoteingress_protocol::TunnelIo::new(tls_stream, Vec::new());
|
||||
|
||||
|
||||
let liveness_timeout_dur = Duration::from_secs(45);
|
||||
let mut last_activity = Instant::now();
|
||||
let mut liveness_deadline = Box::pin(sleep_until(last_activity + liveness_timeout_dur));
|
||||
|
||||
@@ -755,6 +755,7 @@ async fn handle_edge_connection(
|
||||
// Single-owner I/O engine — no tokio::io::split, no mutex
|
||||
let mut tunnel_io = remoteingress_protocol::TunnelIo::new(tls_stream, Vec::new());
|
||||
|
||||
|
||||
// Assigned in every break path of the hub_loop before use at the end.
|
||||
#[allow(unused_assignments)]
|
||||
let mut disconnect_reason = String::new();
|
||||
|
||||
@@ -183,6 +183,11 @@ pub enum TunnelEvent {
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
/// Maximum bytes written to TLS session buffer before requiring a flush.
|
||||
/// Prevents unbounded buffer growth (OOM) while keeping the pipe saturated.
|
||||
/// 256KB ≈ typical TCP send buffer — enough to fill the socket on each flush.
|
||||
const MAX_UNFLUSHED_BYTES: usize = 256 * 1024;
|
||||
|
||||
/// Write state extracted into a sub-struct so the borrow checker can see
|
||||
/// disjoint field access between `self.write` and `self.stream`.
|
||||
struct WriteState {
|
||||
@@ -190,6 +195,7 @@ struct WriteState {
|
||||
data_queue: VecDeque<Vec<u8>>, // DATA, DATA_BACK — only when ctrl is empty
|
||||
offset: usize, // progress within current frame being written
|
||||
flush_needed: bool,
|
||||
unflushed_bytes: usize, // bytes written to TLS since last successful flush
|
||||
}
|
||||
|
||||
impl WriteState {
|
||||
@@ -231,6 +237,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
|
||||
data_queue: VecDeque::new(),
|
||||
offset: 0,
|
||||
flush_needed: false,
|
||||
unflushed_bytes: 0,
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -312,12 +319,15 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
|
||||
cancel_token: &tokio_util::sync::CancellationToken,
|
||||
) -> Poll<TunnelEvent> {
|
||||
// 1. WRITE: drain ctrl queue first, then data queue.
|
||||
// Only write when flush is complete — otherwise the TLS session buffer
|
||||
// grows without bound (poll_write always returns Ready, buffering plaintext
|
||||
// in the TLS session even when TCP can't keep up).
|
||||
// Allow up to MAX_UNFLUSHED_BYTES in the TLS session buffer before
|
||||
// requiring a flush. This keeps the pipe saturated (unlike waiting for
|
||||
// flush to complete) while preventing unbounded buffer growth (OOM).
|
||||
// Safe: `self.write` and `self.stream` are disjoint fields.
|
||||
let mut writes = 0;
|
||||
while self.write.has_work() && writes < 16 && !self.write.flush_needed {
|
||||
while self.write.has_work() && writes < 16
|
||||
&& self.write.unflushed_bytes < MAX_UNFLUSHED_BYTES
|
||||
&& !self.write.flush_needed
|
||||
{
|
||||
let from_ctrl = !self.write.ctrl_queue.is_empty();
|
||||
let frame = if from_ctrl {
|
||||
self.write.ctrl_queue.front().unwrap()
|
||||
@@ -328,6 +338,8 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
|
||||
|
||||
match Pin::new(&mut self.stream).poll_write(cx, remaining) {
|
||||
Poll::Ready(Ok(0)) => {
|
||||
log::error!("TunnelIo: poll_write returned 0 (write zero), ctrl_q={} data_q={} unflushed={}",
|
||||
self.write.ctrl_queue.len(), self.write.data_queue.len(), self.write.unflushed_bytes);
|
||||
return Poll::Ready(TunnelEvent::WriteError(
|
||||
std::io::Error::new(std::io::ErrorKind::WriteZero, "write zero"),
|
||||
));
|
||||
@@ -335,6 +347,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
|
||||
Poll::Ready(Ok(n)) => {
|
||||
self.write.offset += n;
|
||||
self.write.flush_needed = true;
|
||||
self.write.unflushed_bytes += n;
|
||||
if self.write.offset >= frame.len() {
|
||||
if from_ctrl { self.write.ctrl_queue.pop_front(); }
|
||||
else { self.write.data_queue.pop_front(); }
|
||||
@@ -342,7 +355,11 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
|
||||
writes += 1;
|
||||
}
|
||||
}
|
||||
Poll::Ready(Err(e)) => return Poll::Ready(TunnelEvent::WriteError(e)),
|
||||
Poll::Ready(Err(e)) => {
|
||||
log::error!("TunnelIo: poll_write error: {} (ctrl_q={} data_q={} unflushed={})",
|
||||
e, self.write.ctrl_queue.len(), self.write.data_queue.len(), self.write.unflushed_bytes);
|
||||
return Poll::Ready(TunnelEvent::WriteError(e));
|
||||
}
|
||||
Poll::Pending => break,
|
||||
}
|
||||
}
|
||||
@@ -350,8 +367,14 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
|
||||
// 2. FLUSH: push encrypted data from TLS session to TCP.
|
||||
if self.write.flush_needed {
|
||||
match Pin::new(&mut self.stream).poll_flush(cx) {
|
||||
Poll::Ready(Ok(())) => self.write.flush_needed = false,
|
||||
Poll::Ready(Err(e)) => return Poll::Ready(TunnelEvent::WriteError(e)),
|
||||
Poll::Ready(Ok(())) => {
|
||||
self.write.flush_needed = false;
|
||||
self.write.unflushed_bytes = 0;
|
||||
}
|
||||
Poll::Ready(Err(e)) => {
|
||||
log::error!("TunnelIo: poll_flush error: {} (unflushed={})", e, self.write.unflushed_bytes);
|
||||
return Poll::Ready(TunnelEvent::WriteError(e));
|
||||
}
|
||||
Poll::Pending => {} // TCP waker will notify us
|
||||
}
|
||||
}
|
||||
@@ -387,12 +410,19 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
|
||||
// Partial data — loop to call poll_read again so the TCP
|
||||
// waker is re-registered when it finally returns Pending.
|
||||
}
|
||||
Poll::Ready(Err(e)) => return Poll::Ready(TunnelEvent::ReadError(e)),
|
||||
Poll::Ready(Err(e)) => {
|
||||
log::error!("TunnelIo: poll_read error: {}", e);
|
||||
return Poll::Ready(TunnelEvent::ReadError(e));
|
||||
}
|
||||
Poll::Pending => break,
|
||||
}
|
||||
}
|
||||
|
||||
// 4. CHANNELS: drain ctrl into ctrl_queue, data into data_queue.
|
||||
// 4. CHANNELS: drain ctrl (always — priority), data (only if queue is small).
|
||||
// Ctrl frames must never be delayed — always drain fully.
|
||||
// Data frames are gated: keep data in the bounded channel for proper
|
||||
// backpressure when TLS writes are slow. Without this gate, the internal
|
||||
// data_queue (unbounded VecDeque) grows to hundreds of MB under throttle → OOM.
|
||||
let mut got_new = false;
|
||||
loop {
|
||||
match ctrl_rx.poll_recv(cx) {
|
||||
@@ -405,15 +435,17 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
|
||||
Poll::Pending => break,
|
||||
}
|
||||
}
|
||||
loop {
|
||||
match data_rx.poll_recv(cx) {
|
||||
Poll::Ready(Some(frame)) => { self.write.data_queue.push_back(frame); got_new = true; }
|
||||
Poll::Ready(None) => {
|
||||
return Poll::Ready(TunnelEvent::WriteError(
|
||||
std::io::Error::new(std::io::ErrorKind::BrokenPipe, "data channel closed"),
|
||||
));
|
||||
if self.write.data_queue.len() < 64 {
|
||||
loop {
|
||||
match data_rx.poll_recv(cx) {
|
||||
Poll::Ready(Some(frame)) => { self.write.data_queue.push_back(frame); got_new = true; }
|
||||
Poll::Ready(None) => {
|
||||
return Poll::Ready(TunnelEvent::WriteError(
|
||||
std::io::Error::new(std::io::ErrorKind::BrokenPipe, "data channel closed"),
|
||||
));
|
||||
}
|
||||
Poll::Pending => break,
|
||||
}
|
||||
Poll::Pending => break,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -426,10 +458,12 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
|
||||
}
|
||||
|
||||
// 6. SELF-WAKE: only when flush is complete AND we have work.
|
||||
// If flush is pending, the TCP write-readiness waker will notify us.
|
||||
// CRITICAL: do NOT self-wake when flush_needed — this causes unbounded
|
||||
// TLS session buffer growth (poll_write always accepts plaintext, but TCP
|
||||
// can't drain it fast enough → OOM → process killed → ECONNRESET).
|
||||
// When flush is Pending, the TCP write-readiness waker will notify us.
|
||||
// CRITICAL: do NOT self-wake when flush_needed — poll_write always returns
|
||||
// Ready (TLS buffers in-memory), so self-waking causes a tight spin loop
|
||||
// that fills the TLS session buffer unboundedly → OOM → ECONNRESET.
|
||||
// The write loop's MAX_UNFLUSHED_BYTES gate allows up to 64KB per poll_step
|
||||
// even across flush boundaries, keeping the pipe saturated without spinning.
|
||||
if !self.write.flush_needed && (got_new || self.write.has_work()) {
|
||||
cx.waker().wake_by_ref();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user