feat(protocol): add sustained-stream tunnel scheduling to isolate high-throughput traffic

This commit is contained in:
2026-03-18 00:13:14 +00:00
parent a63247af3e
commit 6cbe8bee5e
5 changed files with 181 additions and 32 deletions

View File

@@ -2,8 +2,10 @@ use std::collections::VecDeque;
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};
use std::time::Duration;
use bytes::{Bytes, BytesMut, BufMut};
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
use tokio::time::Instant;
// Frame type constants
pub const FRAME_OPEN: u8 = 0x01;
@@ -31,6 +33,16 @@ pub const WINDOW_UPDATE_THRESHOLD: u32 = INITIAL_STREAM_WINDOW / 2;
/// Maximum window size to prevent overflow.
pub const MAX_WINDOW_SIZE: u32 = 4 * 1024 * 1024;
// Sustained stream classification constants
/// Throughput threshold for sustained classification (2.5 MB/s = 20 Mbit/s).
pub const SUSTAINED_THRESHOLD_BPS: u64 = 2_500_000;
/// Minimum duration before a stream can be classified as sustained.
pub const SUSTAINED_MIN_DURATION_SECS: u64 = 10;
/// Fixed window for sustained streams (1 MB — the floor).
pub const SUSTAINED_WINDOW: u32 = 1 * 1024 * 1024;
/// Maximum bytes written from sustained queue per forced drain (1 MB/s guarantee).
pub const SUSTAINED_FORCED_DRAIN_CAP: usize = 1_048_576;
/// Encode a WINDOW_UPDATE frame for a specific stream.
pub fn encode_window_update(stream_id: u32, frame_type: u8, increment: u32) -> Bytes {
encode_frame(stream_id, frame_type, &increment.to_be_bytes())
@@ -185,24 +197,30 @@ pub enum TunnelEvent {
/// Write state extracted into a sub-struct so the borrow checker can see
/// disjoint field access between `self.write` and `self.stream`.
struct WriteState {
ctrl_queue: VecDeque<Bytes>, // PONG, WINDOW_UPDATE, CLOSE, OPEN — always first
data_queue: VecDeque<Bytes>, // DATA, DATA_BACK — only when ctrl is empty
offset: usize, // progress within current frame being written
ctrl_queue: VecDeque<Bytes>, // PONG, WINDOW_UPDATE, CLOSE, OPEN — always first
data_queue: VecDeque<Bytes>, // DATA, DATA_BACK — only when ctrl is empty
sustained_queue: VecDeque<Bytes>, // DATA, DATA_BACK from sustained streams — lowest priority
offset: usize, // progress within current frame being written
flush_needed: bool,
// Sustained starvation prevention: guaranteed 1 MB/s drain
sustained_last_drain: Instant,
sustained_bytes_this_period: usize,
}
impl WriteState {
fn has_work(&self) -> bool {
!self.ctrl_queue.is_empty() || !self.data_queue.is_empty()
!self.ctrl_queue.is_empty() || !self.data_queue.is_empty() || !self.sustained_queue.is_empty()
}
}
/// Single-owner I/O engine for the tunnel TLS connection.
///
/// Owns the TLS stream directly — no `tokio::io::split()`, no mutex.
/// Uses two priority write queues: ctrl frames (PONG, WINDOW_UPDATE, CLOSE, OPEN)
/// are ALWAYS written before data frames (DATA, DATA_BACK). This prevents
/// WINDOW_UPDATE starvation that causes flow control deadlocks.
/// Uses three priority write queues:
/// 1. ctrl (PONG, WINDOW_UPDATE, CLOSE, OPEN) — always first
/// 2. data (DATA, DATA_BACK from normal streams) — when ctrl empty
/// 3. sustained (DATA, DATA_BACK from sustained streams) — lowest priority,
/// drained freely when ctrl+data empty, or forced 1MB/s when they're not
pub struct TunnelIo<S> {
stream: S,
// Read state: accumulate bytes, parse frames incrementally
@@ -228,8 +246,11 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
write: WriteState {
ctrl_queue: VecDeque::new(),
data_queue: VecDeque::new(),
sustained_queue: VecDeque::new(),
offset: 0,
flush_needed: false,
sustained_last_drain: Instant::now(),
sustained_bytes_this_period: 0,
},
}
}
@@ -244,6 +265,11 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
self.write.data_queue.push_back(frame);
}
/// Queue a lowest-priority sustained data frame.
pub fn queue_sustained(&mut self, frame: Bytes) {
self.write.sustained_queue.push_back(frame);
}
/// Try to parse a complete frame from the read buffer.
/// Uses a parse_pos cursor to avoid drain() on every frame.
pub fn try_parse_frame(&mut self) -> Option<Result<Frame, std::io::Error>> {
@@ -303,33 +329,42 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
/// Poll-based I/O step. Returns Ready on events, Pending when idle.
///
/// Order: write(ctrl->data) -> flush -> read -> channels -> timers
/// Order: write(ctrl->data->sustained) -> flush -> read -> channels -> timers
pub fn poll_step(
&mut self,
cx: &mut Context<'_>,
ctrl_rx: &mut tokio::sync::mpsc::Receiver<Bytes>,
data_rx: &mut tokio::sync::mpsc::Receiver<Bytes>,
sustained_rx: &mut tokio::sync::mpsc::Receiver<Bytes>,
liveness_deadline: &mut Pin<Box<tokio::time::Sleep>>,
cancel_token: &tokio_util::sync::CancellationToken,
) -> Poll<TunnelEvent> {
// 1. WRITE: drain ctrl queue first, then data queue.
// 1. WRITE: 3-tier priority — ctrl first, then data, then sustained.
// Sustained drains freely when ctrl+data are empty.
// Write one frame, set flush_needed, then flush must complete before
// writing more. This prevents unbounded TLS session buffer growth.
// Safe: `self.write` and `self.stream` are disjoint fields.
let mut writes = 0;
while self.write.has_work() && writes < 16 && !self.write.flush_needed {
let from_ctrl = !self.write.ctrl_queue.is_empty();
let frame = if from_ctrl {
self.write.ctrl_queue.front().unwrap()
// Pick queue: ctrl > data > sustained
let queue_id = if !self.write.ctrl_queue.is_empty() {
0 // ctrl
} else if !self.write.data_queue.is_empty() {
1 // data
} else {
self.write.data_queue.front().unwrap()
2 // sustained
};
let frame = match queue_id {
0 => self.write.ctrl_queue.front().unwrap(),
1 => self.write.data_queue.front().unwrap(),
_ => self.write.sustained_queue.front().unwrap(),
};
let remaining = &frame[self.write.offset..];
match Pin::new(&mut self.stream).poll_write(cx, remaining) {
Poll::Ready(Ok(0)) => {
log::error!("TunnelIo: poll_write returned 0 (write zero), ctrl_q={} data_q={}",
self.write.ctrl_queue.len(), self.write.data_queue.len());
log::error!("TunnelIo: poll_write returned 0 (write zero), ctrl_q={} data_q={} sustained_q={}",
self.write.ctrl_queue.len(), self.write.data_queue.len(), self.write.sustained_queue.len());
return Poll::Ready(TunnelEvent::WriteError(
std::io::Error::new(std::io::ErrorKind::WriteZero, "write zero"),
));
@@ -338,21 +373,70 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
self.write.offset += n;
self.write.flush_needed = true;
if self.write.offset >= frame.len() {
if from_ctrl { self.write.ctrl_queue.pop_front(); }
else { self.write.data_queue.pop_front(); }
match queue_id {
0 => { self.write.ctrl_queue.pop_front(); }
1 => { self.write.data_queue.pop_front(); }
_ => {
self.write.sustained_queue.pop_front();
self.write.sustained_last_drain = Instant::now();
self.write.sustained_bytes_this_period = 0;
}
}
self.write.offset = 0;
writes += 1;
}
}
Poll::Ready(Err(e)) => {
log::error!("TunnelIo: poll_write error: {} (ctrl_q={} data_q={})",
e, self.write.ctrl_queue.len(), self.write.data_queue.len());
log::error!("TunnelIo: poll_write error: {} (ctrl_q={} data_q={} sustained_q={})",
e, self.write.ctrl_queue.len(), self.write.data_queue.len(), self.write.sustained_queue.len());
return Poll::Ready(TunnelEvent::WriteError(e));
}
Poll::Pending => break,
}
}
// 1b. FORCED SUSTAINED DRAIN: when ctrl/data have work but sustained is waiting,
// guarantee at least 1 MB/s by draining up to SUSTAINED_FORCED_DRAIN_CAP
// once per second.
if !self.write.sustained_queue.is_empty()
&& (!self.write.ctrl_queue.is_empty() || !self.write.data_queue.is_empty())
&& !self.write.flush_needed
{
let now = Instant::now();
if now.duration_since(self.write.sustained_last_drain) >= Duration::from_secs(1) {
self.write.sustained_bytes_this_period = 0;
self.write.sustained_last_drain = now;
while !self.write.sustained_queue.is_empty()
&& self.write.sustained_bytes_this_period < SUSTAINED_FORCED_DRAIN_CAP
&& !self.write.flush_needed
{
let frame = self.write.sustained_queue.front().unwrap();
let remaining = &frame[self.write.offset..];
match Pin::new(&mut self.stream).poll_write(cx, remaining) {
Poll::Ready(Ok(0)) => {
return Poll::Ready(TunnelEvent::WriteError(
std::io::Error::new(std::io::ErrorKind::WriteZero, "write zero"),
));
}
Poll::Ready(Ok(n)) => {
self.write.offset += n;
self.write.flush_needed = true;
self.write.sustained_bytes_this_period += n;
if self.write.offset >= frame.len() {
self.write.sustained_queue.pop_front();
self.write.offset = 0;
}
}
Poll::Ready(Err(e)) => {
return Poll::Ready(TunnelEvent::WriteError(e));
}
Poll::Pending => break,
}
}
}
}
// 2. FLUSH: push encrypted data from TLS session to TCP.
if self.write.flush_needed {
match Pin::new(&mut self.stream).poll_flush(cx) {
@@ -436,6 +520,16 @@ impl<S: AsyncRead + AsyncWrite + Unpin> TunnelIo<S> {
}
}
}
// Sustained channel: drain when sustained_queue is small (same backpressure pattern).
// Channel close is non-fatal — not all connections have sustained streams.
if self.write.sustained_queue.len() < 64 {
loop {
match sustained_rx.poll_recv(cx) {
Poll::Ready(Some(frame)) => { self.write.sustained_queue.push_back(frame); got_new = true; }
Poll::Ready(None) | Poll::Pending => break,
}
}
}
// 5. TIMERS
if liveness_deadline.as_mut().poll(cx).is_ready() {