feat(mixer): enhance mixer functionality with interaction and tool legs

- Updated the mixer to handle participant and isolated leg roles, allowing for IVR and consent interactions.
- Introduced commands for starting and canceling interactions and for managing tool legs for recording and transcription.
- Implemented per-source audio handling for tool legs, enabling separate audio processing.
- Enhanced DTMF handling to forward events between participant legs only.
- Added support for PCM recording directly from tool legs, with WAV file generation.
- Updated TypeScript definitions and functions to support the new interaction and tool leg features.
2026-04-10 14:54:21 +00:00
parent 6a130db7c7
commit 7d59361352
13 changed files with 1448 additions and 94 deletions
--- a/rust/crates/proxy-engine/src/mixer.rs
+++ b/rust/crates/proxy-engine/src/mixer.rs
@@ -5,14 +5,17 @@
 //!
 //! The mixer runs a 20ms tick loop:
 //! 1. Drain inbound channels, decode to PCM, resample to 16kHz
-//! 2. Compute total mix (sum of all legs' PCM as i32)
-//! 3. For each leg: mix-minus = total - own, resample to leg codec rate, encode, send
+//! 2. Compute total mix (sum of all **participant** legs' PCM as i32)
+//! 3. For each participant leg: mix-minus = total - own, resample to leg codec rate, encode, send
+//! 4. For each isolated leg: play prompt frame or silence, check DTMF
+//! 5. For each tool leg: send per-source unmerged audio batch
+//! 6. Forward DTMF between participant legs only

 use crate::ipc::{emit_event, OutTx};
 use crate::rtp::{build_rtp_header, rtp_clock_increment};
 use codec_lib::{codec_sample_rate, TranscodeState};
-use std::collections::HashMap;
-use tokio::sync::mpsc;
+use std::collections::{HashMap, VecDeque};
+use tokio::sync::{mpsc, oneshot};
 use tokio::task::JoinHandle;
 use tokio::time::{self, Duration, MissedTickBehavior};

@@ -25,11 +28,84 @@ const MIX_FRAME_SIZE: usize = 320; // 16000 * 0.020
 pub struct RtpPacket {
    pub payload: Vec<u8>,
    pub payload_type: u8,
+    /// RTP marker bit (first packet of a DTMF event, etc.).
+    pub marker: bool,
+    /// RTP timestamp from the original packet header.
+    pub timestamp: u32,
 }

+// ---------------------------------------------------------------------------
+// Leg roles
+// ---------------------------------------------------------------------------
+
+/// What role a leg currently plays in the mixer.
+enum LegRole {
+    /// Normal participant: contributes to mix, receives mix-minus.
+    Participant,
+    /// Temporarily isolated for IVR/consent interaction.
+    Isolated(IsolationState),
+}
+
+struct IsolationState {
+    /// PCM frames at MIX_RATE (320 samples each) queued for playback.
+    prompt_frames: VecDeque<Vec<i16>>,
+    /// Digits that complete the interaction (e.g., ['1', '2']).
+    expected_digits: Vec<char>,
+    /// Ticks remaining before timeout (decremented each tick after prompt ends).
+    timeout_ticks_remaining: u32,
+    /// Whether we've finished playing the prompt.
+    prompt_done: bool,
+    /// Channel to send the result back to the command handler.
+    result_tx: Option<oneshot::Sender<InteractionResult>>,
+}
+
+/// Result of a leg interaction (consent prompt, IVR, etc.).
+pub enum InteractionResult {
+    /// The participant pressed one of the expected digits.
+    Digit(char),
+    /// No digit was received within the timeout.
+    Timeout,
+    /// Interaction ended early: leg removed or unknown, replaced by a new interaction, explicitly cancelled, or mixer shut down.
+    Cancelled,
+}
+
+// ---------------------------------------------------------------------------
+// Tool legs
+// ---------------------------------------------------------------------------
+
+/// Type of tool leg.
+#[derive(Debug, Clone, Copy)]
+pub enum ToolType {
+    Recording,
+    Transcription,
+}
+
+/// Per-source audio delivered to a tool leg each mixer tick.
+pub struct ToolAudioBatch {
+    pub sources: Vec<ToolAudioSource>,
+}
+
+/// One participant's 20ms audio frame.
+pub struct ToolAudioSource {
+    pub leg_id: String,
+    /// PCM at 16kHz, MIX_FRAME_SIZE (320) samples.
+    pub pcm_16k: Vec<i16>,
+}
+
+/// Internal storage for a tool leg inside the mixer.
+struct ToolLegSlot {
+    #[allow(dead_code)]
+    tool_type: ToolType,
+    audio_tx: mpsc::Sender<ToolAudioBatch>,
+}
+
+// ---------------------------------------------------------------------------
+// Commands
+// ---------------------------------------------------------------------------
+
 /// Commands sent to the mixer task via a control channel.
 pub enum MixerCommand {
-    /// Add a new leg to the mix.
+    /// Add a new participant leg to the mix.
    AddLeg {
        leg_id: String,
        codec_pt: u8,
@@ -40,8 +116,35 @@ pub enum MixerCommand {
    RemoveLeg { leg_id: String },
    /// Shut down the mixer.
    Shutdown,
+
+    /// Isolate a leg and start an interaction (consent prompt, IVR).
+    /// The leg is removed from the mix and hears the prompt instead.
+    /// DTMF from the leg is checked against expected_digits.
+    StartInteraction {
+        leg_id: String,
+        /// PCM frames at MIX_RATE (16kHz), each 320 samples.
+        prompt_pcm_frames: Vec<Vec<i16>>,
+        expected_digits: Vec<char>,
+        timeout_ms: u32,
+        result_tx: oneshot::Sender<InteractionResult>,
+    },
+    /// Cancel an in-progress interaction (e.g., leg being removed).
+    CancelInteraction { leg_id: String },
+
+    /// Add a tool leg that receives per-source unmerged audio.
+    AddToolLeg {
+        leg_id: String,
+        tool_type: ToolType,
+        audio_tx: mpsc::Sender<ToolAudioBatch>,
+    },
+    /// Remove a tool leg (drops the channel, background task finalizes).
+    RemoveToolLeg { leg_id: String },
 }

+// ---------------------------------------------------------------------------
+// Mixer internals
+// ---------------------------------------------------------------------------
+
 /// Internal per-leg state inside the mixer.
 struct MixerLegSlot {
    codec_pt: u8,
@@ -56,6 +159,8 @@ struct MixerLegSlot {
    rtp_seq: u16,
    rtp_ts: u32,
    rtp_ssrc: u32,
+    /// Current role of this leg in the mixer.
+    role: LegRole,
 }

 /// Spawn the mixer task for a call. Returns the command sender and task handle.
@@ -79,13 +184,14 @@ async fn mixer_loop(
    out_tx: OutTx,
 ) {
    let mut legs: HashMap<String, MixerLegSlot> = HashMap::new();
+    let mut tool_legs: HashMap<String, ToolLegSlot> = HashMap::new();
    let mut interval = time::interval(Duration::from_millis(20));
    interval.set_missed_tick_behavior(MissedTickBehavior::Skip);

    loop {
        interval.tick().await;

-        // 1. Process control commands (non-blocking).
+        // ── 1. Process control commands (non-blocking). ─────────────
        loop {
            match cmd_rx.try_recv() {
                Ok(MixerCommand::AddLeg {
@@ -121,38 +227,115 @@ async fn mixer_loop(
                            rtp_seq: 0,
                            rtp_ts: 0,
                            rtp_ssrc: rand::random(),
+                            role: LegRole::Participant,
                        },
                    );
                }
                Ok(MixerCommand::RemoveLeg { leg_id }) => {
+                    // If the leg is isolated, send Cancelled before dropping.
+                    if let Some(slot) = legs.get_mut(&leg_id) {
+                        if let LegRole::Isolated(ref mut state) = slot.role {
+                            if let Some(tx) = state.result_tx.take() {
+                                let _ = tx.send(InteractionResult::Cancelled);
+                            }
+                        }
+                    }
                    legs.remove(&leg_id);
                    // Channels drop → I/O tasks exit cleanly.
                }
-                Ok(MixerCommand::Shutdown) => return,
+                Ok(MixerCommand::Shutdown) => {
+                    // Cancel all outstanding interactions before shutting down.
+                    for slot in legs.values_mut() {
+                        if let LegRole::Isolated(ref mut state) = slot.role {
+                            if let Some(tx) = state.result_tx.take() {
+                                let _ = tx.send(InteractionResult::Cancelled);
+                            }
+                        }
+                    }
+                    return;
+                }
+                Ok(MixerCommand::StartInteraction {
+                    leg_id,
+                    prompt_pcm_frames,
+                    expected_digits,
+                    timeout_ms,
+                    result_tx,
+                }) => {
+                    if let Some(slot) = legs.get_mut(&leg_id) {
+                        // Cancel any existing interaction first.
+                        if let LegRole::Isolated(ref mut old_state) = slot.role {
+                            if let Some(tx) = old_state.result_tx.take() {
+                                let _ = tx.send(InteractionResult::Cancelled);
+                            }
+                        }
+                        let timeout_ticks = timeout_ms / 20;
+                        slot.role = LegRole::Isolated(IsolationState {
+                            prompt_frames: VecDeque::from(prompt_pcm_frames),
+                            expected_digits,
+                            timeout_ticks_remaining: timeout_ticks,
+                            prompt_done: false,
+                            result_tx: Some(result_tx),
+                        });
+                    } else {
+                        // Leg not found — immediately cancel.
+                        let _ = result_tx.send(InteractionResult::Cancelled);
+                    }
+                }
+                Ok(MixerCommand::CancelInteraction { leg_id }) => {
+                    if let Some(slot) = legs.get_mut(&leg_id) {
+                        if let LegRole::Isolated(ref mut state) = slot.role {
+                            if let Some(tx) = state.result_tx.take() {
+                                let _ = tx.send(InteractionResult::Cancelled);
+                            }
+                        }
+                        slot.role = LegRole::Participant;
+                    }
+                }
+                Ok(MixerCommand::AddToolLeg {
+                    leg_id,
+                    tool_type,
+                    audio_tx,
+                }) => {
+                    tool_legs.insert(leg_id, ToolLegSlot { tool_type, audio_tx });
+                }
+                Ok(MixerCommand::RemoveToolLeg { leg_id }) => {
+                    tool_legs.remove(&leg_id);
+                    // Dropping the ToolLegSlot drops audio_tx → background task sees channel close.
+                }
                Err(mpsc::error::TryRecvError::Empty) => break,
                Err(mpsc::error::TryRecvError::Disconnected) => return,
            }
        }

-        if legs.is_empty() {
+        if legs.is_empty() && tool_legs.is_empty() {
            continue;
        }

-        // 2. Drain inbound packets, decode to 16kHz PCM.
+        // ── 2. Drain inbound packets, decode to 16kHz PCM. ─────────
+        //    DTMF (PT 101) packets are collected separately.
        let leg_ids: Vec<String> = legs.keys().cloned().collect();
+        let mut dtmf_forward: Vec<(String, RtpPacket)> = Vec::new();
+
        for lid in &leg_ids {
            let slot = legs.get_mut(lid).unwrap();

-            // Drain channel, keep only the latest packet (simple jitter handling).
-            let mut latest: Option<RtpPacket> = None;
+            // Drain channel — collect DTMF packets separately, keep latest audio.
+            let mut latest_audio: Option<RtpPacket> = None;
            loop {
                match slot.inbound_rx.try_recv() {
-                    Ok(pkt) => latest = Some(pkt),
+                    Ok(pkt) => {
+                        if pkt.payload_type == 101 {
+                            // DTMF telephone-event: collect for processing.
+                            dtmf_forward.push((lid.clone(), pkt));
+                        } else {
+                            latest_audio = Some(pkt);
+                        }
+                    }
                    Err(_) => break,
                }
            }

-            if let Some(pkt) = latest {
+            if let Some(pkt) = latest_audio {
                slot.silent_ticks = 0;
                match slot.transcoder.decode_to_pcm(&pkt.payload, pkt.payload_type) {
                    Ok((pcm, rate)) => {
@@ -174,6 +357,9 @@ async fn mixer_loop(
                        slot.last_pcm_frame = vec![0i16; MIX_FRAME_SIZE];
                    }
                }
+            } else if dtmf_forward.iter().any(|(src, _)| src == lid) {
+                // Got DTMF but no audio — reset silent_ticks (DTMF counts as activity).
+                slot.silent_ticks = 0;
            } else {
                slot.silent_ticks += 1;
                // After 150 ticks (3 seconds) of silence, zero out to avoid stale audio.
@@ -183,50 +369,210 @@ async fn mixer_loop(
            }
        }

-        // 3. Compute total mix (sum of all legs as i32 to avoid overflow).
+        // ── 3. Compute total mix from PARTICIPANT legs only. ────────
        let mut total_mix = vec![0i32; MIX_FRAME_SIZE];
        for slot in legs.values() {
-            for (i, &s) in slot.last_pcm_frame.iter().enumerate().take(MIX_FRAME_SIZE) {
-                total_mix[i] += s as i32;
+            if matches!(slot.role, LegRole::Participant) {
+                for (i, &s) in slot.last_pcm_frame.iter().enumerate().take(MIX_FRAME_SIZE) {
+                    total_mix[i] += s as i32;
+                }
            }
        }

-        // 4. For each leg: mix-minus, resample, encode, send.
-        for slot in legs.values_mut() {
-            // Mix-minus: total minus this leg's own contribution.
-            let mut mix_minus = Vec::with_capacity(MIX_FRAME_SIZE);
-            for i in 0..MIX_FRAME_SIZE {
-                let sample =
-                    (total_mix[i] - slot.last_pcm_frame[i] as i32).clamp(-32768, 32767) as i16;
-                mix_minus.push(sample);
+        // ── 4. Per-leg output. ──────────────────────────────────────
+        // Collect interaction completions to apply after the loop
+        // (can't mutate role while iterating mutably for encode).
+        let mut completed_interactions: Vec<(String, InteractionResult)> = Vec::new();
+
+        for (lid, slot) in legs.iter_mut() {
+            match &mut slot.role {
+                LegRole::Participant => {
+                    // Mix-minus: total minus this leg's own contribution.
+                    let mut mix_minus = Vec::with_capacity(MIX_FRAME_SIZE);
+                    for i in 0..MIX_FRAME_SIZE {
+                        let sample = (total_mix[i] - slot.last_pcm_frame[i] as i32)
+                            .clamp(-32768, 32767) as i16;
+                        mix_minus.push(sample);
+                    }
+
+                    // Resample from 16kHz to the leg's codec native rate.
+                    let target_rate = codec_sample_rate(slot.codec_pt);
+                    let resampled = if target_rate == MIX_RATE {
+                        mix_minus
+                    } else {
+                        slot.transcoder
+                            .resample(&mix_minus, MIX_RATE, target_rate)
+                            .unwrap_or_default()
+                    };
+
+                    // Encode to the leg's codec.
+                    let encoded =
+                        match slot.transcoder.encode_from_pcm(&resampled, slot.codec_pt) {
+                            Ok(e) if !e.is_empty() => e,
+                            _ => continue,
+                        };
+
+                    // Build RTP packet with header.
+                    let header =
+                        build_rtp_header(slot.codec_pt, slot.rtp_seq, slot.rtp_ts, slot.rtp_ssrc);
+                    let mut rtp = header.to_vec();
+                    rtp.extend_from_slice(&encoded);
+
+                    slot.rtp_seq = slot.rtp_seq.wrapping_add(1);
+                    slot.rtp_ts = slot.rtp_ts.wrapping_add(rtp_clock_increment(slot.codec_pt));
+
+                    // Non-blocking send — drop frame if channel is full.
+                    let _ = slot.outbound_tx.try_send(rtp);
+                }
+                LegRole::Isolated(state) => {
+                    // Check for DTMF digit from this leg.
+                    let mut matched_digit: Option<char> = None;
+                    for (src_lid, dtmf_pkt) in &dtmf_forward {
+                        if src_lid == lid && dtmf_pkt.payload.len() >= 4 {
+                            let event_id = dtmf_pkt.payload[0];
+                            let end_bit = (dtmf_pkt.payload[1] & 0x80) != 0;
+                            if end_bit {
+                                const EVENT_CHARS: &[char] = &[
+                                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '*', '#',
+                                    'A', 'B', 'C', 'D',
+                                ];
+                                if let Some(&ch) = EVENT_CHARS.get(event_id as usize) {
+                                    if state.expected_digits.contains(&ch) {
+                                        matched_digit = Some(ch);
+                                        break;
+                                    }
+                                }
+                            }
+                        }
+                    }
+
+                    if let Some(digit) = matched_digit {
+                        // Interaction complete — digit matched.
+                        completed_interactions
+                            .push((lid.clone(), InteractionResult::Digit(digit)));
+                    } else {
+                        // Play prompt frame or silence.
+                        let pcm_frame = if let Some(frame) = state.prompt_frames.pop_front() {
+                            frame
+                        } else {
+                            state.prompt_done = true;
+                            vec![0i16; MIX_FRAME_SIZE]
+                        };
+
+                        // Encode prompt frame to the leg's codec (reuses existing encode path).
+                        let target_rate = codec_sample_rate(slot.codec_pt);
+                        let resampled = if target_rate == MIX_RATE {
+                            pcm_frame
+                        } else {
+                            slot.transcoder
+                                .resample(&pcm_frame, MIX_RATE, target_rate)
+                                .unwrap_or_default()
+                        };
+
+                        if let Ok(encoded) =
+                            slot.transcoder.encode_from_pcm(&resampled, slot.codec_pt)
+                        {
+                            if !encoded.is_empty() {
+                                let header = build_rtp_header(
+                                    slot.codec_pt,
+                                    slot.rtp_seq,
+                                    slot.rtp_ts,
+                                    slot.rtp_ssrc,
+                                );
+                                let mut rtp = header.to_vec();
+                                rtp.extend_from_slice(&encoded);
+                                slot.rtp_seq = slot.rtp_seq.wrapping_add(1);
+                                slot.rtp_ts = slot
+                                    .rtp_ts
+                                    .wrapping_add(rtp_clock_increment(slot.codec_pt));
+                                let _ = slot.outbound_tx.try_send(rtp);
+                            }
+                        }
+
+                        // Check timeout (only after prompt finishes).
+                        if state.prompt_done {
+                            if state.timeout_ticks_remaining == 0 {
+                                completed_interactions
+                                    .push((lid.clone(), InteractionResult::Timeout));
+                            } else {
+                                state.timeout_ticks_remaining -= 1;
+                            }
+                        }
+                    }
+                }
            }
+        }

-            // Resample from 16kHz to the leg's codec native rate.
-            let target_rate = codec_sample_rate(slot.codec_pt);
-            let resampled = if target_rate == MIX_RATE {
-                mix_minus
-            } else {
-                slot.transcoder
-                    .resample(&mix_minus, MIX_RATE, target_rate)
-                    .unwrap_or_default()
-            };
+        // Apply completed interactions — revert legs to Participant.
+        for (lid, result) in completed_interactions {
+            if let Some(slot) = legs.get_mut(&lid) {
+                if let LegRole::Isolated(ref mut state) = slot.role {
+                    if let Some(tx) = state.result_tx.take() {
+                        let _ = tx.send(result);
+                    }
+                }
+                slot.role = LegRole::Participant;
+            }
+        }

-            // Encode to the leg's codec.
-            let encoded = match slot.transcoder.encode_from_pcm(&resampled, slot.codec_pt) {
-                Ok(e) if !e.is_empty() => e,
-                _ => continue,
-            };
+        // ── 5. Distribute per-source audio to tool legs. ────────────
+        if !tool_legs.is_empty() {
+            // Collect participant PCM frames (computed in step 2).
+            let sources: Vec<ToolAudioSource> = legs
+                .iter()
+                .filter(|(_, s)| matches!(s.role, LegRole::Participant))
+                .map(|(lid, s)| ToolAudioSource {
+                    leg_id: lid.clone(),
+                    pcm_16k: s.last_pcm_frame.clone(),
+                })
+                .collect();

-            // Build RTP packet with header.
-            let header = build_rtp_header(slot.codec_pt, slot.rtp_seq, slot.rtp_ts, slot.rtp_ssrc);
-            let mut rtp = header.to_vec();
-            rtp.extend_from_slice(&encoded);
+            for tool in tool_legs.values() {
+                let batch = ToolAudioBatch {
+                    sources: sources
+                        .iter()
+                        .map(|s| ToolAudioSource {
+                            leg_id: s.leg_id.clone(),
+                            pcm_16k: s.pcm_16k.clone(),
+                        })
+                        .collect(),
+                };
+                // Non-blocking send — drop batch if tool can't keep up.
+                let _ = tool.audio_tx.try_send(batch);
+            }
+        }

-            slot.rtp_seq = slot.rtp_seq.wrapping_add(1);
-            slot.rtp_ts = slot.rtp_ts.wrapping_add(rtp_clock_increment(slot.codec_pt));
-
-            // Non-blocking send — drop frame if channel is full.
-            let _ = slot.outbound_tx.try_send(rtp);
+        // ── 6. Forward DTMF packets between participant legs only. ──
+        for (source_lid, dtmf_pkt) in &dtmf_forward {
+            // Skip sources still isolated (handled in step 4). NOTE(review): a leg completed this tick is Participant again, so its digit is forwarded.
+            if let Some(src_slot) = legs.get(source_lid) {
+                if matches!(src_slot.role, LegRole::Isolated(_)) {
+                    continue;
+                }
+            }
+            for (target_lid, target_slot) in legs.iter_mut() {
+                if target_lid == source_lid {
+                    continue; // Don't echo DTMF back to sender.
+                }
+                // Don't forward to isolated legs.
+                if matches!(target_slot.role, LegRole::Isolated(_)) {
+                    continue;
+                }
+                let mut header = build_rtp_header(
+                    101,
+                    target_slot.rtp_seq,
+                    target_slot.rtp_ts,
+                    target_slot.rtp_ssrc,
+                );
+                if dtmf_pkt.marker {
+                    header[1] |= 0x80; // Set marker bit.
+                }
+                let mut rtp_out = header.to_vec();
+                rtp_out.extend_from_slice(&dtmf_pkt.payload);
+                target_slot.rtp_seq = target_slot.rtp_seq.wrapping_add(1);
+                // Don't increment rtp_ts for DTMF — it shares timestamp context with audio.
+                let _ = target_slot.outbound_tx.try_send(rtp_out);
+            }
        }
    }
 }