feat(mixer): enhance mixer functionality with interaction and tool legs

- Updated mixer to handle participant and isolated leg roles, allowing for IVR and consent interactions.
- Introduced commands for starting and canceling interactions, managing tool legs for recording and transcription.
- Implemented per-source audio handling for tool legs, enabling separate audio processing.
- Enhanced DTMF handling to forward events between participant legs only.
- Added support for PCM recording directly from tool legs, with WAV file generation.
- Updated TypeScript definitions and functions to support new interaction and tool leg features.
This commit is contained in:
2026-04-10 14:54:21 +00:00
parent 6a130db7c7
commit 7d59361352
13 changed files with 1448 additions and 94 deletions

View File

@@ -5,14 +5,17 @@
//!
//! The mixer runs a 20ms tick loop:
//! 1. Drain inbound channels, decode to PCM, resample to 16kHz
//! 2. Compute total mix (sum of all legs' PCM as i32)
//! 3. For each leg: mix-minus = total - own, resample to leg codec rate, encode, send
//! 2. Compute total mix (sum of all **participant** legs' PCM as i32)
//! 3. For each participant leg: mix-minus = total - own, resample to leg codec rate, encode, send
//! 4. For each isolated leg: play prompt frame or silence, check DTMF
//! 5. For each tool leg: send per-source unmerged audio batch
//! 6. Forward DTMF between participant legs only
use crate::ipc::{emit_event, OutTx};
use crate::rtp::{build_rtp_header, rtp_clock_increment};
use codec_lib::{codec_sample_rate, TranscodeState};
use std::collections::HashMap;
use tokio::sync::mpsc;
use std::collections::{HashMap, VecDeque};
use tokio::sync::{mpsc, oneshot};
use tokio::task::JoinHandle;
use tokio::time::{self, Duration, MissedTickBehavior};
@@ -25,11 +28,84 @@ const MIX_FRAME_SIZE: usize = 320; // 16000 * 0.020
/// One RTP packet as received by the mixer from a leg's inbound channel.
pub struct RtpPacket {
/// Payload bytes. Audio payloads are handed to the leg's transcoder for
/// decoding; payloads with `payload_type` 101 are read as telephone-events
/// (byte 0 = event id, high bit of byte 1 = end-of-event flag).
pub payload: Vec<u8>,
/// RTP payload type. The mixer treats 101 as DTMF (telephone-event) and
/// every other value as audio to decode.
pub payload_type: u8,
/// RTP marker bit (first packet of a DTMF event, etc.).
/// Copied onto the outgoing header when a DTMF packet is forwarded.
pub marker: bool,
/// RTP timestamp from the original packet header.
pub timestamp: u32,
}
// ---------------------------------------------------------------------------
// Leg roles
// ---------------------------------------------------------------------------
/// What role a leg currently plays in the mixer.
///
/// Every leg is added as `Participant`. `StartInteraction` switches it to
/// `Isolated`, and it reverts to `Participant` when the interaction completes
/// (digit matched or timeout) or is cancelled.
enum LegRole {
/// Normal participant: contributes to mix, receives mix-minus.
/// Its DTMF is forwarded to the other participant legs, and its PCM frame is
/// included in the per-source batches sent to tool legs.
Participant,
/// Temporarily isolated for IVR/consent interaction.
/// The leg is left out of the total mix and of tool-leg batches, is sent the
/// queued prompt frames (then silence) instead of mix-minus, and its DTMF is
/// matched against the expected digits rather than forwarded.
Isolated(IsolationState),
}
/// State of one in-progress interaction on an isolated leg.
///
/// Built by the `StartInteraction` command handler and advanced once per
/// mixer tick while the leg stays in `LegRole::Isolated`.
struct IsolationState {
/// PCM frames at MIX_RATE (320 samples each) queued for playback.
/// One frame is popped from the front per tick; once empty, silence is sent.
prompt_frames: VecDeque<Vec<i16>>,
/// Digits that complete the interaction (e.g., ['1', '2']).
/// Only telephone-events with the end bit set are compared against this list.
expected_digits: Vec<char>,
/// Ticks remaining before timeout (decremented each tick after prompt ends).
/// Initialised to `timeout_ms / 20` (integer division).
timeout_ticks_remaining: u32,
/// Whether we've finished playing the prompt.
/// Set on the first tick that finds `prompt_frames` empty.
prompt_done: bool,
/// Channel to send the result back to the command handler.
/// Wrapped in `Option` so the sender can be `take()`n and used exactly once.
result_tx: Option<oneshot::Sender<InteractionResult>>,
}
/// Result of a leg interaction (consent prompt, IVR, etc.).
///
/// Delivered exactly once through the `oneshot` sender supplied with
/// `MixerCommand::StartInteraction`.
///
/// The derives make the value printable with `{:?}` (which also lets callers
/// `unwrap`/`expect` a failed `oneshot` send), cheaply copyable, and directly
/// comparable with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InteractionResult {
    /// The participant pressed one of the expected digits.
    Digit(char),
    /// No digit was received within the timeout.
    Timeout,
    /// The leg was removed or the call tore down before completion.
    /// Also sent when the interaction is cancelled explicitly, replaced by a
    /// newer interaction on the same leg, or started on an unknown leg.
    Cancelled,
}
// ---------------------------------------------------------------------------
// Tool legs
// ---------------------------------------------------------------------------
/// Type of tool leg.
///
/// Supplied with `MixerCommand::AddToolLeg`. `PartialEq`/`Eq` are derived so
/// callers can compare tool types with `==` instead of pattern matching.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ToolType {
    /// Tool leg whose consumer records the call audio.
    Recording,
    /// Tool leg whose consumer transcribes the call audio.
    Transcription,
}
/// Per-source audio delivered to a tool leg each mixer tick.
///
/// Holds one entry per leg that is currently a participant; isolated legs are
/// left out. `Clone` is derived so one batch can be duplicated per tool leg
/// without copying fields by hand.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ToolAudioBatch {
    /// Unmerged frames, one per participant leg, in no particular order.
    pub sources: Vec<ToolAudioSource>,
}

/// One participant's 20ms audio frame.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ToolAudioSource {
    /// Identifier of the leg the frame came from.
    pub leg_id: String,
    /// PCM at 16kHz, MIX_FRAME_SIZE (320) samples.
    pub pcm_16k: Vec<i16>,
}
/// Internal storage for a tool leg inside the mixer.
///
/// Inserted by `AddToolLeg` and removed by `RemoveToolLeg`; dropping the slot
/// drops `audio_tx`, which closes the channel for the receiving task.
struct ToolLegSlot {
// Stored when the tool leg is added but not read by the mixer code visible
// here, hence the lint allowance.
// NOTE(review): confirm whether this is kept for diagnostics or future use.
#[allow(dead_code)]
tool_type: ToolType,
/// Sender for the per-tick audio batches. The mixer uses `try_send`, so a
/// batch is dropped rather than blocking the tick when the channel is full.
audio_tx: mpsc::Sender<ToolAudioBatch>,
}
// ---------------------------------------------------------------------------
// Commands
// ---------------------------------------------------------------------------
/// Commands sent to the mixer task via a control channel.
pub enum MixerCommand {
/// Add a new leg to the mix.
/// Add a new participant leg to the mix.
AddLeg {
leg_id: String,
codec_pt: u8,
@@ -40,8 +116,35 @@ pub enum MixerCommand {
RemoveLeg { leg_id: String },
/// Shut down the mixer.
Shutdown,
/// Isolate a leg and start an interaction (consent prompt, IVR).
/// The leg is removed from the mix and hears the prompt instead.
/// DTMF from the leg is checked against expected_digits.
StartInteraction {
leg_id: String,
/// PCM frames at MIX_RATE (16kHz), each 320 samples.
prompt_pcm_frames: Vec<Vec<i16>>,
expected_digits: Vec<char>,
timeout_ms: u32,
result_tx: oneshot::Sender<InteractionResult>,
},
/// Cancel an in-progress interaction (e.g., leg being removed).
CancelInteraction { leg_id: String },
/// Add a tool leg that receives per-source unmerged audio.
AddToolLeg {
leg_id: String,
tool_type: ToolType,
audio_tx: mpsc::Sender<ToolAudioBatch>,
},
/// Remove a tool leg (drops the channel, background task finalizes).
RemoveToolLeg { leg_id: String },
}
// ---------------------------------------------------------------------------
// Mixer internals
// ---------------------------------------------------------------------------
/// Internal per-leg state inside the mixer.
struct MixerLegSlot {
codec_pt: u8,
@@ -56,6 +159,8 @@ struct MixerLegSlot {
rtp_seq: u16,
rtp_ts: u32,
rtp_ssrc: u32,
/// Current role of this leg in the mixer.
role: LegRole,
}
/// Spawn the mixer task for a call. Returns the command sender and task handle.
@@ -79,13 +184,14 @@ async fn mixer_loop(
out_tx: OutTx,
) {
let mut legs: HashMap<String, MixerLegSlot> = HashMap::new();
let mut tool_legs: HashMap<String, ToolLegSlot> = HashMap::new();
let mut interval = time::interval(Duration::from_millis(20));
interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
loop {
interval.tick().await;
// 1. Process control commands (non-blocking).
// ── 1. Process control commands (non-blocking). ─────────────
loop {
match cmd_rx.try_recv() {
Ok(MixerCommand::AddLeg {
@@ -121,38 +227,115 @@ async fn mixer_loop(
rtp_seq: 0,
rtp_ts: 0,
rtp_ssrc: rand::random(),
role: LegRole::Participant,
},
);
}
Ok(MixerCommand::RemoveLeg { leg_id }) => {
// If the leg is isolated, send Cancelled before dropping.
if let Some(slot) = legs.get_mut(&leg_id) {
if let LegRole::Isolated(ref mut state) = slot.role {
if let Some(tx) = state.result_tx.take() {
let _ = tx.send(InteractionResult::Cancelled);
}
}
}
legs.remove(&leg_id);
// Channels drop → I/O tasks exit cleanly.
}
Ok(MixerCommand::Shutdown) => return,
Ok(MixerCommand::Shutdown) => {
// Cancel all outstanding interactions before shutting down.
for slot in legs.values_mut() {
if let LegRole::Isolated(ref mut state) = slot.role {
if let Some(tx) = state.result_tx.take() {
let _ = tx.send(InteractionResult::Cancelled);
}
}
}
return;
}
Ok(MixerCommand::StartInteraction {
leg_id,
prompt_pcm_frames,
expected_digits,
timeout_ms,
result_tx,
}) => {
if let Some(slot) = legs.get_mut(&leg_id) {
// Cancel any existing interaction first.
if let LegRole::Isolated(ref mut old_state) = slot.role {
if let Some(tx) = old_state.result_tx.take() {
let _ = tx.send(InteractionResult::Cancelled);
}
}
let timeout_ticks = timeout_ms / 20;
slot.role = LegRole::Isolated(IsolationState {
prompt_frames: VecDeque::from(prompt_pcm_frames),
expected_digits,
timeout_ticks_remaining: timeout_ticks,
prompt_done: false,
result_tx: Some(result_tx),
});
} else {
// Leg not found — immediately cancel.
let _ = result_tx.send(InteractionResult::Cancelled);
}
}
Ok(MixerCommand::CancelInteraction { leg_id }) => {
if let Some(slot) = legs.get_mut(&leg_id) {
if let LegRole::Isolated(ref mut state) = slot.role {
if let Some(tx) = state.result_tx.take() {
let _ = tx.send(InteractionResult::Cancelled);
}
}
slot.role = LegRole::Participant;
}
}
Ok(MixerCommand::AddToolLeg {
leg_id,
tool_type,
audio_tx,
}) => {
tool_legs.insert(leg_id, ToolLegSlot { tool_type, audio_tx });
}
Ok(MixerCommand::RemoveToolLeg { leg_id }) => {
tool_legs.remove(&leg_id);
// Dropping the ToolLegSlot drops audio_tx → background task sees channel close.
}
Err(mpsc::error::TryRecvError::Empty) => break,
Err(mpsc::error::TryRecvError::Disconnected) => return,
}
}
if legs.is_empty() {
if legs.is_empty() && tool_legs.is_empty() {
continue;
}
// 2. Drain inbound packets, decode to 16kHz PCM.
// ── 2. Drain inbound packets, decode to 16kHz PCM. ─────────
// DTMF (PT 101) packets are collected separately.
let leg_ids: Vec<String> = legs.keys().cloned().collect();
let mut dtmf_forward: Vec<(String, RtpPacket)> = Vec::new();
for lid in &leg_ids {
let slot = legs.get_mut(lid).unwrap();
// Drain channel, keep only the latest packet (simple jitter handling).
let mut latest: Option<RtpPacket> = None;
// Drain channel — collect DTMF packets separately, keep latest audio.
let mut latest_audio: Option<RtpPacket> = None;
loop {
match slot.inbound_rx.try_recv() {
Ok(pkt) => latest = Some(pkt),
Ok(pkt) => {
if pkt.payload_type == 101 {
// DTMF telephone-event: collect for processing.
dtmf_forward.push((lid.clone(), pkt));
} else {
latest_audio = Some(pkt);
}
}
Err(_) => break,
}
}
if let Some(pkt) = latest {
if let Some(pkt) = latest_audio {
slot.silent_ticks = 0;
match slot.transcoder.decode_to_pcm(&pkt.payload, pkt.payload_type) {
Ok((pcm, rate)) => {
@@ -174,6 +357,9 @@ async fn mixer_loop(
slot.last_pcm_frame = vec![0i16; MIX_FRAME_SIZE];
}
}
} else if dtmf_forward.iter().any(|(src, _)| src == lid) {
// Got DTMF but no audio — don't bump silent_ticks (DTMF counts as activity).
slot.silent_ticks = 0;
} else {
slot.silent_ticks += 1;
// After 150 ticks (3 seconds) of silence, zero out to avoid stale audio.
@@ -183,50 +369,210 @@ async fn mixer_loop(
}
}
// 3. Compute total mix (sum of all legs as i32 to avoid overflow).
// ── 3. Compute total mix from PARTICIPANT legs only. ────────
let mut total_mix = vec![0i32; MIX_FRAME_SIZE];
for slot in legs.values() {
for (i, &s) in slot.last_pcm_frame.iter().enumerate().take(MIX_FRAME_SIZE) {
total_mix[i] += s as i32;
if matches!(slot.role, LegRole::Participant) {
for (i, &s) in slot.last_pcm_frame.iter().enumerate().take(MIX_FRAME_SIZE) {
total_mix[i] += s as i32;
}
}
}
// 4. For each leg: mix-minus, resample, encode, send.
for slot in legs.values_mut() {
// Mix-minus: total minus this leg's own contribution.
let mut mix_minus = Vec::with_capacity(MIX_FRAME_SIZE);
for i in 0..MIX_FRAME_SIZE {
let sample =
(total_mix[i] - slot.last_pcm_frame[i] as i32).clamp(-32768, 32767) as i16;
mix_minus.push(sample);
// ── 4. Per-leg output. ──────────────────────────────────────
// Collect interaction completions to apply after the loop
// (can't mutate role while iterating mutably for encode).
let mut completed_interactions: Vec<(String, InteractionResult)> = Vec::new();
for (lid, slot) in legs.iter_mut() {
match &mut slot.role {
LegRole::Participant => {
// Mix-minus: total minus this leg's own contribution.
let mut mix_minus = Vec::with_capacity(MIX_FRAME_SIZE);
for i in 0..MIX_FRAME_SIZE {
let sample = (total_mix[i] - slot.last_pcm_frame[i] as i32)
.clamp(-32768, 32767) as i16;
mix_minus.push(sample);
}
// Resample from 16kHz to the leg's codec native rate.
let target_rate = codec_sample_rate(slot.codec_pt);
let resampled = if target_rate == MIX_RATE {
mix_minus
} else {
slot.transcoder
.resample(&mix_minus, MIX_RATE, target_rate)
.unwrap_or_default()
};
// Encode to the leg's codec.
let encoded =
match slot.transcoder.encode_from_pcm(&resampled, slot.codec_pt) {
Ok(e) if !e.is_empty() => e,
_ => continue,
};
// Build RTP packet with header.
let header =
build_rtp_header(slot.codec_pt, slot.rtp_seq, slot.rtp_ts, slot.rtp_ssrc);
let mut rtp = header.to_vec();
rtp.extend_from_slice(&encoded);
slot.rtp_seq = slot.rtp_seq.wrapping_add(1);
slot.rtp_ts = slot.rtp_ts.wrapping_add(rtp_clock_increment(slot.codec_pt));
// Non-blocking send — drop frame if channel is full.
let _ = slot.outbound_tx.try_send(rtp);
}
LegRole::Isolated(state) => {
// Check for DTMF digit from this leg.
let mut matched_digit: Option<char> = None;
for (src_lid, dtmf_pkt) in &dtmf_forward {
if src_lid == lid && dtmf_pkt.payload.len() >= 4 {
let event_id = dtmf_pkt.payload[0];
let end_bit = (dtmf_pkt.payload[1] & 0x80) != 0;
if end_bit {
const EVENT_CHARS: &[char] = &[
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '*', '#',
'A', 'B', 'C', 'D',
];
if let Some(&ch) = EVENT_CHARS.get(event_id as usize) {
if state.expected_digits.contains(&ch) {
matched_digit = Some(ch);
break;
}
}
}
}
}
if let Some(digit) = matched_digit {
// Interaction complete — digit matched.
completed_interactions
.push((lid.clone(), InteractionResult::Digit(digit)));
} else {
// Play prompt frame or silence.
let pcm_frame = if let Some(frame) = state.prompt_frames.pop_front() {
frame
} else {
state.prompt_done = true;
vec![0i16; MIX_FRAME_SIZE]
};
// Encode prompt frame to the leg's codec (reuses existing encode path).
let target_rate = codec_sample_rate(slot.codec_pt);
let resampled = if target_rate == MIX_RATE {
pcm_frame
} else {
slot.transcoder
.resample(&pcm_frame, MIX_RATE, target_rate)
.unwrap_or_default()
};
if let Ok(encoded) =
slot.transcoder.encode_from_pcm(&resampled, slot.codec_pt)
{
if !encoded.is_empty() {
let header = build_rtp_header(
slot.codec_pt,
slot.rtp_seq,
slot.rtp_ts,
slot.rtp_ssrc,
);
let mut rtp = header.to_vec();
rtp.extend_from_slice(&encoded);
slot.rtp_seq = slot.rtp_seq.wrapping_add(1);
slot.rtp_ts = slot
.rtp_ts
.wrapping_add(rtp_clock_increment(slot.codec_pt));
let _ = slot.outbound_tx.try_send(rtp);
}
}
// Check timeout (only after prompt finishes).
if state.prompt_done {
if state.timeout_ticks_remaining == 0 {
completed_interactions
.push((lid.clone(), InteractionResult::Timeout));
} else {
state.timeout_ticks_remaining -= 1;
}
}
}
}
}
}
// Resample from 16kHz to the leg's codec native rate.
let target_rate = codec_sample_rate(slot.codec_pt);
let resampled = if target_rate == MIX_RATE {
mix_minus
} else {
slot.transcoder
.resample(&mix_minus, MIX_RATE, target_rate)
.unwrap_or_default()
};
// Apply completed interactions — revert legs to Participant.
for (lid, result) in completed_interactions {
if let Some(slot) = legs.get_mut(&lid) {
if let LegRole::Isolated(ref mut state) = slot.role {
if let Some(tx) = state.result_tx.take() {
let _ = tx.send(result);
}
}
slot.role = LegRole::Participant;
}
}
// Encode to the leg's codec.
let encoded = match slot.transcoder.encode_from_pcm(&resampled, slot.codec_pt) {
Ok(e) if !e.is_empty() => e,
_ => continue,
};
// ── 5. Distribute per-source audio to tool legs. ────────────
if !tool_legs.is_empty() {
// Collect participant PCM frames (computed in step 2).
let sources: Vec<ToolAudioSource> = legs
.iter()
.filter(|(_, s)| matches!(s.role, LegRole::Participant))
.map(|(lid, s)| ToolAudioSource {
leg_id: lid.clone(),
pcm_16k: s.last_pcm_frame.clone(),
})
.collect();
// Build RTP packet with header.
let header = build_rtp_header(slot.codec_pt, slot.rtp_seq, slot.rtp_ts, slot.rtp_ssrc);
let mut rtp = header.to_vec();
rtp.extend_from_slice(&encoded);
for tool in tool_legs.values() {
let batch = ToolAudioBatch {
sources: sources
.iter()
.map(|s| ToolAudioSource {
leg_id: s.leg_id.clone(),
pcm_16k: s.pcm_16k.clone(),
})
.collect(),
};
// Non-blocking send — drop batch if tool can't keep up.
let _ = tool.audio_tx.try_send(batch);
}
}
slot.rtp_seq = slot.rtp_seq.wrapping_add(1);
slot.rtp_ts = slot.rtp_ts.wrapping_add(rtp_clock_increment(slot.codec_pt));
// Non-blocking send — drop frame if channel is full.
let _ = slot.outbound_tx.try_send(rtp);
// ── 6. Forward DTMF packets between participant legs only. ──
for (source_lid, dtmf_pkt) in &dtmf_forward {
// Skip if the source is an isolated leg (its DTMF was handled in step 4).
if let Some(src_slot) = legs.get(source_lid) {
if matches!(src_slot.role, LegRole::Isolated(_)) {
continue;
}
}
for (target_lid, target_slot) in legs.iter_mut() {
if target_lid == source_lid {
continue; // Don't echo DTMF back to sender.
}
// Don't forward to isolated legs.
if matches!(target_slot.role, LegRole::Isolated(_)) {
continue;
}
let mut header = build_rtp_header(
101,
target_slot.rtp_seq,
target_slot.rtp_ts,
target_slot.rtp_ssrc,
);
if dtmf_pkt.marker {
header[1] |= 0x80; // Set marker bit.
}
let mut rtp_out = header.to_vec();
rtp_out.extend_from_slice(&dtmf_pkt.payload);
target_slot.rtp_seq = target_slot.rtp_seq.wrapping_add(1);
// Don't increment rtp_ts for DTMF — it shares timestamp context with audio.
let _ = target_slot.outbound_tx.try_send(rtp_out);
}
}
}
}