feat(proxy-engine): upgrade the internal audio bus to 48kHz f32 with per-leg denoising and improve SIP leg routing

This commit is contained in:
2026-04-10 15:58:41 +00:00
parent 10ad432a4c
commit 73b28f5f57
10 changed files with 194 additions and 65 deletions

View File

@@ -1,5 +1,14 @@
# Changelog # Changelog
## 2026-04-10 - 1.17.0 - feat(proxy-engine)
upgrade the internal audio bus to 48kHz f32 with per-leg denoising and improve SIP leg routing
- switch mixer, prompt playback, and tool leg audio handling from 16kHz i16 to 48kHz f32 for higher-quality internal processing
- add f32 decode/encode and resampling support plus standalone RNNoise denoiser creation in codec-lib
- apply per-leg inbound noise suppression in the mixer before mix-minus generation
- fix passthrough call routing by matching the actual leg from the signaling source address when Call-IDs are shared
- correct dialed number extraction from bare SIP request URIs by parsing the user part directly
## 2026-04-10 - 1.16.0 - feat(proxy-engine) ## 2026-04-10 - 1.16.0 - feat(proxy-engine)
integrate Kokoro TTS generation into proxy-engine and simplify TypeScript prompt handling to use cached WAV files integrate Kokoro TTS generation into proxy-engine and simplify TypeScript prompt handling to use cached WAV files

1
rust/Cargo.lock generated
View File

@@ -2179,6 +2179,7 @@ dependencies = [
"codec-lib", "codec-lib",
"hound", "hound",
"kokoro-tts", "kokoro-tts",
"nnnoiseless",
"ort", "ort",
"rand 0.8.5", "rand 0.8.5",
"regex-lite", "regex-lite",

View File

@@ -104,6 +104,8 @@ pub struct TranscodeState {
g722_dec: libg722::decoder::Decoder, g722_dec: libg722::decoder::Decoder,
/// Cached FFT resamplers keyed by (from_rate, to_rate, chunk_size). /// Cached FFT resamplers keyed by (from_rate, to_rate, chunk_size).
resamplers: HashMap<(u32, u32, usize), FftFixedIn<f64>>, resamplers: HashMap<(u32, u32, usize), FftFixedIn<f64>>,
/// Cached f32 FFT resamplers keyed by (from_rate, to_rate, chunk_size).
resamplers_f32: HashMap<(u32, u32, usize), FftFixedIn<f32>>,
/// ML noise suppression for the SIP-bound direction. /// ML noise suppression for the SIP-bound direction.
denoiser_to_sip: Box<DenoiseState<'static>>, denoiser_to_sip: Box<DenoiseState<'static>>,
/// ML noise suppression for the browser-bound direction. /// ML noise suppression for the browser-bound direction.
@@ -133,6 +135,7 @@ impl TranscodeState {
g722_enc, g722_enc,
g722_dec, g722_dec,
resamplers: HashMap::new(), resamplers: HashMap::new(),
resamplers_f32: HashMap::new(),
denoiser_to_sip: DenoiseState::new(), denoiser_to_sip: DenoiseState::new(),
denoiser_to_browser: DenoiseState::new(), denoiser_to_browser: DenoiseState::new(),
}) })
@@ -293,6 +296,86 @@ impl TranscodeState {
_ => Err(format!("unsupported target PT {pt}")), _ => Err(format!("unsupported target PT {pt}")),
} }
} }
// ---- f32 API for high-quality internal bus ----------------------------
/// Decode an encoded audio payload to f32 PCM samples in [-1.0, 1.0].
/// Returns (samples, sample_rate).
pub fn decode_to_f32(&mut self, data: &[u8], pt: u8) -> Result<(Vec<f32>, u32), String> {
let (pcm_i16, rate) = self.decode_to_pcm(data, pt)?;
let pcm_f32 = pcm_i16.iter().map(|&s| s as f32 / 32768.0).collect();
Ok((pcm_f32, rate))
}
/// Encode f32 PCM samples ([-1.0, 1.0]) to an audio codec.
pub fn encode_from_f32(&mut self, pcm: &[f32], pt: u8) -> Result<Vec<u8>, String> {
let pcm_i16: Vec<i16> = pcm
.iter()
.map(|&s| (s * 32767.0).round().clamp(-32768.0, 32767.0) as i16)
.collect();
self.encode_from_pcm(&pcm_i16, pt)
}
/// High-quality sample rate conversion for f32 PCM using rubato FFT resampler.
/// Uses a separate cache from the i16 resampler.
pub fn resample_f32(
&mut self,
pcm: &[f32],
from_rate: u32,
to_rate: u32,
) -> Result<Vec<f32>, String> {
if from_rate == to_rate || pcm.is_empty() {
return Ok(pcm.to_vec());
}
let chunk = pcm.len();
let key = (from_rate, to_rate, chunk);
if !self.resamplers_f32.contains_key(&key) {
let r =
FftFixedIn::<f32>::new(from_rate as usize, to_rate as usize, chunk, 1, 1)
.map_err(|e| format!("resampler f32 {from_rate}->{to_rate}: {e}"))?;
self.resamplers_f32.insert(key, r);
}
let resampler = self.resamplers_f32.get_mut(&key).unwrap();
let input = vec![pcm.to_vec()];
let result = resampler
.process(&input, None)
.map_err(|e| format!("resample f32 {from_rate}->{to_rate}: {e}"))?;
Ok(result[0].clone())
}
/// Apply RNNoise ML noise suppression to 48kHz f32 PCM audio.
/// Processes in 480-sample (10ms) frames. State persists across calls.
/// Operates natively in f32 — no i16 conversion overhead.
pub fn denoise_f32(denoiser: &mut DenoiseState, pcm: &[f32]) -> Vec<f32> {
let frame_size = DenoiseState::FRAME_SIZE; // 480
let total = pcm.len();
let whole = (total / frame_size) * frame_size;
let mut output = Vec::with_capacity(total);
let mut out_buf = [0.0f32; 480];
// nnnoiseless expects f32 samples scaled as i16 range (-32768..32767).
for offset in (0..whole).step_by(frame_size) {
let input: Vec<f32> = pcm[offset..offset + frame_size]
.iter()
.map(|&s| s * 32768.0)
.collect();
denoiser.process_frame(&mut out_buf, &input);
output.extend(out_buf.iter().map(|&s| s / 32768.0));
}
if whole < total {
output.extend_from_slice(&pcm[whole..]);
}
output
}
}
/// Create a new standalone denoiser for per-leg inbound processing.
pub fn new_denoiser() -> Box<DenoiseState<'static>> {
DenoiseState::new()
} }
#[cfg(test)] #[cfg(test)]

View File

@@ -10,6 +10,7 @@ path = "src/main.rs"
[dependencies] [dependencies]
codec-lib = { path = "../codec-lib" } codec-lib = { path = "../codec-lib" }
sip-proto = { path = "../sip-proto" } sip-proto = { path = "../sip-proto" }
nnnoiseless = { version = "0.5", default-features = false }
tokio = { version = "1", features = ["full"] } tokio = { version = "1", features = ["full"] }
serde = { version = "1", features = ["derive"] } serde = { version = "1", features = ["derive"] }
serde_json = "1" serde_json = "1"

View File

@@ -10,9 +10,9 @@ use tokio::net::UdpSocket;
use tokio::time::{self, Duration}; use tokio::time::{self, Duration};
/// Mixing sample rate used by the mixer (must stay in sync with mixer::MIX_RATE). /// Mixing sample rate used by the mixer (must stay in sync with mixer::MIX_RATE).
const MIX_RATE: u32 = 16000; const MIX_RATE: u32 = 48000;
/// Samples per 20ms frame at the mixing rate. /// Samples per 20ms frame at the mixing rate.
const MIX_FRAME_SIZE: usize = 320; const MIX_FRAME_SIZE: usize = 960;
/// Play a WAV file as RTP to a destination. /// Play a WAV file as RTP to a destination.
/// Returns when playback is complete. /// Returns when playback is complete.
@@ -178,9 +178,9 @@ pub async fn play_beep(
Ok((seq, ts)) Ok((seq, ts))
} }
/// Load a WAV file and split it into 20ms PCM frames at 16kHz. /// Load a WAV file and split it into 20ms f32 PCM frames at 48kHz.
/// Used by the leg interaction system to prepare prompt audio for the mixer. /// Used by the leg interaction system to prepare prompt audio for the mixer.
pub fn load_prompt_pcm_frames(wav_path: &str) -> Result<Vec<Vec<i16>>, String> { pub fn load_prompt_pcm_frames(wav_path: &str) -> Result<Vec<Vec<f32>>, String> {
let path = Path::new(wav_path); let path = Path::new(wav_path);
if !path.exists() { if !path.exists() {
return Err(format!("WAV file not found: {wav_path}")); return Err(format!("WAV file not found: {wav_path}"));
@@ -191,17 +191,17 @@ pub fn load_prompt_pcm_frames(wav_path: &str) -> Result<Vec<Vec<i16>>, String> {
let spec = reader.spec(); let spec = reader.spec();
let wav_rate = spec.sample_rate; let wav_rate = spec.sample_rate;
// Read all samples as i16. // Read all samples as f32 in [-1.0, 1.0].
let samples: Vec<i16> = if spec.bits_per_sample == 16 { let samples: Vec<f32> = if spec.bits_per_sample == 16 {
reader reader
.samples::<i16>() .samples::<i16>()
.filter_map(|s| s.ok()) .filter_map(|s| s.ok())
.map(|s| s as f32 / 32768.0)
.collect() .collect()
} else if spec.bits_per_sample == 32 && spec.sample_format == hound::SampleFormat::Float { } else if spec.bits_per_sample == 32 && spec.sample_format == hound::SampleFormat::Float {
reader reader
.samples::<f32>() .samples::<f32>()
.filter_map(|s| s.ok()) .filter_map(|s| s.ok())
.map(|s| (s * 32767.0).round().clamp(-32768.0, 32767.0) as i16)
.collect() .collect()
} else { } else {
return Err(format!( return Err(format!(
@@ -214,24 +214,24 @@ pub fn load_prompt_pcm_frames(wav_path: &str) -> Result<Vec<Vec<i16>>, String> {
return Ok(vec![]); return Ok(vec![]);
} }
// Resample to MIX_RATE (16kHz) if needed. // Resample to MIX_RATE (48kHz) if needed.
let resampled = if wav_rate != MIX_RATE { let resampled = if wav_rate != MIX_RATE {
let mut transcoder = TranscodeState::new().map_err(|e| format!("codec init: {e}"))?; let mut transcoder = TranscodeState::new().map_err(|e| format!("codec init: {e}"))?;
transcoder transcoder
.resample(&samples, wav_rate, MIX_RATE) .resample_f32(&samples, wav_rate, MIX_RATE)
.map_err(|e| format!("resample: {e}"))? .map_err(|e| format!("resample: {e}"))?
} else { } else {
samples samples
}; };
// Split into MIX_FRAME_SIZE (320) sample frames. // Split into MIX_FRAME_SIZE (960) sample frames.
let mut frames = Vec::new(); let mut frames = Vec::new();
let mut offset = 0; let mut offset = 0;
while offset < resampled.len() { while offset < resampled.len() {
let end = (offset + MIX_FRAME_SIZE).min(resampled.len()); let end = (offset + MIX_FRAME_SIZE).min(resampled.len());
let mut frame = resampled[offset..end].to_vec(); let mut frame = resampled[offset..end].to_vec();
// Pad short final frame with silence. // Pad short final frame with silence.
frame.resize(MIX_FRAME_SIZE, 0); frame.resize(MIX_FRAME_SIZE, 0.0);
frames.push(frame); frames.push(frame);
offset += MIX_FRAME_SIZE; offset += MIX_FRAME_SIZE;
} }

View File

@@ -120,7 +120,19 @@ impl CallManager {
} }
// Passthrough-style routing for inbound/outbound device↔provider calls. // Passthrough-style routing for inbound/outbound device↔provider calls.
self.route_passthrough_message(&call_id, &leg_id, msg, from_addr, socket, config) // The sip_index only stores one leg for shared Call-IDs, so we need to
// determine which leg the message actually belongs to by comparing from_addr.
let actual_leg_id = self
.calls
.get(&call_id)
.and_then(|call| {
call.legs
.values()
.find(|l| l.signaling_addr == Some(from_addr))
.map(|l| l.id.clone())
})
.unwrap_or(leg_id);
self.route_passthrough_message(&call_id, &actual_leg_id, msg, from_addr, socket, config)
.await .await
} }
@@ -866,11 +878,18 @@ impl CallManager {
let lan_port = config.proxy.lan_port; let lan_port = config.proxy.lan_port;
let device_sip_call_id = invite.call_id().to_string(); let device_sip_call_id = invite.call_id().to_string();
// Extract just the user part from the request URI (e.g., "sip:16196000@10.0.0.1" → "16196000").
// extract_uri is for header values with angle brackets, not bare request URIs.
let dialed_number = invite let dialed_number = invite
.request_uri() .request_uri()
.and_then(|uri| SipMessage::extract_uri(uri)) .map(|uri| {
.unwrap_or(invite.request_uri().unwrap_or("")) let stripped = uri
.to_string(); .strip_prefix("sip:")
.or_else(|| uri.strip_prefix("sips:"))
.unwrap_or(uri);
stripped.split('@').next().unwrap_or(stripped).to_string()
})
.unwrap_or_default();
let provider_dest: SocketAddr = match provider_config.outbound_proxy.to_socket_addr() { let provider_dest: SocketAddr = match provider_config.outbound_proxy.to_socket_addr() {
Some(a) => a, Some(a) => a,

View File

@@ -3,9 +3,12 @@
//! Each Call spawns one mixer task. Legs communicate with the mixer via //! Each Call spawns one mixer task. Legs communicate with the mixer via
//! tokio mpsc channels — no shared mutable state, no lock contention. //! tokio mpsc channels — no shared mutable state, no lock contention.
//! //!
//! Internal bus format: 48kHz f32 PCM (960 samples per 20ms frame).
//! All encoding/decoding happens at leg boundaries. Per-leg inbound denoising at 48kHz.
//!
//! The mixer runs a 20ms tick loop: //! The mixer runs a 20ms tick loop:
//! 1. Drain inbound channels, decode to PCM, resample to 16kHz //! 1. Drain inbound channels, decode to f32, resample to 48kHz, denoise per-leg
//! 2. Compute total mix (sum of all **participant** legs' PCM as i32) //! 2. Compute total mix (sum of all **participant** legs' f32 PCM as f64)
//! 3. For each participant leg: mix-minus = total - own, resample to leg codec rate, encode, send //! 3. For each participant leg: mix-minus = total - own, resample to leg codec rate, encode, send
//! 4. For each isolated leg: play prompt frame or silence, check DTMF //! 4. For each isolated leg: play prompt frame or silence, check DTMF
//! 5. For each tool leg: send per-source unmerged audio batch //! 5. For each tool leg: send per-source unmerged audio batch
@@ -13,16 +16,18 @@
use crate::ipc::{emit_event, OutTx}; use crate::ipc::{emit_event, OutTx};
use crate::rtp::{build_rtp_header, rtp_clock_increment}; use crate::rtp::{build_rtp_header, rtp_clock_increment};
use codec_lib::{codec_sample_rate, TranscodeState}; use codec_lib::{codec_sample_rate, new_denoiser, TranscodeState};
use nnnoiseless::DenoiseState;
use std::collections::{HashMap, VecDeque}; use std::collections::{HashMap, VecDeque};
use tokio::sync::{mpsc, oneshot}; use tokio::sync::{mpsc, oneshot};
use tokio::task::JoinHandle; use tokio::task::JoinHandle;
use tokio::time::{self, Duration, MissedTickBehavior}; use tokio::time::{self, Duration, MissedTickBehavior};
/// Mixing sample rate — 16kHz. G.722 is native, G.711 needs 2× upsample, Opus needs 3× downsample. /// Mixing sample rate — 48kHz. Opus is native, G.722 needs 3× upsample, G.711 needs 6× upsample.
const MIX_RATE: u32 = 16000; /// All processing (denoising, mixing) happens at this rate in f32 for maximum quality.
const MIX_RATE: u32 = 48000;
/// Samples per 20ms frame at the mixing rate. /// Samples per 20ms frame at the mixing rate.
const MIX_FRAME_SIZE: usize = 320; // 16000 * 0.020 const MIX_FRAME_SIZE: usize = 960; // 48000 * 0.020
/// A raw RTP payload received from a leg (no RTP header). /// A raw RTP payload received from a leg (no RTP header).
pub struct RtpPacket { pub struct RtpPacket {
@@ -47,8 +52,8 @@ enum LegRole {
} }
struct IsolationState { struct IsolationState {
/// PCM frames at MIX_RATE (320 samples each) queued for playback. /// PCM frames at MIX_RATE (960 samples each, 48kHz f32) queued for playback.
prompt_frames: VecDeque<Vec<i16>>, prompt_frames: VecDeque<Vec<f32>>,
/// Digits that complete the interaction (e.g., ['1', '2']). /// Digits that complete the interaction (e.g., ['1', '2']).
expected_digits: Vec<char>, expected_digits: Vec<char>,
/// Ticks remaining before timeout (decremented each tick after prompt ends). /// Ticks remaining before timeout (decremented each tick after prompt ends).
@@ -88,8 +93,8 @@ pub struct ToolAudioBatch {
/// One participant's 20ms audio frame. /// One participant's 20ms audio frame.
pub struct ToolAudioSource { pub struct ToolAudioSource {
pub leg_id: String, pub leg_id: String,
/// PCM at 16kHz, MIX_FRAME_SIZE (320) samples. /// PCM at 48kHz f32, MIX_FRAME_SIZE (960) samples.
pub pcm_16k: Vec<i16>, pub pcm_48k: Vec<f32>,
} }
/// Internal storage for a tool leg inside the mixer. /// Internal storage for a tool leg inside the mixer.
@@ -122,8 +127,8 @@ pub enum MixerCommand {
/// DTMF from the leg is checked against expected_digits. /// DTMF from the leg is checked against expected_digits.
StartInteraction { StartInteraction {
leg_id: String, leg_id: String,
/// PCM frames at MIX_RATE (16kHz), each 320 samples. /// PCM frames at MIX_RATE (48kHz f32), each 960 samples.
prompt_pcm_frames: Vec<Vec<i16>>, prompt_pcm_frames: Vec<Vec<f32>>,
expected_digits: Vec<char>, expected_digits: Vec<char>,
timeout_ms: u32, timeout_ms: u32,
result_tx: oneshot::Sender<InteractionResult>, result_tx: oneshot::Sender<InteractionResult>,
@@ -149,10 +154,12 @@ pub enum MixerCommand {
struct MixerLegSlot { struct MixerLegSlot {
codec_pt: u8, codec_pt: u8,
transcoder: TranscodeState, transcoder: TranscodeState,
/// Per-leg inbound denoiser (48kHz, 480-sample frames).
denoiser: Box<DenoiseState<'static>>,
inbound_rx: mpsc::Receiver<RtpPacket>, inbound_rx: mpsc::Receiver<RtpPacket>,
outbound_tx: mpsc::Sender<Vec<u8>>, outbound_tx: mpsc::Sender<Vec<u8>>,
/// Last decoded PCM frame at MIX_RATE (320 samples). Used for mix-minus. /// Last decoded+denoised PCM frame at MIX_RATE (960 samples, 48kHz f32).
last_pcm_frame: Vec<i16>, last_pcm_frame: Vec<f32>,
/// Number of consecutive ticks with no inbound packet. /// Number of consecutive ticks with no inbound packet.
silent_ticks: u32, silent_ticks: u32,
// RTP output state. // RTP output state.
@@ -220,9 +227,10 @@ async fn mixer_loop(
MixerLegSlot { MixerLegSlot {
codec_pt, codec_pt,
transcoder, transcoder,
denoiser: new_denoiser(),
inbound_rx, inbound_rx,
outbound_tx, outbound_tx,
last_pcm_frame: vec![0i16; MIX_FRAME_SIZE], last_pcm_frame: vec![0.0f32; MIX_FRAME_SIZE],
silent_ticks: 0, silent_ticks: 0,
rtp_seq: 0, rtp_seq: 0,
rtp_ts: 0, rtp_ts: 0,
@@ -337,24 +345,26 @@ async fn mixer_loop(
if let Some(pkt) = latest_audio { if let Some(pkt) = latest_audio {
slot.silent_ticks = 0; slot.silent_ticks = 0;
match slot.transcoder.decode_to_pcm(&pkt.payload, pkt.payload_type) { match slot.transcoder.decode_to_f32(&pkt.payload, pkt.payload_type) {
Ok((pcm, rate)) => { Ok((pcm, rate)) => {
// Resample to mixing rate if needed. // Resample to 48kHz mixing rate if needed.
let pcm_mix = if rate == MIX_RATE { let pcm_48k = if rate == MIX_RATE {
pcm pcm
} else { } else {
slot.transcoder slot.transcoder
.resample(&pcm, rate, MIX_RATE) .resample_f32(&pcm, rate, MIX_RATE)
.unwrap_or_else(|_| vec![0i16; MIX_FRAME_SIZE]) .unwrap_or_else(|_| vec![0.0f32; MIX_FRAME_SIZE])
}; };
// Per-leg inbound denoising at 48kHz.
let denoised = TranscodeState::denoise_f32(&mut slot.denoiser, &pcm_48k);
// Pad or truncate to exactly MIX_FRAME_SIZE. // Pad or truncate to exactly MIX_FRAME_SIZE.
let mut frame = pcm_mix; let mut frame = denoised;
frame.resize(MIX_FRAME_SIZE, 0); frame.resize(MIX_FRAME_SIZE, 0.0);
slot.last_pcm_frame = frame; slot.last_pcm_frame = frame;
} }
Err(_) => { Err(_) => {
// Decode failed — use silence. // Decode failed — use silence.
slot.last_pcm_frame = vec![0i16; MIX_FRAME_SIZE]; slot.last_pcm_frame = vec![0.0f32; MIX_FRAME_SIZE];
} }
} }
} else if dtmf_forward.iter().any(|(src, _)| src == lid) { } else if dtmf_forward.iter().any(|(src, _)| src == lid) {
@@ -364,17 +374,18 @@ async fn mixer_loop(
slot.silent_ticks += 1; slot.silent_ticks += 1;
// After 150 ticks (3 seconds) of silence, zero out to avoid stale audio. // After 150 ticks (3 seconds) of silence, zero out to avoid stale audio.
if slot.silent_ticks > 150 { if slot.silent_ticks > 150 {
slot.last_pcm_frame = vec![0i16; MIX_FRAME_SIZE]; slot.last_pcm_frame = vec![0.0f32; MIX_FRAME_SIZE];
} }
} }
} }
// ── 3. Compute total mix from PARTICIPANT legs only. ──────── // ── 3. Compute total mix from PARTICIPANT legs only. ────────
let mut total_mix = vec![0i32; MIX_FRAME_SIZE]; // Accumulate as f64 to prevent precision loss when summing f32.
let mut total_mix = vec![0.0f64; MIX_FRAME_SIZE];
for slot in legs.values() { for slot in legs.values() {
if matches!(slot.role, LegRole::Participant) { if matches!(slot.role, LegRole::Participant) {
for (i, &s) in slot.last_pcm_frame.iter().enumerate().take(MIX_FRAME_SIZE) { for (i, &s) in slot.last_pcm_frame.iter().enumerate().take(MIX_FRAME_SIZE) {
total_mix[i] += s as i32; total_mix[i] += s as f64;
} }
} }
} }
@@ -387,27 +398,27 @@ async fn mixer_loop(
for (lid, slot) in legs.iter_mut() { for (lid, slot) in legs.iter_mut() {
match &mut slot.role { match &mut slot.role {
LegRole::Participant => { LegRole::Participant => {
// Mix-minus: total minus this leg's own contribution. // Mix-minus: total minus this leg's own contribution, clamped to [-1.0, 1.0].
let mut mix_minus = Vec::with_capacity(MIX_FRAME_SIZE); let mut mix_minus = Vec::with_capacity(MIX_FRAME_SIZE);
for i in 0..MIX_FRAME_SIZE { for i in 0..MIX_FRAME_SIZE {
let sample = (total_mix[i] - slot.last_pcm_frame[i] as i32) let sample =
.clamp(-32768, 32767) as i16; (total_mix[i] - slot.last_pcm_frame[i] as f64) as f32;
mix_minus.push(sample); mix_minus.push(sample.clamp(-1.0, 1.0));
} }
// Resample from 16kHz to the leg's codec native rate. // Resample from 48kHz to the leg's codec native rate.
let target_rate = codec_sample_rate(slot.codec_pt); let target_rate = codec_sample_rate(slot.codec_pt);
let resampled = if target_rate == MIX_RATE { let resampled = if target_rate == MIX_RATE {
mix_minus mix_minus
} else { } else {
slot.transcoder slot.transcoder
.resample(&mix_minus, MIX_RATE, target_rate) .resample_f32(&mix_minus, MIX_RATE, target_rate)
.unwrap_or_default() .unwrap_or_default()
}; };
// Encode to the leg's codec. // Encode to the leg's codec (f32 → i16 → codec inside encode_from_f32).
let encoded = let encoded =
match slot.transcoder.encode_from_pcm(&resampled, slot.codec_pt) { match slot.transcoder.encode_from_f32(&resampled, slot.codec_pt) {
Ok(e) if !e.is_empty() => e, Ok(e) if !e.is_empty() => e,
_ => continue, _ => continue,
}; };
@@ -456,21 +467,21 @@ async fn mixer_loop(
frame frame
} else { } else {
state.prompt_done = true; state.prompt_done = true;
vec![0i16; MIX_FRAME_SIZE] vec![0.0f32; MIX_FRAME_SIZE]
}; };
// Encode prompt frame to the leg's codec (reuses existing encode path). // Encode prompt frame to the leg's codec.
let target_rate = codec_sample_rate(slot.codec_pt); let target_rate = codec_sample_rate(slot.codec_pt);
let resampled = if target_rate == MIX_RATE { let resampled = if target_rate == MIX_RATE {
pcm_frame pcm_frame
} else { } else {
slot.transcoder slot.transcoder
.resample(&pcm_frame, MIX_RATE, target_rate) .resample_f32(&pcm_frame, MIX_RATE, target_rate)
.unwrap_or_default() .unwrap_or_default()
}; };
if let Ok(encoded) = if let Ok(encoded) =
slot.transcoder.encode_from_pcm(&resampled, slot.codec_pt) slot.transcoder.encode_from_f32(&resampled, slot.codec_pt)
{ {
if !encoded.is_empty() { if !encoded.is_empty() {
let header = build_rtp_header( let header = build_rtp_header(
@@ -523,7 +534,7 @@ async fn mixer_loop(
.filter(|(_, s)| matches!(s.role, LegRole::Participant)) .filter(|(_, s)| matches!(s.role, LegRole::Participant))
.map(|(lid, s)| ToolAudioSource { .map(|(lid, s)| ToolAudioSource {
leg_id: lid.clone(), leg_id: lid.clone(),
pcm_16k: s.last_pcm_frame.clone(), pcm_48k: s.last_pcm_frame.clone(),
}) })
.collect(); .collect();
@@ -533,7 +544,7 @@ async fn mixer_loop(
.iter() .iter()
.map(|s| ToolAudioSource { .map(|s| ToolAudioSource {
leg_id: s.leg_id.clone(), leg_id: s.leg_id.clone(),
pcm_16k: s.pcm_16k.clone(), pcm_48k: s.pcm_48k.clone(),
}) })
.collect(), .collect(),
}; };

View File

@@ -2,7 +2,7 @@
//! //!
//! Tool legs are observer legs that receive individual audio streams from each //! Tool legs are observer legs that receive individual audio streams from each
//! participant in a call. The mixer pipes `ToolAudioBatch` every 20ms containing //! participant in a call. The mixer pipes `ToolAudioBatch` every 20ms containing
//! each participant's decoded PCM@16kHz tagged with source leg ID. //! each participant's decoded PCM@48kHz f32 tagged with source leg ID.
//! //!
//! Consumers: //! Consumers:
//! - **Recording**: writes per-source WAV files for speaker-separated recording. //! - **Recording**: writes per-source WAV files for speaker-separated recording.
@@ -37,20 +37,25 @@ pub fn spawn_recording_tool(
while let Some(batch) = rx.recv().await { while let Some(batch) = rx.recv().await {
for source in &batch.sources { for source in &batch.sources {
// Skip silence-only frames (all zeros = no audio activity). // Skip silence-only frames (near-zero = no audio activity).
let has_audio = source.pcm_16k.iter().any(|&s| s != 0); let has_audio = source.pcm_48k.iter().any(|&s| s.abs() > 1e-6);
if !has_audio && !recorders.contains_key(&source.leg_id) { if !has_audio && !recorders.contains_key(&source.leg_id) {
continue; // Don't create a file for silence-only sources. continue; // Don't create a file for silence-only sources.
} }
let recorder = recorders.entry(source.leg_id.clone()).or_insert_with(|| { let recorder = recorders.entry(source.leg_id.clone()).or_insert_with(|| {
let path = format!("{}/{}-{}.wav", base_dir, call_id, source.leg_id); let path = format!("{}/{}-{}.wav", base_dir, call_id, source.leg_id);
Recorder::new_pcm(&path, 16000, None).unwrap_or_else(|e| { Recorder::new_pcm(&path, 48000, None).unwrap_or_else(|e| {
panic!("failed to create recorder for {}: {e}", source.leg_id); panic!("failed to create recorder for {}: {e}", source.leg_id);
}) })
}); });
if !recorder.write_pcm(&source.pcm_16k) { // Convert f32 [-1.0, 1.0] to i16 for WAV writing.
let pcm_i16: Vec<i16> = source.pcm_48k
.iter()
.map(|&s| (s * 32767.0).round().clamp(-32768.0, 32767.0) as i16)
.collect();
if !recorder.write_pcm(&pcm_i16) {
// Max duration reached — stop recording this source. // Max duration reached — stop recording this source.
break; break;
} }
@@ -88,7 +93,7 @@ pub fn spawn_recording_tool(
/// Spawn a transcription tool leg. /// Spawn a transcription tool leg.
/// ///
/// The plumbing is fully real: it receives per-source unmerged PCM@16kHz from /// The plumbing is fully real: it receives per-source unmerged PCM@48kHz f32 from
/// the mixer every 20ms. The consumer is a stub that accumulates audio and /// the mixer every 20ms. The consumer is a stub that accumulates audio and
/// reports metadata on close. Future: will stream to a Whisper HTTP endpoint. /// reports metadata on close. Future: will stream to a Whisper HTTP endpoint.
pub fn spawn_transcription_tool( pub fn spawn_transcription_tool(
@@ -105,7 +110,7 @@ pub fn spawn_transcription_tool(
while let Some(batch) = rx.recv().await { while let Some(batch) = rx.recv().await {
for source in &batch.sources { for source in &batch.sources {
*source_samples.entry(source.leg_id.clone()).or_insert(0) += *source_samples.entry(source.leg_id.clone()).or_insert(0) +=
source.pcm_16k.len() as u64; source.pcm_48k.len() as u64;
// TODO: Future — accumulate chunks and stream to Whisper endpoint. // TODO: Future — accumulate chunks and stream to Whisper endpoint.
// For now, the audio is received and counted but not processed. // For now, the audio is received and counted but not processed.
@@ -118,7 +123,7 @@ pub fn spawn_transcription_tool(
.map(|(leg_id, samples)| { .map(|(leg_id, samples)| {
serde_json::json!({ serde_json::json!({
"source_leg_id": leg_id, "source_leg_id": leg_id,
"duration_ms": (samples * 1000) / 16000, "duration_ms": (samples * 1000) / 48000,
}) })
}) })
.collect(); .collect();

View File

@@ -3,6 +3,6 @@
*/ */
export const commitinfo = { export const commitinfo = {
name: 'siprouter', name: 'siprouter',
version: '1.16.0', version: '1.17.0',
description: 'undefined' description: 'undefined'
} }

View File

@@ -3,6 +3,6 @@
*/ */
export const commitinfo = { export const commitinfo = {
name: 'siprouter', name: 'siprouter',
version: '1.16.0', version: '1.17.0',
description: 'undefined' description: 'undefined'
} }