feat(proxy-engine): upgrade the internal audio bus to 48kHz f32 with per-leg denoising and improve SIP leg routing

2026-04-10 15:58:41 +00:00
parent 10ad432a4c
commit 73b28f5f57
10 changed files with 194 additions and 65 deletions
--- a/rust/crates/proxy-engine/src/tool_leg.rs
+++ b/rust/crates/proxy-engine/src/tool_leg.rs
@@ -2,7 +2,7 @@
 //!
 //! Tool legs are observer legs that receive individual audio streams from each
 //! participant in a call. The mixer pipes `ToolAudioBatch` every 20ms containing
-//! each participant's decoded PCM@16kHz tagged with source leg ID.
+//! each participant's decoded PCM@48kHz f32 tagged with source leg ID.
 //!
 //! Consumers:
 //! - **Recording**: writes per-source WAV files for speaker-separated recording.
@@ -37,20 +37,25 @@ pub fn spawn_recording_tool(

        while let Some(batch) = rx.recv().await {
            for source in &batch.sources {
-                // Skip silence-only frames (all zeros = no audio activity).
-                let has_audio = source.pcm_16k.iter().any(|&s| s != 0);
+                // Skip silence-only frames (near-zero = no audio activity).
+                let has_audio = source.pcm_48k.iter().any(|&s| s.abs() > 1e-6);
                if !has_audio && !recorders.contains_key(&source.leg_id) {
                    continue; // Don't create a file for silence-only sources.
                }

                let recorder = recorders.entry(source.leg_id.clone()).or_insert_with(|| {
                    let path = format!("{}/{}-{}.wav", base_dir, call_id, source.leg_id);
-                    Recorder::new_pcm(&path, 16000, None).unwrap_or_else(|e| {
+                    Recorder::new_pcm(&path, 48000, None).unwrap_or_else(|e| {
                        panic!("failed to create recorder for {}: {e}", source.leg_id);
                    })
                });

-                if !recorder.write_pcm(&source.pcm_16k) {
+                // Convert f32 to i16 for WAV writing; samples outside [-1.0, 1.0] are clamped.
+                let pcm_i16: Vec<i16> = source.pcm_48k
+                    .iter()
+                    .map(|&s| (s * 32767.0).round().clamp(-32768.0, 32767.0) as i16)
+                    .collect();
+                if !recorder.write_pcm(&pcm_i16) {
                    // Max duration reached — stop recording this source.
                    break;
                }
@@ -88,7 +93,7 @@ pub fn spawn_recording_tool(

 /// Spawn a transcription tool leg.
 ///
-/// The plumbing is fully real: it receives per-source unmerged PCM@16kHz from
+/// The plumbing is fully real: it receives per-source unmerged PCM@48kHz f32 from
 /// the mixer every 20ms. The consumer is a stub that accumulates audio and
 /// reports metadata on close. Future: will stream to a Whisper HTTP endpoint.
 pub fn spawn_transcription_tool(
@@ -105,7 +110,7 @@ pub fn spawn_transcription_tool(
        while let Some(batch) = rx.recv().await {
            for source in &batch.sources {
                *source_samples.entry(source.leg_id.clone()).or_insert(0) +=
-                    source.pcm_16k.len() as u64;
+                    source.pcm_48k.len() as u64;

                // TODO: Future — accumulate chunks and stream to Whisper endpoint.
                // For now, the audio is received and counted but not processed.
@@ -118,7 +123,7 @@ pub fn spawn_transcription_tool(
            .map(|(leg_id, samples)| {
                serde_json::json!({
                    "source_leg_id": leg_id,
-                    "duration_ms": (samples * 1000) / 16000,
+                    "duration_ms": (samples * 1000) / 48000,
                })
            })
            .collect();