initial commit — SIP B2BUA + WebRTC bridge with Rust codec engine

Full-featured SIP router with multi-provider trunking, browser softphone via WebRTC, real-time Opus/G.722/PCM transcoding in Rust, RNNoise ML noise suppression, Kokoro neural TTS announcements, and a Lit-based web dashboard with live call monitoring and REST API.
2026-04-09 23:03:55 +00:00
commit f3e1c96872
59 changed files with 18377 additions and 0 deletions
--- a/rust/crates/tts-engine/src/main.rs
+++ b/rust/crates/tts-engine/src/main.rs
@@ -0,0 +1,149 @@
+//! TTS engine CLI — synthesizes text to a WAV file using Kokoro neural TTS.
+//!
+//! Usage:
+//!   echo "Hello world" | tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav
+//!   tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav --text "Hello world"
+//!
+//! The text is read from stdin when `--text` is omitted. An optional
+//! `--voice <voice_name>` selects the voice (default: `af_bella`).
+//!
+//! Outputs 24kHz 16-bit mono WAV.
+
+use kokoro_tts::{KokoroTts, Voice};
+use std::io::{self, Read};
+
+/// Parses the process command line into `(model, voices, output, voice, text)`.
+///
+/// Thin wrapper that hands `std::env::args()` (minus the program name) to
+/// [`parse_arg_list`]; see that function for the accepted flags and errors.
+fn parse_args() -> Result<(String, String, String, String, Option<String>), String> {
+    parse_arg_list(std::env::args().skip(1))
+}
+
+/// Parses an argument list whose program name has already been removed.
+///
+/// Recognized flags, each taking the following argument as its value:
+/// `--model`, `--voices`, `--output` (alias `--output_file`), `--text` and
+/// `--voice`. Unrecognized arguments are ignored, and when a flag is repeated
+/// the last occurrence wins.
+///
+/// Returns `(model, voices, output, voice, text)`, where `voice` defaults to
+/// `"af_bella"` when `--voice` is absent and `text` is `None` when `--text`
+/// is absent.
+///
+/// # Errors
+///
+/// Returns a message when a recognized flag is the last argument (so it has
+/// no value), or when `--model`, `--voices` or `--output` is missing or empty.
+fn parse_arg_list(
+    mut args: impl Iterator<Item = String>,
+) -> Result<(String, String, String, String, Option<String>), String> {
+    let mut model = String::new();
+    let mut voices = String::new();
+    let mut output = String::new();
+    let mut text: Option<String> = None;
+    let mut voice_name: Option<String> = None;
+
+    while let Some(flag) = args.next() {
+        // Only invoked for recognized flags, so unknown arguments never
+        // swallow the argument that follows them.
+        let mut value = || {
+            args.next()
+                .ok_or_else(|| format!("{} requires a value", flag))
+        };
+        match flag.as_str() {
+            "--model" => model = value()?,
+            "--voices" => voices = value()?,
+            "--output" | "--output_file" => output = value()?,
+            "--text" => text = Some(value()?),
+            "--voice" => voice_name = Some(value()?),
+            _ => {}
+        }
+    }
+
+    if model.is_empty() {
+        return Err("--model required".into());
+    }
+    if voices.is_empty() {
+        return Err("--voices required".into());
+    }
+    if output.is_empty() {
+        return Err("--output required".into());
+    }
+
+    let voice = voice_name.unwrap_or_else(|| "af_bella".into());
+
+    Ok((model, voices, output, voice, text))
+}
+
+/// Maps a voice identifier such as `"af_bella"` to the matching [`Voice`] variant.
+///
+/// Every variant is constructed with the same argument, `1.0`. A name that is
+/// not in the table is reported on stderr and resolved to `Voice::AfBella`.
+fn select_voice(name: &str) -> Voice {
+    // NOTE(review): presumably a speed multiplier — confirm against kokoro_tts.
+    let factor = 1.0;
+
+    match name {
+        // "af_" voices
+        "af_alloy" => Voice::AfAlloy(factor),
+        "af_aoede" => Voice::AfAoede(factor),
+        "af_bella" => Voice::AfBella(factor),
+        "af_heart" => Voice::AfHeart(factor),
+        "af_jessica" => Voice::AfJessica(factor),
+        "af_kore" => Voice::AfKore(factor),
+        "af_nicole" => Voice::AfNicole(factor),
+        "af_nova" => Voice::AfNova(factor),
+        "af_river" => Voice::AfRiver(factor),
+        "af_sarah" => Voice::AfSarah(factor),
+        "af_sky" => Voice::AfSky(factor),
+
+        // "am_" voices
+        "am_adam" => Voice::AmAdam(factor),
+        "am_echo" => Voice::AmEcho(factor),
+        "am_eric" => Voice::AmEric(factor),
+        "am_fenrir" => Voice::AmFenrir(factor),
+        "am_liam" => Voice::AmLiam(factor),
+        "am_michael" => Voice::AmMichael(factor),
+        "am_onyx" => Voice::AmOnyx(factor),
+        "am_puck" => Voice::AmPuck(factor),
+
+        // "bf_" voices
+        "bf_alice" => Voice::BfAlice(factor),
+        "bf_emma" => Voice::BfEmma(factor),
+        "bf_isabella" => Voice::BfIsabella(factor),
+        "bf_lily" => Voice::BfLily(factor),
+
+        // "bm_" voices
+        "bm_daniel" => Voice::BmDaniel(factor),
+        "bm_fable" => Voice::BmFable(factor),
+        "bm_george" => Voice::BmGeorge(factor),
+        "bm_lewis" => Voice::BmLewis(factor),
+
+        unknown => {
+            eprintln!("[tts-engine] unknown voice '{}', falling back to af_bella", unknown);
+            Voice::AfBella(factor)
+        }
+    }
+}
+
+/// Prints `msg` to stderr with the `[tts-engine]` prefix and exits with status 1.
+fn fail(msg: impl std::fmt::Display) -> ! {
+    eprintln!("[tts-engine] {}", msg);
+    std::process::exit(1)
+}
+
+/// Entry point: parses the command line, obtains the text (from `--text` or
+/// stdin), loads the Kokoro model, synthesizes the text with the selected
+/// voice and writes the result as a 24kHz 16-bit mono WAV file.
+///
+/// Every failure is reported on stderr and ends the process with exit
+/// status 1; progress messages also go to stderr.
+#[tokio::main]
+async fn main() {
+    let (model_path, voices_path, output_path, voice_name, text_arg) = match parse_args() {
+        Ok(v) => v,
+        Err(e) => {
+            eprintln!("Error: {}", e);
+            eprintln!("Usage: tts-engine --model <model.onnx> --voices <voices.bin> --output <output.wav> [--text <text>] [--voice <voice_name>]");
+            std::process::exit(1);
+        }
+    };
+
+    // Get text from --text arg or stdin.
+    let text = match text_arg {
+        Some(t) => t,
+        None => {
+            let mut buf = String::new();
+            if let Err(e) = io::stdin().read_to_string(&mut buf) {
+                fail(format!("failed to read stdin: {}", e));
+            }
+            buf.trim().to_string()
+        }
+    };
+
+    // Whitespace-only input counts as "no text", whichever source it came from.
+    if text.trim().is_empty() {
+        fail("no text provided");
+    }
+
+    eprintln!("[tts-engine] loading model: {}", model_path);
+    let tts = match KokoroTts::new(&model_path, &voices_path).await {
+        Ok(t) => t,
+        Err(e) => fail(format!("failed to load model: {:?}", e)),
+    };
+
+    let voice = select_voice(&voice_name);
+    eprintln!("[tts-engine] synthesizing with voice '{}': \"{}\"", voice_name, text);
+
+    let (samples, duration) = match tts.synth(&text, voice).await {
+        Ok(r) => r,
+        Err(e) => fail(format!("synthesis failed: {:?}", e)),
+    };
+
+    eprintln!("[tts-engine] synthesized {} samples in {:?}", samples.len(), duration);
+
+    // Write WAV: 24kHz, 16-bit, mono (same format announcement.ts expects).
+    let spec = hound::WavSpec {
+        channels: 1,
+        sample_rate: 24000,
+        bits_per_sample: 16,
+        sample_format: hound::SampleFormat::Int,
+    };
+
+    let mut writer = match hound::WavWriter::create(&output_path, spec) {
+        Ok(w) => w,
+        Err(e) => fail(format!("failed to create WAV: {}", e)),
+    };
+
+    for &sample in &samples {
+        // Scale to the i16 range; the clamp saturates anything that lands
+        // outside it after scaling instead of relying on the cast.
+        let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16;
+        if let Err(e) = writer.write_sample(s16) {
+            fail(format!("failed to write WAV sample: {}", e));
+        }
+    }
+    // finalize() flushes the data and fixes up the header; a failure here
+    // means the output file is not a valid WAV.
+    if let Err(e) = writer.finalize() {
+        fail(format!("failed to finalize WAV: {}", e));
+    }
+
+    eprintln!("[tts-engine] wrote {}", output_path);
+}