diff --git a/changelog.md b/changelog.md index f40b977..5ba3a6a 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,12 @@ # Changelog +## 2026-04-10 - 1.16.0 - feat(proxy-engine) +integrate Kokoro TTS generation into proxy-engine and simplify TypeScript prompt handling to use cached WAV files + +- adds a generate_tts command to proxy-engine with lazy-loaded Kokoro model support and WAV output generation +- removes standalone opus-codec and tts-engine workspace binaries by consolidating TTS generation into proxy-engine +- updates announcement and prompt cache flows to generate and cache WAV files on disk instead of pre-encoding RTP frames in TypeScript + ## 2026-04-10 - 1.15.0 - feat(proxy-engine) add device leg, leg transfer, and leg replacement call controls diff --git a/rust/Cargo.lock b/rust/Cargo.lock index f3e003b..067b8eb 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -1881,16 +1881,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "opus-codec" -version = "0.2.0" -dependencies = [ - "base64 0.22.1", - "codec-lib", - "serde", - "serde_json", -] - [[package]] name = "ort" version = "2.0.0-rc.11" @@ -2188,6 +2178,8 @@ dependencies = [ "base64 0.22.1", "codec-lib", "hound", + "kokoro-tts", + "ort", "rand 0.8.5", "regex-lite", "serde", @@ -3008,16 +3000,6 @@ dependencies = [ "strength_reduce", ] -[[package]] -name = "tts-engine" -version = "0.1.0" -dependencies = [ - "hound", - "kokoro-tts", - "ort", - "tokio", -] - [[package]] name = "turn" version = "0.6.1" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index c32663c..75cd36d 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -1,8 +1,6 @@ [workspace] members = [ "crates/codec-lib", - "crates/opus-codec", - "crates/tts-engine", "crates/sip-proto", "crates/proxy-engine", ] diff --git a/rust/crates/codec-lib/src/lib.rs b/rust/crates/codec-lib/src/lib.rs index dfa4f0a..3de8e7f 100644 --- a/rust/crates/codec-lib/src/lib.rs +++ b/rust/crates/codec-lib/src/lib.rs @@ -1,7 +1,7 @@ //! Audio codec library for the SIP router. //! //! Handles Opus ↔ G.722 ↔ PCMU/PCMA transcoding with ML noise suppression. -//! Used by both the standalone `opus-codec` CLI and the `proxy-engine` binary. +//! Used by the `proxy-engine` binary for all audio transcoding. use audiopus::coder::{Decoder as OpusDecoder, Encoder as OpusEncoder}; use audiopus::packet::Packet as OpusPacket; diff --git a/rust/crates/opus-codec/Cargo.toml b/rust/crates/opus-codec/Cargo.toml deleted file mode 100644 index bbbb543..0000000 --- a/rust/crates/opus-codec/Cargo.toml +++ /dev/null @@ -1,14 +0,0 @@ -[package] -name = "opus-codec" -version = "0.2.0" -edition = "2021" - -[[bin]] -name = "opus-codec" -path = "src/main.rs" - -[dependencies] -codec-lib = { path = "../codec-lib" } -serde = { version = "1", features = ["derive"] } -serde_json = "1" -base64 = "0.22" diff --git a/rust/crates/opus-codec/src/main.rs b/rust/crates/opus-codec/src/main.rs deleted file mode 100644 index a9b2be9..0000000 --- a/rust/crates/opus-codec/src/main.rs +++ /dev/null @@ -1,286 +0,0 @@ -/// Audio transcoding bridge for smartrust. -/// -/// Thin CLI wrapper around `codec-lib`. Handles Opus ↔ G.722 ↔ PCMU transcoding. -/// -/// Protocol: -/// -> {"id":"1","method":"init","params":{}} -/// <- {"id":"1","success":true,"result":{}} -/// -> {"id":"2","method":"create_session","params":{"session_id":"call-abc"}} -/// <- {"id":"2","success":true,"result":{}} -/// -> {"id":"3","method":"transcode","params":{"session_id":"call-abc","data_b64":"...","from_pt":111,"to_pt":9}} -/// <- {"id":"3","success":true,"result":{"data_b64":"..."}} -/// -> {"id":"4","method":"destroy_session","params":{"session_id":"call-abc"}} -/// <- {"id":"4","success":true,"result":{}} - -use base64::engine::general_purpose::STANDARD as B64; -use base64::Engine as _; -use codec_lib::{codec_sample_rate, TranscodeState}; -use serde::Deserialize; -use std::collections::HashMap; -use std::io::{self, BufRead, Write}; - -#[derive(Deserialize)] -struct Request { - id: String, - method: String, - #[serde(default)] - params: serde_json::Value, -} - -fn respond( - out: &mut impl Write, - id: &str, - success: bool, - result: Option, - error: Option<&str>, -) { - let mut resp = serde_json::json!({ "id": id, "success": success }); - if let Some(r) = result { - resp["result"] = r; - } - if let Some(e) = error { - resp["error"] = serde_json::Value::String(e.to_string()); - } - let _ = writeln!(out, "{}", resp); - let _ = out.flush(); -} - -/// Resolve a session: if session_id is provided, look it up in the sessions map; -/// otherwise fall back to the default state (backward compat with `init`). -fn get_session<'a>( - sessions: &'a mut HashMap, - default: &'a mut Option, - params: &serde_json::Value, -) -> Option<&'a mut TranscodeState> { - if let Some(sid) = params.get("session_id").and_then(|v| v.as_str()) { - sessions.get_mut(sid) - } else { - default.as_mut() - } -} - -fn main() { - let stdin = io::stdin(); - let stdout = io::stdout(); - let mut out = io::BufWriter::new(stdout.lock()); - - let _ = writeln!(out, r#"{{"event":"ready","data":{{}}}}"#); - let _ = out.flush(); - - let mut default_state: Option = None; - let mut sessions: HashMap = HashMap::new(); - - for line in stdin.lock().lines() { - let line = match line { - Ok(l) if !l.trim().is_empty() => l, - Ok(_) => continue, - Err(_) => break, - }; - - let req: Request = match serde_json::from_str(&line) { - Ok(r) => r, - Err(e) => { - respond(&mut out, "", false, None, Some(&format!("parse: {e}"))); - continue; - } - }; - - match req.method.as_str() { - "init" => match TranscodeState::new() { - Ok(s) => { - default_state = Some(s); - respond(&mut out, &req.id, true, Some(serde_json::json!({})), None); - } - Err(e) => respond(&mut out, &req.id, false, None, Some(&e)), - }, - - "create_session" => { - let session_id = match req.params.get("session_id").and_then(|v| v.as_str()) { - Some(s) => s.to_string(), - None => { - respond(&mut out, &req.id, false, None, Some("missing session_id")); - continue; - } - }; - if sessions.contains_key(&session_id) { - respond(&mut out, &req.id, true, Some(serde_json::json!({})), None); - continue; - } - match TranscodeState::new() { - Ok(s) => { - sessions.insert(session_id, s); - respond(&mut out, &req.id, true, Some(serde_json::json!({})), None); - } - Err(e) => respond(&mut out, &req.id, false, None, Some(&e)), - } - } - - "destroy_session" => { - let session_id = match req.params.get("session_id").and_then(|v| v.as_str()) { - Some(s) => s, - None => { - respond(&mut out, &req.id, false, None, Some("missing session_id")); - continue; - } - }; - sessions.remove(session_id); - respond(&mut out, &req.id, true, Some(serde_json::json!({})), None); - } - - "transcode" => { - let st = match get_session(&mut sessions, &mut default_state, &req.params) { - Some(s) => s, - None => { - respond( - &mut out, - &req.id, - false, - None, - Some("not initialized (no session or default state)"), - ); - continue; - } - }; - let data_b64 = match req.params.get("data_b64").and_then(|v| v.as_str()) { - Some(s) => s, - None => { - respond(&mut out, &req.id, false, None, Some("missing data_b64")); - continue; - } - }; - let from_pt = - req.params.get("from_pt").and_then(|v| v.as_u64()).unwrap_or(0) as u8; - let to_pt = req.params.get("to_pt").and_then(|v| v.as_u64()).unwrap_or(0) as u8; - let direction = req.params.get("direction").and_then(|v| v.as_str()); - - let data = match B64.decode(data_b64) { - Ok(b) => b, - Err(e) => { - respond( - &mut out, - &req.id, - false, - None, - Some(&format!("b64: {e}")), - ); - continue; - } - }; - - match st.transcode(&data, from_pt, to_pt, direction) { - Ok(result) => { - respond( - &mut out, - &req.id, - true, - Some(serde_json::json!({ "data_b64": B64.encode(&result) })), - None, - ); - } - Err(e) => respond(&mut out, &req.id, false, None, Some(&e)), - } - } - - "encode_pcm" => { - let st = match get_session(&mut sessions, &mut default_state, &req.params) { - Some(s) => s, - None => { - respond( - &mut out, - &req.id, - false, - None, - Some("not initialized (no session or default state)"), - ); - continue; - } - }; - let data_b64 = match req.params.get("data_b64").and_then(|v| v.as_str()) { - Some(s) => s, - None => { - respond(&mut out, &req.id, false, None, Some("missing data_b64")); - continue; - } - }; - let sample_rate = req - .params - .get("sample_rate") - .and_then(|v| v.as_u64()) - .unwrap_or(22050) as u32; - let to_pt = req.params.get("to_pt").and_then(|v| v.as_u64()).unwrap_or(9) as u8; - - let data = match B64.decode(data_b64) { - Ok(b) => b, - Err(e) => { - respond( - &mut out, - &req.id, - false, - None, - Some(&format!("b64: {e}")), - ); - continue; - } - }; - - if data.len() % 2 != 0 { - respond( - &mut out, - &req.id, - false, - None, - Some("PCM data has odd byte count (expected 16-bit LE samples)"), - ); - continue; - } - - let pcm: Vec = data - .chunks_exact(2) - .map(|c| i16::from_le_bytes([c[0], c[1]])) - .collect(); - - let target_rate = codec_sample_rate(to_pt); - let resampled = match st.resample(&pcm, sample_rate, target_rate) { - Ok(r) => r, - Err(e) => { - respond(&mut out, &req.id, false, None, Some(&e)); - continue; - } - }; - - match st.encode_from_pcm(&resampled, to_pt) { - Ok(encoded) => { - respond( - &mut out, - &req.id, - true, - Some(serde_json::json!({ "data_b64": B64.encode(&encoded) })), - None, - ); - } - Err(e) => { - respond(&mut out, &req.id, false, None, Some(&e)); - } - } - } - - "encode" | "decode" => { - respond( - &mut out, - &req.id, - false, - None, - Some("use 'transcode' command instead"), - ); - } - - _ => respond( - &mut out, - &req.id, - false, - None, - Some(&format!("unknown: {}", req.method)), - ), - } - } -} diff --git a/rust/crates/proxy-engine/Cargo.toml b/rust/crates/proxy-engine/Cargo.toml index c9d9d94..cac37be 100644 --- a/rust/crates/proxy-engine/Cargo.toml +++ b/rust/crates/proxy-engine/Cargo.toml @@ -18,3 +18,8 @@ regex-lite = "0.1" webrtc = "0.8" rand = "0.8" hound = "3.5" +kokoro-tts = { version = "0.3", default-features = false } +ort = { version = "=2.0.0-rc.11", default-features = false, features = [ + "std", "download-binaries", "copy-dylibs", "ndarray", + "tls-native-vendored" +] } diff --git a/rust/crates/proxy-engine/src/main.rs b/rust/crates/proxy-engine/src/main.rs index 5150aad..b08b2be 100644 --- a/rust/crates/proxy-engine/src/main.rs +++ b/rust/crates/proxy-engine/src/main.rs @@ -21,6 +21,7 @@ mod rtp; mod sip_leg; mod sip_transport; mod tool_leg; +mod tts; mod voicemail; mod webrtc_engine; @@ -93,6 +94,9 @@ async fn main() { // WebRTC engine — separate lock to avoid deadlock with SIP handlers. let webrtc = Arc::new(Mutex::new(WebRtcEngine::new(out_tx.clone()))); + // TTS engine — separate lock, lazy-loads model on first use. + let tts_engine = Arc::new(Mutex::new(tts::TtsEngine::new())); + // Read commands from stdin. let stdin = tokio::io::stdin(); let reader = BufReader::new(stdin); @@ -113,11 +117,12 @@ async fn main() { let engine = engine.clone(); let webrtc = webrtc.clone(); + let tts_engine = tts_engine.clone(); let out_tx = out_tx.clone(); // Handle commands — some are async, so we spawn. tokio::spawn(async move { - handle_command(engine, webrtc, &out_tx, cmd).await; + handle_command(engine, webrtc, tts_engine, &out_tx, cmd).await; }); } } @@ -125,6 +130,7 @@ async fn main() { async fn handle_command( engine: Arc>, webrtc: Arc>, + tts_engine: Arc>, out_tx: &OutTx, cmd: Command, ) { @@ -150,6 +156,8 @@ async fn handle_command( "add_tool_leg" => handle_add_tool_leg(engine, out_tx, &cmd).await, "remove_tool_leg" => handle_remove_tool_leg(engine, out_tx, &cmd).await, "set_leg_metadata" => handle_set_leg_metadata(engine, out_tx, &cmd).await, + // TTS command — lock tts_engine only (no SIP/WebRTC contention). + "generate_tts" => handle_generate_tts(tts_engine, out_tx, &cmd).await, _ => respond_err(out_tx, &cmd.id, &format!("unknown command: {}", cmd.method)), } } @@ -1218,3 +1226,16 @@ async fn handle_set_leg_metadata( leg.metadata.insert(key, value); respond_ok(out_tx, &cmd.id, serde_json::json!({})); } + +/// Handle `generate_tts` — synthesize text to a WAV file using Kokoro TTS. +async fn handle_generate_tts( + tts_engine: Arc>, + out_tx: &OutTx, + cmd: &Command, +) { + let mut tts = tts_engine.lock().await; + match tts.generate(&cmd.params).await { + Ok(result) => respond_ok(out_tx, &cmd.id, result), + Err(e) => respond_err(out_tx, &cmd.id, &e), + } +} diff --git a/rust/crates/proxy-engine/src/tts.rs b/rust/crates/proxy-engine/src/tts.rs new file mode 100644 index 0000000..02d8f17 --- /dev/null +++ b/rust/crates/proxy-engine/src/tts.rs @@ -0,0 +1,138 @@ +//! Text-to-speech engine — synthesizes text to WAV files using Kokoro neural TTS. +//! +//! The model is loaded lazily on first use. If the model/voices files are not +//! present, the generate command returns an error and the TS side falls back +//! to espeak-ng. + +use kokoro_tts::{KokoroTts, Voice}; +use std::path::Path; + +/// Wraps the Kokoro TTS engine with lazy model loading. +pub struct TtsEngine { + tts: Option, + /// Path that was used to load the current model (for cache invalidation). + loaded_model_path: String, + loaded_voices_path: String, +} + +impl TtsEngine { + pub fn new() -> Self { + Self { + tts: None, + loaded_model_path: String::new(), + loaded_voices_path: String::new(), + } + } + + /// Generate a WAV file from text. + /// + /// Params (from IPC JSON): + /// - `model`: path to the ONNX model file + /// - `voices`: path to the voices.bin file + /// - `voice`: voice name (e.g. "af_bella") + /// - `text`: text to synthesize + /// - `output`: output WAV file path + pub async fn generate(&mut self, params: &serde_json::Value) -> Result { + let model_path = params.get("model").and_then(|v| v.as_str()) + .ok_or("missing 'model' param")?; + let voices_path = params.get("voices").and_then(|v| v.as_str()) + .ok_or("missing 'voices' param")?; + let voice_name = params.get("voice").and_then(|v| v.as_str()) + .unwrap_or("af_bella"); + let text = params.get("text").and_then(|v| v.as_str()) + .ok_or("missing 'text' param")?; + let output_path = params.get("output").and_then(|v| v.as_str()) + .ok_or("missing 'output' param")?; + + if text.is_empty() { + return Err("empty text".into()); + } + + // Check that model/voices files exist. + if !Path::new(model_path).exists() { + return Err(format!("model not found: {model_path}")); + } + if !Path::new(voices_path).exists() { + return Err(format!("voices not found: {voices_path}")); + } + + // Lazy-load or reload if paths changed. + if self.tts.is_none() + || self.loaded_model_path != model_path + || self.loaded_voices_path != voices_path + { + eprintln!("[tts] loading model: {model_path}"); + let tts = KokoroTts::new(model_path, voices_path) + .await + .map_err(|e| format!("model load failed: {e:?}"))?; + self.tts = Some(tts); + self.loaded_model_path = model_path.to_string(); + self.loaded_voices_path = voices_path.to_string(); + } + + let tts = self.tts.as_ref().unwrap(); + let voice = select_voice(voice_name); + + eprintln!("[tts] synthesizing voice '{voice_name}': \"{text}\""); + let (samples, duration) = tts.synth(text, voice) + .await + .map_err(|e| format!("synthesis failed: {e:?}"))?; + eprintln!("[tts] synthesized {} samples in {duration:?}", samples.len()); + + // Write 24kHz 16-bit mono WAV. + let spec = hound::WavSpec { + channels: 1, + sample_rate: 24000, + bits_per_sample: 16, + sample_format: hound::SampleFormat::Int, + }; + + let mut writer = hound::WavWriter::create(output_path, spec) + .map_err(|e| format!("WAV create failed: {e}"))?; + for &sample in &samples { + let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16; + writer.write_sample(s16).map_err(|e| format!("WAV write: {e}"))?; + } + writer.finalize().map_err(|e| format!("WAV finalize: {e}"))?; + + eprintln!("[tts] wrote {output_path}"); + Ok(serde_json::json!({ "output": output_path })) + } +} + +/// Map voice name string to Kokoro Voice enum variant. +fn select_voice(name: &str) -> Voice { + match name { + "af_bella" => Voice::AfBella(1.0), + "af_heart" => Voice::AfHeart(1.0), + "af_jessica" => Voice::AfJessica(1.0), + "af_nicole" => Voice::AfNicole(1.0), + "af_nova" => Voice::AfNova(1.0), + "af_sarah" => Voice::AfSarah(1.0), + "af_sky" => Voice::AfSky(1.0), + "af_river" => Voice::AfRiver(1.0), + "af_alloy" => Voice::AfAlloy(1.0), + "af_aoede" => Voice::AfAoede(1.0), + "af_kore" => Voice::AfKore(1.0), + "am_adam" => Voice::AmAdam(1.0), + "am_echo" => Voice::AmEcho(1.0), + "am_eric" => Voice::AmEric(1.0), + "am_fenrir" => Voice::AmFenrir(1.0), + "am_liam" => Voice::AmLiam(1.0), + "am_michael" => Voice::AmMichael(1.0), + "am_onyx" => Voice::AmOnyx(1.0), + "am_puck" => Voice::AmPuck(1.0), + "bf_alice" => Voice::BfAlice(1.0), + "bf_emma" => Voice::BfEmma(1.0), + "bf_isabella" => Voice::BfIsabella(1.0), + "bf_lily" => Voice::BfLily(1.0), + "bm_daniel" => Voice::BmDaniel(1.0), + "bm_fable" => Voice::BmFable(1.0), + "bm_george" => Voice::BmGeorge(1.0), + "bm_lewis" => Voice::BmLewis(1.0), + _ => { + eprintln!("[tts] unknown voice '{name}', falling back to af_bella"); + Voice::AfBella(1.0) + } + } +} diff --git a/rust/crates/tts-engine/Cargo.toml b/rust/crates/tts-engine/Cargo.toml deleted file mode 100644 index da3a891..0000000 --- a/rust/crates/tts-engine/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "tts-engine" -version = "0.1.0" -edition = "2021" - -[[bin]] -name = "tts-engine" -path = "src/main.rs" - -[dependencies] -kokoro-tts = { version = "0.3", default-features = false } -# Pin to rc.11 matching kokoro-tts's expectation; enable vendored TLS to avoid system libssl-dev. -ort = { version = "=2.0.0-rc.11", default-features = false, features = [ - "std", "download-binaries", "copy-dylibs", "ndarray", - "tls-native-vendored" -] } -tokio = { version = "1", features = ["rt-multi-thread", "macros"] } -hound = "3.5" diff --git a/rust/crates/tts-engine/src/main.rs b/rust/crates/tts-engine/src/main.rs deleted file mode 100644 index b12c6b4..0000000 --- a/rust/crates/tts-engine/src/main.rs +++ /dev/null @@ -1,149 +0,0 @@ -/// TTS engine CLI — synthesizes text to a WAV file using Kokoro neural TTS. -/// -/// Usage: -/// echo "Hello world" | tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav -/// tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav --text "Hello world" -/// -/// Outputs 24kHz 16-bit mono WAV. - -use kokoro_tts::{KokoroTts, Voice}; -use std::io::{self, Read}; - -fn parse_args() -> Result<(String, String, String, String, Option), String> { - let args: Vec = std::env::args().collect(); - let mut model = String::new(); - let mut voices = String::new(); - let mut output = String::new(); - let mut text: Option = None; - let mut voice_name: Option = None; - - let mut i = 1; - while i < args.len() { - match args[i].as_str() { - "--model" => { i += 1; model = args.get(i).cloned().unwrap_or_default(); } - "--voices" => { i += 1; voices = args.get(i).cloned().unwrap_or_default(); } - "--output" | "--output_file" => { i += 1; output = args.get(i).cloned().unwrap_or_default(); } - "--text" => { i += 1; text = args.get(i).cloned(); } - "--voice" => { i += 1; voice_name = args.get(i).cloned(); } - _ => {} - } - i += 1; - } - - if model.is_empty() { return Err("--model required".into()); } - if voices.is_empty() { return Err("--voices required".into()); } - if output.is_empty() { return Err("--output required".into()); } - - let voice_str = voice_name.unwrap_or_else(|| "af_bella".into()); - - Ok((model, voices, output, voice_str, text)) -} - -fn select_voice(name: &str) -> Voice { - match name { - "af_bella" => Voice::AfBella(1.0), - "af_heart" => Voice::AfHeart(1.0), - "af_jessica" => Voice::AfJessica(1.0), - "af_nicole" => Voice::AfNicole(1.0), - "af_nova" => Voice::AfNova(1.0), - "af_sarah" => Voice::AfSarah(1.0), - "af_sky" => Voice::AfSky(1.0), - "af_river" => Voice::AfRiver(1.0), - "af_alloy" => Voice::AfAlloy(1.0), - "af_aoede" => Voice::AfAoede(1.0), - "af_kore" => Voice::AfKore(1.0), - "am_adam" => Voice::AmAdam(1.0), - "am_echo" => Voice::AmEcho(1.0), - "am_eric" => Voice::AmEric(1.0), - "am_fenrir" => Voice::AmFenrir(1.0), - "am_liam" => Voice::AmLiam(1.0), - "am_michael" => Voice::AmMichael(1.0), - "am_onyx" => Voice::AmOnyx(1.0), - "am_puck" => Voice::AmPuck(1.0), - "bf_alice" => Voice::BfAlice(1.0), - "bf_emma" => Voice::BfEmma(1.0), - "bf_isabella" => Voice::BfIsabella(1.0), - "bf_lily" => Voice::BfLily(1.0), - "bm_daniel" => Voice::BmDaniel(1.0), - "bm_fable" => Voice::BmFable(1.0), - "bm_george" => Voice::BmGeorge(1.0), - "bm_lewis" => Voice::BmLewis(1.0), - _ => { - eprintln!("[tts-engine] unknown voice '{}', falling back to af_bella", name); - Voice::AfBella(1.0) - } - } -} - -#[tokio::main] -async fn main() { - let (model_path, voices_path, output_path, voice_name, text_arg) = match parse_args() { - Ok(v) => v, - Err(e) => { - eprintln!("Error: {}", e); - eprintln!("Usage: tts-engine --model --voices --output [--text ] [--voice ]"); - std::process::exit(1); - } - }; - - // Get text from --text arg or stdin. - let text = match text_arg { - Some(t) => t, - None => { - let mut buf = String::new(); - io::stdin().read_to_string(&mut buf).expect("failed to read stdin"); - buf.trim().to_string() - } - }; - - if text.is_empty() { - eprintln!("[tts-engine] no text provided"); - std::process::exit(1); - } - - eprintln!("[tts-engine] loading model: {}", model_path); - let tts = match KokoroTts::new(&model_path, &voices_path).await { - Ok(t) => t, - Err(e) => { - eprintln!("[tts-engine] failed to load model: {:?}", e); - std::process::exit(1); - } - }; - - let voice = select_voice(&voice_name); - eprintln!("[tts-engine] synthesizing with voice '{}': \"{}\"", voice_name, text); - - let (samples, duration) = match tts.synth(&text, voice).await { - Ok(r) => r, - Err(e) => { - eprintln!("[tts-engine] synthesis failed: {:?}", e); - std::process::exit(1); - } - }; - - eprintln!("[tts-engine] synthesized {} samples in {:?}", samples.len(), duration); - - // Write WAV: 24kHz, 16-bit, mono (same format announcement.ts expects). - let spec = hound::WavSpec { - channels: 1, - sample_rate: 24000, - bits_per_sample: 16, - sample_format: hound::SampleFormat::Int, - }; - - let mut writer = match hound::WavWriter::create(&output_path, spec) { - Ok(w) => w, - Err(e) => { - eprintln!("[tts-engine] failed to create WAV: {}", e); - std::process::exit(1); - } - }; - - for &sample in &samples { - let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16; - writer.write_sample(s16).unwrap(); - } - writer.finalize().unwrap(); - - eprintln!("[tts-engine] wrote {}", output_path); -} diff --git a/ts/00_commitinfo_data.ts b/ts/00_commitinfo_data.ts index 207d2eb..8528a1f 100644 --- a/ts/00_commitinfo_data.ts +++ b/ts/00_commitinfo_data.ts @@ -3,6 +3,6 @@ */ export const commitinfo = { name: 'siprouter', - version: '1.15.0', + version: '1.16.0', description: 'undefined' } diff --git a/ts/announcement.ts b/ts/announcement.ts index 70eea76..effca62 100644 --- a/ts/announcement.ts +++ b/ts/announcement.ts @@ -1,59 +1,22 @@ /** - * TTS announcement module — pre-generates audio announcements using espeak-ng - * and caches them as encoded RTP packets for playback during call setup. + * TTS announcement module — generates announcement WAV files at startup. * - * On startup, generates the announcement WAV via espeak-ng (formant-based TTS - * with highly accurate pronunciation), encodes each 20ms frame to G.722 (for - * SIP) and Opus (for WebRTC) via the Rust transcoder, and caches the packets. + * Engine priority: espeak-ng (formant TTS, fast) → Kokoro neural TTS via + * proxy-engine → disabled. * - * Falls back to the Rust tts-engine (Kokoro neural TTS) if espeak-ng is not - * installed, and disables announcements if neither is available. + * The generated WAV is left on disk for Rust's audio_player / start_interaction + * to play during calls. No encoding or RTP playback happens in TypeScript. */ import { execSync } from 'node:child_process'; import fs from 'node:fs'; import path from 'node:path'; -import { Buffer } from 'node:buffer'; -import { encodePcm, isCodecReady } from './opusbridge.ts'; - -/** RTP clock increment per 20ms frame for each codec. */ -function rtpClockIncrement(pt: number): number { - if (pt === 111) return 960; - if (pt === 9) return 160; - return 160; -} - -/** Build a fresh RTP header. */ -function buildRtpHeader(pt: number, seq: number, ts: number, ssrc: number, marker: boolean): Buffer { - const hdr = Buffer.alloc(12); - hdr[0] = 0x80; - hdr[1] = (marker ? 0x80 : 0) | (pt & 0x7f); - hdr.writeUInt16BE(seq & 0xffff, 2); - hdr.writeUInt32BE(ts >>> 0, 4); - hdr.writeUInt32BE(ssrc >>> 0, 8); - return hdr; -} - -// --------------------------------------------------------------------------- -// Types -// --------------------------------------------------------------------------- - -/** A pre-encoded announcement ready for RTP playback. */ -export interface IAnnouncementCache { - /** G.722 encoded frames (each is a 20ms frame payload, no RTP header). */ - g722Frames: Buffer[]; - /** Opus encoded frames for WebRTC playback. */ - opusFrames: Buffer[]; - /** Total duration in milliseconds. */ - durationMs: number; -} +import { sendProxyCommand, isProxyReady } from './proxybridge.ts'; // --------------------------------------------------------------------------- // State // --------------------------------------------------------------------------- -let cachedAnnouncement: IAnnouncementCache | null = null; - const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts'); const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now."; const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav'); @@ -64,12 +27,10 @@ const KOKORO_VOICES = 'voices.bin'; const KOKORO_VOICE = 'af_bella'; // --------------------------------------------------------------------------- -// Initialization +// TTS generators // --------------------------------------------------------------------------- -/** - * Check if espeak-ng is available on the system. - */ +/** Check if espeak-ng is available on the system. */ function isEspeakAvailable(): boolean { try { execSync('which espeak-ng', { stdio: 'pipe' }); @@ -79,10 +40,7 @@ function isEspeakAvailable(): boolean { } } -/** - * Generate announcement WAV via espeak-ng (primary engine). - * Returns true on success. - */ +/** Generate announcement WAV via espeak-ng (primary engine). */ function generateViaEspeak(wavPath: string, text: string, log: (msg: string) => void): boolean { log('[tts] generating announcement audio via espeak-ng...'); try { @@ -98,11 +56,8 @@ function generateViaEspeak(wavPath: string, text: string, log: (msg: string) => } } -/** - * Generate announcement WAV via Kokoro TTS (fallback engine). - * Returns true on success. - */ -function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): boolean { +/** Generate announcement WAV via Kokoro TTS (fallback, runs inside proxy-engine). */ +async function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): Promise { const modelPath = path.join(TTS_DIR, KOKORO_MODEL); const voicesPath = path.join(TTS_DIR, KOKORO_VOICES); @@ -111,25 +66,21 @@ function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => return false; } - const root = process.cwd(); - const ttsBinPaths = [ - path.join(root, 'dist_rust', 'tts-engine'), - path.join(root, 'rust', 'target', 'release', 'tts-engine'), - path.join(root, 'rust', 'target', 'debug', 'tts-engine'), - ]; - const ttsBin = ttsBinPaths.find((p) => fs.existsSync(p)); - if (!ttsBin) { - log('[tts] tts-engine binary not found — Kokoro fallback unavailable'); + if (!isProxyReady()) { + log('[tts] proxy-engine not ready — Kokoro fallback unavailable'); return false; } log('[tts] generating announcement audio via Kokoro TTS (fallback)...'); try { - execSync( - `"${ttsBin}" --model "${modelPath}" --voices "${voicesPath}" --voice "${KOKORO_VOICE}" --output "${wavPath}" --text "${text}"`, - { timeout: 120000, stdio: 'pipe' }, - ); - log('[tts] Kokoro WAV generated'); + await sendProxyCommand('generate_tts', { + model: modelPath, + voices: voicesPath, + voice: KOKORO_VOICE, + text, + output: wavPath, + }); + log('[tts] Kokoro WAV generated (via proxy-engine)'); return true; } catch (e: any) { log(`[tts] Kokoro failed: ${e.message}`); @@ -137,40 +88,13 @@ function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => } } -/** - * Read a WAV file and detect its sample rate from the fmt chunk. - * Returns { pcm, sampleRate } or null on failure. - */ -function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null { - const wav = fs.readFileSync(wavPath); - if (wav.length < 44) return null; - if (wav.toString('ascii', 0, 4) !== 'RIFF') return null; - if (wav.toString('ascii', 8, 12) !== 'WAVE') return null; - - let sampleRate = 22050; // default - let offset = 12; - let pcm: Buffer | null = null; - - while (offset < wav.length - 8) { - const chunkId = wav.toString('ascii', offset, offset + 4); - const chunkSize = wav.readUInt32LE(offset + 4); - if (chunkId === 'fmt ') { - sampleRate = wav.readUInt32LE(offset + 12); - } - if (chunkId === 'data') { - pcm = wav.subarray(offset + 8, offset + 8 + chunkSize); - } - offset += 8 + chunkSize; - if (offset % 2 !== 0) offset++; - } - - if (!pcm) return null; - return { pcm, sampleRate }; -} +// --------------------------------------------------------------------------- +// Initialization +// --------------------------------------------------------------------------- /** - * Pre-generate the announcement audio and encode to G.722 + Opus frames. - * Must be called after the codec bridge is initialized. + * Pre-generate the announcement WAV file. + * Must be called after the proxy engine is initialized. * * Engine priority: espeak-ng → Kokoro → disabled. */ @@ -178,7 +102,6 @@ export async function initAnnouncement(log: (msg: string) => void): Promise void): Promise void): Promise void): Promise void, - ssrc: number, - onDone?: () => void, -): (() => void) | null { - if (!cachedAnnouncement || cachedAnnouncement.g722Frames.length === 0) { - onDone?.(); - return null; - } - - const frames = cachedAnnouncement.g722Frames; - const PT = 9; // G.722 - let frameIdx = 0; - let seq = Math.floor(Math.random() * 0xffff); - let rtpTs = Math.floor(Math.random() * 0xffffffff); - - const timer = setInterval(() => { - if (frameIdx >= frames.length) { - clearInterval(timer); - onDone?.(); - return; - } - - const payload = frames[frameIdx]; - const hdr = buildRtpHeader(PT, seq & 0xffff, rtpTs >>> 0, ssrc >>> 0, frameIdx === 0); - const pkt = Buffer.concat([hdr, payload]); - sendPacket(pkt); - - seq++; - rtpTs += rtpClockIncrement(PT); - frameIdx++; - }, 20); - - // Return cancel function. - return () => clearInterval(timer); +/** Get the path to the cached announcement WAV, or null if not generated. */ +export function getAnnouncementWavPath(): string | null { + return fs.existsSync(CACHE_WAV) ? CACHE_WAV : null; } - -/** - * Play pre-cached Opus announcement to a WebRTC PeerConnection sender. - * - * @param sendRtpPacket - function to send a raw RTP packet via sender.sendRtp() - * @param ssrc - SSRC to use in RTP headers - * @param onDone - called when announcement finishes - * @returns cancel function, or null if no announcement cached - */ -export function playAnnouncementToWebRtc( - sendRtpPacket: (pkt: Buffer) => void, - ssrc: number, - counters: { seq: number; ts: number }, - onDone?: () => void, -): (() => void) | null { - if (!cachedAnnouncement || cachedAnnouncement.opusFrames.length === 0) { - onDone?.(); - return null; - } - - const frames = cachedAnnouncement.opusFrames; - const PT = 111; // Opus - let frameIdx = 0; - - const timer = setInterval(() => { - if (frameIdx >= frames.length) { - clearInterval(timer); - onDone?.(); - return; - } - - const payload = frames[frameIdx]; - const hdr = buildRtpHeader(PT, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, frameIdx === 0); - const pkt = Buffer.concat([hdr, payload]); - sendRtpPacket(pkt); - - counters.seq++; - counters.ts += 960; // Opus at 48kHz: 960 samples per 20ms - frameIdx++; - }, 20); - - return () => clearInterval(timer); -} - -/** Check if an announcement is cached and ready. */ -export function isAnnouncementReady(): boolean { - return cachedAnnouncement !== null && cachedAnnouncement.g722Frames.length > 0; -} - diff --git a/ts/call/prompt-cache.ts b/ts/call/prompt-cache.ts index 9d0d499..4d9dfa2 100644 --- a/ts/call/prompt-cache.ts +++ b/ts/call/prompt-cache.ts @@ -1,55 +1,31 @@ /** - * PromptCache — manages multiple named audio prompts for IVR and voicemail. + * PromptCache — manages named audio prompt WAV files for IVR and voicemail. * - * Each prompt is pre-encoded as both G.722 frames (for SIP legs) and Opus - * frames (for WebRTC legs), ready for 20ms RTP playback. + * Generates WAV files via espeak-ng (primary) or Kokoro TTS through the + * proxy-engine (fallback). Also supports loading pre-existing WAV files + * and programmatic tone generation. * - * Supports three sources: - * 1. TTS generation via espeak-ng (primary) or Kokoro (fallback) - * 2. Loading from a pre-existing WAV file - * 3. Programmatic tone generation (beep, etc.) - * - * The existing announcement.ts system continues to work independently; - * this module provides generalized prompt management for IVR/voicemail. + * All audio playback happens in Rust (audio_player / start_interaction). + * This module only manages WAV files on disk. */ import { execSync } from 'node:child_process'; import fs from 'node:fs'; import path from 'node:path'; import { Buffer } from 'node:buffer'; -import { encodePcm, isCodecReady } from '../opusbridge.ts'; - -/** RTP clock increment per 20ms frame for each codec. */ -function rtpClockIncrement(pt: number): number { - if (pt === 111) return 960; - if (pt === 9) return 160; - return 160; -} - -/** Build a fresh RTP header. */ -function buildRtpHeader(pt: number, seq: number, ts: number, ssrc: number, marker: boolean): Buffer { - const hdr = Buffer.alloc(12); - hdr[0] = 0x80; - hdr[1] = (marker ? 0x80 : 0) | (pt & 0x7f); - hdr.writeUInt16BE(seq & 0xffff, 2); - hdr.writeUInt32BE(ts >>> 0, 4); - hdr.writeUInt32BE(ssrc >>> 0, 8); - return hdr; -} +import { sendProxyCommand, isProxyReady } from '../proxybridge.ts'; // --------------------------------------------------------------------------- // Types // --------------------------------------------------------------------------- -/** A pre-encoded prompt ready for RTP playback. */ +/** A cached prompt — just a WAV file path and metadata. */ export interface ICachedPrompt { /** Unique prompt identifier. */ id: string; - /** G.722 encoded frames (20ms each, no RTP header). */ - g722Frames: Buffer[]; - /** Opus encoded frames (20ms each, no RTP header). */ - opusFrames: Buffer[]; - /** Total duration in milliseconds. */ + /** Path to the WAV file on disk. */ + wavPath: string; + /** Total duration in milliseconds (approximate, from WAV header). */ durationMs: number; } @@ -82,84 +58,61 @@ function generateViaEspeak(wavPath: string, text: string): boolean { } } -/** Generate WAV via Kokoro TTS. */ -function generateViaKokoro(wavPath: string, text: string, voice: string): boolean { +/** Generate WAV via Kokoro TTS (runs inside proxy-engine). */ +async function generateViaKokoro(wavPath: string, text: string, voice: string): Promise { const modelPath = path.join(TTS_DIR, 'kokoro-v1.0.onnx'); const voicesPath = path.join(TTS_DIR, 'voices.bin'); if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) return false; - - const root = process.cwd(); - const ttsBin = [ - path.join(root, 'dist_rust', 'tts-engine'), - path.join(root, 'rust', 'target', 'release', 'tts-engine'), - path.join(root, 'rust', 'target', 'debug', 'tts-engine'), - ].find((p) => fs.existsSync(p)); - if (!ttsBin) return false; + if (!isProxyReady()) return false; try { - execSync( - `"${ttsBin}" --model "${modelPath}" --voices "${voicesPath}" --voice "${voice}" --output "${wavPath}" --text "${text}"`, - { timeout: 120000, stdio: 'pipe' }, - ); + await sendProxyCommand('generate_tts', { + model: modelPath, + voices: voicesPath, + voice, + text, + output: wavPath, + }); return true; } catch { return false; } } -/** Read a WAV file and return raw PCM + sample rate. */ -function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null { - const wav = fs.readFileSync(wavPath); - if (wav.length < 44) return null; - if (wav.toString('ascii', 0, 4) !== 'RIFF') return null; - if (wav.toString('ascii', 8, 12) !== 'WAVE') return null; +/** Read a WAV file's duration from its header. */ +function getWavDurationMs(wavPath: string): number { + try { + const wav = fs.readFileSync(wavPath); + if (wav.length < 44) return 0; + if (wav.toString('ascii', 0, 4) !== 'RIFF') return 0; - let sampleRate = 22050; - let pcm: Buffer | null = null; - let offset = 12; + let sampleRate = 16000; + let dataSize = 0; + let bitsPerSample = 16; + let channels = 1; + let offset = 12; - while (offset < wav.length - 8) { - const chunkId = wav.toString('ascii', offset, offset + 4); - const chunkSize = wav.readUInt32LE(offset + 4); - if (chunkId === 'fmt ') { - sampleRate = wav.readUInt32LE(offset + 12); + while (offset < wav.length - 8) { + const chunkId = wav.toString('ascii', offset, offset + 4); + const chunkSize = wav.readUInt32LE(offset + 4); + if (chunkId === 'fmt ') { + channels = wav.readUInt16LE(offset + 10); + sampleRate = wav.readUInt32LE(offset + 12); + bitsPerSample = wav.readUInt16LE(offset + 22); + } + if (chunkId === 'data') { + dataSize = chunkSize; + } + offset += 8 + chunkSize; + if (offset % 2 !== 0) offset++; } - if (chunkId === 'data') { - pcm = wav.subarray(offset + 8, offset + 8 + chunkSize); - } - offset += 8 + chunkSize; - if (offset % 2 !== 0) offset++; + + const bytesPerSample = (bitsPerSample / 8) * channels; + const totalSamples = bytesPerSample > 0 ? dataSize / bytesPerSample : 0; + return sampleRate > 0 ? Math.round((totalSamples / sampleRate) * 1000) : 0; + } catch { + return 0; } - - return pcm ? { pcm, sampleRate } : null; -} - -/** Encode raw PCM frames to G.722 + Opus. */ -async function encodePcmFrames( - pcm: Buffer, - sampleRate: number, - log: (msg: string) => void, -): Promise<{ g722Frames: Buffer[]; opusFrames: Buffer[] } | null> { - if (!isCodecReady()) return null; - - const frameSamples = Math.floor(sampleRate * 0.02); // 20ms - const frameBytes = frameSamples * 2; // 16-bit - const totalFrames = Math.floor(pcm.length / frameBytes); - - const g722Frames: Buffer[] = []; - const opusFrames: Buffer[] = []; - - for (let i = 0; i < totalFrames; i++) { - const framePcm = Buffer.from(pcm.subarray(i * frameBytes, (i + 1) * frameBytes)); - const [g722, opus] = await Promise.all([ - encodePcm(framePcm, sampleRate, 9), // G.722 - encodePcm(framePcm, sampleRate, 111), // Opus - ]); - if (g722) g722Frames.push(g722); - if (opus) opusFrames.push(opus); - } - - return { g722Frames, opusFrames }; } // --------------------------------------------------------------------------- @@ -195,7 +148,7 @@ export class PromptCache { } /** - * Generate a TTS prompt and cache it. + * Generate a TTS prompt WAV and cache its path. * Uses espeak-ng (primary) or Kokoro (fallback). */ async generatePrompt(id: string, text: string, voice = 'af_bella'): Promise { @@ -207,14 +160,14 @@ export class PromptCache { this.espeakAvailable = isEspeakAvailable(); } - // Generate WAV. - let generated = false; + // Generate WAV if not already on disk. if (!fs.existsSync(wavPath)) { + let generated = false; if (this.espeakAvailable) { generated = generateViaEspeak(wavPath, text); } if (!generated) { - generated = generateViaKokoro(wavPath, text, voice); + generated = await generateViaKokoro(wavPath, text, voice); } if (!generated) { this.log(`[prompt-cache] failed to generate TTS for "${id}"`); @@ -223,49 +176,22 @@ export class PromptCache { this.log(`[prompt-cache] generated WAV for "${id}"`); } - return this.loadWavPrompt(id, wavPath); + return this.registerWav(id, wavPath); } /** - * Load a WAV file as a prompt and cache it. + * Load a pre-existing WAV file as a prompt. */ async loadWavPrompt(id: string, wavPath: string): Promise { if (!fs.existsSync(wavPath)) { this.log(`[prompt-cache] WAV not found: ${wavPath}`); return null; } - - const result = readWavWithRate(wavPath); - if (!result) { - this.log(`[prompt-cache] failed to parse WAV: ${wavPath}`); - return null; - } - - const encoded = await encodePcmFrames(result.pcm, result.sampleRate, this.log); - if (!encoded) { - this.log(`[prompt-cache] encoding failed for "${id}" (codec bridge not ready?)`); - return null; - } - - const durationMs = encoded.g722Frames.length * 20; - const prompt: ICachedPrompt = { - id, - g722Frames: encoded.g722Frames, - opusFrames: encoded.opusFrames, - durationMs, - }; - - this.prompts.set(id, prompt); - this.log(`[prompt-cache] cached "${id}": ${encoded.g722Frames.length} frames (${(durationMs / 1000).toFixed(1)}s)`); - return prompt; + return this.registerWav(id, wavPath); } /** - * Generate a beep tone prompt (sine wave). - * @param id - prompt ID - * @param freqHz - tone frequency (default 1000 Hz) - * @param durationMs - tone duration (default 500ms) - * @param amplitude - 16-bit amplitude (default 8000) + * Generate a beep tone WAV and cache it. */ async generateBeep( id: string, @@ -273,149 +199,77 @@ export class PromptCache { durationMs = 500, amplitude = 8000, ): Promise { - // Generate at 16kHz for decent quality. - const sampleRate = 16000; - const totalSamples = Math.floor((sampleRate * durationMs) / 1000); - const pcm = Buffer.alloc(totalSamples * 2); + fs.mkdirSync(TTS_DIR, { recursive: true }); + const wavPath = path.join(TTS_DIR, `prompt-${id}.wav`); - for (let i = 0; i < totalSamples; i++) { - const t = i / sampleRate; - // Apply a short fade-in/fade-out to avoid click artifacts. - const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade - let envelope = 1.0; - if (i < fadeLen) envelope = i / fadeLen; - else if (i > totalSamples - fadeLen) envelope = (totalSamples - i) / fadeLen; + if (!fs.existsSync(wavPath)) { + // Generate 16kHz 16-bit mono sine wave WAV. + const sampleRate = 16000; + const totalSamples = Math.floor((sampleRate * durationMs) / 1000); + const pcm = Buffer.alloc(totalSamples * 2); - const sample = Math.round(Math.sin(2 * Math.PI * freqHz * t) * amplitude * envelope); - pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2); + for (let i = 0; i < totalSamples; i++) { + const t = i / sampleRate; + const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade + let envelope = 1.0; + if (i < fadeLen) envelope = i / fadeLen; + else if (i > totalSamples - fadeLen) envelope = (totalSamples - i) / fadeLen; + + const sample = Math.round(Math.sin(2 * Math.PI * freqHz * t) * amplitude * envelope); + pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2); + } + + // Write WAV file. + const headerSize = 44; + const dataSize = pcm.length; + const wav = Buffer.alloc(headerSize + dataSize); + + // RIFF header + wav.write('RIFF', 0); + wav.writeUInt32LE(36 + dataSize, 4); + wav.write('WAVE', 8); + + // fmt chunk + wav.write('fmt ', 12); + wav.writeUInt32LE(16, 16); // chunk size + wav.writeUInt16LE(1, 20); // PCM format + wav.writeUInt16LE(1, 22); // mono + wav.writeUInt32LE(sampleRate, 24); + wav.writeUInt32LE(sampleRate * 2, 28); // byte rate + wav.writeUInt16LE(2, 32); // block align + wav.writeUInt16LE(16, 34); // bits per sample + + // data chunk + wav.write('data', 36); + wav.writeUInt32LE(dataSize, 40); + pcm.copy(wav, 44); + + fs.writeFileSync(wavPath, wav); + this.log(`[prompt-cache] beep WAV generated for "${id}"`); } - const encoded = await encodePcmFrames(pcm, sampleRate, this.log); - if (!encoded) { - this.log(`[prompt-cache] beep encoding failed for "${id}"`); - return null; - } - - const actualDuration = encoded.g722Frames.length * 20; - const prompt: ICachedPrompt = { - id, - g722Frames: encoded.g722Frames, - opusFrames: encoded.opusFrames, - durationMs: actualDuration, - }; - - this.prompts.set(id, prompt); - this.log(`[prompt-cache] beep "${id}" cached: ${actualDuration}ms @ ${freqHz}Hz`); - return prompt; + return this.registerWav(id, wavPath); } - /** - * Remove a prompt from the cache. - */ + /** Remove a prompt from the cache. */ remove(id: string): void { this.prompts.delete(id); } - /** - * Clear all cached prompts. - */ + /** Clear all cached prompts. */ clear(): void { this.prompts.clear(); } -} -// --------------------------------------------------------------------------- -// Standalone playback helpers (for use by SystemLeg) -// --------------------------------------------------------------------------- + // ------------------------------------------------------------------------- + // Internal + // ------------------------------------------------------------------------- -/** - * Play a cached prompt's G.722 frames as RTP packets at 20ms intervals. - * - * @param prompt - the cached prompt to play - * @param sendPacket - function to send a raw RTP packet (12-byte header + payload) - * @param ssrc - SSRC for RTP headers - * @param onDone - called when playback finishes - * @returns cancel function, or null if prompt has no G.722 frames - */ -export function playPromptG722( - prompt: ICachedPrompt, - sendPacket: (pkt: Buffer) => void, - ssrc: number, - onDone?: () => void, -): (() => void) | null { - if (prompt.g722Frames.length === 0) { - onDone?.(); - return null; + private registerWav(id: string, wavPath: string): ICachedPrompt { + const durationMs = getWavDurationMs(wavPath); + const prompt: ICachedPrompt = { id, wavPath, durationMs }; + this.prompts.set(id, prompt); + this.log(`[prompt-cache] cached "${id}": ${wavPath} (${(durationMs / 1000).toFixed(1)}s)`); + return prompt; } - - const frames = prompt.g722Frames; - const PT = 9; - let frameIdx = 0; - let seq = Math.floor(Math.random() * 0xffff); - let rtpTs = Math.floor(Math.random() * 0xffffffff); - - const timer = setInterval(() => { - if (frameIdx >= frames.length) { - clearInterval(timer); - onDone?.(); - return; - } - - const payload = frames[frameIdx]; - const hdr = buildRtpHeader(PT, seq & 0xffff, rtpTs >>> 0, ssrc >>> 0, frameIdx === 0); - const pkt = Buffer.concat([hdr, payload]); - sendPacket(pkt); - - seq++; - rtpTs += rtpClockIncrement(PT); - frameIdx++; - }, 20); - - return () => clearInterval(timer); -} - -/** - * Play a cached prompt's Opus frames as RTP packets at 20ms intervals. - * - * @param prompt - the cached prompt to play - * @param sendPacket - function to send a raw RTP packet - * @param ssrc - SSRC for RTP headers - * @param counters - shared seq/ts counters (mutated in place for seamless transitions) - * @param onDone - called when playback finishes - * @returns cancel function, or null if prompt has no Opus frames - */ -export function playPromptOpus( - prompt: ICachedPrompt, - sendPacket: (pkt: Buffer) => void, - ssrc: number, - counters: { seq: number; ts: number }, - onDone?: () => void, -): (() => void) | null { - if (prompt.opusFrames.length === 0) { - onDone?.(); - return null; - } - - const frames = prompt.opusFrames; - const PT = 111; - let frameIdx = 0; - - const timer = setInterval(() => { - if (frameIdx >= frames.length) { - clearInterval(timer); - onDone?.(); - return; - } - - const payload = frames[frameIdx]; - const hdr = buildRtpHeader(PT, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, frameIdx === 0); - const pkt = Buffer.concat([hdr, payload]); - sendPacket(pkt); - - counters.seq++; - counters.ts += 960; // Opus 48kHz: 960 samples per 20ms - frameIdx++; - }, 20); - - return () => clearInterval(timer); } diff --git a/ts/opusbridge.ts b/ts/opusbridge.ts deleted file mode 100644 index 3a834a4..0000000 --- a/ts/opusbridge.ts +++ /dev/null @@ -1,199 +0,0 @@ -/** - * Audio transcoding bridge — uses smartrust to communicate with the Rust - * opus-codec binary, which handles Opus ↔ G.722 ↔ PCMU/PCMA transcoding. - * - * All codec conversion happens in Rust (libopus + SpanDSP G.722 port). - * The TypeScript side just passes raw payloads back and forth. - */ - -import path from 'node:path'; -import { RustBridge } from '@push.rocks/smartrust'; - -// --------------------------------------------------------------------------- -// Command type map for smartrust -// --------------------------------------------------------------------------- - -type TCodecCommands = { - init: { - params: Record; - result: Record; - }; - create_session: { - params: { session_id: string }; - result: Record; - }; - destroy_session: { - params: { session_id: string }; - result: Record; - }; - transcode: { - params: { data_b64: string; from_pt: number; to_pt: number; session_id?: string; direction?: string }; - result: { data_b64: string }; - }; - encode_pcm: { - params: { data_b64: string; sample_rate: number; to_pt: number; session_id?: string }; - result: { data_b64: string }; - }; -}; - -// --------------------------------------------------------------------------- -// Bridge singleton -// --------------------------------------------------------------------------- - -let bridge: RustBridge | null = null; -let initialized = false; - -function buildLocalPaths(): string[] { - const root = process.cwd(); - return [ - path.join(root, 'dist_rust', 'opus-codec'), - path.join(root, 'rust', 'target', 'release', 'opus-codec'), - path.join(root, 'rust', 'target', 'debug', 'opus-codec'), - ]; -} - -let logFn: ((msg: string) => void) | undefined; - -/** - * Initialize the audio transcoding bridge. Spawns the Rust binary. - */ -export async function initCodecBridge(log?: (msg: string) => void): Promise { - if (initialized && bridge) return true; - logFn = log; - - try { - bridge = new RustBridge({ - binaryName: 'opus-codec', - localPaths: buildLocalPaths(), - }); - - const spawned = await bridge.spawn(); - if (!spawned) { - log?.('[codec] failed to spawn opus-codec binary'); - bridge = null; - return false; - } - - // Auto-restart: reset state when the Rust process exits so the next - // transcode attempt triggers re-initialization instead of silent failure. - bridge.on('exit', () => { - logFn?.('[codec] Rust audio transcoder process exited — will re-init on next use'); - bridge = null; - initialized = false; - }); - - await bridge.sendCommand('init', {} as any); - initialized = true; - log?.('[codec] Rust audio transcoder initialized (Opus + G.722 + PCMU/PCMA)'); - return true; - } catch (e: any) { - log?.(`[codec] init error: ${e.message}`); - bridge = null; - return false; - } -} - -// --------------------------------------------------------------------------- -// Session management — per-call codec isolation -// --------------------------------------------------------------------------- - -/** - * Create an isolated codec session. Each session gets its own Opus/G.722 - * encoder/decoder state, preventing concurrent calls from corrupting each - * other's stateful codec predictions. - */ -export async function createSession(sessionId: string): Promise { - if (!bridge || !initialized) { - // Attempt auto-reinit if bridge died. - const ok = await initCodecBridge(logFn); - if (!ok) return false; - } - try { - await bridge!.sendCommand('create_session', { session_id: sessionId }); - return true; - } catch (e: any) { - logFn?.(`[codec] create_session error: ${e?.message || e}`); - return false; - } -} - -/** - * Destroy a codec session, freeing its encoder/decoder state. - */ -export async function destroySession(sessionId: string): Promise { - if (!bridge || !initialized) return; - try { - await bridge.sendCommand('destroy_session', { session_id: sessionId }); - } catch { - // Best-effort cleanup. - } -} - -// --------------------------------------------------------------------------- -// Transcoding -// --------------------------------------------------------------------------- - -/** - * Transcode an RTP payload between two codecs. - * All codec work (Opus, G.722, PCMU, PCMA) + resampling happens in Rust. - * - * @param data - raw RTP payload (no header) - * @param fromPT - source payload type (0=PCMU, 8=PCMA, 9=G.722, 111=Opus) - * @param toPT - target payload type - * @param sessionId - optional session for isolated codec state - * @returns transcoded payload, or null on failure - */ -export async function transcode(data: Buffer, fromPT: number, toPT: number, sessionId?: string, direction?: string): Promise { - if (!bridge || !initialized) return null; - try { - const params: any = { - data_b64: data.toString('base64'), - from_pt: fromPT, - to_pt: toPT, - }; - if (sessionId) params.session_id = sessionId; - if (direction) params.direction = direction; - const result = await bridge.sendCommand('transcode', params); - return Buffer.from(result.data_b64, 'base64'); - } catch { - return null; - } -} - -/** - * Encode raw 16-bit PCM to a target codec. - * @param pcmData - raw 16-bit LE PCM bytes - * @param sampleRate - input sample rate (e.g. 22050 for Piper TTS) - * @param toPT - target payload type (9=G.722, 111=Opus, 0=PCMU, 8=PCMA) - * @param sessionId - optional session for isolated codec state - */ -export async function encodePcm(pcmData: Buffer, sampleRate: number, toPT: number, sessionId?: string): Promise { - if (!bridge || !initialized) return null; - try { - const params: any = { - data_b64: pcmData.toString('base64'), - sample_rate: sampleRate, - to_pt: toPT, - }; - if (sessionId) params.session_id = sessionId; - const result = await bridge.sendCommand('encode_pcm', params); - return Buffer.from(result.data_b64, 'base64'); - } catch (e: any) { - console.error('[encodePcm] error:', e?.message || e); - return null; - } -} - -/** Check if the codec bridge is ready. */ -export function isCodecReady(): boolean { - return initialized && bridge !== null; -} - -/** Shut down the codec bridge. */ -export function shutdownCodecBridge(): void { - if (bridge) { - try { bridge.kill(); } catch { /* ignore */ } - bridge = null; - initialized = false; - } -} diff --git a/ts/proxybridge.ts b/ts/proxybridge.ts index eb4fcfc..54d3105 100644 --- a/ts/proxybridge.ts +++ b/ts/proxybridge.ts @@ -79,6 +79,10 @@ type TProxyCommands = { params: { call_id: string; leg_id: string; key: string; value: unknown }; result: Record; }; + generate_tts: { + params: { model: string; voices: string; voice: string; text: string; output: string }; + result: { output: string }; + }; }; // --------------------------------------------------------------------------- @@ -493,6 +497,15 @@ export function isProxyReady(): boolean { return initialized && bridge !== null; } +/** Send an arbitrary command to the proxy engine bridge. */ +export async function sendProxyCommand( + method: K, + params: TProxyCommands[K]['params'], +): Promise { + if (!bridge || !initialized) throw new Error('proxy engine not initialized'); + return bridge.sendCommand(method as string, params as any) as any; +} + /** Shut down the proxy engine. */ export function shutdownProxyEngine(): void { if (bridge) { diff --git a/ts/sipproxy.ts b/ts/sipproxy.ts index 01387dd..02612ec 100644 --- a/ts/sipproxy.ts +++ b/ts/sipproxy.ts @@ -24,7 +24,6 @@ import { getAllBrowserDeviceIds, getBrowserDeviceWs, } from './webrtcbridge.ts'; -import { initCodecBridge } from './opusbridge.ts'; import { initAnnouncement } from './announcement.ts'; import { PromptCache } from './call/prompt-cache.ts'; import { VoiceboxManager } from './voicebox.ts'; @@ -523,9 +522,8 @@ async function startProxyEngine(): Promise { const deviceList = appConfig.devices.map((d) => d.displayName).join(', '); log(`proxy engine started | LAN ${appConfig.proxy.lanIp}:${appConfig.proxy.lanPort} | providers: ${providerList} | devices: ${deviceList}`); - // Initialize audio codec bridge (still needed for WebRTC transcoding). + // Generate TTS audio (WAV files on disk, played by Rust audio_player). try { - await initCodecBridge(log); await initAnnouncement(log); // Pre-generate prompts. @@ -547,7 +545,7 @@ async function startProxyEngine(): Promise { } log(`[startup] prompts cached: ${promptCache.listIds().join(', ') || 'none'}`); } catch (e) { - log(`[codec] init failed: ${e}`); + log(`[tts] init failed: ${e}`); } } diff --git a/ts_web/00_commitinfo_data.ts b/ts_web/00_commitinfo_data.ts index 207d2eb..8528a1f 100644 --- a/ts_web/00_commitinfo_data.ts +++ b/ts_web/00_commitinfo_data.ts @@ -3,6 +3,6 @@ */ export const commitinfo = { name: 'siprouter', - version: '1.15.0', + version: '1.16.0', description: 'undefined' }