feat(proxy-engine): integrate Kokoro TTS generation into proxy-engine and simplify TypeScript prompt handling to use cached WAV files

2026-04-10 15:21:44 +00:00
parent c9ae747c95
commit 66112091a2
18 changed files with 340 additions and 1202 deletions
@@ -1,5 +1,12 @@
 # Changelog

+## 2026-04-10 - 1.16.0 - feat(proxy-engine)
+integrate Kokoro TTS generation into proxy-engine and simplify TypeScript prompt handling to use cached WAV files
+
+- adds a generate_tts command to proxy-engine with lazy-loaded Kokoro model support and WAV output generation
+- removes standalone opus-codec and tts-engine workspace binaries by consolidating TTS generation into proxy-engine
+- updates announcement and prompt cache flows to generate and cache WAV files on disk instead of pre-encoding RTP frames in TypeScript
+
 ## 2026-04-10 - 1.15.0 - feat(proxy-engine)
 add device leg, leg transfer, and leg replacement call controls

@@ -1881,16 +1881,6 @@ dependencies = [
 "vcpkg",
 ]

-[[package]]
-name = "opus-codec"
-version = "0.2.0"
-dependencies = [
- "base64 0.22.1",
- "codec-lib",
- "serde",
- "serde_json",
-]
-
 [[package]]
 name = "ort"
 version = "2.0.0-rc.11"
@@ -2188,6 +2178,8 @@ dependencies = [
 "base64 0.22.1",
 "codec-lib",
 "hound",
+ "kokoro-tts",
+ "ort",
 "rand 0.8.5",
 "regex-lite",
 "serde",
@@ -3008,16 +3000,6 @@ dependencies = [
 "strength_reduce",
 ]

-[[package]]
-name = "tts-engine"
-version = "0.1.0"
-dependencies = [
- "hound",
- "kokoro-tts",
- "ort",
- "tokio",
-]
-
 [[package]]
 name = "turn"
 version = "0.6.1"
@@ -1,8 +1,6 @@
 [workspace]
 members = [
    "crates/codec-lib",
-    "crates/opus-codec",
-    "crates/tts-engine",
    "crates/sip-proto",
    "crates/proxy-engine",
 ]
@@ -1,7 +1,7 @@
 //! Audio codec library for the SIP router.
 //!
 //! Handles Opus ↔ G.722 ↔ PCMU/PCMA transcoding with ML noise suppression.
-//! Used by both the standalone `opus-codec` CLI and the `proxy-engine` binary.
+//! Used by the `proxy-engine` binary for all audio transcoding.

 use audiopus::coder::{Decoder as OpusDecoder, Encoder as OpusEncoder};
 use audiopus::packet::Packet as OpusPacket;
@@ -1,14 +0,0 @@
-[package]
-name = "opus-codec"
-version = "0.2.0"
-edition = "2021"
-
-[[bin]]
-name = "opus-codec"
-path = "src/main.rs"
-
-[dependencies]
-codec-lib = { path = "../codec-lib" }
-serde = { version = "1", features = ["derive"] }
-serde_json = "1"
-base64 = "0.22"
@@ -1,286 +0,0 @@
-/// Audio transcoding bridge for smartrust.
-///
-/// Thin CLI wrapper around `codec-lib`. Handles Opus ↔ G.722 ↔ PCMU transcoding.
-///
-/// Protocol:
-///   -> {"id":"1","method":"init","params":{}}
-///   <- {"id":"1","success":true,"result":{}}
-///   -> {"id":"2","method":"create_session","params":{"session_id":"call-abc"}}
-///   <- {"id":"2","success":true,"result":{}}
-///   -> {"id":"3","method":"transcode","params":{"session_id":"call-abc","data_b64":"...","from_pt":111,"to_pt":9}}
-///   <- {"id":"3","success":true,"result":{"data_b64":"..."}}
-///   -> {"id":"4","method":"destroy_session","params":{"session_id":"call-abc"}}
-///   <- {"id":"4","success":true,"result":{}}
-
-use base64::engine::general_purpose::STANDARD as B64;
-use base64::Engine as _;
-use codec_lib::{codec_sample_rate, TranscodeState};
-use serde::Deserialize;
-use std::collections::HashMap;
-use std::io::{self, BufRead, Write};
-
-#[derive(Deserialize)]
-struct Request {
-    id: String,
-    method: String,
-    #[serde(default)]
-    params: serde_json::Value,
-}
-
-fn respond(
-    out: &mut impl Write,
-    id: &str,
-    success: bool,
-    result: Option<serde_json::Value>,
-    error: Option<&str>,
-) {
-    let mut resp = serde_json::json!({ "id": id, "success": success });
-    if let Some(r) = result {
-        resp["result"] = r;
-    }
-    if let Some(e) = error {
-        resp["error"] = serde_json::Value::String(e.to_string());
-    }
-    let _ = writeln!(out, "{}", resp);
-    let _ = out.flush();
-}
-
-/// Resolve a session: if session_id is provided, look it up in the sessions map;
-/// otherwise fall back to the default state (backward compat with `init`).
-fn get_session<'a>(
-    sessions: &'a mut HashMap<String, TranscodeState>,
-    default: &'a mut Option<TranscodeState>,
-    params: &serde_json::Value,
-) -> Option<&'a mut TranscodeState> {
-    if let Some(sid) = params.get("session_id").and_then(|v| v.as_str()) {
-        sessions.get_mut(sid)
-    } else {
-        default.as_mut()
-    }
-}
-
-fn main() {
-    let stdin = io::stdin();
-    let stdout = io::stdout();
-    let mut out = io::BufWriter::new(stdout.lock());
-
-    let _ = writeln!(out, r#"{{"event":"ready","data":{{}}}}"#);
-    let _ = out.flush();
-
-    let mut default_state: Option<TranscodeState> = None;
-    let mut sessions: HashMap<String, TranscodeState> = HashMap::new();
-
-    for line in stdin.lock().lines() {
-        let line = match line {
-            Ok(l) if !l.trim().is_empty() => l,
-            Ok(_) => continue,
-            Err(_) => break,
-        };
-
-        let req: Request = match serde_json::from_str(&line) {
-            Ok(r) => r,
-            Err(e) => {
-                respond(&mut out, "", false, None, Some(&format!("parse: {e}")));
-                continue;
-            }
-        };
-
-        match req.method.as_str() {
-            "init" => match TranscodeState::new() {
-                Ok(s) => {
-                    default_state = Some(s);
-                    respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
-                }
-                Err(e) => respond(&mut out, &req.id, false, None, Some(&e)),
-            },
-
-            "create_session" => {
-                let session_id = match req.params.get("session_id").and_then(|v| v.as_str()) {
-                    Some(s) => s.to_string(),
-                    None => {
-                        respond(&mut out, &req.id, false, None, Some("missing session_id"));
-                        continue;
-                    }
-                };
-                if sessions.contains_key(&session_id) {
-                    respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
-                    continue;
-                }
-                match TranscodeState::new() {
-                    Ok(s) => {
-                        sessions.insert(session_id, s);
-                        respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
-                    }
-                    Err(e) => respond(&mut out, &req.id, false, None, Some(&e)),
-                }
-            }
-
-            "destroy_session" => {
-                let session_id = match req.params.get("session_id").and_then(|v| v.as_str()) {
-                    Some(s) => s,
-                    None => {
-                        respond(&mut out, &req.id, false, None, Some("missing session_id"));
-                        continue;
-                    }
-                };
-                sessions.remove(session_id);
-                respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
-            }
-
-            "transcode" => {
-                let st = match get_session(&mut sessions, &mut default_state, &req.params) {
-                    Some(s) => s,
-                    None => {
-                        respond(
-                            &mut out,
-                            &req.id,
-                            false,
-                            None,
-                            Some("not initialized (no session or default state)"),
-                        );
-                        continue;
-                    }
-                };
-                let data_b64 = match req.params.get("data_b64").and_then(|v| v.as_str()) {
-                    Some(s) => s,
-                    None => {
-                        respond(&mut out, &req.id, false, None, Some("missing data_b64"));
-                        continue;
-                    }
-                };
-                let from_pt =
-                    req.params.get("from_pt").and_then(|v| v.as_u64()).unwrap_or(0) as u8;
-                let to_pt = req.params.get("to_pt").and_then(|v| v.as_u64()).unwrap_or(0) as u8;
-                let direction = req.params.get("direction").and_then(|v| v.as_str());
-
-                let data = match B64.decode(data_b64) {
-                    Ok(b) => b,
-                    Err(e) => {
-                        respond(
-                            &mut out,
-                            &req.id,
-                            false,
-                            None,
-                            Some(&format!("b64: {e}")),
-                        );
-                        continue;
-                    }
-                };
-
-                match st.transcode(&data, from_pt, to_pt, direction) {
-                    Ok(result) => {
-                        respond(
-                            &mut out,
-                            &req.id,
-                            true,
-                            Some(serde_json::json!({ "data_b64": B64.encode(&result) })),
-                            None,
-                        );
-                    }
-                    Err(e) => respond(&mut out, &req.id, false, None, Some(&e)),
-                }
-            }
-
-            "encode_pcm" => {
-                let st = match get_session(&mut sessions, &mut default_state, &req.params) {
-                    Some(s) => s,
-                    None => {
-                        respond(
-                            &mut out,
-                            &req.id,
-                            false,
-                            None,
-                            Some("not initialized (no session or default state)"),
-                        );
-                        continue;
-                    }
-                };
-                let data_b64 = match req.params.get("data_b64").and_then(|v| v.as_str()) {
-                    Some(s) => s,
-                    None => {
-                        respond(&mut out, &req.id, false, None, Some("missing data_b64"));
-                        continue;
-                    }
-                };
-                let sample_rate = req
-                    .params
-                    .get("sample_rate")
-                    .and_then(|v| v.as_u64())
-                    .unwrap_or(22050) as u32;
-                let to_pt = req.params.get("to_pt").and_then(|v| v.as_u64()).unwrap_or(9) as u8;
-
-                let data = match B64.decode(data_b64) {
-                    Ok(b) => b,
-                    Err(e) => {
-                        respond(
-                            &mut out,
-                            &req.id,
-                            false,
-                            None,
-                            Some(&format!("b64: {e}")),
-                        );
-                        continue;
-                    }
-                };
-
-                if data.len() % 2 != 0 {
-                    respond(
-                        &mut out,
-                        &req.id,
-                        false,
-                        None,
-                        Some("PCM data has odd byte count (expected 16-bit LE samples)"),
-                    );
-                    continue;
-                }
-
-                let pcm: Vec<i16> = data
-                    .chunks_exact(2)
-                    .map(|c| i16::from_le_bytes([c[0], c[1]]))
-                    .collect();
-
-                let target_rate = codec_sample_rate(to_pt);
-                let resampled = match st.resample(&pcm, sample_rate, target_rate) {
-                    Ok(r) => r,
-                    Err(e) => {
-                        respond(&mut out, &req.id, false, None, Some(&e));
-                        continue;
-                    }
-                };
-
-                match st.encode_from_pcm(&resampled, to_pt) {
-                    Ok(encoded) => {
-                        respond(
-                            &mut out,
-                            &req.id,
-                            true,
-                            Some(serde_json::json!({ "data_b64": B64.encode(&encoded) })),
-                            None,
-                        );
-                    }
-                    Err(e) => {
-                        respond(&mut out, &req.id, false, None, Some(&e));
-                    }
-                }
-            }
-
-            "encode" | "decode" => {
-                respond(
-                    &mut out,
-                    &req.id,
-                    false,
-                    None,
-                    Some("use 'transcode' command instead"),
-                );
-            }
-
-            _ => respond(
-                &mut out,
-                &req.id,
-                false,
-                None,
-                Some(&format!("unknown: {}", req.method)),
-            ),
-        }
-    }
-}
@@ -18,3 +18,8 @@ regex-lite = "0.1"
 webrtc = "0.8"
 rand = "0.8"
 hound = "3.5"
+kokoro-tts = { version = "0.3", default-features = false }
+ort = { version = "=2.0.0-rc.11", default-features = false, features = [
+    "std", "download-binaries", "copy-dylibs", "ndarray",
+    "tls-native-vendored"
+] }
@@ -21,6 +21,7 @@ mod rtp;
 mod sip_leg;
 mod sip_transport;
 mod tool_leg;
+mod tts;
 mod voicemail;
 mod webrtc_engine;

@@ -93,6 +94,9 @@ async fn main() {
    // WebRTC engine — separate lock to avoid deadlock with SIP handlers.
    let webrtc = Arc::new(Mutex::new(WebRtcEngine::new(out_tx.clone())));

+    // TTS engine — separate lock, lazy-loads model on first use.
+    let tts_engine = Arc::new(Mutex::new(tts::TtsEngine::new()));
+
    // Read commands from stdin.
    let stdin = tokio::io::stdin();
    let reader = BufReader::new(stdin);
@@ -113,11 +117,12 @@ async fn main() {

        let engine = engine.clone();
        let webrtc = webrtc.clone();
+        let tts_engine = tts_engine.clone();
        let out_tx = out_tx.clone();

        // Handle commands — some are async, so we spawn.
        tokio::spawn(async move {
-            handle_command(engine, webrtc, &out_tx, cmd).await;
+            handle_command(engine, webrtc, tts_engine, &out_tx, cmd).await;
        });
    }
 }
@@ -125,6 +130,7 @@ async fn main() {
 async fn handle_command(
    engine: Arc<Mutex<ProxyEngine>>,
    webrtc: Arc<Mutex<WebRtcEngine>>,
+    tts_engine: Arc<Mutex<tts::TtsEngine>>,
    out_tx: &OutTx,
    cmd: Command,
 ) {
@@ -150,6 +156,8 @@ async fn handle_command(
        "add_tool_leg" => handle_add_tool_leg(engine, out_tx, &cmd).await,
        "remove_tool_leg" => handle_remove_tool_leg(engine, out_tx, &cmd).await,
        "set_leg_metadata" => handle_set_leg_metadata(engine, out_tx, &cmd).await,
+        // TTS command — lock tts_engine only (no SIP/WebRTC contention).
+        "generate_tts" => handle_generate_tts(tts_engine, out_tx, &cmd).await,
        _ => respond_err(out_tx, &cmd.id, &format!("unknown command: {}", cmd.method)),
    }
 }
@@ -1218,3 +1226,16 @@ async fn handle_set_leg_metadata(
    leg.metadata.insert(key, value);
    respond_ok(out_tx, &cmd.id, serde_json::json!({}));
 }
+
+/// Handle `generate_tts` — synthesize text to a WAV file using Kokoro TTS.
+async fn handle_generate_tts(
+    tts_engine: Arc<Mutex<tts::TtsEngine>>,
+    out_tx: &OutTx,
+    cmd: &Command,
+) {
+    let mut tts = tts_engine.lock().await;
+    match tts.generate(&cmd.params).await {
+        Ok(result) => respond_ok(out_tx, &cmd.id, result),
+        Err(e) => respond_err(out_tx, &cmd.id, &e),
+    }
+}
@@ -0,0 +1,138 @@
+//! Text-to-speech engine — synthesizes text to WAV files using Kokoro neural TTS.
+//!
+//! The model is loaded lazily on first use. If the model/voices files are not
+//! present, the generate command returns an error and the TS side falls back
+//! to espeak-ng.
+
+use kokoro_tts::{KokoroTts, Voice};
+use std::path::Path;
+
+/// Wraps the Kokoro TTS engine with lazy model loading.
+pub struct TtsEngine {
+    tts: Option<KokoroTts>,
+    /// Path that was used to load the current model (for cache invalidation).
+    loaded_model_path: String,
+    loaded_voices_path: String,
+}
+
+impl TtsEngine {
+    pub fn new() -> Self {
+        Self {
+            tts: None,
+            loaded_model_path: String::new(),
+            loaded_voices_path: String::new(),
+        }
+    }
+
+    /// Generate a WAV file from text.
+    ///
+    /// Params (from IPC JSON):
+    ///   - `model`: path to the ONNX model file
+    ///   - `voices`: path to the voices.bin file
+    ///   - `voice`: voice name (e.g. "af_bella")
+    ///   - `text`: text to synthesize
+    ///   - `output`: output WAV file path
+    pub async fn generate(&mut self, params: &serde_json::Value) -> Result<serde_json::Value, String> {
+        let model_path = params.get("model").and_then(|v| v.as_str())
+            .ok_or("missing 'model' param")?;
+        let voices_path = params.get("voices").and_then(|v| v.as_str())
+            .ok_or("missing 'voices' param")?;
+        let voice_name = params.get("voice").and_then(|v| v.as_str())
+            .unwrap_or("af_bella");
+        let text = params.get("text").and_then(|v| v.as_str())
+            .ok_or("missing 'text' param")?;
+        let output_path = params.get("output").and_then(|v| v.as_str())
+            .ok_or("missing 'output' param")?;
+
+        if text.is_empty() {
+            return Err("empty text".into());
+        }
+
+        // Check that model/voices files exist.
+        if !Path::new(model_path).exists() {
+            return Err(format!("model not found: {model_path}"));
+        }
+        if !Path::new(voices_path).exists() {
+            return Err(format!("voices not found: {voices_path}"));
+        }
+
+        // Lazy-load or reload if paths changed.
+        if self.tts.is_none()
+            || self.loaded_model_path != model_path
+            || self.loaded_voices_path != voices_path
+        {
+            eprintln!("[tts] loading model: {model_path}");
+            let tts = KokoroTts::new(model_path, voices_path)
+                .await
+                .map_err(|e| format!("model load failed: {e:?}"))?;
+            self.tts = Some(tts);
+            self.loaded_model_path = model_path.to_string();
+            self.loaded_voices_path = voices_path.to_string();
+        }
+
+        let tts = self.tts.as_ref().unwrap();
+        let voice = select_voice(voice_name);
+
+        eprintln!("[tts] synthesizing voice '{voice_name}': \"{text}\"");
+        let (samples, duration) = tts.synth(text, voice)
+            .await
+            .map_err(|e| format!("synthesis failed: {e:?}"))?;
+        eprintln!("[tts] synthesized {} samples in {duration:?}", samples.len());
+
+        // Write 24kHz 16-bit mono WAV.
+        let spec = hound::WavSpec {
+            channels: 1,
+            sample_rate: 24000,
+            bits_per_sample: 16,
+            sample_format: hound::SampleFormat::Int,
+        };
+
+        let mut writer = hound::WavWriter::create(output_path, spec)
+            .map_err(|e| format!("WAV create failed: {e}"))?;
+        for &sample in &samples {
+            let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16;
+            writer.write_sample(s16).map_err(|e| format!("WAV write: {e}"))?;
+        }
+        writer.finalize().map_err(|e| format!("WAV finalize: {e}"))?;
+
+        eprintln!("[tts] wrote {output_path}");
+        Ok(serde_json::json!({ "output": output_path }))
+    }
+}
+
+/// Map voice name string to Kokoro Voice enum variant.
+fn select_voice(name: &str) -> Voice {
+    match name {
+        "af_bella" => Voice::AfBella(1.0),
+        "af_heart" => Voice::AfHeart(1.0),
+        "af_jessica" => Voice::AfJessica(1.0),
+        "af_nicole" => Voice::AfNicole(1.0),
+        "af_nova" => Voice::AfNova(1.0),
+        "af_sarah" => Voice::AfSarah(1.0),
+        "af_sky" => Voice::AfSky(1.0),
+        "af_river" => Voice::AfRiver(1.0),
+        "af_alloy" => Voice::AfAlloy(1.0),
+        "af_aoede" => Voice::AfAoede(1.0),
+        "af_kore" => Voice::AfKore(1.0),
+        "am_adam" => Voice::AmAdam(1.0),
+        "am_echo" => Voice::AmEcho(1.0),
+        "am_eric" => Voice::AmEric(1.0),
+        "am_fenrir" => Voice::AmFenrir(1.0),
+        "am_liam" => Voice::AmLiam(1.0),
+        "am_michael" => Voice::AmMichael(1.0),
+        "am_onyx" => Voice::AmOnyx(1.0),
+        "am_puck" => Voice::AmPuck(1.0),
+        "bf_alice" => Voice::BfAlice(1.0),
+        "bf_emma" => Voice::BfEmma(1.0),
+        "bf_isabella" => Voice::BfIsabella(1.0),
+        "bf_lily" => Voice::BfLily(1.0),
+        "bm_daniel" => Voice::BmDaniel(1.0),
+        "bm_fable" => Voice::BmFable(1.0),
+        "bm_george" => Voice::BmGeorge(1.0),
+        "bm_lewis" => Voice::BmLewis(1.0),
+        _ => {
+            eprintln!("[tts] unknown voice '{name}', falling back to af_bella");
+            Voice::AfBella(1.0)
+        }
+    }
+}
@@ -1,18 +0,0 @@
-[package]
-name = "tts-engine"
-version = "0.1.0"
-edition = "2021"
-
-[[bin]]
-name = "tts-engine"
-path = "src/main.rs"
-
-[dependencies]
-kokoro-tts = { version = "0.3", default-features = false }
-# Pin to rc.11 matching kokoro-tts's expectation; enable vendored TLS to avoid system libssl-dev.
-ort = { version = "=2.0.0-rc.11", default-features = false, features = [
-    "std", "download-binaries", "copy-dylibs", "ndarray",
-    "tls-native-vendored"
-] }
-tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
-hound = "3.5"
@@ -1,149 +0,0 @@
-/// TTS engine CLI — synthesizes text to a WAV file using Kokoro neural TTS.
-///
-/// Usage:
-///   echo "Hello world" | tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav
-///   tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav --text "Hello world"
-///
-/// Outputs 24kHz 16-bit mono WAV.
-
-use kokoro_tts::{KokoroTts, Voice};
-use std::io::{self, Read};
-
-fn parse_args() -> Result<(String, String, String, String, Option<String>), String> {
-    let args: Vec<String> = std::env::args().collect();
-    let mut model = String::new();
-    let mut voices = String::new();
-    let mut output = String::new();
-    let mut text: Option<String> = None;
-    let mut voice_name: Option<String> = None;
-
-    let mut i = 1;
-    while i < args.len() {
-        match args[i].as_str() {
-            "--model" => { i += 1; model = args.get(i).cloned().unwrap_or_default(); }
-            "--voices" => { i += 1; voices = args.get(i).cloned().unwrap_or_default(); }
-            "--output" | "--output_file" => { i += 1; output = args.get(i).cloned().unwrap_or_default(); }
-            "--text" => { i += 1; text = args.get(i).cloned(); }
-            "--voice" => { i += 1; voice_name = args.get(i).cloned(); }
-            _ => {}
-        }
-        i += 1;
-    }
-
-    if model.is_empty() { return Err("--model required".into()); }
-    if voices.is_empty() { return Err("--voices required".into()); }
-    if output.is_empty() { return Err("--output required".into()); }
-
-    let voice_str = voice_name.unwrap_or_else(|| "af_bella".into());
-
-    Ok((model, voices, output, voice_str, text))
-}
-
-fn select_voice(name: &str) -> Voice {
-    match name {
-        "af_bella" => Voice::AfBella(1.0),
-        "af_heart" => Voice::AfHeart(1.0),
-        "af_jessica" => Voice::AfJessica(1.0),
-        "af_nicole" => Voice::AfNicole(1.0),
-        "af_nova" => Voice::AfNova(1.0),
-        "af_sarah" => Voice::AfSarah(1.0),
-        "af_sky" => Voice::AfSky(1.0),
-        "af_river" => Voice::AfRiver(1.0),
-        "af_alloy" => Voice::AfAlloy(1.0),
-        "af_aoede" => Voice::AfAoede(1.0),
-        "af_kore" => Voice::AfKore(1.0),
-        "am_adam" => Voice::AmAdam(1.0),
-        "am_echo" => Voice::AmEcho(1.0),
-        "am_eric" => Voice::AmEric(1.0),
-        "am_fenrir" => Voice::AmFenrir(1.0),
-        "am_liam" => Voice::AmLiam(1.0),
-        "am_michael" => Voice::AmMichael(1.0),
-        "am_onyx" => Voice::AmOnyx(1.0),
-        "am_puck" => Voice::AmPuck(1.0),
-        "bf_alice" => Voice::BfAlice(1.0),
-        "bf_emma" => Voice::BfEmma(1.0),
-        "bf_isabella" => Voice::BfIsabella(1.0),
-        "bf_lily" => Voice::BfLily(1.0),
-        "bm_daniel" => Voice::BmDaniel(1.0),
-        "bm_fable" => Voice::BmFable(1.0),
-        "bm_george" => Voice::BmGeorge(1.0),
-        "bm_lewis" => Voice::BmLewis(1.0),
-        _ => {
-            eprintln!("[tts-engine] unknown voice '{}', falling back to af_bella", name);
-            Voice::AfBella(1.0)
-        }
-    }
-}
-
-#[tokio::main]
-async fn main() {
-    let (model_path, voices_path, output_path, voice_name, text_arg) = match parse_args() {
-        Ok(v) => v,
-        Err(e) => {
-            eprintln!("Error: {}", e);
-            eprintln!("Usage: tts-engine --model <model.onnx> --voices <voices.bin> --output <output.wav> [--text <text>] [--voice <voice_name>]");
-            std::process::exit(1);
-        }
-    };
-
-    // Get text from --text arg or stdin.
-    let text = match text_arg {
-        Some(t) => t,
-        None => {
-            let mut buf = String::new();
-            io::stdin().read_to_string(&mut buf).expect("failed to read stdin");
-            buf.trim().to_string()
-        }
-    };
-
-    if text.is_empty() {
-        eprintln!("[tts-engine] no text provided");
-        std::process::exit(1);
-    }
-
-    eprintln!("[tts-engine] loading model: {}", model_path);
-    let tts = match KokoroTts::new(&model_path, &voices_path).await {
-        Ok(t) => t,
-        Err(e) => {
-            eprintln!("[tts-engine] failed to load model: {:?}", e);
-            std::process::exit(1);
-        }
-    };
-
-    let voice = select_voice(&voice_name);
-    eprintln!("[tts-engine] synthesizing with voice '{}': \"{}\"", voice_name, text);
-
-    let (samples, duration) = match tts.synth(&text, voice).await {
-        Ok(r) => r,
-        Err(e) => {
-            eprintln!("[tts-engine] synthesis failed: {:?}", e);
-            std::process::exit(1);
-        }
-    };
-
-    eprintln!("[tts-engine] synthesized {} samples in {:?}", samples.len(), duration);
-
-    // Write WAV: 24kHz, 16-bit, mono (same format announcement.ts expects).
-    let spec = hound::WavSpec {
-        channels: 1,
-        sample_rate: 24000,
-        bits_per_sample: 16,
-        sample_format: hound::SampleFormat::Int,
-    };
-
-    let mut writer = match hound::WavWriter::create(&output_path, spec) {
-        Ok(w) => w,
-        Err(e) => {
-            eprintln!("[tts-engine] failed to create WAV: {}", e);
-            std::process::exit(1);
-        }
-    };
-
-    for &sample in &samples {
-        let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16;
-        writer.write_sample(s16).unwrap();
-    }
-    writer.finalize().unwrap();
-
-    eprintln!("[tts-engine] wrote {}", output_path);
-}
@@ -3,6 +3,6 @@
 */
 export const commitinfo = {
  name: 'siprouter',
-  version: '1.15.0',
+  version: '1.16.0',
  description: 'undefined'
 }
@@ -1,59 +1,22 @@
 /**
- * TTS announcement module — pre-generates audio announcements using espeak-ng
- * and caches them as encoded RTP packets for playback during call setup.
+ * TTS announcement module — generates announcement WAV files at startup.
 *
- * On startup, generates the announcement WAV via espeak-ng (formant-based TTS
- * with highly accurate pronunciation), encodes each 20ms frame to G.722 (for
- * SIP) and Opus (for WebRTC) via the Rust transcoder, and caches the packets.
+ * Engine priority: espeak-ng (formant TTS, fast) → Kokoro neural TTS via
+ * proxy-engine → disabled.
 *
- * Falls back to the Rust tts-engine (Kokoro neural TTS) if espeak-ng is not
- * installed, and disables announcements if neither is available.
+ * The generated WAV is left on disk for Rust's audio_player / start_interaction
+ * to play during calls. No encoding or RTP playback happens in TypeScript.
 */

 import { execSync } from 'node:child_process';
 import fs from 'node:fs';
 import path from 'node:path';
-import { Buffer } from 'node:buffer';
-import { encodePcm, isCodecReady } from './opusbridge.ts';
-
-/** RTP clock increment per 20ms frame for each codec. */
-function rtpClockIncrement(pt: number): number {
-  if (pt === 111) return 960;
-  if (pt === 9) return 160;
-  return 160;
-}
-
-/** Build a fresh RTP header. */
-function buildRtpHeader(pt: number, seq: number, ts: number, ssrc: number, marker: boolean): Buffer {
-  const hdr = Buffer.alloc(12);
-  hdr[0] = 0x80;
-  hdr[1] = (marker ? 0x80 : 0) | (pt & 0x7f);
-  hdr.writeUInt16BE(seq & 0xffff, 2);
-  hdr.writeUInt32BE(ts >>> 0, 4);
-  hdr.writeUInt32BE(ssrc >>> 0, 8);
-  return hdr;
-}
-
-// ---------------------------------------------------------------------------
-// Types
-// ---------------------------------------------------------------------------
-
-/** A pre-encoded announcement ready for RTP playback. */
-export interface IAnnouncementCache {
-  /** G.722 encoded frames (each is a 20ms frame payload, no RTP header). */
-  g722Frames: Buffer[];
-  /** Opus encoded frames for WebRTC playback. */
-  opusFrames: Buffer[];
-  /** Total duration in milliseconds. */
-  durationMs: number;
-}
+import { sendProxyCommand, isProxyReady } from './proxybridge.ts';

 // ---------------------------------------------------------------------------
 // State
 // ---------------------------------------------------------------------------

-let cachedAnnouncement: IAnnouncementCache | null = null;
-
 const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
 const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now.";
 const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav');
@@ -64,12 +27,10 @@ const KOKORO_VOICES = 'voices.bin';
 const KOKORO_VOICE = 'af_bella';

 // ---------------------------------------------------------------------------
-// Initialization
+// TTS generators
 // ---------------------------------------------------------------------------

-/**
- * Check if espeak-ng is available on the system.
- */
+/** Check if espeak-ng is available on the system. */
 function isEspeakAvailable(): boolean {
  try {
    execSync('which espeak-ng', { stdio: 'pipe' });
@@ -79,10 +40,7 @@ function isEspeakAvailable(): boolean {
  }
 }

-/**
- * Generate announcement WAV via espeak-ng (primary engine).
- * Returns true on success.
- */
+/** Generate announcement WAV via espeak-ng (primary engine). */
 function generateViaEspeak(wavPath: string, text: string, log: (msg: string) => void): boolean {
  log('[tts] generating announcement audio via espeak-ng...');
  try {
@@ -98,11 +56,8 @@ function generateViaEspeak(wavPath: string, text: string, log: (msg: string) =>
  }
 }

-/**
- * Generate announcement WAV via Kokoro TTS (fallback engine).
- * Returns true on success.
- */
-function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): boolean {
+/** Generate announcement WAV via Kokoro TTS (fallback, runs inside proxy-engine). */
+async function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): Promise<boolean> {
  const modelPath = path.join(TTS_DIR, KOKORO_MODEL);
  const voicesPath = path.join(TTS_DIR, KOKORO_VOICES);

@@ -111,25 +66,21 @@ function generateViaKokoro(wavPath: string, text: string, log: (msg: string) =>
    return false;
  }

-  const root = process.cwd();
-  const ttsBinPaths = [
-    path.join(root, 'dist_rust', 'tts-engine'),
-    path.join(root, 'rust', 'target', 'release', 'tts-engine'),
-    path.join(root, 'rust', 'target', 'debug', 'tts-engine'),
-  ];
-  const ttsBin = ttsBinPaths.find((p) => fs.existsSync(p));
-  if (!ttsBin) {
-    log('[tts] tts-engine binary not found — Kokoro fallback unavailable');
+  if (!isProxyReady()) {
+    log('[tts] proxy-engine not ready — Kokoro fallback unavailable');
    return false;
  }

  log('[tts] generating announcement audio via Kokoro TTS (fallback)...');
  try {
-    execSync(
-      `"${ttsBin}" --model "${modelPath}" --voices "${voicesPath}" --voice "${KOKORO_VOICE}" --output "${wavPath}" --text "${text}"`,
-      { timeout: 120000, stdio: 'pipe' },
-    );
-    log('[tts] Kokoro WAV generated');
+    await sendProxyCommand('generate_tts', {
+      model: modelPath,
+      voices: voicesPath,
+      voice: KOKORO_VOICE,
+      text,
+      output: wavPath,
+    });
+    log('[tts] Kokoro WAV generated (via proxy-engine)');
    return true;
  } catch (e: any) {
    log(`[tts] Kokoro failed: ${e.message}`);
@@ -137,40 +88,13 @@ function generateViaKokoro(wavPath: string, text: string, log: (msg: string) =>
  }
 }

-/**
- * Read a WAV file and detect its sample rate from the fmt chunk.
- * Returns { pcm, sampleRate } or null on failure.
- */
-function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null {
-  const wav = fs.readFileSync(wavPath);
-  if (wav.length < 44) return null;
-  if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
-  if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;
-
-  let sampleRate = 22050; // default
-  let offset = 12;
-  let pcm: Buffer | null = null;
-
-  while (offset < wav.length - 8) {
-    const chunkId = wav.toString('ascii', offset, offset + 4);
-    const chunkSize = wav.readUInt32LE(offset + 4);
-    if (chunkId === 'fmt ') {
-      sampleRate = wav.readUInt32LE(offset + 12);
-    }
-    if (chunkId === 'data') {
-      pcm = wav.subarray(offset + 8, offset + 8 + chunkSize);
-    }
-    offset += 8 + chunkSize;
-    if (offset % 2 !== 0) offset++;
-  }
-
-  if (!pcm) return null;
-  return { pcm, sampleRate };
-}
+// ---------------------------------------------------------------------------
+// Initialization
+// ---------------------------------------------------------------------------

 /**
- * Pre-generate the announcement audio and encode to G.722 + Opus frames.
- * Must be called after the codec bridge is initialized.
+ * Pre-generate the announcement WAV file.
+ * Must be called after the proxy engine is initialized.
 *
 * Engine priority: espeak-ng → Kokoro → disabled.
 */
@@ -178,7 +102,6 @@ export async function initAnnouncement(log: (msg: string) => void): Promise<bool
  fs.mkdirSync(TTS_DIR, { recursive: true });

  try {
-    // Generate WAV if not cached.
    if (!fs.existsSync(CACHE_WAV)) {
      let generated = false;

@@ -189,9 +112,9 @@ export async function initAnnouncement(log: (msg: string) => void): Promise<bool
        log('[tts] espeak-ng not installed — trying Kokoro fallback');
      }

-      // Fall back to Kokoro.
+      // Fall back to Kokoro (via proxy-engine).
      if (!generated) {
-        generated = generateViaKokoro(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
+        generated = await generateViaKokoro(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
      }

      if (!generated) {
@@ -200,49 +123,7 @@ export async function initAnnouncement(log: (msg: string) => void): Promise<bool
      }
    }

-    // Read WAV and extract raw PCM + sample rate.
-    const result = readWavWithRate(CACHE_WAV);
-    if (!result) {
-      log('[tts] failed to parse WAV file');
-      return false;
-    }
-
-    const { pcm, sampleRate } = result;
-
-    // Wait for codec bridge to be ready.
-    if (!isCodecReady()) {
-      log('[tts] codec bridge not ready — will retry');
-      return false;
-    }
-
-    // Encode in 20ms chunks. The Rust encoder resamples to each codec's native rate.
-    const FRAME_SAMPLES = Math.floor(sampleRate * 0.02);
-    const FRAME_BYTES = FRAME_SAMPLES * 2; // 16-bit = 2 bytes per sample
-    const totalFrames = Math.floor(pcm.length / FRAME_BYTES);
-
-    const g722Frames: Buffer[] = [];
-    const opusFrames: Buffer[] = [];
-
-    log(`[tts] encoding ${totalFrames} frames (${FRAME_SAMPLES} samples/frame @ ${sampleRate}Hz)...`);
-    for (let i = 0; i < totalFrames; i++) {
-      const framePcm = pcm.subarray(i * FRAME_BYTES, (i + 1) * FRAME_BYTES);
-      const pcmBuf = Buffer.from(framePcm);
-      const [g722, opus] = await Promise.all([
-        encodePcm(pcmBuf, sampleRate, 9),   // G.722 for SIP devices
-        encodePcm(pcmBuf, sampleRate, 111),  // Opus for WebRTC browsers
-      ]);
-      if (g722) g722Frames.push(g722);
-      if (opus) opusFrames.push(opus);
-      if (!g722 && !opus && i < 3) log(`[tts] frame ${i} encode failed`);
-    }
-
-    cachedAnnouncement = {
-      g722Frames,
-      opusFrames,
-      durationMs: totalFrames * 20,
-    };
-
-    log(`[tts] announcement cached: ${g722Frames.length} frames (${(totalFrames * 20 / 1000).toFixed(1)}s)`);
+    log('[tts] announcement WAV ready');
    return true;
  } catch (e: any) {
    log(`[tts] init error: ${e.message}`);
@@ -250,100 +131,7 @@ export async function initAnnouncement(log: (msg: string) => void): Promise<bool
  }
 }

-// ---------------------------------------------------------------------------
-// Playback
-// ---------------------------------------------------------------------------
-
-/**
- * Play the pre-cached announcement to an RTP endpoint.
- *
- * @param sendPacket - function to send a raw RTP packet
- * @param ssrc - SSRC to use in RTP headers
- * @param onDone - called when the announcement finishes
- * @returns a cancel function, or null if no announcement is cached
- */
-export function playAnnouncement(
-  sendPacket: (pkt: Buffer) => void,
-  ssrc: number,
-  onDone?: () => void,
-): (() => void) | null {
-  if (!cachedAnnouncement || cachedAnnouncement.g722Frames.length === 0) {
-    onDone?.();
-    return null;
-  }
-
-  const frames = cachedAnnouncement.g722Frames;
-  const PT = 9; // G.722
-  let frameIdx = 0;
-  let seq = Math.floor(Math.random() * 0xffff);
-  let rtpTs = Math.floor(Math.random() * 0xffffffff);
-
-  const timer = setInterval(() => {
-    if (frameIdx >= frames.length) {
-      clearInterval(timer);
-      onDone?.();
-      return;
-    }
-
-    const payload = frames[frameIdx];
-    const hdr = buildRtpHeader(PT, seq & 0xffff, rtpTs >>> 0, ssrc >>> 0, frameIdx === 0);
-    const pkt = Buffer.concat([hdr, payload]);
-    sendPacket(pkt);
-
-    seq++;
-    rtpTs += rtpClockIncrement(PT);
-    frameIdx++;
-  }, 20);
-
-  // Return cancel function.
-  return () => clearInterval(timer);
+/** Get the path to the cached announcement WAV, or null if not generated. */
+export function getAnnouncementWavPath(): string | null {
+  return fs.existsSync(CACHE_WAV) ? CACHE_WAV : null;
 }
-
-/**
- * Play pre-cached Opus announcement to a WebRTC PeerConnection sender.
- *
- * @param sendRtpPacket - function to send a raw RTP packet via sender.sendRtp()
- * @param ssrc - SSRC to use in RTP headers
- * @param onDone - called when announcement finishes
- * @returns cancel function, or null if no announcement cached
- */
-export function playAnnouncementToWebRtc(
-  sendRtpPacket: (pkt: Buffer) => void,
-  ssrc: number,
-  counters: { seq: number; ts: number },
-  onDone?: () => void,
-): (() => void) | null {
-  if (!cachedAnnouncement || cachedAnnouncement.opusFrames.length === 0) {
-    onDone?.();
-    return null;
-  }
-
-  const frames = cachedAnnouncement.opusFrames;
-  const PT = 111; // Opus
-  let frameIdx = 0;
-
-  const timer = setInterval(() => {
-    if (frameIdx >= frames.length) {
-      clearInterval(timer);
-      onDone?.();
-      return;
-    }
-
-    const payload = frames[frameIdx];
-    const hdr = buildRtpHeader(PT, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, frameIdx === 0);
-    const pkt = Buffer.concat([hdr, payload]);
-    sendRtpPacket(pkt);
-
-    counters.seq++;
-    counters.ts += 960; // Opus at 48kHz: 960 samples per 20ms
-    frameIdx++;
-  }, 20);
-
-  return () => clearInterval(timer);
-}
-
-/** Check if an announcement is cached and ready. */
-export function isAnnouncementReady(): boolean {
-  return cachedAnnouncement !== null && cachedAnnouncement.g722Frames.length > 0;
-}
-
@@ -1,55 +1,31 @@
 /**
- * PromptCache — manages multiple named audio prompts for IVR and voicemail.
+ * PromptCache — manages named audio prompt WAV files for IVR and voicemail.
 *
- * Each prompt is pre-encoded as both G.722 frames (for SIP legs) and Opus
- * frames (for WebRTC legs), ready for 20ms RTP playback.
+ * Generates WAV files via espeak-ng (primary) or Kokoro TTS through the
+ * proxy-engine (fallback). Also supports loading pre-existing WAV files
+ * and programmatic tone generation.
 *
- * Supports three sources:
- * 1. TTS generation via espeak-ng (primary) or Kokoro (fallback)
- * 2. Loading from a pre-existing WAV file
- * 3. Programmatic tone generation (beep, etc.)
- *
- * The existing announcement.ts system continues to work independently;
- * this module provides generalized prompt management for IVR/voicemail.
+ * All audio playback happens in Rust (audio_player / start_interaction).
+ * This module only manages WAV files on disk.
 */

 import { execSync } from 'node:child_process';
 import fs from 'node:fs';
 import path from 'node:path';
 import { Buffer } from 'node:buffer';
-import { encodePcm, isCodecReady } from '../opusbridge.ts';
-
-/** RTP clock increment per 20ms frame for each codec. */
-function rtpClockIncrement(pt: number): number {
-  if (pt === 111) return 960;
-  if (pt === 9) return 160;
-  return 160;
-}
-
-/** Build a fresh RTP header. */
-function buildRtpHeader(pt: number, seq: number, ts: number, ssrc: number, marker: boolean): Buffer {
-  const hdr = Buffer.alloc(12);
-  hdr[0] = 0x80;
-  hdr[1] = (marker ? 0x80 : 0) | (pt & 0x7f);
-  hdr.writeUInt16BE(seq & 0xffff, 2);
-  hdr.writeUInt32BE(ts >>> 0, 4);
-  hdr.writeUInt32BE(ssrc >>> 0, 8);
-  return hdr;
-}
+import { sendProxyCommand, isProxyReady } from '../proxybridge.ts';

 // ---------------------------------------------------------------------------
 // Types
 // ---------------------------------------------------------------------------

-/** A pre-encoded prompt ready for RTP playback. */
+/** A cached prompt — just a WAV file path and metadata. */
 export interface ICachedPrompt {
  /** Unique prompt identifier. */
  id: string;
-  /** G.722 encoded frames (20ms each, no RTP header). */
-  g722Frames: Buffer[];
-  /** Opus encoded frames (20ms each, no RTP header). */
-  opusFrames: Buffer[];
-  /** Total duration in milliseconds. */
+  /** Path to the WAV file on disk. */
+  wavPath: string;
+  /** Total duration in milliseconds (approximate, from WAV header). */
  durationMs: number;
 }

@@ -82,84 +58,61 @@ function generateViaEspeak(wavPath: string, text: string): boolean {
  }
 }

-/** Generate WAV via Kokoro TTS. */
-function generateViaKokoro(wavPath: string, text: string, voice: string): boolean {
+/** Generate WAV via Kokoro TTS (runs inside proxy-engine). */
+async function generateViaKokoro(wavPath: string, text: string, voice: string): Promise<boolean> {
  const modelPath = path.join(TTS_DIR, 'kokoro-v1.0.onnx');
  const voicesPath = path.join(TTS_DIR, 'voices.bin');
  if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) return false;
-
-  const root = process.cwd();
-  const ttsBin = [
-    path.join(root, 'dist_rust', 'tts-engine'),
-    path.join(root, 'rust', 'target', 'release', 'tts-engine'),
-    path.join(root, 'rust', 'target', 'debug', 'tts-engine'),
-  ].find((p) => fs.existsSync(p));
-  if (!ttsBin) return false;
+  if (!isProxyReady()) return false;

  try {
-    execSync(
-      `"${ttsBin}" --model "${modelPath}" --voices "${voicesPath}" --voice "${voice}" --output "${wavPath}" --text "${text}"`,
-      { timeout: 120000, stdio: 'pipe' },
-    );
+    await sendProxyCommand('generate_tts', {
+      model: modelPath,
+      voices: voicesPath,
+      voice,
+      text,
+      output: wavPath,
+    });
    return true;
  } catch {
    return false;
  }
 }

-/** Read a WAV file and return raw PCM + sample rate. */
-function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null {
-  const wav = fs.readFileSync(wavPath);
-  if (wav.length < 44) return null;
-  if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
-  if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;
+/** Read a WAV file's duration from its header. */
+function getWavDurationMs(wavPath: string): number {
+  try {
+    const wav = fs.readFileSync(wavPath);
+    if (wav.length < 44) return 0;
+    if (wav.toString('ascii', 0, 4) !== 'RIFF') return 0;

-  let sampleRate = 22050;
-  let pcm: Buffer | null = null;
-  let offset = 12;
+    let sampleRate = 16000;
+    let dataSize = 0;
+    let bitsPerSample = 16;
+    let channels = 1;
+    let offset = 12;

-  while (offset < wav.length - 8) {
-    const chunkId = wav.toString('ascii', offset, offset + 4);
-    const chunkSize = wav.readUInt32LE(offset + 4);
-    if (chunkId === 'fmt ') {
-      sampleRate = wav.readUInt32LE(offset + 12);
+    while (offset < wav.length - 8) {
+      const chunkId = wav.toString('ascii', offset, offset + 4);
+      const chunkSize = wav.readUInt32LE(offset + 4);
+      if (chunkId === 'fmt ') {
+        channels = wav.readUInt16LE(offset + 10);
+        sampleRate = wav.readUInt32LE(offset + 12);
+        bitsPerSample = wav.readUInt16LE(offset + 22);
+      }
+      if (chunkId === 'data') {
+        dataSize = chunkSize;
+      }
+      offset += 8 + chunkSize;
+      if (offset % 2 !== 0) offset++;
    }
-    if (chunkId === 'data') {
-      pcm = wav.subarray(offset + 8, offset + 8 + chunkSize);
-    }
-    offset += 8 + chunkSize;
-    if (offset % 2 !== 0) offset++;
+
+    const bytesPerSample = (bitsPerSample / 8) * channels;
+    const totalSamples = bytesPerSample > 0 ? dataSize / bytesPerSample : 0;
+    return sampleRate > 0 ? Math.round((totalSamples / sampleRate) * 1000) : 0;
+  } catch {
+    return 0;
  }
-
-  return pcm ? { pcm, sampleRate } : null;
-}
-
-/** Encode raw PCM frames to G.722 + Opus. */
-async function encodePcmFrames(
-  pcm: Buffer,
-  sampleRate: number,
-  log: (msg: string) => void,
-): Promise<{ g722Frames: Buffer[]; opusFrames: Buffer[] } | null> {
-  if (!isCodecReady()) return null;
-
-  const frameSamples = Math.floor(sampleRate * 0.02); // 20ms
-  const frameBytes = frameSamples * 2; // 16-bit
-  const totalFrames = Math.floor(pcm.length / frameBytes);
-
-  const g722Frames: Buffer[] = [];
-  const opusFrames: Buffer[] = [];
-
-  for (let i = 0; i < totalFrames; i++) {
-    const framePcm = Buffer.from(pcm.subarray(i * frameBytes, (i + 1) * frameBytes));
-    const [g722, opus] = await Promise.all([
-      encodePcm(framePcm, sampleRate, 9),   // G.722
-      encodePcm(framePcm, sampleRate, 111),  // Opus
-    ]);
-    if (g722) g722Frames.push(g722);
-    if (opus) opusFrames.push(opus);
-  }
-
-  return { g722Frames, opusFrames };
 }

 // ---------------------------------------------------------------------------
@@ -195,7 +148,7 @@ export class PromptCache {
  }

  /**
-   * Generate a TTS prompt and cache it.
+   * Generate a TTS prompt WAV and cache its path.
   * Uses espeak-ng (primary) or Kokoro (fallback).
   */
  async generatePrompt(id: string, text: string, voice = 'af_bella'): Promise<ICachedPrompt | null> {
@@ -207,14 +160,14 @@ export class PromptCache {
      this.espeakAvailable = isEspeakAvailable();
    }

-    // Generate WAV.
-    let generated = false;
+    // Generate WAV if not already on disk.
    if (!fs.existsSync(wavPath)) {
+      let generated = false;
      if (this.espeakAvailable) {
        generated = generateViaEspeak(wavPath, text);
      }
      if (!generated) {
-        generated = generateViaKokoro(wavPath, text, voice);
+        generated = await generateViaKokoro(wavPath, text, voice);
      }
      if (!generated) {
        this.log(`[prompt-cache] failed to generate TTS for "${id}"`);
@@ -223,49 +176,22 @@ export class PromptCache {
      this.log(`[prompt-cache] generated WAV for "${id}"`);
    }

-    return this.loadWavPrompt(id, wavPath);
+    return this.registerWav(id, wavPath);
  }

  /**
-   * Load a WAV file as a prompt and cache it.
+   * Load a pre-existing WAV file as a prompt.
   */
  async loadWavPrompt(id: string, wavPath: string): Promise<ICachedPrompt | null> {
    if (!fs.existsSync(wavPath)) {
      this.log(`[prompt-cache] WAV not found: ${wavPath}`);
      return null;
    }
-
-    const result = readWavWithRate(wavPath);
-    if (!result) {
-      this.log(`[prompt-cache] failed to parse WAV: ${wavPath}`);
-      return null;
-    }
-
-    const encoded = await encodePcmFrames(result.pcm, result.sampleRate, this.log);
-    if (!encoded) {
-      this.log(`[prompt-cache] encoding failed for "${id}" (codec bridge not ready?)`);
-      return null;
-    }
-
-    const durationMs = encoded.g722Frames.length * 20;
-    const prompt: ICachedPrompt = {
-      id,
-      g722Frames: encoded.g722Frames,
-      opusFrames: encoded.opusFrames,
-      durationMs,
-    };
-
-    this.prompts.set(id, prompt);
-    this.log(`[prompt-cache] cached "${id}": ${encoded.g722Frames.length} frames (${(durationMs / 1000).toFixed(1)}s)`);
-    return prompt;
+    return this.registerWav(id, wavPath);
  }

  /**
-   * Generate a beep tone prompt (sine wave).
-   * @param id - prompt ID
-   * @param freqHz - tone frequency (default 1000 Hz)
-   * @param durationMs - tone duration (default 500ms)
-   * @param amplitude - 16-bit amplitude (default 8000)
+   * Generate a beep tone WAV and cache it.
   */
  async generateBeep(
    id: string,
@@ -273,149 +199,77 @@ export class PromptCache {
    durationMs = 500,
    amplitude = 8000,
  ): Promise<ICachedPrompt | null> {
-    // Generate at 16kHz for decent quality.
-    const sampleRate = 16000;
-    const totalSamples = Math.floor((sampleRate * durationMs) / 1000);
-    const pcm = Buffer.alloc(totalSamples * 2);
+    fs.mkdirSync(TTS_DIR, { recursive: true });
+    const wavPath = path.join(TTS_DIR, `prompt-${id}.wav`);

-    for (let i = 0; i < totalSamples; i++) {
-      const t = i / sampleRate;
-      // Apply a short fade-in/fade-out to avoid click artifacts.
-      const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade
-      let envelope = 1.0;
-      if (i < fadeLen) envelope = i / fadeLen;
-      else if (i > totalSamples - fadeLen) envelope = (totalSamples - i) / fadeLen;
+    if (!fs.existsSync(wavPath)) {
+      // Generate 16kHz 16-bit mono sine wave WAV.
+      const sampleRate = 16000;
+      const totalSamples = Math.floor((sampleRate * durationMs) / 1000);
+      const pcm = Buffer.alloc(totalSamples * 2);

-      const sample = Math.round(Math.sin(2 * Math.PI * freqHz * t) * amplitude * envelope);
-      pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2);
+      for (let i = 0; i < totalSamples; i++) {
+        const t = i / sampleRate;
+        const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade
+        let envelope = 1.0;
+        if (i < fadeLen) envelope = i / fadeLen;
+        else if (i > totalSamples - fadeLen) envelope = (totalSamples - i) / fadeLen;
+
+        const sample = Math.round(Math.sin(2 * Math.PI * freqHz * t) * amplitude * envelope);
+        pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2);
+      }
+
+      // Write WAV file.
+      const headerSize = 44;
+      const dataSize = pcm.length;
+      const wav = Buffer.alloc(headerSize + dataSize);
+
+      // RIFF header
+      wav.write('RIFF', 0);
+      wav.writeUInt32LE(36 + dataSize, 4);
+      wav.write('WAVE', 8);
+
+      // fmt chunk
+      wav.write('fmt ', 12);
+      wav.writeUInt32LE(16, 16);        // chunk size
+      wav.writeUInt16LE(1, 20);         // PCM format
+      wav.writeUInt16LE(1, 22);         // mono
+      wav.writeUInt32LE(sampleRate, 24);
+      wav.writeUInt32LE(sampleRate * 2, 28); // byte rate
+      wav.writeUInt16LE(2, 32);         // block align
+      wav.writeUInt16LE(16, 34);        // bits per sample
+
+      // data chunk
+      wav.write('data', 36);
+      wav.writeUInt32LE(dataSize, 40);
+      pcm.copy(wav, 44);
+
+      fs.writeFileSync(wavPath, wav);
+      this.log(`[prompt-cache] beep WAV generated for "${id}"`);
    }

-    const encoded = await encodePcmFrames(pcm, sampleRate, this.log);
-    if (!encoded) {
-      this.log(`[prompt-cache] beep encoding failed for "${id}"`);
-      return null;
-    }
-
-    const actualDuration = encoded.g722Frames.length * 20;
-    const prompt: ICachedPrompt = {
-      id,
-      g722Frames: encoded.g722Frames,
-      opusFrames: encoded.opusFrames,
-      durationMs: actualDuration,
-    };
-
-    this.prompts.set(id, prompt);
-    this.log(`[prompt-cache] beep "${id}" cached: ${actualDuration}ms @ ${freqHz}Hz`);
-    return prompt;
+    return this.registerWav(id, wavPath);
  }

-  /**
-   * Remove a prompt from the cache.
-   */
+  /** Remove a prompt from the cache. */
  remove(id: string): void {
    this.prompts.delete(id);
  }

-  /**
-   * Clear all cached prompts.
-   */
+  /** Clear all cached prompts. */
  clear(): void {
    this.prompts.clear();
  }
-}

-// ---------------------------------------------------------------------------
-// Standalone playback helpers (for use by SystemLeg)
-// ---------------------------------------------------------------------------
+  // -------------------------------------------------------------------------
+  // Internal
+  // -------------------------------------------------------------------------

-/**
- * Play a cached prompt's G.722 frames as RTP packets at 20ms intervals.
- *
- * @param prompt - the cached prompt to play
- * @param sendPacket - function to send a raw RTP packet (12-byte header + payload)
- * @param ssrc - SSRC for RTP headers
- * @param onDone - called when playback finishes
- * @returns cancel function, or null if prompt has no G.722 frames
- */
-export function playPromptG722(
-  prompt: ICachedPrompt,
-  sendPacket: (pkt: Buffer) => void,
-  ssrc: number,
-  onDone?: () => void,
-): (() => void) | null {
-  if (prompt.g722Frames.length === 0) {
-    onDone?.();
-    return null;
+  private registerWav(id: string, wavPath: string): ICachedPrompt {
+    const durationMs = getWavDurationMs(wavPath);
+    const prompt: ICachedPrompt = { id, wavPath, durationMs };
+    this.prompts.set(id, prompt);
+    this.log(`[prompt-cache] cached "${id}": ${wavPath} (${(durationMs / 1000).toFixed(1)}s)`);
+    return prompt;
  }
-
-  const frames = prompt.g722Frames;
-  const PT = 9;
-  let frameIdx = 0;
-  let seq = Math.floor(Math.random() * 0xffff);
-  let rtpTs = Math.floor(Math.random() * 0xffffffff);
-
-  const timer = setInterval(() => {
-    if (frameIdx >= frames.length) {
-      clearInterval(timer);
-      onDone?.();
-      return;
-    }
-
-    const payload = frames[frameIdx];
-    const hdr = buildRtpHeader(PT, seq & 0xffff, rtpTs >>> 0, ssrc >>> 0, frameIdx === 0);
-    const pkt = Buffer.concat([hdr, payload]);
-    sendPacket(pkt);
-
-    seq++;
-    rtpTs += rtpClockIncrement(PT);
-    frameIdx++;
-  }, 20);
-
-  return () => clearInterval(timer);
-}
-
-/**
- * Play a cached prompt's Opus frames as RTP packets at 20ms intervals.
- *
- * @param prompt - the cached prompt to play
- * @param sendPacket - function to send a raw RTP packet
- * @param ssrc - SSRC for RTP headers
- * @param counters - shared seq/ts counters (mutated in place for seamless transitions)
- * @param onDone - called when playback finishes
- * @returns cancel function, or null if prompt has no Opus frames
- */
-export function playPromptOpus(
-  prompt: ICachedPrompt,
-  sendPacket: (pkt: Buffer) => void,
-  ssrc: number,
-  counters: { seq: number; ts: number },
-  onDone?: () => void,
-): (() => void) | null {
-  if (prompt.opusFrames.length === 0) {
-    onDone?.();
-    return null;
-  }
-
-  const frames = prompt.opusFrames;
-  const PT = 111;
-  let frameIdx = 0;
-
-  const timer = setInterval(() => {
-    if (frameIdx >= frames.length) {
-      clearInterval(timer);
-      onDone?.();
-      return;
-    }
-
-    const payload = frames[frameIdx];
-    const hdr = buildRtpHeader(PT, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, frameIdx === 0);
-    const pkt = Buffer.concat([hdr, payload]);
-    sendPacket(pkt);
-
-    counters.seq++;
-    counters.ts += 960; // Opus 48kHz: 960 samples per 20ms
-    frameIdx++;
-  }, 20);
-
-  return () => clearInterval(timer);
 }
@@ -1,199 +0,0 @@
-/**
- * Audio transcoding bridge — uses smartrust to communicate with the Rust
- * opus-codec binary, which handles Opus ↔ G.722 ↔ PCMU/PCMA transcoding.
- *
- * All codec conversion happens in Rust (libopus + SpanDSP G.722 port).
- * The TypeScript side just passes raw payloads back and forth.
- */
-
-import path from 'node:path';
-import { RustBridge } from '@push.rocks/smartrust';
-
-// ---------------------------------------------------------------------------
-// Command type map for smartrust
-// ---------------------------------------------------------------------------
-
-type TCodecCommands = {
-  init: {
-    params: Record<string, never>;
-    result: Record<string, never>;
-  };
-  create_session: {
-    params: { session_id: string };
-    result: Record<string, never>;
-  };
-  destroy_session: {
-    params: { session_id: string };
-    result: Record<string, never>;
-  };
-  transcode: {
-    params: { data_b64: string; from_pt: number; to_pt: number; session_id?: string; direction?: string };
-    result: { data_b64: string };
-  };
-  encode_pcm: {
-    params: { data_b64: string; sample_rate: number; to_pt: number; session_id?: string };
-    result: { data_b64: string };
-  };
-};
-
-// ---------------------------------------------------------------------------
-// Bridge singleton
-// ---------------------------------------------------------------------------
-
-let bridge: RustBridge<TCodecCommands> | null = null;
-let initialized = false;
-
-function buildLocalPaths(): string[] {
-  const root = process.cwd();
-  return [
-    path.join(root, 'dist_rust', 'opus-codec'),
-    path.join(root, 'rust', 'target', 'release', 'opus-codec'),
-    path.join(root, 'rust', 'target', 'debug', 'opus-codec'),
-  ];
-}
-
-let logFn: ((msg: string) => void) | undefined;
-
-/**
- * Initialize the audio transcoding bridge. Spawns the Rust binary.
- */
-export async function initCodecBridge(log?: (msg: string) => void): Promise<boolean> {
-  if (initialized && bridge) return true;
-  logFn = log;
-
-  try {
-    bridge = new RustBridge<TCodecCommands>({
-      binaryName: 'opus-codec',
-      localPaths: buildLocalPaths(),
-    });
-
-    const spawned = await bridge.spawn();
-    if (!spawned) {
-      log?.('[codec] failed to spawn opus-codec binary');
-      bridge = null;
-      return false;
-    }
-
-    // Auto-restart: reset state when the Rust process exits so the next
-    // transcode attempt triggers re-initialization instead of silent failure.
-    bridge.on('exit', () => {
-      logFn?.('[codec] Rust audio transcoder process exited — will re-init on next use');
-      bridge = null;
-      initialized = false;
-    });
-
-    await bridge.sendCommand('init', {} as any);
-    initialized = true;
-    log?.('[codec] Rust audio transcoder initialized (Opus + G.722 + PCMU/PCMA)');
-    return true;
-  } catch (e: any) {
-    log?.(`[codec] init error: ${e.message}`);
-    bridge = null;
-    return false;
-  }
-}
-
-// ---------------------------------------------------------------------------
-// Session management — per-call codec isolation
-// ---------------------------------------------------------------------------
-
-/**
- * Create an isolated codec session. Each session gets its own Opus/G.722
- * encoder/decoder state, preventing concurrent calls from corrupting each
- * other's stateful codec predictions.
- */
-export async function createSession(sessionId: string): Promise<boolean> {
-  if (!bridge || !initialized) {
-    // Attempt auto-reinit if bridge died.
-    const ok = await initCodecBridge(logFn);
-    if (!ok) return false;
-  }
-  try {
-    await bridge!.sendCommand('create_session', { session_id: sessionId });
-    return true;
-  } catch (e: any) {
-    logFn?.(`[codec] create_session error: ${e?.message || e}`);
-    return false;
-  }
-}
-
-/**
- * Destroy a codec session, freeing its encoder/decoder state.
- */
-export async function destroySession(sessionId: string): Promise<void> {
-  if (!bridge || !initialized) return;
-  try {
-    await bridge.sendCommand('destroy_session', { session_id: sessionId });
-  } catch {
-    // Best-effort cleanup.
-  }
-}
-
-// ---------------------------------------------------------------------------
-// Transcoding
-// ---------------------------------------------------------------------------
-
-/**
- * Transcode an RTP payload between two codecs.
- * All codec work (Opus, G.722, PCMU, PCMA) + resampling happens in Rust.
- *
- * @param data - raw RTP payload (no header)
- * @param fromPT - source payload type (0=PCMU, 8=PCMA, 9=G.722, 111=Opus)
- * @param toPT - target payload type
- * @param sessionId - optional session for isolated codec state
- * @returns transcoded payload, or null on failure
- */
-export async function transcode(data: Buffer, fromPT: number, toPT: number, sessionId?: string, direction?: string): Promise<Buffer | null> {
-  if (!bridge || !initialized) return null;
-  try {
-    const params: any = {
-      data_b64: data.toString('base64'),
-      from_pt: fromPT,
-      to_pt: toPT,
-    };
-    if (sessionId) params.session_id = sessionId;
-    if (direction) params.direction = direction;
-    const result = await bridge.sendCommand('transcode', params);
-    return Buffer.from(result.data_b64, 'base64');
-  } catch {
-    return null;
-  }
-}
-
-/**
- * Encode raw 16-bit PCM to a target codec.
- * @param pcmData - raw 16-bit LE PCM bytes
- * @param sampleRate - input sample rate (e.g. 22050 for Piper TTS)
- * @param toPT - target payload type (9=G.722, 111=Opus, 0=PCMU, 8=PCMA)
- * @param sessionId - optional session for isolated codec state
- */
-export async function encodePcm(pcmData: Buffer, sampleRate: number, toPT: number, sessionId?: string): Promise<Buffer | null> {
-  if (!bridge || !initialized) return null;
-  try {
-    const params: any = {
-      data_b64: pcmData.toString('base64'),
-      sample_rate: sampleRate,
-      to_pt: toPT,
-    };
-    if (sessionId) params.session_id = sessionId;
-    const result = await bridge.sendCommand('encode_pcm', params);
-    return Buffer.from(result.data_b64, 'base64');
-  } catch (e: any) {
-    console.error('[encodePcm] error:', e?.message || e);
-    return null;
-  }
-}
-
-/** Check if the codec bridge is ready. */
-export function isCodecReady(): boolean {
-  return initialized && bridge !== null;
-}
-
-/** Shut down the codec bridge. */
-export function shutdownCodecBridge(): void {
-  if (bridge) {
-    try { bridge.kill(); } catch { /* ignore */ }
-    bridge = null;
-    initialized = false;
-  }
-}
@@ -79,6 +79,10 @@ type TProxyCommands = {
    params: { call_id: string; leg_id: string; key: string; value: unknown };
    result: Record<string, never>;
  };
+  generate_tts: {
+    params: { model: string; voices: string; voice: string; text: string; output: string };
+    result: { output: string };
+  };
 };

 // ---------------------------------------------------------------------------
@@ -493,6 +497,15 @@ export function isProxyReady(): boolean {
  return initialized && bridge !== null;
 }

+/** Send an arbitrary command to the proxy engine bridge. */
+export async function sendProxyCommand<K extends keyof TProxyCommands>(
+  method: K,
+  params: TProxyCommands[K]['params'],
+): Promise<TProxyCommands[K]['result']> {
+  if (!bridge || !initialized) throw new Error('proxy engine not initialized');
+  return bridge.sendCommand(method as string, params as any) as any;
+}
+
 /** Shut down the proxy engine. */
 export function shutdownProxyEngine(): void {
  if (bridge) {
@@ -24,7 +24,6 @@ import {
  getAllBrowserDeviceIds,
  getBrowserDeviceWs,
 } from './webrtcbridge.ts';
-import { initCodecBridge } from './opusbridge.ts';
 import { initAnnouncement } from './announcement.ts';
 import { PromptCache } from './call/prompt-cache.ts';
 import { VoiceboxManager } from './voicebox.ts';
@@ -523,9 +522,8 @@ async function startProxyEngine(): Promise<void> {
  const deviceList = appConfig.devices.map((d) => d.displayName).join(', ');
  log(`proxy engine started | LAN ${appConfig.proxy.lanIp}:${appConfig.proxy.lanPort} | providers: ${providerList} | devices: ${deviceList}`);

-  // Initialize audio codec bridge (still needed for WebRTC transcoding).
+  // Generate TTS audio (WAV files on disk, played by Rust audio_player).
  try {
-    await initCodecBridge(log);
    await initAnnouncement(log);

    // Pre-generate prompts.
@@ -547,7 +545,7 @@ async function startProxyEngine(): Promise<void> {
    }
    log(`[startup] prompts cached: ${promptCache.listIds().join(', ') || 'none'}`);
  } catch (e) {
-    log(`[codec] init failed: ${e}`);
+    log(`[tts] init failed: ${e}`);
  }
 }

@@ -3,6 +3,6 @@
 */
 export const commitinfo = {
  name: 'siprouter',
-  version: '1.15.0',
+  version: '1.16.0',
  description: 'undefined'
 }