feat(proxy-engine): integrate Kokoro TTS generation into proxy-engine and simplify TypeScript prompt handling to use cached WAV files

2026-04-10 15:21:44 +00:00
parent c9ae747c95
commit 66112091a2
18 changed files with 340 additions and 1202 deletions
--- a/rust/crates/proxy-engine/Cargo.toml
+++ b/rust/crates/proxy-engine/Cargo.toml
@@ -18,3 +18,8 @@ regex-lite = "0.1"
 webrtc = "0.8"
 rand = "0.8"
 hound = "3.5"
+kokoro-tts = { version = "0.3", default-features = false }
+ort = { version = "=2.0.0-rc.11", default-features = false, features = [
+    "std", "download-binaries", "copy-dylibs", "ndarray",
+    "tls-native-vendored"
+] }
--- a/rust/crates/proxy-engine/src/main.rs
+++ b/rust/crates/proxy-engine/src/main.rs
@@ -21,6 +21,7 @@ mod rtp;
 mod sip_leg;
 mod sip_transport;
 mod tool_leg;
+mod tts;
 mod voicemail;
 mod webrtc_engine;

@@ -93,6 +94,9 @@ async fn main() {
    // WebRTC engine — separate lock to avoid deadlock with SIP handlers.
    let webrtc = Arc::new(Mutex::new(WebRtcEngine::new(out_tx.clone())));

+    // TTS engine — separate lock, lazy-loads model on first use.
+    let tts_engine = Arc::new(Mutex::new(tts::TtsEngine::new()));
+
    // Read commands from stdin.
    let stdin = tokio::io::stdin();
    let reader = BufReader::new(stdin);
@@ -113,11 +117,12 @@ async fn main() {

        let engine = engine.clone();
        let webrtc = webrtc.clone();
+        let tts_engine = tts_engine.clone();
        let out_tx = out_tx.clone();

        // Handle commands — some are async, so we spawn.
        tokio::spawn(async move {
-            handle_command(engine, webrtc, &out_tx, cmd).await;
+            handle_command(engine, webrtc, tts_engine, &out_tx, cmd).await;
        });
    }
 }
@@ -125,6 +130,7 @@ async fn main() {
 async fn handle_command(
    engine: Arc<Mutex<ProxyEngine>>,
    webrtc: Arc<Mutex<WebRtcEngine>>,
+    tts_engine: Arc<Mutex<tts::TtsEngine>>,
    out_tx: &OutTx,
    cmd: Command,
 ) {
@@ -150,6 +156,8 @@ async fn handle_command(
        "add_tool_leg" => handle_add_tool_leg(engine, out_tx, &cmd).await,
        "remove_tool_leg" => handle_remove_tool_leg(engine, out_tx, &cmd).await,
        "set_leg_metadata" => handle_set_leg_metadata(engine, out_tx, &cmd).await,
+        // TTS command — lock tts_engine only (no SIP/WebRTC contention).
+        "generate_tts" => handle_generate_tts(tts_engine, out_tx, &cmd).await,
        _ => respond_err(out_tx, &cmd.id, &format!("unknown command: {}", cmd.method)),
    }
 }
@@ -1218,3 +1226,16 @@ async fn handle_set_leg_metadata(
    leg.metadata.insert(key, value);
    respond_ok(out_tx, &cmd.id, serde_json::json!({}));
 }
+
+/// Handle `generate_tts`: synthesize under the TTS lock, then reply with the result or error.
+async fn handle_generate_tts(
+    tts_engine: Arc<Mutex<tts::TtsEngine>>,
+    out_tx: &OutTx,
+    cmd: &Command,
+) {
+    let mut guard = tts_engine.lock().await;
+    match guard.generate(&cmd.params).await {
+        Err(reason) => respond_err(out_tx, &cmd.id, &reason),
+        Ok(payload) => respond_ok(out_tx, &cmd.id, payload),
+    }
+}
--- a/rust/crates/proxy-engine/src/tts.rs
+++ b/rust/crates/proxy-engine/src/tts.rs
@@ -0,0 +1,138 @@
+//! Text-to-speech engine — synthesizes text to WAV files using Kokoro neural TTS.
+//!
+//! The model is loaded lazily on first use. If the model/voices files are not
+//! present, the generate command returns an error and the TS side falls back
+//! to espeak-ng.
+
+use kokoro_tts::{KokoroTts, Voice};
+use std::path::Path;
+
+/// Wraps the Kokoro TTS engine; the model is loaded lazily by `generate`, never by `new`.
+pub struct TtsEngine {
+    tts: Option<KokoroTts>, // `None` until a load in `generate` has succeeded.
+    /// Model path the current engine was loaded from; `generate` reloads when it differs.
+    loaded_model_path: String,
+    loaded_voices_path: String, // Voices path of the current engine; compared the same way.
+}
+
+impl TtsEngine {
+    /// Create an engine with no model loaded; `generate` loads it on first use.
+    pub fn new() -> Self {
+        Self {
+            tts: None,
+            loaded_model_path: String::new(),
+            loaded_voices_path: String::new(),
+        }
+    }
+
+    /// Generate a WAV file (24 kHz, 16-bit, mono) from text.
+    ///
+    /// Params (from IPC JSON):
+    ///   - `model`: path to the ONNX model file
+    ///   - `voices`: path to the voices.bin file
+    ///   - `voice`: voice name (e.g. "af_bella"); defaults to "af_bella" when absent
+    ///   - `text`: text to synthesize; must not be empty or whitespace-only
+    ///   - `output`: output WAV file path
+    ///
+    /// Returns `{ "output": <path> }`, or a message describing the step that failed. Audio is
+    /// written to `<output>.tmp` and renamed into place, so `output` never holds a partial file.
+    pub async fn generate(&mut self, params: &serde_json::Value) -> Result<serde_json::Value, String> {
+        let str_param = |key: &str| params.get(key).and_then(|v| v.as_str());
+        let model_path = str_param("model").ok_or("missing 'model' param")?;
+        let voices_path = str_param("voices").ok_or("missing 'voices' param")?;
+        let voice_name = str_param("voice").unwrap_or("af_bella");
+        let text = str_param("text").ok_or("missing 'text' param")?;
+        let output_path = str_param("output").ok_or("missing 'output' param")?;
+        if text.trim().is_empty() {
+            return Err("empty text".into());
+        }
+
+        for (label, path) in [("model", model_path), ("voices", voices_path)] {
+            if !Path::new(path).exists() {
+                return Err(format!("{label} not found: {path}"));
+            }
+        }
+
+        // Lazy-load or reload if paths changed.
+        let stale = self.loaded_model_path != model_path || self.loaded_voices_path != voices_path;
+        if self.tts.is_none() || stale {
+            eprintln!("[tts] loading model: {model_path}");
+            let tts = KokoroTts::new(model_path, voices_path)
+                .await
+                .map_err(|e| format!("model load failed: {e:?}"))?;
+            self.tts = Some(tts);
+            self.loaded_model_path = model_path.to_string();
+            self.loaded_voices_path = voices_path.to_string();
+        }
+        let tts = self.tts.as_ref().ok_or("model not loaded")?; // slot was filled above
+        let voice = select_voice(voice_name);
+
+        eprintln!("[tts] synthesizing voice '{voice_name}': \"{text}\"");
+        let (samples, duration) = tts.synth(text, voice)
+            .await
+            .map_err(|e| format!("synthesis failed: {e:?}"))?;
+        eprintln!("[tts] synthesized {} samples in {duration:?}", samples.len());
+
+        // Scale to i16; going through a temp file keeps partial audio away from `output_path`.
+        let pcm = samples.iter().map(|&s| (s * 32767.0).round().clamp(-32768.0, 32767.0) as i16);
+        let tmp_path = format!("{output_path}.tmp");
+        if let Err(e) = publish_wav(&tmp_path, output_path, pcm) {
+            let _ = std::fs::remove_file(&tmp_path); // best-effort cleanup; keep the real error
+            return Err(e);
+        }
+        eprintln!("[tts] wrote {output_path}");
+        Ok(serde_json::json!({ "output": output_path }))
+    }
+}
+
+/// Write `pcm` as a 24 kHz, 16-bit, mono WAV to `tmp`, then rename `tmp` to `dest`.
+fn publish_wav(tmp: &str, dest: &str, mut pcm: impl Iterator<Item = i16>) -> Result<(), String> {
+    let spec = hound::WavSpec {
+        channels: 1,
+        sample_rate: 24000,
+        bits_per_sample: 16,
+        sample_format: hound::SampleFormat::Int,
+    };
+    let mut writer = hound::WavWriter::create(tmp, spec)
+        .map_err(|e| format!("WAV create failed: {e}"))?;
+    pcm.try_for_each(|s| writer.write_sample(s)).map_err(|e| format!("WAV write: {e}"))?;
+    writer.finalize().map_err(|e| format!("WAV finalize: {e}"))?;
+    std::fs::rename(tmp, dest).map_err(|e| format!("WAV rename failed: {e}"))
+}
+
+/// Map a voice name to its `Voice` variant; unknown names log a warning and use `AfBella`.
+fn select_voice(name: &str) -> Voice {
+    match name {
+        "af_bella" => Voice::AfBella(1.0), // NOTE(review): 1.0 is presumably a speed factor; confirm
+        "af_heart" => Voice::AfHeart(1.0),
+        "af_jessica" => Voice::AfJessica(1.0),
+        "af_nicole" => Voice::AfNicole(1.0),
+        "af_nova" => Voice::AfNova(1.0),
+        "af_sarah" => Voice::AfSarah(1.0),
+        "af_sky" => Voice::AfSky(1.0),
+        "af_river" => Voice::AfRiver(1.0),
+        "af_alloy" => Voice::AfAlloy(1.0),
+        "af_aoede" => Voice::AfAoede(1.0),
+        "af_kore" => Voice::AfKore(1.0),
+        "am_adam" => Voice::AmAdam(1.0),
+        "am_echo" => Voice::AmEcho(1.0),
+        "am_eric" => Voice::AmEric(1.0),
+        "am_fenrir" => Voice::AmFenrir(1.0),
+        "am_liam" => Voice::AmLiam(1.0),
+        "am_michael" => Voice::AmMichael(1.0),
+        "am_onyx" => Voice::AmOnyx(1.0),
+        "am_puck" => Voice::AmPuck(1.0),
+        "bf_alice" => Voice::BfAlice(1.0),
+        "bf_emma" => Voice::BfEmma(1.0),
+        "bf_isabella" => Voice::BfIsabella(1.0),
+        "bf_lily" => Voice::BfLily(1.0),
+        "bm_daniel" => Voice::BmDaniel(1.0),
+        "bm_fable" => Voice::BmFable(1.0),
+        "bm_george" => Voice::BmGeorge(1.0),
+        "bm_lewis" => Voice::BmLewis(1.0),
+        _ => {
+            eprintln!("[tts] unknown voice '{name}', falling back to af_bella");
+            Voice::AfBella(1.0)
+        }
+    }
+}