feat(proxy-engine): integrate Kokoro TTS generation into proxy-engine and simplify TypeScript prompt handling to use cached WAV files

This commit is contained in:
2026-04-10 15:21:44 +00:00
parent c9ae747c95
commit 66112091a2
18 changed files with 340 additions and 1202 deletions

View File

@@ -18,3 +18,8 @@ regex-lite = "0.1"
webrtc = "0.8"
rand = "0.8"
hound = "3.5"
kokoro-tts = { version = "0.3", default-features = false }
ort = { version = "=2.0.0-rc.11", default-features = false, features = [
"std", "download-binaries", "copy-dylibs", "ndarray",
"tls-native-vendored"
] }

View File

@@ -21,6 +21,7 @@ mod rtp;
mod sip_leg;
mod sip_transport;
mod tool_leg;
mod tts;
mod voicemail;
mod webrtc_engine;
@@ -93,6 +94,9 @@ async fn main() {
// WebRTC engine — separate lock to avoid deadlock with SIP handlers.
let webrtc = Arc::new(Mutex::new(WebRtcEngine::new(out_tx.clone())));
// TTS engine — separate lock, lazy-loads model on first use.
let tts_engine = Arc::new(Mutex::new(tts::TtsEngine::new()));
// Read commands from stdin.
let stdin = tokio::io::stdin();
let reader = BufReader::new(stdin);
@@ -113,11 +117,12 @@ async fn main() {
let engine = engine.clone();
let webrtc = webrtc.clone();
let tts_engine = tts_engine.clone();
let out_tx = out_tx.clone();
// Handle commands — some are async, so we spawn.
tokio::spawn(async move {
handle_command(engine, webrtc, &out_tx, cmd).await;
handle_command(engine, webrtc, tts_engine, &out_tx, cmd).await;
});
}
}
@@ -125,6 +130,7 @@ async fn main() {
async fn handle_command(
engine: Arc<Mutex<ProxyEngine>>,
webrtc: Arc<Mutex<WebRtcEngine>>,
tts_engine: Arc<Mutex<tts::TtsEngine>>,
out_tx: &OutTx,
cmd: Command,
) {
@@ -150,6 +156,8 @@ async fn handle_command(
"add_tool_leg" => handle_add_tool_leg(engine, out_tx, &cmd).await,
"remove_tool_leg" => handle_remove_tool_leg(engine, out_tx, &cmd).await,
"set_leg_metadata" => handle_set_leg_metadata(engine, out_tx, &cmd).await,
// TTS command — lock tts_engine only (no SIP/WebRTC contention).
"generate_tts" => handle_generate_tts(tts_engine, out_tx, &cmd).await,
_ => respond_err(out_tx, &cmd.id, &format!("unknown command: {}", cmd.method)),
}
}
@@ -1218,3 +1226,16 @@ async fn handle_set_leg_metadata(
leg.metadata.insert(key, value);
respond_ok(out_tx, &cmd.id, serde_json::json!({}));
}
/// IPC handler for the `generate_tts` command.
///
/// Acquires the shared TTS engine lock, forwards the command's params to
/// [`tts::TtsEngine::generate`], and reports the outcome on `out_tx`: the
/// engine's JSON result on success, or its error string on failure.
async fn handle_generate_tts(
    tts_engine: Arc<Mutex<tts::TtsEngine>>,
    out_tx: &OutTx,
    cmd: &Command,
) {
    // The guard is held until this function returns, i.e. until after the
    // reply has been sent.
    let mut engine = tts_engine.lock().await;

    let payload = match engine.generate(&cmd.params).await {
        Ok(payload) => payload,
        Err(reason) => {
            respond_err(out_tx, &cmd.id, &reason);
            return;
        }
    };

    respond_ok(out_tx, &cmd.id, payload);
}

View File

@@ -0,0 +1,138 @@
//! Text-to-speech engine — synthesizes text to WAV files using Kokoro neural TTS.
//!
//! The model is loaded lazily on first use. If the model/voices files are not
//! present, the generate command returns an error and the TS side falls back
//! to espeak-ng.
use kokoro_tts::{KokoroTts, Voice};
use std::path::Path;
/// Wraps the Kokoro TTS engine with lazy model loading.
///
/// The engine starts without a model; `generate` loads one on first use and
/// reloads it whenever the requested model or voices path differs from the
/// one recorded here.
pub struct TtsEngine {
/// The loaded Kokoro instance, or `None` until a load has succeeded.
tts: Option<KokoroTts>,
/// Path that was used to load the current model (for cache invalidation).
/// Empty until a model has been loaded.
loaded_model_path: String,
/// Voices file path that was used to load the current model; compared
/// alongside `loaded_model_path` to decide whether a reload is needed.
/// Empty until a model has been loaded.
loaded_voices_path: String,
}
impl TtsEngine {
    /// Sample rate, in Hz, of the WAV files written by [`TtsEngine::generate`].
    const OUTPUT_SAMPLE_RATE: u32 = 24_000;

    /// Create an engine with no model loaded.
    ///
    /// The model is loaded by the first call to [`TtsEngine::generate`].
    pub fn new() -> Self {
        Self {
            tts: None,
            loaded_model_path: String::new(),
            loaded_voices_path: String::new(),
        }
    }

    /// Generate a WAV file from text.
    ///
    /// Params (from IPC JSON):
    /// - `model`: path to the ONNX model file
    /// - `voices`: path to the voices.bin file
    /// - `voice`: voice name (e.g. "af_bella"); defaults to "af_bella"
    /// - `text`: text to synthesize
    /// - `output`: output WAV file path
    ///
    /// On success returns `{ "output": <output path> }`.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message when a required param is missing,
    /// `text` is empty or whitespace-only, the model or voices file does not
    /// exist, the model fails to load, synthesis fails or yields no samples,
    /// or the WAV file cannot be written. When writing fails, nothing is left
    /// at `output`: the audio is written to a temporary sibling file and only
    /// renamed into place once it is complete.
    pub async fn generate(
        &mut self,
        params: &serde_json::Value,
    ) -> Result<serde_json::Value, String> {
        let model_path = required_str_param(params, "model")?;
        let voices_path = required_str_param(params, "voices")?;
        let voice_name = optional_str_param(params, "voice").unwrap_or("af_bella");
        let text = required_str_param(params, "text")?;
        let output_path = required_str_param(params, "output")?;

        // Whitespace-only input has nothing to speak either.
        if text.trim().is_empty() {
            return Err("empty text".into());
        }

        // Check that model/voices files exist.
        if !Path::new(model_path).exists() {
            return Err(format!("model not found: {model_path}"));
        }
        if !Path::new(voices_path).exists() {
            return Err(format!("voices not found: {voices_path}"));
        }

        // Lazy-load or reload if paths changed. A failed load leaves any
        // previously loaded model and its recorded paths untouched.
        let needs_load = self.tts.is_none()
            || self.loaded_model_path != model_path
            || self.loaded_voices_path != voices_path;
        if needs_load {
            eprintln!("[tts] loading model: {model_path}");
            let loaded = KokoroTts::new(model_path, voices_path)
                .await
                .map_err(|e| format!("model load failed: {e:?}"))?;
            self.tts = Some(loaded);
            self.loaded_model_path = model_path.to_string();
            self.loaded_voices_path = voices_path.to_string();
        }

        // The branch above guarantees a model is present; report an error
        // instead of panicking should that invariant ever be broken.
        let tts = self
            .tts
            .as_ref()
            .ok_or_else(|| String::from("TTS model not loaded"))?;
        let voice = select_voice(voice_name);

        eprintln!("[tts] synthesizing voice '{voice_name}': \"{text}\"");
        let (samples, duration) = tts
            .synth(text, voice)
            .await
            .map_err(|e| format!("synthesis failed: {e:?}"))?;
        eprintln!("[tts] synthesized {} samples in {duration:?}", samples.len());

        // An empty result would otherwise become a silent zero-length WAV
        // that is reported as a success.
        if samples.is_empty() {
            return Err("synthesis produced no audio".into());
        }

        write_wav_atomic(output_path, &samples, Self::OUTPUT_SAMPLE_RATE)?;
        eprintln!("[tts] wrote {output_path}");

        Ok(serde_json::json!({ "output": output_path }))
    }
}

impl Default for TtsEngine {
    /// Equivalent to [`TtsEngine::new`].
    fn default() -> Self {
        Self::new()
    }
}

/// Look up `key` in `params` and return it if it is a JSON string.
fn optional_str_param<'a>(params: &'a serde_json::Value, key: &str) -> Option<&'a str> {
    params.get(key).and_then(|value| value.as_str())
}

/// Like [`optional_str_param`], but a missing or non-string value is an error.
fn required_str_param<'a>(params: &'a serde_json::Value, key: &str) -> Result<&'a str, String> {
    optional_str_param(params, key).ok_or_else(|| format!("missing '{key}' param"))
}

/// Convert one float sample to signed 16-bit PCM, scaling by 32767 and
/// clamping out-of-range values to the `i16` limits.
fn f32_to_i16(sample: f32) -> i16 {
    (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16
}

/// Write `samples` as a 16-bit mono WAV directly to `path`.
fn write_wav(path: &str, samples: &[f32], sample_rate: u32) -> Result<(), String> {
    let spec = hound::WavSpec {
        channels: 1,
        sample_rate,
        bits_per_sample: 16,
        sample_format: hound::SampleFormat::Int,
    };
    let mut writer = hound::WavWriter::create(path, spec)
        .map_err(|e| format!("WAV create failed: {e}"))?;
    for &sample in samples {
        writer
            .write_sample(f32_to_i16(sample))
            .map_err(|e| format!("WAV write: {e}"))?;
    }
    writer.finalize().map_err(|e| format!("WAV finalize: {e}"))
}

/// Write the WAV to `<output_path>.tmp` and rename it over `output_path` only
/// once it is complete, so a failed write never leaves a truncated file at
/// the final path. The temporary file is removed (best effort) on failure.
fn write_wav_atomic(output_path: &str, samples: &[f32], sample_rate: u32) -> Result<(), String> {
    let tmp_path = format!("{output_path}.tmp");
    let outcome = write_wav(&tmp_path, samples, sample_rate).and_then(|()| {
        std::fs::rename(&tmp_path, output_path).map_err(|e| format!("WAV rename failed: {e}"))
    });
    if outcome.is_err() {
        // Cleanup is best effort; the original error is what gets reported.
        let _ = std::fs::remove_file(&tmp_path);
    }
    outcome
}
/// Resolve a voice name (e.g. "af_bella") to the matching Kokoro [`Voice`]
/// variant, constructed with an argument of `1.0`.
///
/// Names that are not in the table are logged to stderr and resolve to
/// `af_bella`.
fn select_voice(name: &str) -> Voice {
    let known = match name {
        "af_bella" => Some(Voice::AfBella(1.0)),
        "af_heart" => Some(Voice::AfHeart(1.0)),
        "af_jessica" => Some(Voice::AfJessica(1.0)),
        "af_nicole" => Some(Voice::AfNicole(1.0)),
        "af_nova" => Some(Voice::AfNova(1.0)),
        "af_sarah" => Some(Voice::AfSarah(1.0)),
        "af_sky" => Some(Voice::AfSky(1.0)),
        "af_river" => Some(Voice::AfRiver(1.0)),
        "af_alloy" => Some(Voice::AfAlloy(1.0)),
        "af_aoede" => Some(Voice::AfAoede(1.0)),
        "af_kore" => Some(Voice::AfKore(1.0)),
        "am_adam" => Some(Voice::AmAdam(1.0)),
        "am_echo" => Some(Voice::AmEcho(1.0)),
        "am_eric" => Some(Voice::AmEric(1.0)),
        "am_fenrir" => Some(Voice::AmFenrir(1.0)),
        "am_liam" => Some(Voice::AmLiam(1.0)),
        "am_michael" => Some(Voice::AmMichael(1.0)),
        "am_onyx" => Some(Voice::AmOnyx(1.0)),
        "am_puck" => Some(Voice::AmPuck(1.0)),
        "bf_alice" => Some(Voice::BfAlice(1.0)),
        "bf_emma" => Some(Voice::BfEmma(1.0)),
        "bf_isabella" => Some(Voice::BfIsabella(1.0)),
        "bf_lily" => Some(Voice::BfLily(1.0)),
        "bm_daniel" => Some(Voice::BmDaniel(1.0)),
        "bm_fable" => Some(Voice::BmFable(1.0)),
        "bm_george" => Some(Voice::BmGeorge(1.0)),
        "bm_lewis" => Some(Voice::BmLewis(1.0)),
        _ => None,
    };

    // Unknown names are not an error: log and use the default voice.
    known.unwrap_or_else(|| {
        eprintln!("[tts] unknown voice '{name}', falling back to af_bella");
        Voice::AfBella(1.0)
    })
}