feat(proxy-engine): add on-demand TTS caching for voicemail and IVR prompts

This commit is contained in:
2026-04-12 20:45:08 +00:00
parent cfadd7a2b6
commit 59d8c2557c
17 changed files with 460 additions and 488 deletions

View File

@@ -50,11 +50,12 @@ struct ProxyEngine {
registrar: Registrar,
call_mgr: CallManager,
rtp_pool: Option<RtpPortPool>,
tts_engine: Arc<Mutex<tts::TtsEngine>>,
out_tx: OutTx,
}
impl ProxyEngine {
fn new(out_tx: OutTx) -> Self {
fn new(out_tx: OutTx, tts_engine: Arc<Mutex<tts::TtsEngine>>) -> Self {
Self {
config: None,
transport: None,
@@ -62,6 +63,7 @@ impl ProxyEngine {
registrar: Registrar::new(out_tx.clone()),
call_mgr: CallManager::new(out_tx.clone()),
rtp_pool: None,
tts_engine,
out_tx,
}
}
@@ -88,15 +90,16 @@ async fn main() {
// Emit ready event.
emit_event(&out_tx, "ready", serde_json::json!({}));
// Shared engine state (SIP side).
let engine = Arc::new(Mutex::new(ProxyEngine::new(out_tx.clone())));
// TTS engine — separate internal lock, lazy-loads model on first use.
let tts_engine = Arc::new(Mutex::new(tts::TtsEngine::new()));
// Shared engine state (SIP side). TTS engine is stored inside so the
// SIP packet handler path can reach it for on-demand voicemail/IVR generation.
let engine = Arc::new(Mutex::new(ProxyEngine::new(out_tx.clone(), tts_engine)));
// WebRTC engine — separate lock to avoid deadlock with SIP handlers.
let webrtc = Arc::new(Mutex::new(WebRtcEngine::new(out_tx.clone())));
// TTS engine — separate lock, lazy-loads model on first use.
let tts_engine = Arc::new(Mutex::new(tts::TtsEngine::new()));
// Read commands from stdin.
let stdin = tokio::io::stdin();
let reader = BufReader::new(stdin);
@@ -117,12 +120,11 @@ async fn main() {
let engine = engine.clone();
let webrtc = webrtc.clone();
let tts_engine = tts_engine.clone();
let out_tx = out_tx.clone();
// Handle commands — some are async, so we spawn.
tokio::spawn(async move {
handle_command(engine, webrtc, tts_engine, &out_tx, cmd).await;
handle_command(engine, webrtc, &out_tx, cmd).await;
});
}
}
@@ -130,7 +132,6 @@ async fn main() {
async fn handle_command(
engine: Arc<Mutex<ProxyEngine>>,
webrtc: Arc<Mutex<WebRtcEngine>>,
tts_engine: Arc<Mutex<tts::TtsEngine>>,
out_tx: &OutTx,
cmd: Command,
) {
@@ -155,8 +156,8 @@ async fn handle_command(
"add_tool_leg" => handle_add_tool_leg(engine, out_tx, &cmd).await,
"remove_tool_leg" => handle_remove_tool_leg(engine, out_tx, &cmd).await,
"set_leg_metadata" => handle_set_leg_metadata(engine, out_tx, &cmd).await,
// TTS command — lock tts_engine only (no SIP/WebRTC contention).
"generate_tts" => handle_generate_tts(tts_engine, out_tx, &cmd).await,
// TTS command — gets tts_engine from inside ProxyEngine.
"generate_tts" => handle_generate_tts(engine, out_tx, &cmd).await,
_ => respond_err(out_tx, &cmd.id, &format!("unknown command: {}", cmd.method)),
}
}
@@ -325,8 +326,10 @@ async fn handle_sip_packet(
ref registrar,
ref mut call_mgr,
ref mut rtp_pool,
ref tts_engine,
..
} = *eng;
let tts_clone = tts_engine.clone();
let rtp_pool = rtp_pool.as_mut().unwrap();
let inbound = call_mgr
.create_inbound_call(
@@ -339,6 +342,7 @@ async fn handle_sip_packet(
rtp_pool,
socket,
public_ip.as_deref(),
tts_clone,
)
.await;
@@ -1231,10 +1235,11 @@ async fn handle_set_leg_metadata(
/// Handle `generate_tts` — synthesize text to a WAV file using Kokoro TTS.
async fn handle_generate_tts(
tts_engine: Arc<Mutex<tts::TtsEngine>>,
engine: Arc<Mutex<ProxyEngine>>,
out_tx: &OutTx,
cmd: &Command,
) {
let tts_engine = engine.lock().await.tts_engine.clone();
let mut tts = tts_engine.lock().await;
match tts.generate(&cmd.params).await {
Ok(result) => respond_ok(out_tx, &cmd.id, result),