feat(proxy-engine): integrate Kokoro TTS generation into proxy-engine and simplify TypeScript prompt handling to use cached WAV files

This commit is contained in:
2026-04-10 15:21:44 +00:00
parent c9ae747c95
commit 66112091a2
18 changed files with 340 additions and 1202 deletions

22
rust/Cargo.lock generated
View File

@@ -1881,16 +1881,6 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "opus-codec"
version = "0.2.0"
dependencies = [
"base64 0.22.1",
"codec-lib",
"serde",
"serde_json",
]
[[package]]
name = "ort"
version = "2.0.0-rc.11"
@@ -2188,6 +2178,8 @@ dependencies = [
"base64 0.22.1",
"codec-lib",
"hound",
"kokoro-tts",
"ort",
"rand 0.8.5",
"regex-lite",
"serde",
@@ -3008,16 +3000,6 @@ dependencies = [
"strength_reduce",
]
[[package]]
name = "tts-engine"
version = "0.1.0"
dependencies = [
"hound",
"kokoro-tts",
"ort",
"tokio",
]
[[package]]
name = "turn"
version = "0.6.1"

View File

@@ -1,8 +1,6 @@
[workspace]
members = [
"crates/codec-lib",
"crates/opus-codec",
"crates/tts-engine",
"crates/sip-proto",
"crates/proxy-engine",
]

View File

@@ -1,7 +1,7 @@
//! Audio codec library for the SIP router.
//!
//! Handles Opus ↔ G.722 ↔ PCMU/PCMA transcoding with ML noise suppression.
//! Used by both the standalone `opus-codec` CLI and the `proxy-engine` binary.
//! Used by the `proxy-engine` binary for all audio transcoding.
use audiopus::coder::{Decoder as OpusDecoder, Encoder as OpusEncoder};
use audiopus::packet::Packet as OpusPacket;

View File

@@ -1,14 +0,0 @@
[package]
name = "opus-codec"
version = "0.2.0"
edition = "2021"
[[bin]]
name = "opus-codec"
path = "src/main.rs"
[dependencies]
codec-lib = { path = "../codec-lib" }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
base64 = "0.22"

View File

@@ -1,286 +0,0 @@
/// Audio transcoding bridge for smartrust.
///
/// Thin CLI wrapper around `codec-lib`. Handles Opus ↔ G.722 ↔ PCMU transcoding.
///
/// Protocol:
/// -> {"id":"1","method":"init","params":{}}
/// <- {"id":"1","success":true,"result":{}}
/// -> {"id":"2","method":"create_session","params":{"session_id":"call-abc"}}
/// <- {"id":"2","success":true,"result":{}}
/// -> {"id":"3","method":"transcode","params":{"session_id":"call-abc","data_b64":"...","from_pt":111,"to_pt":9}}
/// <- {"id":"3","success":true,"result":{"data_b64":"..."}}
/// -> {"id":"4","method":"destroy_session","params":{"session_id":"call-abc"}}
/// <- {"id":"4","success":true,"result":{}}
use base64::engine::general_purpose::STANDARD as B64;
use base64::Engine as _;
use codec_lib::{codec_sample_rate, TranscodeState};
use serde::Deserialize;
use std::collections::HashMap;
use std::io::{self, BufRead, Write};
#[derive(Deserialize)]
struct Request {
id: String,
method: String,
#[serde(default)]
params: serde_json::Value,
}
fn respond(
out: &mut impl Write,
id: &str,
success: bool,
result: Option<serde_json::Value>,
error: Option<&str>,
) {
let mut resp = serde_json::json!({ "id": id, "success": success });
if let Some(r) = result {
resp["result"] = r;
}
if let Some(e) = error {
resp["error"] = serde_json::Value::String(e.to_string());
}
let _ = writeln!(out, "{}", resp);
let _ = out.flush();
}
/// Resolve a session: if session_id is provided, look it up in the sessions map;
/// otherwise fall back to the default state (backward compat with `init`).
fn get_session<'a>(
sessions: &'a mut HashMap<String, TranscodeState>,
default: &'a mut Option<TranscodeState>,
params: &serde_json::Value,
) -> Option<&'a mut TranscodeState> {
if let Some(sid) = params.get("session_id").and_then(|v| v.as_str()) {
sessions.get_mut(sid)
} else {
default.as_mut()
}
}
fn main() {
let stdin = io::stdin();
let stdout = io::stdout();
let mut out = io::BufWriter::new(stdout.lock());
let _ = writeln!(out, r#"{{"event":"ready","data":{{}}}}"#);
let _ = out.flush();
let mut default_state: Option<TranscodeState> = None;
let mut sessions: HashMap<String, TranscodeState> = HashMap::new();
for line in stdin.lock().lines() {
let line = match line {
Ok(l) if !l.trim().is_empty() => l,
Ok(_) => continue,
Err(_) => break,
};
let req: Request = match serde_json::from_str(&line) {
Ok(r) => r,
Err(e) => {
respond(&mut out, "", false, None, Some(&format!("parse: {e}")));
continue;
}
};
match req.method.as_str() {
"init" => match TranscodeState::new() {
Ok(s) => {
default_state = Some(s);
respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
}
Err(e) => respond(&mut out, &req.id, false, None, Some(&e)),
},
"create_session" => {
let session_id = match req.params.get("session_id").and_then(|v| v.as_str()) {
Some(s) => s.to_string(),
None => {
respond(&mut out, &req.id, false, None, Some("missing session_id"));
continue;
}
};
if sessions.contains_key(&session_id) {
respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
continue;
}
match TranscodeState::new() {
Ok(s) => {
sessions.insert(session_id, s);
respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
}
Err(e) => respond(&mut out, &req.id, false, None, Some(&e)),
}
}
"destroy_session" => {
let session_id = match req.params.get("session_id").and_then(|v| v.as_str()) {
Some(s) => s,
None => {
respond(&mut out, &req.id, false, None, Some("missing session_id"));
continue;
}
};
sessions.remove(session_id);
respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
}
"transcode" => {
let st = match get_session(&mut sessions, &mut default_state, &req.params) {
Some(s) => s,
None => {
respond(
&mut out,
&req.id,
false,
None,
Some("not initialized (no session or default state)"),
);
continue;
}
};
let data_b64 = match req.params.get("data_b64").and_then(|v| v.as_str()) {
Some(s) => s,
None => {
respond(&mut out, &req.id, false, None, Some("missing data_b64"));
continue;
}
};
let from_pt =
req.params.get("from_pt").and_then(|v| v.as_u64()).unwrap_or(0) as u8;
let to_pt = req.params.get("to_pt").and_then(|v| v.as_u64()).unwrap_or(0) as u8;
let direction = req.params.get("direction").and_then(|v| v.as_str());
let data = match B64.decode(data_b64) {
Ok(b) => b,
Err(e) => {
respond(
&mut out,
&req.id,
false,
None,
Some(&format!("b64: {e}")),
);
continue;
}
};
match st.transcode(&data, from_pt, to_pt, direction) {
Ok(result) => {
respond(
&mut out,
&req.id,
true,
Some(serde_json::json!({ "data_b64": B64.encode(&result) })),
None,
);
}
Err(e) => respond(&mut out, &req.id, false, None, Some(&e)),
}
}
"encode_pcm" => {
let st = match get_session(&mut sessions, &mut default_state, &req.params) {
Some(s) => s,
None => {
respond(
&mut out,
&req.id,
false,
None,
Some("not initialized (no session or default state)"),
);
continue;
}
};
let data_b64 = match req.params.get("data_b64").and_then(|v| v.as_str()) {
Some(s) => s,
None => {
respond(&mut out, &req.id, false, None, Some("missing data_b64"));
continue;
}
};
let sample_rate = req
.params
.get("sample_rate")
.and_then(|v| v.as_u64())
.unwrap_or(22050) as u32;
let to_pt = req.params.get("to_pt").and_then(|v| v.as_u64()).unwrap_or(9) as u8;
let data = match B64.decode(data_b64) {
Ok(b) => b,
Err(e) => {
respond(
&mut out,
&req.id,
false,
None,
Some(&format!("b64: {e}")),
);
continue;
}
};
if data.len() % 2 != 0 {
respond(
&mut out,
&req.id,
false,
None,
Some("PCM data has odd byte count (expected 16-bit LE samples)"),
);
continue;
}
let pcm: Vec<i16> = data
.chunks_exact(2)
.map(|c| i16::from_le_bytes([c[0], c[1]]))
.collect();
let target_rate = codec_sample_rate(to_pt);
let resampled = match st.resample(&pcm, sample_rate, target_rate) {
Ok(r) => r,
Err(e) => {
respond(&mut out, &req.id, false, None, Some(&e));
continue;
}
};
match st.encode_from_pcm(&resampled, to_pt) {
Ok(encoded) => {
respond(
&mut out,
&req.id,
true,
Some(serde_json::json!({ "data_b64": B64.encode(&encoded) })),
None,
);
}
Err(e) => {
respond(&mut out, &req.id, false, None, Some(&e));
}
}
}
"encode" | "decode" => {
respond(
&mut out,
&req.id,
false,
None,
Some("use 'transcode' command instead"),
);
}
_ => respond(
&mut out,
&req.id,
false,
None,
Some(&format!("unknown: {}", req.method)),
),
}
}
}

View File

@@ -18,3 +18,8 @@ regex-lite = "0.1"
webrtc = "0.8"
rand = "0.8"
hound = "3.5"
kokoro-tts = { version = "0.3", default-features = false }
ort = { version = "=2.0.0-rc.11", default-features = false, features = [
"std", "download-binaries", "copy-dylibs", "ndarray",
"tls-native-vendored"
] }

View File

@@ -21,6 +21,7 @@ mod rtp;
mod sip_leg;
mod sip_transport;
mod tool_leg;
mod tts;
mod voicemail;
mod webrtc_engine;
@@ -93,6 +94,9 @@ async fn main() {
// WebRTC engine — separate lock to avoid deadlock with SIP handlers.
let webrtc = Arc::new(Mutex::new(WebRtcEngine::new(out_tx.clone())));
// TTS engine — separate lock, lazy-loads model on first use.
let tts_engine = Arc::new(Mutex::new(tts::TtsEngine::new()));
// Read commands from stdin.
let stdin = tokio::io::stdin();
let reader = BufReader::new(stdin);
@@ -113,11 +117,12 @@ async fn main() {
let engine = engine.clone();
let webrtc = webrtc.clone();
let tts_engine = tts_engine.clone();
let out_tx = out_tx.clone();
// Handle commands — some are async, so we spawn.
tokio::spawn(async move {
handle_command(engine, webrtc, &out_tx, cmd).await;
handle_command(engine, webrtc, tts_engine, &out_tx, cmd).await;
});
}
}
@@ -125,6 +130,7 @@ async fn main() {
async fn handle_command(
engine: Arc<Mutex<ProxyEngine>>,
webrtc: Arc<Mutex<WebRtcEngine>>,
tts_engine: Arc<Mutex<tts::TtsEngine>>,
out_tx: &OutTx,
cmd: Command,
) {
@@ -150,6 +156,8 @@ async fn handle_command(
"add_tool_leg" => handle_add_tool_leg(engine, out_tx, &cmd).await,
"remove_tool_leg" => handle_remove_tool_leg(engine, out_tx, &cmd).await,
"set_leg_metadata" => handle_set_leg_metadata(engine, out_tx, &cmd).await,
// TTS command — lock tts_engine only (no SIP/WebRTC contention).
"generate_tts" => handle_generate_tts(tts_engine, out_tx, &cmd).await,
_ => respond_err(out_tx, &cmd.id, &format!("unknown command: {}", cmd.method)),
}
}
@@ -1218,3 +1226,16 @@ async fn handle_set_leg_metadata(
leg.metadata.insert(key, value);
respond_ok(out_tx, &cmd.id, serde_json::json!({}));
}
/// Handle `generate_tts` — synthesize text to a WAV file using Kokoro TTS.
async fn handle_generate_tts(
tts_engine: Arc<Mutex<tts::TtsEngine>>,
out_tx: &OutTx,
cmd: &Command,
) {
let mut tts = tts_engine.lock().await;
match tts.generate(&cmd.params).await {
Ok(result) => respond_ok(out_tx, &cmd.id, result),
Err(e) => respond_err(out_tx, &cmd.id, &e),
}
}

View File

@@ -0,0 +1,138 @@
//! Text-to-speech engine — synthesizes text to WAV files using Kokoro neural TTS.
//!
//! The model is loaded lazily on first use. If the model/voices files are not
//! present, the generate command returns an error and the TS side falls back
//! to espeak-ng.
use kokoro_tts::{KokoroTts, Voice};
use std::path::Path;
/// Wraps the Kokoro TTS engine with lazy model loading.
pub struct TtsEngine {
tts: Option<KokoroTts>,
/// Path that was used to load the current model (for cache invalidation).
loaded_model_path: String,
loaded_voices_path: String,
}
impl TtsEngine {
pub fn new() -> Self {
Self {
tts: None,
loaded_model_path: String::new(),
loaded_voices_path: String::new(),
}
}
/// Generate a WAV file from text.
///
/// Params (from IPC JSON):
/// - `model`: path to the ONNX model file
/// - `voices`: path to the voices.bin file
/// - `voice`: voice name (e.g. "af_bella")
/// - `text`: text to synthesize
/// - `output`: output WAV file path
pub async fn generate(&mut self, params: &serde_json::Value) -> Result<serde_json::Value, String> {
let model_path = params.get("model").and_then(|v| v.as_str())
.ok_or("missing 'model' param")?;
let voices_path = params.get("voices").and_then(|v| v.as_str())
.ok_or("missing 'voices' param")?;
let voice_name = params.get("voice").and_then(|v| v.as_str())
.unwrap_or("af_bella");
let text = params.get("text").and_then(|v| v.as_str())
.ok_or("missing 'text' param")?;
let output_path = params.get("output").and_then(|v| v.as_str())
.ok_or("missing 'output' param")?;
if text.is_empty() {
return Err("empty text".into());
}
// Check that model/voices files exist.
if !Path::new(model_path).exists() {
return Err(format!("model not found: {model_path}"));
}
if !Path::new(voices_path).exists() {
return Err(format!("voices not found: {voices_path}"));
}
// Lazy-load or reload if paths changed.
if self.tts.is_none()
|| self.loaded_model_path != model_path
|| self.loaded_voices_path != voices_path
{
eprintln!("[tts] loading model: {model_path}");
let tts = KokoroTts::new(model_path, voices_path)
.await
.map_err(|e| format!("model load failed: {e:?}"))?;
self.tts = Some(tts);
self.loaded_model_path = model_path.to_string();
self.loaded_voices_path = voices_path.to_string();
}
let tts = self.tts.as_ref().unwrap();
let voice = select_voice(voice_name);
eprintln!("[tts] synthesizing voice '{voice_name}': \"{text}\"");
let (samples, duration) = tts.synth(text, voice)
.await
.map_err(|e| format!("synthesis failed: {e:?}"))?;
eprintln!("[tts] synthesized {} samples in {duration:?}", samples.len());
// Write 24kHz 16-bit mono WAV.
let spec = hound::WavSpec {
channels: 1,
sample_rate: 24000,
bits_per_sample: 16,
sample_format: hound::SampleFormat::Int,
};
let mut writer = hound::WavWriter::create(output_path, spec)
.map_err(|e| format!("WAV create failed: {e}"))?;
for &sample in &samples {
let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16;
writer.write_sample(s16).map_err(|e| format!("WAV write: {e}"))?;
}
writer.finalize().map_err(|e| format!("WAV finalize: {e}"))?;
eprintln!("[tts] wrote {output_path}");
Ok(serde_json::json!({ "output": output_path }))
}
}
/// Map voice name string to Kokoro Voice enum variant.
fn select_voice(name: &str) -> Voice {
match name {
"af_bella" => Voice::AfBella(1.0),
"af_heart" => Voice::AfHeart(1.0),
"af_jessica" => Voice::AfJessica(1.0),
"af_nicole" => Voice::AfNicole(1.0),
"af_nova" => Voice::AfNova(1.0),
"af_sarah" => Voice::AfSarah(1.0),
"af_sky" => Voice::AfSky(1.0),
"af_river" => Voice::AfRiver(1.0),
"af_alloy" => Voice::AfAlloy(1.0),
"af_aoede" => Voice::AfAoede(1.0),
"af_kore" => Voice::AfKore(1.0),
"am_adam" => Voice::AmAdam(1.0),
"am_echo" => Voice::AmEcho(1.0),
"am_eric" => Voice::AmEric(1.0),
"am_fenrir" => Voice::AmFenrir(1.0),
"am_liam" => Voice::AmLiam(1.0),
"am_michael" => Voice::AmMichael(1.0),
"am_onyx" => Voice::AmOnyx(1.0),
"am_puck" => Voice::AmPuck(1.0),
"bf_alice" => Voice::BfAlice(1.0),
"bf_emma" => Voice::BfEmma(1.0),
"bf_isabella" => Voice::BfIsabella(1.0),
"bf_lily" => Voice::BfLily(1.0),
"bm_daniel" => Voice::BmDaniel(1.0),
"bm_fable" => Voice::BmFable(1.0),
"bm_george" => Voice::BmGeorge(1.0),
"bm_lewis" => Voice::BmLewis(1.0),
_ => {
eprintln!("[tts] unknown voice '{name}', falling back to af_bella");
Voice::AfBella(1.0)
}
}
}

View File

@@ -1,18 +0,0 @@
[package]
name = "tts-engine"
version = "0.1.0"
edition = "2021"
[[bin]]
name = "tts-engine"
path = "src/main.rs"
[dependencies]
kokoro-tts = { version = "0.3", default-features = false }
# Pin to rc.11 matching kokoro-tts's expectation; enable vendored TLS to avoid system libssl-dev.
ort = { version = "=2.0.0-rc.11", default-features = false, features = [
"std", "download-binaries", "copy-dylibs", "ndarray",
"tls-native-vendored"
] }
tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
hound = "3.5"

View File

@@ -1,149 +0,0 @@
/// TTS engine CLI — synthesizes text to a WAV file using Kokoro neural TTS.
///
/// Usage:
/// echo "Hello world" | tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav
/// tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav --text "Hello world"
///
/// Outputs 24kHz 16-bit mono WAV.
use kokoro_tts::{KokoroTts, Voice};
use std::io::{self, Read};
fn parse_args() -> Result<(String, String, String, String, Option<String>), String> {
let args: Vec<String> = std::env::args().collect();
let mut model = String::new();
let mut voices = String::new();
let mut output = String::new();
let mut text: Option<String> = None;
let mut voice_name: Option<String> = None;
let mut i = 1;
while i < args.len() {
match args[i].as_str() {
"--model" => { i += 1; model = args.get(i).cloned().unwrap_or_default(); }
"--voices" => { i += 1; voices = args.get(i).cloned().unwrap_or_default(); }
"--output" | "--output_file" => { i += 1; output = args.get(i).cloned().unwrap_or_default(); }
"--text" => { i += 1; text = args.get(i).cloned(); }
"--voice" => { i += 1; voice_name = args.get(i).cloned(); }
_ => {}
}
i += 1;
}
if model.is_empty() { return Err("--model required".into()); }
if voices.is_empty() { return Err("--voices required".into()); }
if output.is_empty() { return Err("--output required".into()); }
let voice_str = voice_name.unwrap_or_else(|| "af_bella".into());
Ok((model, voices, output, voice_str, text))
}
fn select_voice(name: &str) -> Voice {
match name {
"af_bella" => Voice::AfBella(1.0),
"af_heart" => Voice::AfHeart(1.0),
"af_jessica" => Voice::AfJessica(1.0),
"af_nicole" => Voice::AfNicole(1.0),
"af_nova" => Voice::AfNova(1.0),
"af_sarah" => Voice::AfSarah(1.0),
"af_sky" => Voice::AfSky(1.0),
"af_river" => Voice::AfRiver(1.0),
"af_alloy" => Voice::AfAlloy(1.0),
"af_aoede" => Voice::AfAoede(1.0),
"af_kore" => Voice::AfKore(1.0),
"am_adam" => Voice::AmAdam(1.0),
"am_echo" => Voice::AmEcho(1.0),
"am_eric" => Voice::AmEric(1.0),
"am_fenrir" => Voice::AmFenrir(1.0),
"am_liam" => Voice::AmLiam(1.0),
"am_michael" => Voice::AmMichael(1.0),
"am_onyx" => Voice::AmOnyx(1.0),
"am_puck" => Voice::AmPuck(1.0),
"bf_alice" => Voice::BfAlice(1.0),
"bf_emma" => Voice::BfEmma(1.0),
"bf_isabella" => Voice::BfIsabella(1.0),
"bf_lily" => Voice::BfLily(1.0),
"bm_daniel" => Voice::BmDaniel(1.0),
"bm_fable" => Voice::BmFable(1.0),
"bm_george" => Voice::BmGeorge(1.0),
"bm_lewis" => Voice::BmLewis(1.0),
_ => {
eprintln!("[tts-engine] unknown voice '{}', falling back to af_bella", name);
Voice::AfBella(1.0)
}
}
}
#[tokio::main]
async fn main() {
let (model_path, voices_path, output_path, voice_name, text_arg) = match parse_args() {
Ok(v) => v,
Err(e) => {
eprintln!("Error: {}", e);
eprintln!("Usage: tts-engine --model <model.onnx> --voices <voices.bin> --output <output.wav> [--text <text>] [--voice <voice_name>]");
std::process::exit(1);
}
};
// Get text from --text arg or stdin.
let text = match text_arg {
Some(t) => t,
None => {
let mut buf = String::new();
io::stdin().read_to_string(&mut buf).expect("failed to read stdin");
buf.trim().to_string()
}
};
if text.is_empty() {
eprintln!("[tts-engine] no text provided");
std::process::exit(1);
}
eprintln!("[tts-engine] loading model: {}", model_path);
let tts = match KokoroTts::new(&model_path, &voices_path).await {
Ok(t) => t,
Err(e) => {
eprintln!("[tts-engine] failed to load model: {:?}", e);
std::process::exit(1);
}
};
let voice = select_voice(&voice_name);
eprintln!("[tts-engine] synthesizing with voice '{}': \"{}\"", voice_name, text);
let (samples, duration) = match tts.synth(&text, voice).await {
Ok(r) => r,
Err(e) => {
eprintln!("[tts-engine] synthesis failed: {:?}", e);
std::process::exit(1);
}
};
eprintln!("[tts-engine] synthesized {} samples in {:?}", samples.len(), duration);
// Write WAV: 24kHz, 16-bit, mono (same format announcement.ts expects).
let spec = hound::WavSpec {
channels: 1,
sample_rate: 24000,
bits_per_sample: 16,
sample_format: hound::SampleFormat::Int,
};
let mut writer = match hound::WavWriter::create(&output_path, spec) {
Ok(w) => w,
Err(e) => {
eprintln!("[tts-engine] failed to create WAV: {}", e);
std::process::exit(1);
}
};
for &sample in &samples {
let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16;
writer.write_sample(s16).unwrap();
}
writer.finalize().unwrap();
eprintln!("[tts-engine] wrote {}", output_path);
}