initial commit — SIP B2BUA + WebRTC bridge with Rust codec engine
Full-featured SIP router with multi-provider trunking, browser softphone via WebRTC, real-time Opus/G.722/PCM transcoding in Rust, RNNoise ML noise suppression, Kokoro neural TTS announcements, and a Lit-based web dashboard with live call monitoring and REST API.
This commit is contained in:
149
rust/crates/tts-engine/src/main.rs
Normal file
149
rust/crates/tts-engine/src/main.rs
Normal file
@@ -0,0 +1,149 @@
|
||||
/// TTS engine CLI — synthesizes text to a WAV file using Kokoro neural TTS.
|
||||
///
|
||||
/// Usage:
|
||||
/// echo "Hello world" | tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav
|
||||
/// tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav --text "Hello world"
|
||||
///
|
||||
/// Outputs 24kHz 16-bit mono WAV.
|
||||
|
||||
use kokoro_tts::{KokoroTts, Voice};
|
||||
use std::io::{self, Read};
|
||||
|
||||
/// Parses the process command line into the settings the engine needs.
///
/// Recognized flags, each of which takes its value from the following argument:
/// `--model`, `--voices`, `--output` (alias `--output_file`), `--text` and
/// `--voice`. Unrecognized arguments are ignored, and when a flag is repeated
/// the last occurrence wins.
///
/// # Returns
///
/// `(model, voices, output, voice_name, text)`. `voice_name` defaults to
/// `"af_bella"` when `--voice` is absent; `text` is `None` when `--text` is
/// absent.
///
/// # Errors
///
/// Returns a message when a recognized flag is the last argument and therefore
/// has no value, or when `--model`, `--voices` or `--output` is missing or empty.
fn parse_args() -> Result<(String, String, String, String, Option<String>), String> {
    // Skip argv[0] (the program name).
    parse_args_from(std::env::args().skip(1))
}

/// Parses an argument list that does not include the program name.
fn parse_args_from(
    mut args: impl Iterator<Item = String>,
) -> Result<(String, String, String, String, Option<String>), String> {
    /// Pulls the value that must follow `flag`, or reports that it is missing.
    fn take_value(
        args: &mut impl Iterator<Item = String>,
        flag: &str,
    ) -> Result<String, String> {
        args.next()
            .ok_or_else(|| format!("missing value for {}", flag))
    }

    let mut model = String::new();
    let mut voices = String::new();
    let mut output = String::new();
    let mut text: Option<String> = None;
    let mut voice_name: Option<String> = None;

    while let Some(flag) = args.next() {
        match flag.as_str() {
            "--model" => model = take_value(&mut args, &flag)?,
            "--voices" => voices = take_value(&mut args, &flag)?,
            "--output" | "--output_file" => output = take_value(&mut args, &flag)?,
            // A dangling `--text` used to silently fall back to stdin, and a
            // dangling `--voice` to the default voice; both are now errors.
            "--text" => text = Some(take_value(&mut args, &flag)?),
            "--voice" => voice_name = Some(take_value(&mut args, &flag)?),
            // Unknown arguments are tolerated.
            _ => {}
        }
    }

    if model.is_empty() {
        return Err("--model required".into());
    }
    if voices.is_empty() {
        return Err("--voices required".into());
    }
    if output.is_empty() {
        return Err("--output required".into());
    }

    let voice_str = voice_name.unwrap_or_else(|| "af_bella".into());

    Ok((model, voices, output, voice_str, text))
}
|
||||
|
||||
fn select_voice(name: &str) -> Voice {
|
||||
match name {
|
||||
"af_bella" => Voice::AfBella(1.0),
|
||||
"af_heart" => Voice::AfHeart(1.0),
|
||||
"af_jessica" => Voice::AfJessica(1.0),
|
||||
"af_nicole" => Voice::AfNicole(1.0),
|
||||
"af_nova" => Voice::AfNova(1.0),
|
||||
"af_sarah" => Voice::AfSarah(1.0),
|
||||
"af_sky" => Voice::AfSky(1.0),
|
||||
"af_river" => Voice::AfRiver(1.0),
|
||||
"af_alloy" => Voice::AfAlloy(1.0),
|
||||
"af_aoede" => Voice::AfAoede(1.0),
|
||||
"af_kore" => Voice::AfKore(1.0),
|
||||
"am_adam" => Voice::AmAdam(1.0),
|
||||
"am_echo" => Voice::AmEcho(1.0),
|
||||
"am_eric" => Voice::AmEric(1.0),
|
||||
"am_fenrir" => Voice::AmFenrir(1.0),
|
||||
"am_liam" => Voice::AmLiam(1.0),
|
||||
"am_michael" => Voice::AmMichael(1.0),
|
||||
"am_onyx" => Voice::AmOnyx(1.0),
|
||||
"am_puck" => Voice::AmPuck(1.0),
|
||||
"bf_alice" => Voice::BfAlice(1.0),
|
||||
"bf_emma" => Voice::BfEmma(1.0),
|
||||
"bf_isabella" => Voice::BfIsabella(1.0),
|
||||
"bf_lily" => Voice::BfLily(1.0),
|
||||
"bm_daniel" => Voice::BmDaniel(1.0),
|
||||
"bm_fable" => Voice::BmFable(1.0),
|
||||
"bm_george" => Voice::BmGeorge(1.0),
|
||||
"bm_lewis" => Voice::BmLewis(1.0),
|
||||
_ => {
|
||||
eprintln!("[tts-engine] unknown voice '{}', falling back to af_bella", name);
|
||||
Voice::AfBella(1.0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let (model_path, voices_path, output_path, voice_name, text_arg) = match parse_args() {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
eprintln!("Error: {}", e);
|
||||
eprintln!("Usage: tts-engine --model <model.onnx> --voices <voices.bin> --output <output.wav> [--text <text>] [--voice <voice_name>]");
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
// Get text from --text arg or stdin.
|
||||
let text = match text_arg {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
let mut buf = String::new();
|
||||
io::stdin().read_to_string(&mut buf).expect("failed to read stdin");
|
||||
buf.trim().to_string()
|
||||
}
|
||||
};
|
||||
|
||||
if text.is_empty() {
|
||||
eprintln!("[tts-engine] no text provided");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
eprintln!("[tts-engine] loading model: {}", model_path);
|
||||
let tts = match KokoroTts::new(&model_path, &voices_path).await {
|
||||
Ok(t) => t,
|
||||
Err(e) => {
|
||||
eprintln!("[tts-engine] failed to load model: {:?}", e);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
let voice = select_voice(&voice_name);
|
||||
eprintln!("[tts-engine] synthesizing with voice '{}': \"{}\"", voice_name, text);
|
||||
|
||||
let (samples, duration) = match tts.synth(&text, voice).await {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
eprintln!("[tts-engine] synthesis failed: {:?}", e);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
eprintln!("[tts-engine] synthesized {} samples in {:?}", samples.len(), duration);
|
||||
|
||||
// Write WAV: 24kHz, 16-bit, mono (same format announcement.ts expects).
|
||||
let spec = hound::WavSpec {
|
||||
channels: 1,
|
||||
sample_rate: 24000,
|
||||
bits_per_sample: 16,
|
||||
sample_format: hound::SampleFormat::Int,
|
||||
};
|
||||
|
||||
let mut writer = match hound::WavWriter::create(&output_path, spec) {
|
||||
Ok(w) => w,
|
||||
Err(e) => {
|
||||
eprintln!("[tts-engine] failed to create WAV: {}", e);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
for &sample in &samples {
|
||||
let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16;
|
||||
writer.write_sample(s16).unwrap();
|
||||
}
|
||||
writer.finalize().unwrap();
|
||||
|
||||
eprintln!("[tts-engine] wrote {}", output_path);
|
||||
}
|
||||
Reference in New Issue
Block a user