Full-featured SIP router with multi-provider trunking, browser softphone via WebRTC, real-time Opus/G.722/PCM transcoding in Rust, RNNoise ML noise suppression, Kokoro neural TTS announcements, and a Lit-based web dashboard with live call monitoring and REST API.
150 lines
5.1 KiB
Rust
150 lines
5.1 KiB
Rust
//! TTS engine CLI — synthesizes text to a WAV file using Kokoro neural TTS.
//!
//! Usage:
//!
//! ```text
//! echo "Hello world" | tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav
//! tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav --text "Hello world"
//! ```
//!
//! The text is taken from `--text` when given, otherwise from stdin. An optional
//! `--voice <voice_name>` selects the voice (default: `af_bella`).
//!
//! Outputs 24 kHz 16-bit mono WAV.
|
|
|
|
use kokoro_tts::{KokoroTts, Voice};
|
|
use std::io::{self, Read};
|
|
|
|
/// Parses the command-line arguments of the current process.
///
/// Recognized flags, each followed by one value: `--model`, `--voices`,
/// `--output` (alias `--output_file`), `--text` and `--voice`. Unrecognized
/// arguments are ignored, and when a flag is repeated the last value wins.
///
/// Returns `(model, voices, output, voice_name, text)`. `voice_name` defaults to
/// `"af_bella"`; `text` is `None` when `--text` was not given.
///
/// # Errors
///
/// Returns a message when a recognized flag is the last argument and therefore has
/// no value, or when `--model`, `--voices` or `--output` is missing or empty.
fn parse_args() -> Result<(String, String, String, String, Option<String>), String> {
    // The first argument is the program name, not a flag.
    parse_arg_list(std::env::args().skip(1))
}

/// Parses an argument list that no longer contains the program name.
fn parse_arg_list<I>(args: I) -> Result<(String, String, String, String, Option<String>), String>
where
    I: IntoIterator<Item = String>,
{
    let mut args = args.into_iter();
    let mut model = String::new();
    let mut voices = String::new();
    let mut output = String::new();
    let mut text: Option<String> = None;
    let mut voice_name: Option<String> = None;

    while let Some(flag) = args.next() {
        // Every recognized flag consumes the following argument as its value.
        // A flag with nothing after it is reported instead of being treated as
        // absent, so a trailing `--text` cannot silently fall back to stdin.
        let mut value = || {
            args.next()
                .ok_or_else(|| format!("{} requires a value", flag))
        };

        match flag.as_str() {
            "--model" => model = value()?,
            "--voices" => voices = value()?,
            "--output" | "--output_file" => output = value()?,
            "--text" => text = Some(value()?),
            "--voice" => voice_name = Some(value()?),
            _ => {}
        }
    }

    // An explicitly empty value counts as "not provided" for the mandatory flags.
    if model.is_empty() {
        return Err("--model required".into());
    }
    if voices.is_empty() {
        return Err("--voices required".into());
    }
    if output.is_empty() {
        return Err("--output required".into());
    }

    let voice_str = voice_name.unwrap_or_else(|| "af_bella".into());

    Ok((model, voices, output, voice_str, text))
}
|
|
|
|
fn select_voice(name: &str) -> Voice {
|
|
match name {
|
|
"af_bella" => Voice::AfBella(1.0),
|
|
"af_heart" => Voice::AfHeart(1.0),
|
|
"af_jessica" => Voice::AfJessica(1.0),
|
|
"af_nicole" => Voice::AfNicole(1.0),
|
|
"af_nova" => Voice::AfNova(1.0),
|
|
"af_sarah" => Voice::AfSarah(1.0),
|
|
"af_sky" => Voice::AfSky(1.0),
|
|
"af_river" => Voice::AfRiver(1.0),
|
|
"af_alloy" => Voice::AfAlloy(1.0),
|
|
"af_aoede" => Voice::AfAoede(1.0),
|
|
"af_kore" => Voice::AfKore(1.0),
|
|
"am_adam" => Voice::AmAdam(1.0),
|
|
"am_echo" => Voice::AmEcho(1.0),
|
|
"am_eric" => Voice::AmEric(1.0),
|
|
"am_fenrir" => Voice::AmFenrir(1.0),
|
|
"am_liam" => Voice::AmLiam(1.0),
|
|
"am_michael" => Voice::AmMichael(1.0),
|
|
"am_onyx" => Voice::AmOnyx(1.0),
|
|
"am_puck" => Voice::AmPuck(1.0),
|
|
"bf_alice" => Voice::BfAlice(1.0),
|
|
"bf_emma" => Voice::BfEmma(1.0),
|
|
"bf_isabella" => Voice::BfIsabella(1.0),
|
|
"bf_lily" => Voice::BfLily(1.0),
|
|
"bm_daniel" => Voice::BmDaniel(1.0),
|
|
"bm_fable" => Voice::BmFable(1.0),
|
|
"bm_george" => Voice::BmGeorge(1.0),
|
|
"bm_lewis" => Voice::BmLewis(1.0),
|
|
_ => {
|
|
eprintln!("[tts-engine] unknown voice '{}', falling back to af_bella", name);
|
|
Voice::AfBella(1.0)
|
|
}
|
|
}
|
|
}
|
|
|
|
#[tokio::main]
|
|
async fn main() {
|
|
let (model_path, voices_path, output_path, voice_name, text_arg) = match parse_args() {
|
|
Ok(v) => v,
|
|
Err(e) => {
|
|
eprintln!("Error: {}", e);
|
|
eprintln!("Usage: tts-engine --model <model.onnx> --voices <voices.bin> --output <output.wav> [--text <text>] [--voice <voice_name>]");
|
|
std::process::exit(1);
|
|
}
|
|
};
|
|
|
|
// Get text from --text arg or stdin.
|
|
let text = match text_arg {
|
|
Some(t) => t,
|
|
None => {
|
|
let mut buf = String::new();
|
|
io::stdin().read_to_string(&mut buf).expect("failed to read stdin");
|
|
buf.trim().to_string()
|
|
}
|
|
};
|
|
|
|
if text.is_empty() {
|
|
eprintln!("[tts-engine] no text provided");
|
|
std::process::exit(1);
|
|
}
|
|
|
|
eprintln!("[tts-engine] loading model: {}", model_path);
|
|
let tts = match KokoroTts::new(&model_path, &voices_path).await {
|
|
Ok(t) => t,
|
|
Err(e) => {
|
|
eprintln!("[tts-engine] failed to load model: {:?}", e);
|
|
std::process::exit(1);
|
|
}
|
|
};
|
|
|
|
let voice = select_voice(&voice_name);
|
|
eprintln!("[tts-engine] synthesizing with voice '{}': \"{}\"", voice_name, text);
|
|
|
|
let (samples, duration) = match tts.synth(&text, voice).await {
|
|
Ok(r) => r,
|
|
Err(e) => {
|
|
eprintln!("[tts-engine] synthesis failed: {:?}", e);
|
|
std::process::exit(1);
|
|
}
|
|
};
|
|
|
|
eprintln!("[tts-engine] synthesized {} samples in {:?}", samples.len(), duration);
|
|
|
|
// Write WAV: 24kHz, 16-bit, mono (same format announcement.ts expects).
|
|
let spec = hound::WavSpec {
|
|
channels: 1,
|
|
sample_rate: 24000,
|
|
bits_per_sample: 16,
|
|
sample_format: hound::SampleFormat::Int,
|
|
};
|
|
|
|
let mut writer = match hound::WavWriter::create(&output_path, spec) {
|
|
Ok(w) => w,
|
|
Err(e) => {
|
|
eprintln!("[tts-engine] failed to create WAV: {}", e);
|
|
std::process::exit(1);
|
|
}
|
|
};
|
|
|
|
for &sample in &samples {
|
|
let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16;
|
|
writer.write_sample(s16).unwrap();
|
|
}
|
|
writer.finalize().unwrap();
|
|
|
|
eprintln!("[tts-engine] wrote {}", output_path);
|
|
}
|