/// TTS engine CLI — synthesizes text to a WAV file using Kokoro neural TTS. /// /// Usage: /// echo "Hello world" | tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav /// tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav --text "Hello world" /// /// Outputs 24kHz 16-bit mono WAV. use kokoro_tts::{KokoroTts, Voice}; use std::io::{self, Read}; fn parse_args() -> Result<(String, String, String, String, Option), String> { let args: Vec = std::env::args().collect(); let mut model = String::new(); let mut voices = String::new(); let mut output = String::new(); let mut text: Option = None; let mut voice_name: Option = None; let mut i = 1; while i < args.len() { match args[i].as_str() { "--model" => { i += 1; model = args.get(i).cloned().unwrap_or_default(); } "--voices" => { i += 1; voices = args.get(i).cloned().unwrap_or_default(); } "--output" | "--output_file" => { i += 1; output = args.get(i).cloned().unwrap_or_default(); } "--text" => { i += 1; text = args.get(i).cloned(); } "--voice" => { i += 1; voice_name = args.get(i).cloned(); } _ => {} } i += 1; } if model.is_empty() { return Err("--model required".into()); } if voices.is_empty() { return Err("--voices required".into()); } if output.is_empty() { return Err("--output required".into()); } let voice_str = voice_name.unwrap_or_else(|| "af_bella".into()); Ok((model, voices, output, voice_str, text)) } fn select_voice(name: &str) -> Voice { match name { "af_bella" => Voice::AfBella(1.0), "af_heart" => Voice::AfHeart(1.0), "af_jessica" => Voice::AfJessica(1.0), "af_nicole" => Voice::AfNicole(1.0), "af_nova" => Voice::AfNova(1.0), "af_sarah" => Voice::AfSarah(1.0), "af_sky" => Voice::AfSky(1.0), "af_river" => Voice::AfRiver(1.0), "af_alloy" => Voice::AfAlloy(1.0), "af_aoede" => Voice::AfAoede(1.0), "af_kore" => Voice::AfKore(1.0), "am_adam" => Voice::AmAdam(1.0), "am_echo" => Voice::AmEcho(1.0), "am_eric" => Voice::AmEric(1.0), "am_fenrir" => Voice::AmFenrir(1.0), "am_liam" => Voice::AmLiam(1.0), "am_michael" => Voice::AmMichael(1.0), "am_onyx" => Voice::AmOnyx(1.0), "am_puck" => Voice::AmPuck(1.0), "bf_alice" => Voice::BfAlice(1.0), "bf_emma" => Voice::BfEmma(1.0), "bf_isabella" => Voice::BfIsabella(1.0), "bf_lily" => Voice::BfLily(1.0), "bm_daniel" => Voice::BmDaniel(1.0), "bm_fable" => Voice::BmFable(1.0), "bm_george" => Voice::BmGeorge(1.0), "bm_lewis" => Voice::BmLewis(1.0), _ => { eprintln!("[tts-engine] unknown voice '{}', falling back to af_bella", name); Voice::AfBella(1.0) } } } #[tokio::main] async fn main() { let (model_path, voices_path, output_path, voice_name, text_arg) = match parse_args() { Ok(v) => v, Err(e) => { eprintln!("Error: {}", e); eprintln!("Usage: tts-engine --model --voices --output [--text ] [--voice ]"); std::process::exit(1); } }; // Get text from --text arg or stdin. let text = match text_arg { Some(t) => t, None => { let mut buf = String::new(); io::stdin().read_to_string(&mut buf).expect("failed to read stdin"); buf.trim().to_string() } }; if text.is_empty() { eprintln!("[tts-engine] no text provided"); std::process::exit(1); } eprintln!("[tts-engine] loading model: {}", model_path); let tts = match KokoroTts::new(&model_path, &voices_path).await { Ok(t) => t, Err(e) => { eprintln!("[tts-engine] failed to load model: {:?}", e); std::process::exit(1); } }; let voice = select_voice(&voice_name); eprintln!("[tts-engine] synthesizing with voice '{}': \"{}\"", voice_name, text); let (samples, duration) = match tts.synth(&text, voice).await { Ok(r) => r, Err(e) => { eprintln!("[tts-engine] synthesis failed: {:?}", e); std::process::exit(1); } }; eprintln!("[tts-engine] synthesized {} samples in {:?}", samples.len(), duration); // Write WAV: 24kHz, 16-bit, mono (same format announcement.ts expects). let spec = hound::WavSpec { channels: 1, sample_rate: 24000, bits_per_sample: 16, sample_format: hound::SampleFormat::Int, }; let mut writer = match hound::WavWriter::create(&output_path, spec) { Ok(w) => w, Err(e) => { eprintln!("[tts-engine] failed to create WAV: {}", e); std::process::exit(1); } }; for &sample in &samples { let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16; writer.write_sample(s16).unwrap(); } writer.finalize().unwrap(); eprintln!("[tts-engine] wrote {}", output_path); }