siprouter/rust/crates/tts-engine/src/main.rs

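//! TTS engine CLI — synthesizes text to a WAV file using Kokoro neural TTS.
//!
//! Usage:
//!   echo "Hello world" | tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav
//!   tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav --text "Hello world"
//!
//! Text is taken from `--text` when given, otherwise from stdin. `--voice <name>` selects the
//! speaker (default `af_bella`); `--output_file` is accepted as an alias for `--output`.
//!
//! Outputs 24kHz 16-bit mono WAV.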

use kokoro_tts::{KokoroTts, Voice};
use std::io::{self, Read};

fn parse_args() -> Result<(String, String, String, String, Option<String>), String> {
    let args: Vec<String> = std::env::args().collect();
    let mut model = String::new();
    let mut voices = String::new();
    let mut output = String::new();
    let mut text: Option<String> = None;
    let mut voice_name: Option<String> = None;

    let mut i = 1;
    while i < args.len() {
        match args[i].as_str() {
            "--model" => { i += 1; model = args.get(i).cloned().unwrap_or_default(); }
            "--voices" => { i += 1; voices = args.get(i).cloned().unwrap_or_default(); }
            "--output" | "--output_file" => { i += 1; output = args.get(i).cloned().unwrap_or_default(); }
            "--text" => { i += 1; text = args.get(i).cloned(); }
            "--voice" => { i += 1; voice_name = args.get(i).cloned(); }
            _ => {}
        }
        i += 1;
    }

    if model.is_empty() { return Err("--model required".into()); }
    if voices.is_empty() { return Err("--voices required".into()); }
    if output.is_empty() { return Err("--output required".into()); }

    let voice_str = voice_name.unwrap_or_else(|| "af_bella".into());

    Ok((model, voices, output, voice_str, text))
}

fn select_voice(name: &str) -> Voice {
    match name {
        "af_bella" => Voice::AfBella(1.0),
        "af_heart" => Voice::AfHeart(1.0),
        "af_jessica" => Voice::AfJessica(1.0),
        "af_nicole" => Voice::AfNicole(1.0),
        "af_nova" => Voice::AfNova(1.0),
        "af_sarah" => Voice::AfSarah(1.0),
        "af_sky" => Voice::AfSky(1.0),
        "af_river" => Voice::AfRiver(1.0),
        "af_alloy" => Voice::AfAlloy(1.0),
        "af_aoede" => Voice::AfAoede(1.0),
        "af_kore" => Voice::AfKore(1.0),
        "am_adam" => Voice::AmAdam(1.0),
        "am_echo" => Voice::AmEcho(1.0),
        "am_eric" => Voice::AmEric(1.0),
        "am_fenrir" => Voice::AmFenrir(1.0),
        "am_liam" => Voice::AmLiam(1.0),
        "am_michael" => Voice::AmMichael(1.0),
        "am_onyx" => Voice::AmOnyx(1.0),
        "am_puck" => Voice::AmPuck(1.0),
        "bf_alice" => Voice::BfAlice(1.0),
        "bf_emma" => Voice::BfEmma(1.0),
        "bf_isabella" => Voice::BfIsabella(1.0),
        "bf_lily" => Voice::BfLily(1.0),
        "bm_daniel" => Voice::BmDaniel(1.0),
        "bm_fable" => Voice::BmFable(1.0),
        "bm_george" => Voice::BmGeorge(1.0),
        "bm_lewis" => Voice::BmLewis(1.0),
        _ => {
            eprintln!("[tts-engine] unknown voice '{}', falling back to af_bella", name);
            Voice::AfBella(1.0)
        }
    }
}

#[tokio::main]
async fn main() {
    let (model_path, voices_path, output_path, voice_name, text_arg) = match parse_args() {
        Ok(v) => v,
        Err(e) => {
            eprintln!("Error: {}", e);
            eprintln!("Usage: tts-engine --model <model.onnx> --voices <voices.bin> --output <output.wav> [--text <text>] [--voice <voice_name>]");
            std::process::exit(1);
        }
    };

    // Get text from --text arg or stdin.
    let text = match text_arg {
        Some(t) => t,
        None => {
            let mut buf = String::new();
            io::stdin().read_to_string(&mut buf).expect("failed to read stdin");
            buf.trim().to_string()
        }
    };

    if text.is_empty() {
        eprintln!("[tts-engine] no text provided");
        std::process::exit(1);
    }

    eprintln!("[tts-engine] loading model: {}", model_path);
    let tts = match KokoroTts::new(&model_path, &voices_path).await {
        Ok(t) => t,
        Err(e) => {
            eprintln!("[tts-engine] failed to load model: {:?}", e);
            std::process::exit(1);
        }
    };

    let voice = select_voice(&voice_name);
    eprintln!("[tts-engine] synthesizing with voice '{}': \"{}\"", voice_name, text);

    let (samples, duration) = match tts.synth(&text, voice).await {
        Ok(r) => r,
        Err(e) => {
            eprintln!("[tts-engine] synthesis failed: {:?}", e);
            std::process::exit(1);
        }
    };

    eprintln!("[tts-engine] synthesized {} samples in {:?}", samples.len(), duration);

    // Write WAV: 24kHz, 16-bit, mono (same format announcement.ts expects).
    let spec = hound::WavSpec {
        channels: 1,
        sample_rate: 24000,
        bits_per_sample: 16,
        sample_format: hound::SampleFormat::Int,
    };

    let mut writer = match hound::WavWriter::create(&output_path, spec) {
        Ok(w) => w,
        Err(e) => {
            eprintln!("[tts-engine] failed to create WAV: {}", e);
            std::process::exit(1);
        }
    };

    for &sample in &samples {
        let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16;
        writer.write_sample(s16).unwrap();
    }
    writer.finalize().unwrap();

    eprintln!("[tts-engine] wrote {}", output_path);
}