initial commit — SIP B2BUA + WebRTC bridge with Rust codec engine
Full-featured SIP router with multi-provider trunking, browser softphone via WebRTC, real-time Opus/G.722/PCM transcoding in Rust, RNNoise ML noise suppression, Kokoro neural TTS announcements, and a Lit-based web dashboard with live call monitoring and REST API.
This commit is contained in:
1971
rust/Cargo.lock
generated
Normal file
1971
rust/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
7
rust/Cargo.toml
Normal file
7
rust/Cargo.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[workspace]
|
||||
members = ["crates/opus-codec", "crates/tts-engine"]
|
||||
resolver = "2"
|
||||
|
||||
[profile.release]
|
||||
opt-level = 3
|
||||
lto = true
|
||||
17
rust/crates/opus-codec/Cargo.toml
Normal file
17
rust/crates/opus-codec/Cargo.toml
Normal file
@@ -0,0 +1,17 @@
|
||||
[package]
|
||||
name = "opus-codec"
|
||||
version = "0.2.0"
|
||||
edition = "2021"
|
||||
|
||||
[[bin]]
|
||||
name = "opus-codec"
|
||||
path = "src/main.rs"
|
||||
|
||||
[dependencies]
|
||||
audiopus = "0.3.0-rc.0"
|
||||
ezk-g722 = "0.1"
|
||||
rubato = "0.14"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
base64 = "0.22"
|
||||
nnnoiseless = { version = "0.5", default-features = false }
|
||||
464
rust/crates/opus-codec/src/main.rs
Normal file
464
rust/crates/opus-codec/src/main.rs
Normal file
@@ -0,0 +1,464 @@
|
||||
/// Audio transcoding bridge for smartrust.
|
||||
///
|
||||
/// Handles Opus ↔ G.722 ↔ PCMU transcoding for the SIP router.
|
||||
/// Uses audiopus (libopus) for Opus and ezk-g722 (SpanDSP port) for G.722.
|
||||
///
|
||||
/// Supports per-session codec state so concurrent calls don't corrupt each
|
||||
/// other's stateful codecs (Opus, G.722 ADPCM).
|
||||
///
|
||||
/// Protocol:
|
||||
/// -> {"id":"1","method":"init","params":{}}
|
||||
/// <- {"id":"1","success":true,"result":{}}
|
||||
/// -> {"id":"2","method":"create_session","params":{"session_id":"call-abc"}}
|
||||
/// <- {"id":"2","success":true,"result":{}}
|
||||
/// -> {"id":"3","method":"transcode","params":{"session_id":"call-abc","data_b64":"...","from_pt":111,"to_pt":9}}
|
||||
/// <- {"id":"3","success":true,"result":{"data_b64":"..."}}
|
||||
/// -> {"id":"4","method":"destroy_session","params":{"session_id":"call-abc"}}
|
||||
/// <- {"id":"4","success":true,"result":{}}
|
||||
|
||||
use audiopus::coder::{Decoder as OpusDecoder, Encoder as OpusEncoder};
|
||||
use audiopus::packet::Packet as OpusPacket;
|
||||
use audiopus::{Application, Bitrate as OpusBitrate, Channels, MutSignals, SampleRate};
|
||||
use base64::engine::general_purpose::STANDARD as B64;
|
||||
use base64::Engine as _;
|
||||
use ezk_g722::libg722::{self, Bitrate};
|
||||
use nnnoiseless::DenoiseState;
|
||||
use rubato::{FftFixedIn, Resampler};
|
||||
use serde::Deserialize;
|
||||
use std::collections::HashMap;
|
||||
use std::io::{self, BufRead, Write};
|
||||
|
||||
// Payload type constants.
|
||||
const PT_PCMU: u8 = 0;
|
||||
const PT_PCMA: u8 = 8;
|
||||
const PT_G722: u8 = 9;
|
||||
const PT_OPUS: u8 = 111;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Request {
|
||||
id: String,
|
||||
method: String,
|
||||
#[serde(default)]
|
||||
params: serde_json::Value,
|
||||
}
|
||||
|
||||
fn respond(out: &mut impl Write, id: &str, success: bool, result: Option<serde_json::Value>, error: Option<&str>) {
|
||||
let mut resp = serde_json::json!({ "id": id, "success": success });
|
||||
if let Some(r) = result { resp["result"] = r; }
|
||||
if let Some(e) = error { resp["error"] = serde_json::Value::String(e.to_string()); }
|
||||
let _ = writeln!(out, "{}", resp);
|
||||
let _ = out.flush();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Codec state
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
struct TranscodeState {
|
||||
opus_enc: OpusEncoder,
|
||||
opus_dec: OpusDecoder,
|
||||
g722_enc: libg722::encoder::Encoder,
|
||||
g722_dec: libg722::decoder::Decoder,
|
||||
// Cached FFT resamplers keyed by (from_rate, to_rate, chunk_size).
|
||||
resamplers: HashMap<(u32, u32, usize), FftFixedIn<f64>>,
|
||||
// Per-direction ML noise suppression (RNNoise). Separate state per direction
|
||||
// prevents the RNN hidden state from being corrupted by interleaved audio streams.
|
||||
denoiser_to_sip: Box<DenoiseState<'static>>,
|
||||
denoiser_to_browser: Box<DenoiseState<'static>>,
|
||||
}
|
||||
|
||||
impl TranscodeState {
|
||||
fn new() -> Result<Self, String> {
|
||||
let mut opus_enc = OpusEncoder::new(SampleRate::Hz48000, Channels::Mono, Application::Voip)
|
||||
.map_err(|e| format!("opus encoder: {e}"))?;
|
||||
// Telephony-grade tuning: complexity 5 is sufficient for voice bridged to G.722.
|
||||
opus_enc.set_complexity(5).map_err(|e| format!("opus set_complexity: {e}"))?;
|
||||
opus_enc.set_bitrate(OpusBitrate::BitsPerSecond(24000)).map_err(|e| format!("opus set_bitrate: {e}"))?;
|
||||
let opus_dec = OpusDecoder::new(SampleRate::Hz48000, Channels::Mono)
|
||||
.map_err(|e| format!("opus decoder: {e}"))?;
|
||||
let g722_enc = libg722::encoder::Encoder::new(Bitrate::Mode1_64000, false, false);
|
||||
let g722_dec = libg722::decoder::Decoder::new(Bitrate::Mode1_64000, false, false);
|
||||
|
||||
Ok(Self {
|
||||
opus_enc, opus_dec, g722_enc, g722_dec,
|
||||
resamplers: HashMap::new(),
|
||||
denoiser_to_sip: DenoiseState::new(),
|
||||
denoiser_to_browser: DenoiseState::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// High-quality sample rate conversion using rubato FFT resampler.
|
||||
/// Resamplers are cached by (from_rate, to_rate, chunk_size) and reused,
|
||||
/// maintaining proper inter-frame state for continuous audio streams.
|
||||
fn resample(&mut self, pcm: &[i16], from_rate: u32, to_rate: u32) -> Result<Vec<i16>, String> {
|
||||
if from_rate == to_rate || pcm.is_empty() {
|
||||
return Ok(pcm.to_vec());
|
||||
}
|
||||
|
||||
let chunk = pcm.len();
|
||||
let key = (from_rate, to_rate, chunk);
|
||||
|
||||
// Get or create cached resampler for this rate pair + chunk size.
|
||||
if !self.resamplers.contains_key(&key) {
|
||||
let r = FftFixedIn::<f64>::new(from_rate as usize, to_rate as usize, chunk, 1, 1)
|
||||
.map_err(|e| format!("resampler {from_rate}->{to_rate}: {e}"))?;
|
||||
self.resamplers.insert(key, r);
|
||||
}
|
||||
let resampler = self.resamplers.get_mut(&key).unwrap();
|
||||
|
||||
// i16 → f64 normalized to [-1.0, 1.0]
|
||||
let float_in: Vec<f64> = pcm.iter().map(|&s| s as f64 / 32768.0).collect();
|
||||
let input = vec![float_in];
|
||||
|
||||
let result = resampler.process(&input, None)
|
||||
.map_err(|e| format!("resample {from_rate}->{to_rate}: {e}"))?;
|
||||
|
||||
// f64 → i16
|
||||
Ok(result[0].iter()
|
||||
.map(|&s| (s * 32767.0).round().clamp(-32768.0, 32767.0) as i16)
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Apply RNNoise ML noise suppression to 48kHz PCM audio.
|
||||
/// Processes in 480-sample (10ms) frames. State persists across calls.
|
||||
fn denoise(denoiser: &mut DenoiseState, pcm: &[i16]) -> Vec<i16> {
|
||||
let frame_size = DenoiseState::FRAME_SIZE; // 480
|
||||
let total = pcm.len();
|
||||
// Round down to whole frames — don't process partial frames to avoid
|
||||
// injecting artificial silence into the RNN state.
|
||||
let whole = (total / frame_size) * frame_size;
|
||||
let mut output = Vec::with_capacity(total);
|
||||
let mut out_buf = [0.0f32; 480];
|
||||
|
||||
for offset in (0..whole).step_by(frame_size) {
|
||||
let input: Vec<f32> = pcm[offset..offset + frame_size]
|
||||
.iter().map(|&s| s as f32).collect();
|
||||
denoiser.process_frame(&mut out_buf, &input);
|
||||
output.extend(out_buf.iter()
|
||||
.map(|&s| s.round().clamp(-32768.0, 32767.0) as i16));
|
||||
}
|
||||
// Pass through any trailing partial-frame samples unmodified.
|
||||
if whole < total {
|
||||
output.extend_from_slice(&pcm[whole..]);
|
||||
}
|
||||
output
|
||||
}
|
||||
|
||||
/// Transcode audio payload from one codec to another.
|
||||
/// `direction`: "to_sip" or "to_browser" — selects the per-direction denoiser.
|
||||
/// If None, denoising is skipped (backward compat).
|
||||
fn transcode(&mut self, data: &[u8], from_pt: u8, to_pt: u8, direction: Option<&str>) -> Result<Vec<u8>, String> {
|
||||
if from_pt == to_pt {
|
||||
return Ok(data.to_vec());
|
||||
}
|
||||
|
||||
// Decode to PCM (at source sample rate).
|
||||
let (pcm, rate) = self.decode_to_pcm(data, from_pt)?;
|
||||
|
||||
// Apply noise suppression if direction is specified.
|
||||
let processed = if let Some(dir) = direction {
|
||||
// Resample to 48kHz for denoising (no-op when already 48kHz).
|
||||
let pcm_48k = self.resample(&pcm, rate, 48000)?;
|
||||
let denoiser = match dir {
|
||||
"to_sip" => &mut self.denoiser_to_sip,
|
||||
_ => &mut self.denoiser_to_browser,
|
||||
};
|
||||
let denoised = Self::denoise(denoiser, &pcm_48k);
|
||||
// Resample to target rate (no-op when target is 48kHz).
|
||||
let target_rate = codec_sample_rate(to_pt);
|
||||
self.resample(&denoised, 48000, target_rate)?
|
||||
} else {
|
||||
// No denoising — direct resample.
|
||||
let target_rate = codec_sample_rate(to_pt);
|
||||
if rate == target_rate { pcm } else { self.resample(&pcm, rate, target_rate)? }
|
||||
};
|
||||
|
||||
// Encode from PCM.
|
||||
self.encode_from_pcm(&processed, to_pt)
|
||||
}
|
||||
|
||||
fn decode_to_pcm(&mut self, data: &[u8], pt: u8) -> Result<(Vec<i16>, u32), String> {
|
||||
match pt {
|
||||
PT_OPUS => {
|
||||
let mut pcm = vec![0i16; 5760]; // up to 120ms at 48kHz (RFC 6716 max)
|
||||
let packet = OpusPacket::try_from(data)
|
||||
.map_err(|e| format!("opus packet: {e}"))?;
|
||||
let out = MutSignals::try_from(&mut pcm[..])
|
||||
.map_err(|e| format!("opus signals: {e}"))?;
|
||||
let n: usize = self.opus_dec.decode(Some(packet), out, false)
|
||||
.map_err(|e| format!("opus decode: {e}"))?.into();
|
||||
pcm.truncate(n);
|
||||
Ok((pcm, 48000))
|
||||
}
|
||||
PT_G722 => {
|
||||
let pcm = self.g722_dec.decode(data);
|
||||
Ok((pcm, 16000))
|
||||
}
|
||||
PT_PCMU => {
|
||||
let pcm: Vec<i16> = data.iter().map(|&b| mulaw_decode(b)).collect();
|
||||
Ok((pcm, 8000))
|
||||
}
|
||||
PT_PCMA => {
|
||||
let pcm: Vec<i16> = data.iter().map(|&b| alaw_decode(b)).collect();
|
||||
Ok((pcm, 8000))
|
||||
}
|
||||
_ => Err(format!("unsupported source PT {pt}")),
|
||||
}
|
||||
}
|
||||
|
||||
fn encode_from_pcm(&mut self, pcm: &[i16], pt: u8) -> Result<Vec<u8>, String> {
|
||||
match pt {
|
||||
PT_OPUS => {
|
||||
let mut buf = vec![0u8; 4000];
|
||||
let n: usize = self.opus_enc.encode(pcm, &mut buf)
|
||||
.map_err(|e| format!("opus encode: {e}"))?.into();
|
||||
buf.truncate(n);
|
||||
Ok(buf)
|
||||
}
|
||||
PT_G722 => {
|
||||
Ok(self.g722_enc.encode(pcm))
|
||||
}
|
||||
PT_PCMU => {
|
||||
Ok(pcm.iter().map(|&s| mulaw_encode(s)).collect())
|
||||
}
|
||||
PT_PCMA => {
|
||||
Ok(pcm.iter().map(|&s| alaw_encode(s)).collect())
|
||||
}
|
||||
_ => Err(format!("unsupported target PT {pt}")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn codec_sample_rate(pt: u8) -> u32 {
|
||||
match pt {
|
||||
PT_OPUS => 48000,
|
||||
PT_G722 => 16000,
|
||||
_ => 8000, // PCMU, PCMA
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// G.711 µ-law (PCMU)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn mulaw_encode(sample: i16) -> u8 {
|
||||
const BIAS: i16 = 0x84;
|
||||
const CLIP: i16 = 32635;
|
||||
let sign = if sample < 0 { 0x80u8 } else { 0 };
|
||||
// Use i32 to avoid overflow when sample == i16::MIN (-32768).
|
||||
let mut s = (sample as i32).unsigned_abs().min(CLIP as u32) as i16;
|
||||
s += BIAS;
|
||||
let mut exp = 7u8;
|
||||
let mut mask = 0x4000i16;
|
||||
while exp > 0 && (s & mask) == 0 { exp -= 1; mask >>= 1; }
|
||||
let mantissa = ((s >> (exp + 3)) & 0x0f) as u8;
|
||||
!(sign | (exp << 4) | mantissa)
|
||||
}
|
||||
|
||||
fn mulaw_decode(mulaw: u8) -> i16 {
|
||||
let v = !mulaw;
|
||||
let sign = v & 0x80;
|
||||
let exp = (v >> 4) & 0x07;
|
||||
let mantissa = v & 0x0f;
|
||||
let mut sample = (((mantissa as i16) << 4) + 0x84) << exp;
|
||||
sample -= 0x84;
|
||||
if sign != 0 { -sample } else { sample }
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// G.711 A-law (PCMA)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn alaw_encode(sample: i16) -> u8 {
|
||||
let sign = if sample >= 0 { 0x80u8 } else { 0 };
|
||||
// Use i32 to avoid overflow when sample == i16::MIN (-32768).
|
||||
let s = (sample as i32).unsigned_abs().min(32767) as i16;
|
||||
let mut exp = 7u8;
|
||||
let mut mask = 0x4000i16;
|
||||
while exp > 0 && (s & mask) == 0 { exp -= 1; mask >>= 1; }
|
||||
let mantissa = if exp > 0 { ((s >> (exp + 3)) & 0x0f) as u8 } else { ((s >> 4) & 0x0f) as u8 };
|
||||
(sign | (exp << 4) | mantissa) ^ 0x55
|
||||
}
|
||||
|
||||
fn alaw_decode(alaw: u8) -> i16 {
|
||||
let v = alaw ^ 0x55;
|
||||
let sign = v & 0x80;
|
||||
let exp = (v >> 4) & 0x07;
|
||||
let mantissa = v & 0x0f;
|
||||
let sample = if exp == 0 {
|
||||
((mantissa as i16) << 4) + 8
|
||||
} else {
|
||||
(((mantissa as i16) << 4) + 0x108) << (exp - 1)
|
||||
};
|
||||
if sign != 0 { sample } else { -sample }
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Main loop
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Resolve a session: if session_id is provided, look it up in the sessions map;
|
||||
/// otherwise fall back to the default state (backward compat with `init`).
|
||||
fn get_session<'a>(
|
||||
sessions: &'a mut HashMap<String, TranscodeState>,
|
||||
default: &'a mut Option<TranscodeState>,
|
||||
params: &serde_json::Value,
|
||||
) -> Option<&'a mut TranscodeState> {
|
||||
if let Some(sid) = params.get("session_id").and_then(|v| v.as_str()) {
|
||||
sessions.get_mut(sid)
|
||||
} else {
|
||||
default.as_mut()
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let stdin = io::stdin();
|
||||
let stdout = io::stdout();
|
||||
let mut out = io::BufWriter::new(stdout.lock());
|
||||
|
||||
let _ = writeln!(out, r#"{{"event":"ready","data":{{}}}}"#);
|
||||
let _ = out.flush();
|
||||
|
||||
// Default state for backward-compat `init` (no session_id).
|
||||
let mut default_state: Option<TranscodeState> = None;
|
||||
// Per-session codec state for concurrent call isolation.
|
||||
let mut sessions: HashMap<String, TranscodeState> = HashMap::new();
|
||||
|
||||
for line in stdin.lock().lines() {
|
||||
let line = match line {
|
||||
Ok(l) if !l.trim().is_empty() => l,
|
||||
Ok(_) => continue,
|
||||
Err(_) => break,
|
||||
};
|
||||
|
||||
let req: Request = match serde_json::from_str(&line) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
respond(&mut out, "", false, None, Some(&format!("parse: {e}")));
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match req.method.as_str() {
|
||||
// Backward-compat: init the default (shared) session.
|
||||
"init" => {
|
||||
match TranscodeState::new() {
|
||||
Ok(s) => {
|
||||
default_state = Some(s);
|
||||
respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
|
||||
}
|
||||
Err(e) => respond(&mut out, &req.id, false, None, Some(&e)),
|
||||
}
|
||||
}
|
||||
|
||||
// Create an isolated session with its own codec state.
|
||||
"create_session" => {
|
||||
let session_id = match req.params.get("session_id").and_then(|v| v.as_str()) {
|
||||
Some(s) => s.to_string(),
|
||||
None => { respond(&mut out, &req.id, false, None, Some("missing session_id")); continue; }
|
||||
};
|
||||
if sessions.contains_key(&session_id) {
|
||||
respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
|
||||
continue;
|
||||
}
|
||||
match TranscodeState::new() {
|
||||
Ok(s) => {
|
||||
sessions.insert(session_id, s);
|
||||
respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
|
||||
}
|
||||
Err(e) => respond(&mut out, &req.id, false, None, Some(&e)),
|
||||
}
|
||||
}
|
||||
|
||||
// Destroy a session, freeing its codec state.
|
||||
"destroy_session" => {
|
||||
let session_id = match req.params.get("session_id").and_then(|v| v.as_str()) {
|
||||
Some(s) => s,
|
||||
None => { respond(&mut out, &req.id, false, None, Some("missing session_id")); continue; }
|
||||
};
|
||||
sessions.remove(session_id);
|
||||
respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
|
||||
}
|
||||
|
||||
// Transcode: uses session_id if provided, else default state.
|
||||
"transcode" => {
|
||||
let st = match get_session(&mut sessions, &mut default_state, &req.params) {
|
||||
Some(s) => s,
|
||||
None => { respond(&mut out, &req.id, false, None, Some("not initialized (no session or default state)")); continue; }
|
||||
};
|
||||
let data_b64 = match req.params.get("data_b64").and_then(|v| v.as_str()) {
|
||||
Some(s) => s,
|
||||
None => { respond(&mut out, &req.id, false, None, Some("missing data_b64")); continue; }
|
||||
};
|
||||
let from_pt = req.params.get("from_pt").and_then(|v| v.as_u64()).unwrap_or(0) as u8;
|
||||
let to_pt = req.params.get("to_pt").and_then(|v| v.as_u64()).unwrap_or(0) as u8;
|
||||
let direction = req.params.get("direction").and_then(|v| v.as_str());
|
||||
|
||||
let data = match B64.decode(data_b64) {
|
||||
Ok(b) => b,
|
||||
Err(e) => { respond(&mut out, &req.id, false, None, Some(&format!("b64: {e}"))); continue; }
|
||||
};
|
||||
|
||||
match st.transcode(&data, from_pt, to_pt, direction) {
|
||||
Ok(result) => {
|
||||
respond(&mut out, &req.id, true, Some(serde_json::json!({ "data_b64": B64.encode(&result) })), None);
|
||||
}
|
||||
Err(e) => respond(&mut out, &req.id, false, None, Some(&e)),
|
||||
}
|
||||
}
|
||||
|
||||
// Encode raw 16-bit PCM to a target codec.
|
||||
// Params: data_b64 (raw PCM bytes, 16-bit LE), sample_rate (input Hz), to_pt
|
||||
// Optional: session_id for isolated codec state.
|
||||
"encode_pcm" => {
|
||||
let st = match get_session(&mut sessions, &mut default_state, &req.params) {
|
||||
Some(s) => s,
|
||||
None => { respond(&mut out, &req.id, false, None, Some("not initialized (no session or default state)")); continue; }
|
||||
};
|
||||
let data_b64 = match req.params.get("data_b64").and_then(|v| v.as_str()) {
|
||||
Some(s) => s,
|
||||
None => { respond(&mut out, &req.id, false, None, Some("missing data_b64")); continue; }
|
||||
};
|
||||
let sample_rate = req.params.get("sample_rate").and_then(|v| v.as_u64()).unwrap_or(22050) as u32;
|
||||
let to_pt = req.params.get("to_pt").and_then(|v| v.as_u64()).unwrap_or(9) as u8;
|
||||
|
||||
let data = match B64.decode(data_b64) {
|
||||
Ok(b) => b,
|
||||
Err(e) => { respond(&mut out, &req.id, false, None, Some(&format!("b64: {e}"))); continue; }
|
||||
};
|
||||
|
||||
if data.len() % 2 != 0 {
|
||||
respond(&mut out, &req.id, false, None, Some("PCM data has odd byte count (expected 16-bit LE samples)"));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Convert raw bytes to i16 samples.
|
||||
let pcm: Vec<i16> = data.chunks_exact(2)
|
||||
.map(|c| i16::from_le_bytes([c[0], c[1]]))
|
||||
.collect();
|
||||
|
||||
// Resample to target codec's sample rate.
|
||||
let target_rate = codec_sample_rate(to_pt);
|
||||
let resampled = match st.resample(&pcm, sample_rate, target_rate) {
|
||||
Ok(r) => r,
|
||||
Err(e) => { respond(&mut out, &req.id, false, None, Some(&e)); continue; }
|
||||
};
|
||||
|
||||
// Encode to target codec (reuse encode_from_pcm).
|
||||
match st.encode_from_pcm(&resampled, to_pt) {
|
||||
Ok(encoded) => {
|
||||
respond(&mut out, &req.id, true, Some(serde_json::json!({ "data_b64": B64.encode(&encoded) })), None);
|
||||
}
|
||||
Err(e) => { respond(&mut out, &req.id, false, None, Some(&e)); continue; }
|
||||
}
|
||||
}
|
||||
|
||||
// Legacy commands (kept for backward compat).
|
||||
"encode" | "decode" => {
|
||||
respond(&mut out, &req.id, false, None, Some("use 'transcode' command instead"));
|
||||
}
|
||||
|
||||
_ => respond(&mut out, &req.id, false, None, Some(&format!("unknown: {}", req.method))),
|
||||
}
|
||||
}
|
||||
}
|
||||
18
rust/crates/tts-engine/Cargo.toml
Normal file
18
rust/crates/tts-engine/Cargo.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[package]
|
||||
name = "tts-engine"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[[bin]]
|
||||
name = "tts-engine"
|
||||
path = "src/main.rs"
|
||||
|
||||
[dependencies]
|
||||
kokoro-tts = { version = "0.3", default-features = false }
|
||||
# Pin to rc.11 matching kokoro-tts's expectation; enable vendored TLS to avoid system libssl-dev.
|
||||
ort = { version = "=2.0.0-rc.11", default-features = false, features = [
|
||||
"std", "download-binaries", "copy-dylibs", "ndarray",
|
||||
"tls-native-vendored"
|
||||
] }
|
||||
tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
|
||||
hound = "3.5"
|
||||
149
rust/crates/tts-engine/src/main.rs
Normal file
149
rust/crates/tts-engine/src/main.rs
Normal file
@@ -0,0 +1,149 @@
|
||||
/// TTS engine CLI — synthesizes text to a WAV file using Kokoro neural TTS.
|
||||
///
|
||||
/// Usage:
|
||||
/// echo "Hello world" | tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav
|
||||
/// tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav --text "Hello world"
|
||||
///
|
||||
/// Outputs 24kHz 16-bit mono WAV.
|
||||
|
||||
use kokoro_tts::{KokoroTts, Voice};
|
||||
use std::io::{self, Read};
|
||||
|
||||
fn parse_args() -> Result<(String, String, String, String, Option<String>), String> {
|
||||
let args: Vec<String> = std::env::args().collect();
|
||||
let mut model = String::new();
|
||||
let mut voices = String::new();
|
||||
let mut output = String::new();
|
||||
let mut text: Option<String> = None;
|
||||
let mut voice_name: Option<String> = None;
|
||||
|
||||
let mut i = 1;
|
||||
while i < args.len() {
|
||||
match args[i].as_str() {
|
||||
"--model" => { i += 1; model = args.get(i).cloned().unwrap_or_default(); }
|
||||
"--voices" => { i += 1; voices = args.get(i).cloned().unwrap_or_default(); }
|
||||
"--output" | "--output_file" => { i += 1; output = args.get(i).cloned().unwrap_or_default(); }
|
||||
"--text" => { i += 1; text = args.get(i).cloned(); }
|
||||
"--voice" => { i += 1; voice_name = args.get(i).cloned(); }
|
||||
_ => {}
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
|
||||
if model.is_empty() { return Err("--model required".into()); }
|
||||
if voices.is_empty() { return Err("--voices required".into()); }
|
||||
if output.is_empty() { return Err("--output required".into()); }
|
||||
|
||||
let voice_str = voice_name.unwrap_or_else(|| "af_bella".into());
|
||||
|
||||
Ok((model, voices, output, voice_str, text))
|
||||
}
|
||||
|
||||
fn select_voice(name: &str) -> Voice {
|
||||
match name {
|
||||
"af_bella" => Voice::AfBella(1.0),
|
||||
"af_heart" => Voice::AfHeart(1.0),
|
||||
"af_jessica" => Voice::AfJessica(1.0),
|
||||
"af_nicole" => Voice::AfNicole(1.0),
|
||||
"af_nova" => Voice::AfNova(1.0),
|
||||
"af_sarah" => Voice::AfSarah(1.0),
|
||||
"af_sky" => Voice::AfSky(1.0),
|
||||
"af_river" => Voice::AfRiver(1.0),
|
||||
"af_alloy" => Voice::AfAlloy(1.0),
|
||||
"af_aoede" => Voice::AfAoede(1.0),
|
||||
"af_kore" => Voice::AfKore(1.0),
|
||||
"am_adam" => Voice::AmAdam(1.0),
|
||||
"am_echo" => Voice::AmEcho(1.0),
|
||||
"am_eric" => Voice::AmEric(1.0),
|
||||
"am_fenrir" => Voice::AmFenrir(1.0),
|
||||
"am_liam" => Voice::AmLiam(1.0),
|
||||
"am_michael" => Voice::AmMichael(1.0),
|
||||
"am_onyx" => Voice::AmOnyx(1.0),
|
||||
"am_puck" => Voice::AmPuck(1.0),
|
||||
"bf_alice" => Voice::BfAlice(1.0),
|
||||
"bf_emma" => Voice::BfEmma(1.0),
|
||||
"bf_isabella" => Voice::BfIsabella(1.0),
|
||||
"bf_lily" => Voice::BfLily(1.0),
|
||||
"bm_daniel" => Voice::BmDaniel(1.0),
|
||||
"bm_fable" => Voice::BmFable(1.0),
|
||||
"bm_george" => Voice::BmGeorge(1.0),
|
||||
"bm_lewis" => Voice::BmLewis(1.0),
|
||||
_ => {
|
||||
eprintln!("[tts-engine] unknown voice '{}', falling back to af_bella", name);
|
||||
Voice::AfBella(1.0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let (model_path, voices_path, output_path, voice_name, text_arg) = match parse_args() {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
eprintln!("Error: {}", e);
|
||||
eprintln!("Usage: tts-engine --model <model.onnx> --voices <voices.bin> --output <output.wav> [--text <text>] [--voice <voice_name>]");
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
// Get text from --text arg or stdin.
|
||||
let text = match text_arg {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
let mut buf = String::new();
|
||||
io::stdin().read_to_string(&mut buf).expect("failed to read stdin");
|
||||
buf.trim().to_string()
|
||||
}
|
||||
};
|
||||
|
||||
if text.is_empty() {
|
||||
eprintln!("[tts-engine] no text provided");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
eprintln!("[tts-engine] loading model: {}", model_path);
|
||||
let tts = match KokoroTts::new(&model_path, &voices_path).await {
|
||||
Ok(t) => t,
|
||||
Err(e) => {
|
||||
eprintln!("[tts-engine] failed to load model: {:?}", e);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
let voice = select_voice(&voice_name);
|
||||
eprintln!("[tts-engine] synthesizing with voice '{}': \"{}\"", voice_name, text);
|
||||
|
||||
let (samples, duration) = match tts.synth(&text, voice).await {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
eprintln!("[tts-engine] synthesis failed: {:?}", e);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
eprintln!("[tts-engine] synthesized {} samples in {:?}", samples.len(), duration);
|
||||
|
||||
// Write WAV: 24kHz, 16-bit, mono (same format announcement.ts expects).
|
||||
let spec = hound::WavSpec {
|
||||
channels: 1,
|
||||
sample_rate: 24000,
|
||||
bits_per_sample: 16,
|
||||
sample_format: hound::SampleFormat::Int,
|
||||
};
|
||||
|
||||
let mut writer = match hound::WavWriter::create(&output_path, spec) {
|
||||
Ok(w) => w,
|
||||
Err(e) => {
|
||||
eprintln!("[tts-engine] failed to create WAV: {}", e);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
for &sample in &samples {
|
||||
let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16;
|
||||
writer.write_sample(s16).unwrap();
|
||||
}
|
||||
writer.finalize().unwrap();
|
||||
|
||||
eprintln!("[tts-engine] wrote {}", output_path);
|
||||
}
|
||||
Reference in New Issue
Block a user