feat(proxy-engine): integrate Kokoro TTS generation into proxy-engine and simplify TypeScript prompt handling to use cached WAV files
This commit is contained in:
@@ -1,5 +1,12 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## 2026-04-10 - 1.16.0 - feat(proxy-engine)
|
||||||
|
integrate Kokoro TTS generation into proxy-engine and simplify TypeScript prompt handling to use cached WAV files
|
||||||
|
|
||||||
|
- adds a generate_tts command to proxy-engine with lazy-loaded Kokoro model support and WAV output generation
|
||||||
|
- removes standalone opus-codec and tts-engine workspace binaries by consolidating TTS generation into proxy-engine
|
||||||
|
- updates announcement and prompt cache flows to generate and cache WAV files on disk instead of pre-encoding RTP frames in TypeScript
|
||||||
|
|
||||||
## 2026-04-10 - 1.15.0 - feat(proxy-engine)
|
## 2026-04-10 - 1.15.0 - feat(proxy-engine)
|
||||||
add device leg, leg transfer, and leg replacement call controls
|
add device leg, leg transfer, and leg replacement call controls
|
||||||
|
|
||||||
|
|||||||
22
rust/Cargo.lock
generated
22
rust/Cargo.lock
generated
@@ -1881,16 +1881,6 @@ dependencies = [
|
|||||||
"vcpkg",
|
"vcpkg",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "opus-codec"
|
|
||||||
version = "0.2.0"
|
|
||||||
dependencies = [
|
|
||||||
"base64 0.22.1",
|
|
||||||
"codec-lib",
|
|
||||||
"serde",
|
|
||||||
"serde_json",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ort"
|
name = "ort"
|
||||||
version = "2.0.0-rc.11"
|
version = "2.0.0-rc.11"
|
||||||
@@ -2188,6 +2178,8 @@ dependencies = [
|
|||||||
"base64 0.22.1",
|
"base64 0.22.1",
|
||||||
"codec-lib",
|
"codec-lib",
|
||||||
"hound",
|
"hound",
|
||||||
|
"kokoro-tts",
|
||||||
|
"ort",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
"regex-lite",
|
"regex-lite",
|
||||||
"serde",
|
"serde",
|
||||||
@@ -3008,16 +3000,6 @@ dependencies = [
|
|||||||
"strength_reduce",
|
"strength_reduce",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "tts-engine"
|
|
||||||
version = "0.1.0"
|
|
||||||
dependencies = [
|
|
||||||
"hound",
|
|
||||||
"kokoro-tts",
|
|
||||||
"ort",
|
|
||||||
"tokio",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "turn"
|
name = "turn"
|
||||||
version = "0.6.1"
|
version = "0.6.1"
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
[workspace]
|
[workspace]
|
||||||
members = [
|
members = [
|
||||||
"crates/codec-lib",
|
"crates/codec-lib",
|
||||||
"crates/opus-codec",
|
|
||||||
"crates/tts-engine",
|
|
||||||
"crates/sip-proto",
|
"crates/sip-proto",
|
||||||
"crates/proxy-engine",
|
"crates/proxy-engine",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
//! Audio codec library for the SIP router.
|
//! Audio codec library for the SIP router.
|
||||||
//!
|
//!
|
||||||
//! Handles Opus ↔ G.722 ↔ PCMU/PCMA transcoding with ML noise suppression.
|
//! Handles Opus ↔ G.722 ↔ PCMU/PCMA transcoding with ML noise suppression.
|
||||||
//! Used by both the standalone `opus-codec` CLI and the `proxy-engine` binary.
|
//! Used by the `proxy-engine` binary for all audio transcoding.
|
||||||
|
|
||||||
use audiopus::coder::{Decoder as OpusDecoder, Encoder as OpusEncoder};
|
use audiopus::coder::{Decoder as OpusDecoder, Encoder as OpusEncoder};
|
||||||
use audiopus::packet::Packet as OpusPacket;
|
use audiopus::packet::Packet as OpusPacket;
|
||||||
|
|||||||
@@ -1,14 +0,0 @@
|
|||||||
[package]
|
|
||||||
name = "opus-codec"
|
|
||||||
version = "0.2.0"
|
|
||||||
edition = "2021"
|
|
||||||
|
|
||||||
[[bin]]
|
|
||||||
name = "opus-codec"
|
|
||||||
path = "src/main.rs"
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
codec-lib = { path = "../codec-lib" }
|
|
||||||
serde = { version = "1", features = ["derive"] }
|
|
||||||
serde_json = "1"
|
|
||||||
base64 = "0.22"
|
|
||||||
@@ -1,286 +0,0 @@
|
|||||||
/// Audio transcoding bridge for smartrust.
|
|
||||||
///
|
|
||||||
/// Thin CLI wrapper around `codec-lib`. Handles Opus ↔ G.722 ↔ PCMU transcoding.
|
|
||||||
///
|
|
||||||
/// Protocol:
|
|
||||||
/// -> {"id":"1","method":"init","params":{}}
|
|
||||||
/// <- {"id":"1","success":true,"result":{}}
|
|
||||||
/// -> {"id":"2","method":"create_session","params":{"session_id":"call-abc"}}
|
|
||||||
/// <- {"id":"2","success":true,"result":{}}
|
|
||||||
/// -> {"id":"3","method":"transcode","params":{"session_id":"call-abc","data_b64":"...","from_pt":111,"to_pt":9}}
|
|
||||||
/// <- {"id":"3","success":true,"result":{"data_b64":"..."}}
|
|
||||||
/// -> {"id":"4","method":"destroy_session","params":{"session_id":"call-abc"}}
|
|
||||||
/// <- {"id":"4","success":true,"result":{}}
|
|
||||||
|
|
||||||
use base64::engine::general_purpose::STANDARD as B64;
|
|
||||||
use base64::Engine as _;
|
|
||||||
use codec_lib::{codec_sample_rate, TranscodeState};
|
|
||||||
use serde::Deserialize;
|
|
||||||
use std::collections::HashMap;
|
|
||||||
use std::io::{self, BufRead, Write};
|
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
struct Request {
|
|
||||||
id: String,
|
|
||||||
method: String,
|
|
||||||
#[serde(default)]
|
|
||||||
params: serde_json::Value,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn respond(
|
|
||||||
out: &mut impl Write,
|
|
||||||
id: &str,
|
|
||||||
success: bool,
|
|
||||||
result: Option<serde_json::Value>,
|
|
||||||
error: Option<&str>,
|
|
||||||
) {
|
|
||||||
let mut resp = serde_json::json!({ "id": id, "success": success });
|
|
||||||
if let Some(r) = result {
|
|
||||||
resp["result"] = r;
|
|
||||||
}
|
|
||||||
if let Some(e) = error {
|
|
||||||
resp["error"] = serde_json::Value::String(e.to_string());
|
|
||||||
}
|
|
||||||
let _ = writeln!(out, "{}", resp);
|
|
||||||
let _ = out.flush();
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Resolve a session: if session_id is provided, look it up in the sessions map;
|
|
||||||
/// otherwise fall back to the default state (backward compat with `init`).
|
|
||||||
fn get_session<'a>(
|
|
||||||
sessions: &'a mut HashMap<String, TranscodeState>,
|
|
||||||
default: &'a mut Option<TranscodeState>,
|
|
||||||
params: &serde_json::Value,
|
|
||||||
) -> Option<&'a mut TranscodeState> {
|
|
||||||
if let Some(sid) = params.get("session_id").and_then(|v| v.as_str()) {
|
|
||||||
sessions.get_mut(sid)
|
|
||||||
} else {
|
|
||||||
default.as_mut()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn main() {
|
|
||||||
let stdin = io::stdin();
|
|
||||||
let stdout = io::stdout();
|
|
||||||
let mut out = io::BufWriter::new(stdout.lock());
|
|
||||||
|
|
||||||
let _ = writeln!(out, r#"{{"event":"ready","data":{{}}}}"#);
|
|
||||||
let _ = out.flush();
|
|
||||||
|
|
||||||
let mut default_state: Option<TranscodeState> = None;
|
|
||||||
let mut sessions: HashMap<String, TranscodeState> = HashMap::new();
|
|
||||||
|
|
||||||
for line in stdin.lock().lines() {
|
|
||||||
let line = match line {
|
|
||||||
Ok(l) if !l.trim().is_empty() => l,
|
|
||||||
Ok(_) => continue,
|
|
||||||
Err(_) => break,
|
|
||||||
};
|
|
||||||
|
|
||||||
let req: Request = match serde_json::from_str(&line) {
|
|
||||||
Ok(r) => r,
|
|
||||||
Err(e) => {
|
|
||||||
respond(&mut out, "", false, None, Some(&format!("parse: {e}")));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
match req.method.as_str() {
|
|
||||||
"init" => match TranscodeState::new() {
|
|
||||||
Ok(s) => {
|
|
||||||
default_state = Some(s);
|
|
||||||
respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
|
|
||||||
}
|
|
||||||
Err(e) => respond(&mut out, &req.id, false, None, Some(&e)),
|
|
||||||
},
|
|
||||||
|
|
||||||
"create_session" => {
|
|
||||||
let session_id = match req.params.get("session_id").and_then(|v| v.as_str()) {
|
|
||||||
Some(s) => s.to_string(),
|
|
||||||
None => {
|
|
||||||
respond(&mut out, &req.id, false, None, Some("missing session_id"));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
if sessions.contains_key(&session_id) {
|
|
||||||
respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
match TranscodeState::new() {
|
|
||||||
Ok(s) => {
|
|
||||||
sessions.insert(session_id, s);
|
|
||||||
respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
|
|
||||||
}
|
|
||||||
Err(e) => respond(&mut out, &req.id, false, None, Some(&e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
"destroy_session" => {
|
|
||||||
let session_id = match req.params.get("session_id").and_then(|v| v.as_str()) {
|
|
||||||
Some(s) => s,
|
|
||||||
None => {
|
|
||||||
respond(&mut out, &req.id, false, None, Some("missing session_id"));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
sessions.remove(session_id);
|
|
||||||
respond(&mut out, &req.id, true, Some(serde_json::json!({})), None);
|
|
||||||
}
|
|
||||||
|
|
||||||
"transcode" => {
|
|
||||||
let st = match get_session(&mut sessions, &mut default_state, &req.params) {
|
|
||||||
Some(s) => s,
|
|
||||||
None => {
|
|
||||||
respond(
|
|
||||||
&mut out,
|
|
||||||
&req.id,
|
|
||||||
false,
|
|
||||||
None,
|
|
||||||
Some("not initialized (no session or default state)"),
|
|
||||||
);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let data_b64 = match req.params.get("data_b64").and_then(|v| v.as_str()) {
|
|
||||||
Some(s) => s,
|
|
||||||
None => {
|
|
||||||
respond(&mut out, &req.id, false, None, Some("missing data_b64"));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let from_pt =
|
|
||||||
req.params.get("from_pt").and_then(|v| v.as_u64()).unwrap_or(0) as u8;
|
|
||||||
let to_pt = req.params.get("to_pt").and_then(|v| v.as_u64()).unwrap_or(0) as u8;
|
|
||||||
let direction = req.params.get("direction").and_then(|v| v.as_str());
|
|
||||||
|
|
||||||
let data = match B64.decode(data_b64) {
|
|
||||||
Ok(b) => b,
|
|
||||||
Err(e) => {
|
|
||||||
respond(
|
|
||||||
&mut out,
|
|
||||||
&req.id,
|
|
||||||
false,
|
|
||||||
None,
|
|
||||||
Some(&format!("b64: {e}")),
|
|
||||||
);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
match st.transcode(&data, from_pt, to_pt, direction) {
|
|
||||||
Ok(result) => {
|
|
||||||
respond(
|
|
||||||
&mut out,
|
|
||||||
&req.id,
|
|
||||||
true,
|
|
||||||
Some(serde_json::json!({ "data_b64": B64.encode(&result) })),
|
|
||||||
None,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Err(e) => respond(&mut out, &req.id, false, None, Some(&e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
"encode_pcm" => {
|
|
||||||
let st = match get_session(&mut sessions, &mut default_state, &req.params) {
|
|
||||||
Some(s) => s,
|
|
||||||
None => {
|
|
||||||
respond(
|
|
||||||
&mut out,
|
|
||||||
&req.id,
|
|
||||||
false,
|
|
||||||
None,
|
|
||||||
Some("not initialized (no session or default state)"),
|
|
||||||
);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let data_b64 = match req.params.get("data_b64").and_then(|v| v.as_str()) {
|
|
||||||
Some(s) => s,
|
|
||||||
None => {
|
|
||||||
respond(&mut out, &req.id, false, None, Some("missing data_b64"));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let sample_rate = req
|
|
||||||
.params
|
|
||||||
.get("sample_rate")
|
|
||||||
.and_then(|v| v.as_u64())
|
|
||||||
.unwrap_or(22050) as u32;
|
|
||||||
let to_pt = req.params.get("to_pt").and_then(|v| v.as_u64()).unwrap_or(9) as u8;
|
|
||||||
|
|
||||||
let data = match B64.decode(data_b64) {
|
|
||||||
Ok(b) => b,
|
|
||||||
Err(e) => {
|
|
||||||
respond(
|
|
||||||
&mut out,
|
|
||||||
&req.id,
|
|
||||||
false,
|
|
||||||
None,
|
|
||||||
Some(&format!("b64: {e}")),
|
|
||||||
);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
if data.len() % 2 != 0 {
|
|
||||||
respond(
|
|
||||||
&mut out,
|
|
||||||
&req.id,
|
|
||||||
false,
|
|
||||||
None,
|
|
||||||
Some("PCM data has odd byte count (expected 16-bit LE samples)"),
|
|
||||||
);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let pcm: Vec<i16> = data
|
|
||||||
.chunks_exact(2)
|
|
||||||
.map(|c| i16::from_le_bytes([c[0], c[1]]))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let target_rate = codec_sample_rate(to_pt);
|
|
||||||
let resampled = match st.resample(&pcm, sample_rate, target_rate) {
|
|
||||||
Ok(r) => r,
|
|
||||||
Err(e) => {
|
|
||||||
respond(&mut out, &req.id, false, None, Some(&e));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
match st.encode_from_pcm(&resampled, to_pt) {
|
|
||||||
Ok(encoded) => {
|
|
||||||
respond(
|
|
||||||
&mut out,
|
|
||||||
&req.id,
|
|
||||||
true,
|
|
||||||
Some(serde_json::json!({ "data_b64": B64.encode(&encoded) })),
|
|
||||||
None,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
respond(&mut out, &req.id, false, None, Some(&e));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
"encode" | "decode" => {
|
|
||||||
respond(
|
|
||||||
&mut out,
|
|
||||||
&req.id,
|
|
||||||
false,
|
|
||||||
None,
|
|
||||||
Some("use 'transcode' command instead"),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
_ => respond(
|
|
||||||
&mut out,
|
|
||||||
&req.id,
|
|
||||||
false,
|
|
||||||
None,
|
|
||||||
Some(&format!("unknown: {}", req.method)),
|
|
||||||
),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -18,3 +18,8 @@ regex-lite = "0.1"
|
|||||||
webrtc = "0.8"
|
webrtc = "0.8"
|
||||||
rand = "0.8"
|
rand = "0.8"
|
||||||
hound = "3.5"
|
hound = "3.5"
|
||||||
|
kokoro-tts = { version = "0.3", default-features = false }
|
||||||
|
ort = { version = "=2.0.0-rc.11", default-features = false, features = [
|
||||||
|
"std", "download-binaries", "copy-dylibs", "ndarray",
|
||||||
|
"tls-native-vendored"
|
||||||
|
] }
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ mod rtp;
|
|||||||
mod sip_leg;
|
mod sip_leg;
|
||||||
mod sip_transport;
|
mod sip_transport;
|
||||||
mod tool_leg;
|
mod tool_leg;
|
||||||
|
mod tts;
|
||||||
mod voicemail;
|
mod voicemail;
|
||||||
mod webrtc_engine;
|
mod webrtc_engine;
|
||||||
|
|
||||||
@@ -93,6 +94,9 @@ async fn main() {
|
|||||||
// WebRTC engine — separate lock to avoid deadlock with SIP handlers.
|
// WebRTC engine — separate lock to avoid deadlock with SIP handlers.
|
||||||
let webrtc = Arc::new(Mutex::new(WebRtcEngine::new(out_tx.clone())));
|
let webrtc = Arc::new(Mutex::new(WebRtcEngine::new(out_tx.clone())));
|
||||||
|
|
||||||
|
// TTS engine — separate lock, lazy-loads model on first use.
|
||||||
|
let tts_engine = Arc::new(Mutex::new(tts::TtsEngine::new()));
|
||||||
|
|
||||||
// Read commands from stdin.
|
// Read commands from stdin.
|
||||||
let stdin = tokio::io::stdin();
|
let stdin = tokio::io::stdin();
|
||||||
let reader = BufReader::new(stdin);
|
let reader = BufReader::new(stdin);
|
||||||
@@ -113,11 +117,12 @@ async fn main() {
|
|||||||
|
|
||||||
let engine = engine.clone();
|
let engine = engine.clone();
|
||||||
let webrtc = webrtc.clone();
|
let webrtc = webrtc.clone();
|
||||||
|
let tts_engine = tts_engine.clone();
|
||||||
let out_tx = out_tx.clone();
|
let out_tx = out_tx.clone();
|
||||||
|
|
||||||
// Handle commands — some are async, so we spawn.
|
// Handle commands — some are async, so we spawn.
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
handle_command(engine, webrtc, &out_tx, cmd).await;
|
handle_command(engine, webrtc, tts_engine, &out_tx, cmd).await;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -125,6 +130,7 @@ async fn main() {
|
|||||||
async fn handle_command(
|
async fn handle_command(
|
||||||
engine: Arc<Mutex<ProxyEngine>>,
|
engine: Arc<Mutex<ProxyEngine>>,
|
||||||
webrtc: Arc<Mutex<WebRtcEngine>>,
|
webrtc: Arc<Mutex<WebRtcEngine>>,
|
||||||
|
tts_engine: Arc<Mutex<tts::TtsEngine>>,
|
||||||
out_tx: &OutTx,
|
out_tx: &OutTx,
|
||||||
cmd: Command,
|
cmd: Command,
|
||||||
) {
|
) {
|
||||||
@@ -150,6 +156,8 @@ async fn handle_command(
|
|||||||
"add_tool_leg" => handle_add_tool_leg(engine, out_tx, &cmd).await,
|
"add_tool_leg" => handle_add_tool_leg(engine, out_tx, &cmd).await,
|
||||||
"remove_tool_leg" => handle_remove_tool_leg(engine, out_tx, &cmd).await,
|
"remove_tool_leg" => handle_remove_tool_leg(engine, out_tx, &cmd).await,
|
||||||
"set_leg_metadata" => handle_set_leg_metadata(engine, out_tx, &cmd).await,
|
"set_leg_metadata" => handle_set_leg_metadata(engine, out_tx, &cmd).await,
|
||||||
|
// TTS command — lock tts_engine only (no SIP/WebRTC contention).
|
||||||
|
"generate_tts" => handle_generate_tts(tts_engine, out_tx, &cmd).await,
|
||||||
_ => respond_err(out_tx, &cmd.id, &format!("unknown command: {}", cmd.method)),
|
_ => respond_err(out_tx, &cmd.id, &format!("unknown command: {}", cmd.method)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1218,3 +1226,16 @@ async fn handle_set_leg_metadata(
|
|||||||
leg.metadata.insert(key, value);
|
leg.metadata.insert(key, value);
|
||||||
respond_ok(out_tx, &cmd.id, serde_json::json!({}));
|
respond_ok(out_tx, &cmd.id, serde_json::json!({}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Handle `generate_tts` — synthesize text to a WAV file using Kokoro TTS.
///
/// Locks the shared TTS engine, forwards `cmd.params` to `TtsEngine::generate`,
/// and reports the outcome on `out_tx` under `cmd.id`: the returned JSON via
/// `respond_ok`, or the error string via `respond_err`.
|
||||||
|
async fn handle_generate_tts(
|
||||||
|
tts_engine: Arc<Mutex<tts::TtsEngine>>,
|
||||||
|
out_tx: &OutTx,
|
||||||
|
cmd: &Command,
|
||||||
|
) {
|
||||||
|
// The guard lives until the end of this function, so the lock is held for the
// whole generate call (model load, synthesis, WAV write); concurrent
// `generate_tts` commands therefore run one at a time.
let mut tts = tts_engine.lock().await;
|
||||||
|
match tts.generate(&cmd.params).await {
|
||||||
|
Ok(result) => respond_ok(out_tx, &cmd.id, result),
|
||||||
|
Err(e) => respond_err(out_tx, &cmd.id, &e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
138
rust/crates/proxy-engine/src/tts.rs
Normal file
138
rust/crates/proxy-engine/src/tts.rs
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
//! Text-to-speech engine — synthesizes text to WAV files using Kokoro neural TTS.
|
||||||
|
//!
|
||||||
|
//! The model is loaded lazily on the first `generate` call and reloaded when the
|
||||||
|
//! model/voices paths change. If either file is not present, `generate` returns
|
||||||
|
//! an error and the TS side decides how to proceed (e.g. by using espeak-ng).
|
||||||
|
|
||||||
|
use kokoro_tts::{KokoroTts, Voice};
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
/// Wraps the Kokoro TTS engine with lazy model loading.
///
/// A fresh engine holds no model; `generate` loads one on demand and records
/// the paths it was loaded from so a later call with different paths reloads.
|
||||||
|
pub struct TtsEngine {
|
||||||
|
/// Loaded Kokoro model, or `None` until the first successful load.
tts: Option<KokoroTts>,
|
||||||
|
/// Path that was used to load the current model (for cache invalidation).
/// Empty until a model has been loaded.
|
||||||
|
loaded_model_path: String,
|
||||||
|
/// Voices file path that was used to load the current model; compared
/// together with `loaded_model_path` to decide whether to reload.
loaded_voices_path: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TtsEngine {
|
||||||
|
/// Create an engine with no model loaded; loading is deferred to `generate`.
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
tts: None,
|
||||||
|
loaded_model_path: String::new(),
|
||||||
|
loaded_voices_path: String::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generate a WAV file from text.
|
||||||
|
///
|
||||||
|
/// Params (from IPC JSON):
|
||||||
|
/// - `model`: path to the ONNX model file (required)
|
||||||
|
/// - `voices`: path to the voices.bin file (required)
|
||||||
|
/// - `voice`: voice name (e.g. "af_bella"); optional, defaults to "af_bella"
|
||||||
|
/// - `text`: text to synthesize (required, must be non-empty)
|
||||||
|
/// - `output`: output WAV file path (required)
///
/// Returns `{ "output": <output path> }` on success.
///
/// # Errors
/// Returns a `String` message when a required param is missing or not a
/// string, `text` is empty, the model or voices file does not exist, or
/// model loading, synthesis, or WAV creation/writing/finalizing fails.
|
||||||
|
pub async fn generate(&mut self, params: &serde_json::Value) -> Result<serde_json::Value, String> {
|
||||||
|
let model_path = params.get("model").and_then(|v| v.as_str())
|
||||||
|
.ok_or("missing 'model' param")?;
|
||||||
|
let voices_path = params.get("voices").and_then(|v| v.as_str())
|
||||||
|
.ok_or("missing 'voices' param")?;
|
||||||
|
let voice_name = params.get("voice").and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("af_bella");
|
||||||
|
let text = params.get("text").and_then(|v| v.as_str())
|
||||||
|
.ok_or("missing 'text' param")?;
|
||||||
|
let output_path = params.get("output").and_then(|v| v.as_str())
|
||||||
|
.ok_or("missing 'output' param")?;
|
||||||
|
|
||||||
|
if text.is_empty() {
|
||||||
|
return Err("empty text".into());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check that model/voices files exist.
// Done on every call, before any (re)load is attempted.
|
||||||
|
if !Path::new(model_path).exists() {
|
||||||
|
return Err(format!("model not found: {model_path}"));
|
||||||
|
}
|
||||||
|
if !Path::new(voices_path).exists() {
|
||||||
|
return Err(format!("voices not found: {voices_path}"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Lazy-load or reload if paths changed.
|
||||||
|
if self.tts.is_none()
|
||||||
|
|| self.loaded_model_path != model_path
|
||||||
|
|| self.loaded_voices_path != voices_path
|
||||||
|
{
|
||||||
|
eprintln!("[tts] loading model: {model_path}");
|
||||||
|
let tts = KokoroTts::new(model_path, voices_path)
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("model load failed: {e:?}"))?;
|
||||||
|
// A failed load returns above via `?` before any field is touched, so a
// previously loaded model (if any) and its recorded paths stay in place.
self.tts = Some(tts);
|
||||||
|
self.loaded_model_path = model_path.to_string();
|
||||||
|
self.loaded_voices_path = voices_path.to_string();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Invariant: if `self.tts` was `None`, the branch above either stored a
// model or returned early, so this `unwrap` cannot fail.
let tts = self.tts.as_ref().unwrap();
|
||||||
|
// Unknown voice names are mapped to af_bella by `select_voice`.
let voice = select_voice(voice_name);
|
||||||
|
|
||||||
|
eprintln!("[tts] synthesizing voice '{voice_name}': \"{text}\"");
|
||||||
|
let (samples, duration) = tts.synth(text, voice)
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("synthesis failed: {e:?}"))?;
|
||||||
|
eprintln!("[tts] synthesized {} samples in {duration:?}", samples.len());
|
||||||
|
|
||||||
|
// Write 24kHz 16-bit mono WAV.
// NOTE(review): the 24000 Hz rate is hard-coded; presumably it matches the
// model's output rate — confirm against the kokoro-tts documentation.
|
||||||
|
let spec = hound::WavSpec {
|
||||||
|
channels: 1,
|
||||||
|
sample_rate: 24000,
|
||||||
|
bits_per_sample: 16,
|
||||||
|
sample_format: hound::SampleFormat::Int,
|
||||||
|
};
|
||||||
|
|
||||||
|
// NOTE(review): the file is created directly at `output_path`; if a later
// write or finalize fails, a partial file may remain there — confirm that
// callers do not treat such a file as a valid cached WAV.
let mut writer = hound::WavWriter::create(output_path, spec)
|
||||||
|
.map_err(|e| format!("WAV create failed: {e}"))?;
|
||||||
|
for &sample in &samples {
|
||||||
|
// Scale to 16-bit: multiply by 32767, round, and clamp to the i16 range
// before the cast.
let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16;
|
||||||
|
writer.write_sample(s16).map_err(|e| format!("WAV write: {e}"))?;
|
||||||
|
}
|
||||||
|
writer.finalize().map_err(|e| format!("WAV finalize: {e}"))?;
|
||||||
|
|
||||||
|
eprintln!("[tts] wrote {output_path}");
|
||||||
|
Ok(serde_json::json!({ "output": output_path }))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Map voice name string to Kokoro Voice enum variant.
///
/// Matching is exact on the listed lowercase names. Any other name logs a
/// warning to stderr and yields `Voice::AfBella`, so this never fails.
/// Every variant is built with `1.0` — presumably a speed factor; TODO confirm
/// against the kokoro-tts `Voice` documentation.
|
||||||
|
fn select_voice(name: &str) -> Voice {
|
||||||
|
match name {
|
||||||
|
"af_bella" => Voice::AfBella(1.0),
|
||||||
|
"af_heart" => Voice::AfHeart(1.0),
|
||||||
|
"af_jessica" => Voice::AfJessica(1.0),
|
||||||
|
"af_nicole" => Voice::AfNicole(1.0),
|
||||||
|
"af_nova" => Voice::AfNova(1.0),
|
||||||
|
"af_sarah" => Voice::AfSarah(1.0),
|
||||||
|
"af_sky" => Voice::AfSky(1.0),
|
||||||
|
"af_river" => Voice::AfRiver(1.0),
|
||||||
|
"af_alloy" => Voice::AfAlloy(1.0),
|
||||||
|
"af_aoede" => Voice::AfAoede(1.0),
|
||||||
|
"af_kore" => Voice::AfKore(1.0),
|
||||||
|
"am_adam" => Voice::AmAdam(1.0),
|
||||||
|
"am_echo" => Voice::AmEcho(1.0),
|
||||||
|
"am_eric" => Voice::AmEric(1.0),
|
||||||
|
"am_fenrir" => Voice::AmFenrir(1.0),
|
||||||
|
"am_liam" => Voice::AmLiam(1.0),
|
||||||
|
"am_michael" => Voice::AmMichael(1.0),
|
||||||
|
"am_onyx" => Voice::AmOnyx(1.0),
|
||||||
|
"am_puck" => Voice::AmPuck(1.0),
|
||||||
|
"bf_alice" => Voice::BfAlice(1.0),
|
||||||
|
"bf_emma" => Voice::BfEmma(1.0),
|
||||||
|
"bf_isabella" => Voice::BfIsabella(1.0),
|
||||||
|
"bf_lily" => Voice::BfLily(1.0),
|
||||||
|
"bm_daniel" => Voice::BmDaniel(1.0),
|
||||||
|
"bm_fable" => Voice::BmFable(1.0),
|
||||||
|
"bm_george" => Voice::BmGeorge(1.0),
|
||||||
|
"bm_lewis" => Voice::BmLewis(1.0),
|
||||||
|
// Unrecognized name: warn and use the default voice instead of erroring.
_ => {
|
||||||
|
eprintln!("[tts] unknown voice '{name}', falling back to af_bella");
|
||||||
|
Voice::AfBella(1.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,18 +0,0 @@
|
|||||||
[package]
|
|
||||||
name = "tts-engine"
|
|
||||||
version = "0.1.0"
|
|
||||||
edition = "2021"
|
|
||||||
|
|
||||||
[[bin]]
|
|
||||||
name = "tts-engine"
|
|
||||||
path = "src/main.rs"
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
kokoro-tts = { version = "0.3", default-features = false }
|
|
||||||
# Pin to rc.11 matching kokoro-tts's expectation; enable vendored TLS to avoid system libssl-dev.
|
|
||||||
ort = { version = "=2.0.0-rc.11", default-features = false, features = [
|
|
||||||
"std", "download-binaries", "copy-dylibs", "ndarray",
|
|
||||||
"tls-native-vendored"
|
|
||||||
] }
|
|
||||||
tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
|
|
||||||
hound = "3.5"
|
|
||||||
@@ -1,149 +0,0 @@
|
|||||||
/// TTS engine CLI — synthesizes text to a WAV file using Kokoro neural TTS.
|
|
||||||
///
|
|
||||||
/// Usage:
|
|
||||||
/// echo "Hello world" | tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav
|
|
||||||
/// tts-engine --model kokoro-v1.0.onnx --voices voices.bin --output out.wav --text "Hello world"
|
|
||||||
///
|
|
||||||
/// Outputs 24kHz 16-bit mono WAV.
|
|
||||||
|
|
||||||
use kokoro_tts::{KokoroTts, Voice};
|
|
||||||
use std::io::{self, Read};
|
|
||||||
|
|
||||||
fn parse_args() -> Result<(String, String, String, String, Option<String>), String> {
|
|
||||||
let args: Vec<String> = std::env::args().collect();
|
|
||||||
let mut model = String::new();
|
|
||||||
let mut voices = String::new();
|
|
||||||
let mut output = String::new();
|
|
||||||
let mut text: Option<String> = None;
|
|
||||||
let mut voice_name: Option<String> = None;
|
|
||||||
|
|
||||||
let mut i = 1;
|
|
||||||
while i < args.len() {
|
|
||||||
match args[i].as_str() {
|
|
||||||
"--model" => { i += 1; model = args.get(i).cloned().unwrap_or_default(); }
|
|
||||||
"--voices" => { i += 1; voices = args.get(i).cloned().unwrap_or_default(); }
|
|
||||||
"--output" | "--output_file" => { i += 1; output = args.get(i).cloned().unwrap_or_default(); }
|
|
||||||
"--text" => { i += 1; text = args.get(i).cloned(); }
|
|
||||||
"--voice" => { i += 1; voice_name = args.get(i).cloned(); }
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
i += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if model.is_empty() { return Err("--model required".into()); }
|
|
||||||
if voices.is_empty() { return Err("--voices required".into()); }
|
|
||||||
if output.is_empty() { return Err("--output required".into()); }
|
|
||||||
|
|
||||||
let voice_str = voice_name.unwrap_or_else(|| "af_bella".into());
|
|
||||||
|
|
||||||
Ok((model, voices, output, voice_str, text))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn select_voice(name: &str) -> Voice {
|
|
||||||
match name {
|
|
||||||
"af_bella" => Voice::AfBella(1.0),
|
|
||||||
"af_heart" => Voice::AfHeart(1.0),
|
|
||||||
"af_jessica" => Voice::AfJessica(1.0),
|
|
||||||
"af_nicole" => Voice::AfNicole(1.0),
|
|
||||||
"af_nova" => Voice::AfNova(1.0),
|
|
||||||
"af_sarah" => Voice::AfSarah(1.0),
|
|
||||||
"af_sky" => Voice::AfSky(1.0),
|
|
||||||
"af_river" => Voice::AfRiver(1.0),
|
|
||||||
"af_alloy" => Voice::AfAlloy(1.0),
|
|
||||||
"af_aoede" => Voice::AfAoede(1.0),
|
|
||||||
"af_kore" => Voice::AfKore(1.0),
|
|
||||||
"am_adam" => Voice::AmAdam(1.0),
|
|
||||||
"am_echo" => Voice::AmEcho(1.0),
|
|
||||||
"am_eric" => Voice::AmEric(1.0),
|
|
||||||
"am_fenrir" => Voice::AmFenrir(1.0),
|
|
||||||
"am_liam" => Voice::AmLiam(1.0),
|
|
||||||
"am_michael" => Voice::AmMichael(1.0),
|
|
||||||
"am_onyx" => Voice::AmOnyx(1.0),
|
|
||||||
"am_puck" => Voice::AmPuck(1.0),
|
|
||||||
"bf_alice" => Voice::BfAlice(1.0),
|
|
||||||
"bf_emma" => Voice::BfEmma(1.0),
|
|
||||||
"bf_isabella" => Voice::BfIsabella(1.0),
|
|
||||||
"bf_lily" => Voice::BfLily(1.0),
|
|
||||||
"bm_daniel" => Voice::BmDaniel(1.0),
|
|
||||||
"bm_fable" => Voice::BmFable(1.0),
|
|
||||||
"bm_george" => Voice::BmGeorge(1.0),
|
|
||||||
"bm_lewis" => Voice::BmLewis(1.0),
|
|
||||||
_ => {
|
|
||||||
eprintln!("[tts-engine] unknown voice '{}', falling back to af_bella", name);
|
|
||||||
Voice::AfBella(1.0)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::main]
|
|
||||||
async fn main() {
|
|
||||||
let (model_path, voices_path, output_path, voice_name, text_arg) = match parse_args() {
|
|
||||||
Ok(v) => v,
|
|
||||||
Err(e) => {
|
|
||||||
eprintln!("Error: {}", e);
|
|
||||||
eprintln!("Usage: tts-engine --model <model.onnx> --voices <voices.bin> --output <output.wav> [--text <text>] [--voice <voice_name>]");
|
|
||||||
std::process::exit(1);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Get text from --text arg or stdin.
|
|
||||||
let text = match text_arg {
|
|
||||||
Some(t) => t,
|
|
||||||
None => {
|
|
||||||
let mut buf = String::new();
|
|
||||||
io::stdin().read_to_string(&mut buf).expect("failed to read stdin");
|
|
||||||
buf.trim().to_string()
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
if text.is_empty() {
|
|
||||||
eprintln!("[tts-engine] no text provided");
|
|
||||||
std::process::exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
eprintln!("[tts-engine] loading model: {}", model_path);
|
|
||||||
let tts = match KokoroTts::new(&model_path, &voices_path).await {
|
|
||||||
Ok(t) => t,
|
|
||||||
Err(e) => {
|
|
||||||
eprintln!("[tts-engine] failed to load model: {:?}", e);
|
|
||||||
std::process::exit(1);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let voice = select_voice(&voice_name);
|
|
||||||
eprintln!("[tts-engine] synthesizing with voice '{}': \"{}\"", voice_name, text);
|
|
||||||
|
|
||||||
let (samples, duration) = match tts.synth(&text, voice).await {
|
|
||||||
Ok(r) => r,
|
|
||||||
Err(e) => {
|
|
||||||
eprintln!("[tts-engine] synthesis failed: {:?}", e);
|
|
||||||
std::process::exit(1);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
eprintln!("[tts-engine] synthesized {} samples in {:?}", samples.len(), duration);
|
|
||||||
|
|
||||||
// Write WAV: 24kHz, 16-bit, mono (same format announcement.ts expects).
|
|
||||||
let spec = hound::WavSpec {
|
|
||||||
channels: 1,
|
|
||||||
sample_rate: 24000,
|
|
||||||
bits_per_sample: 16,
|
|
||||||
sample_format: hound::SampleFormat::Int,
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut writer = match hound::WavWriter::create(&output_path, spec) {
|
|
||||||
Ok(w) => w,
|
|
||||||
Err(e) => {
|
|
||||||
eprintln!("[tts-engine] failed to create WAV: {}", e);
|
|
||||||
std::process::exit(1);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
for &sample in &samples {
|
|
||||||
let s16 = (sample * 32767.0).round().clamp(-32768.0, 32767.0) as i16;
|
|
||||||
writer.write_sample(s16).unwrap();
|
|
||||||
}
|
|
||||||
writer.finalize().unwrap();
|
|
||||||
|
|
||||||
eprintln!("[tts-engine] wrote {}", output_path);
|
|
||||||
}
|
|
||||||
@@ -3,6 +3,6 @@
|
|||||||
*/
|
*/
|
||||||
export const commitinfo = {
|
export const commitinfo = {
|
||||||
name: 'siprouter',
|
name: 'siprouter',
|
||||||
version: '1.15.0',
|
version: '1.16.0',
|
||||||
description: 'undefined'
|
description: 'undefined'
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,59 +1,22 @@
|
|||||||
/**
|
/**
|
||||||
* TTS announcement module — pre-generates audio announcements using espeak-ng
|
* TTS announcement module — generates announcement WAV files at startup.
|
||||||
* and caches them as encoded RTP packets for playback during call setup.
|
|
||||||
*
|
*
|
||||||
* On startup, generates the announcement WAV via espeak-ng (formant-based TTS
|
* Engine priority: espeak-ng (formant TTS, fast) → Kokoro neural TTS via
|
||||||
* with highly accurate pronunciation), encodes each 20ms frame to G.722 (for
|
* proxy-engine → disabled.
|
||||||
* SIP) and Opus (for WebRTC) via the Rust transcoder, and caches the packets.
|
|
||||||
*
|
*
|
||||||
* Falls back to the Rust tts-engine (Kokoro neural TTS) if espeak-ng is not
|
* The generated WAV is left on disk for Rust's audio_player / start_interaction
|
||||||
* installed, and disables announcements if neither is available.
|
* to play during calls. No encoding or RTP playback happens in TypeScript.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { execSync } from 'node:child_process';
|
import { execSync } from 'node:child_process';
|
||||||
import fs from 'node:fs';
|
import fs from 'node:fs';
|
||||||
import path from 'node:path';
|
import path from 'node:path';
|
||||||
import { Buffer } from 'node:buffer';
|
import { sendProxyCommand, isProxyReady } from './proxybridge.ts';
|
||||||
import { encodePcm, isCodecReady } from './opusbridge.ts';
|
|
||||||
|
|
||||||
/** RTP clock increment per 20ms frame for each codec. */
|
|
||||||
function rtpClockIncrement(pt: number): number {
|
|
||||||
if (pt === 111) return 960;
|
|
||||||
if (pt === 9) return 160;
|
|
||||||
return 160;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Build a fresh RTP header. */
|
|
||||||
function buildRtpHeader(pt: number, seq: number, ts: number, ssrc: number, marker: boolean): Buffer {
|
|
||||||
const hdr = Buffer.alloc(12);
|
|
||||||
hdr[0] = 0x80;
|
|
||||||
hdr[1] = (marker ? 0x80 : 0) | (pt & 0x7f);
|
|
||||||
hdr.writeUInt16BE(seq & 0xffff, 2);
|
|
||||||
hdr.writeUInt32BE(ts >>> 0, 4);
|
|
||||||
hdr.writeUInt32BE(ssrc >>> 0, 8);
|
|
||||||
return hdr;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Types
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
/** A pre-encoded announcement ready for RTP playback. */
|
|
||||||
export interface IAnnouncementCache {
|
|
||||||
/** G.722 encoded frames (each is a 20ms frame payload, no RTP header). */
|
|
||||||
g722Frames: Buffer[];
|
|
||||||
/** Opus encoded frames for WebRTC playback. */
|
|
||||||
opusFrames: Buffer[];
|
|
||||||
/** Total duration in milliseconds. */
|
|
||||||
durationMs: number;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// State
|
// State
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
let cachedAnnouncement: IAnnouncementCache | null = null;
|
|
||||||
|
|
||||||
const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
|
const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
|
||||||
const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now.";
|
const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now.";
|
||||||
const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav');
|
const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav');
|
||||||
@@ -64,12 +27,10 @@ const KOKORO_VOICES = 'voices.bin';
|
|||||||
const KOKORO_VOICE = 'af_bella';
|
const KOKORO_VOICE = 'af_bella';
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Initialization
|
// TTS generators
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
/**
|
/** Check if espeak-ng is available on the system. */
|
||||||
* Check if espeak-ng is available on the system.
|
|
||||||
*/
|
|
||||||
function isEspeakAvailable(): boolean {
|
function isEspeakAvailable(): boolean {
|
||||||
try {
|
try {
|
||||||
execSync('which espeak-ng', { stdio: 'pipe' });
|
execSync('which espeak-ng', { stdio: 'pipe' });
|
||||||
@@ -79,10 +40,7 @@ function isEspeakAvailable(): boolean {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Generate announcement WAV via espeak-ng (primary engine). */
|
||||||
* Generate announcement WAV via espeak-ng (primary engine).
|
|
||||||
* Returns true on success.
|
|
||||||
*/
|
|
||||||
function generateViaEspeak(wavPath: string, text: string, log: (msg: string) => void): boolean {
|
function generateViaEspeak(wavPath: string, text: string, log: (msg: string) => void): boolean {
|
||||||
log('[tts] generating announcement audio via espeak-ng...');
|
log('[tts] generating announcement audio via espeak-ng...');
|
||||||
try {
|
try {
|
||||||
@@ -98,11 +56,8 @@ function generateViaEspeak(wavPath: string, text: string, log: (msg: string) =>
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Generate announcement WAV via Kokoro TTS (fallback, runs inside proxy-engine). */
|
||||||
* Generate announcement WAV via Kokoro TTS (fallback engine).
|
async function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): Promise<boolean> {
|
||||||
* Returns true on success.
|
|
||||||
*/
|
|
||||||
function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): boolean {
|
|
||||||
const modelPath = path.join(TTS_DIR, KOKORO_MODEL);
|
const modelPath = path.join(TTS_DIR, KOKORO_MODEL);
|
||||||
const voicesPath = path.join(TTS_DIR, KOKORO_VOICES);
|
const voicesPath = path.join(TTS_DIR, KOKORO_VOICES);
|
||||||
|
|
||||||
@@ -111,25 +66,21 @@ function generateViaKokoro(wavPath: string, text: string, log: (msg: string) =>
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const root = process.cwd();
|
if (!isProxyReady()) {
|
||||||
const ttsBinPaths = [
|
log('[tts] proxy-engine not ready — Kokoro fallback unavailable');
|
||||||
path.join(root, 'dist_rust', 'tts-engine'),
|
|
||||||
path.join(root, 'rust', 'target', 'release', 'tts-engine'),
|
|
||||||
path.join(root, 'rust', 'target', 'debug', 'tts-engine'),
|
|
||||||
];
|
|
||||||
const ttsBin = ttsBinPaths.find((p) => fs.existsSync(p));
|
|
||||||
if (!ttsBin) {
|
|
||||||
log('[tts] tts-engine binary not found — Kokoro fallback unavailable');
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
log('[tts] generating announcement audio via Kokoro TTS (fallback)...');
|
log('[tts] generating announcement audio via Kokoro TTS (fallback)...');
|
||||||
try {
|
try {
|
||||||
execSync(
|
await sendProxyCommand('generate_tts', {
|
||||||
`"${ttsBin}" --model "${modelPath}" --voices "${voicesPath}" --voice "${KOKORO_VOICE}" --output "${wavPath}" --text "${text}"`,
|
model: modelPath,
|
||||||
{ timeout: 120000, stdio: 'pipe' },
|
voices: voicesPath,
|
||||||
);
|
voice: KOKORO_VOICE,
|
||||||
log('[tts] Kokoro WAV generated');
|
text,
|
||||||
|
output: wavPath,
|
||||||
|
});
|
||||||
|
log('[tts] Kokoro WAV generated (via proxy-engine)');
|
||||||
return true;
|
return true;
|
||||||
} catch (e: any) {
|
} catch (e: any) {
|
||||||
log(`[tts] Kokoro failed: ${e.message}`);
|
log(`[tts] Kokoro failed: ${e.message}`);
|
||||||
@@ -137,40 +88,13 @@ function generateViaKokoro(wavPath: string, text: string, log: (msg: string) =>
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
// ---------------------------------------------------------------------------
|
||||||
* Read a WAV file and detect its sample rate from the fmt chunk.
|
// Initialization
|
||||||
* Returns { pcm, sampleRate } or null on failure.
|
// ---------------------------------------------------------------------------
|
||||||
*/
|
|
||||||
function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null {
|
|
||||||
const wav = fs.readFileSync(wavPath);
|
|
||||||
if (wav.length < 44) return null;
|
|
||||||
if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
|
|
||||||
if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;
|
|
||||||
|
|
||||||
let sampleRate = 22050; // default
|
|
||||||
let offset = 12;
|
|
||||||
let pcm: Buffer | null = null;
|
|
||||||
|
|
||||||
while (offset < wav.length - 8) {
|
|
||||||
const chunkId = wav.toString('ascii', offset, offset + 4);
|
|
||||||
const chunkSize = wav.readUInt32LE(offset + 4);
|
|
||||||
if (chunkId === 'fmt ') {
|
|
||||||
sampleRate = wav.readUInt32LE(offset + 12);
|
|
||||||
}
|
|
||||||
if (chunkId === 'data') {
|
|
||||||
pcm = wav.subarray(offset + 8, offset + 8 + chunkSize);
|
|
||||||
}
|
|
||||||
offset += 8 + chunkSize;
|
|
||||||
if (offset % 2 !== 0) offset++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!pcm) return null;
|
|
||||||
return { pcm, sampleRate };
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Pre-generate the announcement audio and encode to G.722 + Opus frames.
|
* Pre-generate the announcement WAV file.
|
||||||
* Must be called after the codec bridge is initialized.
|
* Must be called after the proxy engine is initialized.
|
||||||
*
|
*
|
||||||
* Engine priority: espeak-ng → Kokoro → disabled.
|
* Engine priority: espeak-ng → Kokoro → disabled.
|
||||||
*/
|
*/
|
||||||
@@ -178,7 +102,6 @@ export async function initAnnouncement(log: (msg: string) => void): Promise<bool
|
|||||||
fs.mkdirSync(TTS_DIR, { recursive: true });
|
fs.mkdirSync(TTS_DIR, { recursive: true });
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Generate WAV if not cached.
|
|
||||||
if (!fs.existsSync(CACHE_WAV)) {
|
if (!fs.existsSync(CACHE_WAV)) {
|
||||||
let generated = false;
|
let generated = false;
|
||||||
|
|
||||||
@@ -189,9 +112,9 @@ export async function initAnnouncement(log: (msg: string) => void): Promise<bool
|
|||||||
log('[tts] espeak-ng not installed — trying Kokoro fallback');
|
log('[tts] espeak-ng not installed — trying Kokoro fallback');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fall back to Kokoro.
|
// Fall back to Kokoro (via proxy-engine).
|
||||||
if (!generated) {
|
if (!generated) {
|
||||||
generated = generateViaKokoro(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
|
generated = await generateViaKokoro(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!generated) {
|
if (!generated) {
|
||||||
@@ -200,49 +123,7 @@ export async function initAnnouncement(log: (msg: string) => void): Promise<bool
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read WAV and extract raw PCM + sample rate.
|
log('[tts] announcement WAV ready');
|
||||||
const result = readWavWithRate(CACHE_WAV);
|
|
||||||
if (!result) {
|
|
||||||
log('[tts] failed to parse WAV file');
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
const { pcm, sampleRate } = result;
|
|
||||||
|
|
||||||
// Wait for codec bridge to be ready.
|
|
||||||
if (!isCodecReady()) {
|
|
||||||
log('[tts] codec bridge not ready — will retry');
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Encode in 20ms chunks. The Rust encoder resamples to each codec's native rate.
|
|
||||||
const FRAME_SAMPLES = Math.floor(sampleRate * 0.02);
|
|
||||||
const FRAME_BYTES = FRAME_SAMPLES * 2; // 16-bit = 2 bytes per sample
|
|
||||||
const totalFrames = Math.floor(pcm.length / FRAME_BYTES);
|
|
||||||
|
|
||||||
const g722Frames: Buffer[] = [];
|
|
||||||
const opusFrames: Buffer[] = [];
|
|
||||||
|
|
||||||
log(`[tts] encoding ${totalFrames} frames (${FRAME_SAMPLES} samples/frame @ ${sampleRate}Hz)...`);
|
|
||||||
for (let i = 0; i < totalFrames; i++) {
|
|
||||||
const framePcm = pcm.subarray(i * FRAME_BYTES, (i + 1) * FRAME_BYTES);
|
|
||||||
const pcmBuf = Buffer.from(framePcm);
|
|
||||||
const [g722, opus] = await Promise.all([
|
|
||||||
encodePcm(pcmBuf, sampleRate, 9), // G.722 for SIP devices
|
|
||||||
encodePcm(pcmBuf, sampleRate, 111), // Opus for WebRTC browsers
|
|
||||||
]);
|
|
||||||
if (g722) g722Frames.push(g722);
|
|
||||||
if (opus) opusFrames.push(opus);
|
|
||||||
if (!g722 && !opus && i < 3) log(`[tts] frame ${i} encode failed`);
|
|
||||||
}
|
|
||||||
|
|
||||||
cachedAnnouncement = {
|
|
||||||
g722Frames,
|
|
||||||
opusFrames,
|
|
||||||
durationMs: totalFrames * 20,
|
|
||||||
};
|
|
||||||
|
|
||||||
log(`[tts] announcement cached: ${g722Frames.length} frames (${(totalFrames * 20 / 1000).toFixed(1)}s)`);
|
|
||||||
return true;
|
return true;
|
||||||
} catch (e: any) {
|
} catch (e: any) {
|
||||||
log(`[tts] init error: ${e.message}`);
|
log(`[tts] init error: ${e.message}`);
|
||||||
@@ -250,100 +131,7 @@ export async function initAnnouncement(log: (msg: string) => void): Promise<bool
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
/** Get the path to the cached announcement WAV, or null if not generated. */
|
||||||
// Playback
|
export function getAnnouncementWavPath(): string | null {
|
||||||
// ---------------------------------------------------------------------------
|
return fs.existsSync(CACHE_WAV) ? CACHE_WAV : null;
|
||||||
|
|
||||||
/**
|
|
||||||
* Play the pre-cached announcement to an RTP endpoint.
|
|
||||||
*
|
|
||||||
* @param sendPacket - function to send a raw RTP packet
|
|
||||||
* @param ssrc - SSRC to use in RTP headers
|
|
||||||
* @param onDone - called when the announcement finishes
|
|
||||||
* @returns a cancel function, or null if no announcement is cached
|
|
||||||
*/
|
|
||||||
export function playAnnouncement(
|
|
||||||
sendPacket: (pkt: Buffer) => void,
|
|
||||||
ssrc: number,
|
|
||||||
onDone?: () => void,
|
|
||||||
): (() => void) | null {
|
|
||||||
if (!cachedAnnouncement || cachedAnnouncement.g722Frames.length === 0) {
|
|
||||||
onDone?.();
|
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const frames = cachedAnnouncement.g722Frames;
|
|
||||||
const PT = 9; // G.722
|
|
||||||
let frameIdx = 0;
|
|
||||||
let seq = Math.floor(Math.random() * 0xffff);
|
|
||||||
let rtpTs = Math.floor(Math.random() * 0xffffffff);
|
|
||||||
|
|
||||||
const timer = setInterval(() => {
|
|
||||||
if (frameIdx >= frames.length) {
|
|
||||||
clearInterval(timer);
|
|
||||||
onDone?.();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const payload = frames[frameIdx];
|
|
||||||
const hdr = buildRtpHeader(PT, seq & 0xffff, rtpTs >>> 0, ssrc >>> 0, frameIdx === 0);
|
|
||||||
const pkt = Buffer.concat([hdr, payload]);
|
|
||||||
sendPacket(pkt);
|
|
||||||
|
|
||||||
seq++;
|
|
||||||
rtpTs += rtpClockIncrement(PT);
|
|
||||||
frameIdx++;
|
|
||||||
}, 20);
|
|
||||||
|
|
||||||
// Return cancel function.
|
|
||||||
return () => clearInterval(timer);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Play pre-cached Opus announcement to a WebRTC PeerConnection sender.
|
|
||||||
*
|
|
||||||
* @param sendRtpPacket - function to send a raw RTP packet via sender.sendRtp()
|
|
||||||
* @param ssrc - SSRC to use in RTP headers
|
|
||||||
* @param onDone - called when announcement finishes
|
|
||||||
* @returns cancel function, or null if no announcement cached
|
|
||||||
*/
|
|
||||||
export function playAnnouncementToWebRtc(
|
|
||||||
sendRtpPacket: (pkt: Buffer) => void,
|
|
||||||
ssrc: number,
|
|
||||||
counters: { seq: number; ts: number },
|
|
||||||
onDone?: () => void,
|
|
||||||
): (() => void) | null {
|
|
||||||
if (!cachedAnnouncement || cachedAnnouncement.opusFrames.length === 0) {
|
|
||||||
onDone?.();
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
const frames = cachedAnnouncement.opusFrames;
|
|
||||||
const PT = 111; // Opus
|
|
||||||
let frameIdx = 0;
|
|
||||||
|
|
||||||
const timer = setInterval(() => {
|
|
||||||
if (frameIdx >= frames.length) {
|
|
||||||
clearInterval(timer);
|
|
||||||
onDone?.();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const payload = frames[frameIdx];
|
|
||||||
const hdr = buildRtpHeader(PT, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, frameIdx === 0);
|
|
||||||
const pkt = Buffer.concat([hdr, payload]);
|
|
||||||
sendRtpPacket(pkt);
|
|
||||||
|
|
||||||
counters.seq++;
|
|
||||||
counters.ts += 960; // Opus at 48kHz: 960 samples per 20ms
|
|
||||||
frameIdx++;
|
|
||||||
}, 20);
|
|
||||||
|
|
||||||
return () => clearInterval(timer);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Check if an announcement is cached and ready. */
|
|
||||||
export function isAnnouncementReady(): boolean {
|
|
||||||
return cachedAnnouncement !== null && cachedAnnouncement.g722Frames.length > 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,55 +1,31 @@
|
|||||||
/**
|
/**
|
||||||
* PromptCache — manages multiple named audio prompts for IVR and voicemail.
|
* PromptCache — manages named audio prompt WAV files for IVR and voicemail.
|
||||||
*
|
*
|
||||||
* Each prompt is pre-encoded as both G.722 frames (for SIP legs) and Opus
|
* Generates WAV files via espeak-ng (primary) or Kokoro TTS through the
|
||||||
* frames (for WebRTC legs), ready for 20ms RTP playback.
|
* proxy-engine (fallback). Also supports loading pre-existing WAV files
|
||||||
|
* and programmatic tone generation.
|
||||||
*
|
*
|
||||||
* Supports three sources:
|
* All audio playback happens in Rust (audio_player / start_interaction).
|
||||||
* 1. TTS generation via espeak-ng (primary) or Kokoro (fallback)
|
* This module only manages WAV files on disk.
|
||||||
* 2. Loading from a pre-existing WAV file
|
|
||||||
* 3. Programmatic tone generation (beep, etc.)
|
|
||||||
*
|
|
||||||
* The existing announcement.ts system continues to work independently;
|
|
||||||
* this module provides generalized prompt management for IVR/voicemail.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { execSync } from 'node:child_process';
|
import { execSync } from 'node:child_process';
|
||||||
import fs from 'node:fs';
|
import fs from 'node:fs';
|
||||||
import path from 'node:path';
|
import path from 'node:path';
|
||||||
import { Buffer } from 'node:buffer';
|
import { Buffer } from 'node:buffer';
|
||||||
import { encodePcm, isCodecReady } from '../opusbridge.ts';
|
import { sendProxyCommand, isProxyReady } from '../proxybridge.ts';
|
||||||
|
|
||||||
/** RTP clock increment per 20ms frame for each codec. */
|
|
||||||
function rtpClockIncrement(pt: number): number {
|
|
||||||
if (pt === 111) return 960;
|
|
||||||
if (pt === 9) return 160;
|
|
||||||
return 160;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Build a fresh RTP header. */
|
|
||||||
function buildRtpHeader(pt: number, seq: number, ts: number, ssrc: number, marker: boolean): Buffer {
|
|
||||||
const hdr = Buffer.alloc(12);
|
|
||||||
hdr[0] = 0x80;
|
|
||||||
hdr[1] = (marker ? 0x80 : 0) | (pt & 0x7f);
|
|
||||||
hdr.writeUInt16BE(seq & 0xffff, 2);
|
|
||||||
hdr.writeUInt32BE(ts >>> 0, 4);
|
|
||||||
hdr.writeUInt32BE(ssrc >>> 0, 8);
|
|
||||||
return hdr;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Types
|
// Types
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
/** A pre-encoded prompt ready for RTP playback. */
|
/** A cached prompt — just a WAV file path and metadata. */
|
||||||
export interface ICachedPrompt {
|
export interface ICachedPrompt {
|
||||||
/** Unique prompt identifier. */
|
/** Unique prompt identifier. */
|
||||||
id: string;
|
id: string;
|
||||||
/** G.722 encoded frames (20ms each, no RTP header). */
|
/** Path to the WAV file on disk. */
|
||||||
g722Frames: Buffer[];
|
wavPath: string;
|
||||||
/** Opus encoded frames (20ms each, no RTP header). */
|
/** Total duration in milliseconds (approximate, from WAV header). */
|
||||||
opusFrames: Buffer[];
|
|
||||||
/** Total duration in milliseconds. */
|
|
||||||
durationMs: number;
|
durationMs: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -82,84 +58,61 @@ function generateViaEspeak(wavPath: string, text: string): boolean {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Generate WAV via Kokoro TTS. */
|
/** Generate WAV via Kokoro TTS (runs inside proxy-engine). */
|
||||||
function generateViaKokoro(wavPath: string, text: string, voice: string): boolean {
|
async function generateViaKokoro(wavPath: string, text: string, voice: string): Promise<boolean> {
|
||||||
const modelPath = path.join(TTS_DIR, 'kokoro-v1.0.onnx');
|
const modelPath = path.join(TTS_DIR, 'kokoro-v1.0.onnx');
|
||||||
const voicesPath = path.join(TTS_DIR, 'voices.bin');
|
const voicesPath = path.join(TTS_DIR, 'voices.bin');
|
||||||
if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) return false;
|
if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) return false;
|
||||||
|
if (!isProxyReady()) return false;
|
||||||
const root = process.cwd();
|
|
||||||
const ttsBin = [
|
|
||||||
path.join(root, 'dist_rust', 'tts-engine'),
|
|
||||||
path.join(root, 'rust', 'target', 'release', 'tts-engine'),
|
|
||||||
path.join(root, 'rust', 'target', 'debug', 'tts-engine'),
|
|
||||||
].find((p) => fs.existsSync(p));
|
|
||||||
if (!ttsBin) return false;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
execSync(
|
await sendProxyCommand('generate_tts', {
|
||||||
`"${ttsBin}" --model "${modelPath}" --voices "${voicesPath}" --voice "${voice}" --output "${wavPath}" --text "${text}"`,
|
model: modelPath,
|
||||||
{ timeout: 120000, stdio: 'pipe' },
|
voices: voicesPath,
|
||||||
);
|
voice,
|
||||||
|
text,
|
||||||
|
output: wavPath,
|
||||||
|
});
|
||||||
return true;
|
return true;
|
||||||
} catch {
|
} catch {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Read a WAV file and return raw PCM + sample rate. */
|
/** Read a WAV file's duration from its header. */
|
||||||
function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null {
|
function getWavDurationMs(wavPath: string): number {
|
||||||
|
try {
|
||||||
const wav = fs.readFileSync(wavPath);
|
const wav = fs.readFileSync(wavPath);
|
||||||
if (wav.length < 44) return null;
|
if (wav.length < 44) return 0;
|
||||||
if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
|
if (wav.toString('ascii', 0, 4) !== 'RIFF') return 0;
|
||||||
if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;
|
|
||||||
|
|
||||||
let sampleRate = 22050;
|
let sampleRate = 16000;
|
||||||
let pcm: Buffer | null = null;
|
let dataSize = 0;
|
||||||
|
let bitsPerSample = 16;
|
||||||
|
let channels = 1;
|
||||||
let offset = 12;
|
let offset = 12;
|
||||||
|
|
||||||
while (offset < wav.length - 8) {
|
while (offset < wav.length - 8) {
|
||||||
const chunkId = wav.toString('ascii', offset, offset + 4);
|
const chunkId = wav.toString('ascii', offset, offset + 4);
|
||||||
const chunkSize = wav.readUInt32LE(offset + 4);
|
const chunkSize = wav.readUInt32LE(offset + 4);
|
||||||
if (chunkId === 'fmt ') {
|
if (chunkId === 'fmt ') {
|
||||||
|
channels = wav.readUInt16LE(offset + 10);
|
||||||
sampleRate = wav.readUInt32LE(offset + 12);
|
sampleRate = wav.readUInt32LE(offset + 12);
|
||||||
|
bitsPerSample = wav.readUInt16LE(offset + 22);
|
||||||
}
|
}
|
||||||
if (chunkId === 'data') {
|
if (chunkId === 'data') {
|
||||||
pcm = wav.subarray(offset + 8, offset + 8 + chunkSize);
|
dataSize = chunkSize;
|
||||||
}
|
}
|
||||||
offset += 8 + chunkSize;
|
offset += 8 + chunkSize;
|
||||||
if (offset % 2 !== 0) offset++;
|
if (offset % 2 !== 0) offset++;
|
||||||
}
|
}
|
||||||
|
|
||||||
return pcm ? { pcm, sampleRate } : null;
|
const bytesPerSample = (bitsPerSample / 8) * channels;
|
||||||
|
const totalSamples = bytesPerSample > 0 ? dataSize / bytesPerSample : 0;
|
||||||
|
return sampleRate > 0 ? Math.round((totalSamples / sampleRate) * 1000) : 0;
|
||||||
|
} catch {
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Encode raw PCM frames to G.722 + Opus. */
|
|
||||||
async function encodePcmFrames(
|
|
||||||
pcm: Buffer,
|
|
||||||
sampleRate: number,
|
|
||||||
log: (msg: string) => void,
|
|
||||||
): Promise<{ g722Frames: Buffer[]; opusFrames: Buffer[] } | null> {
|
|
||||||
if (!isCodecReady()) return null;
|
|
||||||
|
|
||||||
const frameSamples = Math.floor(sampleRate * 0.02); // 20ms
|
|
||||||
const frameBytes = frameSamples * 2; // 16-bit
|
|
||||||
const totalFrames = Math.floor(pcm.length / frameBytes);
|
|
||||||
|
|
||||||
const g722Frames: Buffer[] = [];
|
|
||||||
const opusFrames: Buffer[] = [];
|
|
||||||
|
|
||||||
for (let i = 0; i < totalFrames; i++) {
|
|
||||||
const framePcm = Buffer.from(pcm.subarray(i * frameBytes, (i + 1) * frameBytes));
|
|
||||||
const [g722, opus] = await Promise.all([
|
|
||||||
encodePcm(framePcm, sampleRate, 9), // G.722
|
|
||||||
encodePcm(framePcm, sampleRate, 111), // Opus
|
|
||||||
]);
|
|
||||||
if (g722) g722Frames.push(g722);
|
|
||||||
if (opus) opusFrames.push(opus);
|
|
||||||
}
|
|
||||||
|
|
||||||
return { g722Frames, opusFrames };
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
@@ -195,7 +148,7 @@ export class PromptCache {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate a TTS prompt and cache it.
|
* Generate a TTS prompt WAV and cache its path.
|
||||||
* Uses espeak-ng (primary) or Kokoro (fallback).
|
* Uses espeak-ng (primary) or Kokoro (fallback).
|
||||||
*/
|
*/
|
||||||
async generatePrompt(id: string, text: string, voice = 'af_bella'): Promise<ICachedPrompt | null> {
|
async generatePrompt(id: string, text: string, voice = 'af_bella'): Promise<ICachedPrompt | null> {
|
||||||
@@ -207,14 +160,14 @@ export class PromptCache {
|
|||||||
this.espeakAvailable = isEspeakAvailable();
|
this.espeakAvailable = isEspeakAvailable();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Generate WAV.
|
// Generate WAV if not already on disk.
|
||||||
let generated = false;
|
|
||||||
if (!fs.existsSync(wavPath)) {
|
if (!fs.existsSync(wavPath)) {
|
||||||
|
let generated = false;
|
||||||
if (this.espeakAvailable) {
|
if (this.espeakAvailable) {
|
||||||
generated = generateViaEspeak(wavPath, text);
|
generated = generateViaEspeak(wavPath, text);
|
||||||
}
|
}
|
||||||
if (!generated) {
|
if (!generated) {
|
||||||
generated = generateViaKokoro(wavPath, text, voice);
|
generated = await generateViaKokoro(wavPath, text, voice);
|
||||||
}
|
}
|
||||||
if (!generated) {
|
if (!generated) {
|
||||||
this.log(`[prompt-cache] failed to generate TTS for "${id}"`);
|
this.log(`[prompt-cache] failed to generate TTS for "${id}"`);
|
||||||
@@ -223,49 +176,22 @@ export class PromptCache {
|
|||||||
this.log(`[prompt-cache] generated WAV for "${id}"`);
|
this.log(`[prompt-cache] generated WAV for "${id}"`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return this.loadWavPrompt(id, wavPath);
|
return this.registerWav(id, wavPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Load a WAV file as a prompt and cache it.
|
* Load a pre-existing WAV file as a prompt.
|
||||||
*/
|
*/
|
||||||
async loadWavPrompt(id: string, wavPath: string): Promise<ICachedPrompt | null> {
|
async loadWavPrompt(id: string, wavPath: string): Promise<ICachedPrompt | null> {
|
||||||
if (!fs.existsSync(wavPath)) {
|
if (!fs.existsSync(wavPath)) {
|
||||||
this.log(`[prompt-cache] WAV not found: ${wavPath}`);
|
this.log(`[prompt-cache] WAV not found: ${wavPath}`);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
return this.registerWav(id, wavPath);
|
||||||
const result = readWavWithRate(wavPath);
|
|
||||||
if (!result) {
|
|
||||||
this.log(`[prompt-cache] failed to parse WAV: ${wavPath}`);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
const encoded = await encodePcmFrames(result.pcm, result.sampleRate, this.log);
|
|
||||||
if (!encoded) {
|
|
||||||
this.log(`[prompt-cache] encoding failed for "${id}" (codec bridge not ready?)`);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
const durationMs = encoded.g722Frames.length * 20;
|
|
||||||
const prompt: ICachedPrompt = {
|
|
||||||
id,
|
|
||||||
g722Frames: encoded.g722Frames,
|
|
||||||
opusFrames: encoded.opusFrames,
|
|
||||||
durationMs,
|
|
||||||
};
|
|
||||||
|
|
||||||
this.prompts.set(id, prompt);
|
|
||||||
this.log(`[prompt-cache] cached "${id}": ${encoded.g722Frames.length} frames (${(durationMs / 1000).toFixed(1)}s)`);
|
|
||||||
return prompt;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate a beep tone prompt (sine wave).
|
* Generate a beep tone WAV and cache it.
|
||||||
* @param id - prompt ID
|
|
||||||
* @param freqHz - tone frequency (default 1000 Hz)
|
|
||||||
* @param durationMs - tone duration (default 500ms)
|
|
||||||
* @param amplitude - 16-bit amplitude (default 8000)
|
|
||||||
*/
|
*/
|
||||||
async generateBeep(
|
async generateBeep(
|
||||||
id: string,
|
id: string,
|
||||||
@@ -273,14 +199,17 @@ export class PromptCache {
|
|||||||
durationMs = 500,
|
durationMs = 500,
|
||||||
amplitude = 8000,
|
amplitude = 8000,
|
||||||
): Promise<ICachedPrompt | null> {
|
): Promise<ICachedPrompt | null> {
|
||||||
// Generate at 16kHz for decent quality.
|
fs.mkdirSync(TTS_DIR, { recursive: true });
|
||||||
|
const wavPath = path.join(TTS_DIR, `prompt-${id}.wav`);
|
||||||
|
|
||||||
|
if (!fs.existsSync(wavPath)) {
|
||||||
|
// Generate 16kHz 16-bit mono sine wave WAV.
|
||||||
const sampleRate = 16000;
|
const sampleRate = 16000;
|
||||||
const totalSamples = Math.floor((sampleRate * durationMs) / 1000);
|
const totalSamples = Math.floor((sampleRate * durationMs) / 1000);
|
||||||
const pcm = Buffer.alloc(totalSamples * 2);
|
const pcm = Buffer.alloc(totalSamples * 2);
|
||||||
|
|
||||||
for (let i = 0; i < totalSamples; i++) {
|
for (let i = 0; i < totalSamples; i++) {
|
||||||
const t = i / sampleRate;
|
const t = i / sampleRate;
|
||||||
// Apply a short fade-in/fade-out to avoid click artifacts.
|
|
||||||
const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade
|
const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade
|
||||||
let envelope = 1.0;
|
let envelope = 1.0;
|
||||||
if (i < fadeLen) envelope = i / fadeLen;
|
if (i < fadeLen) envelope = i / fadeLen;
|
||||||
@@ -290,132 +219,57 @@ export class PromptCache {
|
|||||||
pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2);
|
pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
const encoded = await encodePcmFrames(pcm, sampleRate, this.log);
|
// Write WAV file.
|
||||||
if (!encoded) {
|
const headerSize = 44;
|
||||||
this.log(`[prompt-cache] beep encoding failed for "${id}"`);
|
const dataSize = pcm.length;
|
||||||
return null;
|
const wav = Buffer.alloc(headerSize + dataSize);
|
||||||
|
|
||||||
|
// RIFF header
|
||||||
|
wav.write('RIFF', 0);
|
||||||
|
wav.writeUInt32LE(36 + dataSize, 4);
|
||||||
|
wav.write('WAVE', 8);
|
||||||
|
|
||||||
|
// fmt chunk
|
||||||
|
wav.write('fmt ', 12);
|
||||||
|
wav.writeUInt32LE(16, 16); // chunk size
|
||||||
|
wav.writeUInt16LE(1, 20); // PCM format
|
||||||
|
wav.writeUInt16LE(1, 22); // mono
|
||||||
|
wav.writeUInt32LE(sampleRate, 24);
|
||||||
|
wav.writeUInt32LE(sampleRate * 2, 28); // byte rate
|
||||||
|
wav.writeUInt16LE(2, 32); // block align
|
||||||
|
wav.writeUInt16LE(16, 34); // bits per sample
|
||||||
|
|
||||||
|
// data chunk
|
||||||
|
wav.write('data', 36);
|
||||||
|
wav.writeUInt32LE(dataSize, 40);
|
||||||
|
pcm.copy(wav, 44);
|
||||||
|
|
||||||
|
fs.writeFileSync(wavPath, wav);
|
||||||
|
this.log(`[prompt-cache] beep WAV generated for "${id}"`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const actualDuration = encoded.g722Frames.length * 20;
|
return this.registerWav(id, wavPath);
|
||||||
const prompt: ICachedPrompt = {
|
|
||||||
id,
|
|
||||||
g722Frames: encoded.g722Frames,
|
|
||||||
opusFrames: encoded.opusFrames,
|
|
||||||
durationMs: actualDuration,
|
|
||||||
};
|
|
||||||
|
|
||||||
this.prompts.set(id, prompt);
|
|
||||||
this.log(`[prompt-cache] beep "${id}" cached: ${actualDuration}ms @ ${freqHz}Hz`);
|
|
||||||
return prompt;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Remove a prompt from the cache. */
|
||||||
* Remove a prompt from the cache.
|
|
||||||
*/
|
|
||||||
remove(id: string): void {
|
remove(id: string): void {
|
||||||
this.prompts.delete(id);
|
this.prompts.delete(id);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** Clear all cached prompts. */
|
||||||
* Clear all cached prompts.
|
|
||||||
*/
|
|
||||||
clear(): void {
|
clear(): void {
|
||||||
this.prompts.clear();
|
this.prompts.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------
|
||||||
|
// Internal
|
||||||
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
private registerWav(id: string, wavPath: string): ICachedPrompt {
|
||||||
|
const durationMs = getWavDurationMs(wavPath);
|
||||||
|
const prompt: ICachedPrompt = { id, wavPath, durationMs };
|
||||||
|
this.prompts.set(id, prompt);
|
||||||
|
this.log(`[prompt-cache] cached "${id}": ${wavPath} (${(durationMs / 1000).toFixed(1)}s)`);
|
||||||
|
return prompt;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Standalone playback helpers (for use by SystemLeg)
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Play a cached prompt's G.722 frames as RTP packets at 20ms intervals.
|
|
||||||
*
|
|
||||||
* @param prompt - the cached prompt to play
|
|
||||||
* @param sendPacket - function to send a raw RTP packet (12-byte header + payload)
|
|
||||||
* @param ssrc - SSRC for RTP headers
|
|
||||||
* @param onDone - called when playback finishes
|
|
||||||
* @returns cancel function, or null if prompt has no G.722 frames
|
|
||||||
*/
|
|
||||||
export function playPromptG722(
|
|
||||||
prompt: ICachedPrompt,
|
|
||||||
sendPacket: (pkt: Buffer) => void,
|
|
||||||
ssrc: number,
|
|
||||||
onDone?: () => void,
|
|
||||||
): (() => void) | null {
|
|
||||||
if (prompt.g722Frames.length === 0) {
|
|
||||||
onDone?.();
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
const frames = prompt.g722Frames;
|
|
||||||
const PT = 9;
|
|
||||||
let frameIdx = 0;
|
|
||||||
let seq = Math.floor(Math.random() * 0xffff);
|
|
||||||
let rtpTs = Math.floor(Math.random() * 0xffffffff);
|
|
||||||
|
|
||||||
const timer = setInterval(() => {
|
|
||||||
if (frameIdx >= frames.length) {
|
|
||||||
clearInterval(timer);
|
|
||||||
onDone?.();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const payload = frames[frameIdx];
|
|
||||||
const hdr = buildRtpHeader(PT, seq & 0xffff, rtpTs >>> 0, ssrc >>> 0, frameIdx === 0);
|
|
||||||
const pkt = Buffer.concat([hdr, payload]);
|
|
||||||
sendPacket(pkt);
|
|
||||||
|
|
||||||
seq++;
|
|
||||||
rtpTs += rtpClockIncrement(PT);
|
|
||||||
frameIdx++;
|
|
||||||
}, 20);
|
|
||||||
|
|
||||||
return () => clearInterval(timer);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Play a cached prompt's Opus frames as RTP packets at 20ms intervals.
|
|
||||||
*
|
|
||||||
* @param prompt - the cached prompt to play
|
|
||||||
* @param sendPacket - function to send a raw RTP packet
|
|
||||||
* @param ssrc - SSRC for RTP headers
|
|
||||||
* @param counters - shared seq/ts counters (mutated in place for seamless transitions)
|
|
||||||
* @param onDone - called when playback finishes
|
|
||||||
* @returns cancel function, or null if prompt has no Opus frames
|
|
||||||
*/
|
|
||||||
export function playPromptOpus(
|
|
||||||
prompt: ICachedPrompt,
|
|
||||||
sendPacket: (pkt: Buffer) => void,
|
|
||||||
ssrc: number,
|
|
||||||
counters: { seq: number; ts: number },
|
|
||||||
onDone?: () => void,
|
|
||||||
): (() => void) | null {
|
|
||||||
if (prompt.opusFrames.length === 0) {
|
|
||||||
onDone?.();
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
const frames = prompt.opusFrames;
|
|
||||||
const PT = 111;
|
|
||||||
let frameIdx = 0;
|
|
||||||
|
|
||||||
const timer = setInterval(() => {
|
|
||||||
if (frameIdx >= frames.length) {
|
|
||||||
clearInterval(timer);
|
|
||||||
onDone?.();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const payload = frames[frameIdx];
|
|
||||||
const hdr = buildRtpHeader(PT, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, frameIdx === 0);
|
|
||||||
const pkt = Buffer.concat([hdr, payload]);
|
|
||||||
sendPacket(pkt);
|
|
||||||
|
|
||||||
counters.seq++;
|
|
||||||
counters.ts += 960; // Opus 48kHz: 960 samples per 20ms
|
|
||||||
frameIdx++;
|
|
||||||
}, 20);
|
|
||||||
|
|
||||||
return () => clearInterval(timer);
|
|
||||||
}
|
}
|
||||||
|
|||||||
199
ts/opusbridge.ts
199
ts/opusbridge.ts
@@ -1,199 +0,0 @@
|
|||||||
/**
|
|
||||||
* Audio transcoding bridge — uses smartrust to communicate with the Rust
|
|
||||||
* opus-codec binary, which handles Opus ↔ G.722 ↔ PCMU/PCMA transcoding.
|
|
||||||
*
|
|
||||||
* All codec conversion happens in Rust (libopus + SpanDSP G.722 port).
|
|
||||||
* The TypeScript side just passes raw payloads back and forth.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import path from 'node:path';
|
|
||||||
import { RustBridge } from '@push.rocks/smartrust';
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Command type map for smartrust
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
type TCodecCommands = {
|
|
||||||
init: {
|
|
||||||
params: Record<string, never>;
|
|
||||||
result: Record<string, never>;
|
|
||||||
};
|
|
||||||
create_session: {
|
|
||||||
params: { session_id: string };
|
|
||||||
result: Record<string, never>;
|
|
||||||
};
|
|
||||||
destroy_session: {
|
|
||||||
params: { session_id: string };
|
|
||||||
result: Record<string, never>;
|
|
||||||
};
|
|
||||||
transcode: {
|
|
||||||
params: { data_b64: string; from_pt: number; to_pt: number; session_id?: string; direction?: string };
|
|
||||||
result: { data_b64: string };
|
|
||||||
};
|
|
||||||
encode_pcm: {
|
|
||||||
params: { data_b64: string; sample_rate: number; to_pt: number; session_id?: string };
|
|
||||||
result: { data_b64: string };
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Bridge singleton
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
let bridge: RustBridge<TCodecCommands> | null = null;
|
|
||||||
let initialized = false;
|
|
||||||
|
|
||||||
function buildLocalPaths(): string[] {
|
|
||||||
const root = process.cwd();
|
|
||||||
return [
|
|
||||||
path.join(root, 'dist_rust', 'opus-codec'),
|
|
||||||
path.join(root, 'rust', 'target', 'release', 'opus-codec'),
|
|
||||||
path.join(root, 'rust', 'target', 'debug', 'opus-codec'),
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
let logFn: ((msg: string) => void) | undefined;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Initialize the audio transcoding bridge. Spawns the Rust binary.
|
|
||||||
*/
|
|
||||||
export async function initCodecBridge(log?: (msg: string) => void): Promise<boolean> {
|
|
||||||
if (initialized && bridge) return true;
|
|
||||||
logFn = log;
|
|
||||||
|
|
||||||
try {
|
|
||||||
bridge = new RustBridge<TCodecCommands>({
|
|
||||||
binaryName: 'opus-codec',
|
|
||||||
localPaths: buildLocalPaths(),
|
|
||||||
});
|
|
||||||
|
|
||||||
const spawned = await bridge.spawn();
|
|
||||||
if (!spawned) {
|
|
||||||
log?.('[codec] failed to spawn opus-codec binary');
|
|
||||||
bridge = null;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Auto-restart: reset state when the Rust process exits so the next
|
|
||||||
// transcode attempt triggers re-initialization instead of silent failure.
|
|
||||||
bridge.on('exit', () => {
|
|
||||||
logFn?.('[codec] Rust audio transcoder process exited — will re-init on next use');
|
|
||||||
bridge = null;
|
|
||||||
initialized = false;
|
|
||||||
});
|
|
||||||
|
|
||||||
await bridge.sendCommand('init', {} as any);
|
|
||||||
initialized = true;
|
|
||||||
log?.('[codec] Rust audio transcoder initialized (Opus + G.722 + PCMU/PCMA)');
|
|
||||||
return true;
|
|
||||||
} catch (e: any) {
|
|
||||||
log?.(`[codec] init error: ${e.message}`);
|
|
||||||
bridge = null;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Session management — per-call codec isolation
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Create an isolated codec session. Each session gets its own Opus/G.722
|
|
||||||
* encoder/decoder state, preventing concurrent calls from corrupting each
|
|
||||||
* other's stateful codec predictions.
|
|
||||||
*/
|
|
||||||
export async function createSession(sessionId: string): Promise<boolean> {
|
|
||||||
if (!bridge || !initialized) {
|
|
||||||
// Attempt auto-reinit if bridge died.
|
|
||||||
const ok = await initCodecBridge(logFn);
|
|
||||||
if (!ok) return false;
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
await bridge!.sendCommand('create_session', { session_id: sessionId });
|
|
||||||
return true;
|
|
||||||
} catch (e: any) {
|
|
||||||
logFn?.(`[codec] create_session error: ${e?.message || e}`);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Destroy a codec session, freeing its encoder/decoder state.
|
|
||||||
*/
|
|
||||||
export async function destroySession(sessionId: string): Promise<void> {
|
|
||||||
if (!bridge || !initialized) return;
|
|
||||||
try {
|
|
||||||
await bridge.sendCommand('destroy_session', { session_id: sessionId });
|
|
||||||
} catch {
|
|
||||||
// Best-effort cleanup.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Transcoding
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Transcode an RTP payload between two codecs.
|
|
||||||
* All codec work (Opus, G.722, PCMU, PCMA) + resampling happens in Rust.
|
|
||||||
*
|
|
||||||
* @param data - raw RTP payload (no header)
|
|
||||||
* @param fromPT - source payload type (0=PCMU, 8=PCMA, 9=G.722, 111=Opus)
|
|
||||||
* @param toPT - target payload type
|
|
||||||
* @param sessionId - optional session for isolated codec state
|
|
||||||
* @returns transcoded payload, or null on failure
|
|
||||||
*/
|
|
||||||
export async function transcode(data: Buffer, fromPT: number, toPT: number, sessionId?: string, direction?: string): Promise<Buffer | null> {
|
|
||||||
if (!bridge || !initialized) return null;
|
|
||||||
try {
|
|
||||||
const params: any = {
|
|
||||||
data_b64: data.toString('base64'),
|
|
||||||
from_pt: fromPT,
|
|
||||||
to_pt: toPT,
|
|
||||||
};
|
|
||||||
if (sessionId) params.session_id = sessionId;
|
|
||||||
if (direction) params.direction = direction;
|
|
||||||
const result = await bridge.sendCommand('transcode', params);
|
|
||||||
return Buffer.from(result.data_b64, 'base64');
|
|
||||||
} catch {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Encode raw 16-bit PCM to a target codec.
|
|
||||||
* @param pcmData - raw 16-bit LE PCM bytes
|
|
||||||
* @param sampleRate - input sample rate (e.g. 22050 for Piper TTS)
|
|
||||||
* @param toPT - target payload type (9=G.722, 111=Opus, 0=PCMU, 8=PCMA)
|
|
||||||
* @param sessionId - optional session for isolated codec state
|
|
||||||
*/
|
|
||||||
export async function encodePcm(pcmData: Buffer, sampleRate: number, toPT: number, sessionId?: string): Promise<Buffer | null> {
|
|
||||||
if (!bridge || !initialized) return null;
|
|
||||||
try {
|
|
||||||
const params: any = {
|
|
||||||
data_b64: pcmData.toString('base64'),
|
|
||||||
sample_rate: sampleRate,
|
|
||||||
to_pt: toPT,
|
|
||||||
};
|
|
||||||
if (sessionId) params.session_id = sessionId;
|
|
||||||
const result = await bridge.sendCommand('encode_pcm', params);
|
|
||||||
return Buffer.from(result.data_b64, 'base64');
|
|
||||||
} catch (e: any) {
|
|
||||||
console.error('[encodePcm] error:', e?.message || e);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Check if the codec bridge is ready. */
|
|
||||||
export function isCodecReady(): boolean {
|
|
||||||
return initialized && bridge !== null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Shut down the codec bridge. */
|
|
||||||
export function shutdownCodecBridge(): void {
|
|
||||||
if (bridge) {
|
|
||||||
try { bridge.kill(); } catch { /* ignore */ }
|
|
||||||
bridge = null;
|
|
||||||
initialized = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -79,6 +79,10 @@ type TProxyCommands = {
|
|||||||
params: { call_id: string; leg_id: string; key: string; value: unknown };
|
params: { call_id: string; leg_id: string; key: string; value: unknown };
|
||||||
result: Record<string, never>;
|
result: Record<string, never>;
|
||||||
};
|
};
|
||||||
|
generate_tts: {
|
||||||
|
params: { model: string; voices: string; voice: string; text: string; output: string };
|
||||||
|
result: { output: string };
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
@@ -493,6 +497,15 @@ export function isProxyReady(): boolean {
|
|||||||
return initialized && bridge !== null;
|
return initialized && bridge !== null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Send an arbitrary command to the proxy engine bridge. */
|
||||||
|
export async function sendProxyCommand<K extends keyof TProxyCommands>(
|
||||||
|
method: K,
|
||||||
|
params: TProxyCommands[K]['params'],
|
||||||
|
): Promise<TProxyCommands[K]['result']> {
|
||||||
|
if (!bridge || !initialized) throw new Error('proxy engine not initialized');
|
||||||
|
return bridge.sendCommand(method as string, params as any) as any;
|
||||||
|
}
|
||||||
|
|
||||||
/** Shut down the proxy engine. */
|
/** Shut down the proxy engine. */
|
||||||
export function shutdownProxyEngine(): void {
|
export function shutdownProxyEngine(): void {
|
||||||
if (bridge) {
|
if (bridge) {
|
||||||
|
|||||||
@@ -24,7 +24,6 @@ import {
|
|||||||
getAllBrowserDeviceIds,
|
getAllBrowserDeviceIds,
|
||||||
getBrowserDeviceWs,
|
getBrowserDeviceWs,
|
||||||
} from './webrtcbridge.ts';
|
} from './webrtcbridge.ts';
|
||||||
import { initCodecBridge } from './opusbridge.ts';
|
|
||||||
import { initAnnouncement } from './announcement.ts';
|
import { initAnnouncement } from './announcement.ts';
|
||||||
import { PromptCache } from './call/prompt-cache.ts';
|
import { PromptCache } from './call/prompt-cache.ts';
|
||||||
import { VoiceboxManager } from './voicebox.ts';
|
import { VoiceboxManager } from './voicebox.ts';
|
||||||
@@ -523,9 +522,8 @@ async function startProxyEngine(): Promise<void> {
|
|||||||
const deviceList = appConfig.devices.map((d) => d.displayName).join(', ');
|
const deviceList = appConfig.devices.map((d) => d.displayName).join(', ');
|
||||||
log(`proxy engine started | LAN ${appConfig.proxy.lanIp}:${appConfig.proxy.lanPort} | providers: ${providerList} | devices: ${deviceList}`);
|
log(`proxy engine started | LAN ${appConfig.proxy.lanIp}:${appConfig.proxy.lanPort} | providers: ${providerList} | devices: ${deviceList}`);
|
||||||
|
|
||||||
// Initialize audio codec bridge (still needed for WebRTC transcoding).
|
// Generate TTS audio (WAV files on disk, played by Rust audio_player).
|
||||||
try {
|
try {
|
||||||
await initCodecBridge(log);
|
|
||||||
await initAnnouncement(log);
|
await initAnnouncement(log);
|
||||||
|
|
||||||
// Pre-generate prompts.
|
// Pre-generate prompts.
|
||||||
@@ -547,7 +545,7 @@ async function startProxyEngine(): Promise<void> {
|
|||||||
}
|
}
|
||||||
log(`[startup] prompts cached: ${promptCache.listIds().join(', ') || 'none'}`);
|
log(`[startup] prompts cached: ${promptCache.listIds().join(', ') || 'none'}`);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
log(`[codec] init failed: ${e}`);
|
log(`[tts] init failed: ${e}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -3,6 +3,6 @@
|
|||||||
*/
|
*/
|
||||||
export const commitinfo = {
|
export const commitinfo = {
|
||||||
name: 'siprouter',
|
name: 'siprouter',
|
||||||
version: '1.15.0',
|
version: '1.16.0',
|
||||||
description: 'undefined'
|
description: 'undefined'
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user