feat(proxy-engine): add on-demand TTS caching for voicemail and IVR prompts

This commit is contained in:
2026-04-12 20:45:08 +00:00
parent cfadd7a2b6
commit 59d8c2557c
17 changed files with 460 additions and 488 deletions

View File

@@ -3,6 +3,6 @@
*/
export const commitinfo = {
name: 'siprouter',
version: '1.21.0',
version: '1.22.0',
description: 'undefined'
}

View File

@@ -1,137 +0,0 @@
/**
* TTS announcement module — generates announcement WAV files at startup.
*
* Engine priority: espeak-ng (formant TTS, fast) → Kokoro neural TTS via
* proxy-engine → disabled.
*
* The generated WAV is left on disk for Rust's audio_player / start_interaction
* to play during calls. No encoding or RTP playback happens in TypeScript.
*/
import { execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { sendProxyCommand, isProxyReady } from './proxybridge.ts';
// ---------------------------------------------------------------------------
// State
// ---------------------------------------------------------------------------
const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now.";
const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav');
// Kokoro fallback constants.
const KOKORO_MODEL = 'kokoro-v1.0.onnx';
const KOKORO_VOICES = 'voices.bin';
const KOKORO_VOICE = 'af_bella';
// ---------------------------------------------------------------------------
// TTS generators
// ---------------------------------------------------------------------------
/** Check if espeak-ng is available on the system. */
function isEspeakAvailable(): boolean {
try {
execSync('which espeak-ng', { stdio: 'pipe' });
return true;
} catch {
return false;
}
}
/** Generate announcement WAV via espeak-ng (primary engine). */
function generateViaEspeak(wavPath: string, text: string, log: (msg: string) => void): boolean {
log('[tts] generating announcement audio via espeak-ng...');
try {
execSync(
`espeak-ng -v en-us -s 150 -w "${wavPath}" "${text}"`,
{ timeout: 10000, stdio: 'pipe' },
);
log('[tts] espeak-ng WAV generated');
return true;
} catch (e: any) {
log(`[tts] espeak-ng failed: ${e.message}`);
return false;
}
}
/** Generate announcement WAV via Kokoro TTS (fallback, runs inside proxy-engine). */
async function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): Promise<boolean> {
const modelPath = path.join(TTS_DIR, KOKORO_MODEL);
const voicesPath = path.join(TTS_DIR, KOKORO_VOICES);
if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) {
log('[tts] Kokoro model/voices not found — Kokoro fallback unavailable');
return false;
}
if (!isProxyReady()) {
log('[tts] proxy-engine not ready — Kokoro fallback unavailable');
return false;
}
log('[tts] generating announcement audio via Kokoro TTS (fallback)...');
try {
await sendProxyCommand('generate_tts', {
model: modelPath,
voices: voicesPath,
voice: KOKORO_VOICE,
text,
output: wavPath,
});
log('[tts] Kokoro WAV generated (via proxy-engine)');
return true;
} catch (e: any) {
log(`[tts] Kokoro failed: ${e.message}`);
return false;
}
}
// ---------------------------------------------------------------------------
// Initialization
// ---------------------------------------------------------------------------
/**
* Pre-generate the announcement WAV file.
* Must be called after the proxy engine is initialized.
*
* Engine priority: espeak-ng → Kokoro → disabled.
*/
export async function initAnnouncement(log: (msg: string) => void): Promise<boolean> {
fs.mkdirSync(TTS_DIR, { recursive: true });
try {
if (!fs.existsSync(CACHE_WAV)) {
let generated = false;
// Try espeak-ng first.
if (isEspeakAvailable()) {
generated = generateViaEspeak(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
} else {
log('[tts] espeak-ng not installed — trying Kokoro fallback');
}
// Fall back to Kokoro (via proxy-engine).
if (!generated) {
generated = await generateViaKokoro(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
}
if (!generated) {
log('[tts] no TTS engine available — announcements disabled');
return false;
}
}
log('[tts] announcement WAV ready');
return true;
} catch (e: any) {
log(`[tts] init error: ${e.message}`);
return false;
}
}
/** Get the path to the cached announcement WAV, or null if not generated. */
export function getAnnouncementWavPath(): string | null {
return fs.existsSync(CACHE_WAV) ? CACHE_WAV : null;
}

View File

@@ -1,275 +0,0 @@
/**
* PromptCache — manages named audio prompt WAV files for IVR and voicemail.
*
* Generates WAV files via espeak-ng (primary) or Kokoro TTS through the
* proxy-engine (fallback). Also supports loading pre-existing WAV files
* and programmatic tone generation.
*
* All audio playback happens in Rust (audio_player / start_interaction).
* This module only manages WAV files on disk.
*/
import { execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { Buffer } from 'node:buffer';
import { sendProxyCommand, isProxyReady } from '../proxybridge.ts';
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
/** A cached prompt — just a WAV file path and metadata. */
export interface ICachedPrompt {
/** Unique prompt identifier. */
id: string;
/** Path to the WAV file on disk. */
wavPath: string;
/** Total duration in milliseconds (approximate, from WAV header). */
durationMs: number;
}
// ---------------------------------------------------------------------------
// TTS helpers
// ---------------------------------------------------------------------------
const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
/** Check if espeak-ng is available. */
function isEspeakAvailable(): boolean {
try {
execSync('which espeak-ng', { stdio: 'pipe' });
return true;
} catch {
return false;
}
}
/** Generate WAV via espeak-ng. */
function generateViaEspeak(wavPath: string, text: string): boolean {
try {
execSync(
`espeak-ng -v en-us -s 150 -w "${wavPath}" "${text}"`,
{ timeout: 10000, stdio: 'pipe' },
);
return true;
} catch {
return false;
}
}
/** Generate WAV via Kokoro TTS (runs inside proxy-engine). */
async function generateViaKokoro(wavPath: string, text: string, voice: string): Promise<boolean> {
const modelPath = path.join(TTS_DIR, 'kokoro-v1.0.onnx');
const voicesPath = path.join(TTS_DIR, 'voices.bin');
if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) return false;
if (!isProxyReady()) return false;
try {
await sendProxyCommand('generate_tts', {
model: modelPath,
voices: voicesPath,
voice,
text,
output: wavPath,
});
return true;
} catch {
return false;
}
}
/** Read a WAV file's duration from its header. */
function getWavDurationMs(wavPath: string): number {
try {
const wav = fs.readFileSync(wavPath);
if (wav.length < 44) return 0;
if (wav.toString('ascii', 0, 4) !== 'RIFF') return 0;
let sampleRate = 16000;
let dataSize = 0;
let bitsPerSample = 16;
let channels = 1;
let offset = 12;
while (offset < wav.length - 8) {
const chunkId = wav.toString('ascii', offset, offset + 4);
const chunkSize = wav.readUInt32LE(offset + 4);
if (chunkId === 'fmt ') {
channels = wav.readUInt16LE(offset + 10);
sampleRate = wav.readUInt32LE(offset + 12);
bitsPerSample = wav.readUInt16LE(offset + 22);
}
if (chunkId === 'data') {
dataSize = chunkSize;
}
offset += 8 + chunkSize;
if (offset % 2 !== 0) offset++;
}
const bytesPerSample = (bitsPerSample / 8) * channels;
const totalSamples = bytesPerSample > 0 ? dataSize / bytesPerSample : 0;
return sampleRate > 0 ? Math.round((totalSamples / sampleRate) * 1000) : 0;
} catch {
return 0;
}
}
// ---------------------------------------------------------------------------
// PromptCache
// ---------------------------------------------------------------------------
export class PromptCache {
private prompts = new Map<string, ICachedPrompt>();
private log: (msg: string) => void;
private espeakAvailable: boolean | null = null;
constructor(log: (msg: string) => void) {
this.log = log;
}
// -------------------------------------------------------------------------
// Public API
// -------------------------------------------------------------------------
/** Get a cached prompt by ID. */
get(id: string): ICachedPrompt | null {
return this.prompts.get(id) ?? null;
}
/** Check if a prompt is cached. */
has(id: string): boolean {
return this.prompts.has(id);
}
/** List all cached prompt IDs. */
listIds(): string[] {
return [...this.prompts.keys()];
}
/**
* Generate a TTS prompt WAV and cache its path.
* Uses espeak-ng (primary) or Kokoro (fallback).
*/
async generatePrompt(id: string, text: string, voice = 'af_bella'): Promise<ICachedPrompt | null> {
fs.mkdirSync(TTS_DIR, { recursive: true });
const wavPath = path.join(TTS_DIR, `prompt-${id}.wav`);
// Check espeak availability once.
if (this.espeakAvailable === null) {
this.espeakAvailable = isEspeakAvailable();
}
// Generate WAV if not already on disk.
if (!fs.existsSync(wavPath)) {
let generated = false;
if (this.espeakAvailable) {
generated = generateViaEspeak(wavPath, text);
}
if (!generated) {
generated = await generateViaKokoro(wavPath, text, voice);
}
if (!generated) {
this.log(`[prompt-cache] failed to generate TTS for "${id}"`);
return null;
}
this.log(`[prompt-cache] generated WAV for "${id}"`);
}
return this.registerWav(id, wavPath);
}
/**
* Load a pre-existing WAV file as a prompt.
*/
async loadWavPrompt(id: string, wavPath: string): Promise<ICachedPrompt | null> {
if (!fs.existsSync(wavPath)) {
this.log(`[prompt-cache] WAV not found: ${wavPath}`);
return null;
}
return this.registerWav(id, wavPath);
}
/**
* Generate a beep tone WAV and cache it.
*/
async generateBeep(
id: string,
freqHz = 1000,
durationMs = 500,
amplitude = 8000,
): Promise<ICachedPrompt | null> {
fs.mkdirSync(TTS_DIR, { recursive: true });
const wavPath = path.join(TTS_DIR, `prompt-${id}.wav`);
if (!fs.existsSync(wavPath)) {
// Generate 16kHz 16-bit mono sine wave WAV.
const sampleRate = 16000;
const totalSamples = Math.floor((sampleRate * durationMs) / 1000);
const pcm = Buffer.alloc(totalSamples * 2);
for (let i = 0; i < totalSamples; i++) {
const t = i / sampleRate;
const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade
let envelope = 1.0;
if (i < fadeLen) envelope = i / fadeLen;
else if (i > totalSamples - fadeLen) envelope = (totalSamples - i) / fadeLen;
const sample = Math.round(Math.sin(2 * Math.PI * freqHz * t) * amplitude * envelope);
pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2);
}
// Write WAV file.
const headerSize = 44;
const dataSize = pcm.length;
const wav = Buffer.alloc(headerSize + dataSize);
// RIFF header
wav.write('RIFF', 0);
wav.writeUInt32LE(36 + dataSize, 4);
wav.write('WAVE', 8);
// fmt chunk
wav.write('fmt ', 12);
wav.writeUInt32LE(16, 16); // chunk size
wav.writeUInt16LE(1, 20); // PCM format
wav.writeUInt16LE(1, 22); // mono
wav.writeUInt32LE(sampleRate, 24);
wav.writeUInt32LE(sampleRate * 2, 28); // byte rate
wav.writeUInt16LE(2, 32); // block align
wav.writeUInt16LE(16, 34); // bits per sample
// data chunk
wav.write('data', 36);
wav.writeUInt32LE(dataSize, 40);
pcm.copy(wav, 44);
fs.writeFileSync(wavPath, wav);
this.log(`[prompt-cache] beep WAV generated for "${id}"`);
}
return this.registerWav(id, wavPath);
}
/** Remove a prompt from the cache. */
remove(id: string): void {
this.prompts.delete(id);
}
/** Clear all cached prompts. */
clear(): void {
this.prompts.clear();
}
// -------------------------------------------------------------------------
// Internal
// -------------------------------------------------------------------------
private registerWav(id: string, wavPath: string): ICachedPrompt {
const durationMs = getWavDurationMs(wavPath);
const prompt: ICachedPrompt = { id, wavPath, durationMs };
this.prompts.set(id, prompt);
this.log(`[prompt-cache] cached "${id}": ${wavPath} (${(durationMs / 1000).toFixed(1)}s)`);
return prompt;
}
}

View File

@@ -88,7 +88,7 @@ type TProxyCommands = {
result: Record<string, never>;
};
generate_tts: {
params: { model: string; voices: string; voice: string; text: string; output: string };
params: { model: string; voices: string; voice: string; text: string; output: string; cacheable?: boolean };
result: { output: string };
};
// WebRTC signaling — bridged from the browser via the TS control plane.

View File

@@ -24,8 +24,6 @@ import {
getAllBrowserDeviceIds,
getBrowserDeviceWs,
} from './webrtcbridge.ts';
import { initAnnouncement } from './announcement.ts';
import { PromptCache } from './call/prompt-cache.ts';
import { VoiceboxManager } from './voicebox.ts';
import {
initProxyEngine,
@@ -170,7 +168,6 @@ for (const d of appConfig.devices) {
// Initialize subsystems
// ---------------------------------------------------------------------------
const promptCache = new PromptCache(log);
const voiceboxManager = new VoiceboxManager(log);
voiceboxManager.init(appConfig.voiceboxes ?? []);
@@ -519,6 +516,8 @@ async function startProxyEngine(): Promise<void> {
providers: appConfig.providers,
devices: appConfig.devices,
routing: appConfig.routing,
voiceboxes: appConfig.voiceboxes ?? [],
ivr: appConfig.ivr,
});
if (!configured) {
@@ -530,31 +529,8 @@ async function startProxyEngine(): Promise<void> {
const deviceList = appConfig.devices.map((d) => d.displayName).join(', ');
log(`proxy engine started | LAN ${appConfig.proxy.lanIp}:${appConfig.proxy.lanPort} | providers: ${providerList} | devices: ${deviceList}`);
// Generate TTS audio (WAV files on disk, played by Rust audio_player).
try {
await initAnnouncement(log);
// Pre-generate prompts.
await promptCache.generateBeep('voicemail-beep', 1000, 500, 8000);
for (const vb of appConfig.voiceboxes ?? []) {
if (!vb.enabled) continue;
const promptId = `voicemail-greeting-${vb.id}`;
if (vb.greetingWavPath) {
await promptCache.loadWavPrompt(promptId, vb.greetingWavPath);
} else {
const text = vb.greetingText || 'The person you are trying to reach is not available. Please leave a message after the tone.';
await promptCache.generatePrompt(promptId, text, vb.greetingVoice || 'af_bella');
}
}
if (appConfig.ivr?.enabled) {
for (const menu of appConfig.ivr.menus) {
await promptCache.generatePrompt(`ivr-menu-${menu.id}`, menu.promptText, menu.promptVoice || 'af_bella');
}
}
log(`[startup] prompts cached: ${promptCache.listIds().join(', ') || 'none'}`);
} catch (e) {
log(`[tts] init failed: ${e}`);
}
// TTS prompts (voicemail greetings, IVR menus) are generated on-demand
// by the Rust TTS engine when first needed. No startup pre-generation.
}
// ---------------------------------------------------------------------------
@@ -620,6 +596,8 @@ initWebUi(
providers: fresh.providers,
devices: fresh.devices,
routing: fresh.routing,
voiceboxes: fresh.voiceboxes ?? [],
ivr: fresh.ivr,
}).then((ok) => {
if (ok) log('[config] reloaded — proxy engine reconfigured');
else log('[config] reload failed — proxy engine rejected config');