feat(proxy-engine): add on-demand TTS caching for voicemail and IVR prompts
This commit is contained in:
@@ -3,6 +3,6 @@
|
||||
*/
|
||||
export const commitinfo = {
|
||||
name: 'siprouter',
|
||||
version: '1.21.0',
|
||||
version: '1.22.0',
|
||||
description: 'undefined'
|
||||
}
|
||||
|
||||
@@ -1,137 +0,0 @@
|
||||
/**
|
||||
* TTS announcement module — generates announcement WAV files at startup.
|
||||
*
|
||||
* Engine priority: espeak-ng (formant TTS, fast) → Kokoro neural TTS via
|
||||
* proxy-engine → disabled.
|
||||
*
|
||||
* The generated WAV is left on disk for Rust's audio_player / start_interaction
|
||||
* to play during calls. No encoding or RTP playback happens in TypeScript.
|
||||
*/
|
||||
|
||||
import { execSync } from 'node:child_process';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { sendProxyCommand, isProxyReady } from './proxybridge.ts';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// State
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
|
||||
const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now.";
|
||||
const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav');
|
||||
|
||||
// Kokoro fallback constants.
|
||||
const KOKORO_MODEL = 'kokoro-v1.0.onnx';
|
||||
const KOKORO_VOICES = 'voices.bin';
|
||||
const KOKORO_VOICE = 'af_bella';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// TTS generators
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Check if espeak-ng is available on the system. */
|
||||
function isEspeakAvailable(): boolean {
|
||||
try {
|
||||
execSync('which espeak-ng', { stdio: 'pipe' });
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/** Generate announcement WAV via espeak-ng (primary engine). */
|
||||
function generateViaEspeak(wavPath: string, text: string, log: (msg: string) => void): boolean {
|
||||
log('[tts] generating announcement audio via espeak-ng...');
|
||||
try {
|
||||
execSync(
|
||||
`espeak-ng -v en-us -s 150 -w "${wavPath}" "${text}"`,
|
||||
{ timeout: 10000, stdio: 'pipe' },
|
||||
);
|
||||
log('[tts] espeak-ng WAV generated');
|
||||
return true;
|
||||
} catch (e: any) {
|
||||
log(`[tts] espeak-ng failed: ${e.message}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/** Generate announcement WAV via Kokoro TTS (fallback, runs inside proxy-engine). */
|
||||
async function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): Promise<boolean> {
|
||||
const modelPath = path.join(TTS_DIR, KOKORO_MODEL);
|
||||
const voicesPath = path.join(TTS_DIR, KOKORO_VOICES);
|
||||
|
||||
if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) {
|
||||
log('[tts] Kokoro model/voices not found — Kokoro fallback unavailable');
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!isProxyReady()) {
|
||||
log('[tts] proxy-engine not ready — Kokoro fallback unavailable');
|
||||
return false;
|
||||
}
|
||||
|
||||
log('[tts] generating announcement audio via Kokoro TTS (fallback)...');
|
||||
try {
|
||||
await sendProxyCommand('generate_tts', {
|
||||
model: modelPath,
|
||||
voices: voicesPath,
|
||||
voice: KOKORO_VOICE,
|
||||
text,
|
||||
output: wavPath,
|
||||
});
|
||||
log('[tts] Kokoro WAV generated (via proxy-engine)');
|
||||
return true;
|
||||
} catch (e: any) {
|
||||
log(`[tts] Kokoro failed: ${e.message}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Initialization
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Pre-generate the announcement WAV file.
|
||||
* Must be called after the proxy engine is initialized.
|
||||
*
|
||||
* Engine priority: espeak-ng → Kokoro → disabled.
|
||||
*/
|
||||
export async function initAnnouncement(log: (msg: string) => void): Promise<boolean> {
|
||||
fs.mkdirSync(TTS_DIR, { recursive: true });
|
||||
|
||||
try {
|
||||
if (!fs.existsSync(CACHE_WAV)) {
|
||||
let generated = false;
|
||||
|
||||
// Try espeak-ng first.
|
||||
if (isEspeakAvailable()) {
|
||||
generated = generateViaEspeak(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
|
||||
} else {
|
||||
log('[tts] espeak-ng not installed — trying Kokoro fallback');
|
||||
}
|
||||
|
||||
// Fall back to Kokoro (via proxy-engine).
|
||||
if (!generated) {
|
||||
generated = await generateViaKokoro(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
|
||||
}
|
||||
|
||||
if (!generated) {
|
||||
log('[tts] no TTS engine available — announcements disabled');
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
log('[tts] announcement WAV ready');
|
||||
return true;
|
||||
} catch (e: any) {
|
||||
log(`[tts] init error: ${e.message}`);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/** Get the path to the cached announcement WAV, or null if not generated. */
|
||||
export function getAnnouncementWavPath(): string | null {
|
||||
return fs.existsSync(CACHE_WAV) ? CACHE_WAV : null;
|
||||
}
|
||||
@@ -1,275 +0,0 @@
|
||||
/**
|
||||
* PromptCache — manages named audio prompt WAV files for IVR and voicemail.
|
||||
*
|
||||
* Generates WAV files via espeak-ng (primary) or Kokoro TTS through the
|
||||
* proxy-engine (fallback). Also supports loading pre-existing WAV files
|
||||
* and programmatic tone generation.
|
||||
*
|
||||
* All audio playback happens in Rust (audio_player / start_interaction).
|
||||
* This module only manages WAV files on disk.
|
||||
*/
|
||||
|
||||
import { execSync } from 'node:child_process';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { Buffer } from 'node:buffer';
|
||||
import { sendProxyCommand, isProxyReady } from '../proxybridge.ts';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** A cached prompt — just a WAV file path and metadata. */
|
||||
export interface ICachedPrompt {
|
||||
/** Unique prompt identifier. */
|
||||
id: string;
|
||||
/** Path to the WAV file on disk. */
|
||||
wavPath: string;
|
||||
/** Total duration in milliseconds (approximate, from WAV header). */
|
||||
durationMs: number;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// TTS helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
|
||||
|
||||
/** Check if espeak-ng is available. */
|
||||
function isEspeakAvailable(): boolean {
|
||||
try {
|
||||
execSync('which espeak-ng', { stdio: 'pipe' });
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/** Generate WAV via espeak-ng. */
|
||||
function generateViaEspeak(wavPath: string, text: string): boolean {
|
||||
try {
|
||||
execSync(
|
||||
`espeak-ng -v en-us -s 150 -w "${wavPath}" "${text}"`,
|
||||
{ timeout: 10000, stdio: 'pipe' },
|
||||
);
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/** Generate WAV via Kokoro TTS (runs inside proxy-engine). */
|
||||
async function generateViaKokoro(wavPath: string, text: string, voice: string): Promise<boolean> {
|
||||
const modelPath = path.join(TTS_DIR, 'kokoro-v1.0.onnx');
|
||||
const voicesPath = path.join(TTS_DIR, 'voices.bin');
|
||||
if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) return false;
|
||||
if (!isProxyReady()) return false;
|
||||
|
||||
try {
|
||||
await sendProxyCommand('generate_tts', {
|
||||
model: modelPath,
|
||||
voices: voicesPath,
|
||||
voice,
|
||||
text,
|
||||
output: wavPath,
|
||||
});
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/** Read a WAV file's duration from its header. */
|
||||
function getWavDurationMs(wavPath: string): number {
|
||||
try {
|
||||
const wav = fs.readFileSync(wavPath);
|
||||
if (wav.length < 44) return 0;
|
||||
if (wav.toString('ascii', 0, 4) !== 'RIFF') return 0;
|
||||
|
||||
let sampleRate = 16000;
|
||||
let dataSize = 0;
|
||||
let bitsPerSample = 16;
|
||||
let channels = 1;
|
||||
let offset = 12;
|
||||
|
||||
while (offset < wav.length - 8) {
|
||||
const chunkId = wav.toString('ascii', offset, offset + 4);
|
||||
const chunkSize = wav.readUInt32LE(offset + 4);
|
||||
if (chunkId === 'fmt ') {
|
||||
channels = wav.readUInt16LE(offset + 10);
|
||||
sampleRate = wav.readUInt32LE(offset + 12);
|
||||
bitsPerSample = wav.readUInt16LE(offset + 22);
|
||||
}
|
||||
if (chunkId === 'data') {
|
||||
dataSize = chunkSize;
|
||||
}
|
||||
offset += 8 + chunkSize;
|
||||
if (offset % 2 !== 0) offset++;
|
||||
}
|
||||
|
||||
const bytesPerSample = (bitsPerSample / 8) * channels;
|
||||
const totalSamples = bytesPerSample > 0 ? dataSize / bytesPerSample : 0;
|
||||
return sampleRate > 0 ? Math.round((totalSamples / sampleRate) * 1000) : 0;
|
||||
} catch {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// PromptCache
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export class PromptCache {
|
||||
private prompts = new Map<string, ICachedPrompt>();
|
||||
private log: (msg: string) => void;
|
||||
private espeakAvailable: boolean | null = null;
|
||||
|
||||
constructor(log: (msg: string) => void) {
|
||||
this.log = log;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Public API
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/** Get a cached prompt by ID. */
|
||||
get(id: string): ICachedPrompt | null {
|
||||
return this.prompts.get(id) ?? null;
|
||||
}
|
||||
|
||||
/** Check if a prompt is cached. */
|
||||
has(id: string): boolean {
|
||||
return this.prompts.has(id);
|
||||
}
|
||||
|
||||
/** List all cached prompt IDs. */
|
||||
listIds(): string[] {
|
||||
return [...this.prompts.keys()];
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a TTS prompt WAV and cache its path.
|
||||
* Uses espeak-ng (primary) or Kokoro (fallback).
|
||||
*/
|
||||
async generatePrompt(id: string, text: string, voice = 'af_bella'): Promise<ICachedPrompt | null> {
|
||||
fs.mkdirSync(TTS_DIR, { recursive: true });
|
||||
const wavPath = path.join(TTS_DIR, `prompt-${id}.wav`);
|
||||
|
||||
// Check espeak availability once.
|
||||
if (this.espeakAvailable === null) {
|
||||
this.espeakAvailable = isEspeakAvailable();
|
||||
}
|
||||
|
||||
// Generate WAV if not already on disk.
|
||||
if (!fs.existsSync(wavPath)) {
|
||||
let generated = false;
|
||||
if (this.espeakAvailable) {
|
||||
generated = generateViaEspeak(wavPath, text);
|
||||
}
|
||||
if (!generated) {
|
||||
generated = await generateViaKokoro(wavPath, text, voice);
|
||||
}
|
||||
if (!generated) {
|
||||
this.log(`[prompt-cache] failed to generate TTS for "${id}"`);
|
||||
return null;
|
||||
}
|
||||
this.log(`[prompt-cache] generated WAV for "${id}"`);
|
||||
}
|
||||
|
||||
return this.registerWav(id, wavPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Load a pre-existing WAV file as a prompt.
|
||||
*/
|
||||
async loadWavPrompt(id: string, wavPath: string): Promise<ICachedPrompt | null> {
|
||||
if (!fs.existsSync(wavPath)) {
|
||||
this.log(`[prompt-cache] WAV not found: ${wavPath}`);
|
||||
return null;
|
||||
}
|
||||
return this.registerWav(id, wavPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a beep tone WAV and cache it.
|
||||
*/
|
||||
async generateBeep(
|
||||
id: string,
|
||||
freqHz = 1000,
|
||||
durationMs = 500,
|
||||
amplitude = 8000,
|
||||
): Promise<ICachedPrompt | null> {
|
||||
fs.mkdirSync(TTS_DIR, { recursive: true });
|
||||
const wavPath = path.join(TTS_DIR, `prompt-${id}.wav`);
|
||||
|
||||
if (!fs.existsSync(wavPath)) {
|
||||
// Generate 16kHz 16-bit mono sine wave WAV.
|
||||
const sampleRate = 16000;
|
||||
const totalSamples = Math.floor((sampleRate * durationMs) / 1000);
|
||||
const pcm = Buffer.alloc(totalSamples * 2);
|
||||
|
||||
for (let i = 0; i < totalSamples; i++) {
|
||||
const t = i / sampleRate;
|
||||
const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade
|
||||
let envelope = 1.0;
|
||||
if (i < fadeLen) envelope = i / fadeLen;
|
||||
else if (i > totalSamples - fadeLen) envelope = (totalSamples - i) / fadeLen;
|
||||
|
||||
const sample = Math.round(Math.sin(2 * Math.PI * freqHz * t) * amplitude * envelope);
|
||||
pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2);
|
||||
}
|
||||
|
||||
// Write WAV file.
|
||||
const headerSize = 44;
|
||||
const dataSize = pcm.length;
|
||||
const wav = Buffer.alloc(headerSize + dataSize);
|
||||
|
||||
// RIFF header
|
||||
wav.write('RIFF', 0);
|
||||
wav.writeUInt32LE(36 + dataSize, 4);
|
||||
wav.write('WAVE', 8);
|
||||
|
||||
// fmt chunk
|
||||
wav.write('fmt ', 12);
|
||||
wav.writeUInt32LE(16, 16); // chunk size
|
||||
wav.writeUInt16LE(1, 20); // PCM format
|
||||
wav.writeUInt16LE(1, 22); // mono
|
||||
wav.writeUInt32LE(sampleRate, 24);
|
||||
wav.writeUInt32LE(sampleRate * 2, 28); // byte rate
|
||||
wav.writeUInt16LE(2, 32); // block align
|
||||
wav.writeUInt16LE(16, 34); // bits per sample
|
||||
|
||||
// data chunk
|
||||
wav.write('data', 36);
|
||||
wav.writeUInt32LE(dataSize, 40);
|
||||
pcm.copy(wav, 44);
|
||||
|
||||
fs.writeFileSync(wavPath, wav);
|
||||
this.log(`[prompt-cache] beep WAV generated for "${id}"`);
|
||||
}
|
||||
|
||||
return this.registerWav(id, wavPath);
|
||||
}
|
||||
|
||||
/** Remove a prompt from the cache. */
|
||||
remove(id: string): void {
|
||||
this.prompts.delete(id);
|
||||
}
|
||||
|
||||
/** Clear all cached prompts. */
|
||||
clear(): void {
|
||||
this.prompts.clear();
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Internal
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
private registerWav(id: string, wavPath: string): ICachedPrompt {
|
||||
const durationMs = getWavDurationMs(wavPath);
|
||||
const prompt: ICachedPrompt = { id, wavPath, durationMs };
|
||||
this.prompts.set(id, prompt);
|
||||
this.log(`[prompt-cache] cached "${id}": ${wavPath} (${(durationMs / 1000).toFixed(1)}s)`);
|
||||
return prompt;
|
||||
}
|
||||
}
|
||||
@@ -88,7 +88,7 @@ type TProxyCommands = {
|
||||
result: Record<string, never>;
|
||||
};
|
||||
generate_tts: {
|
||||
params: { model: string; voices: string; voice: string; text: string; output: string };
|
||||
params: { model: string; voices: string; voice: string; text: string; output: string; cacheable?: boolean };
|
||||
result: { output: string };
|
||||
};
|
||||
// WebRTC signaling — bridged from the browser via the TS control plane.
|
||||
|
||||
@@ -24,8 +24,6 @@ import {
|
||||
getAllBrowserDeviceIds,
|
||||
getBrowserDeviceWs,
|
||||
} from './webrtcbridge.ts';
|
||||
import { initAnnouncement } from './announcement.ts';
|
||||
import { PromptCache } from './call/prompt-cache.ts';
|
||||
import { VoiceboxManager } from './voicebox.ts';
|
||||
import {
|
||||
initProxyEngine,
|
||||
@@ -170,7 +168,6 @@ for (const d of appConfig.devices) {
|
||||
// Initialize subsystems
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const promptCache = new PromptCache(log);
|
||||
const voiceboxManager = new VoiceboxManager(log);
|
||||
voiceboxManager.init(appConfig.voiceboxes ?? []);
|
||||
|
||||
@@ -519,6 +516,8 @@ async function startProxyEngine(): Promise<void> {
|
||||
providers: appConfig.providers,
|
||||
devices: appConfig.devices,
|
||||
routing: appConfig.routing,
|
||||
voiceboxes: appConfig.voiceboxes ?? [],
|
||||
ivr: appConfig.ivr,
|
||||
});
|
||||
|
||||
if (!configured) {
|
||||
@@ -530,31 +529,8 @@ async function startProxyEngine(): Promise<void> {
|
||||
const deviceList = appConfig.devices.map((d) => d.displayName).join(', ');
|
||||
log(`proxy engine started | LAN ${appConfig.proxy.lanIp}:${appConfig.proxy.lanPort} | providers: ${providerList} | devices: ${deviceList}`);
|
||||
|
||||
// Generate TTS audio (WAV files on disk, played by Rust audio_player).
|
||||
try {
|
||||
await initAnnouncement(log);
|
||||
|
||||
// Pre-generate prompts.
|
||||
await promptCache.generateBeep('voicemail-beep', 1000, 500, 8000);
|
||||
for (const vb of appConfig.voiceboxes ?? []) {
|
||||
if (!vb.enabled) continue;
|
||||
const promptId = `voicemail-greeting-${vb.id}`;
|
||||
if (vb.greetingWavPath) {
|
||||
await promptCache.loadWavPrompt(promptId, vb.greetingWavPath);
|
||||
} else {
|
||||
const text = vb.greetingText || 'The person you are trying to reach is not available. Please leave a message after the tone.';
|
||||
await promptCache.generatePrompt(promptId, text, vb.greetingVoice || 'af_bella');
|
||||
}
|
||||
}
|
||||
if (appConfig.ivr?.enabled) {
|
||||
for (const menu of appConfig.ivr.menus) {
|
||||
await promptCache.generatePrompt(`ivr-menu-${menu.id}`, menu.promptText, menu.promptVoice || 'af_bella');
|
||||
}
|
||||
}
|
||||
log(`[startup] prompts cached: ${promptCache.listIds().join(', ') || 'none'}`);
|
||||
} catch (e) {
|
||||
log(`[tts] init failed: ${e}`);
|
||||
}
|
||||
// TTS prompts (voicemail greetings, IVR menus) are generated on-demand
|
||||
// by the Rust TTS engine when first needed. No startup pre-generation.
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -620,6 +596,8 @@ initWebUi(
|
||||
providers: fresh.providers,
|
||||
devices: fresh.devices,
|
||||
routing: fresh.routing,
|
||||
voiceboxes: fresh.voiceboxes ?? [],
|
||||
ivr: fresh.ivr,
|
||||
}).then((ok) => {
|
||||
if (ok) log('[config] reloaded — proxy engine reconfigured');
|
||||
else log('[config] reload failed — proxy engine rejected config');
|
||||
|
||||
Reference in New Issue
Block a user