feat(proxy-engine): add on-demand TTS caching for voicemail and IVR prompts

2026-04-12 20:45:08 +00:00
parent cfadd7a2b6
commit 59d8c2557c
17 changed files with 460 additions and 488 deletions
--- a/ts/00_commitinfo_data.ts
+++ b/ts/00_commitinfo_data.ts
@@ -3,6 +3,6 @@
 */
 export const commitinfo = {
  name: 'siprouter',
-  version: '1.21.0',
+  version: '1.22.0',
  description: 'undefined'
 }
--- a/ts/announcement.ts
+++ b/ts/announcement.ts
@@ -1,137 +0,0 @@
-/**
- * TTS announcement module — generates announcement WAV files at startup.
- *
- * Engine priority: espeak-ng (formant TTS, fast) → Kokoro neural TTS via
- * proxy-engine → disabled.
- *
- * The generated WAV is left on disk for Rust's audio_player / start_interaction
- * to play during calls. No encoding or RTP playback happens in TypeScript.
- */
-
-import { execSync } from 'node:child_process';
-import fs from 'node:fs';
-import path from 'node:path';
-import { sendProxyCommand, isProxyReady } from './proxybridge.ts';
-
-// ---------------------------------------------------------------------------
-// State
-// ---------------------------------------------------------------------------
-
-const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
-const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now.";
-const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav');
-
-// Kokoro fallback constants.
-const KOKORO_MODEL = 'kokoro-v1.0.onnx';
-const KOKORO_VOICES = 'voices.bin';
-const KOKORO_VOICE = 'af_bella';
-
-// ---------------------------------------------------------------------------
-// TTS generators
-// ---------------------------------------------------------------------------
-
-/** Check if espeak-ng is available on the system. */
-function isEspeakAvailable(): boolean {
-  try {
-    execSync('which espeak-ng', { stdio: 'pipe' });
-    return true;
-  } catch {
-    return false;
-  }
-}
-
-/** Generate announcement WAV via espeak-ng (primary engine). */
-function generateViaEspeak(wavPath: string, text: string, log: (msg: string) => void): boolean {
-  log('[tts] generating announcement audio via espeak-ng...');
-  try {
-    execSync(
-      `espeak-ng -v en-us -s 150 -w "${wavPath}" "${text}"`,
-      { timeout: 10000, stdio: 'pipe' },
-    );
-    log('[tts] espeak-ng WAV generated');
-    return true;
-  } catch (e: any) {
-    log(`[tts] espeak-ng failed: ${e.message}`);
-    return false;
-  }
-}
-
-/** Generate announcement WAV via Kokoro TTS (fallback, runs inside proxy-engine). */
-async function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): Promise<boolean> {
-  const modelPath = path.join(TTS_DIR, KOKORO_MODEL);
-  const voicesPath = path.join(TTS_DIR, KOKORO_VOICES);
-
-  if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) {
-    log('[tts] Kokoro model/voices not found — Kokoro fallback unavailable');
-    return false;
-  }
-
-  if (!isProxyReady()) {
-    log('[tts] proxy-engine not ready — Kokoro fallback unavailable');
-    return false;
-  }
-
-  log('[tts] generating announcement audio via Kokoro TTS (fallback)...');
-  try {
-    await sendProxyCommand('generate_tts', {
-      model: modelPath,
-      voices: voicesPath,
-      voice: KOKORO_VOICE,
-      text,
-      output: wavPath,
-    });
-    log('[tts] Kokoro WAV generated (via proxy-engine)');
-    return true;
-  } catch (e: any) {
-    log(`[tts] Kokoro failed: ${e.message}`);
-    return false;
-  }
-}
-
-// ---------------------------------------------------------------------------
-// Initialization
-// ---------------------------------------------------------------------------
-
-/**
- * Pre-generate the announcement WAV file.
- * Must be called after the proxy engine is initialized.
- *
- * Engine priority: espeak-ng → Kokoro → disabled.
- */
-export async function initAnnouncement(log: (msg: string) => void): Promise<boolean> {
-  fs.mkdirSync(TTS_DIR, { recursive: true });
-
-  try {
-    if (!fs.existsSync(CACHE_WAV)) {
-      let generated = false;
-
-      // Try espeak-ng first.
-      if (isEspeakAvailable()) {
-        generated = generateViaEspeak(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
-      } else {
-        log('[tts] espeak-ng not installed — trying Kokoro fallback');
-      }
-
-      // Fall back to Kokoro (via proxy-engine).
-      if (!generated) {
-        generated = await generateViaKokoro(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
-      }
-
-      if (!generated) {
-        log('[tts] no TTS engine available — announcements disabled');
-        return false;
-      }
-    }
-
-    log('[tts] announcement WAV ready');
-    return true;
-  } catch (e: any) {
-    log(`[tts] init error: ${e.message}`);
-    return false;
-  }
-}
-
-/** Get the path to the cached announcement WAV, or null if not generated. */
-export function getAnnouncementWavPath(): string | null {
-  return fs.existsSync(CACHE_WAV) ? CACHE_WAV : null;
-}
--- a/ts/call/prompt-cache.ts
+++ b/ts/call/prompt-cache.ts
@@ -1,275 +0,0 @@
-/**
- * PromptCache — manages named audio prompt WAV files for IVR and voicemail.
- *
- * Generates WAV files via espeak-ng (primary) or Kokoro TTS through the
- * proxy-engine (fallback). Also supports loading pre-existing WAV files
- * and programmatic tone generation.
- *
- * All audio playback happens in Rust (audio_player / start_interaction).
- * This module only manages WAV files on disk.
- */
-
-import { execSync } from 'node:child_process';
-import fs from 'node:fs';
-import path from 'node:path';
-import { Buffer } from 'node:buffer';
-import { sendProxyCommand, isProxyReady } from '../proxybridge.ts';
-
-// ---------------------------------------------------------------------------
-// Types
-// ---------------------------------------------------------------------------
-
-/** A cached prompt — just a WAV file path and metadata. */
-export interface ICachedPrompt {
-  /** Unique prompt identifier. */
-  id: string;
-  /** Path to the WAV file on disk. */
-  wavPath: string;
-  /** Total duration in milliseconds (approximate, from WAV header). */
-  durationMs: number;
-}
-
-// ---------------------------------------------------------------------------
-// TTS helpers
-// ---------------------------------------------------------------------------
-
-const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
-
-/** Check if espeak-ng is available. */
-function isEspeakAvailable(): boolean {
-  try {
-    execSync('which espeak-ng', { stdio: 'pipe' });
-    return true;
-  } catch {
-    return false;
-  }
-}
-
-/** Generate WAV via espeak-ng. */
-function generateViaEspeak(wavPath: string, text: string): boolean {
-  try {
-    execSync(
-      `espeak-ng -v en-us -s 150 -w "${wavPath}" "${text}"`,
-      { timeout: 10000, stdio: 'pipe' },
-    );
-    return true;
-  } catch {
-    return false;
-  }
-}
-
-/** Generate WAV via Kokoro TTS (runs inside proxy-engine). */
-async function generateViaKokoro(wavPath: string, text: string, voice: string): Promise<boolean> {
-  const modelPath = path.join(TTS_DIR, 'kokoro-v1.0.onnx');
-  const voicesPath = path.join(TTS_DIR, 'voices.bin');
-  if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) return false;
-  if (!isProxyReady()) return false;
-
-  try {
-    await sendProxyCommand('generate_tts', {
-      model: modelPath,
-      voices: voicesPath,
-      voice,
-      text,
-      output: wavPath,
-    });
-    return true;
-  } catch {
-    return false;
-  }
-}
-
-/** Read a WAV file's duration from its header. */
-function getWavDurationMs(wavPath: string): number {
-  try {
-    const wav = fs.readFileSync(wavPath);
-    if (wav.length < 44) return 0;
-    if (wav.toString('ascii', 0, 4) !== 'RIFF') return 0;
-
-    let sampleRate = 16000;
-    let dataSize = 0;
-    let bitsPerSample = 16;
-    let channels = 1;
-    let offset = 12;
-
-    while (offset < wav.length - 8) {
-      const chunkId = wav.toString('ascii', offset, offset + 4);
-      const chunkSize = wav.readUInt32LE(offset + 4);
-      if (chunkId === 'fmt ') {
-        channels = wav.readUInt16LE(offset + 10);
-        sampleRate = wav.readUInt32LE(offset + 12);
-        bitsPerSample = wav.readUInt16LE(offset + 22);
-      }
-      if (chunkId === 'data') {
-        dataSize = chunkSize;
-      }
-      offset += 8 + chunkSize;
-      if (offset % 2 !== 0) offset++;
-    }
-
-    const bytesPerSample = (bitsPerSample / 8) * channels;
-    const totalSamples = bytesPerSample > 0 ? dataSize / bytesPerSample : 0;
-    return sampleRate > 0 ? Math.round((totalSamples / sampleRate) * 1000) : 0;
-  } catch {
-    return 0;
-  }
-}
-
-// ---------------------------------------------------------------------------
-// PromptCache
-// ---------------------------------------------------------------------------
-
-export class PromptCache {
-  private prompts = new Map<string, ICachedPrompt>();
-  private log: (msg: string) => void;
-  private espeakAvailable: boolean | null = null;
-
-  constructor(log: (msg: string) => void) {
-    this.log = log;
-  }
-
-  // -------------------------------------------------------------------------
-  // Public API
-  // -------------------------------------------------------------------------
-
-  /** Get a cached prompt by ID. */
-  get(id: string): ICachedPrompt | null {
-    return this.prompts.get(id) ?? null;
-  }
-
-  /** Check if a prompt is cached. */
-  has(id: string): boolean {
-    return this.prompts.has(id);
-  }
-
-  /** List all cached prompt IDs. */
-  listIds(): string[] {
-    return [...this.prompts.keys()];
-  }
-
-  /**
-   * Generate a TTS prompt WAV and cache its path.
-   * Uses espeak-ng (primary) or Kokoro (fallback).
-   */
-  async generatePrompt(id: string, text: string, voice = 'af_bella'): Promise<ICachedPrompt | null> {
-    fs.mkdirSync(TTS_DIR, { recursive: true });
-    const wavPath = path.join(TTS_DIR, `prompt-${id}.wav`);
-
-    // Check espeak availability once.
-    if (this.espeakAvailable === null) {
-      this.espeakAvailable = isEspeakAvailable();
-    }
-
-    // Generate WAV if not already on disk.
-    if (!fs.existsSync(wavPath)) {
-      let generated = false;
-      if (this.espeakAvailable) {
-        generated = generateViaEspeak(wavPath, text);
-      }
-      if (!generated) {
-        generated = await generateViaKokoro(wavPath, text, voice);
-      }
-      if (!generated) {
-        this.log(`[prompt-cache] failed to generate TTS for "${id}"`);
-        return null;
-      }
-      this.log(`[prompt-cache] generated WAV for "${id}"`);
-    }
-
-    return this.registerWav(id, wavPath);
-  }
-
-  /**
-   * Load a pre-existing WAV file as a prompt.
-   */
-  async loadWavPrompt(id: string, wavPath: string): Promise<ICachedPrompt | null> {
-    if (!fs.existsSync(wavPath)) {
-      this.log(`[prompt-cache] WAV not found: ${wavPath}`);
-      return null;
-    }
-    return this.registerWav(id, wavPath);
-  }
-
-  /**
-   * Generate a beep tone WAV and cache it.
-   */
-  async generateBeep(
-    id: string,
-    freqHz = 1000,
-    durationMs = 500,
-    amplitude = 8000,
-  ): Promise<ICachedPrompt | null> {
-    fs.mkdirSync(TTS_DIR, { recursive: true });
-    const wavPath = path.join(TTS_DIR, `prompt-${id}.wav`);
-
-    if (!fs.existsSync(wavPath)) {
-      // Generate 16kHz 16-bit mono sine wave WAV.
-      const sampleRate = 16000;
-      const totalSamples = Math.floor((sampleRate * durationMs) / 1000);
-      const pcm = Buffer.alloc(totalSamples * 2);
-
-      for (let i = 0; i < totalSamples; i++) {
-        const t = i / sampleRate;
-        const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade
-        let envelope = 1.0;
-        if (i < fadeLen) envelope = i / fadeLen;
-        else if (i > totalSamples - fadeLen) envelope = (totalSamples - i) / fadeLen;
-
-        const sample = Math.round(Math.sin(2 * Math.PI * freqHz * t) * amplitude * envelope);
-        pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2);
-      }
-
-      // Write WAV file.
-      const headerSize = 44;
-      const dataSize = pcm.length;
-      const wav = Buffer.alloc(headerSize + dataSize);
-
-      // RIFF header
-      wav.write('RIFF', 0);
-      wav.writeUInt32LE(36 + dataSize, 4);
-      wav.write('WAVE', 8);
-
-      // fmt chunk
-      wav.write('fmt ', 12);
-      wav.writeUInt32LE(16, 16);        // chunk size
-      wav.writeUInt16LE(1, 20);         // PCM format
-      wav.writeUInt16LE(1, 22);         // mono
-      wav.writeUInt32LE(sampleRate, 24);
-      wav.writeUInt32LE(sampleRate * 2, 28); // byte rate
-      wav.writeUInt16LE(2, 32);         // block align
-      wav.writeUInt16LE(16, 34);        // bits per sample
-
-      // data chunk
-      wav.write('data', 36);
-      wav.writeUInt32LE(dataSize, 40);
-      pcm.copy(wav, 44);
-
-      fs.writeFileSync(wavPath, wav);
-      this.log(`[prompt-cache] beep WAV generated for "${id}"`);
-    }
-
-    return this.registerWav(id, wavPath);
-  }
-
-  /** Remove a prompt from the cache. */
-  remove(id: string): void {
-    this.prompts.delete(id);
-  }
-
-  /** Clear all cached prompts. */
-  clear(): void {
-    this.prompts.clear();
-  }
-
-  // -------------------------------------------------------------------------
-  // Internal
-  // -------------------------------------------------------------------------
-
-  private registerWav(id: string, wavPath: string): ICachedPrompt {
-    const durationMs = getWavDurationMs(wavPath);
-    const prompt: ICachedPrompt = { id, wavPath, durationMs };
-    this.prompts.set(id, prompt);
-    this.log(`[prompt-cache] cached "${id}": ${wavPath} (${(durationMs / 1000).toFixed(1)}s)`);
-    return prompt;
-  }
-}
--- a/ts/proxybridge.ts
+++ b/ts/proxybridge.ts
@@ -88,7 +88,7 @@ type TProxyCommands = {
    result: Record<string, never>;
  };
  generate_tts: {
-    params: { model: string; voices: string; voice: string; text: string; output: string };
+    params: { model: string; voices: string; voice: string; text: string; output: string; cacheable?: boolean };
    result: { output: string };
  };
  // WebRTC signaling — bridged from the browser via the TS control plane.
--- a/ts/sipproxy.ts
+++ b/ts/sipproxy.ts
@@ -24,8 +24,6 @@ import {
  getAllBrowserDeviceIds,
  getBrowserDeviceWs,
 } from './webrtcbridge.ts';
-import { initAnnouncement } from './announcement.ts';
-import { PromptCache } from './call/prompt-cache.ts';
 import { VoiceboxManager } from './voicebox.ts';
 import {
  initProxyEngine,
@@ -170,7 +168,6 @@ for (const d of appConfig.devices) {
 // Initialize subsystems
 // ---------------------------------------------------------------------------

-const promptCache = new PromptCache(log);
 const voiceboxManager = new VoiceboxManager(log);
 voiceboxManager.init(appConfig.voiceboxes ?? []);

@@ -519,6 +516,8 @@ async function startProxyEngine(): Promise<void> {
    providers: appConfig.providers,
    devices: appConfig.devices,
    routing: appConfig.routing,
+    voiceboxes: appConfig.voiceboxes ?? [],
+    ivr: appConfig.ivr,
  });

  if (!configured) {
@@ -530,31 +529,8 @@ async function startProxyEngine(): Promise<void> {
  const deviceList = appConfig.devices.map((d) => d.displayName).join(', ');
  log(`proxy engine started | LAN ${appConfig.proxy.lanIp}:${appConfig.proxy.lanPort} | providers: ${providerList} | devices: ${deviceList}`);

-  // Generate TTS audio (WAV files on disk, played by Rust audio_player).
-  try {
-    await initAnnouncement(log);
-
-    // Pre-generate prompts.
-    await promptCache.generateBeep('voicemail-beep', 1000, 500, 8000);
-    for (const vb of appConfig.voiceboxes ?? []) {
-      if (!vb.enabled) continue;
-      const promptId = `voicemail-greeting-${vb.id}`;
-      if (vb.greetingWavPath) {
-        await promptCache.loadWavPrompt(promptId, vb.greetingWavPath);
-      } else {
-        const text = vb.greetingText || 'The person you are trying to reach is not available. Please leave a message after the tone.';
-        await promptCache.generatePrompt(promptId, text, vb.greetingVoice || 'af_bella');
-      }
-    }
-    if (appConfig.ivr?.enabled) {
-      for (const menu of appConfig.ivr.menus) {
-        await promptCache.generatePrompt(`ivr-menu-${menu.id}`, menu.promptText, menu.promptVoice || 'af_bella');
-      }
-    }
-    log(`[startup] prompts cached: ${promptCache.listIds().join(', ') || 'none'}`);
-  } catch (e) {
-    log(`[tts] init failed: ${e}`);
-  }
+  // TTS prompts (voicemail greetings, IVR menus) are generated on demand by
+  // the Rust TTS engine when first needed; nothing is pre-generated at startup.
 }

 // ---------------------------------------------------------------------------
@@ -620,6 +596,8 @@ initWebUi(
        providers: fresh.providers,
        devices: fresh.devices,
        routing: fresh.routing,
+        voiceboxes: fresh.voiceboxes ?? [],
+        ivr: fresh.ivr,
      }).then((ok) => {
        if (ok) log('[config] reloaded — proxy engine reconfigured');
        else log('[config] reload failed — proxy engine rejected config');