feat(routing): add rule-based SIP routing for inbound and outbound calls with dashboard route management

2026-04-10 08:22:12 +00:00
parent f3e1c96872
commit fd3a408cc2
13 changed files with 893 additions and 114 deletions
--- a/ts/announcement.ts
+++ b/ts/announcement.ts
@@ -1,10 +1,13 @@
 /**
- * TTS announcement module — pre-generates audio announcements using Kokoro TTS
+ * TTS announcement module — pre-generates audio announcements using espeak-ng
 * and caches them as encoded RTP packets for playback during call setup.
 *
- * On startup, generates the announcement WAV via the Rust tts-engine binary
- * (Kokoro neural TTS), encodes each 20ms frame to G.722 (for SIP) and Opus
- * (for WebRTC) via the Rust transcoder, and caches the packets.
+ * On startup, generates the announcement WAV via espeak-ng (formant-based TTS
+ * with highly accurate pronunciation), encodes each 20ms frame to G.722 (for
+ * SIP) and Opus (for WebRTC) via the Rust transcoder, and caches the packets.
+ *
+ * Falls back to the Rust tts-engine (Kokoro neural TTS) if espeak-ng is not
+ * installed, and disables announcements if neither is available.
 */

 import { execSync } from 'node:child_process';
@@ -35,35 +38,62 @@ export interface IAnnouncementCache {
 let cachedAnnouncement: IAnnouncementCache | null = null;

 const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
-const KOKORO_MODEL = 'kokoro-v1.0.onnx';
-const KOKORO_VOICES = 'voices.bin';
-const KOKORO_VOICE = 'af_bella'; // American female, clear and natural
 const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now.";
 const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav');

+// Kokoro fallback constants.
+const KOKORO_MODEL = 'kokoro-v1.0.onnx';
+const KOKORO_VOICES = 'voices.bin';
+const KOKORO_VOICE = 'af_bella';
+
 // ---------------------------------------------------------------------------
 // Initialization
 // ---------------------------------------------------------------------------

 /**
- * Pre-generate the announcement audio and encode to G.722 frames.
- * Must be called after the codec bridge is initialized.
+ * Check if espeak-ng is available on the system.
 */
-export async function initAnnouncement(log: (msg: string) => void): Promise<boolean> {
+function isEspeakAvailable(): boolean {
+  let onPath = true;
+  try {
+    // `which` exits non-zero when nothing matches, which makes execSync throw.
+    execSync('which espeak-ng', { stdio: 'pipe' });
+  } catch { onPath = false; }
+  return onPath;
+}
+
+/**
+ * Generate announcement WAV via espeak-ng (primary engine); true on success.
+ * Shell-special chars in the arguments are escaped since execSync runs a shell.
+ */
+function generateViaEspeak(wavPath: string, text: string, log: (msg: string) => void): boolean {
+  const esc = (s: string): string => s.replace(/["\\$`]/g, '\\$&');
+  const cmd = `espeak-ng -v en-us -s 150 -w "${esc(wavPath)}" "${esc(text)}"`;
+  log('[tts] generating announcement audio via espeak-ng...');
+  try {
+    execSync(cmd, { timeout: 10000, stdio: 'pipe' });
+    log('[tts] espeak-ng WAV generated');
+    return true;
+  } catch (e: unknown) {
+    // Narrow before reading .message so this compiles under strict catch typing.
+    log(`[tts] espeak-ng failed: ${e instanceof Error ? e.message : String(e)}`);
+    return false;
+  }
+}
+
+/**
+ * Generate announcement WAV via Kokoro TTS (fallback engine).
+ * Returns true on success.
+ */
+function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): boolean {
  const modelPath = path.join(TTS_DIR, KOKORO_MODEL);
  const voicesPath = path.join(TTS_DIR, KOKORO_VOICES);

-  // Check if Kokoro model files exist.
-  if (!fs.existsSync(modelPath)) {
-    log('[tts] Kokoro model not found at ' + modelPath + ' — announcements disabled');
-    return false;
-  }
-  if (!fs.existsSync(voicesPath)) {
-    log('[tts] Kokoro voices not found at ' + voicesPath + ' — announcements disabled');
+  if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) {
+    log('[tts] Kokoro model/voices not found — Kokoro fallback unavailable');
    return false;
  }

-  // Find tts-engine binary.
  const root = process.cwd();
  const ttsBinPaths = [
    path.join(root, 'dist_rust', 'tts-engine'),
@@ -72,53 +102,117 @@ export async function initAnnouncement(log: (msg: string) => void): Promise<bool
  ];
  const ttsBin = ttsBinPaths.find((p) => fs.existsSync(p));
  if (!ttsBin) {
-    log('[tts] tts-engine binary not found — announcements disabled');
+    log('[tts] tts-engine binary not found — Kokoro fallback unavailable');
    return false;
  }

+  log('[tts] generating announcement audio via Kokoro TTS (fallback)...');
+  try {
+    execSync(
+      `"${ttsBin}" --model "${modelPath}" --voices "${voicesPath}" --voice "${KOKORO_VOICE}" --output "${wavPath}" --text "${text}"`,
+      { timeout: 120000, stdio: 'pipe' },
+    );
+    log('[tts] Kokoro WAV generated');
+    return true;
+  } catch (e: any) {
+    log(`[tts] Kokoro failed: ${e.message}`);
+    return false;
+  }
+}
+
+/**
+ * Read a WAV file and detect its sample rate from the fmt chunk.
+ * Returns { pcm, sampleRate }, or null on bad magic, truncated fmt chunk,
+ * missing data chunk, or a zero sample rate (which would yield 0-byte frames).
+ */
+function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null {
+  const wav = fs.readFileSync(wavPath);
+  if (wav.length < 44) return null;
+  if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
+  if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;
+
+  let sampleRate = 22050; // default, used only when no fmt chunk is found
+  let offset = 12; // first chunk starts right after the RIFF/WAVE header
+  let pcm: Buffer | null = null;
+
+  while (offset < wav.length - 8) {
+    const chunkId = wav.toString('ascii', offset, offset + 4);
+    const chunkSize = wav.readUInt32LE(offset + 4);
+    if (chunkId === 'fmt ') {
+      if (offset + 16 > wav.length) return null; // truncated fmt chunk
+      sampleRate = wav.readUInt32LE(offset + 12);
+    } else if (chunkId === 'data') {
+      pcm = wav.subarray(offset + 8, offset + 8 + chunkSize); // clamped to EOF
+    }
+    offset += 8 + chunkSize + (chunkSize % 2); // chunks are word-aligned
+  }
+
+  if (!pcm || sampleRate === 0) return null;
+  return { pcm, sampleRate };
+}
+
+/**
+ * Pre-generate the announcement audio and encode to G.722 + Opus frames.
+ * Must be called after the codec bridge is initialized.
+ *
+ * Engine priority: espeak-ng → Kokoro → disabled.
+ */
+export async function initAnnouncement(log: (msg: string) => void): Promise<boolean> {
+  fs.mkdirSync(TTS_DIR, { recursive: true });
+
  try {
    // Generate WAV if not cached.
    if (!fs.existsSync(CACHE_WAV)) {
-      log('[tts] generating announcement audio via Kokoro TTS...');
-      execSync(
-        `"${ttsBin}" --model "${modelPath}" --voices "${voicesPath}" --voice "${KOKORO_VOICE}" --output "${CACHE_WAV}" --text "${ANNOUNCEMENT_TEXT}"`,
-        { timeout: 120000, stdio: 'pipe' },
-      );
-      log('[tts] announcement WAV generated');
+      let generated = false;
+
+      // Try espeak-ng first.
+      if (isEspeakAvailable()) {
+        generated = generateViaEspeak(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
+      } else {
+        log('[tts] espeak-ng not installed — trying Kokoro fallback');
+      }
+
+      // Fall back to Kokoro.
+      if (!generated) {
+        generated = generateViaKokoro(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
+      }
+
+      if (!generated) {
+        log('[tts] no TTS engine available — announcements disabled');
+        return false;
+      }
    }

-    // Read WAV and extract raw PCM.
-    const wav = fs.readFileSync(CACHE_WAV);
-    const pcm = extractPcmFromWav(wav);
-    if (!pcm) {
+    // Read WAV and extract raw PCM + sample rate.
+    const result = readWavWithRate(CACHE_WAV);
+    if (!result) {
      log('[tts] failed to parse WAV file');
      return false;
    }

+    const { pcm, sampleRate } = result;
+
    // Wait for codec bridge to be ready.
    if (!isCodecReady()) {
      log('[tts] codec bridge not ready — will retry');
      return false;
    }

-    // Kokoro outputs 24000 Hz, 16-bit mono.
-    // We encode in chunks: 20ms at 24000 Hz = 480 samples = 960 bytes of PCM.
-    // The Rust encoder will resample to 16kHz internally for G.722.
-    const SAMPLE_RATE = 24000;
-    const FRAME_SAMPLES = Math.floor(SAMPLE_RATE * 0.02); // 480 samples per 20ms
+    // Encode in 20ms chunks. The Rust encoder resamples to each codec's native rate.
+    const FRAME_SAMPLES = Math.floor(sampleRate * 0.02);
    const FRAME_BYTES = FRAME_SAMPLES * 2; // 16-bit = 2 bytes per sample
    const totalFrames = Math.floor(pcm.length / FRAME_BYTES);

    const g722Frames: Buffer[] = [];
    const opusFrames: Buffer[] = [];

-    log(`[tts] encoding ${totalFrames} frames (${FRAME_SAMPLES} samples/frame @ ${SAMPLE_RATE}Hz)...`);
+    log(`[tts] encoding ${totalFrames} frames (${FRAME_SAMPLES} samples/frame @ ${sampleRate}Hz)...`);
    for (let i = 0; i < totalFrames; i++) {
      const framePcm = pcm.subarray(i * FRAME_BYTES, (i + 1) * FRAME_BYTES);
      const pcmBuf = Buffer.from(framePcm);
      const [g722, opus] = await Promise.all([
-        encodePcm(pcmBuf, SAMPLE_RATE, 9),   // G.722 for SIP devices
-        encodePcm(pcmBuf, SAMPLE_RATE, 111),  // Opus for WebRTC browsers
+        encodePcm(pcmBuf, sampleRate, 9),   // G.722 for SIP devices
+        encodePcm(pcmBuf, sampleRate, 111),  // Opus for WebRTC browsers
      ]);
      if (g722) g722Frames.push(g722);
      if (opus) opusFrames.push(opus);
@@ -236,26 +330,3 @@ export function isAnnouncementReady(): boolean {
  return cachedAnnouncement !== null && cachedAnnouncement.g722Frames.length > 0;
 }

-// ---------------------------------------------------------------------------
-// WAV parsing
-// ---------------------------------------------------------------------------
-
-function extractPcmFromWav(wav: Buffer): Buffer | null {
-  // Minimal WAV parser — find the "data" chunk.
-  if (wav.length < 44) return null;
-  if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
-  if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;
-
-  let offset = 12;
-  while (offset < wav.length - 8) {
-    const chunkId = wav.toString('ascii', offset, offset + 4);
-    const chunkSize = wav.readUInt32LE(offset + 4);
-    if (chunkId === 'data') {
-      return wav.subarray(offset + 8, offset + 8 + chunkSize);
-    }
-    offset += 8 + chunkSize;
-    // Word-align.
-    if (offset % 2 !== 0) offset++;
-  }
-  return null;
-}