siprouter/ts/announcement.ts

/**
 * TTS announcement module — pre-generates audio announcements using espeak-ng
 * and caches them as encoded RTP packets for playback during call setup.
 *
 * On startup, generates the announcement WAV via espeak-ng (formant-based TTS
 * with highly accurate pronunciation), encodes each 20ms frame to G.722 (for
 * SIP) and Opus (for WebRTC) via the Rust transcoder, and caches the packets.
 *
 * Falls back to the Rust tts-engine (Kokoro neural TTS) if espeak-ng is not
 * installed or fails to produce audio, and disables announcements if neither
 * engine succeeds.
 */

import { execFileSync, execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { Buffer } from 'node:buffer';
import { buildRtpHeader, rtpClockIncrement } from './call/leg.ts';
import { encodePcm, isCodecReady } from './opusbridge.ts';

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

/**
 * A pre-encoded announcement ready for RTP playback.
 *
 * Both frame arrays are produced from the same source PCM, one entry per
 * successfully encoded 20ms chunk. A chunk whose encode returned nothing is
 * skipped, so either array may hold fewer entries than `durationMs / 20`.
 */
export interface IAnnouncementCache {
  /** G.722 encoded frames (each is a 20ms frame payload, no RTP header). */
  g722Frames: Buffer[];
  /** Opus encoded frames for WebRTC playback (payload only, no RTP header). */
  opusFrames: Buffer[];
  /** Total duration in milliseconds, derived from the source PCM frame count × 20. */
  durationMs: number;
}

// ---------------------------------------------------------------------------
// State
// ---------------------------------------------------------------------------

/** The encoded announcement; stays null until initAnnouncement() completes successfully. */
let cachedAnnouncement: IAnnouncementCache | null = null;

/** Working directory for TTS artifacts, resolved against the process's current working directory. */
const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
/** The sentence that is synthesized and played to the caller. */
const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now.";
/** On-disk WAV cache; when this file exists, synthesis is skipped. */
const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav');

// Kokoro fallback constants.
/** Model file name, looked up inside TTS_DIR. */
const KOKORO_MODEL = 'kokoro-v1.0.onnx';
/** Voices file name, looked up inside TTS_DIR. */
const KOKORO_VOICES = 'voices.bin';
/** Voice identifier passed to the tts-engine binary via --voice. */
const KOKORO_VOICE = 'af_bella';

// ---------------------------------------------------------------------------
// Initialization
// ---------------------------------------------------------------------------

/**
 * Probe the system for an `espeak-ng` executable.
 *
 * Runs `which espeak-ng` synchronously with piped stdio. Any error thrown by
 * the probe (non-zero exit status, or `which` itself being unavailable) is
 * treated as "not available".
 *
 * @returns `true` when the probe command succeeds, `false` otherwise.
 */
function isEspeakAvailable(): boolean {
  let found = true;
  try {
    execSync('which espeak-ng', { stdio: 'pipe' });
  } catch {
    found = false;
  }
  return found;
}

/**
 * Generate announcement WAV via espeak-ng (primary engine).
 *
 * The engine is invoked directly with an argument vector rather than through
 * a shell command string, so `wavPath` and `text` are passed verbatim: quotes,
 * `$`, backticks and similar characters in either value are never interpreted
 * by a shell.
 *
 * @param wavPath - destination path for the generated WAV file
 * @param text - the sentence to synthesize
 * @param log - sink for progress and failure messages
 * @returns `true` when espeak-ng exits successfully, `false` on any failure
 *          (binary missing, non-zero exit, or the 10s timeout expiring).
 */
function generateViaEspeak(wavPath: string, text: string, log: (msg: string) => void): boolean {
  log('[tts] generating announcement audio via espeak-ng...');
  try {
    execFileSync(
      'espeak-ng',
      // en-US voice, 150 words per minute, write WAV to wavPath.
      ['-v', 'en-us', '-s', '150', '-w', wavPath, text],
      { timeout: 10000, stdio: 'pipe' },
    );
    log('[tts] espeak-ng WAV generated');
    return true;
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    log(`[tts] espeak-ng failed: ${reason}`);
    return false;
  }
}

/**
 * Generate announcement WAV via Kokoro TTS (fallback engine).
 *
 * Requires the model and voices files to exist inside TTS_DIR and a
 * `tts-engine` binary in one of three locations under the current working
 * directory (checked in order: `dist_rust/`, `rust/target/release/`,
 * `rust/target/debug/`). If any prerequisite is missing the reason is logged
 * and the function returns without attempting synthesis.
 *
 * The binary is invoked directly with an argument vector rather than through
 * a shell command string, so paths and `text` are passed verbatim and are
 * never interpreted by a shell.
 *
 * @param wavPath - destination path for the generated WAV file
 * @param text - the sentence to synthesize
 * @param log - sink for progress and failure messages
 * @returns `true` when the engine exits successfully, `false` when a
 *          prerequisite is missing or the engine fails / exceeds the 120s timeout.
 */
function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): boolean {
  const modelPath = path.join(TTS_DIR, KOKORO_MODEL);
  const voicesPath = path.join(TTS_DIR, KOKORO_VOICES);

  if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) {
    log('[tts] Kokoro model/voices not found — Kokoro fallback unavailable');
    return false;
  }

  // Candidate binary locations, in priority order; the first one that exists wins.
  const root = process.cwd();
  const ttsBinPaths = [
    path.join(root, 'dist_rust', 'tts-engine'),
    path.join(root, 'rust', 'target', 'release', 'tts-engine'),
    path.join(root, 'rust', 'target', 'debug', 'tts-engine'),
  ];
  const ttsBin = ttsBinPaths.find((p) => fs.existsSync(p));
  if (!ttsBin) {
    log('[tts] tts-engine binary not found — Kokoro fallback unavailable');
    return false;
  }

  log('[tts] generating announcement audio via Kokoro TTS (fallback)...');
  try {
    execFileSync(
      ttsBin,
      [
        '--model', modelPath,
        '--voices', voicesPath,
        '--voice', KOKORO_VOICE,
        '--output', wavPath,
        '--text', text,
      ],
      { timeout: 120000, stdio: 'pipe' },
    );
    log('[tts] Kokoro WAV generated');
    return true;
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    log(`[tts] Kokoro failed: ${reason}`);
    return false;
  }
}

/**
 * Read a WAV file, locate its PCM payload, and detect its sample rate from
 * the `fmt ` chunk.
 *
 * Walks the RIFF chunk list starting right after the 12-byte RIFF/WAVE
 * header. Only the sample-rate field of `fmt ` is inspected; channel count and
 * bit depth are not checked here. If no `fmt ` chunk is present the sample rate
 * defaults to 22050. If several `data` chunks exist, the last one wins.
 *
 * @param wavPath - path of the WAV file to read
 * @returns `{ pcm, sampleRate }`, where `pcm` is a view into the file buffer
 *          covering the `data` chunk (clamped to the end of the file), or
 *          `null` when the file is too short, is not RIFF/WAVE, has a
 *          truncated `fmt ` chunk, or contains no `data` chunk.
 * @throws if the file cannot be read (the error from `fs.readFileSync` propagates).
 */
function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null {
  const wav = fs.readFileSync(wavPath);
  if (wav.length < 44) return null;
  if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
  if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;

  let sampleRate = 22050; // default when no fmt chunk is found
  let offset = 12;
  let pcm: Buffer | null = null;

  while (offset < wav.length - 8) {
    const chunkId = wav.toString('ascii', offset, offset + 4);
    const chunkSize = wav.readUInt32LE(offset + 4);
    if (chunkId === 'fmt ') {
      // The sample rate occupies bytes 12..15 of the chunk (after the 8-byte
      // chunk header, the 2-byte format tag and the 2-byte channel count).
      // A file cut off before that field is malformed: report it as a parse
      // failure instead of letting readUInt32LE throw a RangeError.
      if (offset + 16 > wav.length) return null;
      sampleRate = wav.readUInt32LE(offset + 12);
    } else if (chunkId === 'data') {
      // subarray clamps to the buffer end, so an oversized chunkSize is harmless.
      pcm = wav.subarray(offset + 8, offset + 8 + chunkSize);
    }
    offset += 8 + chunkSize;
    // RIFF chunks are word-aligned: skip the pad byte after an odd-sized chunk.
    if (offset % 2 !== 0) offset++;
  }

  if (!pcm) return null;
  return { pcm, sampleRate };
}

/**
 * Pre-generate the announcement audio and encode it to G.722 + Opus frames.
 * Must be called after the codec bridge is initialized; if the bridge is not
 * ready yet this returns `false` without caching anything, so the caller can
 * invoke it again later.
 *
 * Engine priority: espeak-ng → Kokoro → disabled. Synthesis is skipped
 * entirely when the cached WAV file already exists on disk.
 *
 * The PCM is assumed to be 16-bit samples (2 bytes per sample) when it is
 * sliced into 20ms chunks; trailing bytes that do not fill a whole chunk are
 * dropped.
 *
 * @param log - sink for progress and failure messages
 * @returns `true` once the announcement has been encoded and cached; `false`
 *          when no engine could produce audio, the WAV could not be parsed or
 *          has an unusable sample rate, the codec bridge is not ready, or any
 *          step threw (the error message is logged).
 */
export async function initAnnouncement(log: (msg: string) => void): Promise<boolean> {
  try {
    // Inside the try so a filesystem error is logged and reported as `false`
    // like every other failure, instead of rejecting the returned promise.
    fs.mkdirSync(TTS_DIR, { recursive: true });

    // Generate WAV if not cached.
    if (!fs.existsSync(CACHE_WAV)) {
      let generated = false;

      // Try espeak-ng first.
      if (isEspeakAvailable()) {
        generated = generateViaEspeak(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
      } else {
        log('[tts] espeak-ng not installed — trying Kokoro fallback');
      }

      // Fall back to Kokoro.
      if (!generated) {
        generated = generateViaKokoro(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
      }

      if (!generated) {
        log('[tts] no TTS engine available — announcements disabled');
        return false;
      }
    }

    // Read WAV and extract raw PCM + sample rate.
    const result = readWavWithRate(CACHE_WAV);
    if (!result) {
      log('[tts] failed to parse WAV file');
      return false;
    }

    const { pcm, sampleRate } = result;

    // Bail out (rather than block) if the codec bridge is not ready yet.
    if (!isCodecReady()) {
      log('[tts] codec bridge not ready — will retry');
      return false;
    }

    // Encode in 20ms chunks. The Rust encoder resamples to each codec's native rate.
    const FRAME_SAMPLES = Math.floor(sampleRate * 0.02);
    if (FRAME_SAMPLES <= 0) {
      // A zero-length frame would make the frame count Infinity/NaN below and
      // the encode loop would never terminate.
      log(`[tts] unusable WAV sample rate (${sampleRate}Hz) — announcements disabled`);
      return false;
    }
    const FRAME_BYTES = FRAME_SAMPLES * 2; // 16-bit = 2 bytes per sample
    const totalFrames = Math.floor(pcm.length / FRAME_BYTES);

    const g722Frames: Buffer[] = [];
    const opusFrames: Buffer[] = [];

    log(`[tts] encoding ${totalFrames} frames (${FRAME_SAMPLES} samples/frame @ ${sampleRate}Hz)...`);
    for (let i = 0; i < totalFrames; i++) {
      const framePcm = pcm.subarray(i * FRAME_BYTES, (i + 1) * FRAME_BYTES);
      // Copy the slice so the encoder gets a standalone buffer, not a view into the file.
      const pcmBuf = Buffer.from(framePcm);
      const [g722, opus] = await Promise.all([
        encodePcm(pcmBuf, sampleRate, 9),   // G.722 for SIP devices
        encodePcm(pcmBuf, sampleRate, 111),  // Opus for WebRTC browsers
      ]);
      if (g722) g722Frames.push(g722);
      if (opus) opusFrames.push(opus);
      // Only report the first few total failures to avoid flooding the log.
      if (!g722 && !opus && i < 3) log(`[tts] frame ${i} encode failed`);
    }

    cachedAnnouncement = {
      g722Frames,
      opusFrames,
      durationMs: totalFrames * 20,
    };

    log(`[tts] announcement cached: ${g722Frames.length} frames (${(totalFrames * 20 / 1000).toFixed(1)}s)`);
    return true;
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    log(`[tts] init error: ${reason}`);
    return false;
  }
}

// ---------------------------------------------------------------------------
// Playback
// ---------------------------------------------------------------------------

/**
 * Stream the cached G.722 announcement to an RTP endpoint, one frame every 20ms.
 *
 * Sequence number and RTP timestamp start at random values. The first packet
 * is built with the final `buildRtpHeader` argument set to `true`, all later
 * ones with `false`. The first packet goes out on the first timer tick, and
 * `onDone` fires on the tick after the last frame has been sent. When nothing
 * is cached, `onDone` is invoked immediately and no timer is started.
 *
 * @param sendPacket - function to send a raw RTP packet (header + payload)
 * @param ssrc - SSRC to use in RTP headers
 * @param onDone - called when the announcement finishes; not called if playback is cancelled
 * @returns a function that stops playback, or null if no announcement is cached
 */
export function playAnnouncement(
  sendPacket: (pkt: Buffer) => void,
  ssrc: number,
  onDone?: () => void,
): (() => void) | null {
  const g722Frames = cachedAnnouncement?.g722Frames;
  if (!g722Frames || g722Frames.length === 0) {
    onDone?.();
    return null;
  }

  const G722_PT = 9; // RTP payload type for G.722
  let sequence = Math.floor(Math.random() * 0xffff);
  let timestamp = Math.floor(Math.random() * 0xffffffff);
  let next = 0;

  const stop = (): void => clearInterval(timer);

  const tick = (): void => {
    // All frames sent: shut the timer down and notify the caller.
    if (next >= g722Frames.length) {
      stop();
      onDone?.();
      return;
    }

    const isFirst = next === 0;
    const header = buildRtpHeader(G722_PT, sequence & 0xffff, timestamp >>> 0, ssrc >>> 0, isFirst);
    sendPacket(Buffer.concat([header, g722Frames[next]]));

    // Advance state only after the packet was handed off.
    sequence += 1;
    timestamp += rtpClockIncrement(G722_PT);
    next += 1;
  };

  const timer = setInterval(tick, 20);

  return stop;
}

/**
 * Stream the cached Opus announcement to a WebRTC sender, one frame every 20ms.
 *
 * Unlike the G.722 path, sequence number and timestamp are not generated
 * here: they are read from, and advanced on, the caller-supplied `counters`
 * object. The first packet is built with the final `buildRtpHeader` argument
 * set to `true`, all later ones with `false`. When nothing is cached, `onDone`
 * is invoked immediately and no timer is started.
 *
 * @param sendRtpPacket - function to send a raw RTP packet via sender.sendRtp()
 * @param ssrc - SSRC to use in RTP headers
 * @param counters - caller-owned RTP state, mutated in place: `seq` grows by 1
 *                   and `ts` by 960 for every packet sent
 * @param onDone - called when the announcement finishes; not called if playback is cancelled
 * @returns a function that stops playback, or null if no announcement is cached
 */
export function playAnnouncementToWebRtc(
  sendRtpPacket: (pkt: Buffer) => void,
  ssrc: number,
  counters: { seq: number; ts: number },
  onDone?: () => void,
): (() => void) | null {
  const opusFrames = cachedAnnouncement?.opusFrames;
  if (!opusFrames || opusFrames.length === 0) {
    onDone?.();
    return null;
  }

  const OPUS_PT = 111; // RTP payload type for Opus
  let next = 0;

  const stop = (): void => clearInterval(timer);

  const tick = (): void => {
    // All frames sent: shut the timer down and notify the caller.
    if (next >= opusFrames.length) {
      stop();
      onDone?.();
      return;
    }

    const isFirst = next === 0;
    const header = buildRtpHeader(OPUS_PT, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, isFirst);
    sendRtpPacket(Buffer.concat([header, opusFrames[next]]));

    counters.seq += 1;
    counters.ts += 960; // Opus at 48kHz: 960 samples per 20ms
    next += 1;
  };

  const timer = setInterval(tick, 20);

  return stop;
}

/**
 * Report whether an announcement has been cached and holds at least one
 * G.722 frame. Only the G.722 frame list is consulted; the Opus list is not.
 */
export function isAnnouncementReady(): boolean {
  const frameCount = cachedAnnouncement?.g722Frames.length ?? 0;
  return frameCount > 0;
}