// siprouter/ts/announcement.ts

/**
 * TTS announcement module — pre-generates audio announcements using Kokoro TTS
 * and caches them as encoded RTP packets for playback during call setup.
 *
 * On startup, generates the announcement WAV via the Rust tts-engine binary
 * (Kokoro neural TTS), encodes each 20ms frame to G.722 (for SIP) and Opus
 * (for WebRTC) via the Rust transcoder, and caches the packets.
 */

import { execFileSync, execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { Buffer } from 'node:buffer';
import { buildRtpHeader, rtpClockIncrement } from './call/leg.ts';
import { encodePcm, isCodecReady } from './opusbridge.ts';

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

/**
 * A pre-encoded announcement ready for RTP playback.
 *
 * Built by initAnnouncement() and read by the playback functions in this
 * module. Frames are bare codec payloads; RTP headers are added at play time.
 */
export interface IAnnouncementCache {
  /** G.722 encoded frames (each is a 20ms frame payload, no RTP header). */
  g722Frames: Buffer[];
  /** Opus encoded frames for WebRTC playback (one payload per 20ms frame, no RTP header). */
  opusFrames: Buffer[];
  /**
   * Total duration in milliseconds: the number of 20ms PCM frames in the
   * source audio times 20. Frames that failed to encode are still counted,
   * so this may exceed the playable length of either frame list.
   */
  durationMs: number;
}

// ---------------------------------------------------------------------------
// State
// ---------------------------------------------------------------------------

/** The encoded announcement; stays null until initAnnouncement() succeeds. */
let cachedAnnouncement: IAnnouncementCache | null = null;

/** Directory holding the Kokoro model files and the generated WAV (relative to the process working directory). */
const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
/** File name of the Kokoro ONNX model inside TTS_DIR. */
const KOKORO_MODEL = 'kokoro-v1.0.onnx';
/** File name of the Kokoro voices file inside TTS_DIR. */
const KOKORO_VOICES = 'voices.bin';
const KOKORO_VOICE = 'af_bella'; // American female, clear and natural
/** The sentence that is synthesized and played to callers. */
const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now.";
/**
 * On-disk cache of the synthesized audio. It is only regenerated when the
 * file is missing, so changing the text or voice above requires deleting it.
 */
const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav');

// ---------------------------------------------------------------------------
// Initialization
// ---------------------------------------------------------------------------

/**
 * Pre-generate the announcement audio and encode it to G.722 and Opus frames.
 *
 * Steps: verify that the Kokoro model/voices files and a tts-engine binary
 * exist, synthesize the WAV (only when no cached WAV is on disk), extract its
 * PCM data, and encode it in 20ms frames through the codec bridge. On success
 * the frames are stored in the module-level cache used by the playback
 * functions.
 *
 * Must be called after the codec bridge is initialized; when the bridge is
 * not ready the function returns false so the caller can retry later.
 *
 * @param log - sink for progress and error messages
 * @returns true when at least one encoded frame was cached, false otherwise.
 *          Errors raised by synthesis, file I/O or encoding are caught and
 *          reported through `log` rather than thrown.
 */
export async function initAnnouncement(log: (msg: string) => void): Promise<boolean> {
  const modelPath = path.join(TTS_DIR, KOKORO_MODEL);
  const voicesPath = path.join(TTS_DIR, KOKORO_VOICES);

  // Check if Kokoro model files exist.
  if (!fs.existsSync(modelPath)) {
    log('[tts] Kokoro model not found at ' + modelPath + ' — announcements disabled');
    return false;
  }
  if (!fs.existsSync(voicesPath)) {
    log('[tts] Kokoro voices not found at ' + voicesPath + ' — announcements disabled');
    return false;
  }

  // Find tts-engine binary; the first candidate that exists wins.
  const root = process.cwd();
  const ttsBin = [
    path.join(root, 'dist_rust', 'tts-engine'),
    path.join(root, 'rust', 'target', 'release', 'tts-engine'),
    path.join(root, 'rust', 'target', 'debug', 'tts-engine'),
  ].find((candidate) => fs.existsSync(candidate));
  if (!ttsBin) {
    log('[tts] tts-engine binary not found — announcements disabled');
    return false;
  }

  try {
    // Generate WAV if not cached.
    if (!fs.existsSync(CACHE_WAV)) {
      log('[tts] generating announcement audio via Kokoro TTS...');
      synthesizeAnnouncementWav(ttsBin, modelPath, voicesPath);
      log('[tts] announcement WAV generated');
    }

    // Read WAV and extract raw PCM.
    const pcm = extractPcmFromWav(fs.readFileSync(CACHE_WAV));
    if (!pcm) {
      log('[tts] failed to parse WAV file');
      return false;
    }

    // Wait for codec bridge to be ready.
    if (!isCodecReady()) {
      log('[tts] codec bridge not ready — will retry');
      return false;
    }

    // Kokoro outputs 24000 Hz, 16-bit mono.
    // We encode in chunks: 20ms at 24000 Hz = 480 samples = 960 bytes of PCM.
    // The Rust encoder will resample to 16kHz internally for G.722.
    const SAMPLE_RATE = 24000;
    const FRAME_SAMPLES = Math.floor(SAMPLE_RATE * 0.02); // 480 samples per 20ms
    const FRAME_BYTES = FRAME_SAMPLES * 2; // 16-bit = 2 bytes per sample
    // A trailing partial frame (< 20ms) is dropped.
    const totalFrames = Math.floor(pcm.length / FRAME_BYTES);

    const g722Frames: Buffer[] = [];
    const opusFrames: Buffer[] = [];

    log(`[tts] encoding ${totalFrames} frames (${FRAME_SAMPLES} samples/frame @ ${SAMPLE_RATE}Hz)...`);
    // Frames are submitted strictly in order, one at a time.
    for (let i = 0; i < totalFrames; i++) {
      // Copy the slice so the encoder gets its own buffer, not a view of the WAV.
      const pcmBuf = Buffer.from(pcm.subarray(i * FRAME_BYTES, (i + 1) * FRAME_BYTES));
      const [g722, opus] = await Promise.all([
        encodePcm(pcmBuf, SAMPLE_RATE, 9),   // G.722 for SIP devices
        encodePcm(pcmBuf, SAMPLE_RATE, 111),  // Opus for WebRTC browsers
      ]);
      if (g722) g722Frames.push(g722);
      if (opus) opusFrames.push(opus);
      // Only report the first few failures to avoid flooding the log.
      if (!g722 && !opus && i < 3) log(`[tts] frame ${i} encode failed`);
    }

    // Do not report success (or replace an existing cache) with nothing playable.
    if (g722Frames.length === 0 && opusFrames.length === 0) {
      log('[tts] no announcement frames could be encoded');
      return false;
    }

    cachedAnnouncement = {
      g722Frames,
      opusFrames,
      durationMs: totalFrames * 20,
    };

    log(`[tts] announcement cached: ${g722Frames.length} frames (${(totalFrames * 20 / 1000).toFixed(1)}s)`);
    return true;
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    log(`[tts] init error: ${reason}`);
    return false;
  }
}

/**
 * Run the tts-engine binary to synthesize ANNOUNCEMENT_TEXT into CACHE_WAV.
 *
 * The binary is invoked without a shell (arguments are passed as an array), so
 * paths containing quotes, `$` or backticks are handed over verbatim. Output
 * goes to a temporary file that is renamed into place only after the process
 * exits successfully; a timeout or crash therefore never leaves a truncated
 * CACHE_WAV behind to be picked up by later runs.
 *
 * @throws whatever execFileSync / renameSync throw (non-zero exit, timeout, I/O error)
 */
function synthesizeAnnouncementWav(ttsBin: string, modelPath: string, voicesPath: string): void {
  // Keep the .wav extension on the temporary file in case the engine inspects it.
  const tmpWav = path.join(TTS_DIR, 'announcement.tmp.wav');
  try {
    execFileSync(
      ttsBin,
      [
        '--model', modelPath,
        '--voices', voicesPath,
        '--voice', KOKORO_VOICE,
        '--output', tmpWav,
        '--text', ANNOUNCEMENT_TEXT,
      ],
      { timeout: 120000, stdio: 'pipe' },
    );
    fs.renameSync(tmpWav, CACHE_WAV);
  } finally {
    // No-op after a successful rename; removes a partial file after a failure.
    fs.rmSync(tmpWav, { force: true });
  }
}

// ---------------------------------------------------------------------------
// Playback
// ---------------------------------------------------------------------------

/**
 * Stream the cached G.722 announcement to an RTP endpoint, one packet per
 * 20ms timer tick.
 *
 * Sequence number and RTP timestamp start at random values; the marker flag
 * passed to buildRtpHeader is set on the first packet only. If nothing is
 * cached, `onDone` is invoked immediately and no timer is started.
 *
 * @param sendPacket - function to send a raw RTP packet (header + payload)
 * @param ssrc - SSRC to use in RTP headers
 * @param onDone - called once every frame has been sent (not called on cancel)
 * @returns a function that stops playback, or null if no announcement is cached
 */
export function playAnnouncement(
  sendPacket: (pkt: Buffer) => void,
  ssrc: number,
  onDone?: () => void,
): (() => void) | null {
  const g722 = cachedAnnouncement?.g722Frames ?? [];
  if (g722.length === 0) {
    onDone?.();
    return null;
  }

  const G722_PT = 9; // static RTP payload type for G.722
  let sent = 0;
  let nextSeq = Math.floor(Math.random() * 0xffff);
  let nextTs = Math.floor(Math.random() * 0xffffffff);

  const ticker = setInterval(() => {
    if (sent < g722.length) {
      const isFirstPacket = sent === 0;
      const header = buildRtpHeader(G722_PT, nextSeq & 0xffff, nextTs >>> 0, ssrc >>> 0, isFirstPacket);
      sendPacket(Buffer.concat([header, g722[sent]]));

      nextSeq += 1;
      nextTs += rtpClockIncrement(G722_PT);
      sent += 1;
      return;
    }

    // All frames delivered: stop ticking and notify the caller.
    clearInterval(ticker);
    onDone?.();
  }, 20);

  // Cancel handle for the caller.
  return () => clearInterval(ticker);
}

/**
 * Stream the cached Opus announcement to a WebRTC sender, one packet per
 * 20ms timer tick.
 *
 * Unlike the SIP variant, sequence number and timestamp are not generated
 * here: they are read from and written back to `counters`, so the caller's
 * object reflects every packet that was sent. If nothing is cached, `onDone`
 * is invoked immediately and no timer is started.
 *
 * @param sendRtpPacket - function to send a raw RTP packet via sender.sendRtp()
 * @param ssrc - SSRC to use in RTP headers
 * @param counters - shared RTP state, mutated in place: `seq` advances by 1
 *                   and `ts` by 960 for each packet sent
 * @param onDone - called once every frame has been sent (not called on cancel)
 * @returns a function that stops playback, or null if no announcement is cached
 */
export function playAnnouncementToWebRtc(
  sendRtpPacket: (pkt: Buffer) => void,
  ssrc: number,
  counters: { seq: number; ts: number },
  onDone?: () => void,
): (() => void) | null {
  const opus = cachedAnnouncement?.opusFrames ?? [];
  if (opus.length === 0) {
    onDone?.();
    return null;
  }

  const OPUS_PT = 111; // payload type used for Opus
  let sent = 0;

  const ticker = setInterval(() => {
    if (sent < opus.length) {
      const isFirstPacket = sent === 0;
      const header = buildRtpHeader(OPUS_PT, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, isFirstPacket);
      sendRtpPacket(Buffer.concat([header, opus[sent]]));

      counters.seq += 1;
      counters.ts += 960; // Opus at 48kHz: 960 samples per 20ms
      sent += 1;
      return;
    }

    // All frames delivered: stop ticking and notify the caller.
    clearInterval(ticker);
    onDone?.();
  }, 20);

  return () => clearInterval(ticker);
}

/**
 * Report whether an announcement has been cached with at least one G.722
 * frame. Only the G.722 frame list is consulted; Opus frames are not checked.
 */
export function isAnnouncementReady(): boolean {
  const g722Count = cachedAnnouncement?.g722Frames.length ?? 0;
  return g722Count > 0;
}

// ---------------------------------------------------------------------------
// WAV parsing
// ---------------------------------------------------------------------------

/**
 * Minimal RIFF/WAVE reader: returns the payload of the first "data" chunk.
 *
 * The format chunk is not inspected, so the caller must already know the
 * sample rate and sample width. The returned buffer is a view into `wav`
 * (no copy) and is clamped to the end of the file if the declared chunk size
 * overruns it.
 *
 * @param wav - complete contents of a WAV file
 * @returns the data chunk payload, or null if the buffer is shorter than 44
 *          bytes, lacks the RIFF/WAVE magic, or no data chunk is found
 */
function extractPcmFromWav(wav: Buffer): Buffer | null {
  const CHUNK_HEADER_BYTES = 8; // 4-byte id + 4-byte little-endian size
  const tagAt = (pos: number): string => wav.toString('ascii', pos, pos + 4);

  const looksLikeWav = wav.length >= 44 && tagAt(0) === 'RIFF' && tagAt(8) === 'WAVE';
  if (!looksLikeWav) return null;

  // Walk the chunk list that follows the 12-byte RIFF header.
  const scanLimit = wav.length - CHUNK_HEADER_BYTES;
  for (let cursor = 12; cursor < scanLimit; ) {
    const bodyStart = cursor + CHUNK_HEADER_BYTES;
    const bodyLength = wav.readUInt32LE(cursor + 4);
    if (tagAt(cursor) === 'data') {
      return wav.subarray(bodyStart, bodyStart + bodyLength);
    }
    // Skip this chunk; chunks are word-aligned, so odd ends get one pad byte.
    const bodyEnd = bodyStart + bodyLength;
    cursor = bodyEnd + (bodyEnd % 2);
  }
  return null;
}