/**
 * TTS announcement module — pre-generates audio announcements using espeak-ng
 * and caches them as encoded RTP packets for playback during call setup.
 *
 * On startup, generates the announcement WAV via espeak-ng (formant-based TTS
 * with highly accurate pronunciation), encodes each 20ms frame to G.722 (for
 * SIP) and Opus (for WebRTC) via the Rust transcoder, and caches the packets.
 *
 * Falls back to the Rust tts-engine (Kokoro neural TTS) if espeak-ng is not
 * installed, and disables announcements if neither is available.
 */

import { execFileSync, execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { Buffer } from 'node:buffer';
import { encodePcm, isCodecReady } from './opusbridge.ts';

// ---------------------------------------------------------------------------
// RTP helpers
// ---------------------------------------------------------------------------

/** RTP payload type used for G.722 frames (SIP leg). */
const PT_G722 = 9;

/** RTP payload type used for Opus frames (WebRTC leg). */
const PT_OPUS = 111;

/** Duration of one audio frame / one RTP packet, in milliseconds. */
const FRAME_MS = 20;

/**
 * RTP clock increment per 20ms frame for each codec.
 *
 * Opus (PT 111) advances 960 ticks per frame (48 kHz clock). Every other
 * payload type, including G.722 (PT 9, which uses an 8 kHz RTP clock per
 * RFC 3551), advances 160 ticks.
 */
function rtpClockIncrement(pt: number): number {
  return pt === PT_OPUS ? 960 : 160;
}

/**
 * Build a fresh 12-byte RTP header (version 2, no padding, no extension,
 * no CSRCs).
 *
 * @param pt - payload type (only the low 7 bits are used)
 * @param seq - sequence number (truncated to 16 bits)
 * @param ts - RTP timestamp (truncated to 32 bits)
 * @param ssrc - synchronization source identifier (truncated to 32 bits)
 * @param marker - whether to set the marker bit
 */
function buildRtpHeader(pt: number, seq: number, ts: number, ssrc: number, marker: boolean): Buffer {
  const hdr = Buffer.alloc(12);
  hdr[0] = 0x80; // V=2, P=0, X=0, CC=0
  hdr[1] = (marker ? 0x80 : 0) | (pt & 0x7f);
  hdr.writeUInt16BE(seq & 0xffff, 2);
  hdr.writeUInt32BE(ts >>> 0, 4);
  hdr.writeUInt32BE(ssrc >>> 0, 8);
  return hdr;
}

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

/** A pre-encoded announcement ready for RTP playback. */
export interface IAnnouncementCache {
  /** G.722 encoded frames (each is a 20ms frame payload, no RTP header). */
  g722Frames: Buffer[];
  /** Opus encoded frames for WebRTC playback. */
  opusFrames: Buffer[];
  /** Total duration in milliseconds. */
  durationMs: number;
}

// ---------------------------------------------------------------------------
// State
// ---------------------------------------------------------------------------

let cachedAnnouncement: IAnnouncementCache | null = null;

const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now.";
const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav');

// Kokoro fallback constants.
const KOKORO_MODEL = 'kokoro-v1.0.onnx';
const KOKORO_VOICES = 'voices.bin';
const KOKORO_VOICE = 'af_bella';

// ---------------------------------------------------------------------------
// Initialization
// ---------------------------------------------------------------------------

/** Extract a printable message from an arbitrary thrown value. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** Best-effort removal of a file; never throws. */
function removeFileQuietly(filePath: string): void {
  try {
    fs.rmSync(filePath, { force: true });
  } catch {
    // Cleanup is best-effort only; the caller has already reported the real failure.
  }
}

/**
 * Check if espeak-ng is available on the system.
 */
function isEspeakAvailable(): boolean {
  try {
    execSync('which espeak-ng', { stdio: 'pipe' });
    return true;
  } catch {
    return false;
  }
}

/**
 * Generate announcement WAV via espeak-ng (primary engine).
 *
 * The binary is invoked without a shell, so `wavPath` and `text` are passed
 * verbatim as arguments and need no quoting. A partially written output file
 * is removed on failure so it is not later mistaken for a valid cache entry.
 *
 * @param wavPath - destination path of the WAV file
 * @param text - text to synthesize
 * @param log - logger callback
 * @returns true on success
 */
function generateViaEspeak(wavPath: string, text: string, log: (msg: string) => void): boolean {
  log('[tts] generating announcement audio via espeak-ng...');
  try {
    execFileSync(
      'espeak-ng',
      ['-v', 'en-us', '-s', '150', '-w', wavPath, text],
      { timeout: 10000, stdio: 'pipe' },
    );
    log('[tts] espeak-ng WAV generated');
    return true;
  } catch (e: unknown) {
    log(`[tts] espeak-ng failed: ${errorMessage(e)}`);
    removeFileQuietly(wavPath);
    return false;
  }
}

/**
 * Generate announcement WAV via Kokoro TTS (fallback engine).
 *
 * Requires the Kokoro model and voices files in the TTS directory and a
 * built `tts-engine` binary in one of the known locations under the current
 * working directory. The binary is invoked without a shell. A partially
 * written output file is removed on failure.
 *
 * @param wavPath - destination path of the WAV file
 * @param text - text to synthesize
 * @param log - logger callback
 * @returns true on success
 */
function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): boolean {
  const modelPath = path.join(TTS_DIR, KOKORO_MODEL);
  const voicesPath = path.join(TTS_DIR, KOKORO_VOICES);

  if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) {
    log('[tts] Kokoro model/voices not found — Kokoro fallback unavailable');
    return false;
  }

  const root = process.cwd();
  const ttsBinPaths = [
    path.join(root, 'dist_rust', 'tts-engine'),
    path.join(root, 'rust', 'target', 'release', 'tts-engine'),
    path.join(root, 'rust', 'target', 'debug', 'tts-engine'),
  ];
  const ttsBin = ttsBinPaths.find((p) => fs.existsSync(p));
  if (!ttsBin) {
    log('[tts] tts-engine binary not found — Kokoro fallback unavailable');
    return false;
  }

  log('[tts] generating announcement audio via Kokoro TTS (fallback)...');
  try {
    execFileSync(
      ttsBin,
      [
        '--model', modelPath,
        '--voices', voicesPath,
        '--voice', KOKORO_VOICE,
        '--output', wavPath,
        '--text', text,
      ],
      { timeout: 120000, stdio: 'pipe' },
    );
    log('[tts] Kokoro WAV generated');
    return true;
  } catch (e: unknown) {
    log(`[tts] Kokoro failed: ${errorMessage(e)}`);
    removeFileQuietly(wavPath);
    return false;
  }
}

/**
 * Read a WAV file and detect its sample rate from the fmt chunk.
 *
 * Walks the RIFF chunk list; the sample rate defaults to 22050 Hz when no
 * `fmt ` chunk is found. The returned `pcm` is a view over the raw bytes of
 * the `data` chunk (the last one, if several are present).
 * NOTE(review): the sample format is not inspected — callers treat the data
 * as 16-bit mono PCM; confirm both TTS engines actually emit that.
 *
 * @returns { pcm, sampleRate } or null on failure.
 */
function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null {
  const wav = fs.readFileSync(wavPath);
  if (wav.length < 44) return null;
  if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
  if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;

  let sampleRate = 22050; // default
  let pcm: Buffer | null = null;
  let offset = 12;

  while (offset < wav.length - 8) {
    const chunkId = wav.toString('ascii', offset, offset + 4);
    const chunkSize = wav.readUInt32LE(offset + 4);

    if (chunkId === 'fmt ') {
      // The sample rate sits 4 bytes into the chunk payload; reject a
      // truncated fmt chunk instead of reading past the buffer.
      if (offset + 16 > wav.length) return null;
      sampleRate = wav.readUInt32LE(offset + 12);
    } else if (chunkId === 'data') {
      // subarray clamps to the buffer end, so an oversized chunkSize is safe.
      pcm = wav.subarray(offset + 8, offset + 8 + chunkSize);
    }

    offset += 8 + chunkSize;
    if (offset % 2 !== 0) offset++; // RIFF chunks are padded to even offsets
  }

  if (!pcm) return null;
  return { pcm, sampleRate };
}

/**
 * Pre-generate the announcement audio and encode to G.722 + Opus frames.
 * Must be called after the codec bridge is initialized.
 *
 * Engine priority: espeak-ng → Kokoro → disabled.
 *
 * The generated WAV is cached on disk and reused on later calls. A cached
 * file that cannot be parsed is deleted so the next call regenerates it.
 *
 * @param log - logger callback
 * @returns true when the announcement was encoded and cached in memory;
 *          false when no engine is available, the WAV is unusable, the codec
 *          bridge is not ready yet, or any step throws.
 */
export async function initAnnouncement(log: (msg: string) => void): Promise<boolean> {
  try {
    fs.mkdirSync(TTS_DIR, { recursive: true });

    // Generate WAV if not cached.
    if (!fs.existsSync(CACHE_WAV)) {
      let generated = false;

      // Try espeak-ng first.
      if (isEspeakAvailable()) {
        generated = generateViaEspeak(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
      } else {
        log('[tts] espeak-ng not installed — trying Kokoro fallback');
      }

      // Fall back to Kokoro.
      if (!generated) {
        generated = generateViaKokoro(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
      }

      if (!generated) {
        log('[tts] no TTS engine available — announcements disabled');
        return false;
      }
    }

    // Read WAV and extract raw PCM + sample rate.
    const result = readWavWithRate(CACHE_WAV);
    if (!result) {
      log('[tts] failed to parse WAV file');
      // Drop the unusable cache entry so a later init can regenerate it.
      removeFileQuietly(CACHE_WAV);
      return false;
    }
    const { pcm, sampleRate } = result;

    // Wait for codec bridge to be ready.
    if (!isCodecReady()) {
      log('[tts] codec bridge not ready — will retry');
      return false;
    }

    // Encode in 20ms chunks. The Rust encoder resamples to each codec's native rate.
    const frameSamples = Math.floor(sampleRate * 0.02);
    if (frameSamples <= 0) {
      // A zero-byte frame size would make the frame count infinite.
      log(`[tts] unsupported WAV sample rate: ${sampleRate}Hz`);
      removeFileQuietly(CACHE_WAV);
      return false;
    }
    const frameBytes = frameSamples * 2; // 16-bit = 2 bytes per sample
    const totalFrames = Math.floor(pcm.length / frameBytes);

    const g722Frames: Buffer[] = [];
    const opusFrames: Buffer[] = [];

    log(`[tts] encoding ${totalFrames} frames (${frameSamples} samples/frame @ ${sampleRate}Hz)...`);
    // Frames are submitted strictly in order, one at a time; only the two
    // codecs for the same frame are encoded concurrently.
    for (let i = 0; i < totalFrames; i++) {
      // Copy the slice so the encoder gets a standalone buffer rather than a
      // view into the whole WAV file.
      const pcmBuf = Buffer.from(pcm.subarray(i * frameBytes, (i + 1) * frameBytes));
      const [g722, opus] = await Promise.all([
        encodePcm(pcmBuf, sampleRate, PT_G722), // G.722 for SIP devices
        encodePcm(pcmBuf, sampleRate, PT_OPUS), // Opus for WebRTC browsers
      ]);
      if (g722) g722Frames.push(g722);
      if (opus) opusFrames.push(opus);
      if (!g722 && !opus && i < 3) log(`[tts] frame ${i} encode failed`);
    }

    const durationMs = totalFrames * FRAME_MS;
    cachedAnnouncement = { g722Frames, opusFrames, durationMs };

    log(`[tts] announcement cached: ${g722Frames.length} frames (${(durationMs / 1000).toFixed(1)}s)`);
    return true;
  } catch (e: unknown) {
    log(`[tts] init error: ${errorMessage(e)}`);
    return false;
  }
}

// ---------------------------------------------------------------------------
// Playback
// ---------------------------------------------------------------------------

/**
 * Send pre-encoded frames as RTP packets, one every 20ms.
 *
 * The marker bit is set on the first packet only. `counters` is mutated in
 * place: `seq` advances by one and `ts` by the codec's clock increment for
 * every packet sent. `onDone` fires on the tick after the last frame was
 * sent; it is not invoked when playback is cancelled.
 *
 * @returns a function that cancels playback
 */
function startRtpPlayback(
  frames: readonly Buffer[],
  pt: number,
  ssrc: number,
  counters: { seq: number; ts: number },
  send: (pkt: Buffer) => void,
  onDone?: () => void,
): () => void {
  const tsStep = rtpClockIncrement(pt);
  let frameIdx = 0;

  const timer = setInterval(() => {
    const payload = frames[frameIdx];
    if (payload === undefined) {
      // Ran past the last frame: playback is complete.
      clearInterval(timer);
      onDone?.();
      return;
    }

    const hdr = buildRtpHeader(pt, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, frameIdx === 0);
    send(Buffer.concat([hdr, payload]));

    counters.seq++;
    counters.ts += tsStep;
    frameIdx++;
  }, FRAME_MS);

  return () => clearInterval(timer);
}

/**
 * Play the pre-cached announcement to an RTP endpoint.
 *
 * Sends the G.722 frames with a randomly chosen initial sequence number and
 * timestamp. When no G.722 announcement is cached, `onDone` is called
 * immediately and nothing is sent.
 *
 * @param sendPacket - function to send a raw RTP packet
 * @param ssrc - SSRC to use in RTP headers
 * @param onDone - called when the announcement finishes
 * @returns a cancel function, or null if no announcement is cached
 */
export function playAnnouncement(
  sendPacket: (pkt: Buffer) => void,
  ssrc: number,
  onDone?: () => void,
): (() => void) | null {
  if (!cachedAnnouncement || cachedAnnouncement.g722Frames.length === 0) {
    onDone?.();
    return null;
  }

  const counters = {
    seq: Math.floor(Math.random() * 0xffff),
    ts: Math.floor(Math.random() * 0xffffffff),
  };
  return startRtpPlayback(cachedAnnouncement.g722Frames, PT_G722, ssrc, counters, sendPacket, onDone);
}

/**
 * Play pre-cached Opus announcement to a WebRTC PeerConnection sender.
 *
 * When no Opus announcement is cached, `onDone` is called immediately and
 * nothing is sent.
 *
 * @param sendRtpPacket - function to send a raw RTP packet via sender.sendRtp()
 * @param ssrc - SSRC to use in RTP headers
 * @param counters - caller-owned sequence/timestamp counters; read for each
 *                   packet and advanced in place (seq by 1, ts by 960 — Opus
 *                   at 48kHz: 960 samples per 20ms)
 * @param onDone - called when announcement finishes
 * @returns cancel function, or null if no announcement cached
 */
export function playAnnouncementToWebRtc(
  sendRtpPacket: (pkt: Buffer) => void,
  ssrc: number,
  counters: { seq: number; ts: number },
  onDone?: () => void,
): (() => void) | null {
  if (!cachedAnnouncement || cachedAnnouncement.opusFrames.length === 0) {
    onDone?.();
    return null;
  }

  return startRtpPlayback(cachedAnnouncement.opusFrames, PT_OPUS, ssrc, counters, sendRtpPacket, onDone);
}

/** Check if an announcement is cached and ready (at least one G.722 frame). */
export function isAnnouncementReady(): boolean {
  return cachedAnnouncement !== null && cachedAnnouncement.g722Frames.length > 0;
}