/**
 * TTS announcement module — pre-generates audio announcements using Kokoro TTS
 * and caches them as encoded RTP packets for playback during call setup.
 *
 * On startup, generates the announcement WAV via the Rust tts-engine binary
 * (Kokoro neural TTS), encodes each 20ms frame to G.722 (for SIP) and Opus
 * (for WebRTC) via the Rust transcoder, and caches the packets.
 */

import { execFileSync, execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { Buffer } from 'node:buffer';
import { buildRtpHeader, rtpClockIncrement } from './call/leg.ts';
import { encodePcm, isCodecReady } from './opusbridge.ts';

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

/** A pre-encoded announcement ready for RTP playback. */
export interface IAnnouncementCache {
  /** G.722 encoded frames (each is a 20ms frame payload, no RTP header). */
  g722Frames: Buffer[];
  /** Opus encoded frames for WebRTC playback. */
  opusFrames: Buffer[];
  /** Total duration in milliseconds. */
  durationMs: number;
}

// ---------------------------------------------------------------------------
// State
// ---------------------------------------------------------------------------

let cachedAnnouncement: IAnnouncementCache | null = null;

const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
const KOKORO_MODEL = 'kokoro-v1.0.onnx';
const KOKORO_VOICES = 'voices.bin';
const KOKORO_VOICE = 'af_bella'; // American female, clear and natural
const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now.";
const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav');

/** Duration of one audio frame; also the pacing interval used for playback. */
const FRAME_MS = 20;

// ---------------------------------------------------------------------------
// Initialization
// ---------------------------------------------------------------------------

/**
 * Pre-generate the announcement audio and encode it to G.722 and Opus frames.
 * Must be called after the codec bridge is initialized.
 *
 * The WAV is generated once via the tts-engine binary and reused from
 * `CACHE_WAV` on later runs. Every failure path is logged and reported as
 * `false`; this function does not throw.
 *
 * @param log - sink for human-readable progress and error messages
 * @returns `true` once the announcement is cached, `false` if announcements
 *          are unavailable (missing model/voices/binary, unparsable WAV,
 *          codec bridge not ready, or any unexpected error)
 */
export async function initAnnouncement(log: (msg: string) => void): Promise<boolean> {
  const modelPath = path.join(TTS_DIR, KOKORO_MODEL);
  const voicesPath = path.join(TTS_DIR, KOKORO_VOICES);

  // Check if Kokoro model files exist.
  if (!fs.existsSync(modelPath)) {
    log('[tts] Kokoro model not found at ' + modelPath + ' — announcements disabled');
    return false;
  }
  if (!fs.existsSync(voicesPath)) {
    log('[tts] Kokoro voices not found at ' + voicesPath + ' — announcements disabled');
    return false;
  }

  // Find tts-engine binary (packaged build first, then cargo release/debug).
  const root = process.cwd();
  const ttsBinPaths = [
    path.join(root, 'dist_rust', 'tts-engine'),
    path.join(root, 'rust', 'target', 'release', 'tts-engine'),
    path.join(root, 'rust', 'target', 'debug', 'tts-engine'),
  ];
  const ttsBin = ttsBinPaths.find((p) => fs.existsSync(p));
  if (!ttsBin) {
    log('[tts] tts-engine binary not found — announcements disabled');
    return false;
  }

  try {
    // Generate WAV if not cached.
    if (!fs.existsSync(CACHE_WAV)) {
      log('[tts] generating announcement audio via Kokoro TTS...');
      try {
        // Argument array, no shell: paths or text containing quotes, `$` or
        // backticks are passed through verbatim instead of being interpreted.
        execFileSync(
          ttsBin,
          [
            '--model', modelPath,
            '--voices', voicesPath,
            '--voice', KOKORO_VOICE,
            '--output', CACHE_WAV,
            '--text', ANNOUNCEMENT_TEXT,
          ],
          { timeout: 120000, stdio: 'pipe' },
        );
      } catch (genErr: unknown) {
        // A failed or timed-out run may leave a partial file behind; remove it
        // so the next start regenerates instead of trusting a broken cache.
        try {
          fs.rmSync(CACHE_WAV, { force: true });
        } catch {
          // Best effort only — the generation error below is the one to report.
        }
        throw genErr;
      }
      log('[tts] announcement WAV generated');
    }

    // Read WAV and extract raw PCM.
    const wav = fs.readFileSync(CACHE_WAV);
    const pcm = extractPcmFromWav(wav);
    if (!pcm) {
      log('[tts] failed to parse WAV file');
      return false;
    }

    // Wait for codec bridge to be ready.
    if (!isCodecReady()) {
      log('[tts] codec bridge not ready — will retry');
      return false;
    }

    // Kokoro outputs 24000 Hz, 16-bit mono.
    // We encode in chunks: 20ms at 24000 Hz = 480 samples = 960 bytes of PCM.
    // The Rust encoder will resample to 16kHz internally for G.722.
    const SAMPLE_RATE = 24000;
    const FRAME_SAMPLES = Math.floor((SAMPLE_RATE * FRAME_MS) / 1000); // 480 samples per 20ms
    const FRAME_BYTES = FRAME_SAMPLES * 2; // 16-bit = 2 bytes per sample
    // Round up so the tail of the audio is kept (zero-padded) rather than dropped.
    const totalFrames = Math.ceil(pcm.length / FRAME_BYTES);

    const g722Frames: Buffer[] = [];
    const opusFrames: Buffer[] = [];

    log(`[tts] encoding ${totalFrames} frames (${FRAME_SAMPLES} samples/frame @ ${SAMPLE_RATE}Hz)...`);

    for (let i = 0; i < totalFrames; i++) {
      // Fresh zero-filled frame: a short final frame ends in silence.
      const pcmBuf = Buffer.alloc(FRAME_BYTES);
      pcm.copy(pcmBuf, 0, i * FRAME_BYTES, Math.min((i + 1) * FRAME_BYTES, pcm.length));

      const [g722, opus] = await Promise.all([
        encodePcm(pcmBuf, SAMPLE_RATE, 9), // G.722 for SIP devices
        encodePcm(pcmBuf, SAMPLE_RATE, 111), // Opus for WebRTC browsers
      ]);
      if (g722) g722Frames.push(g722);
      if (opus) opusFrames.push(opus);
      // Only report the first few failures to avoid flooding the log.
      if (!g722 && !opus && i < 3) log(`[tts] frame ${i} encode failed`);
    }

    cachedAnnouncement = {
      g722Frames,
      opusFrames,
      durationMs: totalFrames * FRAME_MS,
    };

    log(`[tts] announcement cached: ${g722Frames.length} frames (${(totalFrames * 20 / 1000).toFixed(1)}s)`);
    return true;
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    log(`[tts] init error: ${reason}`);
    return false;
  }
}

// ---------------------------------------------------------------------------
// Playback
// ---------------------------------------------------------------------------

/**
 * Emit one frame every `FRAME_MS` milliseconds until the list is exhausted.
 *
 * The first frame is emitted on the first timer tick (not synchronously), and
 * `onDone` fires on the tick after the last frame. Cancelling stops the timer
 * without invoking `onDone`.
 *
 * @param frames - encoded payloads to emit, in order
 * @param sendFrame - receives each payload and whether it is the first one
 * @param onDone - called once after all frames have been emitted
 * @returns a cancel function
 */
function paceFrames(
  frames: readonly Buffer[],
  sendFrame: (payload: Buffer, isFirst: boolean) => void,
  onDone?: () => void,
): () => void {
  let frameIdx = 0;
  const timer = setInterval(() => {
    const payload = frames[frameIdx];
    // Running past the end yields undefined: playback is complete.
    if (payload === undefined) {
      clearInterval(timer);
      onDone?.();
      return;
    }
    sendFrame(payload, frameIdx === 0);
    frameIdx++;
  }, FRAME_MS);
  return () => clearInterval(timer);
}

/**
 * Play the pre-cached announcement to an RTP endpoint.
 *
 * Sequence number and timestamp start at random values; the marker flag is
 * set on the first packet only.
 *
 * @param sendPacket - function to send a raw RTP packet
 * @param ssrc - SSRC to use in RTP headers
 * @param onDone - called when the announcement finishes (called immediately
 *                 when nothing is cached)
 * @returns a cancel function, or null if no announcement is cached
 */
export function playAnnouncement(
  sendPacket: (pkt: Buffer) => void,
  ssrc: number,
  onDone?: () => void,
): (() => void) | null {
  if (!cachedAnnouncement || cachedAnnouncement.g722Frames.length === 0) {
    onDone?.();
    return null;
  }

  const PT = 9; // G.722
  let seq = Math.floor(Math.random() * 0xffff);
  let rtpTs = Math.floor(Math.random() * 0xffffffff);

  return paceFrames(
    cachedAnnouncement.g722Frames,
    (payload, isFirst) => {
      const hdr = buildRtpHeader(PT, seq & 0xffff, rtpTs >>> 0, ssrc >>> 0, isFirst);
      sendPacket(Buffer.concat([hdr, payload]));
      seq++;
      rtpTs += rtpClockIncrement(PT);
    },
    onDone,
  );
}

/**
 * Play pre-cached Opus announcement to a WebRTC PeerConnection sender.
 *
 * @param sendRtpPacket - function to send a raw RTP packet via sender.sendRtp()
 * @param ssrc - SSRC to use in RTP headers
 * @param counters - caller-owned RTP sequence/timestamp counters; read for each
 *                   packet and advanced in place (seq by 1, ts by 960 per frame)
 * @param onDone - called when announcement finishes (called immediately when
 *                 nothing is cached)
 * @returns cancel function, or null if no announcement cached
 */
export function playAnnouncementToWebRtc(
  sendRtpPacket: (pkt: Buffer) => void,
  ssrc: number,
  counters: { seq: number; ts: number },
  onDone?: () => void,
): (() => void) | null {
  if (!cachedAnnouncement || cachedAnnouncement.opusFrames.length === 0) {
    onDone?.();
    return null;
  }

  const PT = 111; // Opus

  return paceFrames(
    cachedAnnouncement.opusFrames,
    (payload, isFirst) => {
      const hdr = buildRtpHeader(PT, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, isFirst);
      sendRtpPacket(Buffer.concat([hdr, payload]));
      counters.seq++;
      counters.ts += 960; // Opus at 48kHz: 960 samples per 20ms
    },
    onDone,
  );
}

/**
 * Check if an announcement is cached and ready.
 *
 * Only the G.722 frame list is inspected; the Opus list is not checked here.
 */
export function isAnnouncementReady(): boolean {
  return cachedAnnouncement !== null && cachedAnnouncement.g722Frames.length > 0;
}

// ---------------------------------------------------------------------------
// WAV parsing
// ---------------------------------------------------------------------------

/**
 * Minimal WAV parser — locate the "data" chunk and return its payload.
 *
 * The format chunk is not inspected; callers must know the sample format.
 * A data chunk whose declared size runs past the end of the buffer yields
 * whatever bytes are actually present.
 *
 * @param wav - complete contents of a RIFF/WAVE file
 * @returns a view onto the data chunk payload, or null if the buffer is not a
 *          RIFF/WAVE file or contains no data chunk
 */
function extractPcmFromWav(wav: Buffer): Buffer | null {
  if (wav.length < 44) return null;
  if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
  if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;

  let offset = 12;
  // Continue while a full 8-byte chunk header (id + size) still fits.
  while (offset + 8 <= wav.length) {
    const chunkId = wav.toString('ascii', offset, offset + 4);
    const chunkSize = wav.readUInt32LE(offset + 4);
    if (chunkId === 'data') {
      return wav.subarray(offset + 8, offset + 8 + chunkSize);
    }
    offset += 8 + chunkSize;
    // Word-align.
    if (offset % 2 !== 0) offset++;
  }
  return null;
}