/**
 * PromptCache — manages multiple named audio prompts for IVR and voicemail.
 *
 * Each prompt is pre-encoded as both G.722 frames (for SIP legs) and Opus
 * frames (for WebRTC legs), ready for 20ms RTP playback.
 *
 * Supports three sources:
 * 1. TTS generation via espeak-ng (primary) or Kokoro (fallback)
 * 2. Loading from a pre-existing WAV file
 * 3. Programmatic tone generation (beep, etc.)
 *
 * The existing announcement.ts system continues to work independently;
 * this module provides generalized prompt management for IVR/voicemail.
 */

import { execFileSync, execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { Buffer } from 'node:buffer';
import { encodePcm, isCodecReady } from '../opusbridge.ts';

/** Duration of one RTP frame in milliseconds. */
const FRAME_MS = 20;

/** RTP payload type used for G.722. */
const PT_G722 = 9;

/** RTP payload type used for Opus. */
const PT_OPUS = 111;

/** WAV `fmt ` audio format tag for integer PCM. */
const WAVE_FORMAT_PCM = 0x0001;

/** WAV `fmt ` audio format tag for WAVE_FORMAT_EXTENSIBLE. */
const WAVE_FORMAT_EXTENSIBLE = 0xfffe;

/** RTP clock increment per 20ms frame for each codec. */
function rtpClockIncrement(pt: number): number {
  // Opus uses a 48 kHz RTP clock: 48000 * 0.02 = 960.
  if (pt === PT_OPUS) return 960;
  // G.722 (PT 9) and every other payload type advance by 160 per frame;
  // G.722's RTP clock is 8 kHz (RFC 3551) even though it samples at 16 kHz.
  return 160;
}

/**
 * Build a fresh 12-byte RTP header (version 2, no padding/extension/CSRC).
 *
 * @param pt - payload type (low 7 bits are used)
 * @param seq - sequence number (wrapped to 16 bits)
 * @param ts - RTP timestamp (wrapped to 32 bits)
 * @param ssrc - synchronization source identifier (wrapped to 32 bits)
 * @param marker - whether to set the marker bit
 */
function buildRtpHeader(pt: number, seq: number, ts: number, ssrc: number, marker: boolean): Buffer {
  const hdr = Buffer.alloc(12);
  hdr[0] = 0x80;
  hdr[1] = (marker ? 0x80 : 0) | (pt & 0x7f);
  hdr.writeUInt16BE(seq & 0xffff, 2);
  hdr.writeUInt32BE(ts >>> 0, 4);
  hdr.writeUInt32BE(ssrc >>> 0, 8);
  return hdr;
}

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

/** A pre-encoded prompt ready for RTP playback. */
export interface ICachedPrompt {
  /** Unique prompt identifier. */
  id: string;
  /** G.722 encoded frames (20ms each, no RTP header). */
  g722Frames: Buffer[];
  /** Opus encoded frames (20ms each, no RTP header). */
  opusFrames: Buffer[];
  /** Total duration in milliseconds. */
  durationMs: number;
}

/** Result of encoding a PCM buffer into per-codec 20ms frames. */
type EncodedFrames = { g722Frames: Buffer[]; opusFrames: Buffer[] };

// ---------------------------------------------------------------------------
// TTS helpers
// ---------------------------------------------------------------------------

const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');

/** Check if espeak-ng is available. */
function isEspeakAvailable(): boolean {
  try {
    execSync('which espeak-ng', { stdio: 'pipe' });
    return true;
  } catch {
    return false;
  }
}

/** Delete a file, ignoring every error (missing file, permissions, ...). */
function removeQuietly(filePath: string): void {
  try {
    fs.rmSync(filePath, { force: true });
  } catch {
    // Best-effort cleanup only.
  }
}

/**
 * Generate WAV via espeak-ng.
 *
 * The text is passed as a discrete argv entry (no shell), so quotes, `$` and
 * backticks in the prompt text cannot break or inject into the command.
 *
 * @returns true when espeak-ng exited successfully within 10 seconds
 */
function generateViaEspeak(wavPath: string, text: string): boolean {
  try {
    execFileSync('espeak-ng', ['-v', 'en-us', '-s', '150', '-w', wavPath, text], {
      timeout: 10000,
      stdio: 'pipe',
    });
    return true;
  } catch {
    return false;
  }
}

/**
 * Generate WAV via Kokoro TTS.
 *
 * Requires the model and voices files in the TTS directory and a built
 * `tts-engine` binary in one of the known locations; returns false otherwise.
 * Arguments are passed without a shell for the same reason as espeak-ng.
 *
 * @returns true when the engine exited successfully within 120 seconds
 */
function generateViaKokoro(wavPath: string, text: string, voice: string): boolean {
  const modelPath = path.join(TTS_DIR, 'kokoro-v1.0.onnx');
  const voicesPath = path.join(TTS_DIR, 'voices.bin');
  if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) return false;

  const root = process.cwd();
  const ttsBin = [
    path.join(root, 'dist_rust', 'tts-engine'),
    path.join(root, 'rust', 'target', 'release', 'tts-engine'),
    path.join(root, 'rust', 'target', 'debug', 'tts-engine'),
  ].find((p) => fs.existsSync(p));
  if (!ttsBin) return false;

  try {
    execFileSync(
      ttsBin,
      ['--model', modelPath, '--voices', voicesPath, '--voice', voice, '--output', wavPath, '--text', text],
      { timeout: 120000, stdio: 'pipe' },
    );
    return true;
  } catch {
    return false;
  }
}

/** Average interleaved 16-bit little-endian samples down to a single channel. */
function downmixToMono(interleaved: Buffer, channels: number): Buffer {
  const sampleFrames = Math.floor(interleaved.length / (2 * channels));
  const mono = Buffer.alloc(sampleFrames * 2);
  for (let f = 0; f < sampleFrames; f++) {
    let sum = 0;
    for (let c = 0; c < channels; c++) {
      sum += interleaved.readInt16LE((f * channels + c) * 2);
    }
    mono.writeInt16LE(Math.round(sum / channels), f * 2);
  }
  return mono;
}

/**
 * Read a WAV file and return raw mono 16-bit PCM + sample rate.
 *
 * Returns null when the file cannot be read, is not a RIFF/WAVE container,
 * has no `data` chunk, or is not 16-bit integer PCM (the encoder path below
 * assumes 2 bytes per sample). Multi-channel audio is downmixed to mono.
 * When no usable `fmt ` chunk is present, 22050 Hz mono 16-bit is assumed.
 */
function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null {
  let wav: Buffer;
  try {
    wav = fs.readFileSync(wavPath);
  } catch {
    return null;
  }
  if (wav.length < 44) return null;
  if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
  if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;

  let sampleRate = 22050;
  let channels = 1;
  let bitsPerSample = 16;
  let audioFormat = WAVE_FORMAT_PCM;
  let data: Buffer | null = null;

  let offset = 12;
  // Each chunk needs a full 8-byte header (4-byte id + 4-byte LE size).
  while (offset + 8 <= wav.length) {
    const chunkId = wav.toString('ascii', offset, offset + 4);
    const chunkSize = wav.readUInt32LE(offset + 4);
    const bodyStart = offset + 8;

    if (chunkId === 'fmt ') {
      // Only trust the chunk if the 16 mandatory bytes are really present.
      if (chunkSize >= 16 && bodyStart + 16 <= wav.length) {
        audioFormat = wav.readUInt16LE(bodyStart);
        channels = wav.readUInt16LE(bodyStart + 2);
        sampleRate = wav.readUInt32LE(bodyStart + 4);
        bitsPerSample = wav.readUInt16LE(bodyStart + 14);
      }
    } else if (chunkId === 'data') {
      // subarray clamps to the buffer end, so an oversized size is harmless.
      data = wav.subarray(bodyStart, bodyStart + chunkSize);
    }

    // RIFF chunks are word-aligned: odd-sized bodies carry one pad byte.
    offset = bodyStart + chunkSize + (chunkSize & 1);
  }

  if (!data) return null;
  if (audioFormat !== WAVE_FORMAT_PCM && audioFormat !== WAVE_FORMAT_EXTENSIBLE) return null;
  if (bitsPerSample !== 16 || channels < 1 || sampleRate <= 0) return null;

  const pcm = channels === 1 ? data : downmixToMono(data, channels);
  return { pcm, sampleRate };
}

/**
 * Encode raw 16-bit mono PCM to G.722 + Opus, one 20ms frame at a time.
 *
 * Frames are encoded strictly in order. A trailing partial frame is padded
 * with silence instead of being dropped, so the tail of the audio is kept.
 *
 * @param pcm - 16-bit little-endian mono samples
 * @param sampleRate - sample rate of `pcm` in Hz
 * @param log - sink for diagnostic messages
 * @returns per-codec frame lists, or null when the codec bridge is not ready
 *          or the sample rate is too low to form a 20ms frame
 */
async function encodePcmFrames(
  pcm: Buffer,
  sampleRate: number,
  log: (msg: string) => void,
): Promise<EncodedFrames | null> {
  if (!isCodecReady()) return null;

  const frameSamples = Math.floor(sampleRate * 0.02); // 20ms
  const frameBytes = frameSamples * 2; // 16-bit
  // A zero or non-finite frame size would make the frame count Infinity/NaN
  // and the loop below would never terminate.
  if (!Number.isFinite(frameBytes) || frameBytes <= 0) {
    log(`[prompt-cache] unsupported sample rate: ${sampleRate}`);
    return null;
  }

  // Ignore a dangling odd byte; only whole 16-bit samples are encoded.
  const usableBytes = pcm.length - (pcm.length % 2);
  const totalFrames = Math.ceil(usableBytes / frameBytes);

  const g722Frames: Buffer[] = [];
  const opusFrames: Buffer[] = [];
  let g722Dropped = 0;
  let opusDropped = 0;

  for (let i = 0; i < totalFrames; i++) {
    // Buffer.alloc zero-fills, so a short final frame is padded with silence.
    const framePcm = Buffer.alloc(frameBytes);
    pcm.copy(framePcm, 0, i * frameBytes, Math.min((i + 1) * frameBytes, usableBytes));

    const [g722, opus] = await Promise.all([
      encodePcm(framePcm, sampleRate, PT_G722), // G.722
      encodePcm(framePcm, sampleRate, PT_OPUS), // Opus
    ]);
    if (g722) g722Frames.push(g722);
    else g722Dropped++;
    if (opus) opusFrames.push(opus);
    else opusDropped++;
  }

  if (g722Dropped > 0 || opusDropped > 0) {
    log(`[prompt-cache] encoder dropped frames (g722=${g722Dropped}, opus=${opusDropped}) of ${totalFrames}`);
  }
  return { g722Frames, opusFrames };
}

/** Assemble a cache entry; duration follows the longer of the two frame lists. */
function buildPrompt(id: string, encoded: EncodedFrames): ICachedPrompt {
  const frameCount = Math.max(encoded.g722Frames.length, encoded.opusFrames.length);
  return {
    id,
    g722Frames: encoded.g722Frames,
    opusFrames: encoded.opusFrames,
    durationMs: frameCount * FRAME_MS,
  };
}

// ---------------------------------------------------------------------------
// PromptCache
// ---------------------------------------------------------------------------

/**
 * In-memory cache of pre-encoded prompts keyed by prompt ID.
 *
 * Prompts are added through `generatePrompt`, `loadWavPrompt` or
 * `generateBeep`; a later call with the same ID replaces the earlier entry.
 */
export class PromptCache {
  private prompts = new Map<string, ICachedPrompt>();
  private log: (msg: string) => void;
  /** null until espeak-ng availability has been probed once. */
  private espeakAvailable: boolean | null = null;

  /**
   * @param log - sink for diagnostic messages
   */
  constructor(log: (msg: string) => void) {
    this.log = log;
  }

  // -------------------------------------------------------------------------
  // Public API
  // -------------------------------------------------------------------------

  /** Get a cached prompt by ID. */
  get(id: string): ICachedPrompt | null {
    return this.prompts.get(id) ?? null;
  }

  /** Check if a prompt is cached. */
  has(id: string): boolean {
    return this.prompts.has(id);
  }

  /** List all cached prompt IDs. */
  listIds(): string[] {
    return [...this.prompts.keys()];
  }

  /**
   * Generate a TTS prompt and cache it.
   * Uses espeak-ng (primary) or Kokoro (fallback).
   *
   * The WAV is written to `prompt-<id>.wav` in the TTS directory and reused
   * on later calls: if that file already exists, no TTS is run, even when
   * `text` differs from the text it was generated from.
   *
   * @param id - prompt ID (also used in the WAV file name)
   * @param text - text to synthesize
   * @param voice - Kokoro voice name (ignored by espeak-ng)
   * @returns the cached prompt, or null if generation or encoding failed
   */
  async generatePrompt(id: string, text: string, voice = 'af_bella'): Promise<ICachedPrompt | null> {
    fs.mkdirSync(TTS_DIR, { recursive: true });
    const wavPath = path.join(TTS_DIR, `prompt-${id}.wav`);

    // An id containing path separators or ".." must not write outside TTS_DIR.
    if (path.dirname(wavPath) !== TTS_DIR) {
      this.log(`[prompt-cache] invalid prompt id "${id}"`);
      return null;
    }

    // Check espeak availability once.
    if (this.espeakAvailable === null) {
      this.espeakAvailable = isEspeakAvailable();
    }

    // Generate WAV.
    if (!fs.existsSync(wavPath)) {
      let generated = false;
      if (this.espeakAvailable) {
        generated = generateViaEspeak(wavPath, text);
      }
      if (!generated) {
        // Drop anything a failed or timed-out espeak run left behind.
        removeQuietly(wavPath);
        generated = generateViaKokoro(wavPath, text, voice);
      }
      if (!generated) {
        // Otherwise a partial file would be picked up as "cached" next time.
        removeQuietly(wavPath);
        this.log(`[prompt-cache] failed to generate TTS for "${id}"`);
        return null;
      }
      this.log(`[prompt-cache] generated WAV for "${id}"`);
    }

    return this.loadWavPrompt(id, wavPath);
  }

  /**
   * Load a WAV file as a prompt and cache it.
   *
   * @param id - prompt ID to cache under
   * @param wavPath - path to a 16-bit PCM WAV file
   * @returns the cached prompt, or null if the file is missing, cannot be
   *          parsed, or could not be encoded
   */
  async loadWavPrompt(id: string, wavPath: string): Promise<ICachedPrompt | null> {
    if (!fs.existsSync(wavPath)) {
      this.log(`[prompt-cache] WAV not found: ${wavPath}`);
      return null;
    }

    const result = readWavWithRate(wavPath);
    if (!result) {
      this.log(`[prompt-cache] failed to parse WAV: ${wavPath}`);
      return null;
    }

    const encoded = await encodePcmFrames(result.pcm, result.sampleRate, this.log);
    if (!encoded) {
      this.log(`[prompt-cache] encoding failed for "${id}" (codec bridge not ready?)`);
      return null;
    }

    const prompt = buildPrompt(id, encoded);
    this.prompts.set(id, prompt);
    const frameCount = prompt.durationMs / FRAME_MS;
    this.log(`[prompt-cache] cached "${id}": ${frameCount} frames (${(prompt.durationMs / 1000).toFixed(1)}s)`);
    return prompt;
  }

  /**
   * Generate a beep tone prompt (sine wave).
   * @param id - prompt ID
   * @param freqHz - tone frequency (default 1000 Hz)
   * @param durationMs - tone duration (default 500ms)
   * @param amplitude - 16-bit amplitude (default 8000)
   * @returns the cached prompt, or null if the duration is invalid or
   *          encoding failed
   */
  async generateBeep(
    id: string,
    freqHz = 1000,
    durationMs = 500,
    amplitude = 8000,
  ): Promise<ICachedPrompt | null> {
    // A negative or NaN duration would make Buffer.alloc throw.
    if (!Number.isFinite(durationMs) || durationMs < 0) {
      this.log(`[prompt-cache] invalid beep duration for "${id}": ${durationMs}`);
      return null;
    }

    // Generate at 16kHz for decent quality.
    const sampleRate = 16000;
    const totalSamples = Math.floor((sampleRate * durationMs) / 1000);
    const pcm = Buffer.alloc(totalSamples * 2);
    // Apply a short fade-in/fade-out to avoid click artifacts.
    const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade

    for (let i = 0; i < totalSamples; i++) {
      const t = i / sampleRate;
      let envelope = 1.0;
      if (i < fadeLen) envelope = i / fadeLen;
      else if (i > totalSamples - fadeLen) envelope = (totalSamples - i) / fadeLen;

      const sample = Math.round(Math.sin(2 * Math.PI * freqHz * t) * amplitude * envelope);
      // Clamp so an oversized amplitude cannot overflow the 16-bit write.
      pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2);
    }

    const encoded = await encodePcmFrames(pcm, sampleRate, this.log);
    if (!encoded) {
      this.log(`[prompt-cache] beep encoding failed for "${id}"`);
      return null;
    }

    const prompt = buildPrompt(id, encoded);
    this.prompts.set(id, prompt);
    this.log(`[prompt-cache] beep "${id}" cached: ${prompt.durationMs}ms @ ${freqHz}Hz`);
    return prompt;
  }

  /**
   * Remove a prompt from the cache.
   */
  remove(id: string): void {
    this.prompts.delete(id);
  }

  /**
   * Clear all cached prompts.
   */
  clear(): void {
    this.prompts.clear();
  }
}

// ---------------------------------------------------------------------------
// Standalone playback helpers (for use by SystemLeg)
// ---------------------------------------------------------------------------

/**
 * Drive a 20ms interval that hands one frame per tick to `sendFrame`.
 *
 * The first frame goes out on the first tick (20ms after the call); `onDone`
 * fires on the tick after the last frame, then the timer is cleared.
 *
 * @returns a function that cancels playback (onDone is not called on cancel)
 */
function playFrames(
  frames: readonly Buffer[],
  sendFrame: (payload: Buffer, isFirst: boolean) => void,
  onDone?: () => void,
): () => void {
  let frameIdx = 0;
  const timer = setInterval(() => {
    const payload = frames[frameIdx];
    if (frameIdx >= frames.length || payload === undefined) {
      clearInterval(timer);
      onDone?.();
      return;
    }
    sendFrame(payload, frameIdx === 0);
    frameIdx++;
  }, FRAME_MS);
  return () => clearInterval(timer);
}

/**
 * Play a cached prompt's G.722 frames as RTP packets at 20ms intervals.
 *
 * Sequence number and timestamp start at random values; the marker bit is
 * set on the first packet only.
 *
 * @param prompt - the cached prompt to play
 * @param sendPacket - function to send a raw RTP packet (12-byte header + payload)
 * @param ssrc - SSRC for RTP headers
 * @param onDone - called when playback finishes (immediately if there are no frames)
 * @returns cancel function, or null if prompt has no G.722 frames
 */
export function playPromptG722(
  prompt: ICachedPrompt,
  sendPacket: (pkt: Buffer) => void,
  ssrc: number,
  onDone?: () => void,
): (() => void) | null {
  if (prompt.g722Frames.length === 0) {
    onDone?.();
    return null;
  }

  let seq = Math.floor(Math.random() * 0xffff);
  let rtpTs = Math.floor(Math.random() * 0xffffffff);

  return playFrames(
    prompt.g722Frames,
    (payload, isFirst) => {
      const hdr = buildRtpHeader(PT_G722, seq & 0xffff, rtpTs >>> 0, ssrc >>> 0, isFirst);
      sendPacket(Buffer.concat([hdr, payload]));
      seq++;
      rtpTs += rtpClockIncrement(PT_G722);
    },
    onDone,
  );
}

/**
 * Play a cached prompt's Opus frames as RTP packets at 20ms intervals.
 *
 * @param prompt - the cached prompt to play
 * @param sendPacket - function to send a raw RTP packet
 * @param ssrc - SSRC for RTP headers
 * @param counters - shared seq/ts counters (mutated in place for seamless transitions)
 * @param onDone - called when playback finishes (immediately if there are no frames)
 * @returns cancel function, or null if prompt has no Opus frames
 */
export function playPromptOpus(
  prompt: ICachedPrompt,
  sendPacket: (pkt: Buffer) => void,
  ssrc: number,
  counters: { seq: number; ts: number },
  onDone?: () => void,
): (() => void) | null {
  if (prompt.opusFrames.length === 0) {
    onDone?.();
    return null;
  }

  return playFrames(
    prompt.opusFrames,
    (payload, isFirst) => {
      const hdr = buildRtpHeader(PT_OPUS, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, isFirst);
      sendPacket(Buffer.concat([hdr, payload]));
      counters.seq++;
      counters.ts += rtpClockIncrement(PT_OPUS); // Opus 48kHz: 960 samples per 20ms
    },
    onDone,
  );
}