2026-04-09 23:03:55 +00:00
|
|
|
/**
|
2026-04-10 08:22:12 +00:00
|
|
|
* TTS announcement module — pre-generates audio announcements using espeak-ng
|
2026-04-09 23:03:55 +00:00
|
|
|
* and caches them as encoded RTP packets for playback during call setup.
|
|
|
|
|
*
|
2026-04-10 08:22:12 +00:00
|
|
|
* On startup, generates the announcement WAV via espeak-ng (formant-based TTS
|
|
|
|
|
* with highly accurate pronunciation), encodes each 20ms frame to G.722 (for
|
|
|
|
|
* SIP) and Opus (for WebRTC) via the Rust transcoder, and caches the packets.
|
|
|
|
|
*
|
|
|
|
|
* Falls back to the Rust tts-engine (Kokoro neural TTS) if espeak-ng is not
|
|
|
|
|
* installed, and disables announcements if neither is available.
|
2026-04-09 23:03:55 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
import { execFileSync, execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { Buffer } from 'node:buffer';
import { encodePcm, isCodecReady } from './opusbridge.ts';
|
|
|
|
|
|
2026-04-10 11:36:18 +00:00
|
|
|
/**
 * RTP timestamp advance for one 20ms frame of the given payload type.
 *
 * Payload type 111 (used for Opus in this module) advances by 960 ticks;
 * payload type 9 (used for G.722 here) and every other value advance by 160.
 */
function rtpClockIncrement(pt: number): number {
  switch (pt) {
    case 111:
      return 960;
    case 9:
    default:
      return 160;
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Assemble a 12-byte RTP header.
 *
 * Byte 0 is always 0x80. Byte 1 carries the marker flag in its top bit and
 * the low 7 bits of the payload type. The sequence number is truncated to
 * 16 bits; timestamp and SSRC are coerced to unsigned 32-bit values. All
 * multi-byte fields are written big-endian.
 *
 * @param pt - payload type (only the low 7 bits are used)
 * @param seq - sequence number (only the low 16 bits are used)
 * @param ts - RTP timestamp
 * @param ssrc - synchronization source identifier
 * @param marker - whether to set the marker bit
 * @returns a newly allocated 12-byte header
 */
function buildRtpHeader(pt: number, seq: number, ts: number, ssrc: number, marker: boolean): Buffer {
  const header = Buffer.alloc(12);
  const markerBit = marker ? 0x80 : 0;

  header.writeUInt8(0x80, 0);
  header.writeUInt8(markerBit | (pt & 0x7f), 1);
  header.writeUInt16BE(seq & 0xffff, 2);
  header.writeUInt32BE(ts >>> 0, 4);
  header.writeUInt32BE(ssrc >>> 0, 8);

  return header;
}
|
|
|
|
|
|
2026-04-09 23:03:55 +00:00
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
// Types
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
/**
 * An announcement that has already been encoded and can be streamed as RTP
 * payloads without further processing.
 */
export interface IAnnouncementCache {
  /** G.722 payloads, one entry per 20ms frame; no RTP header is included. */
  g722Frames: Buffer[];

  /** Opus payloads used for WebRTC playback. */
  opusFrames: Buffer[];

  /** Length of the whole announcement, in milliseconds. */
  durationMs: number;
}
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
// State
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
/** Encoded announcement; stays null until initAnnouncement() stores a result. */
let cachedAnnouncement: IAnnouncementCache | null = null;

/** Directory (relative to the process working directory) holding TTS artifacts. */
const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');

/** Sentence that is synthesized for the announcement. */
const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now.";

/** Location of the generated announcement WAV inside TTS_DIR. */
const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav');

// Kokoro fallback: model file, voices file (both looked up inside TTS_DIR) and voice name.
const KOKORO_MODEL = 'kokoro-v1.0.onnx';
const KOKORO_VOICES = 'voices.bin';
const KOKORO_VOICE = 'af_bella';
|
|
|
|
|
|
2026-04-09 23:03:55 +00:00
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
// Initialization
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
/**
 * Report whether an `espeak-ng` executable can be located.
 *
 * Runs `which espeak-ng` synchronously; any failure of that command
 * (including a non-zero exit status) is treated as "not available".
 */
function isEspeakAvailable(): boolean {
  let found = true;
  try {
    execSync('which espeak-ng', { stdio: 'pipe' });
  } catch {
    found = false;
  }
  return found;
}
|
2026-04-09 23:03:55 +00:00
|
|
|
|
2026-04-10 08:22:12 +00:00
|
|
|
/**
 * Generate announcement WAV via espeak-ng (primary engine).
 *
 * espeak-ng is invoked directly with an argument vector (no shell), so the
 * text and output path are passed through verbatim: quotes, `$`, backticks
 * and similar characters cannot alter or extend the command.
 *
 * @param wavPath - file the synthesized audio is written to
 * @param text - sentence to synthesize
 * @param log - sink for progress and failure messages
 * @returns true on success; false if espeak-ng could not be run, exited with
 *          an error, or exceeded the 10 second timeout
 */
function generateViaEspeak(wavPath: string, text: string, log: (msg: string) => void): boolean {
  log('[tts] generating announcement audio via espeak-ng...');
  try {
    execFileSync('espeak-ng', ['-v', 'en-us', '-s', '150', '-w', wavPath, text], {
      timeout: 10000,
      stdio: 'pipe',
    });
    log('[tts] espeak-ng WAV generated');
    return true;
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    log(`[tts] espeak-ng failed: ${reason}`);
    // A failed or timed-out run may leave an incomplete file behind. Remove it
    // (best effort) so that callers which only check for the file's existence
    // do not mistake it for a finished WAV.
    try {
      fs.rmSync(wavPath, { force: true });
    } catch {
      // Cleanup failure is not actionable here; the generation failure was already logged.
    }
    return false;
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Generate announcement WAV via Kokoro TTS (fallback engine).
 *
 * Requires the Kokoro model and voices files inside TTS_DIR and a
 * `tts-engine` binary in one of the known build output locations under the
 * current working directory. The binary is executed directly with an
 * argument vector (no shell), so paths and text containing quotes, `$` or
 * backticks are passed through verbatim.
 *
 * @param wavPath - file the synthesized audio is written to
 * @param text - sentence to synthesize
 * @param log - sink for progress and failure messages
 * @returns true on success; false when a prerequisite is missing or the
 *          engine fails or exceeds the 120 second timeout
 */
function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): boolean {
  const modelPath = path.join(TTS_DIR, KOKORO_MODEL);
  const voicesPath = path.join(TTS_DIR, KOKORO_VOICES);

  if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) {
    log('[tts] Kokoro model/voices not found — Kokoro fallback unavailable');
    return false;
  }

  // Candidate binary locations, checked in order; the first existing one wins.
  const root = process.cwd();
  const ttsBinPaths = [
    path.join(root, 'dist_rust', 'tts-engine'),
    path.join(root, 'rust', 'target', 'release', 'tts-engine'),
    path.join(root, 'rust', 'target', 'debug', 'tts-engine'),
  ];
  const ttsBin = ttsBinPaths.find((p) => fs.existsSync(p));
  if (!ttsBin) {
    log('[tts] tts-engine binary not found — Kokoro fallback unavailable');
    return false;
  }

  log('[tts] generating announcement audio via Kokoro TTS (fallback)...');
  try {
    execFileSync(
      ttsBin,
      [
        '--model', modelPath,
        '--voices', voicesPath,
        '--voice', KOKORO_VOICE,
        '--output', wavPath,
        '--text', text,
      ],
      { timeout: 120000, stdio: 'pipe' },
    );
    log('[tts] Kokoro WAV generated');
    return true;
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    log(`[tts] Kokoro failed: ${reason}`);
    // A failed or timed-out run may leave an incomplete file behind. Remove it
    // (best effort) so that callers which only check for the file's existence
    // do not mistake it for a finished WAV.
    try {
      fs.rmSync(wavPath, { force: true });
    } catch {
      // Cleanup failure is not actionable here; the generation failure was already logged.
    }
    return false;
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read a WAV file, returning the contents of its `data` chunk together with
 * the sample rate declared in its `fmt ` chunk.
 *
 * The file must be at least 44 bytes long and start with a RIFF/WAVE header.
 * Chunks are walked from offset 12; each chunk is an 8-byte header (4-byte
 * id + little-endian 32-bit size) followed by its body, and the walk skips a
 * pad byte whenever the next offset would be odd. If no `fmt ` chunk is
 * present the sample rate defaults to 22050. The sample format itself
 * (channel count, bit depth) is not inspected.
 *
 * @param wavPath - path of the WAV file to read
 * @returns `{ pcm, sampleRate }`, or null when the header is not RIFF/WAVE,
 *          the `fmt ` chunk is too short or truncated, the declared sample
 *          rate is 0, or there is no `data` chunk
 * @throws whatever `fs.readFileSync` throws if the file cannot be read
 */
function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null {
  const wav = fs.readFileSync(wavPath);
  if (wav.length < 44) return null;
  if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
  if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;

  let sampleRate = 22050; // default when no fmt chunk is found
  let pcm: Buffer | null = null;
  let offset = 12;

  while (offset < wav.length - 8) {
    const chunkId = wav.toString('ascii', offset, offset + 4);
    const chunkSize = wav.readUInt32LE(offset + 4);
    const bodyStart = offset + 8;

    if (chunkId === 'fmt ') {
      // The sample rate occupies bytes 4..8 of the fmt body. Reject a chunk
      // that is too small to hold it or that runs past the end of the file,
      // instead of throwing or reading unrelated bytes.
      if (chunkSize < 8 || bodyStart + 8 > wav.length) return null;
      sampleRate = wav.readUInt32LE(bodyStart + 4);
    } else if (chunkId === 'data') {
      // subarray clamps to the buffer, so an oversized declared length yields
      // whatever audio is actually present.
      pcm = wav.subarray(bodyStart, bodyStart + chunkSize);
    }

    offset = bodyStart + chunkSize;
    if (offset % 2 !== 0) offset++;
  }

  // A zero sample rate cannot be used to size frames.
  if (!pcm || sampleRate === 0) return null;
  return { pcm, sampleRate };
}
|
|
|
|
|
|
|
|
|
|
/**
 * Pre-generate the announcement audio and encode it to G.722 + Opus frames.
 * Must be called after the codec bridge is initialized.
 *
 * Engine priority: espeak-ng → Kokoro → disabled. The WAV is only generated
 * when no cached file exists at CACHE_WAV; otherwise the existing file is
 * reused. The audio is then cut into 20ms frames (assumed to be 16-bit
 * samples, 2 bytes each), every frame is encoded for payload types 9 and
 * 111, and the results are stored in the module-level cache. Trailing audio
 * shorter than one full frame is dropped.
 *
 * @param log - sink for progress and failure messages
 * @returns true once the cache has been populated; false when no engine
 *          could produce audio, the WAV cannot be parsed or has an unusable
 *          sample rate, the codec bridge is not ready, or an error was
 *          thrown during processing
 */
export async function initAnnouncement(log: (msg: string) => void): Promise<boolean> {
  fs.mkdirSync(TTS_DIR, { recursive: true });

  try {
    // Generate WAV if not cached.
    if (!fs.existsSync(CACHE_WAV)) {
      let generated = false;

      // Try espeak-ng first.
      if (isEspeakAvailable()) {
        generated = generateViaEspeak(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
      } else {
        log('[tts] espeak-ng not installed — trying Kokoro fallback');
      }

      // Fall back to Kokoro.
      if (!generated) {
        generated = generateViaKokoro(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
      }

      if (!generated) {
        log('[tts] no TTS engine available — announcements disabled');
        return false;
      }
    }

    // Read WAV and extract raw PCM + sample rate.
    const result = readWavWithRate(CACHE_WAV);
    if (!result) {
      log('[tts] failed to parse WAV file');
      return false;
    }
    const { pcm, sampleRate } = result;

    // The codec bridge must be ready before any frame can be encoded.
    if (!isCodecReady()) {
      log('[tts] codec bridge not ready — will retry');
      return false;
    }

    // Encode in 20ms chunks. The Rust encoder resamples to each codec's native rate.
    const FRAME_SAMPLES = Math.floor(sampleRate * 0.02);
    if (FRAME_SAMPLES < 1) {
      // A sample rate below 50 Hz gives a zero-length frame; dividing by it
      // would make the frame count infinite and the loop below never end.
      log(`[tts] unusable WAV sample rate: ${sampleRate}Hz`);
      return false;
    }
    const FRAME_BYTES = FRAME_SAMPLES * 2; // 16-bit = 2 bytes per sample
    const totalFrames = Math.floor(pcm.length / FRAME_BYTES);

    const g722Frames: Buffer[] = [];
    const opusFrames: Buffer[] = [];

    log(`[tts] encoding ${totalFrames} frames (${FRAME_SAMPLES} samples/frame @ ${sampleRate}Hz)...`);
    for (let i = 0; i < totalFrames; i++) {
      const framePcm = pcm.subarray(i * FRAME_BYTES, (i + 1) * FRAME_BYTES);
      // Copy the slice so the encoder receives a standalone buffer.
      const pcmBuf = Buffer.from(framePcm);
      const [g722, opus] = await Promise.all([
        encodePcm(pcmBuf, sampleRate, 9), // G.722 for SIP devices
        encodePcm(pcmBuf, sampleRate, 111), // Opus for WebRTC browsers
      ]);
      if (g722) g722Frames.push(g722);
      if (opus) opusFrames.push(opus);
      // Only the first few total failures are reported, to keep the log short.
      if (!g722 && !opus && i < 3) log(`[tts] frame ${i} encode failed`);
    }

    cachedAnnouncement = {
      g722Frames,
      opusFrames,
      durationMs: totalFrames * 20,
    };

    log(`[tts] announcement cached: ${g722Frames.length} frames (${(totalFrames * 20 / 1000).toFixed(1)}s)`);
    return true;
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    log(`[tts] init error: ${reason}`);
    return false;
  }
}
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
// Playback
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
/**
 * Stream the cached G.722 announcement as RTP packets, one frame every 20ms.
 *
 * Sequence number and timestamp start at random values. The first packet
 * carries the marker bit. When no G.722 frames are cached, `onDone` is
 * invoked immediately and nothing is scheduled. Cancelling stops the timer
 * without invoking `onDone`.
 *
 * @param sendPacket - function to send a raw RTP packet (header + payload)
 * @param ssrc - SSRC to use in RTP headers
 * @param onDone - called on the timer tick after the last frame was sent
 * @returns a cancel function, or null if no announcement is cached
 */
export function playAnnouncement(
  sendPacket: (pkt: Buffer) => void,
  ssrc: number,
  onDone?: () => void,
): (() => void) | null {
  const frames = cachedAnnouncement?.g722Frames;
  if (!frames || frames.length === 0) {
    onDone?.();
    return null;
  }

  const PT = 9; // G.722
  const tsStep = rtpClockIncrement(PT);
  let seq = Math.floor(Math.random() * 0xffff);
  let rtpTs = Math.floor(Math.random() * 0xffffffff);
  let next = 0;

  const stop = (): void => clearInterval(timer);

  const timer = setInterval(() => {
    // All frames sent: stop ticking and notify the caller.
    if (next >= frames.length) {
      stop();
      onDone?.();
      return;
    }

    const isFirst = next === 0;
    const header = buildRtpHeader(PT, seq & 0xffff, rtpTs >>> 0, ssrc >>> 0, isFirst);
    sendPacket(Buffer.concat([header, frames[next]]));

    seq += 1;
    rtpTs += tsStep;
    next += 1;
  }, 20);

  return stop;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Stream the cached Opus announcement as RTP packets (payload type 111),
 * one frame every 20ms.
 *
 * Unlike the G.722 variant, sequence number and timestamp are taken from the
 * caller-supplied `counters` object, which is updated in place after each
 * packet. The first packet carries the marker bit. When no Opus frames are
 * cached, `onDone` is invoked immediately and nothing is scheduled.
 * Cancelling stops the timer without invoking `onDone`.
 *
 * @param sendRtpPacket - function to send a raw RTP packet via sender.sendRtp()
 * @param ssrc - SSRC to use in RTP headers
 * @param counters - mutable sequence/timestamp state; `seq` is advanced by 1
 *                   and `ts` by 960 for every packet sent
 * @param onDone - called on the timer tick after the last frame was sent
 * @returns cancel function, or null if no announcement cached
 */
export function playAnnouncementToWebRtc(
  sendRtpPacket: (pkt: Buffer) => void,
  ssrc: number,
  counters: { seq: number; ts: number },
  onDone?: () => void,
): (() => void) | null {
  const frames = cachedAnnouncement?.opusFrames;
  if (!frames || frames.length === 0) {
    onDone?.();
    return null;
  }

  const PT = 111; // Opus
  let next = 0;

  const stop = (): void => clearInterval(timer);

  const timer = setInterval(() => {
    // All frames sent: stop ticking and notify the caller.
    if (next >= frames.length) {
      stop();
      onDone?.();
      return;
    }

    const isFirst = next === 0;
    const header = buildRtpHeader(PT, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, isFirst);
    sendRtpPacket(Buffer.concat([header, frames[next]]));

    counters.seq++;
    counters.ts += 960; // Opus at 48kHz: 960 samples per 20ms
    next += 1;
  }, 20);

  return stop;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Report whether an announcement has been cached and contains at least one
 * G.722 frame. Opus frames are not considered.
 */
export function isAnnouncementReady(): boolean {
  const frameCount = cachedAnnouncement?.g722Frames.length ?? 0;
  return frameCount > 0;
}
|
|
|
|
|
|