feat(proxy-engine): integrate Kokoro TTS generation into proxy-engine and simplify TypeScript prompt handling to use cached WAV files
This commit is contained in:
@@ -1,55 +1,31 @@
|
||||
/**
|
||||
* PromptCache — manages multiple named audio prompts for IVR and voicemail.
|
||||
* PromptCache — manages named audio prompt WAV files for IVR and voicemail.
|
||||
*
|
||||
* Each prompt is pre-encoded as both G.722 frames (for SIP legs) and Opus
|
||||
* frames (for WebRTC legs), ready for 20ms RTP playback.
|
||||
* Generates WAV files via espeak-ng (primary) or Kokoro TTS through the
|
||||
* proxy-engine (fallback). Also supports loading pre-existing WAV files
|
||||
* and programmatic tone generation.
|
||||
*
|
||||
* Supports three sources:
|
||||
* 1. TTS generation via espeak-ng (primary) or Kokoro (fallback)
|
||||
* 2. Loading from a pre-existing WAV file
|
||||
* 3. Programmatic tone generation (beep, etc.)
|
||||
*
|
||||
* The existing announcement.ts system continues to work independently;
|
||||
* this module provides generalized prompt management for IVR/voicemail.
|
||||
* All audio playback happens in Rust (audio_player / start_interaction).
|
||||
* This module only manages WAV files on disk.
|
||||
*/
|
||||
|
||||
import { execSync } from 'node:child_process';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { Buffer } from 'node:buffer';
|
||||
import { encodePcm, isCodecReady } from '../opusbridge.ts';
|
||||
|
||||
/**
 * RTP timestamp increment for one 20 ms frame of the given payload type.
 *
 * Opus (PT 111) advances by 960 ticks per frame (48 kHz RTP clock). Every
 * other payload type, including G.722 (PT 9), advances by 160 ticks.
 *
 * @param pt - RTP payload type number.
 * @returns Timestamp ticks to add after each 20 ms packet.
 */
function rtpClockIncrement(pt: number): number {
  // PT 9 used to have its own branch, but it returned the same value as the
  // fallback, so a single conditional covers every case.
  return pt === 111 ? 960 : 160;
}
|
||||
|
||||
/**
 * Build a fresh 12-byte RTP fixed header (no CSRC list, no extension).
 *
 * All numeric fields are truncated to their wire width, so callers may pass
 * free-running counters without wrapping them first.
 *
 * @param pt - Payload type; only the low 7 bits are used.
 * @param seq - Sequence number; only the low 16 bits are used.
 * @param ts - RTP timestamp; reduced to an unsigned 32-bit value.
 * @param ssrc - Synchronization source id; reduced to an unsigned 32-bit value.
 * @param marker - When true, sets the marker bit (top bit of the second byte).
 * @returns A new 12-byte Buffer containing the header.
 */
function buildRtpHeader(pt: number, seq: number, ts: number, ssrc: number, marker: boolean): Buffer {
  const header = Buffer.alloc(12);
  // First byte 0x80: version 2, no padding, no extension, zero CSRC entries.
  header.writeUInt8(0x80, 0);
  header.writeUInt8((marker ? 0x80 : 0x00) | (pt & 0x7f), 1);
  header.writeUInt16BE(seq & 0xffff, 2);
  header.writeUInt32BE(ts >>> 0, 4);
  header.writeUInt32BE(ssrc >>> 0, 8);
  return header;
}
|
||||
import { sendProxyCommand, isProxyReady } from '../proxybridge.ts';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** A pre-encoded prompt ready for RTP playback. */
|
||||
/** A cached prompt — just a WAV file path and metadata. */
|
||||
export interface ICachedPrompt {
|
||||
/** Unique prompt identifier. */
|
||||
id: string;
|
||||
/** G.722 encoded frames (20ms each, no RTP header). */
|
||||
g722Frames: Buffer[];
|
||||
/** Opus encoded frames (20ms each, no RTP header). */
|
||||
opusFrames: Buffer[];
|
||||
/** Total duration in milliseconds. */
|
||||
/** Path to the WAV file on disk. */
|
||||
wavPath: string;
|
||||
/** Total duration in milliseconds (approximate, from WAV header). */
|
||||
durationMs: number;
|
||||
}
|
||||
|
||||
@@ -82,84 +58,61 @@ function generateViaEspeak(wavPath: string, text: string): boolean {
|
||||
}
|
||||
}
|
||||
|
||||
/** Generate WAV via Kokoro TTS. */
|
||||
function generateViaKokoro(wavPath: string, text: string, voice: string): boolean {
|
||||
/** Generate WAV via Kokoro TTS (runs inside proxy-engine). */
|
||||
async function generateViaKokoro(wavPath: string, text: string, voice: string): Promise<boolean> {
|
||||
const modelPath = path.join(TTS_DIR, 'kokoro-v1.0.onnx');
|
||||
const voicesPath = path.join(TTS_DIR, 'voices.bin');
|
||||
if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) return false;
|
||||
|
||||
const root = process.cwd();
|
||||
const ttsBin = [
|
||||
path.join(root, 'dist_rust', 'tts-engine'),
|
||||
path.join(root, 'rust', 'target', 'release', 'tts-engine'),
|
||||
path.join(root, 'rust', 'target', 'debug', 'tts-engine'),
|
||||
].find((p) => fs.existsSync(p));
|
||||
if (!ttsBin) return false;
|
||||
if (!isProxyReady()) return false;
|
||||
|
||||
try {
|
||||
execSync(
|
||||
`"${ttsBin}" --model "${modelPath}" --voices "${voicesPath}" --voice "${voice}" --output "${wavPath}" --text "${text}"`,
|
||||
{ timeout: 120000, stdio: 'pipe' },
|
||||
);
|
||||
await sendProxyCommand('generate_tts', {
|
||||
model: modelPath,
|
||||
voices: voicesPath,
|
||||
voice,
|
||||
text,
|
||||
output: wavPath,
|
||||
});
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/** Read a WAV file and return raw PCM + sample rate. */
|
||||
function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null {
|
||||
const wav = fs.readFileSync(wavPath);
|
||||
if (wav.length < 44) return null;
|
||||
if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
|
||||
if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;
|
||||
/** Read a WAV file's duration from its header. */
|
||||
function getWavDurationMs(wavPath: string): number {
|
||||
try {
|
||||
const wav = fs.readFileSync(wavPath);
|
||||
if (wav.length < 44) return 0;
|
||||
if (wav.toString('ascii', 0, 4) !== 'RIFF') return 0;
|
||||
|
||||
let sampleRate = 22050;
|
||||
let pcm: Buffer | null = null;
|
||||
let offset = 12;
|
||||
let sampleRate = 16000;
|
||||
let dataSize = 0;
|
||||
let bitsPerSample = 16;
|
||||
let channels = 1;
|
||||
let offset = 12;
|
||||
|
||||
while (offset < wav.length - 8) {
|
||||
const chunkId = wav.toString('ascii', offset, offset + 4);
|
||||
const chunkSize = wav.readUInt32LE(offset + 4);
|
||||
if (chunkId === 'fmt ') {
|
||||
sampleRate = wav.readUInt32LE(offset + 12);
|
||||
while (offset < wav.length - 8) {
|
||||
const chunkId = wav.toString('ascii', offset, offset + 4);
|
||||
const chunkSize = wav.readUInt32LE(offset + 4);
|
||||
if (chunkId === 'fmt ') {
|
||||
channels = wav.readUInt16LE(offset + 10);
|
||||
sampleRate = wav.readUInt32LE(offset + 12);
|
||||
bitsPerSample = wav.readUInt16LE(offset + 22);
|
||||
}
|
||||
if (chunkId === 'data') {
|
||||
dataSize = chunkSize;
|
||||
}
|
||||
offset += 8 + chunkSize;
|
||||
if (offset % 2 !== 0) offset++;
|
||||
}
|
||||
if (chunkId === 'data') {
|
||||
pcm = wav.subarray(offset + 8, offset + 8 + chunkSize);
|
||||
}
|
||||
offset += 8 + chunkSize;
|
||||
if (offset % 2 !== 0) offset++;
|
||||
|
||||
const bytesPerSample = (bitsPerSample / 8) * channels;
|
||||
const totalSamples = bytesPerSample > 0 ? dataSize / bytesPerSample : 0;
|
||||
return sampleRate > 0 ? Math.round((totalSamples / sampleRate) * 1000) : 0;
|
||||
} catch {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return pcm ? { pcm, sampleRate } : null;
|
||||
}
|
||||
|
||||
/**
 * Encode raw 16-bit PCM into per-frame G.722 and Opus payloads.
 *
 * The PCM is cut into consecutive 20 ms frames; a trailing partial frame is
 * discarded. Each frame is handed to the codec bridge twice (payload type 9
 * for G.722, 111 for Opus). A frame is appended to a list only when the
 * corresponding encode call yields a truthy result, so the two lists may
 * differ in length if one encoder rejects a frame.
 *
 * @param pcm - Raw PCM bytes, two bytes per sample.
 * @param sampleRate - Sample rate of `pcm` in Hz.
 * @param log - Logger used to report why encoding was refused.
 * @returns The encoded frame lists, or null when the codec bridge is not
 *          ready or the sample rate does not yield a positive frame size.
 */
async function encodePcmFrames(
  pcm: Buffer,
  sampleRate: number,
  log: (msg: string) => void,
): Promise<{ g722Frames: Buffer[]; opusFrames: Buffer[] } | null> {
  if (!isCodecReady()) return null;

  const frameSamples = Math.floor(sampleRate * 0.02); // 20ms
  const frameBytes = frameSamples * 2; // 16-bit

  // A sample rate below 50 Hz (or NaN/negative) gives a zero or invalid frame
  // size; dividing by it would make the frame count Infinity and the loop
  // below would never terminate.
  if (!(frameBytes > 0)) {
    log(`[prompt-cache] cannot encode PCM: invalid sample rate ${sampleRate}`);
    return null;
  }

  const totalFrames = Math.floor(pcm.length / frameBytes);
  const g722Frames: Buffer[] = [];
  const opusFrames: Buffer[] = [];

  // Frames are encoded strictly in order; only the two codecs for the same
  // frame run concurrently.
  for (let i = 0; i < totalFrames; i++) {
    const framePcm = Buffer.from(pcm.subarray(i * frameBytes, (i + 1) * frameBytes));
    const [g722, opus] = await Promise.all([
      encodePcm(framePcm, sampleRate, 9), // G.722
      encodePcm(framePcm, sampleRate, 111), // Opus
    ]);
    if (g722) g722Frames.push(g722);
    if (opus) opusFrames.push(opus);
  }

  return { g722Frames, opusFrames };
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -195,7 +148,7 @@ export class PromptCache {
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a TTS prompt and cache it.
|
||||
* Generate a TTS prompt WAV and cache its path.
|
||||
* Uses espeak-ng (primary) or Kokoro (fallback).
|
||||
*/
|
||||
async generatePrompt(id: string, text: string, voice = 'af_bella'): Promise<ICachedPrompt | null> {
|
||||
@@ -207,14 +160,14 @@ export class PromptCache {
|
||||
this.espeakAvailable = isEspeakAvailable();
|
||||
}
|
||||
|
||||
// Generate WAV.
|
||||
let generated = false;
|
||||
// Generate WAV if not already on disk.
|
||||
if (!fs.existsSync(wavPath)) {
|
||||
let generated = false;
|
||||
if (this.espeakAvailable) {
|
||||
generated = generateViaEspeak(wavPath, text);
|
||||
}
|
||||
if (!generated) {
|
||||
generated = generateViaKokoro(wavPath, text, voice);
|
||||
generated = await generateViaKokoro(wavPath, text, voice);
|
||||
}
|
||||
if (!generated) {
|
||||
this.log(`[prompt-cache] failed to generate TTS for "${id}"`);
|
||||
@@ -223,49 +176,22 @@ export class PromptCache {
|
||||
this.log(`[prompt-cache] generated WAV for "${id}"`);
|
||||
}
|
||||
|
||||
return this.loadWavPrompt(id, wavPath);
|
||||
return this.registerWav(id, wavPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Load a WAV file as a prompt and cache it.
|
||||
* Load a pre-existing WAV file as a prompt.
|
||||
*/
|
||||
async loadWavPrompt(id: string, wavPath: string): Promise<ICachedPrompt | null> {
|
||||
if (!fs.existsSync(wavPath)) {
|
||||
this.log(`[prompt-cache] WAV not found: ${wavPath}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
const result = readWavWithRate(wavPath);
|
||||
if (!result) {
|
||||
this.log(`[prompt-cache] failed to parse WAV: ${wavPath}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
const encoded = await encodePcmFrames(result.pcm, result.sampleRate, this.log);
|
||||
if (!encoded) {
|
||||
this.log(`[prompt-cache] encoding failed for "${id}" (codec bridge not ready?)`);
|
||||
return null;
|
||||
}
|
||||
|
||||
const durationMs = encoded.g722Frames.length * 20;
|
||||
const prompt: ICachedPrompt = {
|
||||
id,
|
||||
g722Frames: encoded.g722Frames,
|
||||
opusFrames: encoded.opusFrames,
|
||||
durationMs,
|
||||
};
|
||||
|
||||
this.prompts.set(id, prompt);
|
||||
this.log(`[prompt-cache] cached "${id}": ${encoded.g722Frames.length} frames (${(durationMs / 1000).toFixed(1)}s)`);
|
||||
return prompt;
|
||||
return this.registerWav(id, wavPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a beep tone prompt (sine wave).
|
||||
* @param id - prompt ID
|
||||
* @param freqHz - tone frequency (default 1000 Hz)
|
||||
* @param durationMs - tone duration (default 500ms)
|
||||
* @param amplitude - 16-bit amplitude (default 8000)
|
||||
* Generate a beep tone WAV and cache it.
|
||||
*/
|
||||
async generateBeep(
|
||||
id: string,
|
||||
@@ -273,149 +199,77 @@ export class PromptCache {
|
||||
durationMs = 500,
|
||||
amplitude = 8000,
|
||||
): Promise<ICachedPrompt | null> {
|
||||
// Generate at 16kHz for decent quality.
|
||||
const sampleRate = 16000;
|
||||
const totalSamples = Math.floor((sampleRate * durationMs) / 1000);
|
||||
const pcm = Buffer.alloc(totalSamples * 2);
|
||||
fs.mkdirSync(TTS_DIR, { recursive: true });
|
||||
const wavPath = path.join(TTS_DIR, `prompt-${id}.wav`);
|
||||
|
||||
for (let i = 0; i < totalSamples; i++) {
|
||||
const t = i / sampleRate;
|
||||
// Apply a short fade-in/fade-out to avoid click artifacts.
|
||||
const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade
|
||||
let envelope = 1.0;
|
||||
if (i < fadeLen) envelope = i / fadeLen;
|
||||
else if (i > totalSamples - fadeLen) envelope = (totalSamples - i) / fadeLen;
|
||||
if (!fs.existsSync(wavPath)) {
|
||||
// Generate 16kHz 16-bit mono sine wave WAV.
|
||||
const sampleRate = 16000;
|
||||
const totalSamples = Math.floor((sampleRate * durationMs) / 1000);
|
||||
const pcm = Buffer.alloc(totalSamples * 2);
|
||||
|
||||
const sample = Math.round(Math.sin(2 * Math.PI * freqHz * t) * amplitude * envelope);
|
||||
pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2);
|
||||
for (let i = 0; i < totalSamples; i++) {
|
||||
const t = i / sampleRate;
|
||||
const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade
|
||||
let envelope = 1.0;
|
||||
if (i < fadeLen) envelope = i / fadeLen;
|
||||
else if (i > totalSamples - fadeLen) envelope = (totalSamples - i) / fadeLen;
|
||||
|
||||
const sample = Math.round(Math.sin(2 * Math.PI * freqHz * t) * amplitude * envelope);
|
||||
pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2);
|
||||
}
|
||||
|
||||
// Write WAV file.
|
||||
const headerSize = 44;
|
||||
const dataSize = pcm.length;
|
||||
const wav = Buffer.alloc(headerSize + dataSize);
|
||||
|
||||
// RIFF header
|
||||
wav.write('RIFF', 0);
|
||||
wav.writeUInt32LE(36 + dataSize, 4);
|
||||
wav.write('WAVE', 8);
|
||||
|
||||
// fmt chunk
|
||||
wav.write('fmt ', 12);
|
||||
wav.writeUInt32LE(16, 16); // chunk size
|
||||
wav.writeUInt16LE(1, 20); // PCM format
|
||||
wav.writeUInt16LE(1, 22); // mono
|
||||
wav.writeUInt32LE(sampleRate, 24);
|
||||
wav.writeUInt32LE(sampleRate * 2, 28); // byte rate
|
||||
wav.writeUInt16LE(2, 32); // block align
|
||||
wav.writeUInt16LE(16, 34); // bits per sample
|
||||
|
||||
// data chunk
|
||||
wav.write('data', 36);
|
||||
wav.writeUInt32LE(dataSize, 40);
|
||||
pcm.copy(wav, 44);
|
||||
|
||||
fs.writeFileSync(wavPath, wav);
|
||||
this.log(`[prompt-cache] beep WAV generated for "${id}"`);
|
||||
}
|
||||
|
||||
const encoded = await encodePcmFrames(pcm, sampleRate, this.log);
|
||||
if (!encoded) {
|
||||
this.log(`[prompt-cache] beep encoding failed for "${id}"`);
|
||||
return null;
|
||||
}
|
||||
|
||||
const actualDuration = encoded.g722Frames.length * 20;
|
||||
const prompt: ICachedPrompt = {
|
||||
id,
|
||||
g722Frames: encoded.g722Frames,
|
||||
opusFrames: encoded.opusFrames,
|
||||
durationMs: actualDuration,
|
||||
};
|
||||
|
||||
this.prompts.set(id, prompt);
|
||||
this.log(`[prompt-cache] beep "${id}" cached: ${actualDuration}ms @ ${freqHz}Hz`);
|
||||
return prompt;
|
||||
return this.registerWav(id, wavPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove a prompt from the cache.
|
||||
*/
|
||||
/** Remove a prompt from the cache. */
|
||||
remove(id: string): void {
|
||||
this.prompts.delete(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear all cached prompts.
|
||||
*/
|
||||
/** Clear all cached prompts. */
|
||||
clear(): void {
|
||||
this.prompts.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Standalone playback helpers (for use by SystemLeg)
|
||||
// ---------------------------------------------------------------------------
|
||||
// -------------------------------------------------------------------------
|
||||
// Internal
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Play a cached prompt's G.722 frames as RTP packets at 20ms intervals.
|
||||
*
|
||||
* @param prompt - the cached prompt to play
|
||||
* @param sendPacket - function to send a raw RTP packet (12-byte header + payload)
|
||||
* @param ssrc - SSRC for RTP headers
|
||||
* @param onDone - called when playback finishes
|
||||
* @returns cancel function, or null if prompt has no G.722 frames
|
||||
*/
|
||||
export function playPromptG722(
|
||||
prompt: ICachedPrompt,
|
||||
sendPacket: (pkt: Buffer) => void,
|
||||
ssrc: number,
|
||||
onDone?: () => void,
|
||||
): (() => void) | null {
|
||||
if (prompt.g722Frames.length === 0) {
|
||||
onDone?.();
|
||||
return null;
|
||||
private registerWav(id: string, wavPath: string): ICachedPrompt {
|
||||
const durationMs = getWavDurationMs(wavPath);
|
||||
const prompt: ICachedPrompt = { id, wavPath, durationMs };
|
||||
this.prompts.set(id, prompt);
|
||||
this.log(`[prompt-cache] cached "${id}": ${wavPath} (${(durationMs / 1000).toFixed(1)}s)`);
|
||||
return prompt;
|
||||
}
|
||||
|
||||
const frames = prompt.g722Frames;
|
||||
const PT = 9;
|
||||
let frameIdx = 0;
|
||||
let seq = Math.floor(Math.random() * 0xffff);
|
||||
let rtpTs = Math.floor(Math.random() * 0xffffffff);
|
||||
|
||||
const timer = setInterval(() => {
|
||||
if (frameIdx >= frames.length) {
|
||||
clearInterval(timer);
|
||||
onDone?.();
|
||||
return;
|
||||
}
|
||||
|
||||
const payload = frames[frameIdx];
|
||||
const hdr = buildRtpHeader(PT, seq & 0xffff, rtpTs >>> 0, ssrc >>> 0, frameIdx === 0);
|
||||
const pkt = Buffer.concat([hdr, payload]);
|
||||
sendPacket(pkt);
|
||||
|
||||
seq++;
|
||||
rtpTs += rtpClockIncrement(PT);
|
||||
frameIdx++;
|
||||
}, 20);
|
||||
|
||||
return () => clearInterval(timer);
|
||||
}
|
||||
|
||||
/**
 * Play a cached prompt's Opus frames as RTP packets at 20ms intervals.
 *
 * The first packet is sent on the first timer tick (about 20 ms after the
 * call) and carries the marker bit. `onDone` fires on the tick after the last
 * frame has been sent. Invoking the returned cancel function stops the timer
 * without calling `onDone`.
 *
 * @param prompt - the cached prompt to play
 * @param sendPacket - function to send a raw RTP packet (12-byte header + payload)
 * @param ssrc - SSRC for RTP headers
 * @param counters - shared seq/ts counters (mutated in place for seamless transitions)
 * @param onDone - called when playback finishes
 * @returns cancel function, or null if prompt has no Opus frames (in which
 *          case `onDone` is invoked synchronously)
 */
export function playPromptOpus(
  prompt: ICachedPrompt,
  sendPacket: (pkt: Buffer) => void,
  ssrc: number,
  counters: { seq: number; ts: number },
  onDone?: () => void,
): (() => void) | null {
  const frames = prompt.opusFrames;
  if (frames.length === 0) {
    onDone?.();
    return null;
  }

  const PT = 111;
  let frameIdx = 0;

  const timer = setInterval(() => {
    if (frameIdx >= frames.length) {
      clearInterval(timer);
      onDone?.();
      return;
    }

    // Counters are free-running; they are masked to wire width only here.
    const hdr = buildRtpHeader(PT, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, frameIdx === 0);
    sendPacket(Buffer.concat([hdr, frames[frameIdx]]));

    counters.seq++;
    counters.ts += 960; // Opus 48kHz: 960 samples per 20ms
    frameIdx++;
  }, 20);

  return () => clearInterval(timer);
}
|
||||
|
||||
Reference in New Issue
Block a user