feat(proxy-engine): integrate Kokoro TTS generation into proxy-engine and simplify TypeScript prompt handling to use cached WAV files

This commit is contained in:
2026-04-10 15:21:44 +00:00
parent c9ae747c95
commit 66112091a2
18 changed files with 340 additions and 1202 deletions

View File

@@ -3,6 +3,6 @@
*/
export const commitinfo = {
name: 'siprouter',
version: '1.15.0',
version: '1.16.0',
description: 'undefined'
}

View File

@@ -1,59 +1,22 @@
/**
 * TTS announcement module — generates announcement WAV files at startup.
 *
 * Engine priority: espeak-ng (formant TTS, fast) → Kokoro neural TTS via
 * proxy-engine → disabled.
 *
 * The generated WAV is left on disk for Rust's audio_player / start_interaction
 * to play during calls. No encoding or RTP playback happens in TypeScript.
 */
import { execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { Buffer } from 'node:buffer';
import { encodePcm, isCodecReady } from './opusbridge.ts';
/** Number of RTP timestamp units one 20ms frame advances for a payload type. */
function rtpClockIncrement(pt: number): number {
  switch (pt) {
    case 111:
      return 960; // Opus runs on a 48kHz RTP clock: 48000 * 0.02
    case 9:
      return 160; // G.722 uses an 8kHz RTP clock by RFC 3551 convention
    default:
      return 160; // PCMU/PCMA and anything else: 8kHz clock
  }
}
/** Construct a 12-byte RTP v2 header (no padding, no extension, zero CSRCs). */
function buildRtpHeader(pt: number, seq: number, ts: number, ssrc: number, marker: boolean): Buffer {
  const header = Buffer.alloc(12);
  header.writeUInt8(0x80, 0); // version = 2
  header.writeUInt8((pt & 0x7f) | (marker ? 0x80 : 0x00), 1);
  header.writeUInt16BE(seq & 0xffff, 2);
  header.writeUInt32BE(ts >>> 0, 4);
  header.writeUInt32BE(ssrc >>> 0, 8);
  return header;
}
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
/** A pre-encoded announcement ready for RTP playback. */
export interface IAnnouncementCache {
/** G.722 encoded frames (each is a 20ms frame payload, no RTP header). */
g722Frames: Buffer[];
/** Opus encoded frames for WebRTC playback (20ms each, no RTP header). */
opusFrames: Buffer[];
/** Total duration in milliseconds (frame count × 20ms). */
durationMs: number;
}
import { sendProxyCommand, isProxyReady } from './proxybridge.ts';
// ---------------------------------------------------------------------------
// State
// ---------------------------------------------------------------------------
let cachedAnnouncement: IAnnouncementCache | null = null;
const TTS_DIR = path.join(process.cwd(), '.nogit', 'tts');
const ANNOUNCEMENT_TEXT = "Hello. I'm connecting your call now.";
const CACHE_WAV = path.join(TTS_DIR, 'announcement.wav');
@@ -64,12 +27,10 @@ const KOKORO_VOICES = 'voices.bin';
const KOKORO_VOICE = 'af_bella';
// ---------------------------------------------------------------------------
// Initialization
// TTS generators
// ---------------------------------------------------------------------------
/**
* Check if espeak-ng is available on the system.
*/
/** Check if espeak-ng is available on the system. */
function isEspeakAvailable(): boolean {
try {
execSync('which espeak-ng', { stdio: 'pipe' });
@@ -79,10 +40,7 @@ function isEspeakAvailable(): boolean {
}
}
/**
* Generate announcement WAV via espeak-ng (primary engine).
* Returns true on success.
*/
/** Generate announcement WAV via espeak-ng (primary engine). */
function generateViaEspeak(wavPath: string, text: string, log: (msg: string) => void): boolean {
log('[tts] generating announcement audio via espeak-ng...');
try {
@@ -98,11 +56,8 @@ function generateViaEspeak(wavPath: string, text: string, log: (msg: string) =>
}
}
/**
* Generate announcement WAV via Kokoro TTS (fallback engine).
* Returns true on success.
*/
function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): boolean {
/** Generate announcement WAV via Kokoro TTS (fallback, runs inside proxy-engine). */
async function generateViaKokoro(wavPath: string, text: string, log: (msg: string) => void): Promise<boolean> {
const modelPath = path.join(TTS_DIR, KOKORO_MODEL);
const voicesPath = path.join(TTS_DIR, KOKORO_VOICES);
@@ -111,25 +66,21 @@ function generateViaKokoro(wavPath: string, text: string, log: (msg: string) =>
return false;
}
const root = process.cwd();
const ttsBinPaths = [
path.join(root, 'dist_rust', 'tts-engine'),
path.join(root, 'rust', 'target', 'release', 'tts-engine'),
path.join(root, 'rust', 'target', 'debug', 'tts-engine'),
];
const ttsBin = ttsBinPaths.find((p) => fs.existsSync(p));
if (!ttsBin) {
log('[tts] tts-engine binary not found — Kokoro fallback unavailable');
if (!isProxyReady()) {
log('[tts] proxy-engine not ready — Kokoro fallback unavailable');
return false;
}
log('[tts] generating announcement audio via Kokoro TTS (fallback)...');
try {
execSync(
`"${ttsBin}" --model "${modelPath}" --voices "${voicesPath}" --voice "${KOKORO_VOICE}" --output "${wavPath}" --text "${text}"`,
{ timeout: 120000, stdio: 'pipe' },
);
log('[tts] Kokoro WAV generated');
await sendProxyCommand('generate_tts', {
model: modelPath,
voices: voicesPath,
voice: KOKORO_VOICE,
text,
output: wavPath,
});
log('[tts] Kokoro WAV generated (via proxy-engine)');
return true;
} catch (e: any) {
log(`[tts] Kokoro failed: ${e.message}`);
@@ -137,40 +88,13 @@ function generateViaKokoro(wavPath: string, text: string, log: (msg: string) =>
}
}
/**
 * Read a WAV file and detect its sample rate from the fmt chunk.
 *
 * Walks the RIFF chunk list (honoring odd-size word padding) and returns the
 * raw `data` chunk bytes plus the sample rate, or null on any parse failure.
 *
 * @param wavPath - path to a RIFF/WAVE file on disk
 * @returns { pcm, sampleRate } or null if the file is not a parseable WAV
 */
function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null {
  const wav = fs.readFileSync(wavPath);
  if (wav.length < 44) return null;
  if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
  if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;
  let sampleRate = 22050; // default when no fmt chunk is found
  let offset = 12;
  let pcm: Buffer | null = null;
  while (offset < wav.length - 8) {
    const chunkId = wav.toString('ascii', offset, offset + 4);
    const chunkSize = wav.readUInt32LE(offset + 4);
    if (chunkId === 'fmt ') {
      // Sample rate sits at byte 4 of the fmt chunk body (offset + 12).
      // Bounds-guard so a truncated chunk yields null instead of a
      // RangeError from readUInt32LE — this function promises null on failure.
      if (offset + 16 > wav.length) return null;
      sampleRate = wav.readUInt32LE(offset + 12);
    }
    if (chunkId === 'data') {
      // subarray clamps to the buffer end if the declared size overruns.
      pcm = wav.subarray(offset + 8, offset + 8 + chunkSize);
    }
    offset += 8 + chunkSize;
    if (offset % 2 !== 0) offset++; // RIFF chunks are word-aligned
  }
  if (!pcm) return null;
  return { pcm, sampleRate };
}
// ---------------------------------------------------------------------------
// Initialization
// ---------------------------------------------------------------------------
/**
* Pre-generate the announcement audio and encode to G.722 + Opus frames.
* Must be called after the codec bridge is initialized.
* Pre-generate the announcement WAV file.
* Must be called after the proxy engine is initialized.
*
* Engine priority: espeak-ng → Kokoro → disabled.
*/
@@ -178,7 +102,6 @@ export async function initAnnouncement(log: (msg: string) => void): Promise<bool
fs.mkdirSync(TTS_DIR, { recursive: true });
try {
// Generate WAV if not cached.
if (!fs.existsSync(CACHE_WAV)) {
let generated = false;
@@ -189,9 +112,9 @@ export async function initAnnouncement(log: (msg: string) => void): Promise<bool
log('[tts] espeak-ng not installed — trying Kokoro fallback');
}
// Fall back to Kokoro.
// Fall back to Kokoro (via proxy-engine).
if (!generated) {
generated = generateViaKokoro(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
generated = await generateViaKokoro(CACHE_WAV, ANNOUNCEMENT_TEXT, log);
}
if (!generated) {
@@ -200,49 +123,7 @@ export async function initAnnouncement(log: (msg: string) => void): Promise<bool
}
}
// Read WAV and extract raw PCM + sample rate.
const result = readWavWithRate(CACHE_WAV);
if (!result) {
log('[tts] failed to parse WAV file');
return false;
}
const { pcm, sampleRate } = result;
// Wait for codec bridge to be ready.
if (!isCodecReady()) {
log('[tts] codec bridge not ready — will retry');
return false;
}
// Encode in 20ms chunks. The Rust encoder resamples to each codec's native rate.
const FRAME_SAMPLES = Math.floor(sampleRate * 0.02);
const FRAME_BYTES = FRAME_SAMPLES * 2; // 16-bit = 2 bytes per sample
const totalFrames = Math.floor(pcm.length / FRAME_BYTES);
const g722Frames: Buffer[] = [];
const opusFrames: Buffer[] = [];
log(`[tts] encoding ${totalFrames} frames (${FRAME_SAMPLES} samples/frame @ ${sampleRate}Hz)...`);
for (let i = 0; i < totalFrames; i++) {
const framePcm = pcm.subarray(i * FRAME_BYTES, (i + 1) * FRAME_BYTES);
const pcmBuf = Buffer.from(framePcm);
const [g722, opus] = await Promise.all([
encodePcm(pcmBuf, sampleRate, 9), // G.722 for SIP devices
encodePcm(pcmBuf, sampleRate, 111), // Opus for WebRTC browsers
]);
if (g722) g722Frames.push(g722);
if (opus) opusFrames.push(opus);
if (!g722 && !opus && i < 3) log(`[tts] frame ${i} encode failed`);
}
cachedAnnouncement = {
g722Frames,
opusFrames,
durationMs: totalFrames * 20,
};
log(`[tts] announcement cached: ${g722Frames.length} frames (${(totalFrames * 20 / 1000).toFixed(1)}s)`);
log('[tts] announcement WAV ready');
return true;
} catch (e: any) {
log(`[tts] init error: ${e.message}`);
@@ -250,100 +131,7 @@ export async function initAnnouncement(log: (msg: string) => void): Promise<bool
}
}
// ---------------------------------------------------------------------------
// Playback
// ---------------------------------------------------------------------------
/**
* Play the pre-cached announcement to an RTP endpoint.
*
* @param sendPacket - function to send a raw RTP packet
* @param ssrc - SSRC to use in RTP headers
* @param onDone - called when the announcement finishes
* @returns a cancel function, or null if no announcement is cached
*/
export function playAnnouncement(
sendPacket: (pkt: Buffer) => void,
ssrc: number,
onDone?: () => void,
): (() => void) | null {
if (!cachedAnnouncement || cachedAnnouncement.g722Frames.length === 0) {
onDone?.();
return null;
}
const frames = cachedAnnouncement.g722Frames;
const PT = 9; // G.722
let frameIdx = 0;
let seq = Math.floor(Math.random() * 0xffff);
let rtpTs = Math.floor(Math.random() * 0xffffffff);
const timer = setInterval(() => {
if (frameIdx >= frames.length) {
clearInterval(timer);
onDone?.();
return;
}
const payload = frames[frameIdx];
const hdr = buildRtpHeader(PT, seq & 0xffff, rtpTs >>> 0, ssrc >>> 0, frameIdx === 0);
const pkt = Buffer.concat([hdr, payload]);
sendPacket(pkt);
seq++;
rtpTs += rtpClockIncrement(PT);
frameIdx++;
}, 20);
// Return cancel function.
return () => clearInterval(timer);
/** Get the path to the cached announcement WAV, or null if not generated. */
export function getAnnouncementWavPath(): string | null {
  if (fs.existsSync(CACHE_WAV)) {
    return CACHE_WAV;
  }
  return null;
}
/**
 * Play pre-cached Opus announcement to a WebRTC PeerConnection sender.
 *
 * @param sendRtpPacket - function to send a raw RTP packet via sender.sendRtp()
 * @param ssrc - SSRC to stamp into each RTP header
 * @param counters - shared seq/ts counters, mutated in place
 * @param onDone - called when announcement finishes (or immediately if none cached)
 * @returns cancel function, or null if no announcement cached
 */
export function playAnnouncementToWebRtc(
  sendRtpPacket: (pkt: Buffer) => void,
  ssrc: number,
  counters: { seq: number; ts: number },
  onDone?: () => void,
): (() => void) | null {
  const cache = cachedAnnouncement;
  if (!cache || cache.opusFrames.length === 0) {
    onDone?.();
    return null;
  }
  const opusFrames = cache.opusFrames;
  const OPUS_PT = 111;
  let next = 0;
  const ticker = setInterval(() => {
    if (next >= opusFrames.length) {
      clearInterval(ticker);
      onDone?.();
      return;
    }
    // Marker bit set only on the first packet of the talkspurt.
    const header = buildRtpHeader(OPUS_PT, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, next === 0);
    sendRtpPacket(Buffer.concat([header, opusFrames[next]]));
    counters.seq++;
    counters.ts += 960; // Opus at 48kHz: 960 samples per 20ms
    next++;
  }, 20);
  return () => clearInterval(ticker);
}
/** Check if an announcement is cached and ready. */
export function isAnnouncementReady(): boolean {
  const cache = cachedAnnouncement;
  return cache !== null && cache.g722Frames.length > 0;
}

View File

@@ -1,55 +1,31 @@
/**
 * PromptCache — manages named audio prompt WAV files for IVR and voicemail.
 *
 * Generates WAV files via espeak-ng (primary) or Kokoro TTS through the
 * proxy-engine (fallback). Also supports loading pre-existing WAV files
 * and programmatic tone generation.
 *
 * All audio playback happens in Rust (audio_player / start_interaction).
 * This module only manages WAV files on disk.
 */
import { execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';
import { Buffer } from 'node:buffer';
import { encodePcm, isCodecReady } from '../opusbridge.ts';
/** RTP timestamp units consumed by one 20ms frame of the given payload type. */
function rtpClockIncrement(pt: number): number {
  // 111 = Opus (48kHz RTP clock → 960/frame); all other PTs use an 8kHz clock.
  return pt === 111 ? 960 : 160;
}
/** Assemble a 12-byte RTP header (V=2, P=X=CC=0). */
function buildRtpHeader(pt: number, seq: number, ts: number, ssrc: number, marker: boolean): Buffer {
  const versionByte = 0x80; // version 2, no padding/extension/CSRC
  const markerAndPt = (marker ? 0x80 : 0) | (pt & 0x7f);
  const out = Buffer.alloc(12);
  out[0] = versionByte;
  out[1] = markerAndPt;
  out.writeUInt16BE(seq & 0xffff, 2);
  out.writeUInt32BE(ts >>> 0, 4);
  out.writeUInt32BE(ssrc >>> 0, 8);
  return out;
}
import { sendProxyCommand, isProxyReady } from '../proxybridge.ts';
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
/** A cached prompt ready for playback. */
/* NOTE(review): this shape carries both the legacy pre-encoded frame fields
   and the new wavPath/durationMs metadata — confirm which fields callers
   actually still populate/read before removing either set. */
export interface ICachedPrompt {
/** Unique prompt identifier. */
id: string;
/** G.722 encoded frames (20ms each, no RTP header). */
g722Frames: Buffer[];
/** Opus encoded frames (20ms each, no RTP header). */
opusFrames: Buffer[];
/** Path to the WAV file on disk. */
wavPath: string;
/** Total duration in milliseconds (approximate, from WAV header). */
durationMs: number;
}
@@ -82,84 +58,61 @@ function generateViaEspeak(wavPath: string, text: string): boolean {
}
}
/** Generate WAV via Kokoro TTS. */
function generateViaKokoro(wavPath: string, text: string, voice: string): boolean {
/** Generate WAV via Kokoro TTS (runs inside proxy-engine). */
async function generateViaKokoro(wavPath: string, text: string, voice: string): Promise<boolean> {
const modelPath = path.join(TTS_DIR, 'kokoro-v1.0.onnx');
const voicesPath = path.join(TTS_DIR, 'voices.bin');
if (!fs.existsSync(modelPath) || !fs.existsSync(voicesPath)) return false;
const root = process.cwd();
const ttsBin = [
path.join(root, 'dist_rust', 'tts-engine'),
path.join(root, 'rust', 'target', 'release', 'tts-engine'),
path.join(root, 'rust', 'target', 'debug', 'tts-engine'),
].find((p) => fs.existsSync(p));
if (!ttsBin) return false;
if (!isProxyReady()) return false;
try {
execSync(
`"${ttsBin}" --model "${modelPath}" --voices "${voicesPath}" --voice "${voice}" --output "${wavPath}" --text "${text}"`,
{ timeout: 120000, stdio: 'pipe' },
);
await sendProxyCommand('generate_tts', {
model: modelPath,
voices: voicesPath,
voice,
text,
output: wavPath,
});
return true;
} catch {
return false;
}
}
/** Read a WAV file and return raw PCM + sample rate. */
function readWavWithRate(wavPath: string): { pcm: Buffer; sampleRate: number } | null {
const wav = fs.readFileSync(wavPath);
if (wav.length < 44) return null;
if (wav.toString('ascii', 0, 4) !== 'RIFF') return null;
if (wav.toString('ascii', 8, 12) !== 'WAVE') return null;
/** Read a WAV file's duration from its header. */
function getWavDurationMs(wavPath: string): number {
try {
const wav = fs.readFileSync(wavPath);
if (wav.length < 44) return 0;
if (wav.toString('ascii', 0, 4) !== 'RIFF') return 0;
let sampleRate = 22050;
let pcm: Buffer | null = null;
let offset = 12;
let sampleRate = 16000;
let dataSize = 0;
let bitsPerSample = 16;
let channels = 1;
let offset = 12;
while (offset < wav.length - 8) {
const chunkId = wav.toString('ascii', offset, offset + 4);
const chunkSize = wav.readUInt32LE(offset + 4);
if (chunkId === 'fmt ') {
sampleRate = wav.readUInt32LE(offset + 12);
while (offset < wav.length - 8) {
const chunkId = wav.toString('ascii', offset, offset + 4);
const chunkSize = wav.readUInt32LE(offset + 4);
if (chunkId === 'fmt ') {
channels = wav.readUInt16LE(offset + 10);
sampleRate = wav.readUInt32LE(offset + 12);
bitsPerSample = wav.readUInt16LE(offset + 22);
}
if (chunkId === 'data') {
dataSize = chunkSize;
}
offset += 8 + chunkSize;
if (offset % 2 !== 0) offset++;
}
if (chunkId === 'data') {
pcm = wav.subarray(offset + 8, offset + 8 + chunkSize);
}
offset += 8 + chunkSize;
if (offset % 2 !== 0) offset++;
const bytesPerSample = (bitsPerSample / 8) * channels;
const totalSamples = bytesPerSample > 0 ? dataSize / bytesPerSample : 0;
return sampleRate > 0 ? Math.round((totalSamples / sampleRate) * 1000) : 0;
} catch {
return 0;
}
return pcm ? { pcm, sampleRate } : null;
}
/** Slice raw 16-bit PCM into 20ms frames and encode each to G.722 + Opus. */
async function encodePcmFrames(
  pcm: Buffer,
  sampleRate: number,
  log: (msg: string) => void,
): Promise<{ g722Frames: Buffer[]; opusFrames: Buffer[] } | null> {
  if (!isCodecReady()) return null;
  const samplesPerFrame = Math.floor(sampleRate * 0.02); // 20ms of audio
  const bytesPerFrame = samplesPerFrame * 2; // 16-bit mono PCM
  const frameCount = Math.floor(pcm.length / bytesPerFrame);
  const g722Frames: Buffer[] = [];
  const opusFrames: Buffer[] = [];
  for (let frame = 0; frame < frameCount; frame++) {
    const slice = Buffer.from(pcm.subarray(frame * bytesPerFrame, (frame + 1) * bytesPerFrame));
    const [encodedG722, encodedOpus] = await Promise.all([
      encodePcm(slice, sampleRate, 9), // G.722
      encodePcm(slice, sampleRate, 111), // Opus
    ]);
    if (encodedG722) g722Frames.push(encodedG722);
    if (encodedOpus) opusFrames.push(encodedOpus);
  }
  return { g722Frames, opusFrames };
}
// ---------------------------------------------------------------------------
@@ -195,7 +148,7 @@ export class PromptCache {
}
/**
* Generate a TTS prompt and cache it.
* Generate a TTS prompt WAV and cache its path.
* Uses espeak-ng (primary) or Kokoro (fallback).
*/
async generatePrompt(id: string, text: string, voice = 'af_bella'): Promise<ICachedPrompt | null> {
@@ -207,14 +160,14 @@ export class PromptCache {
this.espeakAvailable = isEspeakAvailable();
}
// Generate WAV.
let generated = false;
// Generate WAV if not already on disk.
if (!fs.existsSync(wavPath)) {
let generated = false;
if (this.espeakAvailable) {
generated = generateViaEspeak(wavPath, text);
}
if (!generated) {
generated = generateViaKokoro(wavPath, text, voice);
generated = await generateViaKokoro(wavPath, text, voice);
}
if (!generated) {
this.log(`[prompt-cache] failed to generate TTS for "${id}"`);
@@ -223,49 +176,22 @@ export class PromptCache {
this.log(`[prompt-cache] generated WAV for "${id}"`);
}
return this.loadWavPrompt(id, wavPath);
return this.registerWav(id, wavPath);
}
/**
* Load a WAV file as a prompt and cache it.
* Load a pre-existing WAV file as a prompt.
*/
async loadWavPrompt(id: string, wavPath: string): Promise<ICachedPrompt | null> {
if (!fs.existsSync(wavPath)) {
this.log(`[prompt-cache] WAV not found: ${wavPath}`);
return null;
}
const result = readWavWithRate(wavPath);
if (!result) {
this.log(`[prompt-cache] failed to parse WAV: ${wavPath}`);
return null;
}
const encoded = await encodePcmFrames(result.pcm, result.sampleRate, this.log);
if (!encoded) {
this.log(`[prompt-cache] encoding failed for "${id}" (codec bridge not ready?)`);
return null;
}
const durationMs = encoded.g722Frames.length * 20;
const prompt: ICachedPrompt = {
id,
g722Frames: encoded.g722Frames,
opusFrames: encoded.opusFrames,
durationMs,
};
this.prompts.set(id, prompt);
this.log(`[prompt-cache] cached "${id}": ${encoded.g722Frames.length} frames (${(durationMs / 1000).toFixed(1)}s)`);
return prompt;
return this.registerWav(id, wavPath);
}
/**
* Generate a beep tone prompt (sine wave).
* @param id - prompt ID
* @param freqHz - tone frequency (default 1000 Hz)
* @param durationMs - tone duration (default 500ms)
* @param amplitude - 16-bit amplitude (default 8000)
* Generate a beep tone WAV and cache it.
*/
async generateBeep(
id: string,
@@ -273,149 +199,77 @@ export class PromptCache {
durationMs = 500,
amplitude = 8000,
): Promise<ICachedPrompt | null> {
// Generate at 16kHz for decent quality.
const sampleRate = 16000;
const totalSamples = Math.floor((sampleRate * durationMs) / 1000);
const pcm = Buffer.alloc(totalSamples * 2);
fs.mkdirSync(TTS_DIR, { recursive: true });
const wavPath = path.join(TTS_DIR, `prompt-${id}.wav`);
for (let i = 0; i < totalSamples; i++) {
const t = i / sampleRate;
// Apply a short fade-in/fade-out to avoid click artifacts.
const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade
let envelope = 1.0;
if (i < fadeLen) envelope = i / fadeLen;
else if (i > totalSamples - fadeLen) envelope = (totalSamples - i) / fadeLen;
if (!fs.existsSync(wavPath)) {
// Generate 16kHz 16-bit mono sine wave WAV.
const sampleRate = 16000;
const totalSamples = Math.floor((sampleRate * durationMs) / 1000);
const pcm = Buffer.alloc(totalSamples * 2);
const sample = Math.round(Math.sin(2 * Math.PI * freqHz * t) * amplitude * envelope);
pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2);
for (let i = 0; i < totalSamples; i++) {
const t = i / sampleRate;
const fadeLen = Math.floor(sampleRate * 0.01); // 10ms fade
let envelope = 1.0;
if (i < fadeLen) envelope = i / fadeLen;
else if (i > totalSamples - fadeLen) envelope = (totalSamples - i) / fadeLen;
const sample = Math.round(Math.sin(2 * Math.PI * freqHz * t) * amplitude * envelope);
pcm.writeInt16LE(Math.max(-32768, Math.min(32767, sample)), i * 2);
}
// Write WAV file.
const headerSize = 44;
const dataSize = pcm.length;
const wav = Buffer.alloc(headerSize + dataSize);
// RIFF header
wav.write('RIFF', 0);
wav.writeUInt32LE(36 + dataSize, 4);
wav.write('WAVE', 8);
// fmt chunk
wav.write('fmt ', 12);
wav.writeUInt32LE(16, 16); // chunk size
wav.writeUInt16LE(1, 20); // PCM format
wav.writeUInt16LE(1, 22); // mono
wav.writeUInt32LE(sampleRate, 24);
wav.writeUInt32LE(sampleRate * 2, 28); // byte rate
wav.writeUInt16LE(2, 32); // block align
wav.writeUInt16LE(16, 34); // bits per sample
// data chunk
wav.write('data', 36);
wav.writeUInt32LE(dataSize, 40);
pcm.copy(wav, 44);
fs.writeFileSync(wavPath, wav);
this.log(`[prompt-cache] beep WAV generated for "${id}"`);
}
const encoded = await encodePcmFrames(pcm, sampleRate, this.log);
if (!encoded) {
this.log(`[prompt-cache] beep encoding failed for "${id}"`);
return null;
}
const actualDuration = encoded.g722Frames.length * 20;
const prompt: ICachedPrompt = {
id,
g722Frames: encoded.g722Frames,
opusFrames: encoded.opusFrames,
durationMs: actualDuration,
};
this.prompts.set(id, prompt);
this.log(`[prompt-cache] beep "${id}" cached: ${actualDuration}ms @ ${freqHz}Hz`);
return prompt;
return this.registerWav(id, wavPath);
}
/**
* Remove a prompt from the cache.
*/
/** Remove a prompt from the cache. Removing an unknown id is a no-op (Map.delete semantics). */
remove(id: string): void {
this.prompts.delete(id);
}
/**
* Clear all cached prompts.
*/
/** Clear all cached prompts. Only empties the in-memory map; WAV files on disk are untouched. */
clear(): void {
this.prompts.clear();
}
}
// ---------------------------------------------------------------------------
// Standalone playback helpers (for use by SystemLeg)
// ---------------------------------------------------------------------------
// -------------------------------------------------------------------------
// Internal
// -------------------------------------------------------------------------
/**
* Play a cached prompt's G.722 frames as RTP packets at 20ms intervals.
*
* @param prompt - the cached prompt to play
* @param sendPacket - function to send a raw RTP packet (12-byte header + payload)
* @param ssrc - SSRC for RTP headers
* @param onDone - called when playback finishes
* @returns cancel function, or null if prompt has no G.722 frames
*/
export function playPromptG722(
prompt: ICachedPrompt,
sendPacket: (pkt: Buffer) => void,
ssrc: number,
onDone?: () => void,
): (() => void) | null {
if (prompt.g722Frames.length === 0) {
onDone?.();
return null;
/** Record a WAV file in the cache under `id`, reading its duration from the header. */
private registerWav(id: string, wavPath: string): ICachedPrompt {
  // Duration is informational only; actual playback happens in Rust.
  const entry: ICachedPrompt = { id, wavPath, durationMs: getWavDurationMs(wavPath) };
  this.prompts.set(id, entry);
  this.log(`[prompt-cache] cached "${id}": ${wavPath} (${(entry.durationMs / 1000).toFixed(1)}s)`);
  return entry;
}
const frames = prompt.g722Frames;
const PT = 9;
let frameIdx = 0;
let seq = Math.floor(Math.random() * 0xffff);
let rtpTs = Math.floor(Math.random() * 0xffffffff);
const timer = setInterval(() => {
if (frameIdx >= frames.length) {
clearInterval(timer);
onDone?.();
return;
}
const payload = frames[frameIdx];
const hdr = buildRtpHeader(PT, seq & 0xffff, rtpTs >>> 0, ssrc >>> 0, frameIdx === 0);
const pkt = Buffer.concat([hdr, payload]);
sendPacket(pkt);
seq++;
rtpTs += rtpClockIncrement(PT);
frameIdx++;
}, 20);
return () => clearInterval(timer);
}
/**
 * Play a cached prompt's Opus frames as RTP packets at 20ms intervals.
 *
 * @param prompt - the cached prompt to play
 * @param sendPacket - function to send a raw RTP packet
 * @param ssrc - SSRC for RTP headers
 * @param counters - shared seq/ts counters (mutated in place for seamless transitions)
 * @param onDone - called when playback finishes
 * @returns cancel function, or null if prompt has no Opus frames
 */
export function playPromptOpus(
  prompt: ICachedPrompt,
  sendPacket: (pkt: Buffer) => void,
  ssrc: number,
  counters: { seq: number; ts: number },
  onDone?: () => void,
): (() => void) | null {
  const opusFrames = prompt.opusFrames;
  if (opusFrames.length === 0) {
    onDone?.();
    return null;
  }
  const OPUS_PT = 111;
  let cursor = 0;
  const interval = setInterval(() => {
    if (cursor >= opusFrames.length) {
      clearInterval(interval);
      onDone?.();
      return;
    }
    // Marker bit only on the first packet of the prompt.
    const hdr = buildRtpHeader(OPUS_PT, counters.seq & 0xffff, counters.ts >>> 0, ssrc >>> 0, cursor === 0);
    sendPacket(Buffer.concat([hdr, opusFrames[cursor]]));
    counters.seq++;
    counters.ts += 960; // Opus 48kHz: 960 samples per 20ms
    cursor++;
  }, 20);
  return () => clearInterval(interval);
}

View File

@@ -1,199 +0,0 @@
/**
* Audio transcoding bridge — uses smartrust to communicate with the Rust
* opus-codec binary, which handles Opus ↔ G.722 ↔ PCMU/PCMA transcoding.
*
* All codec conversion happens in Rust (libopus + SpanDSP G.722 port).
* The TypeScript side just passes raw payloads back and forth.
*/
import path from 'node:path';
import { RustBridge } from '@push.rocks/smartrust';
// ---------------------------------------------------------------------------
// Command type map for smartrust
// ---------------------------------------------------------------------------
/** Wire protocol of the Rust opus-codec process, keyed by command name. */
type TCodecCommands = {
/** One-time codec initialization handshake; no payload either way. */
init: {
params: Record<string, never>;
result: Record<string, never>;
};
/** Create an isolated per-call codec session identified by session_id. */
create_session: {
params: { session_id: string };
result: Record<string, never>;
};
/** Tear down a session previously created with create_session. */
destroy_session: {
params: { session_id: string };
result: Record<string, never>;
};
/** Transcode a base64 RTP payload between payload types (optionally per-session). */
transcode: {
params: { data_b64: string; from_pt: number; to_pt: number; session_id?: string; direction?: string };
result: { data_b64: string };
};
/** Encode base64 raw 16-bit PCM at sample_rate to the target payload type. */
encode_pcm: {
params: { data_b64: string; sample_rate: number; to_pt: number; session_id?: string };
result: { data_b64: string };
};
};
// ---------------------------------------------------------------------------
// Bridge singleton
// ---------------------------------------------------------------------------
// Singleton handle to the spawned Rust opus-codec process (null when not running).
let bridge: RustBridge<TCodecCommands> | null = null;
// True only after a successful 'init' round-trip with the Rust process.
let initialized = false;
/** Candidate filesystem locations for the opus-codec binary, in priority order. */
function buildLocalPaths(): string[] {
  const root = process.cwd();
  // Packaged build first, then release and debug cargo outputs.
  const candidates = [
    ['dist_rust', 'opus-codec'],
    ['rust', 'target', 'release', 'opus-codec'],
    ['rust', 'target', 'debug', 'opus-codec'],
  ];
  return candidates.map((parts) => path.join(root, ...parts));
}
// Last logger handed to initCodecBridge; reused by the bridge 'exit' handler
// and by helpers that run after init (createSession, encodePcm).
let logFn: ((msg: string) => void) | undefined;
/**
 * Initialize the audio transcoding bridge. Spawns the Rust binary.
 *
 * Idempotent: returns true immediately when a bridge is already up.
 * On any failure the module state is reset to null/false so a later call
 * can retry cleanly.
 *
 * @param log - optional diagnostic sink; also stored in logFn for the exit handler
 * @returns true when the opus-codec process spawned and answered 'init'
 */
export async function initCodecBridge(log?: (msg: string) => void): Promise<boolean> {
if (initialized && bridge) return true;
// Stash the logger before any async work so the 'exit' handler can use it
// even after this call returns.
logFn = log;
try {
bridge = new RustBridge<TCodecCommands>({
binaryName: 'opus-codec',
localPaths: buildLocalPaths(),
});
const spawned = await bridge.spawn();
if (!spawned) {
log?.('[codec] failed to spawn opus-codec binary');
bridge = null;
return false;
}
// Auto-restart: reset state when the Rust process exits so the next
// transcode attempt triggers re-initialization instead of silent failure.
bridge.on('exit', () => {
logFn?.('[codec] Rust audio transcoder process exited — will re-init on next use');
bridge = null;
initialized = false;
});
await bridge.sendCommand('init', {} as any);
initialized = true;
log?.('[codec] Rust audio transcoder initialized (Opus + G.722 + PCMU/PCMA)');
return true;
} catch (e: any) {
log?.(`[codec] init error: ${e.message}`);
bridge = null;
return false;
}
}
// ---------------------------------------------------------------------------
// Session management — per-call codec isolation
// ---------------------------------------------------------------------------
/**
 * Create an isolated codec session. Each session gets its own Opus/G.722
 * encoder/decoder state, preventing concurrent calls from corrupting each
 * other's stateful codec predictions.
 */
export async function createSession(sessionId: string): Promise<boolean> {
  const alive = bridge !== null && initialized;
  if (!alive) {
    // Bridge died or never started — attempt to bring it back up first.
    const revived = await initCodecBridge(logFn);
    if (!revived) return false;
  }
  try {
    await bridge!.sendCommand('create_session', { session_id: sessionId });
    return true;
  } catch (e: any) {
    logFn?.(`[codec] create_session error: ${e?.message || e}`);
    return false;
  }
}
/**
 * Destroy a codec session, freeing its encoder/decoder state.
 */
export async function destroySession(sessionId: string): Promise<void> {
  if (bridge === null || !initialized) return;
  try {
    await bridge.sendCommand('destroy_session', { session_id: sessionId });
  } catch {
    // Best-effort cleanup — a dead bridge means there is nothing left to free.
  }
}
// ---------------------------------------------------------------------------
// Transcoding
// ---------------------------------------------------------------------------
/**
 * Transcode an RTP payload between two codecs.
 * All codec work (Opus, G.722, PCMU, PCMA) + resampling happens in Rust.
 *
 * @param data - raw RTP payload (no header)
 * @param fromPT - source payload type (0=PCMU, 8=PCMA, 9=G.722, 111=Opus)
 * @param toPT - target payload type
 * @param sessionId - optional session for isolated codec state
 * @param direction - optional direction hint forwarded to the Rust side
 * @returns transcoded payload, or null on failure
 */
export async function transcode(data: Buffer, fromPT: number, toPT: number, sessionId?: string, direction?: string): Promise<Buffer | null> {
  if (!bridge || !initialized) return null;
  const params: any = {
    data_b64: data.toString('base64'),
    from_pt: fromPT,
    to_pt: toPT,
  };
  if (sessionId) params.session_id = sessionId;
  if (direction) params.direction = direction;
  try {
    const { data_b64 } = await bridge.sendCommand('transcode', params);
    return Buffer.from(data_b64, 'base64');
  } catch {
    // Failed transcodes are dropped silently; callers treat null as "skip frame".
    return null;
  }
}
/**
* Encode raw 16-bit PCM to a target codec.
* @param pcmData - raw 16-bit LE PCM bytes
* @param sampleRate - input sample rate (e.g. 22050 for Piper TTS)
* @param toPT - target payload type (9=G.722, 111=Opus, 0=PCMU, 8=PCMA)
* @param sessionId - optional session for isolated codec state
*/
export async function encodePcm(pcmData: Buffer, sampleRate: number, toPT: number, sessionId?: string): Promise<Buffer | null> {
if (!bridge || !initialized) return null;
try {
const params: any = {
data_b64: pcmData.toString('base64'),
sample_rate: sampleRate,
to_pt: toPT,
};
if (sessionId) params.session_id = sessionId;
const result = await bridge.sendCommand('encode_pcm', params);
return Buffer.from(result.data_b64, 'base64');
} catch (e: any) {
console.error('[encodePcm] error:', e?.message || e);
return null;
}
}
/** Check if the codec bridge is ready. */
export function isCodecReady(): boolean {
  return bridge !== null && initialized;
}
/** Shut down the codec bridge. */
export function shutdownCodecBridge(): void {
  if (!bridge) return;
  try {
    bridge.kill();
  } catch {
    // ignore — the process may already be gone
  }
  bridge = null;
  initialized = false;
}

View File

@@ -79,6 +79,10 @@ type TProxyCommands = {
params: { call_id: string; leg_id: string; key: string; value: unknown };
result: Record<string, never>;
};
generate_tts: {
params: { model: string; voices: string; voice: string; text: string; output: string };
result: { output: string };
};
};
// ---------------------------------------------------------------------------
@@ -493,6 +497,15 @@ export function isProxyReady(): boolean {
return initialized && bridge !== null;
}
/** Forward a typed command to the proxy-engine bridge, throwing if it is down. */
export async function sendProxyCommand<K extends keyof TProxyCommands>(
  method: K,
  params: TProxyCommands[K]['params'],
): Promise<TProxyCommands[K]['result']> {
  if (bridge === null || !initialized) {
    throw new Error('proxy engine not initialized');
  }
  const result = await bridge.sendCommand(method as string, params as any);
  return result as TProxyCommands[K]['result'];
}
/** Shut down the proxy engine. */
export function shutdownProxyEngine(): void {
if (bridge) {

View File

@@ -24,7 +24,6 @@ import {
getAllBrowserDeviceIds,
getBrowserDeviceWs,
} from './webrtcbridge.ts';
import { initCodecBridge } from './opusbridge.ts';
import { initAnnouncement } from './announcement.ts';
import { PromptCache } from './call/prompt-cache.ts';
import { VoiceboxManager } from './voicebox.ts';
@@ -523,9 +522,8 @@ async function startProxyEngine(): Promise<void> {
const deviceList = appConfig.devices.map((d) => d.displayName).join(', ');
log(`proxy engine started | LAN ${appConfig.proxy.lanIp}:${appConfig.proxy.lanPort} | providers: ${providerList} | devices: ${deviceList}`);
// Initialize audio codec bridge (still needed for WebRTC transcoding).
// Generate TTS audio (WAV files on disk, played by Rust audio_player).
try {
await initCodecBridge(log);
await initAnnouncement(log);
// Pre-generate prompts.
@@ -547,7 +545,7 @@ async function startProxyEngine(): Promise<void> {
}
log(`[startup] prompts cached: ${promptCache.listIds().join(', ') || 'none'}`);
} catch (e) {
log(`[codec] init failed: ${e}`);
log(`[tts] init failed: ${e}`);
}
}