fix(proxy-engine): improve inbound SIP routing diagnostics and enrich leg media state reporting

This commit is contained in:
2026-04-14 20:19:34 +00:00
parent 0d82a626b5
commit 88768f0586
46 changed files with 555689 additions and 107 deletions

View File

@@ -1,5 +1,14 @@
# Changelog
## 2026-04-14 - 1.25.2 - fix(proxy-engine)
improve inbound SIP routing diagnostics and enrich leg media state reporting
- Extract inbound called numbers from DID-related SIP headers when the request URI contains a provider account username.
- Emit detailed sip_unhandled diagnostics for inbound route misses, missing devices, and RTP allocation failures.
- Include codec, RTP port, remote media, and metadata in leg state change events and preserve those fields in runtime status/history views.
- Match hostname-based providers against resolved inbound source IPs to accept provider traffic sent from resolved addresses.
- Invalidate cached TTS WAV metadata across engine restarts and vendor the kokoro-tts crate via a local patch.
## 2026-04-14 - 1.25.1 - fix(proxy-engine)
respect explicit inbound route targets and store voicemail in the configured mailbox

2
rust/Cargo.lock generated
View File

@@ -1733,8 +1733,6 @@ dependencies = [
[[package]]
name = "kokoro-tts"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68e5d46e20a28fa5fd313d9ffcf4bbcf41570e64841d3944c832eef6b98d208b"
dependencies = [
"bincode 2.0.1",
"cc",

View File

@@ -9,3 +9,6 @@ resolver = "2"
[profile.release]
opt-level = 3
lto = true
[patch.crates-io]
kokoro-tts = { path = "vendor/kokoro-tts" }

View File

@@ -5,7 +5,7 @@
//! The mixer provides mix-minus audio to all participants.
use crate::call::{Call, CallDirection, CallState, LegId, LegInfo, LegKind, LegState};
use crate::config::{normalize_routing_identity, AppConfig, ProviderConfig};
use crate::config::{extract_inbound_called_number, normalize_routing_identity, AppConfig, ProviderConfig};
use crate::ipc::{emit_event, OutTx};
use crate::leg_io::{create_leg_channels, spawn_sip_inbound, spawn_sip_outbound};
use crate::mixer::spawn_mixer;
@@ -25,6 +25,32 @@ use std::sync::Arc;
use tokio::net::UdpSocket;
use tokio::sync::Mutex;
/// Publish a `sip_unhandled` diagnostic event for an inbound INVITE that
/// could not be serviced (route miss, missing device, RTP exhaustion, ...).
///
/// `label` names the failure site; routing identities plus the raw R-URI,
/// `To`, and `P-Called-Party-ID` header values are folded into a single
/// summary string so the consumer can log it verbatim.
fn emit_inbound_diagnostic(
    out_tx: &OutTx,
    label: &str,
    invite: &SipMessage,
    from_addr: SocketAddr,
    provider_id: &str,
    called_number: &str,
    caller_number: &str,
) {
    // One-line summary keeps the raw header values available for
    // post-mortem inspection of provider routing quirks.
    let summary = format!(
        "INVITE {label} provider={provider_id} called={called_number} caller={caller_number} ruri={} to={} pcalled={}",
        invite.request_uri().unwrap_or(""),
        invite.get_header("To").unwrap_or(""),
        invite.get_header("P-Called-Party-ID").unwrap_or(""),
    );
    let payload = serde_json::json!({
        "method_or_status": summary,
        "call_id": invite.call_id(),
        "from_addr": from_addr.ip().to_string(),
        "from_port": from_addr.port(),
    });
    emit_event(out_tx, "sip_unhandled", payload);
}
/// Result of creating an inbound call — carries both the call id and
/// whether browsers should be notified (flows from the matched inbound
/// route's `ring_browsers` flag).
@@ -35,7 +61,17 @@ pub struct InboundCallCreated {
/// Emit a `leg_added` event with full leg information.
/// Free function (not a method) to avoid `&self` borrow conflicts when `self.calls` is borrowed.
fn emit_leg_added_event(tx: &OutTx, call_id: &str, leg: &LegInfo) {
/// Human-readable codec name for an RTP payload type.
///
/// Maps the static payload types 0/8/9 and the conventional dynamic Opus
/// payload type 111 to names; anything else falls back to `PT<n>`.
fn codec_label(codec_pt: u8) -> String {
    let known = match codec_pt {
        0 => Some("PCMU"),
        8 => Some("PCMA"),
        9 => Some("G.722"),
        111 => Some("Opus"),
        _ => None,
    };
    match known {
        Some(name) => name.to_string(),
        None => format!("PT{codec_pt}"),
    }
}
fn leg_metadata_json(leg: &LegInfo) -> serde_json::Value {
let metadata: serde_json::Value = if leg.metadata.is_empty() {
serde_json::json!({})
} else {
@@ -46,22 +82,35 @@ fn emit_leg_added_event(tx: &OutTx, call_id: &str, leg: &LegInfo) {
.collect(),
)
};
metadata
}
/// Serialize one leg into the JSON shape shared by `leg_added` and
/// `leg_state_changed` events, including codec/RTP/remote-media details so
/// runtime status and history views can mirror them.
fn leg_event_payload(call_id: &str, leg: &LegInfo) -> serde_json::Value {
    // "ip:port" of the far media endpoint, when one has been learned.
    let remote = leg
        .remote_media
        .map(|addr| format!("{}:{}", addr.ip(), addr.port()));
    serde_json::json!({
        "call_id": call_id,
        "leg_id": leg.id,
        "kind": leg.kind.as_str(),
        "state": leg.state.as_str(),
        "codec": codec_label(leg.codec_pt),
        "rtpPort": leg.rtp_port,
        "remoteMedia": remote,
        "metadata": leg_metadata_json(leg),
    })
}
fn emit_leg_added_event(tx: &OutTx, call_id: &str, leg: &LegInfo) {
emit_event(
tx,
"leg_added",
serde_json::json!({
"call_id": call_id,
"leg_id": leg.id,
"kind": leg.kind.as_str(),
"state": leg.state.as_str(),
"codec": sip_proto::helpers::codec_name(leg.codec_pt),
"rtpPort": leg.rtp_port,
"remoteMedia": leg.remote_media.map(|a| format!("{}:{}", a.ip(), a.port())),
"metadata": metadata,
}),
leg_event_payload(call_id, leg),
);
}
/// Emit a `leg_state_changed` event carrying the leg's full current
/// snapshot (state, codec, RTP port, remote media, metadata).
fn emit_leg_state_changed_event(tx: &OutTx, call_id: &str, leg: &LegInfo) {
    let payload = leg_event_payload(call_id, leg);
    emit_event(tx, "leg_state_changed", payload);
}
pub struct CallManager {
/// All active calls, keyed by internal call ID.
pub calls: HashMap<String, Call>,
@@ -232,11 +281,11 @@ impl CallManager {
"call_ringing",
serde_json::json!({ "call_id": call_id }),
);
emit_event(
&self.out_tx,
"leg_state_changed",
serde_json::json!({ "call_id": call_id, "leg_id": leg_id, "state": "ringing" }),
);
if let Some(call) = self.calls.get(call_id) {
if let Some(leg) = call.legs.get(leg_id) {
emit_leg_state_changed_event(&self.out_tx, call_id, leg);
}
}
}
SipLegAction::ConnectedWithAck(ack_buf) => {
let _ = socket.send_to(&ack_buf, target).await;
@@ -248,6 +297,7 @@ impl CallManager {
let sip_leg = leg.sip_leg.as_ref().unwrap();
let remote = sip_leg.remote_media;
leg.state = LegState::Connected;
leg.codec_pt = sip_pt;
leg.remote_media = remote;
call.state = CallState::Connected;
remote
@@ -298,8 +348,17 @@ impl CallManager {
dev_rtp_socket,
dev_remote,
dev_leg_id,
)) = device_leg_info
)) = device_leg_info
{
// Use the device's preferred codec from its INVITE SDP,
// not the provider's negotiated codec.
let dev_pt = device_invite
.has_sdp_body()
.then(|| parse_sdp_endpoint(&device_invite.body))
.flatten()
.and_then(|ep| ep.codec_pt)
.unwrap_or(sip_pt);
// Build SDP pointing device to our device_rtp port.
// Use LAN IP for the device (it's on the local network).
let call_ref = self.calls.get(call_id).unwrap();
@@ -336,23 +395,16 @@ impl CallManager {
if let Some(call) = self.calls.get_mut(call_id) {
if let Some(dev_leg) = call.legs.get_mut(&dev_leg_id) {
dev_leg.state = LegState::Connected;
dev_leg.codec_pt = dev_pt;
}
}
if let Some(call) = self.calls.get(call_id) {
if let Some(dev_leg) = call.legs.get(&dev_leg_id) {
emit_leg_state_changed_event(&self.out_tx, call_id, dev_leg);
}
}
emit_event(
&self.out_tx,
"leg_state_changed",
serde_json::json!({ "call_id": call_id, "leg_id": dev_leg_id, "state": "connected" }),
);
// Wire device leg to mixer.
// Use the device's preferred codec from its INVITE SDP,
// not the provider's negotiated codec.
let dev_pt = device_invite
.has_sdp_body()
.then(|| parse_sdp_endpoint(&device_invite.body))
.flatten()
.and_then(|ep| ep.codec_pt)
.unwrap_or(sip_pt);
if let Some(dev_remote_addr) = dev_remote {
let dev_channels = create_leg_channels();
spawn_sip_inbound(dev_rtp_socket.clone(), dev_channels.inbound_tx);
@@ -385,11 +437,11 @@ impl CallManager {
"sip_pt": sip_pt,
}),
);
emit_event(
&self.out_tx,
"leg_state_changed",
serde_json::json!({ "call_id": call_id, "leg_id": leg_id, "state": "connected" }),
);
if let Some(call) = self.calls.get(call_id) {
if let Some(leg) = call.legs.get(leg_id) {
emit_leg_state_changed_event(&self.out_tx, call_id, leg);
}
}
}
SipLegAction::Terminated(reason) => {
let duration = self
@@ -436,11 +488,11 @@ impl CallManager {
leg.state = LegState::Terminated;
}
}
emit_event(
&self.out_tx,
"leg_state_changed",
serde_json::json!({ "call_id": call_id, "leg_id": leg_id, "state": "terminated" }),
);
if let Some(call) = self.calls.get(call_id) {
if let Some(leg) = call.legs.get(leg_id) {
emit_leg_state_changed_event(&self.out_tx, call_id, leg);
}
}
emit_event(
&self.out_tx,
"call_ended",
@@ -684,11 +736,9 @@ impl CallManager {
if let Some(leg) = call.legs.get_mut(this_leg_id) {
leg.state = LegState::Ringing;
}
emit_event(
&self.out_tx,
"leg_state_changed",
serde_json::json!({ "call_id": call_id, "leg_id": this_leg_id, "state": "ringing" }),
);
if let Some(leg) = call.legs.get(this_leg_id) {
emit_leg_state_changed_event(&self.out_tx, call_id, leg);
}
} else if code >= 200 && code < 300 {
let mut needs_wiring = false;
if let Some(leg) = call.legs.get_mut(this_leg_id) {
@@ -708,11 +758,9 @@ impl CallManager {
needs_wiring = true;
}
emit_event(
&self.out_tx,
"leg_state_changed",
serde_json::json!({ "call_id": call_id, "leg_id": this_leg_id, "state": "connected" }),
);
if let Some(leg) = call.legs.get(this_leg_id) {
emit_leg_state_changed_event(&self.out_tx, call_id, leg);
}
if call.state != CallState::Connected {
call.state = CallState::Connected;
@@ -811,7 +859,7 @@ impl CallManager {
// Extract caller/callee info.
let from_header = invite.get_header("From").unwrap_or("");
let caller_number = normalize_routing_identity(from_header);
let called_number = normalize_routing_identity(invite.request_uri().unwrap_or(""));
let called_number = extract_inbound_called_number(invite);
// Resolve via the configured inbound routing table. The matched route
// is the source of truth for which external numbers this provider is
@@ -826,6 +874,15 @@ impl CallManager {
{
Some(route) => route,
None => {
emit_inbound_diagnostic(
&self.out_tx,
"route_miss",
invite,
from_addr,
provider_id,
&called_number,
&caller_number,
);
let resp = SipMessage::create_response(404, "Not Found", invite, None);
let _ = socket.send_to(&resp.serialize(), from_addr).await;
return None;
@@ -940,6 +997,15 @@ impl CallManager {
let provider_rtp = match rtp_pool.allocate().await {
Some(a) => a,
None => {
emit_inbound_diagnostic(
&self.out_tx,
"provider_rtp_unavailable",
invite,
from_addr,
provider_id,
&called_number,
&caller_number,
);
let resp = SipMessage::create_response(503, "Service Unavailable", invite, None);
let _ = socket.send_to(&resp.serialize(), from_addr).await;
return None;
@@ -948,6 +1014,15 @@ impl CallManager {
let device_rtp = match rtp_pool.allocate().await {
Some(a) => a,
None => {
emit_inbound_diagnostic(
&self.out_tx,
"device_rtp_unavailable",
invite,
from_addr,
provider_id,
&called_number,
&caller_number,
);
let resp = SipMessage::create_response(503, "Service Unavailable", invite, None);
let _ = socket.send_to(&resp.serialize(), from_addr).await;
return None;
@@ -1707,11 +1782,7 @@ impl CallManager {
}
}
leg.state = LegState::Terminated;
emit_event(
&self.out_tx,
"leg_state_changed",
serde_json::json!({ "call_id": call_id, "leg_id": leg.id, "state": "terminated" }),
);
emit_leg_state_changed_event(&self.out_tx, call_id, leg);
}
emit_event(
@@ -1760,6 +1831,16 @@ impl CallManager {
let rtp_alloc = match rtp_pool.allocate().await {
Some(a) => a,
None => {
let called_number = extract_inbound_called_number(invite);
emit_inbound_diagnostic(
&self.out_tx,
"voicemail_rtp_unavailable",
invite,
from_addr,
provider_id,
&called_number,
caller_number,
);
let resp = SipMessage::create_response(503, "Service Unavailable", invite, None);
let _ = socket.send_to(&resp.serialize(), from_addr).await;
return None;
@@ -1901,6 +1982,16 @@ impl CallManager {
let rtp_alloc = match rtp_pool.allocate().await {
Some(a) => a,
None => {
let called_number = extract_inbound_called_number(invite);
emit_inbound_diagnostic(
&self.out_tx,
"ivr_rtp_unavailable",
invite,
from_addr,
provider_id,
&called_number,
caller_number,
);
let resp = SipMessage::create_response(503, "Service Unavailable", invite, None);
let _ = socket.send_to(&resp.serialize(), from_addr).await;
return None;

View File

@@ -273,6 +273,38 @@ pub fn normalize_routing_identity(value: &str) -> String {
digits
}
/// Heuristic: does `value` look like a dialable phone number?
///
/// True when the string consists solely of ASCII digits and `+` signs and
/// carries at least six digits — enough to reject short account usernames
/// like `2830573e1` while accepting DIDs with or without a `+` prefix.
fn looks_like_phone_identity(value: &str) -> bool {
    let mut digit_count = 0usize;
    for ch in value.chars() {
        if ch.is_ascii_digit() {
            digit_count += 1;
        } else if ch != '+' {
            // Any character other than a digit or '+' disqualifies it.
            return false;
        }
    }
    digit_count >= 6
}
/// Pick the best inbound called-number identity from common SIP headers.
///
/// Some providers deliver the DID in `To` / `P-Called-Party-ID` while the
/// request URI contains an account username. Prefer a phone-like identity
/// when present; otherwise fall back to the request URI user part.
pub fn extract_inbound_called_number(msg: &SipMessage) -> String {
    // DID-bearing headers, scanned in decreasing order of specificity.
    const DID_HEADERS: [&str; 5] = [
        "P-Called-Party-ID",
        "X-Called-Party-ID",
        "Diversion",
        "History-Info",
        "To",
    ];

    // The request URI wins outright when it already looks dialable.
    let from_ruri = normalize_routing_identity(msg.request_uri().unwrap_or(""));
    if looks_like_phone_identity(&from_ruri) {
        return from_ruri;
    }

    // Otherwise take the first phone-like header value; if none qualifies,
    // fall back to the (non-phone-like) request URI identity.
    DID_HEADERS
        .iter()
        .map(|name| normalize_routing_identity(msg.get_header(name).unwrap_or("")))
        .find(|candidate| looks_like_phone_identity(candidate))
        .unwrap_or(from_ruri)
}
fn parse_numeric_range_value(value: &str) -> Option<(bool, &str)> {
let trimmed = value.trim();
if trimmed.is_empty() {
@@ -636,6 +668,20 @@ mod tests {
assert!(!support.ring_browsers);
}
#[test]
fn extract_inbound_called_number_prefers_did_headers_over_username_ruri() {
    // The request URI carries a provider account username; the DID is only
    // present in the To header, so the To value must win.
    let packet = b"INVITE sip:2830573e1@proxy.example SIP/2.0\r\nTo: <sip:+4942116767548@proxy.example>\r\nFrom: <sip:+491701234567@provider.example>;tag=abc\r\nCall-ID: test-1\r\nCSeq: 1 INVITE\r\nContent-Length: 0\r\n\r\n";
    let invite = SipMessage::parse(packet).expect("invite should parse");
    let called = extract_inbound_called_number(&invite);
    assert_eq!(called, "+4942116767548");
}
#[test]
fn extract_inbound_called_number_keeps_phone_ruri_when_already_present() {
    // The request URI is already a dialable number — no header scan should
    // override it, even though To holds a non-phone username.
    let packet = b"INVITE sip:042116767548@proxy.example SIP/2.0\r\nTo: <sip:2830573e1@proxy.example>\r\nFrom: <sip:+491701234567@provider.example>;tag=abc\r\nCall-ID: test-2\r\nCSeq: 1 INVITE\r\nContent-Length: 0\r\n\r\n";
    let invite = SipMessage::parse(packet).expect("invite should parse");
    assert_eq!(extract_inbound_called_number(&invite), "042116767548");
}
#[test]
fn matches_pattern_supports_numeric_ranges() {
assert!(matches_pattern(

View File

@@ -25,7 +25,7 @@ mod voicemail;
mod webrtc_engine;
use crate::call_manager::CallManager;
use crate::config::{normalize_routing_identity, AppConfig};
use crate::config::{extract_inbound_called_number, normalize_routing_identity, AppConfig};
use crate::ipc::{emit_event, respond_err, respond_ok, Command, OutTx};
use crate::provider::ProviderManager;
use crate::registrar::Registrar;
@@ -346,7 +346,7 @@ async fn handle_sip_packet(
// Emit event so TypeScript knows about the call (for dashboard, IVR routing, etc).
let from_header = msg.get_header("From").unwrap_or("");
let from_uri = normalize_routing_identity(from_header);
let called_number = normalize_routing_identity(msg.request_uri().unwrap_or(""));
let called_number = extract_inbound_called_number(&msg);
emit_event(
&eng.out_tx,
@@ -369,6 +369,20 @@ async fn handle_sip_packet(
let dialed_number = normalize_routing_identity(msg.request_uri().unwrap_or(""));
let device = eng.registrar.find_by_address(&from_addr);
if device.is_none() {
emit_event(
&eng.out_tx,
"sip_unhandled",
serde_json::json!({
"method_or_status": "INVITE",
"call_id": msg.call_id(),
"from_addr": from_addr.ip().to_string(),
"from_port": from_addr.port(),
"is_from_provider": false,
}),
);
return;
}
let device_id = device.map(|d| d.device_id.clone());
// Find provider via routing rules.

View File

@@ -313,6 +313,23 @@ impl ProviderManager {
if ps.config.outbound_proxy.address == addr.ip().to_string() {
return Some(ps_arc.clone());
}
// Hostname-based providers (e.g. sipgate.de) often deliver inbound
// INVITEs from resolved IPs rather than the literal configured host.
// Resolve the proxy host and accept any matching IP/port variant.
use std::net::ToSocketAddrs;
if let Ok(resolved) = format!(
"{}:{}",
ps.config.outbound_proxy.address, ps.config.outbound_proxy.port
)
.to_socket_addrs()
{
for resolved_addr in resolved {
if resolved_addr == *addr || resolved_addr.ip() == addr.ip() {
return Some(ps_arc.clone());
}
}
}
}
None
}

View File

@@ -13,6 +13,7 @@ use crate::audio_player::pcm_to_mix_frames;
use kokoro_tts::{KokoroTts, Voice};
use std::path::Path;
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};
use tokio::sync::{mpsc, watch};
pub const DEFAULT_MODEL_PATH: &str = ".nogit/tts/kokoro-v1.0.onnx";
@@ -47,6 +48,10 @@ pub struct TtsEngine {
/// Path that was used to load the current model (for cache invalidation).
loaded_model_path: String,
loaded_voices_path: String,
/// On-disk TTS WAVs are cacheable only within a single engine lifetime.
/// Every restart gets a new generation token, so prior process outputs are
/// treated as stale and regenerated on first use.
cache_generation: String,
}
impl TtsEngine {
@@ -55,6 +60,10 @@ impl TtsEngine {
tts: None,
loaded_model_path: String::new(),
loaded_voices_path: String::new(),
cache_generation: SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_nanos().to_string())
.unwrap_or_else(|_| "0".to_string()),
}
}
@@ -228,7 +237,7 @@ impl TtsEngine {
return false;
}
match std::fs::read_to_string(&meta_path) {
Ok(contents) => contents == Self::cache_key(text, voice),
Ok(contents) => contents == self.cache_key(text, voice),
Err(_) => false,
}
}
@@ -236,12 +245,12 @@ impl TtsEngine {
/// Write the sidecar `.meta` file next to the WAV.
fn write_cache_meta(&self, output_path: &str, text: &str, voice: &str) {
let meta_path = format!("{output_path}.meta");
let _ = std::fs::write(&meta_path, Self::cache_key(text, voice));
let _ = std::fs::write(&meta_path, self.cache_key(text, voice));
}
/// Build the cache key from text + voice.
fn cache_key(text: &str, voice: &str) -> String {
format!("{}\0{}", text, voice)
/// Build the cache key for a TTS WAV from the per-process generation
/// token, the synthesized text, and the voice name. The components are
/// NUL-joined so no field can collide with a neighboring one; including
/// the generation token invalidates on-disk WAVs across engine restarts.
fn cache_key(&self, text: &str, voice: &str) -> String {
    let parts = [self.cache_generation.as_str(), text, voice];
    parts.join("\0")
}
}

1
rust/vendor/kokoro-tts/.cargo-ok vendored Normal file
View File

@@ -0,0 +1 @@
{"v":1}

View File

@@ -0,0 +1,7 @@
{
"git": {
"sha1": "dfa3eda5e8c3f23f8b4c5d504acaebd6e7a45020",
"dirty": true
},
"path_in_vcs": ""
}

View File

@@ -0,0 +1,35 @@
# CI for the vendored kokoro-tts crate: build and test on the three
# desktop platforms for every push / pull request targeting master.
name: Rust
on:
push:
branches: [ "master" ]
pull_request:
branches: [ "master" ]
env:
CARGO_TERM_COLOR: always
jobs:
build:
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
# Ubuntu-only dependency install (libasound2-dev = ALSA development files)
- name: Setup Ubuntu dependencies
if: matrix.os == 'ubuntu-latest'
run: |
sudo apt-get update
sudo apt install libasound2-dev
# Build the project
- name: Build
run: cargo build -vv
# Run the tests
- name: Run tests
run: cargo test --workspace -vv

5
rust/vendor/kokoro-tts/.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
*.bin
*.onnx
Cargo.lock
/target
.idea

116
rust/vendor/kokoro-tts/Cargo.toml vendored Normal file
View File

@@ -0,0 +1,116 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2024"
name = "kokoro-tts"
version = "0.3.2"
build = "build.rs"
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "用于Rust的轻量级AI离线语音合成器Kokoro TTS可轻松交叉编译到移动端"
readme = "README.md"
keywords = [
"TTS",
"Offline",
"Lite",
"AI",
"Synthesizer",
]
license = "Apache-2.0"
repository = "https://github.com/mzdk100/kokoro.git"
[features]
use-cmudict = ["cmudict-fast"]
[lib]
name = "kokoro_tts"
path = "src/lib.rs"
[[example]]
name = "synth_directly_v10"
path = "examples/synth_directly_v10.rs"
[[example]]
name = "synth_directly_v11"
path = "examples/synth_directly_v11.rs"
[[example]]
name = "synth_stream"
path = "examples/synth_stream.rs"
[dependencies.bincode]
version = "2.0"
[dependencies.chinese-number]
version = "0.7.8"
features = [
"number-to-chinese",
"chinese-to-number",
]
default-features = false
[dependencies.cmudict-fast]
version = "0.8.0"
optional = true
[dependencies.futures]
version = "0.3.31"
[dependencies.jieba-rs]
version = "0.8.1"
[dependencies.log]
version = "0.4.29"
[dependencies.ndarray]
version = "0.17.2"
[dependencies.ort]
version = "2.0.0-rc.11"
[dependencies.pin-project]
version = "1.1.10"
[dependencies.pinyin]
version = "0.11.0"
[dependencies.rand]
version = "0.10.0-rc.7"
[dependencies.regex]
version = "1.12.2"
[dependencies.tokio]
version = "1.49.0"
features = [
"fs",
"rt-multi-thread",
"time",
"sync",
]
[dev-dependencies.anyhow]
version = "1.0.100"
[dev-dependencies.tokio]
version = "1.49.0"
features = ["macros"]
[dev-dependencies.voxudio]
version = "0.5.7"
features = ["device"]
[build-dependencies.cc]
version = "1.2.53"

35
rust/vendor/kokoro-tts/Cargo.toml.orig generated vendored Normal file
View File

@@ -0,0 +1,35 @@
[package]
name = "kokoro-tts"
description = "用于Rust的轻量级AI离线语音合成器Kokoro TTS可轻松交叉编译到移动端"
version = "0.3.2"
edition = "2024"
keywords = ["TTS", "Offline", "Lite", "AI", "Synthesizer"]
license = "Apache-2.0"
repository = "https://github.com/mzdk100/kokoro.git"
readme = "README.md"
[features]
use-cmudict = ["cmudict-fast"]
[dependencies]
bincode = "2.0"
chinese-number = { version = "0.7.8",default-features = false,features = ["number-to-chinese", "chinese-to-number"] }
cmudict-fast = { version = "0.8.0", optional = true }
futures = "0.3.31"
jieba-rs = "0.8.1"
log = "0.4.29"
ndarray = "0.17.2"
ort = "2.0.0-rc.11"
pin-project = "1.1.10"
pinyin = "0.11.0"
rand="0.10.0-rc.7"
regex = "1.12.2"
tokio = { version = "1.49.0",features = ["fs", "rt-multi-thread","time", "sync"] }
[dev-dependencies]
anyhow = "1.0.100"
tokio = {version = "1.49.0",features = ["macros"]}
voxudio = { version = "0.5.7",features = ["device"] }
[build-dependencies]
cc = "1.2.53"

201
rust/vendor/kokoro-tts/LICENSE vendored Normal file
View File

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

59
rust/vendor/kokoro-tts/README.md vendored Normal file
View File

@@ -0,0 +1,59 @@
# Kokoro TTS的rust推理实现
[Kokoro](https://github.com/hexgrad/kokoro)
> **Kokoro**是一个具有8200万参数的开放权重TTS模型
> 尽管体系结构轻巧但它的质量与大型模型相当同时更快更具成本效益凭借Apache许可的权重可以将Kokoro部署到从生产环境到个人项目的任何地方
## 概述
本项目包含几个示例脚本展示了如何使用Kokoro库进行语音合成这些示例展示了如何直接合成语音以及如何通过流式合成来处理更长的文本
## 前置条件
- Rust编程语言
- Tokio异步运行时
- Rodio音频处理和播放的库可选
- 下载模型资源在这里可以找到[1.0模型](https://github.com/mzdk100/kokoro/releases/tag/V1.0)和[1.1模型](https://github.com/mzdk100/kokoro/releases/tag/V1.1)
## 特点
- 跨平台可以轻松在Windows、Mac OS上构建也可以轻松交叉编译到安卓和iOS。
- 离线推理,不依赖网络。
- 足够轻量级有不同尺寸的模型可以选择最小的模型仅88M
- 发音人多样化,跨越多国语言。
## 使用方法
1. 运行示例,克隆或下载本项目到本地。在项目根目录下运行:
```shell
cargo run --example synth_directly_v10
cargo run --example synth_directly_v11
```
2. 集成到自己的项目中:
```shell
cargo add kokoro-tts
```
3. Linux依赖项
```shell
sudo apt install libasound2-dev
```
参考[examples](examples)文件夹中的示例代码进行开发。
## 许可证
本项目采用Apache-2.0许可证。请查看项目中的LICENSE文件了解更多信息。
## 注意
- 请确保在运行示例之前已经正确加载了模型和语音数据。
- 示例中的语音合成参数(如语音名称、文本内容、速度等)仅作为示例,实际使用时请根据需要进行调整。
## 贡献
如果您有任何改进意见或想要贡献代码请随时提交Pull Request或创建Issue。
## 免责声明
本项目中的示例代码仅用于演示目的。在使用本项目中的代码时,请确保遵守相关法律法规和社会主义核心价值观。开发者不对因使用本项目中的代码而导致的任何后果负责。

5
rust/vendor/kokoro-tts/build.rs vendored Normal file
View File

@@ -0,0 +1,5 @@
fn main() {
const SRC: &str = "src/transcription/en_ipa.c";
cc::Build::new().file(SRC).compile("es");
println!("cargo:rerun-if-changed={}", SRC);
}

135010
rust/vendor/kokoro-tts/dict/cmudict.dict vendored Normal file

File diff suppressed because it is too large Load Diff

BIN
rust/vendor/kokoro-tts/dict/espeak.dict vendored Normal file

Binary file not shown.

411980
rust/vendor/kokoro-tts/dict/pinyin.dict vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,21 @@
use {
kokoro_tts::{KokoroTts, Voice},
voxudio::AudioPlayer,
};
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let tts = KokoroTts::new("kokoro-v1.0.int8.onnx", "voices.bin").await?;
let (audio, took) = tts
.synth(
"Hello, world!你好我们是一群追逐梦想的人。我正在使用qq。",
Voice::ZfXiaoxiao(1.2),
)
.await?;
println!("Synth took: {:?}", took);
let mut player = AudioPlayer::new()?;
player.play()?;
player.write::<24000>(&audio, 1).await?;
Ok(())
}

View File

@@ -0,0 +1,21 @@
use {
kokoro_tts::{KokoroTts, Voice},
voxudio::AudioPlayer,
};
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let tts = KokoroTts::new("kokoro-v1.1-zh.onnx", "voices-v1.1-zh.bin").await?;
let (audio, took) = tts
.synth(
"Hello, world!你好我们是一群追逐梦想的人。我正在使用qq。",
Voice::Zm045(1),
)
.await?;
println!("Synth took: {:?}", took);
let mut player = AudioPlayer::new()?;
player.play()?;
player.write::<24000>(&audio, 1).await?;
Ok(())
}

View File

@@ -0,0 +1,51 @@
use {
futures::StreamExt,
kokoro_tts::{KokoroTts, Voice},
voxudio::AudioPlayer,
};
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let tts = KokoroTts::new("kokoro-v1.1-zh.onnx", "voices-v1.1-zh.bin").await?;
let (mut sink, mut stream) = tts.stream(Voice::Zm098(1));
sink.synth("hello world.").await?;
sink.synth("你好,我们是一群追逐梦想的人。").await?;
sink.set_voice(Voice::Zf032(2));
sink.synth("我正在使用qq。").await?;
sink.set_voice(Voice::Zf090(3));
sink.synth("今天天气如何?").await?;
sink.set_voice(Voice::Zm045(1));
sink.synth("你在使用Rust编程语言吗").await?;
sink.set_voice(Voice::Zf039(1));
sink.synth(
"你轻轻地走过那
在风雨花丛中
每一点一滴带走
是我醒来的梦
是在那天空上
最美丽的云朵
在那彩虹 最温柔的风",
)
.await?;
sink.set_voice(Voice::Zf088(1));
sink.synth(
"你静静看着我们
最不舍的面容
像流星划过夜空
转瞬即逝的梦
是最深情的脸 在这一瞬间
在遥远天边
",
)
.await?;
drop(sink);
let mut player = AudioPlayer::new()?;
player.play()?;
while let Some((audio, took)) = stream.next().await {
player.write::<24000>(&audio, 1).await?;
println!("Synth took: {:?}", took);
}
Ok(())
}

514
rust/vendor/kokoro-tts/g2p.py vendored Normal file
View File

@@ -0,0 +1,514 @@
import re
from typing import List, Optional, Tuple
from jieba import posseg, cut_for_search
from pypinyin import lazy_pinyin, load_phrases_dict, Style
from dataclasses import dataclass
@dataclass
class MToken:
    # One g2p token produced from a jieba segment.
    # tag: jieba part-of-speech tag ('x'/'eng' mark non-Chinese tokens).
    tag: str
    # whitespace: text to emit after this token's phonemes (may be '/' as a
    # word separator, or literal spaces carried over from the input).
    whitespace: str
    # phonemes: the converted phoneme string; None until/unless conversion ran.
    phonemes: Optional[str] = None
ZH_MAP = {"b":"","p":"","m":"","f":"","d":"","t":"","n":"","l":"","g":"","k":"","h":"","j":"","q":"","x":"","zh":"","ch":"","sh":"","r":"","z":"","c":"","s":"","a":"","o":"","e":"","ie":"","ai":"","ei":"","ao":"","ou":"","an":"","en":"","ang":"","eng":"","er":"","i":"","u":"","v":"","ii":"","iii":"","ve":"","ia":"","ian":"","iang":"","iao":"","in":"","ing":"","iong":"","iou":"","ong":"","ua":"","uai":"","uan":"","uang":"","uei":"","uen":"","ueng":"","uo":"","van":"","vn":""}
for p in ';:,.!?/—…"()“” 12345R':
assert p not in ZH_MAP, p
ZH_MAP[p] = p
unk = ''
punc = frozenset(';:,.!?—…"()“”')
phrases_dict = {
'开户行': [['ka1i'], ['hu4'], ['hang2']],
'发卡行': [['fa4'], ['ka3'], ['hang2']],
'放款行': [['fa4ng'], ['kua3n'], ['hang2']],
'茧行': [['jia3n'], ['hang2']],
'行号': [['hang2'], ['ha4o']],
'各地': [['ge4'], ['di4']],
'借还款': [['jie4'], ['hua2n'], ['kua3n']],
'时间为': [['shi2'], ['jia1n'], ['we2i']],
'为准': [['we2i'], ['zhu3n']],
'色差': [['se4'], ['cha1']],
'': [['dia3']],
'': [['bei5']],
'': [['bu4']],
'': [['zuo5']],
'': [['lei5']],
'掺和': [['chan1'], ['huo5']]
}
must_erhua = {
"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"
}
must_not_neural_tone_words = {
'男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
'幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得',
'耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打',
'考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青'
}
must_neural_tone_words = {
'麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝',
'难为', '队伍', '阔气', '闺女', '门道', '锄头', '铺盖', '铃铛', '铁匠', '钥匙', '里脊',
'里头', '部分', '那么', '道士', '造化', '迷糊', '连累', '这么', '这个', '运气', '过去',
'软和', '转悠', '踏实', '跳蚤', '跟头', '趔趄', '财主', '豆腐', '讲究', '记性', '记号',
'认识', '规矩', '见识', '裁缝', '补丁', '衣裳', '衣服', '衙门', '街坊', '行李', '行当',
'蛤蟆', '蘑菇', '薄荷', '葫芦', '葡萄', '萝卜', '荸荠', '苗条', '苗头', '苍蝇', '芝麻',
'舒服', '舒坦', '舌头', '自在', '膏药', '脾气', '脑袋', '脊梁', '能耐', '胳膊', '胭脂',
'胡萝', '胡琴', '胡同', '聪明', '耽误', '耽搁', '耷拉', '耳朵', '老爷', '老实', '老婆',
'戏弄', '将军', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂',
'精神', '粮食', '簸箕', '篱笆', '算计', '算盘', '答应', '笤帚', '笑语', '笑话', '窟窿',
'窝囊', '窗户', '稳当', '稀罕', '称呼', '秧歌', '秀气', '秀才', '福气', '祖宗', '砚台',
'码头', '石榴', '石头', '石匠', '知识', '眼睛', '眯缝', '眨巴', '眉毛', '相声', '盘算',
'白净', '痢疾', '痛快', '疟疾', '疙瘩', '疏忽', '畜生', '生意', '甘蔗', '琵琶', '琢磨',
'琉璃', '玻璃', '玫瑰', '玄乎', '狐狸', '状元', '特务', '牲口', '牙碜', '牌楼', '爽快',
'爱人', '热闹', '烧饼', '烟筒', '烂糊', '点心', '炊帚', '灯笼', '火候', '漂亮', '滑溜',
'溜达', '温和', '清楚', '消息', '浪头', '活泼', '比方', '正经', '欺负', '模糊', '槟榔',
'棺材', '棒槌', '棉花', '核桃', '栅栏', '柴火', '架势', '枕头', '枇杷', '机灵', '本事',
'木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾',
'收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼',
'抬举', '护士', '折腾', '扫帚', '打量', '打算', '打扮', '打听', '打发', '扎实', '扁担',
'戒指', '懒得', '意识', '意思', '悟性', '怪物', '思量', '怎么', '念头', '念叨', '别人',
'快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', '干事',
'帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', '屁股',
'尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', '实在',
'官司', '学问', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈', '姑娘', '姐夫',
'妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方', '大意', '大夫',
'多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴', '嘱咐', '嘟囔',
'嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦', '咳嗽', '和尚',
'告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝', '叫唤', '口袋',
'厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹', '功夫', '力气',
'前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息', '凑合', '凉快',
'冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤', '佩服', '作坊',
'体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家', '交情', '云彩',
'事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故', '不由', '下水',
'下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨', '父亲', '母亲', '咕噜',
'邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', '幸福', '熟悉', '计划',
'扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', '凤凰', '拖沓', '寒碜',
'糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记'
}
not_erhua = {
"虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿",
"拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿",
"流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿",
"孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿",
"狗儿", "少儿"
}
# Sentinel characters used by the tone-sandhi merge/modify passes below.
# NOTE(review): the literals were lost in an earlier encoding mangle; they are
# restored from context — merge_bu/bu_sandhi operate on "不" and
# merge_yi/yi_sandhi on "一" (see the comments on those functions).
BU = '不'
YI = '一'
# Part-of-speech tags that mark non-Chinese tokens: 'x' (punctuation/other)
# and 'eng' (English), which the merge passes must not touch.
X_ENG = frozenset(['x', 'eng'])
# g2p
load_phrases_dict(phrases_dict)
def get_initials_finals(word: str) -> Tuple[List[str], List[str]]:
    """
    Split ``word`` into per-character pinyin (initial, final) pairs via pypinyin.

    Returns two parallel lists: initials (empty string for zero-initial
    syllables) and finals in FINALS_TONE3 style (tone digit appended).
    Apical vowels are disambiguated: zi/ci/si -> 'ii', zhi/chi/shi/ri -> 'iii'.
    """
    initials = []
    finals = []
    orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
    orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
    # Fix: removed a leftover debug print of the raw pypinyin output.
    # after pypinyin==0.44.0, '嗯' need to be n2, cause the initial and final
    # consonants cannot be empty at the same time
    en_index = [index for index, c in enumerate(word) if c == "嗯"]
    for i in en_index:
        orig_finals[i] = "n2"
    for c, v in zip(orig_initials, orig_finals):
        if re.match(r'i\d', v):
            if c in ['z', 'c', 's']:
                # zi, ci, si
                v = re.sub('i', 'ii', v)
            elif c in ['zh', 'ch', 'sh', 'r']:
                # zhi, chi, shi
                v = re.sub('i', 'iii', v)
        initials.append(c)
        finals.append(v)
    return initials, finals
def merge_erhua(initials: List[str], finals: List[str], word: str, pos: str) -> Tuple[List[str], List[str]]:
"""
Do erhub.
"""
# fix er1
for i, phn in enumerate(finals):
if i == len(finals) - 1 and word[i] == "" and phn == 'er1':
finals[i] = 'er2'
# 发音
if word not in must_erhua and (word in not_erhua or pos in {"a", "j", "nr"}):
return initials, finals
# "……" 等情况直接返回
if len(finals) != len(word):
return initials, finals
assert len(finals) == len(word)
# 不发音
new_initials = []
new_finals = []
for i, phn in enumerate(finals):
if i == len(finals) - 1 and word[i] == "" and phn in {"er2", "er5"} and word[-2:] not in not_erhua and new_finals:
new_finals[-1] = new_finals[-1][:-1] + "R" + new_finals[-1][-1]
else:
new_initials.append(initials[i])
new_finals.append(phn)
return new_initials, new_finals
# merge "不" and the word behind it
# if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
def merge_bu(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
for i, (word, pos) in enumerate(seg):
if pos not in X_ENG:
last_word = None
if i > 0:
last_word, _ = seg[i - 1]
if last_word == BU:
word = last_word + word
next_pos = None
if i + 1 < len(seg):
_, next_pos = seg[i + 1]
if word != BU or next_pos is None or next_pos in X_ENG:
new_seg.append((word, pos))
return new_seg
# function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听"
# function 2: merge single "一" and the word behind it
# if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error
# e.g.
# input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
# output seg: [['听一听', 'v']]
def merge_yi(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
skip_next = False
# function 1
for i, (word, pos) in enumerate(seg):
if skip_next:
skip_next = False
continue
if i - 1 >= 0 and word == YI and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v" and seg[i + 1][1] not in X_ENG:
new_seg[-1] = (new_seg[-1][0] + YI + seg[i + 1][0], new_seg[-1][1])
skip_next = True
else:
new_seg.append((word, pos))
seg = new_seg
new_seg = []
# function 2
for i, (word, pos) in enumerate(seg):
if new_seg and new_seg[-1][0] == YI and pos not in X_ENG:
new_seg[-1] = (new_seg[-1][0] + word, new_seg[-1][1])
else:
new_seg.append((word, pos))
return new_seg
def merge_reduplication(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
for i, (word, pos) in enumerate(seg):
if new_seg and word == new_seg[-1][0] and pos not in X_ENG:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
else:
new_seg.append([word, pos])
return new_seg
def is_reduplication(word: str) -> bool:
    """Return True for two-character words whose characters are identical (e.g. 看看)."""
    if len(word) != 2:
        return False
    first, second = word
    return first == second
# the first and the second words are all_tone_three
def merge_continuous_three_tones(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
sub_finals_list = []
for (word, pos) in seg:
if pos in X_ENG:
sub_finals_list.append(['0'])
continue
orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
# after pypinyin==0.44.0, '嗯' need to be n2, cause the initial and final consonants cannot be empty at the same time
en_index = [index for index, c in enumerate(word) if c == ""]
for i in en_index:
orig_finals[i] = "n2"
sub_finals_list.append(orig_finals)
assert len(sub_finals_list) == len(seg)
merge_last = [False] * len(seg)
for i, (word, pos) in enumerate(seg):
if pos not in X_ENG and i - 1 >= 0 and all_tone_three(sub_finals_list[i - 1]) and all_tone_three(sub_finals_list[i]) and not merge_last[i - 1]:
# if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
if not is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
merge_last[i] = True
else:
new_seg.append([word, pos])
else:
new_seg.append([word, pos])
return new_seg
# the last char of first word and the first char of second word is tone_three
def merge_continuous_three_tones_2(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
sub_finals_list = []
for (word, pos) in seg:
if pos in X_ENG:
sub_finals_list.append(['0'])
continue
orig_finals = lazy_pinyin(
word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
# after pypinyin==0.44.0, '嗯' need to be n2, cause the initial and final consonants cannot be empty at the same time
en_index = [index for index, c in enumerate(word) if c == ""]
for i in en_index:
orig_finals[i] = "n2"
sub_finals_list.append(orig_finals)
assert len(sub_finals_list) == len(seg)
merge_last = [False] * len(seg)
for i, (word, pos) in enumerate(seg):
if pos not in X_ENG and i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not merge_last[i - 1]:
# if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
if not is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
merge_last[i] = True
else:
new_seg.append([word, pos])
else:
new_seg.append([word, pos])
return new_seg
def merge_er(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
for i, (word, pos) in enumerate(seg):
if i - 1 >= 0 and word == "" and new_seg[-1][1] not in X_ENG:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
else:
new_seg.append([word, pos])
return new_seg
def pre_merge_for_modify(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """
    seg: [(word, pos), ...]

    Re-segment jieba output before tone sandhi is applied: merge 不/一 with
    their neighbours, merge reduplicated words and consecutive third-tone
    runs, then attach erhua (儿) suffixes. The pass order matters — each step
    consumes the previous step's output.
    """
    seg = merge_bu(seg)
    seg = merge_yi(seg)
    seg = merge_reduplication(seg)
    seg = merge_continuous_three_tones(seg)
    seg = merge_continuous_three_tones_2(seg)
    return merge_er(seg)
def bu_sandhi(word: str, finals: List[str]) -> List[str]:
    """
    Apply tone sandhi for 不 within a merged word, mutating and returning
    ``finals`` (tone digit is the last character of each final).
    """
    # A-不-A pattern: the middle 不 goes neutral (tone 5), e.g. 看不懂.
    if len(word) == 3 and word[1] == BU:
        finals[1] = finals[1][:-1] + "5"
    else:
        for i, char in enumerate(word):
            # "不" before tone4 should be bu2, e.g. 不怕
            if char == BU and i + 1 < len(word) and finals[i + 1][-1] == "4":
                finals[i] = finals[i][:-1] + "2"
    return finals
def yi_sandhi(word: str, finals: List[str]) -> List[str]:
# "一" in number sequences, e.g. 一零零, 二一零
if word.find(YI) != -1 and all(
[item.isnumeric() for item in word if item != YI]):
return finals
# "一" between reduplication words shold be yi5, e.g. 看一看
elif len(word) == 3 and word[1] == YI and word[0] == word[-1]:
finals[1] = finals[1][:-1] + "5"
# when "一" is ordinal word, it should be yi1
elif word.startswith("第一"):
finals[1] = finals[1][:-1] + "1"
else:
for i, char in enumerate(word):
if char == YI and i + 1 < len(word):
# "一" before tone4 should be yi2, e.g. 一段
if finals[i + 1][-1] in {'4', '5'}:
finals[i] = finals[i][:-1] + "2"
# "一" before non-tone4 should be yi4, e.g. 一天
else:
# "一" 后面如果是标点,还读一声
if word[i + 1] not in punc:
finals[i] = finals[i][:-1] + "4"
return finals
def split_word(word: str) -> List[str]:
    """
    Split ``word`` into two sub-words using jieba's search-mode segmentation.

    The shortest segment found is one part; the remaining prefix or suffix of
    the original word is the other, preserving original character order.
    """
    # Idiom: key=len replaces key=lambda i: len(i); reverse=False was the
    # default and is dropped. sorted() is stable, so ties keep jieba's order.
    word_list = sorted(cut_for_search(word), key=len)
    first_subword = word_list[0]
    first_begin_idx = word.find(first_subword)
    if first_begin_idx == 0:
        # Shortest segment is a prefix; the rest of the word follows it.
        return [first_subword, word[len(first_subword):]]
    # Otherwise treat it as a suffix and put the leading remainder first.
    return [word[:-len(first_subword)], first_subword]
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
# e.g.
# word: "家里"
# pos: "s"
# finals: ['ia1', 'i3']
def neural_sandhi(word: str, pos: str, finals: List[str]) -> List[str]:
if word in must_not_neural_tone_words:
return finals
# reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
for j, item in enumerate(word):
if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}:
finals[j] = finals[j][:-1] + "5"
ge_idx = word.find("")
if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒滴哩哟喽啰耶喔诶":
finals[-1] = finals[-1][:-1] + "5"
elif len(word) >= 1 and word[-1] in "的地得":
finals[-1] = finals[-1][:-1] + "5"
# e.g. 走了, 看着, 去过
elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
finals[-1] = finals[-1][:-1] + "5"
elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"}:
finals[-1] = finals[-1][:-1] + "5"
# e.g. 桌上, 地下
elif len(word) > 1 and word[-1] in "上下" and pos in {"s", "l", "f"}:
finals[-1] = finals[-1][:-1] + "5"
# e.g. 上来, 下去
elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
finals[-1] = finals[-1][:-1] + "5"
# 个做量词
elif (ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")) or word == '':
finals[ge_idx] = finals[ge_idx][:-1] + "5"
else:
if word in must_neural_tone_words or word[-2:] in must_neural_tone_words:
finals[-1] = finals[-1][:-1] + "5"
word_list = split_word(word)
finals_list = [finals[:len(word_list[0])], finals[len(word_list[0]):]]
for i, word in enumerate(word_list):
# conventional neural in Chinese
if word in must_neural_tone_words or word[-2:] in must_neural_tone_words:
finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
finals = sum(finals_list, [])
return finals
def all_tone_three(finals: List[str]) -> bool:
    """Return True when every final in ``finals`` carries tone 3 (trailing '3')."""
    for final in finals:
        if final[-1] != "3":
            return False
    return True
def three_sandhi(word: str, finals: List[str]) -> List[str]:
if len(word) == 2 and all_tone_three(finals):
finals[0] = finals[0][:-1] + "2"
elif len(word) == 3:
word_list = split_word(word)
if all_tone_three(finals):
# disyllabic + monosyllabic, e.g. 蒙古/包
if len(word_list[0]) == 2:
finals[0] = finals[0][:-1] + "2"
finals[1] = finals[1][:-1] + "2"
# monosyllabic + disyllabic, e.g. 纸/老虎
elif len(word_list[0]) == 1:
finals[1] = finals[1][:-1] + "2"
else:
finals_list = [finals[:len(word_list[0])], finals[len(word_list[0]):]]
if len(finals_list) == 2:
for i, sub in enumerate(finals_list):
# e.g. 所有/人
if all_tone_three(sub) and len(sub) == 2:
finals_list[i][0] = finals_list[i][0][:-1] + "2"
# e.g. 好/喜欢
elif i == 1 and not all_tone_three(sub) and finals_list[i][0][-1] == "3" and finals_list[0][-1][-1] == "3":
finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
finals = sum(finals_list, [])
# split idiom into two words who's length is 2
elif len(word) == 4:
finals_list = [finals[:2], finals[2:]]
finals = []
for sub in finals_list:
if all_tone_three(sub):
sub[0] = sub[0][:-1] + "2"
finals += sub
return finals
def modified_tone(word: str, pos: str, finals: List[str]) -> List[str]:
    """
    Apply the full tone-sandhi cascade to one segmented word.

    word: 分词 (the merged word)
    pos: 词性 (jieba POS tag)
    finals: 带调韵母 [final1, ..., finaln] — tone digit last

    Order matters: 不-sandhi, 一-sandhi, neutral-tone rules, then third-tone
    sandhi, each transforming the previous step's finals.
    """
    return three_sandhi(
        word,
        neural_sandhi(word, pos, yi_sandhi(word, bu_sandhi(word, finals))),
    )
def g2p(text: str, with_erhua: bool = True) -> str:
"""
Return: string of phonemes.
'ㄋㄧ2ㄏㄠ3/ㄕ十4ㄐㄝ4'
"""
tokens = []
seg_cut = posseg.lcut(text)
# fix wordseg bad case for sandhi
seg_cut = pre_merge_for_modify(seg_cut)
# 为了多音词获得更好的效果,这里采用整句预测
initials = []
finals = []
# pypinyin, g2pM
for word, pos in seg_cut:
if pos == 'x' and '\u4E00' <= min(word) and max(word) <= '\u9FFF':
pos = 'X'
elif pos != 'x' and word in punc:
pos = 'x'
tk = MToken(tag=pos, whitespace='')
if pos in X_ENG:
if not word.isspace():
if pos == 'x' and word in punc:
tk.phonemes = word
tokens.append(tk)
elif tokens:
tokens[-1].whitespace += word
continue
elif tokens and tokens[-1].tag not in X_ENG and not tokens[-1].whitespace:
tokens[-1].whitespace = '/'
# g2p
sub_initials, sub_finals = get_initials_finals(word)
# tone sandhi
sub_finals = modified_tone(word, pos, sub_finals)
# er hua
if with_erhua:
sub_initials, sub_finals = merge_erhua(sub_initials, sub_finals, word, pos)
initials.append(sub_initials)
finals.append(sub_finals)
# assert len(sub_initials) == len(sub_finals) == len(word)
# sum(iterable[, start])
# initials = sum(initials, [])
# finals = sum(finals, [])
phones = []
for c, v in zip(sub_initials, sub_finals):
# NOTE: post process for pypinyin outputs
# we discriminate i, ii and iii
if c:
phones.append(c)
# replace punctuation by ` `
# if c and c in punc:
# phones.append(c)
if v and (v not in punc or v != c):# and v not in rhy_phns:
phones.append(v)
phones = '_'.join(phones).replace('_eR', '_er').replace('R', '_R')
phones = re.sub(r'(?=\d)', '_', phones).split('_')
print(phones)
tk.phonemes = ''.join(ZH_MAP.get(p, unk) for p in phones)
tokens.append(tk)
return ''.join((unk if tk.phonemes is None else tk.phonemes) + tk.whitespace for tk in tokens)
print(g2p('时间为。Hello, world!你好我们是一群追逐梦想的人。我正在使用qq。忽略卢驴'))
seg = posseg.lcut('不好看', True)
print(seg, merge_bu(seg))
seg = merge_bu(posseg.lcut('听一听一个', True))
print(seg, merge_yi(seg))
seg = merge_bu(posseg.lcut('谢谢谢谢', True))
print(seg, merge_reduplication(seg))
seg = merge_bu(posseg.lcut('小美好', True))
print(seg, merge_continuous_three_tones(seg))
seg = merge_bu(posseg.lcut('风景好', True))
print(seg, merge_continuous_three_tones_2(seg))

3
rust/vendor/kokoro-tts/run.bat vendored Normal file
View File

@@ -0,0 +1,3 @@
rem Developer convenience script (Windows): put the MinGW-w64 toolchain on
rem PATH so the C source compiled by build.rs can build/link, then run the
rem v1.1 synthesis demo and wait for a key so the console stays open.
set PATH=%PATH%;D:\msys64\mingw64\bin
cargo run --example synth_directly_v11
pause

80
rust/vendor/kokoro-tts/src/error.rs vendored Normal file
View File

@@ -0,0 +1,80 @@
use crate::G2PError;
use bincode::error::DecodeError;
use ndarray::ShapeError;
use ort::Error as OrtError;
use std::{
error::Error,
fmt::{Debug, Display, Formatter, Result as FmtResult},
io::Error as IoError,
time::SystemTimeError,
};
/// All error conditions surfaced by this crate.
///
/// Each variant wraps the error type of the subsystem it originates from so
/// callers can match on the failure source; `From` impls below support `?`.
#[derive(Debug)]
pub enum KokoroError {
    /// Decoding the bincode-serialized voice pack failed.
    Decode(DecodeError),
    /// Text-to-phoneme (g2p) conversion failed.
    G2P(G2PError),
    /// Underlying file or stream I/O failed.
    Io(IoError),
    /// The model session was no longer available when a synthesis task
    /// needed it (presumably dropped/released — confirm at the use sites).
    ModelReleased,
    /// The ONNX Runtime (`ort`) reported an error.
    Ort(OrtError),
    /// A channel send failed; the payload is the stringified send error.
    Send(String),
    /// An `ndarray` shape mismatch occurred.
    Shape(ShapeError),
    /// Reading the system clock failed (likely while timing synthesis —
    /// confirm against the synth path).
    SystemTime(SystemTimeError),
    /// The requested voice name is not present in the loaded voice pack.
    VoiceNotFound(String),
    /// The voice data's version did not match expectations; the payload is a
    /// human-readable description.
    VoiceVersionInvalid(String),
}
impl Display for KokoroError {
    /// Renders as `KokoroError: <inner>` for every variant, delegating to the
    /// wrapped error's own `Display` where one exists.
    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
        write!(f, "KokoroError: ")?;
        match self {
            Self::Decode(e) => write!(f, "{}", e),
            Self::G2P(e) => write!(f, "{}", e),
            Self::Io(e) => write!(f, "{}", e),
            Self::Ort(e) => write!(f, "{}", e),
            Self::ModelReleased => f.write_str("ModelReleased"),
            Self::Send(e) => write!(f, "{}", e),
            Self::Shape(e) => write!(f, "{}", e),
            Self::SystemTime(e) => write!(f, "{}", e),
            Self::VoiceNotFound(name) => write!(f, "VoiceNotFound({})", name),
            Self::VoiceVersionInvalid(msg) => write!(f, "VoiceVersionInvalid({})", msg),
        }
    }
}
impl Error for KokoroError {}

// The `From` impls below let `?` lift each subsystem's error into
// `KokoroError` without explicit `map_err` at the call sites.

impl From<IoError> for KokoroError {
    fn from(value: IoError) -> Self {
        Self::Io(value)
    }
}

impl From<DecodeError> for KokoroError {
    fn from(value: DecodeError) -> Self {
        Self::Decode(value)
    }
}

impl From<OrtError> for KokoroError {
    fn from(value: OrtError) -> Self {
        Self::Ort(value)
    }
}

impl From<G2PError> for KokoroError {
    fn from(value: G2PError) -> Self {
        Self::G2P(value)
    }
}

impl From<ShapeError> for KokoroError {
    fn from(value: ShapeError) -> Self {
        Self::Shape(value)
    }
}

impl From<SystemTimeError> for KokoroError {
    fn from(value: SystemTimeError) -> Self {
        Self::SystemTime(value)
    }
}

321
rust/vendor/kokoro-tts/src/g2p.rs vendored Normal file
View File

@@ -0,0 +1,321 @@
/// 文本到国际音标的转换
mod v10;
mod v11;
use super::PinyinError;
use chinese_number::{ChineseCase, ChineseCountMethod, ChineseVariant, NumberToChinese};
#[cfg(feature = "use-cmudict")]
use cmudict_fast::{Cmudict, Error as CmudictError};
use pinyin::ToPinyin;
use regex::{Captures, Error as RegexError, Regex};
use std::{
error::Error,
fmt::{Display, Formatter, Result as FmtResult},
};
/// Errors produced while converting text to phonemes.
///
/// The English backend is feature-gated: `use-cmudict` pulls in the CMU
/// pronouncing dictionary, otherwise a C phonemizer is called over FFI.
#[derive(Debug)]
pub enum G2PError {
    /// The CMU pronouncing dictionary failed to load or parse.
    #[cfg(feature = "use-cmudict")]
    CmudictError(CmudictError),
    /// A pinyin lookup produced no candidate pronunciations.
    /// NOTE(review): variant name has a typo ("Enpty" -> "Empty"), but it is
    /// public API so renaming would be a breaking change; `Display` already
    /// prints "EmptyData".
    EnptyData,
    /// The word contained an interior NUL byte and could not cross the FFI
    /// boundary as a C string.
    #[cfg(not(feature = "use-cmudict"))]
    Nul(std::ffi::NulError),
    /// Pinyin-to-IPA conversion failed.
    Pinyin(PinyinError),
    /// A regular expression used by the g2p pipeline failed to compile.
    Regex(RegexError),
    /// The C phonemizer returned bytes that were not valid UTF-8.
    #[cfg(not(feature = "use-cmudict"))]
    Utf8(std::str::Utf8Error),
}
impl Display for G2PError {
    /// Renders as `G2PError: <inner>` so the error source is visible even
    /// when the error is stringified without its type name.
    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
        write!(f, "G2PError: ")?;
        match self {
            #[cfg(feature = "use-cmudict")]
            Self::CmudictError(e) => Display::fmt(e, f),
            // Deliberately rendered with the correct spelling, masking the
            // typo in the variant name (see the enum definition).
            Self::EnptyData => Display::fmt("EmptyData", f),
            #[cfg(not(feature = "use-cmudict"))]
            Self::Nul(e) => Display::fmt(e, f),
            Self::Pinyin(e) => Display::fmt(e, f),
            Self::Regex(e) => Display::fmt(e, f),
            #[cfg(not(feature = "use-cmudict"))]
            Self::Utf8(e) => Display::fmt(e, f),
        }
    }
}
impl Error for G2PError {}

// `From` conversions so `?` can lift subsystem errors into `G2PError`; the
// feature-gated ones mirror the gating on the enum variants.

impl From<PinyinError> for G2PError {
    fn from(value: PinyinError) -> Self {
        Self::Pinyin(value)
    }
}

impl From<RegexError> for G2PError {
    fn from(value: RegexError) -> Self {
        Self::Regex(value)
    }
}

#[cfg(feature = "use-cmudict")]
impl From<CmudictError> for G2PError {
    fn from(value: CmudictError) -> Self {
        Self::CmudictError(value)
    }
}

#[cfg(not(feature = "use-cmudict"))]
impl From<std::ffi::NulError> for G2PError {
    fn from(value: std::ffi::NulError) -> Self {
        Self::Nul(value)
    }
}

#[cfg(not(feature = "use-cmudict"))]
impl From<std::str::Utf8Error> for G2PError {
    fn from(value: std::str::Utf8Error) -> Self {
        Self::Utf8(value)
    }
}
fn word2ipa_zh(word: &str) -> Result<String, G2PError> {
let iter = word.chars().map(|i| match i.to_pinyin() {
None => Ok(i.to_string()),
Some(p) => v10::py2ipa(p.with_tone_num_end()),
});
let mut result = String::new();
for i in iter {
result.push_str(&i?);
}
Ok(result)
}
#[cfg(feature = "use-cmudict")]
fn word2ipa_en(word: &str) -> Result<String, G2PError> {
    // Converts an English word to IPA via the bundled CMU pronouncing
    // dictionary; words missing from the dictionary fall back to
    // letter-by-letter spelling.
    use super::{arpa_to_ipa, letters_to_ipa};
    use std::{
        io::{Error as IoError, ErrorKind},
        str::FromStr,
        sync::LazyLock,
    };
    // Parses the embedded dictionary exactly once; because only a shared
    // reference to the cached Result is available, errors have to be
    // re-materialized (rebuilt/cloned) on every failing call.
    fn get_cmudict<'a>() -> Result<&'a Cmudict, CmudictError> {
        static CMUDICT: LazyLock<Result<Cmudict, CmudictError>> =
            LazyLock::new(|| Cmudict::from_str(include_str!("../dict/cmudict.dict")));
        CMUDICT.as_ref().map_err(|i| match i {
            CmudictError::IoErr(e) => CmudictError::IoErr(IoError::new(ErrorKind::Other, e)),
            CmudictError::InvalidLine(e) => CmudictError::InvalidLine(*e),
            CmudictError::RuleParseError(e) => CmudictError::RuleParseError(e.clone()),
        })
    }
    // Short all-uppercase tokens are treated as initialisms (e.g. "USA") and
    // spelled out letter by letter.
    if word.chars().count() < 4 && word.chars().all(|c| c.is_ascii_uppercase()) {
        return Ok(letters_to_ipa(word));
    }
    let dict = get_cmudict()?;
    // Case-insensitive lookup: exact form first, then upper, then lower.
    let upper = word.to_ascii_uppercase();
    let lower = word.to_ascii_lowercase();
    let Some(rules) = dict
        .get(word)
        .or_else(|| dict.get(&upper))
        .or_else(|| dict.get(&lower))
    else {
        return Ok(letters_to_ipa(word));
    };
    if rules.is_empty() {
        return Ok(word.to_owned());
    }
    // NOTE(review): a random candidate pronunciation is chosen, so repeated
    // calls with the same word can yield different phonemes — confirm this
    // nondeterminism is intended.
    let i = rand::random_range(0..rules.len());
    let result = rules[i]
        .pronunciation()
        .iter()
        .map(|i| arpa_to_ipa(&i.to_string()).unwrap_or_default())
        .collect::<String>();
    Ok(result)
}
/// Converts an English word to IPA using the bundled eSpeak-derived C
/// phonemizer compiled by `build.rs`.
///
/// Short all-uppercase words (< 4 chars) are treated as initialisms and
/// spelled out letter by letter instead.
#[cfg(not(feature = "use-cmudict"))]
fn word2ipa_en(word: &str) -> Result<String, G2PError> {
    use super::letters_to_ipa;
    use std::{
        ffi::{CStr, CString, c_char},
        sync::Once,
    };
    if word.chars().count() < 4 && word.chars().all(|c| c.is_ascii_uppercase()) {
        return Ok(letters_to_ipa(word));
    }
    unsafe extern "C" {
        fn TextToPhonemes(text: *const c_char) -> *const ::std::os::raw::c_char;
        fn Initialize(data_dictlist: *const c_char);
    }
    unsafe {
        static INIT: Once = Once::new();
        INIT.call_once(|| {
            // NOTE(review): the dictionary blob is handed over as a raw byte
            // pointer; confirm the C side does not expect a NUL-terminated
            // string or a separate length.
            static DATA: &[u8] = include_bytes!("../dict/espeak.dict");
            Initialize(DATA.as_ptr() as _);
        });
        // Fix: the original used `CString::new(..)?.into_raw()`, which gives
        // up ownership of the allocation and never reclaims it — one leaked
        // C string per call. Keep the CString alive across the call instead.
        let c_word = CString::new(word.to_lowercase())?;
        // SAFETY: `c_word` is a valid NUL-terminated buffer that outlives the
        // call. `TextToPhonemes` is assumed to return a pointer to a valid
        // NUL-terminated buffer owned by the C side (it is not freed here —
        // TODO confirm against en_ipa.c).
        let res = TextToPhonemes(c_word.as_ptr());
        Ok(CStr::from_ptr(res).to_str()?.to_string())
    }
}
/// Maps full-width CJK punctuation to its half-width/ASCII counterpart so the
/// downstream phoneme maps only need to know one set of punctuation marks.
/// All other characters pass through unchanged.
///
/// NOTE(review): several full-width match characters were lost to an encoding
/// mangle and are restored here from their replacement targets ((/)/,/!/:/;/?)
/// — verify against the upstream file.
fn to_half_shape(text: &str) -> String {
    // Fix: dropped an unused `.peekable()` adaptor (nothing ever peeked) and
    // a stale "strips extra spaces" comment (no spaces are stripped). The
    // output is never longer than the input in bytes, so `text.len()` is a
    // sufficient capacity.
    let mut result = String::with_capacity(text.len());
    for c in text.chars() {
        match c {
            '«' | '《' => result.push('“'),
            '»' | '》' => result.push('”'),
            '(' => result.push('('),
            ')' => result.push(')'),
            '、' | ',' => result.push(','),
            '。' => result.push('.'),
            '!' => result.push('!'),
            ':' => result.push(':'),
            ';' => result.push(';'),
            '?' => result.push('?'),
            _ => result.push(c),
        }
    }
    result
}
/// Rewrites every Arabic-numeral run (with optional decimal part) in `text`
/// into its Chinese reading, so the g2p stage only ever sees pronounceable
/// characters. Numbers that fail to convert are left as-is.
fn num_repr(text: &str) -> Result<String, G2PError> {
    let regex = Regex::new(r#"\d+(\.\d+)?"#)?;
    // Fix: `Regex::replace` only rewrites the *first* match, so any sentence
    // with more than one number leaked raw digits through to the phoneme
    // stage; `replace_all` converts every occurrence.
    Ok(regex
        .replace_all(text, |caps: &Captures| {
            let text = &caps[0];
            // Every string the pattern can match also parses as f64, so the
            // original `else if ... parse::<i64>()` branch was unreachable
            // and has been removed.
            if let Ok(num) = text.parse::<f64>() {
                num.to_chinese(
                    ChineseVariant::Traditional,
                    ChineseCase::Lower,
                    ChineseCountMethod::Low,
                )
                .unwrap_or_else(|_| text.to_owned())
            } else {
                text.to_owned()
            }
        })
        .to_string())
}
/// Converts mixed Chinese/English `text` into a phoneme string.
///
/// `use_v11` selects the v1.1 pipeline for Chinese runs; otherwise the v1.0
/// path (jieba word segmentation + pinyin-to-IPA) is used. English runs are
/// phonemized word-by-word via `word2ipa_en` in both modes.
pub fn g2p(text: &str, use_v11: bool) -> Result<String, G2PError> {
    // Spell out digits as Chinese first so raw digits never reach the
    // phoneme maps.
    let text = num_repr(text)?;
    // Capture groups: (1) CJK ideograph runs, (2) CJK punctuation runs,
    // (3) Latin-1 runs — each kind is handled by a different match arm below.
    let sentence_pattern = Regex::new(
        r#"([\u4E00-\u9FFF]+)|([,。:·?、!《》()【】〖〗〔〕“”‘’〈〉…— ]+)|([\u0000-\u00FF]+)+"#,
    )?;
    let en_word_pattern = Regex::new("\\w+|\\W+")?;
    // NOTE(review): Jieba::new() reloads its dictionary on every call, which
    // is expensive; consider caching it in a static — left unchanged here.
    let jieba = jieba_rs::Jieba::new();
    let mut result = String::new();
    for i in sentence_pattern.captures_iter(&text) {
        match (i.get(1), i.get(2), i.get(3)) {
            // Chinese ideograph run.
            (Some(text), _, _) => {
                let text = to_half_shape(text.as_str());
                if use_v11 {
                    // v1.1 expects space-separated chunks.
                    if !result.is_empty() && !result.ends_with(' ') {
                        result.push(' ');
                    }
                    result.push_str(&v11::g2p(&text, true));
                    result.push(' ');
                } else {
                    // v1.0: segment into words, then pinyin -> IPA per word.
                    for i in jieba.cut(&text, true) {
                        result.push_str(&word2ipa_zh(i)?);
                        result.push(' ');
                    }
                }
            }
            // CJK punctuation run: attach directly to the previous chunk
            // (trim the separating space first), then re-add one space.
            (_, Some(text), _) => {
                let text = to_half_shape(text.as_str());
                result = result.trim_end().to_string();
                result.push_str(&text);
                result.push(' ');
            }
            // Latin-1 run: split into word / non-word chunks.
            (_, _, Some(text)) => {
                for i in en_word_pattern.captures_iter(text.as_str()) {
                    let c = (i[0]).chars().next().unwrap_or_default();
                    if c == '\''
                        || c == '_'
                        || c == '-'
                        || c.is_ascii_lowercase()
                        || c.is_ascii_uppercase()
                    {
                        let i = &i[0];
                        // Re-insert the space that the punctuation arm above
                        // may have trimmed away.
                        if result.trim_end().ends_with(['.', ',', '!', '?'])
                            && !result.ends_with(' ')
                        {
                            result.push(' ');
                        }
                        result.push_str(&word2ipa_en(i)?);
                    } else if c == ' ' && result.ends_with(' ') {
                        // Collapse duplicate spaces between chunks.
                        result.push_str((i[0]).trim_start());
                    } else {
                        result.push_str(&i[0]);
                    }
                }
            }
            _ => (),
        };
    }
    Ok(result.trim().to_string())
}
#[cfg(test)]
mod tests {
#[cfg(not(feature = "use-cmudict"))]
#[test]
fn test_word2ipa_en() -> Result<(), super::G2PError> {
use super::word2ipa_en;
// println!("{:?}", espeak_rs::text_to_phonemes("days", "en", None, true, false));
assert_eq!("kjˌuːkjˈuː", word2ipa_en("qq")?);
assert_eq!("həlˈəʊ", word2ipa_en("hello")?);
assert_eq!("wˈɜːld", word2ipa_en("world")?);
assert_eq!("ˈapəl", word2ipa_en("apple")?);
assert_eq!("ˈɪldɹɛn", word2ipa_en("children")?);
assert_eq!("ˈaʊə", word2ipa_en("hour")?);
assert_eq!("dˈeɪz", word2ipa_en("days")?);
Ok(())
}
#[cfg(feature = "use-cmudict")]
#[test]
fn test_word2ipa_en_is_case_insensitive_for_dictionary_words() -> Result<(), super::G2PError> {
use super::word2ipa_en;
assert_eq!(word2ipa_en("Welcome")?, word2ipa_en("welcome")?);
Ok(())
}
#[test]
fn test_g2p() -> Result<(), super::G2PError> {
use super::g2p;
assert_eq!("ni↓xau↓ ʂɻ↘ʨje↘", g2p("你好世界", false)?);
assert_eq!("ㄋㄧ2ㄏㄠ3/ㄕ十4ㄐㄝ4", g2p("你好世界", true)?);
Ok(())
}
}

62
rust/vendor/kokoro-tts/src/g2p/v10.rs vendored Normal file
View File

@@ -0,0 +1,62 @@
use crate::{G2PError, pinyin_to_ipa};
fn retone(p: &str) -> String {
    // Rewrites the tone-letter contours produced by `pinyin_to_ipa` into the
    // single-arrow tone marks the v1.0 model expects, and collapses syllabic
    // consonants (ɻ̩ / ɱ̩) to 'ɨ'. Works on a char vector because the contour
    // patterns span up to three code points.
    let chars: Vec<char> = p.chars().collect();
    let mut result = String::with_capacity(p.len());
    let mut i = 0;
    while i < chars.len() {
        match () {
            // Third tone (dipping ˧˩˧) first — it must win over the shorter
            // contours that share its '˧' prefix.
            _ if i + 2 < chars.len()
                && chars[i] == '˧'
                && chars[i + 1] == '˩'
                && chars[i + 2] == '˧' =>
            {
                result.push('↓');
                i += 3;
            }
            // Second tone: rising ˧˥.
            _ if i + 1 < chars.len() && chars[i] == '˧' && chars[i + 1] == '˥' => {
                result.push('↗');
                i += 2;
            }
            // Fourth tone: falling ˥˩.
            _ if i + 1 < chars.len() && chars[i] == '˥' && chars[i + 1] == '˩' => {
                result.push('↘');
                i += 2;
            }
            // First tone: high level ˥ — checked after the two-char contours
            // so it cannot shadow them.
            _ if chars[i] == '˥' => {
                result.push('→');
                i += 1;
            }
            // Syllabic consonants: a base ɻ (U+027B) or ɱ (U+0271) followed by
            // the combining syllabic mark U+0329 collapses to 'ɨ'.
            _ if !(i + 1 >= chars.len() || chars[i+1] != '\u{0329}' || chars[i] != '\u{027B}' && chars[i] != '\u{0271}') =>
            {
                result.push('ɨ');
                i += 2;
            }
            // Anything else is copied through unchanged.
            _ => {
                result.push(chars[i]);
                i += 1;
            }
        }
    }
    // Invariant: no combining syllabic mark may survive; if this fires, a new
    // base letter needs to be added to the rule above.
    assert!(
        !result.contains('\u{0329}'),
        "Unexpected combining mark in: {}",
        result
    );
    result
}
/// Converts one numbered-pinyin syllable to IPA with arrow tone marks,
/// using the first candidate pronunciation reported by `pinyin_to_ipa`.
pub(super) fn py2ipa(py: &str) -> Result<String, G2PError> {
    let candidates = pinyin_to_ipa(py)?;
    let first = candidates.first().ok_or(G2PError::EnptyData)?;
    Ok(first.iter().map(|seg| retone(seg)).collect())
}

1263
rust/vendor/kokoro-tts/src/g2p/v11.rs vendored Normal file

File diff suppressed because it is too large Load Diff

83
rust/vendor/kokoro-tts/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,83 @@
mod error;
mod g2p;
mod stream;
mod synthesizer;
mod tokenizer;
mod transcription;
mod voice;
use {
bincode::{config::standard, decode_from_slice},
ort::{execution_providers::CUDAExecutionProvider, session::Session},
std::{collections::HashMap, path::Path, sync::Arc, time::Duration},
tokio::{fs::read, sync::Mutex},
};
pub use {error::*, g2p::*, stream::*, tokenizer::*, transcription::*, voice::*};
/// Kokoro text-to-speech engine backed by an ONNX Runtime session.
pub struct KokoroTts {
    // Inference session; a Mutex because each run needs exclusive access
    // (see `model.lock().await` in the synthesizer), inside an Arc so
    // streaming sessions can hold Weak references to it.
    model: Arc<Mutex<Session>>,
    // Voice name -> style-embedding pack decoded from the voices file.
    voices: Arc<HashMap<String, Vec<Vec<Vec<f32>>>>>,
}
impl KokoroTts {
pub async fn new<P: AsRef<Path>>(model_path: P, voices_path: P) -> Result<Self, KokoroError> {
let voices = read(voices_path).await?;
let (voices, _) = decode_from_slice(&voices, standard())?;
let model = Session::builder()?
.with_execution_providers([CUDAExecutionProvider::default().build()])?
.commit_from_file(model_path)?;
Ok(Self {
model: Arc::new(model.into()),
voices,
})
}
pub async fn new_from_bytes<B>(model: B, voices: B) -> Result<Self, KokoroError>
where
B: AsRef<[u8]>,
{
let (voices, _) = decode_from_slice(voices.as_ref(), standard())?;
let model = Session::builder()?
.with_execution_providers([CUDAExecutionProvider::default().build()])?
.commit_from_memory(model.as_ref())?;
Ok(Self {
model: Arc::new(model.into()),
voices,
})
}
pub async fn synth<S>(&self, text: S, voice: Voice) -> Result<(Vec<f32>, Duration), KokoroError>
where
S: AsRef<str>,
{
let name = voice.get_name();
let pack = self
.voices
.get(name)
.ok_or(KokoroError::VoiceNotFound(name.to_owned()))?;
synthesizer::synth(Arc::downgrade(&self.model), text, pack, voice).await
}
pub fn stream<S>(&self, voice: Voice) -> (SynthSink<S>, SynthStream)
where
S: AsRef<str> + Send + 'static,
{
let voices = Arc::downgrade(&self.voices);
let model = Arc::downgrade(&self.model);
start_synth_session(voice, move |text, voice| {
let voices = voices.clone();
let model = model.clone();
async move {
let name = voice.get_name();
let voices = voices.upgrade().ok_or(KokoroError::ModelReleased)?;
let pack = voices
.get(name)
.ok_or(KokoroError::VoiceNotFound(name.to_owned()))?;
synthesizer::synth(model, text, pack, voice).await
}
})
}
}

157
rust/vendor/kokoro-tts/src/stream.rs vendored Normal file
View File

@@ -0,0 +1,157 @@
use {
crate::{KokoroError, Voice},
futures::{Sink, SinkExt, Stream},
pin_project::pin_project,
std::{
pin::Pin,
task::{Context, Poll},
time::Duration,
},
tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender, unbounded_channel},
};
// A queued synthesis request: the text to speak and the voice to use.
struct Request<S> {
    voice: Voice,
    text: S,
}
// A finished synthesis result: audio samples plus inference wall time.
struct Response {
    data: Vec<f32>,
    took: Duration,
}
/// Synthesized-audio stream.
///
/// The receiving half of a streaming synthesis session, intended for
/// longer texts. Implements the `Stream` trait so synthesized audio
/// chunks can be iterated asynchronously.
#[pin_project]
pub struct SynthStream {
    #[pin]
    rx: UnboundedReceiver<Response>,
}
impl Stream for SynthStream {
    // Each item is (audio samples, time the inference took).
    type Item = (Vec<f32>, Duration);
    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        // Delegate to the channel receiver and unwrap each Response into
        // its (data, took) tuple; the stream ends when the sender is dropped.
        Pin::new(&mut self.project().rx)
            .poll_recv(cx)
            .map(|i| i.map(|Response { data, took }| (data, took)))
    }
}
/// Synthesis request sender.
///
/// The sending half of a streaming synthesis session; carries the
/// currently selected voice and implements the `Sink` trait so requests
/// can be submitted asynchronously.
#[pin_project]
pub struct SynthSink<S> {
    tx: UnboundedSender<Request<S>>,
    voice: Voice,
}
impl<S> SynthSink<S> {
    /// Sets the voice used for subsequent synthesis requests.
    ///
    /// # Arguments
    ///
    /// * `voice` - The voice (and its speed payload) to synthesize with.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use kokoro_tts::{KokoroTts, Voice};
    ///
    /// #[tokio::main]
    /// async fn main() {
    ///     let Ok(tts) = KokoroTts::new("../kokoro-v1.0.int8.onnx", "../voices.bin").await else {
    ///         return;
    ///     };
    ///     // speed: 1.0
    ///     let (mut sink, _) = tts.stream::<&str>(Voice::ZfXiaoxiao(1.0));
    ///     // speed: 1.8
    ///     sink.set_voice(Voice::ZmYunxi(1.8));
    /// }
    /// ```
    ///
    pub fn set_voice(&mut self, voice: Voice) {
        self.voice = voice
    }
    /// Sends a synthesis request for `text` using the current voice.
    ///
    /// # Arguments
    ///
    /// * `text` - The text to synthesize.
    ///
    /// # Errors
    ///
    /// Returns a `KokoroError` if the request cannot be delivered to the
    /// synthesis task (e.g. the session's receiver has been dropped).
    ///
    /// # Examples
    ///
    /// ```rust
    /// use kokoro_tts::{KokoroTts, Voice};
    ///
    /// #[tokio::main]
    /// async fn main() {
    ///     let Ok(tts) = KokoroTts::new("../kokoro-v1.1-zh.onnx", "../voices-v1.1-zh.bin").await else {
    ///         return;
    ///     };
    ///     let (mut sink, _) = tts.stream(Voice::Zf003(2));
    ///     let _ = sink.synth("hello world.").await;
    /// }
    /// ```
    ///
    pub async fn synth(&mut self, text: S) -> Result<(), KokoroError> {
        self.send((self.voice, text)).await
    }
}
impl<S> Sink<(Voice, S)> for SynthSink<S> {
type Error = KokoroError;
fn poll_ready(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
Poll::Ready(Ok(()))
}
fn start_send(self: Pin<&mut Self>, (voice, text): (Voice, S)) -> Result<(), Self::Error> {
self.tx
.send(Request { voice, text })
.map_err(|e| KokoroError::Send(e.to_string()))
}
fn poll_flush(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
Poll::Ready(Ok(()))
}
fn poll_close(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
Poll::Ready(Ok(()))
}
}
pub(super) fn start_synth_session<F, R, S>(
voice: Voice,
synth_request_callback: F,
) -> (SynthSink<S>, SynthStream)
where
F: Fn(S, Voice) -> R + Send + 'static,
R: Future<Output = Result<(Vec<f32>, Duration), KokoroError>> + Send,
S: AsRef<str> + Send + 'static,
{
let (tx, mut rx) = unbounded_channel::<Request<S>>();
let (tx2, rx2) = unbounded_channel();
tokio::spawn(async move {
while let Some(req) = rx.recv().await {
let (data, took) = synth_request_callback(req.text, req.voice).await?;
tx2.send(Response { data, took })
.map_err(|e| KokoroError::Send(e.to_string()))?;
}
Ok::<_, KokoroError>(())
});
(SynthSink { tx, voice }, SynthStream { rx: rx2 })
}

View File

@@ -0,0 +1,123 @@
use {
crate::{KokoroError, Voice, g2p, get_token_ids},
ndarray::Array,
ort::{
inputs,
session::{RunOptions, Session},
value::TensorRef,
},
std::{
cmp::min,
sync::Weak,
time::{Duration, SystemTime},
},
tokio::sync::Mutex,
};
/// Runs one Kokoro v1.0 inference pass over the entire phoneme string,
/// returning the synthesized samples and the wall-clock time spent in the
/// ONNX session run.
async fn synth_v10<P, S>(
    model: Weak<Mutex<Session>>,
    phonemes: S,
    pack: P,
    speed: f32,
) -> Result<(Vec<f32>, Duration), KokoroError>
where
    P: AsRef<Vec<Vec<Vec<f32>>>>,
    S: AsRef<str>,
{
    // Fail fast if the owning KokoroTts has already been dropped.
    let model = model.upgrade().ok_or(KokoroError::ModelReleased)?;
    let phonemes = get_token_ids(phonemes.as_ref(), false);
    let phonemes = Array::from_shape_vec((1, phonemes.len()), phonemes)?;
    // Style reference vector is selected by token count.
    // NOTE(review): this indexes `pack` at `token_count - 1` with no bound
    // check (unlike the v1.1 path, which chunks) — it will panic if the
    // text tokenizes to more entries than the style pack holds; confirm
    // callers bound the input length.
    let ref_s = pack.as_ref()[phonemes.len() - 1]
        .first()
        .cloned()
        .unwrap_or_default();
    let style = Array::from_shape_vec((1, ref_s.len()), ref_s)?;
    let speed = Array::from_vec(vec![speed]);
    let options = RunOptions::new()?;
    // The session needs exclusive access for the duration of a run.
    let mut model = model.lock().await;
    let t = SystemTime::now();
    let kokoro_output = model
        .run_async(
            inputs![
                "tokens" => TensorRef::from_array_view(&phonemes)?,
                "style" => TensorRef::from_array_view(&style)?,
                "speed" => TensorRef::from_array_view(&speed)?,
            ],
            &options,
        )?
        .await?;
    let elapsed = t.elapsed()?;
    let (_, audio) = kokoro_output["audio"].try_extract_tensor::<f32>()?;
    Ok((audio.to_owned(), elapsed))
}
/// Runs Kokoro v1.1 inference, draining the token stream in chunks no
/// larger than the style pack and concatenating the audio of all chunks.
async fn synth_v11<P, S>(
    model: Weak<Mutex<Session>>,
    phonemes: S,
    pack: P,
    speed: i32,
) -> Result<(Vec<f32>, Duration), KokoroError>
where
    P: AsRef<Vec<Vec<Vec<f32>>>>,
    S: AsRef<str>,
{
    let model = model.upgrade().ok_or(KokoroError::ModelReleased)?;
    let mut phonemes = get_token_ids(phonemes.as_ref(), true);
    let mut ret = Vec::new();
    let mut elapsed = Duration::ZERO;
    // NOTE(review): `while let` with an irrefutable binding plus a
    // let-chain is unusual — the `p.len() != 0` condition is what ends
    // the loop once `phonemes` has been fully drained.
    while let p = phonemes.drain(..min(pack.as_ref().len(), phonemes.len()))
        && p.len() != 0
    {
        let phonemes = Array::from_shape_vec((1, p.len()), p.collect())?;
        // Style vector selected by this chunk's token count; chunking above
        // guarantees the index stays within the pack.
        let ref_s = pack.as_ref()[phonemes.len() - 1]
            .first()
            .cloned()
            .unwrap_or(vec![0.; 256]);
        let style = Array::from_shape_vec((1, ref_s.len()), ref_s)?;
        let speed = Array::from_vec(vec![speed]);
        let options = RunOptions::new()?;
        let mut model = model.lock().await;
        let t = SystemTime::now();
        let kokoro_output = model
            .run_async(
                inputs![
                    "input_ids" => TensorRef::from_array_view(&phonemes)?,
                    "style" => TensorRef::from_array_view(&style)?,
                    "speed" => TensorRef::from_array_view(&speed)?,
                ],
                &options,
            )?
            .await?;
        // Note: only the last chunk's inference time survives in `elapsed`.
        elapsed = t.elapsed()?;
        let (_, audio) = kokoro_output["waveform"].try_extract_tensor::<f32>()?;
        let (_, _duration) = kokoro_output["duration"].try_extract_tensor::<i64>()?;
        ret.extend_from_slice(audio);
    }
    Ok((ret, elapsed))
}
/// Front door for synthesis: converts text to phonemes, then dispatches to
/// the v1.0 or v1.1 model path based on the selected voice.
pub(super) async fn synth<P, S>(
    model: Weak<Mutex<Session>>,
    text: S,
    pack: P,
    voice: Voice,
) -> Result<(Vec<f32>, Duration), KokoroError>
where
    P: AsRef<Vec<Vec<Vec<f32>>>>,
    S: AsRef<str>,
{
    let phonemes = g2p(text.as_ref(), voice.is_v11_supported())?;
    if voice.is_v11_supported() {
        synth_v11(model, phonemes, pack, voice.get_speed_v11()?).await
    } else if voice.is_v10_supported() {
        synth_v10(model, phonemes, pack, voice.get_speed_v10()?).await
    } else {
        Err(KokoroError::VoiceVersionInvalid(voice.get_name().to_owned()))
    }
}

324
rust/vendor/kokoro-tts/src/tokenizer.rs vendored Normal file
View File

@@ -0,0 +1,324 @@
use {
log::warn,
std::{collections::HashMap, sync::LazyLock},
};
/// Kokoro v1.0 phoneme vocabulary: maps each phoneme/punctuation character
/// to its model token id. Characters absent from this table are skipped by
/// `get_token_ids` with a warning.
static VOCAB_V10: LazyLock<HashMap<char, u8>> = LazyLock::new(|| {
    let mut map = HashMap::new();
    // Punctuation and whitespace.
    map.insert(';', 1);
    map.insert(':', 2);
    map.insert(',', 3);
    map.insert('.', 4);
    map.insert('!', 5);
    map.insert('?', 6);
    map.insert('—', 9);
    map.insert('…', 10);
    map.insert('"', 11);
    map.insert('(', 12);
    map.insert(')', 13);
    map.insert('“', 14);
    map.insert('”', 15);
    map.insert(' ', 16);
    map.insert('\u{0303}', 17); // Unicode escape for combining tilde
    // Affricate digraphs and modifier symbols.
    map.insert('ʣ', 18);
    map.insert('ʥ', 19);
    map.insert('ʦ', 20);
    map.insert('ʨ', 21);
    map.insert('ᵝ', 22);
    map.insert('\u{AB67}', 23); // Unicode escape
    map.insert('A', 24);
    map.insert('I', 25);
    map.insert('O', 31);
    map.insert('Q', 33);
    map.insert('S', 35);
    map.insert('T', 36);
    map.insert('W', 39);
    map.insert('Y', 41);
    map.insert('ᵊ', 42);
    // Basic latin letters used as IPA symbols.
    map.insert('a', 43);
    map.insert('b', 44);
    map.insert('c', 45);
    map.insert('d', 46);
    map.insert('e', 47);
    map.insert('f', 48);
    map.insert('h', 50);
    map.insert('i', 51);
    map.insert('j', 52);
    map.insert('k', 53);
    map.insert('l', 54);
    map.insert('m', 55);
    map.insert('n', 56);
    map.insert('o', 57);
    map.insert('p', 58);
    map.insert('q', 59);
    map.insert('r', 60);
    map.insert('s', 61);
    map.insert('t', 62);
    map.insert('u', 63);
    map.insert('v', 64);
    map.insert('w', 65);
    map.insert('x', 66);
    map.insert('y', 67);
    map.insert('z', 68);
    // Extended IPA symbols.
    map.insert('ɑ', 69);
    map.insert('ɐ', 70);
    map.insert('ɒ', 71);
    map.insert('æ', 72);
    map.insert('β', 75);
    map.insert('ɔ', 76);
    map.insert('ɕ', 77);
    map.insert('ç', 78);
    map.insert('ɖ', 80);
    map.insert('ð', 81);
    map.insert('ʤ', 82);
    map.insert('ə', 83);
    map.insert('ɚ', 85);
    map.insert('ɛ', 86);
    map.insert('ɜ', 87);
    map.insert('ɟ', 90);
    map.insert('ɡ', 92);
    map.insert('ɥ', 99);
    map.insert('ɨ', 101);
    map.insert('ɪ', 102);
    map.insert('ʝ', 103);
    map.insert('ɯ', 110);
    map.insert('ɰ', 111);
    map.insert('ŋ', 112);
    map.insert('ɳ', 113);
    map.insert('ɲ', 114);
    map.insert('ɴ', 115);
    map.insert('ø', 116);
    map.insert('ɸ', 118);
    map.insert('θ', 119);
    map.insert('œ', 120);
    map.insert('ɹ', 123);
    map.insert('ɾ', 125);
    map.insert('ɻ', 126);
    map.insert('ʁ', 128);
    map.insert('ɽ', 129);
    map.insert('ʂ', 130);
    map.insert('ʃ', 131);
    map.insert('ʈ', 132);
    map.insert('ʧ', 133);
    map.insert('ʊ', 135);
    map.insert('ʋ', 136);
    map.insert('ʌ', 138);
    map.insert('ɣ', 139);
    map.insert('ɤ', 140);
    map.insert('χ', 142);
    map.insert('ʎ', 143);
    map.insert('ʒ', 147);
    map.insert('ʔ', 148);
    // Stress, length, secondary articulation, and tone arrows (see retone).
    map.insert('ˈ', 156);
    map.insert('ˌ', 157);
    map.insert('ː', 158);
    map.insert('ʰ', 162);
    map.insert('ʲ', 164);
    map.insert('↓', 169);
    map.insert('→', 171);
    map.insert('↗', 172);
    map.insert('↘', 173);
    map.insert('ᵻ', 177);
    map
});
/// Kokoro v1.1 phoneme vocabulary: like `VOCAB_V10`, but extended with
/// bopomofo (zhuyin) letters and CJK tone/contour markers, and with digit
/// characters repurposed as Mandarin tone marks.
static VOCAB_V11: LazyLock<HashMap<char, u8>> = LazyLock::new(|| {
    let mut map = HashMap::new();
    // Punctuation and whitespace.
    map.insert(';', 1);
    map.insert(':', 2);
    map.insert(',', 3);
    map.insert('.', 4);
    map.insert('!', 5);
    map.insert('?', 6);
    map.insert('/', 7);
    map.insert('—', 9);
    map.insert('…', 10);
    map.insert('"', 11);
    map.insert('(', 12);
    map.insert(')', 13);
    map.insert('“', 14);
    map.insert('”', 15);
    map.insert(' ', 16);
    map.insert('\u{0303}', 17); // Unicode escape for combining tilde
    map.insert('ʣ', 18);
    map.insert('ʥ', 19);
    map.insert('ʦ', 20);
    map.insert('ʨ', 21);
    map.insert('ᵝ', 22);
    map.insert('ㄓ', 23);
    map.insert('A', 24);
    map.insert('I', 25);
    map.insert('ㄅ', 30);
    map.insert('O', 31);
    map.insert('ㄆ', 32);
    map.insert('Q', 33);
    map.insert('R', 34);
    map.insert('S', 35);
    map.insert('T', 36);
    map.insert('ㄇ', 37);
    map.insert('ㄈ', 38);
    map.insert('W', 39);
    map.insert('ㄉ', 40);
    map.insert('Y', 41);
    map.insert('ᵊ', 42);
    map.insert('a', 43);
    map.insert('b', 44);
    map.insert('c', 45);
    map.insert('d', 46);
    map.insert('e', 47);
    map.insert('f', 48);
    map.insert('ㄊ', 49);
    map.insert('h', 50);
    map.insert('i', 51);
    map.insert('j', 52);
    map.insert('k', 53);
    map.insert('l', 54);
    map.insert('m', 55);
    map.insert('n', 56);
    map.insert('o', 57);
    map.insert('p', 58);
    map.insert('q', 59);
    map.insert('r', 60);
    map.insert('s', 61);
    map.insert('t', 62);
    map.insert('u', 63);
    map.insert('v', 64);
    map.insert('w', 65);
    map.insert('x', 66);
    map.insert('y', 67);
    map.insert('z', 68);
    map.insert('ɑ', 69);
    map.insert('ɐ', 70);
    map.insert('ɒ', 71);
    map.insert('æ', 72);
    map.insert('ㄋ', 73);
    map.insert('ㄌ', 74);
    map.insert('β', 75);
    map.insert('ɔ', 76);
    map.insert('ɕ', 77);
    map.insert('ç', 78);
    map.insert('ㄍ', 79);
    map.insert('ɖ', 80);
    map.insert('ð', 81);
    map.insert('ʤ', 82);
    map.insert('ə', 83);
    map.insert('ㄎ', 84);
    map.insert('ㄦ', 85);
    map.insert('ɛ', 86);
    map.insert('ɜ', 87);
    map.insert('ㄏ', 88);
    map.insert('ㄐ', 89);
    map.insert('ɟ', 90);
    map.insert('ㄑ', 91);
    map.insert('ɡ', 92);
    map.insert('ㄒ', 93);
    map.insert('ㄔ', 94);
    map.insert('ㄕ', 95);
    map.insert('ㄗ', 96);
    map.insert('ㄘ', 97);
    map.insert('ㄙ', 98);
    map.insert('月', 99);
    map.insert('ㄚ', 100);
    map.insert('ɨ', 101);
    map.insert('ɪ', 102);
    map.insert('ʝ', 103);
    map.insert('ㄛ', 104);
    map.insert('ㄝ', 105);
    map.insert('ㄞ', 106);
    map.insert('ㄟ', 107);
    map.insert('ㄠ', 108);
    map.insert('ㄡ', 109);
    map.insert('ɯ', 110);
    map.insert('ɰ', 111);
    map.insert('ŋ', 112);
    map.insert('ɳ', 113);
    map.insert('ɲ', 114);
    map.insert('ɴ', 115);
    map.insert('ø', 116);
    map.insert('ㄢ', 117);
    map.insert('ɸ', 118);
    map.insert('θ', 119);
    map.insert('œ', 120);
    map.insert('ㄣ', 121);
    map.insert('ㄤ', 122);
    map.insert('ɹ', 123);
    map.insert('ㄥ', 124);
    map.insert('ɾ', 125);
    map.insert('ㄖ', 126);
    map.insert('ㄧ', 127);
    map.insert('ʁ', 128);
    map.insert('ɽ', 129);
    map.insert('ʂ', 130);
    map.insert('ʃ', 131);
    map.insert('ʈ', 132);
    map.insert('ʧ', 133);
    map.insert('ㄨ', 134);
    map.insert('ʊ', 135);
    map.insert('ʋ', 136);
    map.insert('ㄩ', 137);
    map.insert('ʌ', 138);
    map.insert('ɣ', 139);
    map.insert('ㄜ', 140);
    map.insert('ㄭ', 141);
    map.insert('χ', 142);
    map.insert('ʎ', 143);
    // CJK characters used as contour/erhua markers by the v1.1 g2p output.
    map.insert('十', 144);
    map.insert('压', 145);
    map.insert('言', 146);
    map.insert('ʒ', 147);
    map.insert('ʔ', 148);
    map.insert('阳', 149);
    map.insert('要', 150);
    map.insert('阴', 151);
    map.insert('应', 152);
    map.insert('用', 153);
    map.insert('又', 154);
    map.insert('中', 155);
    map.insert('ˈ', 156);
    map.insert('ˌ', 157);
    map.insert('ː', 158);
    map.insert('穵', 159);
    map.insert('外', 160);
    map.insert('万', 161);
    map.insert('ʰ', 162);
    map.insert('王', 163);
    map.insert('ʲ', 164);
    map.insert('为', 165);
    map.insert('文', 166);
    map.insert('瓮', 167);
    map.insert('我', 168);
    // Digits act as Mandarin tone marks in v1.1 phoneme strings.
    map.insert('3', 169);
    map.insert('5', 170);
    map.insert('1', 171);
    map.insert('2', 172);
    map.insert('4', 173);
    map.insert('元', 175);
    map.insert('云', 176);
    map.insert('ᵻ', 177);
    map
});
/// Maps a phoneme string to model token ids, bracketed by the pad token 0.
///
/// Characters missing from the selected vocabulary are skipped with a
/// warning rather than failing the whole request.
pub fn get_token_ids(phonemes: &str, v11: bool) -> Vec<i64> {
    let mut ids = Vec::with_capacity(phonemes.len() + 2);
    ids.push(0);
    for ch in phonemes.chars() {
        let id = if v11 {
            VOCAB_V11.get(&ch).copied()
        } else {
            VOCAB_V10.get(&ch).copied()
        };
        if let Some(id) = id {
            ids.push(i64::from(id));
        } else {
            warn!("Unknown phone {}, skipped.", ch);
        }
    }
    ids.push(0);
    ids
}

View File

@@ -0,0 +1,4 @@
mod en;
mod zh;
pub use {en::*, zh::*};

View File

@@ -0,0 +1,147 @@
use regex::Regex;
use std::{collections::HashMap, sync::LazyLock};
/// IPA pronunciation of each English letter name, used to spell out words
/// letter by letter when no ARPAbet mapping exists.
/// NOTE(review): lowercase and uppercase entries are duplicated except for
/// 'a' (schwa-like "ɐ") vs 'A' (letter name "ˈA") — confirm that asymmetry
/// is intentional.
static LETTERS_IPA_MAP: LazyLock<HashMap<char, &'static str>> = LazyLock::new(|| {
    let mut map = HashMap::new();
    map.insert('a', "ɐ");
    map.insert('b', "bˈi");
    map.insert('c', "sˈi");
    map.insert('d', "dˈi");
    map.insert('e', "ˈi");
    map.insert('f', "ˈɛf");
    map.insert('g', "ʤˈi");
    map.insert('h', "ˈ");
    map.insert('i', "ˈI");
    map.insert('j', "ʤˈA");
    map.insert('k', "kˈA");
    map.insert('l', "ˈɛl");
    map.insert('m', "ˈɛm");
    map.insert('n', "ˈɛn");
    map.insert('o', "ˈO");
    map.insert('p', "pˈi");
    map.insert('q', "kjˈu");
    map.insert('r', "ˈɑɹ");
    map.insert('s', "ˈɛs");
    map.insert('t', "tˈi");
    map.insert('u', "jˈu");
    map.insert('v', "vˈi");
    map.insert('w', "dˈʌbᵊlju");
    map.insert('x', "ˈɛks");
    map.insert('y', "wˈI");
    map.insert('z', "zˈi");
    map.insert('A', "ˈA");
    map.insert('B', "bˈi");
    map.insert('C', "sˈi");
    map.insert('D', "dˈi");
    map.insert('E', "ˈi");
    map.insert('F', "ˈɛf");
    map.insert('G', "ʤˈi");
    map.insert('H', "ˈ");
    map.insert('I', "ˈI");
    map.insert('J', "ʤˈA");
    map.insert('K', "kˈA");
    map.insert('L', "ˈɛl");
    map.insert('M', "ˈɛm");
    map.insert('N', "ˈɛn");
    map.insert('O', "ˈO");
    map.insert('P', "pˈi");
    map.insert('Q', "kjˈu");
    map.insert('R', "ˈɑɹ");
    map.insert('S', "ˈɛs");
    map.insert('T', "tˈi");
    map.insert('U', "jˈu");
    map.insert('V', "vˈi");
    map.insert('W', "dˈʌbᵊlju");
    map.insert('X', "ˈɛks");
    map.insert('Y', "wˈI");
    map.insert('Z', "zˈi");
    map
});
/// ARPAbet phone → IPA mapping (CMUdict-style, stress digits handled
/// separately in `arpa_to_ipa`). "SIL" maps to the empty string.
/// NOTE(review): several entries ("AW", "CH", "JH", "OW") appear to map to
/// empty strings here, while their conventional IPA values would be aʊ, tʃ,
/// dʒ, oʊ — verify the values were not lost in transit.
static ARPA_IPA_MAP: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    let mut map = HashMap::new();
    map.insert("AA", "ɑ");
    map.insert("AE", "æ");
    map.insert("AH", "ə");
    map.insert("AO", "ɔ");
    map.insert("AW", "");
    map.insert("AY", "aɪ");
    map.insert("B", "b");
    map.insert("CH", "");
    map.insert("D", "d");
    map.insert("DH", "ð");
    map.insert("EH", "ɛ");
    map.insert("ER", "ɝ");
    map.insert("EY", "eɪ");
    map.insert("F", "f");
    map.insert("G", "ɡ");
    map.insert("HH", "h");
    map.insert("IH", "ɪ");
    map.insert("IY", "i");
    map.insert("JH", "");
    map.insert("K", "k");
    map.insert("L", "l");
    map.insert("M", "m");
    map.insert("N", "n");
    map.insert("NG", "ŋ");
    map.insert("OW", "");
    map.insert("OY", "ɔɪ");
    map.insert("P", "p");
    map.insert("R", "ɹ");
    map.insert("S", "s");
    map.insert("SH", "ʃ");
    map.insert("T", "t");
    map.insert("TH", "θ");
    map.insert("UH", "ʊ");
    map.insert("UW", "u");
    map.insert("V", "v");
    map.insert("W", "w");
    map.insert("Y", "j");
    map.insert("Z", "z");
    map.insert("ZH", "ʒ");
    map.insert("SIL", "");
    map
});
/// Special non-pulmonic symbols added in 2025: click ʘ, dental ǀ, lateral ǁ.
const SPECIAL_CASES: [(&str, &str); 3] = [("CLICK!", "ʘ"), ("TSK!", "ǀ"), ("TUT!", "ǁ")];
/// Converts a single ARPAbet phone (optionally carrying a trailing stress
/// digit, e.g. "AH0", "EY1") into IPA.
///
/// # Errors
///
/// Returns a `regex::Error` if the internal pattern fails to compile.
///
/// Fixed: phones without a stress digit previously had a NUL character
/// (`'\0'`) pushed into the output by the catch-all arm of the stress
/// match; unstressed phones now emit no stress mark at all.
pub fn arpa_to_ipa(arpa: &str) -> Result<String, regex::Error> {
    let re = Regex::new(r"([A-Z!]+)(\d*)")?;
    let Some(caps) = re.captures(arpa) else {
        return Ok(Default::default());
    };
    // Special click symbols (2025 additions) bypass the regular mapping.
    if let Some(sc) = SPECIAL_CASES.iter().find(|&&(s, _)| s == &caps[1]) {
        return Ok(sc.1.to_string());
    }
    // Look up the IPA mapping; unknown phones fall back to letter spelling.
    let phoneme = ARPA_IPA_MAP
        .get(&caps[1])
        .map_or_else(|| letters_to_ipa(arpa), |i| i.to_string());
    let mut result = String::with_capacity(arpa.len() * 2);
    // Stress digit → IPA stress mark (three levels supported).
    match &caps[2] {
        "1" => result.push('ˈ'),
        "2" => result.push('ˌ'),
        "3" => result.push('˧'), // mid-level stress (2025 addition)
        _ => {}                  // no stress digit: append nothing
    }
    result.push_str(&phoneme);
    Ok(result)
}
/// Spells a word out letter by letter in IPA (fallback for tokens without
/// an ARPAbet mapping); characters not in the letter map are dropped.
pub fn letters_to_ipa(letters: &str) -> String {
    letters
        .chars()
        .filter_map(|c| LETTERS_IPA_MAP.get(&c).copied())
        .collect()
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,364 @@
/// Conversion from Hanyu Pinyin to the International Phonetic Alphabet (IPA).
/// Ported with reference to `zh.py` from the Python `misaki` library.
use std::{collections::HashMap, error::Error, fmt, sync::LazyLock};
// The 37 finals (yùnmǔ) recognised by the converter, in their canonical
// restored spelling — e.g. "üan", "uen", "iou" rather than the surface
// forms "uan", "un", "iu".
const VALID_FINALS: [&str; 37] = [
    "i", "u", "ü", "a", "ia", "ua", "o", "uo", "e", "ie", "üe", "ai", "uai", "ei", "uei", "ao",
    "iao", "ou", "iou", "an", "ian", "uan", "üan", "en", "in", "uen", "ün", "ang", "iang", "uang",
    "eng", "ing", "ueng", "ong", "iong", "er", "ê",
];
// The 21 initials (shēngmǔ); the two-letter initials come first so prefix
// matching in `split_initial` prefers "zh"/"ch"/"sh" over "z"/"c"/"s".
const INITIALS: [&str; 21] = [
    "zh", "ch", "sh", "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s",
    "t", "x", "z",
];
// Error types for the pinyin → IPA conversion.
#[derive(Debug)]
pub enum PinyinError {
    // The syllable's final could not be matched against any known final;
    // carries the offending final for diagnostics.
    FinalNotFound(String),
}
impl fmt::Display for PinyinError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            PinyinError::FinalNotFound(tip) => write!(f, "Final not found: {}", tip),
        }
    }
}
impl Error for PinyinError {}
/// Initial (shēngmǔ) → candidate IPA realisations; each candidate is a
/// list of segments. Some initials (h, r) have more than one accepted
/// realisation.
/// NOTE(review): "k", "p", "t" and "zh" map to empty strings here while
/// every other initial has an IPA value — the expected aspirated/retroflex
/// symbols (kʰ, pʰ, tʰ, ꭧ) may have been lost; verify against misaki's
/// zh.py before relying on these entries.
static INITIAL_MAPPING: LazyLock<HashMap<&'static str, Vec<Vec<&'static str>>>> =
    LazyLock::new(|| {
        let mut map = HashMap::new();
        map.insert("b", vec![vec!["p"]]);
        map.insert("c", vec![vec!["ʦʰ"]]);
        map.insert("ch", vec![vec!["ꭧʰ"]]);
        map.insert("d", vec![vec!["t"]]);
        map.insert("f", vec![vec!["f"]]);
        map.insert("g", vec![vec!["k"]]);
        map.insert("h", vec![vec!["x"], vec!["h"]]);
        map.insert("j", vec![vec!["ʨ"]]);
        map.insert("k", vec![vec![""]]);
        map.insert("l", vec![vec!["l"]]);
        map.insert("m", vec![vec!["m"]]);
        map.insert("n", vec![vec!["n"]]);
        map.insert("p", vec![vec![""]]);
        map.insert("q", vec![vec!["ʨʰ"]]);
        map.insert("r", vec![vec!["ɻ"], vec!["ʐ"]]);
        map.insert("s", vec![vec!["s"]]);
        map.insert("sh", vec![vec!["ʂ"]]);
        map.insert("t", vec![vec![""]]);
        map.insert("x", vec![vec!["ɕ"]]);
        map.insert("z", vec![vec!["ʦ"]]);
        map.insert("zh", vec![vec![""]]);
        map
    });
/// Standalone syllabic consonants (interjection-like syllables such as
/// "hm", "ng"); the trailing '0' in each segment is the tone placeholder
/// later substituted by `apply_tone`.
static SYLLABIC_CONSONANT_MAPPINGS: LazyLock<HashMap<&'static str, Vec<Vec<&'static str>>>> =
    LazyLock::new(|| {
        let mut map = HashMap::new();
        map.insert("hm", vec![vec!["h", "m0"]]);
        map.insert("hng", vec![vec!["h", "ŋ0"]]);
        map.insert("m", vec![vec!["m0"]]);
        map.insert("n", vec![vec!["n0"]]);
        map.insert("ng", vec![vec!["ŋ0"]]);
        map
    });
/// Interjection syllables that bypass the initial/final split; the '0'
/// suffix is the tone placeholder substituted by `apply_tone`.
static INTERJECTION_MAPPINGS: LazyLock<HashMap<&'static str, Vec<Vec<&'static str>>>> =
    LazyLock::new(|| {
        let mut map = HashMap::new();
        map.insert("io", vec![vec!["j", "ɔ0"]]);
        map.insert("ê", vec![vec!["ɛ0"]]);
        map.insert("er", vec![vec!["ɚ0"], vec!["aɚ̯0"]]);
        map.insert("o", vec![vec!["ɔ0"]]);
        map
    });
/// Final (yùnmǔ) → IPA segments; the '0' suffix marks where the tone
/// contour is substituted by `apply_tone`.
/// Duanmu (2000, p. 37) and Lin (2007, p. 68f)
/// Diphtongs from Duanmu (2007, p. 40): au, əu, əi, ai
/// Diphthongs from Lin (2007, p. 68f): au̯, ou̯, ei̯, ai̯
static FINAL_MAPPING: LazyLock<HashMap<&'static str, Vec<Vec<&'static str>>>> =
    LazyLock::new(|| {
        let mut map = HashMap::new();
        map.insert("a", vec![vec!["a0"]]);
        map.insert("ai", vec![vec!["ai0"]]);
        map.insert("an", vec![vec!["a0", "n"]]);
        map.insert("ang", vec![vec!["a0", "ŋ"]]);
        map.insert("ao", vec![vec!["au0"]]);
        map.insert("e", vec![vec!["ɤ0"]]);
        map.insert("ei", vec![vec!["ei0"]]);
        map.insert("en", vec![vec!["ə0", "n"]]);
        map.insert("eng", vec![vec!["ə0", "ŋ"]]);
        map.insert("i", vec![vec!["i0"]]);
        map.insert("ia", vec![vec!["j", "a0"]]);
        map.insert("ian", vec![vec!["j", "ɛ0", "n"]]);
        map.insert("iang", vec![vec!["j", "a0", "ŋ"]]);
        map.insert("iao", vec![vec!["j", "au0"]]);
        map.insert("ie", vec![vec!["j", "e0"]]);
        map.insert("in", vec![vec!["i0", "n"]]);
        map.insert("iou", vec![vec!["j", "ou0"]]);
        map.insert("ing", vec![vec!["i0", "ŋ"]]);
        map.insert("iong", vec![vec!["j", "ʊ0", "ŋ"]]);
        map.insert("ong", vec![vec!["ʊ0", "ŋ"]]);
        map.insert("ou", vec![vec!["ou0"]]);
        map.insert("u", vec![vec!["u0"]]);
        map.insert("uei", vec![vec!["w", "ei0"]]);
        map.insert("ua", vec![vec!["w", "a0"]]);
        map.insert("uai", vec![vec!["w", "ai0"]]);
        map.insert("uan", vec![vec!["w", "a0", "n"]]);
        map.insert("uen", vec![vec!["w", "ə0", "n"]]);
        map.insert("uang", vec![vec!["w", "a0", "ŋ"]]);
        map.insert("ueng", vec![vec!["w", "ə0", "ŋ"]]);
        map.insert("ui", vec![vec!["w", "ei0"]]);
        map.insert("un", vec![vec!["w", "ə0", "n"]]);
        map.insert("uo", vec![vec!["w", "o0"]]);
        map.insert("o", vec![vec!["w", "o0"]]); // NOTE: mapping 'o' like 'uo' may not be what is expected; per the original note it may need special handling
        map.insert("ü", vec![vec!["y0"]]);
        map.insert("üe", vec![vec!["ɥ", "e0"]]);
        map.insert("üan", vec![vec!["ɥ", "ɛ0", "n"]]);
        map.insert("ün", vec![vec!["y0", "n"]]);
        map
    });
/// Override for the apical vowel "i" after the retroflex initials
/// zh/ch/sh/r (realised ɻ̩ / ʐ̩ rather than a plain i).
static FINAL_MAPPING_AFTER_ZH_CH_SH_R: LazyLock<HashMap<&'static str, Vec<Vec<&'static str>>>> =
    LazyLock::new(|| {
        let mut map = HashMap::new();
        map.insert("i", vec![vec!["ɻ0"], vec!["ʐ0"]]);
        map
    });
/// Override for the apical vowel "i" after the dental sibilant initials
/// z/c/s (realised ɹ̩ / z̩ rather than a plain i).
static FINAL_MAPPING_AFTER_Z_C_S: LazyLock<HashMap<&'static str, Vec<Vec<&'static str>>>> =
    LazyLock::new(|| {
        let mut map = HashMap::new();
        map.insert("i", vec![vec!["ɹ0"], vec!["z0"]]);
        map
    });
/// Tone number (1-5) → IPA tone-letter contour; the neutral tone (5) is
/// left unmarked. Substituted for the '0' placeholder by `apply_tone`.
static TONE_MAPPING: LazyLock<HashMap<u8, &'static str>> = LazyLock::new(|| {
    HashMap::from([
        (1u8, "˥"),
        (2u8, "˧˥"),
        (3u8, "˧˩˧"),
        (4u8, "˥˩"),
        (5u8, ""),
    ])
});
/// Splits a numbered-pinyin syllable into (base, tone). A trailing ASCII
/// digit is the tone number; syllables without one default to the neutral
/// tone 5.
pub(crate) fn split_tone(pinyin: &str) -> (&str, u8) {
    match pinyin.chars().last().and_then(|c| c.to_digit(10)) {
        // The digit is ASCII, so dropping one byte removes exactly it.
        Some(tone) => (&pinyin[..pinyin.len() - 1], tone as u8),
        None => (pinyin, 5),
    }
}
/// Restores the full "uen" final that surface pinyin abbreviates to "un"
/// after an initial — iou/uei/uen are written iu/ui/un when preceded by an
/// initial, e.g. niu (牛), gui (归), lun (论); so lun → luen.
fn convert_uen(s: &str) -> String {
    if let Some(stem) = s.strip_suffix('n') {
        if stem.ends_with(['u', 'ū', 'ú', 'ǔ', 'ù']) {
            return format!("{stem}en");
        }
    }
    s.to_string()
}
/// Restores the ü that surface pinyin writes as a plain "u" after j/q/x —
/// ju (居), qu (区), xu (虚) drop the umlaut dots, while nü (女) and lü (吕)
/// keep them. Tone-marked u variants are converted to the matching ü form.
fn convert_uv(pinyin: &str) -> String {
    let mut chars = pinyin.chars();
    let (Some(initial), Some(vowel)) = (chars.next(), chars.next()) else {
        return pinyin.to_string();
    };
    if !matches!(initial, 'j' | 'q' | 'x') {
        return pinyin.to_string();
    }
    let umlaut = match vowel {
        'u' => 'ü',
        'ū' => 'ǖ',
        'ú' => 'ǘ',
        'ǔ' => 'ǚ',
        'ù' => 'ǜ',
        _ => return pinyin.to_string(),
    };
    let tail: String = chars.collect();
    format!("{initial}{umlaut}{tail}")
}
/// Restores the full "iou" final that surface pinyin abbreviates to "iu"
/// after an initial, e.g. niu (牛) → niou.
///
/// Fixed: the previous implementation sliced at `pinyin.len() - 1` BYTES,
/// which panics when the trailing vowel is a multi-byte tone-marked
/// character (ū/ú/ǔ/ù) — characters the match explicitly admits. The slice
/// now uses the vowel's actual UTF-8 width.
fn convert_iou(pinyin: &str) -> String {
    let mut rev = pinyin.chars().rev();
    match (rev.next(), rev.next()) {
        (Some(u @ ('u' | 'ū' | 'ú' | 'ǔ' | 'ù')), Some('i')) => {
            // Everything before the final vowel, then insert the "o".
            let stem = &pinyin[..pinyin.len() - u.len_utf8()];
            format!("{stem}o{u}")
        }
        _ => pinyin.to_string(),
    }
}
/// Restores the full "uei" final that surface pinyin abbreviates to "ui"
/// after an initial, e.g. gui (归) → guei.
///
/// Fixed: the previous implementation sliced at `pinyin.len() - 1` BYTES,
/// which panics when the trailing vowel is a multi-byte tone-marked
/// character (ī/í/ǐ/ì) — characters the match explicitly admits. The slice
/// now uses the vowel's actual UTF-8 width.
fn convert_uei(pinyin: &str) -> String {
    let mut rev = pinyin.chars().rev();
    match (rev.next(), rev.next()) {
        (Some(i @ ('i' | 'ī' | 'í' | 'ǐ' | 'ì')), Some('u')) => {
            // Everything before the final vowel, then insert the "e".
            let stem = &pinyin[..pinyin.len() - i.len_utf8()];
            format!("{stem}e{i}")
        }
        _ => pinyin.to_string(),
    }
}
/// Zero-initial conversion: restores the underlying final for syllables
/// written with a dummy y/w initial.
/// - i-row finals with no initial are written yi (衣), ya (呀), ye (耶), yao (腰), you (忧), yan (烟), yin (因), yang (央), ying (英), yong (雍).
/// - u-row finals with no initial are written wu (乌), wa (蛙), wo (窝), wai (歪), wei (威), wan (弯), wen (温), wang (汪), weng (翁).
/// - ü-row finals with no initial are written yu (迂), yue (约), yuan (冤), yun (晕); the umlaut dots are dropped.
pub(crate) fn convert_zero_consonant(pinyin: &str) -> String {
    let mut buffer = String::with_capacity(pinyin.len() + 2);
    let chars: Vec<char> = pinyin.chars().collect();
    match chars.as_slice() {
        // y-row conversions: "yu…" restores ü.
        ['y', 'u', rest @ ..] => {
            buffer.push('ü');
            buffer.extend(rest.iter());
        }
        ['y', u @ ('ū' | 'ú' | 'ǔ' | 'ù'), rest @ ..] => {
            buffer.push(match u {
                'ū' => 'ǖ', // ü, tone 1
                'ú' => 'ǘ', // ü, tone 2
                'ǔ' => 'ǚ', // ü, tone 3
                'ù' => 'ǜ', // ü, tone 4
                _ => unreachable!(),
            });
            buffer.extend(rest.iter());
        }
        // "yi…" keeps the i (tone-marked or not) and drops the y.
        ['y', i @ ('i' | 'ī' | 'í' | 'ǐ' | 'ì'), rest @ ..] => {
            buffer.push(*i);
            buffer.extend(rest.iter());
        }
        // Bare "y…" becomes "i…".
        ['y', rest @ ..] => {
            buffer.push('i');
            buffer.extend(rest);
        }
        // w-row conversions: "wu…" keeps the u and drops the w.
        ['w', u @ ('u' | 'ū' | 'ú' | 'ǔ' | 'ù'), rest @ ..] => {
            buffer.push(*u);
            buffer.extend(rest.iter());
        }
        // Bare "w…" becomes "u…".
        ['w', rest @ ..] => {
            buffer.push('u');
            buffer.extend(rest);
        }
        // No dummy initial: nothing to convert.
        _ => return pinyin.to_string(),
    }
    // Validity check: only accept the conversion when it yields a known
    // final; otherwise fall back to the original spelling.
    if VALID_FINALS.contains(&buffer.as_str()) {
        buffer
    } else {
        pinyin.to_string()
    }
}
/// Splits a syllable into (initial, remainder); a syllable with no
/// recognised initial yields ("", whole syllable). Two-letter initials win
/// because they come first in `INITIALS`.
pub(crate) fn split_initial(pinyin: &str) -> (&'static str, &str) {
    INITIALS
        .iter()
        .find_map(|&ini| pinyin.strip_prefix(ini).map(|rest| (ini, rest)))
        .unwrap_or(("", pinyin))
}
/// Substitutes the tone contour from `TONE_MAPPING` for the '0' placeholder
/// in every segment of every candidate pronunciation; unknown tone numbers
/// substitute the empty string.
fn apply_tone(variants: &[Vec<&str>], tone: u8) -> Vec<Vec<String>> {
    let mark = TONE_MAPPING.get(&tone).copied().unwrap_or("");
    variants
        .iter()
        .map(|variant| variant.iter().map(|seg| seg.replace('0', mark)).collect())
        .collect()
}
/// Converts one numbered-pinyin syllable (e.g. "ni3") into all of its
/// candidate IPA segment sequences.
///
/// # Errors
///
/// Returns `PinyinError::FinalNotFound` when the syllable's final matches
/// no known final table.
///
/// Improvements over the previous version (outputs are unchanged):
/// - `apply_tone` on the final is hoisted out of the product loop — it was
///   recomputed once per initial variant;
/// - two identity `into_iter().collect()` round-trips were removed;
/// - the error and the default initial are now built lazily.
pub fn pinyin_to_ipa(pinyin: &str) -> Result<Vec<Vec<String>>, PinyinError> {
    let (pinyin, tone) = split_tone(pinyin);
    // Restore the canonical spelling before any table lookup.
    let pinyin = convert_zero_consonant(pinyin);
    let pinyin = convert_uv(&pinyin);
    let pinyin = convert_iou(&pinyin);
    let pinyin = convert_uei(&pinyin);
    let pinyin = convert_uen(&pinyin);
    // Syllabic consonants (hm, ng, …) and interjections map directly.
    if let Some(ipa) = SYLLABIC_CONSONANT_MAPPINGS.get(pinyin.as_str()) {
        return Ok(apply_tone(ipa, tone));
    }
    if let Some(ipa) = INTERJECTION_MAPPINGS.get(pinyin.as_str()) {
        return Ok(apply_tone(ipa, tone));
    }
    // Split into initial + final.
    let (initial_part, final_part) = split_initial(pinyin.as_str());
    // The final's realisation depends on the class of the initial.
    let final_ipa = match initial_part {
        "zh" | "ch" | "sh" | "r" if FINAL_MAPPING_AFTER_ZH_CH_SH_R.contains_key(final_part) => {
            FINAL_MAPPING_AFTER_ZH_CH_SH_R.get(final_part)
        }
        "z" | "c" | "s" if FINAL_MAPPING_AFTER_Z_C_S.contains_key(final_part) => {
            FINAL_MAPPING_AFTER_Z_C_S.get(final_part)
        }
        _ => FINAL_MAPPING.get(final_part),
    }
    .ok_or_else(|| PinyinError::FinalNotFound(final_part.to_owned()))?;
    // All pronunciations of the initial; a missing initial contributes one
    // empty leading segment (preserved from the original behavior).
    let initials: Vec<Vec<String>> = INITIAL_MAPPING.get(initial_part).map_or_else(
        || vec![vec![String::new()]],
        |variants| {
            variants
                .iter()
                .map(|v| v.iter().map(|s| s.to_string()).collect())
                .collect()
        },
    );
    // Hoisted: the toned final does not depend on the initial variant.
    let toned_finals = apply_tone(final_ipa, tone);
    // Cartesian product of initial variants × toned final variants.
    let mut result = Vec::with_capacity(initials.len() * toned_finals.len());
    for initial in &initials {
        for fin in &toned_finals {
            result.push(initial.iter().chain(fin.iter()).cloned().collect());
        }
    }
    Ok(result)
}

673
rust/vendor/kokoro-tts/src/voice.rs vendored Normal file
View File

@@ -0,0 +1,673 @@
use crate::KokoroError;
//noinspection SpellCheckingInspection
/// All bundled Kokoro voices. The payload is the speaking speed:
/// `f32` for v1.0 voices, `i32` for v1.1 (zh) voices.
#[derive(Copy, Clone, Debug)]
pub enum Voice {
    // v1.0 voices (f32 speed)
    ZmYunyang(f32),
    ZfXiaoni(f32),
    AfJessica(f32),
    BfLily(f32),
    ZfXiaobei(f32),
    ZmYunxia(f32),
    AfHeart(f32),
    BfEmma(f32),
    AmPuck(f32),
    BfAlice(f32),
    HfAlpha(f32),
    BfIsabella(f32),
    AfNova(f32),
    AmFenrir(f32),
    EmAlex(f32),
    ImNicola(f32),
    PmAlex(f32),
    AfAlloy(f32),
    ZmYunxi(f32),
    AfSarah(f32),
    JfNezumi(f32),
    BmDaniel(f32),
    JfTebukuro(f32),
    JfAlpha(f32),
    JmKumo(f32),
    EmSanta(f32),
    AmLiam(f32),
    AmSanta(f32),
    AmEric(f32),
    BmFable(f32),
    AfBella(f32),
    BmLewis(f32),
    PfDora(f32),
    AfNicole(f32),
    BmGeorge(f32),
    AmOnyx(f32),
    HmPsi(f32),
    HfBeta(f32),
    HmOmega(f32),
    ZfXiaoxiao(f32),
    FfSiwis(f32),
    EfDora(f32),
    AfAoede(f32),
    AmEcho(f32),
    AmMichael(f32),
    AfKore(f32),
    ZfXiaoyi(f32),
    JfGongitsune(f32),
    AmAdam(f32),
    IfSara(f32),
    AfSky(f32),
    PmSanta(f32),
    AfRiver(f32),
    ZmYunjian(f32),
    // v1.1 voices (i32 speed)
    Zm029(i32),
    Zf048(i32),
    Zf008(i32),
    Zm014(i32),
    Zf003(i32),
    Zf047(i32),
    Zm080(i32),
    Zf094(i32),
    Zf046(i32),
    Zm054(i32),
    Zf001(i32),
    Zm062(i32),
    BfVale(i32),
    Zf044(i32),
    Zf005(i32),
    Zf028(i32),
    Zf059(i32),
    Zm030(i32),
    Zf074(i32),
    Zm009(i32),
    Zf004(i32),
    Zf021(i32),
    Zm095(i32),
    Zm041(i32),
    Zf087(i32),
    Zf039(i32),
    Zm031(i32),
    Zf007(i32),
    Zf038(i32),
    Zf092(i32),
    Zm056(i32),
    Zf099(i32),
    Zm010(i32),
    Zm069(i32),
    Zm016(i32),
    Zm068(i32),
    Zf083(i32),
    Zf093(i32),
    Zf006(i32),
    Zf026(i32),
    Zm053(i32),
    Zm064(i32),
    AfSol(i32),
    Zf042(i32),
    Zf084(i32),
    Zf073(i32),
    Zf067(i32),
    Zm025(i32),
    Zm020(i32),
    Zm050(i32),
    Zf070(i32),
    Zf002(i32),
    Zf032(i32),
    Zm091(i32),
    Zm066(i32),
    Zm089(i32),
    Zm034(i32),
    Zm100(i32),
    Zf086(i32),
    Zf040(i32),
    Zm011(i32),
    Zm098(i32),
    Zm015(i32),
    Zf051(i32),
    Zm065(i32),
    Zf076(i32),
    Zf036(i32),
    Zm033(i32),
    Zf018(i32),
    Zf017(i32),
    Zf049(i32),
    AfMaple(i32),
    Zm082(i32),
    Zm057(i32),
    Zf079(i32),
    Zf022(i32),
    Zm063(i32),
    Zf060(i32),
    Zf019(i32),
    Zm097(i32),
    Zm096(i32),
    Zf023(i32),
    Zf027(i32),
    Zf085(i32),
    Zf077(i32),
    Zm035(i32),
    Zf088(i32),
    Zf024(i32),
    Zf072(i32),
    Zm055(i32),
    Zm052(i32),
    Zf071(i32),
    Zm061(i32),
    Zf078(i32),
    Zm013(i32),
    Zm081(i32),
    Zm037(i32),
    Zf090(i32),
    Zf043(i32),
    Zm058(i32),
    Zm012(i32),
    Zm045(i32),
    Zf075(i32),
}
impl Voice {
    /// Returns the canonical lowercase model identifier for this voice
    /// (e.g. `AfHeart` -> "af_heart", `Zm029` -> "zm_029"). These strings are
    /// the names used by the Kokoro voice-pack assets.
    //noinspection SpellCheckingInspection
    pub(super) fn get_name(&self) -> &str {
        match self {
            // Kokoro v1.0 voices (speed payload is f32).
            Self::ZmYunyang(_) => "zm_yunyang",
            Self::ZfXiaoni(_) => "zf_xiaoni",
            Self::AfJessica(_) => "af_jessica",
            Self::BfLily(_) => "bf_lily",
            Self::ZfXiaobei(_) => "zf_xiaobei",
            Self::ZmYunxia(_) => "zm_yunxia",
            Self::AfHeart(_) => "af_heart",
            Self::BfEmma(_) => "bf_emma",
            Self::AmPuck(_) => "am_puck",
            Self::BfAlice(_) => "bf_alice",
            Self::HfAlpha(_) => "hf_alpha",
            Self::BfIsabella(_) => "bf_isabella",
            Self::AfNova(_) => "af_nova",
            Self::AmFenrir(_) => "am_fenrir",
            Self::EmAlex(_) => "em_alex",
            Self::ImNicola(_) => "im_nicola",
            Self::PmAlex(_) => "pm_alex",
            Self::AfAlloy(_) => "af_alloy",
            Self::ZmYunxi(_) => "zm_yunxi",
            Self::AfSarah(_) => "af_sarah",
            Self::JfNezumi(_) => "jf_nezumi",
            Self::BmDaniel(_) => "bm_daniel",
            Self::JfTebukuro(_) => "jf_tebukuro",
            Self::JfAlpha(_) => "jf_alpha",
            Self::JmKumo(_) => "jm_kumo",
            Self::EmSanta(_) => "em_santa",
            Self::AmLiam(_) => "am_liam",
            Self::AmSanta(_) => "am_santa",
            Self::AmEric(_) => "am_eric",
            Self::BmFable(_) => "bm_fable",
            Self::AfBella(_) => "af_bella",
            Self::BmLewis(_) => "bm_lewis",
            Self::PfDora(_) => "pf_dora",
            Self::AfNicole(_) => "af_nicole",
            Self::BmGeorge(_) => "bm_george",
            Self::AmOnyx(_) => "am_onyx",
            Self::HmPsi(_) => "hm_psi",
            Self::HfBeta(_) => "hf_beta",
            Self::HmOmega(_) => "hm_omega",
            Self::ZfXiaoxiao(_) => "zf_xiaoxiao",
            Self::FfSiwis(_) => "ff_siwis",
            Self::EfDora(_) => "ef_dora",
            Self::AfAoede(_) => "af_aoede",
            Self::AmEcho(_) => "am_echo",
            Self::AmMichael(_) => "am_michael",
            Self::AfKore(_) => "af_kore",
            Self::ZfXiaoyi(_) => "zf_xiaoyi",
            Self::JfGongitsune(_) => "jf_gongitsune",
            Self::AmAdam(_) => "am_adam",
            Self::IfSara(_) => "if_sara",
            Self::AfSky(_) => "af_sky",
            Self::PmSanta(_) => "pm_santa",
            Self::AfRiver(_) => "af_river",
            Self::ZmYunjian(_) => "zm_yunjian",
            // Kokoro v1.1 voices (speed payload is i32).
            Self::Zm029(_) => "zm_029",
            Self::Zf048(_) => "zf_048",
            Self::Zf008(_) => "zf_008",
            Self::Zm014(_) => "zm_014",
            Self::Zf003(_) => "zf_003",
            Self::Zf047(_) => "zf_047",
            Self::Zm080(_) => "zm_080",
            Self::Zf094(_) => "zf_094",
            Self::Zf046(_) => "zf_046",
            Self::Zm054(_) => "zm_054",
            Self::Zf001(_) => "zf_001",
            Self::Zm062(_) => "zm_062",
            Self::BfVale(_) => "bf_vale",
            Self::Zf044(_) => "zf_044",
            Self::Zf005(_) => "zf_005",
            Self::Zf028(_) => "zf_028",
            Self::Zf059(_) => "zf_059",
            Self::Zm030(_) => "zm_030",
            Self::Zf074(_) => "zf_074",
            Self::Zm009(_) => "zm_009",
            Self::Zf004(_) => "zf_004",
            Self::Zf021(_) => "zf_021",
            Self::Zm095(_) => "zm_095",
            Self::Zm041(_) => "zm_041",
            Self::Zf087(_) => "zf_087",
            Self::Zf039(_) => "zf_039",
            Self::Zm031(_) => "zm_031",
            Self::Zf007(_) => "zf_007",
            Self::Zf038(_) => "zf_038",
            Self::Zf092(_) => "zf_092",
            Self::Zm056(_) => "zm_056",
            Self::Zf099(_) => "zf_099",
            Self::Zm010(_) => "zm_010",
            Self::Zm069(_) => "zm_069",
            Self::Zm016(_) => "zm_016",
            Self::Zm068(_) => "zm_068",
            Self::Zf083(_) => "zf_083",
            Self::Zf093(_) => "zf_093",
            Self::Zf006(_) => "zf_006",
            Self::Zf026(_) => "zf_026",
            Self::Zm053(_) => "zm_053",
            Self::Zm064(_) => "zm_064",
            Self::AfSol(_) => "af_sol",
            Self::Zf042(_) => "zf_042",
            Self::Zf084(_) => "zf_084",
            Self::Zf073(_) => "zf_073",
            Self::Zf067(_) => "zf_067",
            Self::Zm025(_) => "zm_025",
            Self::Zm020(_) => "zm_020",
            Self::Zm050(_) => "zm_050",
            Self::Zf070(_) => "zf_070",
            Self::Zf002(_) => "zf_002",
            Self::Zf032(_) => "zf_032",
            Self::Zm091(_) => "zm_091",
            Self::Zm066(_) => "zm_066",
            Self::Zm089(_) => "zm_089",
            Self::Zm034(_) => "zm_034",
            Self::Zm100(_) => "zm_100",
            Self::Zf086(_) => "zf_086",
            Self::Zf040(_) => "zf_040",
            Self::Zm011(_) => "zm_011",
            Self::Zm098(_) => "zm_098",
            Self::Zm015(_) => "zm_015",
            Self::Zf051(_) => "zf_051",
            Self::Zm065(_) => "zm_065",
            Self::Zf076(_) => "zf_076",
            Self::Zf036(_) => "zf_036",
            Self::Zm033(_) => "zm_033",
            Self::Zf018(_) => "zf_018",
            Self::Zf017(_) => "zf_017",
            Self::Zf049(_) => "zf_049",
            Self::AfMaple(_) => "af_maple",
            Self::Zm082(_) => "zm_082",
            Self::Zm057(_) => "zm_057",
            Self::Zf079(_) => "zf_079",
            Self::Zf022(_) => "zf_022",
            Self::Zm063(_) => "zm_063",
            Self::Zf060(_) => "zf_060",
            Self::Zf019(_) => "zf_019",
            Self::Zm097(_) => "zm_097",
            Self::Zm096(_) => "zm_096",
            Self::Zf023(_) => "zf_023",
            Self::Zf027(_) => "zf_027",
            Self::Zf085(_) => "zf_085",
            Self::Zf077(_) => "zf_077",
            Self::Zm035(_) => "zm_035",
            Self::Zf088(_) => "zf_088",
            Self::Zf024(_) => "zf_024",
            Self::Zf072(_) => "zf_072",
            Self::Zm055(_) => "zm_055",
            Self::Zm052(_) => "zm_052",
            Self::Zf071(_) => "zf_071",
            Self::Zm061(_) => "zm_061",
            Self::Zf078(_) => "zf_078",
            Self::Zm013(_) => "zm_013",
            Self::Zm081(_) => "zm_081",
            Self::Zm037(_) => "zm_037",
            Self::Zf090(_) => "zf_090",
            Self::Zf043(_) => "zf_043",
            Self::Zm058(_) => "zm_058",
            Self::Zm012(_) => "zm_012",
            Self::Zm045(_) => "zm_045",
            Self::Zf075(_) => "zf_075",
        }
    }
    /// True when this variant belongs to the Kokoro v1.0 voice set (f32 speed
    /// payload). Disjoint from `is_v11_supported`; together the two lists
    /// cover every enum variant.
    pub(super) fn is_v10_supported(&self) -> bool {
        matches!(
            self,
            Self::ZmYunyang(_)
                | Self::ZfXiaoni(_)
                | Self::AfJessica(_)
                | Self::BfLily(_)
                | Self::ZfXiaobei(_)
                | Self::ZmYunxia(_)
                | Self::AfHeart(_)
                | Self::BfEmma(_)
                | Self::AmPuck(_)
                | Self::BfAlice(_)
                | Self::HfAlpha(_)
                | Self::BfIsabella(_)
                | Self::AfNova(_)
                | Self::AmFenrir(_)
                | Self::EmAlex(_)
                | Self::ImNicola(_)
                | Self::PmAlex(_)
                | Self::AfAlloy(_)
                | Self::ZmYunxi(_)
                | Self::AfSarah(_)
                | Self::JfNezumi(_)
                | Self::BmDaniel(_)
                | Self::JfTebukuro(_)
                | Self::JfAlpha(_)
                | Self::JmKumo(_)
                | Self::EmSanta(_)
                | Self::AmLiam(_)
                | Self::AmSanta(_)
                | Self::AmEric(_)
                | Self::BmFable(_)
                | Self::AfBella(_)
                | Self::BmLewis(_)
                | Self::PfDora(_)
                | Self::AfNicole(_)
                | Self::BmGeorge(_)
                | Self::AmOnyx(_)
                | Self::HmPsi(_)
                | Self::HfBeta(_)
                | Self::HmOmega(_)
                | Self::ZfXiaoxiao(_)
                | Self::FfSiwis(_)
                | Self::EfDora(_)
                | Self::AfAoede(_)
                | Self::AmEcho(_)
                | Self::AmMichael(_)
                | Self::AfKore(_)
                | Self::ZfXiaoyi(_)
                | Self::JfGongitsune(_)
                | Self::AmAdam(_)
                | Self::IfSara(_)
                | Self::AfSky(_)
                | Self::PmSanta(_)
                | Self::AfRiver(_)
                | Self::ZmYunjian(_)
        )
    }
    /// True when this variant belongs to the Kokoro v1.1 voice set (i32 speed
    /// payload). Complement of `is_v10_supported`.
    pub(super) fn is_v11_supported(&self) -> bool {
        matches!(
            self,
            Self::Zm029(_)
                | Self::Zf048(_)
                | Self::Zf008(_)
                | Self::Zm014(_)
                | Self::Zf003(_)
                | Self::Zf047(_)
                | Self::Zm080(_)
                | Self::Zf094(_)
                | Self::Zf046(_)
                | Self::Zm054(_)
                | Self::Zf001(_)
                | Self::Zm062(_)
                | Self::BfVale(_)
                | Self::Zf044(_)
                | Self::Zf005(_)
                | Self::Zf028(_)
                | Self::Zf059(_)
                | Self::Zm030(_)
                | Self::Zf074(_)
                | Self::Zm009(_)
                | Self::Zf004(_)
                | Self::Zf021(_)
                | Self::Zm095(_)
                | Self::Zm041(_)
                | Self::Zf087(_)
                | Self::Zf039(_)
                | Self::Zm031(_)
                | Self::Zf007(_)
                | Self::Zf038(_)
                | Self::Zf092(_)
                | Self::Zm056(_)
                | Self::Zf099(_)
                | Self::Zm010(_)
                | Self::Zm069(_)
                | Self::Zm016(_)
                | Self::Zm068(_)
                | Self::Zf083(_)
                | Self::Zf093(_)
                | Self::Zf006(_)
                | Self::Zf026(_)
                | Self::Zm053(_)
                | Self::Zm064(_)
                | Self::AfSol(_)
                | Self::Zf042(_)
                | Self::Zf084(_)
                | Self::Zf073(_)
                | Self::Zf067(_)
                | Self::Zm025(_)
                | Self::Zm020(_)
                | Self::Zm050(_)
                | Self::Zf070(_)
                | Self::Zf002(_)
                | Self::Zf032(_)
                | Self::Zm091(_)
                | Self::Zm066(_)
                | Self::Zm089(_)
                | Self::Zm034(_)
                | Self::Zm100(_)
                | Self::Zf086(_)
                | Self::Zf040(_)
                | Self::Zm011(_)
                | Self::Zm098(_)
                | Self::Zm015(_)
                | Self::Zf051(_)
                | Self::Zm065(_)
                | Self::Zf076(_)
                | Self::Zf036(_)
                | Self::Zm033(_)
                | Self::Zf018(_)
                | Self::Zf017(_)
                | Self::Zf049(_)
                | Self::AfMaple(_)
                | Self::Zm082(_)
                | Self::Zm057(_)
                | Self::Zf079(_)
                | Self::Zf022(_)
                | Self::Zm063(_)
                | Self::Zf060(_)
                | Self::Zf019(_)
                | Self::Zm097(_)
                | Self::Zm096(_)
                | Self::Zf023(_)
                | Self::Zf027(_)
                | Self::Zf085(_)
                | Self::Zf077(_)
                | Self::Zm035(_)
                | Self::Zf088(_)
                | Self::Zf024(_)
                | Self::Zf072(_)
                | Self::Zm055(_)
                | Self::Zm052(_)
                | Self::Zf071(_)
                | Self::Zm061(_)
                | Self::Zf078(_)
                | Self::Zm013(_)
                | Self::Zm081(_)
                | Self::Zm037(_)
                | Self::Zf090(_)
                | Self::Zf043(_)
                | Self::Zm058(_)
                | Self::Zm012(_)
                | Self::Zm045(_)
                | Self::Zf075(_)
        )
    }
    /// Extracts the f32 speed payload of a v1.0 voice.
    ///
    /// # Errors
    /// Returns `KokoroError::VoiceVersionInvalid` when called on a v1.1
    /// variant (the catch-all `_` arm matches exactly the v1.1 set, since the
    /// or-pattern above lists every v1.0 variant).
    pub(super) fn get_speed_v10(&self) -> Result<f32, KokoroError> {
        match self {
            Self::ZmYunyang(v)
            | Self::ZfXiaoni(v)
            | Self::AfJessica(v)
            | Self::BfLily(v)
            | Self::ZfXiaobei(v)
            | Self::ZmYunxia(v)
            | Self::AfHeart(v)
            | Self::BfEmma(v)
            | Self::AmPuck(v)
            | Self::BfAlice(v)
            | Self::HfAlpha(v)
            | Self::BfIsabella(v)
            | Self::AfNova(v)
            | Self::AmFenrir(v)
            | Self::EmAlex(v)
            | Self::ImNicola(v)
            | Self::PmAlex(v)
            | Self::AfAlloy(v)
            | Self::ZmYunxi(v)
            | Self::AfSarah(v)
            | Self::JfNezumi(v)
            | Self::BmDaniel(v)
            | Self::JfTebukuro(v)
            | Self::JfAlpha(v)
            | Self::JmKumo(v)
            | Self::EmSanta(v)
            | Self::AmLiam(v)
            | Self::AmSanta(v)
            | Self::AmEric(v)
            | Self::BmFable(v)
            | Self::AfBella(v)
            | Self::BmLewis(v)
            | Self::PfDora(v)
            | Self::AfNicole(v)
            | Self::BmGeorge(v)
            | Self::AmOnyx(v)
            | Self::HmPsi(v)
            | Self::HfBeta(v)
            | Self::HmOmega(v)
            | Self::ZfXiaoxiao(v)
            | Self::FfSiwis(v)
            | Self::EfDora(v)
            | Self::AfAoede(v)
            | Self::AmEcho(v)
            | Self::AmMichael(v)
            | Self::AfKore(v)
            | Self::ZfXiaoyi(v)
            | Self::JfGongitsune(v)
            | Self::AmAdam(v)
            | Self::IfSara(v)
            | Self::AfSky(v)
            | Self::PmSanta(v)
            | Self::AfRiver(v)
            | Self::ZmYunjian(v) => Ok(*v),
            _ => Err(KokoroError::VoiceVersionInvalid(
                "Expect version 1.0".to_owned(),
            )),
        }
    }
    /// Extracts the i32 speed payload of a v1.1 voice.
    ///
    /// # Errors
    /// Returns `KokoroError::VoiceVersionInvalid` when called on a v1.0
    /// variant (the catch-all `_` arm matches exactly the v1.0 set).
    pub(super) fn get_speed_v11(&self) -> Result<i32, KokoroError> {
        match self {
            Self::Zm029(v)
            | Self::Zf048(v)
            | Self::Zf008(v)
            | Self::Zm014(v)
            | Self::Zf003(v)
            | Self::Zf047(v)
            | Self::Zm080(v)
            | Self::Zf094(v)
            | Self::Zf046(v)
            | Self::Zm054(v)
            | Self::Zf001(v)
            | Self::Zm062(v)
            | Self::BfVale(v)
            | Self::Zf044(v)
            | Self::Zf005(v)
            | Self::Zf028(v)
            | Self::Zf059(v)
            | Self::Zm030(v)
            | Self::Zf074(v)
            | Self::Zm009(v)
            | Self::Zf004(v)
            | Self::Zf021(v)
            | Self::Zm095(v)
            | Self::Zm041(v)
            | Self::Zf087(v)
            | Self::Zf039(v)
            | Self::Zm031(v)
            | Self::Zf007(v)
            | Self::Zf038(v)
            | Self::Zf092(v)
            | Self::Zm056(v)
            | Self::Zf099(v)
            | Self::Zm010(v)
            | Self::Zm069(v)
            | Self::Zm016(v)
            | Self::Zm068(v)
            | Self::Zf083(v)
            | Self::Zf093(v)
            | Self::Zf006(v)
            | Self::Zf026(v)
            | Self::Zm053(v)
            | Self::Zm064(v)
            | Self::AfSol(v)
            | Self::Zf042(v)
            | Self::Zf084(v)
            | Self::Zf073(v)
            | Self::Zf067(v)
            | Self::Zm025(v)
            | Self::Zm020(v)
            | Self::Zm050(v)
            | Self::Zf070(v)
            | Self::Zf002(v)
            | Self::Zf032(v)
            | Self::Zm091(v)
            | Self::Zm066(v)
            | Self::Zm089(v)
            | Self::Zm034(v)
            | Self::Zm100(v)
            | Self::Zf086(v)
            | Self::Zf040(v)
            | Self::Zm011(v)
            | Self::Zm098(v)
            | Self::Zm015(v)
            | Self::Zf051(v)
            | Self::Zm065(v)
            | Self::Zf076(v)
            | Self::Zf036(v)
            | Self::Zm033(v)
            | Self::Zf018(v)
            | Self::Zf017(v)
            | Self::Zf049(v)
            | Self::AfMaple(v)
            | Self::Zm082(v)
            | Self::Zm057(v)
            | Self::Zf079(v)
            | Self::Zf022(v)
            | Self::Zm063(v)
            | Self::Zf060(v)
            | Self::Zf019(v)
            | Self::Zm097(v)
            | Self::Zm096(v)
            | Self::Zf023(v)
            | Self::Zf027(v)
            | Self::Zf085(v)
            | Self::Zf077(v)
            | Self::Zm035(v)
            | Self::Zf088(v)
            | Self::Zf024(v)
            | Self::Zf072(v)
            | Self::Zm055(v)
            | Self::Zm052(v)
            | Self::Zf071(v)
            | Self::Zm061(v)
            | Self::Zf078(v)
            | Self::Zm013(v)
            | Self::Zm081(v)
            | Self::Zm037(v)
            | Self::Zf090(v)
            | Self::Zf043(v)
            | Self::Zm058(v)
            | Self::Zm012(v)
            | Self::Zm045(v)
            | Self::Zf075(v) => Ok(*v),
            _ => Err(KokoroError::VoiceVersionInvalid(
                "Expect version 1.1".to_owned(),
            )),
        }
    }
}

View File

@@ -3,6 +3,6 @@
*/
export const commitinfo = {
name: 'siprouter',
version: '1.25.1',
version: '1.25.2',
description: 'undefined'
}

View File

@@ -28,6 +28,24 @@ export function registerProxyEventHandlers(options: IRegisterProxyEventHandlersO
onCloseWebRtcSession,
} = options;
const legMediaDetails = (data: {
codec?: string | null;
remoteMedia?: string | null;
rtpPort?: number | null;
}): string => {
const parts: string[] = [];
if (data.codec) {
parts.push(`codec=${data.codec}`);
}
if (data.remoteMedia) {
parts.push(`remote=${data.remoteMedia}`);
}
if (data.rtpPort !== undefined && data.rtpPort !== null) {
parts.push(`rtp=${data.rtpPort}`);
}
return parts.length ? ` ${parts.join(' ')}` : '';
};
onProxyEvent('provider_registered', (data) => {
const previous = statusStore.noteProviderRegistered(data);
if (previous) {
@@ -128,7 +146,9 @@ export function registerProxyEventHandlers(options: IRegisterProxyEventHandlersO
});
onProxyEvent('leg_added', (data) => {
log(`[leg] added: call=${data.call_id} leg=${data.leg_id} kind=${data.kind} state=${data.state}`);
log(
`[leg] added: call=${data.call_id} leg=${data.leg_id} kind=${data.kind} state=${data.state}${legMediaDetails(data)}`,
);
statusStore.noteLegAdded(data);
});
@@ -138,7 +158,9 @@ export function registerProxyEventHandlers(options: IRegisterProxyEventHandlersO
});
onProxyEvent('leg_state_changed', (data) => {
log(`[leg] state: call=${data.call_id} leg=${data.leg_id} -> ${data.state}`);
log(
`[leg] state: call=${data.call_id} leg=${data.leg_id} -> ${data.state}${legMediaDetails(data)}`,
);
statusStore.noteLegStateChanged(data);
});

View File

@@ -213,6 +213,10 @@ export class StatusStore {
legs: [...call.legs.values()].map((leg) => ({
id: leg.id,
type: leg.type,
state: leg.state,
codec: leg.codec,
rtpPort: leg.rtpPort,
remoteMedia: leg.remoteMedia,
metadata: leg.metadata || {},
})),
});
@@ -255,6 +259,15 @@ export class StatusStore {
const existingLeg = call.legs.get(data.leg_id);
if (existingLeg) {
existingLeg.state = data.state;
if (data.codec !== undefined) {
existingLeg.codec = data.codec;
}
if (data.rtpPort !== undefined) {
existingLeg.rtpPort = data.rtpPort;
}
if (data.remoteMedia !== undefined) {
existingLeg.remoteMedia = data.remoteMedia;
}
if (data.metadata) {
existingLeg.metadata = data.metadata;
}
@@ -265,9 +278,9 @@ export class StatusStore {
id: data.leg_id,
type: this.inferLegType(data.leg_id),
state: data.state,
codec: null,
rtpPort: null,
remoteMedia: null,
codec: data.codec ?? null,
rtpPort: data.rtpPort ?? null,
remoteMedia: data.remoteMedia ?? null,
metadata: data.metadata || {},
});
}

View File

@@ -80,6 +80,9 @@ export interface ILegStateChangedEvent {
call_id: string;
leg_id: string;
state: string;
codec?: string | null;
rtpPort?: number | null;
remoteMedia?: string | null;
metadata?: Record<string, unknown>;
}

View File

@@ -43,7 +43,11 @@ export interface IActiveCall {
export interface IHistoryLeg {
id: string;
type: string;
type: TLegType;
state: string;
codec: string | null;
rtpPort: number | null;
remoteMedia: string | null;
metadata: Record<string, unknown>;
}

View File

@@ -3,6 +3,6 @@
*/
export const commitinfo = {
name: 'siprouter',
version: '1.25.1',
version: '1.25.2',
description: 'undefined'
}

View File

@@ -32,8 +32,32 @@ const LEG_TYPE_LABELS: Record<string, string> = {
'sip-device': 'SIP Device',
'sip-provider': 'SIP Provider',
'webrtc': 'WebRTC',
'tool': 'Tool',
};
function renderHistoryLegs(legs: ICallHistoryEntry['legs']): TemplateResult {
if (!legs.length) {
return html`<span style="color:#64748b">-</span>`;
}
return html`
<div style="display:flex;flex-direction:column;gap:6px;font-size:.72rem;line-height:1.35;">
${legs.map(
(leg) => html`
<div>
<span class="badge" style="${legTypeBadgeStyle(leg.type)}">${LEG_TYPE_LABELS[leg.type] || leg.type}</span>
<span style="margin-left:6px;font-family:'JetBrains Mono',monospace;">${leg.codec || '--'}</span>
<span style="margin-left:6px;color:#94a3b8;">${STATE_LABELS[leg.state] || leg.state}</span>
${leg.remoteMedia
? html`<span style="display:block;color:#64748b;font-family:'JetBrains Mono',monospace;">${leg.remoteMedia}</span>`
: ''}
</div>
`,
)}
</div>
`;
}
function directionIcon(dir: string): string {
if (dir === 'inbound') return '\u2199';
if (dir === 'outbound') return '\u2197';
@@ -226,8 +250,8 @@ export class SipproxyViewCalls extends DeesElement {
`,
];
connectedCallback() {
super.connectedCallback();
async connectedCallback(): Promise<void> {
await super.connectedCallback();
this.rxSubscriptions.push({
unsubscribe: appState.subscribe((s) => {
this.appData = s;
@@ -490,6 +514,11 @@ export class SipproxyViewCalls extends DeesElement {
renderer: (val: number) =>
html`<span style="font-family:'JetBrains Mono',monospace;font-size:.75rem">${fmtDuration(val)}</span>`,
},
{
key: 'legs',
header: 'Legs',
renderer: (val: ICallHistoryEntry['legs']) => renderHistoryLegs(val),
},
];
}
@@ -551,9 +580,7 @@ export class SipproxyViewCalls extends DeesElement {
</span>
</td>
<td>
${leg.remoteMedia
? `${leg.remoteMedia.address}:${leg.remoteMedia.port}`
: '--'}
${leg.remoteMedia || '--'}
</td>
<td>${leg.rtpPort ?? '--'}</td>
<td>

View File

@@ -18,6 +18,12 @@ interface IVoicemailMessage {
heard: boolean;
}
interface IVoiceboxRow {
id: string;
unheardCount: number;
selected: boolean;
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
@@ -61,19 +67,6 @@ export class SipproxyViewVoicemail extends DeesElement {
.view-section {
margin-bottom: 24px;
}
.box-selector {
display: flex;
align-items: center;
gap: 12px;
margin-bottom: 24px;
}
.box-selector label {
font-size: 0.85rem;
font-weight: 600;
color: #94a3b8;
text-transform: uppercase;
letter-spacing: 0.04em;
}
.audio-player {
display: flex;
align-items: center;
@@ -135,10 +128,11 @@ export class SipproxyViewVoicemail extends DeesElement {
const cfg = await appState.apiGetConfig();
const boxes: { id: string }[] = cfg.voiceboxes || [];
this.voiceboxIds = boxes.map((b) => b.id);
if (this.voiceboxIds.length > 0 && !this.selectedBoxId) {
this.selectedBoxId = this.voiceboxIds[0];
await this.loadMessages();
}
const nextSelectedBoxId = this.voiceboxIds.includes(this.selectedBoxId)
? this.selectedBoxId
: (this.voiceboxIds[0] || '');
this.selectedBoxId = nextSelectedBoxId;
await this.loadMessages();
} catch {
// Config unavailable.
}
@@ -161,11 +155,22 @@ export class SipproxyViewVoicemail extends DeesElement {
}
private async selectBox(boxId: string) {
if (boxId === this.selectedBoxId) {
return;
}
this.selectedBoxId = boxId;
this.stopAudio();
await this.loadMessages();
}
private getVoiceboxRows(): IVoiceboxRow[] {
return this.voiceboxIds.map((id) => ({
id,
unheardCount: this.appData.voicemailCounts[id] || 0,
selected: id === this.selectedBoxId,
}));
}
// ---- audio playback ------------------------------------------------------
private playMessage(msg: IVoicemailMessage) {
@@ -341,6 +346,43 @@ export class SipproxyViewVoicemail extends DeesElement {
];
}
private getVoiceboxColumns() {
return [
{
key: 'id',
header: 'Voicebox',
sortable: true,
renderer: (val: string, row: IVoiceboxRow) => html`
<div style="display:flex;align-items:center;gap:10px;">
<span style="font-family:'JetBrains Mono',monospace;font-size:.85rem;">${val}</span>
${row.selected ? html`
<span style="display:inline-block;padding:2px 8px;border-radius:4px;font-size:.7rem;font-weight:600;text-transform:uppercase;background:#1e3a5f;color:#60a5fa">Viewing</span>
` : ''}
</div>
`,
},
{
key: 'unheardCount',
header: 'Unheard',
sortable: true,
renderer: (val: number) => {
const hasUnheard = val > 0;
return html`
<span style="display:inline-block;padding:2px 8px;border-radius:4px;font-size:.75rem;font-weight:600;background:${hasUnheard ? '#422006' : '#1f2937'};color:${hasUnheard ? '#f59e0b' : '#94a3b8'}">${val}</span>
`;
},
},
{
key: 'selected',
header: 'Status',
value: (row: IVoiceboxRow) => (row.selected ? 'Open' : 'Available'),
renderer: (val: string, row: IVoiceboxRow) => html`
<span style="color:${row.selected ? '#60a5fa' : '#94a3b8'};font-size:.8rem;">${val}</span>
`,
},
];
}
// ---- table actions -------------------------------------------------------
private getDataActions() {
@@ -390,21 +432,43 @@ export class SipproxyViewVoicemail extends DeesElement {
];
}
private getVoiceboxActions() {
return [
{
name: 'View Messages',
iconName: 'lucide:folder-open',
type: ['inRow'] as any,
actionFunc: async ({ item }: { item: IVoiceboxRow }) => {
await this.selectBox(item.id);
},
},
{
name: 'Refresh Boxes',
iconName: 'lucide:refreshCw',
type: ['header'] as any,
actionFunc: async () => {
await this.loadVoiceboxes();
deesCatalog.DeesToast.success('Voiceboxes refreshed');
},
},
];
}
// ---- render --------------------------------------------------------------
public render(): TemplateResult {
return html`
${this.voiceboxIds.length > 1 ? html`
<div class="box-selector">
<label>Voicebox</label>
<dees-input-dropdown
.key=${'voicebox'}
.selectedOption=${{ option: this.selectedBoxId, key: this.selectedBoxId }}
.options=${this.voiceboxIds.map((id) => ({ option: id, key: id }))}
@selectedOption=${(e: CustomEvent) => { this.selectBox(e.detail.key); }}
></dees-input-dropdown>
</div>
` : ''}
<div class="view-section">
<dees-table
heading1="Voiceboxes"
heading2="${this.voiceboxIds.length} configured"
dataName="voiceboxes"
.data=${this.getVoiceboxRows()}
.rowKey=${'id'}
.columns=${this.getVoiceboxColumns()}
.dataActions=${this.getVoiceboxActions()}
></dees-table>
</div>
<div class="view-section">
<dees-statsgrid