# siprouter/rust/vendor/kokoro-tts/g2p.py
import re
from typing import List, Optional, Tuple
from jieba import posseg, cut_for_search
from pypinyin import lazy_pinyin, load_phrases_dict, Style
from dataclasses import dataclass

@dataclass
class MToken:
    tag: str
    whitespace: str
    phonemes: Optional[str] = None
# pinyin initials/finals -> Zhuyin-style phoneme symbols; single hanzi stand in for compound finals
ZH_MAP = {"b":"ㄅ","p":"ㄆ","m":"ㄇ","f":"ㄈ","d":"ㄉ","t":"ㄊ","n":"ㄋ","l":"ㄌ","g":"ㄍ","k":"ㄎ","h":"ㄏ","j":"ㄐ","q":"ㄑ","x":"ㄒ","zh":"ㄓ","ch":"ㄔ","sh":"ㄕ","r":"ㄖ","z":"ㄗ","c":"ㄘ","s":"ㄙ","a":"ㄚ","o":"ㄛ","e":"ㄜ","ie":"ㄝ","ai":"ㄞ","ei":"ㄟ","ao":"ㄠ","ou":"ㄡ","an":"ㄢ","en":"ㄣ","ang":"ㄤ","eng":"ㄥ","er":"ㄦ","i":"ㄧ","u":"ㄨ","v":"ㄩ","ii":"ㄭ","iii":"十","ve":"月","ia":"压","ian":"言","iang":"阳","iao":"要","in":"阴","ing":"应","iong":"用","iou":"又","ong":"中","ua":"穵","uai":"外","uan":"万","uang":"王","uei":"为","uen":"文","ueng":"瓮","uo":"我","van":"元","vn":"云"}
for p in ';:,.!?/—…"()“” 12345R':
    assert p not in ZH_MAP, p
    ZH_MAP[p] = p
unk = '❓'
punc = frozenset(';:,.!?—…"()“”')
phrases_dict = {
    '开户行': [['ka1i'], ['hu4'], ['hang2']],
    '发卡行': [['fa4'], ['ka3'], ['hang2']],
    '放款行': [['fa4ng'], ['kua3n'], ['hang2']],
    '茧行': [['jia3n'], ['hang2']],
    '行号': [['hang2'], ['ha4o']],
    '各地': [['ge4'], ['di4']],
    '借还款': [['jie4'], ['hua2n'], ['kua3n']],
    '时间为': [['shi2'], ['jia1n'], ['we2i']],
    '为准': [['we2i'], ['zhu3n']],
    '色差': [['se4'], ['cha1']],
    '嗲': [['dia3']],
    '呗': [['bei5']],
    '不': [['bu4']],
    '咗': [['zuo5']],
    '嘞': [['lei5']],
    '掺和': [['chan1'], ['huo5']]
}
must_erhua = {
"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"
}
must_not_neural_tone_words = {
'男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
'幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得',
'耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打',
'考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青'
}
must_neural_tone_words = {
'麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝',
'难为', '队伍', '阔气', '闺女', '门道', '锄头', '铺盖', '铃铛', '铁匠', '钥匙', '里脊',
'里头', '部分', '那么', '道士', '造化', '迷糊', '连累', '这么', '这个', '运气', '过去',
'软和', '转悠', '踏实', '跳蚤', '跟头', '趔趄', '财主', '豆腐', '讲究', '记性', '记号',
'认识', '规矩', '见识', '裁缝', '补丁', '衣裳', '衣服', '衙门', '街坊', '行李', '行当',
'蛤蟆', '蘑菇', '薄荷', '葫芦', '葡萄', '萝卜', '荸荠', '苗条', '苗头', '苍蝇', '芝麻',
'舒服', '舒坦', '舌头', '自在', '膏药', '脾气', '脑袋', '脊梁', '能耐', '胳膊', '胭脂',
'胡萝', '胡琴', '胡同', '聪明', '耽误', '耽搁', '耷拉', '耳朵', '老爷', '老实', '老婆',
'戏弄', '将军', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂',
'精神', '粮食', '簸箕', '篱笆', '算计', '算盘', '答应', '笤帚', '笑语', '笑话', '窟窿',
'窝囊', '窗户', '稳当', '稀罕', '称呼', '秧歌', '秀气', '秀才', '福气', '祖宗', '砚台',
'码头', '石榴', '石头', '石匠', '知识', '眼睛', '眯缝', '眨巴', '眉毛', '相声', '盘算',
'白净', '痢疾', '痛快', '疟疾', '疙瘩', '疏忽', '畜生', '生意', '甘蔗', '琵琶', '琢磨',
'琉璃', '玻璃', '玫瑰', '玄乎', '狐狸', '状元', '特务', '牲口', '牙碜', '牌楼', '爽快',
'爱人', '热闹', '烧饼', '烟筒', '烂糊', '点心', '炊帚', '灯笼', '火候', '漂亮', '滑溜',
'溜达', '温和', '清楚', '消息', '浪头', '活泼', '比方', '正经', '欺负', '模糊', '槟榔',
'棺材', '棒槌', '棉花', '核桃', '栅栏', '柴火', '架势', '枕头', '枇杷', '机灵', '本事',
'木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾',
'收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼',
'抬举', '护士', '折腾', '扫帚', '打量', '打算', '打扮', '打听', '打发', '扎实', '扁担',
'戒指', '懒得', '意识', '意思', '悟性', '怪物', '思量', '怎么', '念头', '念叨', '别人',
'快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', '干事',
'帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', '屁股',
'尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', '实在',
'官司', '学问', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈', '姑娘', '姐夫',
'妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方', '大意', '大夫',
'多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴', '嘱咐', '嘟囔',
'嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦', '咳嗽', '和尚',
'告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝', '叫唤', '口袋',
'厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹', '功夫', '力气',
'前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息', '凑合', '凉快',
'冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤', '佩服', '作坊',
'体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家', '交情', '云彩',
'事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故', '不由', '下水',
'下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨', '父亲', '母亲', '咕噜',
'邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', '幸福', '熟悉', '计划',
'扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', '凤凰', '拖沓', '寒碜',
'糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记'
}
not_erhua = {
"虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿",
"拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿",
"流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿",
"孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿",
"狗儿", "少儿"
}
BU = '不'
YI = '一'
X_ENG = frozenset(['x', 'eng'])

# g2p
load_phrases_dict(phrases_dict)
def get_initials_finals(word: str) -> Tuple[List[str], List[str]]:
    """
    Get word initials and finals via pypinyin.
    """
    initials = []
    finals = []
    orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
    orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
    print(orig_initials, orig_finals)
    # since pypinyin==0.44.0, '嗯' must be mapped to n2, because the initial and
    # final can no longer both be empty
    en_index = [index for index, c in enumerate(word) if c == "嗯"]
    for i in en_index:
        orig_finals[i] = "n2"
    for c, v in zip(orig_initials, orig_finals):
        if re.match(r'i\d', v):
            if c in ['z', 'c', 's']:
                # zi, ci, si
                v = re.sub('i', 'ii', v)
            elif c in ['zh', 'ch', 'sh', 'r']:
                # zhi, chi, shi
                v = re.sub('i', 'iii', v)
        initials.append(c)
        finals.append(v)
    return initials, finals
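
# Illustrative sketch (exact output depends on the installed pypinyin dictionary):
#   get_initials_finals('你好') -> (['n', 'h'], ['i3', 'ao3'])
#   get_initials_finals('是')   -> (['sh'], ['iii4'])   # 'i' after sh becomes 'iii'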
def merge_erhua(initials: List[str], finals: List[str], word: str, pos: str) -> Tuple[List[str], List[str]]:
    """
    Do erhua (rhotacization) merging.
    """
    # fix er1
    for i, phn in enumerate(finals):
        if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1':
            finals[i] = 'er2'
    # cases where 儿 is pronounced as a full syllable: return unchanged
    if word not in must_erhua and (word in not_erhua or pos in {"a", "j", "nr"}):
        return initials, finals
    # return directly for cases like "……" where lengths don't line up
    if len(finals) != len(word):
        return initials, finals
    assert len(finals) == len(word)
    # 儿 not pronounced as its own syllable: fold it into the previous final as 'R'
    new_initials = []
    new_finals = []
    for i, phn in enumerate(finals):
        if i == len(finals) - 1 and word[i] == "儿" and phn in {"er2", "er5"} and word[-2:] not in not_erhua and new_finals:
            new_finals[-1] = new_finals[-1][:-1] + "R" + new_finals[-1][-1]
        else:
            new_initials.append(initials[i])
            new_finals.append(phn)
    return new_initials, new_finals
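
# Illustrative sketch (hypothetical inputs, as produced by get_initials_finals):
#   merge_erhua(['f', ''], ['an4', 'er2'], '范儿', 'n') -> (['f'], ['anR4'])
#   '花儿' is in not_erhua, so it would be returned unchanged instead.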
# merge "不" and the word behind it
# if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
def merge_bu(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
for i, (word, pos) in enumerate(seg):
if pos not in X_ENG:
last_word = None
if i > 0:
last_word, _ = seg[i - 1]
if last_word == BU:
word = last_word + word
next_pos = None
if i + 1 < len(seg):
_, next_pos = seg[i + 1]
if word != BU or next_pos is None or next_pos in X_ENG:
new_seg.append((word, pos))
return new_seg
# function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听"
# function 2: merge single "一" and the word behind it
# if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error
# e.g.
# input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
# output seg: [['听一听', 'v']]
def merge_yi(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
skip_next = False
# function 1
for i, (word, pos) in enumerate(seg):
if skip_next:
skip_next = False
continue
if i - 1 >= 0 and word == YI and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v" and seg[i + 1][1] not in X_ENG:
new_seg[-1] = (new_seg[-1][0] + YI + seg[i + 1][0], new_seg[-1][1])
skip_next = True
else:
new_seg.append((word, pos))
seg = new_seg
new_seg = []
# function 2
for i, (word, pos) in enumerate(seg):
if new_seg and new_seg[-1][0] == YI and pos not in X_ENG:
new_seg[-1] = (new_seg[-1][0] + word, new_seg[-1][1])
else:
new_seg.append((word, pos))
return new_seg
def merge_reduplication(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    new_seg = []
    for i, (word, pos) in enumerate(seg):
        if new_seg and word == new_seg[-1][0] and pos not in X_ENG:
            new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
        else:
            new_seg.append([word, pos])
    return new_seg

def is_reduplication(word: str) -> bool:
    return len(word) == 2 and word[0] == word[1]
# merge if both the first and the second word are all_tone_three
def merge_continuous_three_tones(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    new_seg = []
    sub_finals_list = []
    for (word, pos) in seg:
        if pos in X_ENG:
            sub_finals_list.append(['0'])
            continue
        orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        # since pypinyin==0.44.0, '嗯' must be mapped to n2, because the initial and
        # final can no longer both be empty
        en_index = [index for index, c in enumerate(word) if c == "嗯"]
        for i in en_index:
            orig_finals[i] = "n2"
        sub_finals_list.append(orig_finals)
    assert len(sub_finals_list) == len(seg)
    merge_last = [False] * len(seg)
    for i, (word, pos) in enumerate(seg):
        if pos not in X_ENG and i - 1 >= 0 and all_tone_three(sub_finals_list[i - 1]) and all_tone_three(sub_finals_list[i]) and not merge_last[i - 1]:
            # if the last word is a reduplication, don't merge: reduplications need neural_sandhi
            if not is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
                new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                merge_last[i] = True
            else:
                new_seg.append([word, pos])
        else:
            new_seg.append([word, pos])
    return new_seg
# merge if the last char of the first word and the first char of the second word are tone three
def merge_continuous_three_tones_2(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    new_seg = []
    sub_finals_list = []
    for (word, pos) in seg:
        if pos in X_ENG:
            sub_finals_list.append(['0'])
            continue
        orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
        # since pypinyin==0.44.0, '嗯' must be mapped to n2, because the initial and
        # final can no longer both be empty
        en_index = [index for index, c in enumerate(word) if c == "嗯"]
        for i in en_index:
            orig_finals[i] = "n2"
        sub_finals_list.append(orig_finals)
    assert len(sub_finals_list) == len(seg)
    merge_last = [False] * len(seg)
    for i, (word, pos) in enumerate(seg):
        if pos not in X_ENG and i - 1 >= 0 and sub_finals_list[i - 1][-1][-1] == "3" and sub_finals_list[i][0][-1] == "3" and not merge_last[i - 1]:
            # if the last word is a reduplication, don't merge: reduplications need neural_sandhi
            if not is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
                new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                merge_last[i] = True
            else:
                new_seg.append([word, pos])
        else:
            new_seg.append([word, pos])
    return new_seg
def merge_er(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    new_seg = []
    for i, (word, pos) in enumerate(seg):
        if i - 1 >= 0 and word == "儿" and new_seg[-1][1] not in X_ENG:
            new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
        else:
            new_seg.append([word, pos])
    return new_seg
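
# Illustrative sketch (hypothetical segmentation):
#   merge_er([('头', 'n'), ('儿', 'n')]) -> [['头儿', 'n']]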
def pre_merge_for_modify(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """
    seg: [(word, pos), ...]
    """
    seg = merge_bu(seg)
    seg = merge_yi(seg)
    seg = merge_reduplication(seg)
    seg = merge_continuous_three_tones(seg)
    seg = merge_continuous_three_tones_2(seg)
    return merge_er(seg)
def bu_sandhi(word: str, finals: List[str]) -> List[str]:
    # e.g. 看不懂
    if len(word) == 3 and word[1] == BU:
        finals[1] = finals[1][:-1] + "5"
    else:
        for i, char in enumerate(word):
            # "不" before tone 4 should be bu2, e.g. 不怕
            if char == BU and i + 1 < len(word) and finals[i + 1][-1] == "4":
                finals[i] = finals[i][:-1] + "2"
    return finals
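
# Illustrative sketch (hypothetical finals, as produced by get_initials_finals):
#   bu_sandhi('不怕', ['u4', 'a4'])            -> ['u2', 'a4']            # 不 -> bu2 before tone 4
#   bu_sandhi('看不懂', ['an4', 'u4', 'ong3']) -> ['an4', 'u5', 'ong3']   # medial 不 -> neutral tone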
def yi_sandhi(word: str, finals: List[str]) -> List[str]:
    # "一" in number sequences, e.g. 一零零, 二一零
    if word.find(YI) != -1 and all(
            [item.isnumeric() for item in word if item != YI]):
        return finals
    # "一" between reduplicated words should be yi5, e.g. 看一看
    elif len(word) == 3 and word[1] == YI and word[0] == word[-1]:
        finals[1] = finals[1][:-1] + "5"
    # when "一" is an ordinal, it should be yi1
    elif word.startswith("第一"):
        finals[1] = finals[1][:-1] + "1"
    else:
        for i, char in enumerate(word):
            if char == YI and i + 1 < len(word):
                # "一" before tone 4 should be yi2, e.g. 一段
                if finals[i + 1][-1] in {'4', '5'}:
                    finals[i] = finals[i][:-1] + "2"
                # "一" before a non-tone-4 syllable should be yi4, e.g. 一天
                else:
                    # if "一" is followed by punctuation, it keeps tone 1
                    if word[i + 1] not in punc:
                        finals[i] = finals[i][:-1] + "4"
    return finals
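
# Illustrative sketch (hypothetical finals):
#   yi_sandhi('一天', ['i1', 'ian1'])          -> ['i4', 'ian1']          # 一 -> yi4 before non-tone-4
#   yi_sandhi('看一看', ['an4', 'i1', 'an4'])  -> ['an4', 'i5', 'an4']    # 一 between reduplication -> neutral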
def split_word(word: str) -> List[str]:
    word_list = cut_for_search(word)
    word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
    first_subword = word_list[0]
    first_begin_idx = word.find(first_subword)
    if first_begin_idx == 0:
        second_subword = word[len(first_subword):]
        new_word_list = [first_subword, second_subword]
    else:
        second_subword = word[:-len(first_subword)]
        new_word_list = [second_subword, first_subword]
    return new_word_list
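
# Illustrative sketch (the actual split depends on jieba's dictionary):
#   split_word('纸老虎') might return ['纸', '老虎']: the shortest sub-word that
#   cut_for_search finds anchors the split, and the remainder forms the other half.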
# the meaning of jieba pos tags: https://blog.csdn.net/weixin_44174352/article/details/113731041
# e.g.
# word: "家里"
# pos: "s"
# finals: ['ia1', 'i3']
def neural_sandhi(word: str, pos: str, finals: List[str]) -> List[str]:
    if word in must_not_neural_tone_words:
        return finals
    # reduplicated words for n., v. and a., e.g. 奶奶, 试试, 旺旺
    for j, item in enumerate(word):
        if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}:
            finals[j] = finals[j][:-1] + "5"
    ge_idx = word.find("个")
    if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒滴哩哟喽啰耶喔诶":
        finals[-1] = finals[-1][:-1] + "5"
    elif len(word) >= 1 and word[-1] in "的地得":
        finals[-1] = finals[-1][:-1] + "5"
    # e.g. 走了, 看着, 去过
    elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
        finals[-1] = finals[-1][:-1] + "5"
    elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"}:
        finals[-1] = finals[-1][:-1] + "5"
    # e.g. 桌上, 地下
    elif len(word) > 1 and word[-1] in "上下" and pos in {"s", "l", "f"}:
        finals[-1] = finals[-1][:-1] + "5"
    # e.g. 上来, 下去
    elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
        finals[-1] = finals[-1][:-1] + "5"
    # 个 as a measure word
    elif (ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")) or word == '个':
        finals[ge_idx] = finals[ge_idx][:-1] + "5"
    else:
        if word in must_neural_tone_words or word[-2:] in must_neural_tone_words:
            finals[-1] = finals[-1][:-1] + "5"
    word_list = split_word(word)
    finals_list = [finals[:len(word_list[0])], finals[len(word_list[0]):]]
    for i, word in enumerate(word_list):
        # conventional neutral tone in Chinese
        if word in must_neural_tone_words or word[-2:] in must_neural_tone_words:
            finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
    finals = sum(finals_list, [])
    return finals
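
# Illustrative sketch (hypothetical finals; the final split is jieba-dependent):
#   neural_sandhi('石头', 'n', ['iii2', 'ou2']) -> ['iii2', 'ou5']   # 石头 is in must_neural_tone_words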
def all_tone_three(finals: List[str]) -> bool:
    return all(x[-1] == "3" for x in finals)
def three_sandhi(word: str, finals: List[str]) -> List[str]:
    if len(word) == 2 and all_tone_three(finals):
        finals[0] = finals[0][:-1] + "2"
    elif len(word) == 3:
        word_list = split_word(word)
        if all_tone_three(finals):
            # disyllabic + monosyllabic, e.g. 蒙古/包
            if len(word_list[0]) == 2:
                finals[0] = finals[0][:-1] + "2"
                finals[1] = finals[1][:-1] + "2"
            # monosyllabic + disyllabic, e.g. 纸/老虎
            elif len(word_list[0]) == 1:
                finals[1] = finals[1][:-1] + "2"
        else:
            finals_list = [finals[:len(word_list[0])], finals[len(word_list[0]):]]
            if len(finals_list) == 2:
                for i, sub in enumerate(finals_list):
                    # e.g. 所有/人
                    if all_tone_three(sub) and len(sub) == 2:
                        finals_list[i][0] = finals_list[i][0][:-1] + "2"
                    # e.g. 好/喜欢
                    elif i == 1 and not all_tone_three(sub) and finals_list[i][0][-1] == "3" and finals_list[0][-1][-1] == "3":
                        finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
                finals = sum(finals_list, [])
    # split an idiom into two words whose length is 2
    elif len(word) == 4:
        finals_list = [finals[:2], finals[2:]]
        finals = []
        for sub in finals_list:
            if all_tone_three(sub):
                sub[0] = sub[0][:-1] + "2"
            finals += sub
    return finals
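
# Illustrative sketch (hypothetical finals):
#   three_sandhi('老虎', ['ao3', 'u3']) -> ['ao2', 'u3']   # tone 3 + tone 3 -> tone 2 + tone 3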
def modified_tone(word: str, pos: str, finals: List[str]) -> List[str]:
    """
    word: segmented word
    pos: part-of-speech tag
    finals: toned finals, [final1, ..., finaln]
    """
    finals = bu_sandhi(word, finals)
    finals = yi_sandhi(word, finals)
    finals = neural_sandhi(word, pos, finals)
    return three_sandhi(word, finals)
def g2p(text: str, with_erhua: bool = True) -> str:
    """
    Return: string of phonemes.
    'ㄋㄧ2ㄏㄠ3/ㄕ十4ㄐㄝ4'
    """
    tokens = []
    seg_cut = posseg.lcut(text)
    # fix word-segmentation bad cases for sandhi
    seg_cut = pre_merge_for_modify(seg_cut)
    # predict over the whole segmented sentence for better polyphone results
    initials = []
    finals = []
    # pypinyin, g2pM
    for word, pos in seg_cut:
        if pos == 'x' and '\u4E00' <= min(word) and max(word) <= '\u9FFF':
            pos = 'X'
        elif pos != 'x' and word in punc:
            pos = 'x'
        tk = MToken(tag=pos, whitespace='')
        if pos in X_ENG:
            if not word.isspace():
                if pos == 'x' and word in punc:
                    tk.phonemes = word
                    tokens.append(tk)
                elif tokens:
                    tokens[-1].whitespace += word
            continue
        elif tokens and tokens[-1].tag not in X_ENG and not tokens[-1].whitespace:
            tokens[-1].whitespace = '/'
        # g2p
        sub_initials, sub_finals = get_initials_finals(word)
        # tone sandhi
        sub_finals = modified_tone(word, pos, sub_finals)
        # erhua
        if with_erhua:
            sub_initials, sub_finals = merge_erhua(sub_initials, sub_finals, word, pos)
        initials.append(sub_initials)
        finals.append(sub_finals)
        # assert len(sub_initials) == len(sub_finals) == len(word)
        # sum(iterable[, start])
        # initials = sum(initials, [])
        # finals = sum(finals, [])
        phones = []
        for c, v in zip(sub_initials, sub_finals):
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c:
                phones.append(c)
            # replace punctuation by ` `
            # if c and c in punc:
            #     phones.append(c)
            if v and (v not in punc or v != c):  # and v not in rhy_phns:
                phones.append(v)
        phones = '_'.join(phones).replace('_eR', '_er').replace('R', '_R')
        phones = re.sub(r'(?=\d)', '_', phones).split('_')
        print(phones)
        tk.phonemes = ''.join(ZH_MAP.get(p, unk) for p in phones)
        tokens.append(tk)
    return ''.join((unk if tk.phonemes is None else tk.phonemes) + tk.whitespace for tk in tokens)
print(g2p('时间为。Hello, world!你好我们是一群追逐梦想的人。我正在使用qq。忽略卢驴'))
seg = posseg.lcut('不好看', True)
print(seg, merge_bu(seg))
seg = merge_bu(posseg.lcut('听一听一个', True))
print(seg, merge_yi(seg))
seg = merge_bu(posseg.lcut('谢谢谢谢', True))
print(seg, merge_reduplication(seg))
seg = merge_bu(posseg.lcut('小美好', True))
print(seg, merge_continuous_three_tones(seg))
seg = merge_bu(posseg.lcut('风景好', True))
print(seg, merge_continuous_three_tones_2(seg))