515 lines
24 KiB
Python
515 lines
24 KiB
Python
import re
|
||
from typing import List, Optional, Tuple
|
||
from jieba import posseg, cut_for_search
|
||
from pypinyin import lazy_pinyin, load_phrases_dict, Style
|
||
from dataclasses import dataclass
|
||
|
||
@dataclass
class MToken:
    """One segment of the input text together with its phoneme rendering.

    Instances are created by ``g2p`` for each segment it emits.
    """

    # Part-of-speech tag of the segment (``g2p`` may rewrite it, e.g. to 'x'
    # for punctuation).
    tag: str
    # Text appended after the phonemes in the final output: whitespace taken
    # from the input, or '/' as a separator between consecutive words.
    whitespace: str
    # Phoneme string; while it is None, ``g2p`` renders the unknown marker.
    phonemes: Optional[str] = None
|
||
|
||
# Maps each pinyin initial/final to a single output symbol: Bopomofo letters
# for the simple units and one Chinese character for each compound final.
ZH_MAP = {"b":"ㄅ","p":"ㄆ","m":"ㄇ","f":"ㄈ","d":"ㄉ","t":"ㄊ","n":"ㄋ","l":"ㄌ","g":"ㄍ","k":"ㄎ","h":"ㄏ","j":"ㄐ","q":"ㄑ","x":"ㄒ","zh":"ㄓ","ch":"ㄔ","sh":"ㄕ","r":"ㄖ","z":"ㄗ","c":"ㄘ","s":"ㄙ","a":"ㄚ","o":"ㄛ","e":"ㄜ","ie":"ㄝ","ai":"ㄞ","ei":"ㄟ","ao":"ㄠ","ou":"ㄡ","an":"ㄢ","en":"ㄣ","ang":"ㄤ","eng":"ㄥ","er":"ㄦ","i":"ㄧ","u":"ㄨ","v":"ㄩ","ii":"ㄭ","iii":"十","ve":"月","ia":"压","ian":"言","iang":"阳","iao":"要","in":"阴","ing":"应","iong":"用","iou":"又","ong":"中","ua":"穵","uai":"外","uan":"万","uang":"王","uei":"为","uen":"文","ueng":"瓮","uo":"我","van":"元","vn":"云"}
# Punctuation, the space, the tone digits 1-5 and the erhua marker 'R' map to
# themselves; the assert guards against clashing with a pinyin key above.
for p in ';:,.!?/—…"()“” 12345R':
    assert p not in ZH_MAP, p
    ZH_MAP[p] = p

# Marker emitted for anything that has no entry in ZH_MAP / no phonemes.
unk = '❓'
# Punctuation characters that g2p passes through unchanged.
punc = frozenset(';:,.!?—…"()“”')
|
||
# Custom phrase pronunciations, registered with pypinyin through
# load_phrases_dict() further down in this module. Each phrase maps to one
# single-element list per character; the tone digit is written inside the
# syllable (e.g. 'ka1i').
# NOTE(review): presumably pypinyin's tone-after-vowel notation -- confirm.
phrases_dict = {
    '开户行': [['ka1i'], ['hu4'], ['hang2']],
    '发卡行': [['fa4'], ['ka3'], ['hang2']],
    '放款行': [['fa4ng'], ['kua3n'], ['hang2']],
    '茧行': [['jia3n'], ['hang2']],
    '行号': [['hang2'], ['ha4o']],
    '各地': [['ge4'], ['di4']],
    '借还款': [['jie4'], ['hua2n'], ['kua3n']],
    '时间为': [['shi2'], ['jia1n'], ['we2i']],
    '为准': [['we2i'], ['zhu3n']],
    '色差': [['se4'], ['cha1']],
    '嗲': [['dia3']],
    '呗': [['bei5']],
    '不': [['bu4']],
    '咗': [['zuo5']],
    '嘞': [['lei5']],
    '掺和': [['chan1'], ['huo5']]
}
|
||
# Words for which merge_erhua never takes its early "leave 儿 as a separate
# syllable" exit, regardless of the not_erhua set or the POS tag.
must_erhua = {
    "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"
}
|
||
# Words that neural_sandhi returns untouched (no neutral-tone rewriting).
must_not_neural_tone_words = {
    '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
    '幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得',
    '耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打',
    '考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青'
}
|
||
# Words whose last syllable neural_sandhi rewrites to tone 5 (neutral tone).
# The lookup is done on the whole word and on its last two characters.
must_neural_tone_words = {
    '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝',
    '难为', '队伍', '阔气', '闺女', '门道', '锄头', '铺盖', '铃铛', '铁匠', '钥匙', '里脊',
    '里头', '部分', '那么', '道士', '造化', '迷糊', '连累', '这么', '这个', '运气', '过去',
    '软和', '转悠', '踏实', '跳蚤', '跟头', '趔趄', '财主', '豆腐', '讲究', '记性', '记号',
    '认识', '规矩', '见识', '裁缝', '补丁', '衣裳', '衣服', '衙门', '街坊', '行李', '行当',
    '蛤蟆', '蘑菇', '薄荷', '葫芦', '葡萄', '萝卜', '荸荠', '苗条', '苗头', '苍蝇', '芝麻',
    '舒服', '舒坦', '舌头', '自在', '膏药', '脾气', '脑袋', '脊梁', '能耐', '胳膊', '胭脂',
    '胡萝', '胡琴', '胡同', '聪明', '耽误', '耽搁', '耷拉', '耳朵', '老爷', '老实', '老婆',
    '戏弄', '将军', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂',
    '精神', '粮食', '簸箕', '篱笆', '算计', '算盘', '答应', '笤帚', '笑语', '笑话', '窟窿',
    '窝囊', '窗户', '稳当', '稀罕', '称呼', '秧歌', '秀气', '秀才', '福气', '祖宗', '砚台',
    '码头', '石榴', '石头', '石匠', '知识', '眼睛', '眯缝', '眨巴', '眉毛', '相声', '盘算',
    '白净', '痢疾', '痛快', '疟疾', '疙瘩', '疏忽', '畜生', '生意', '甘蔗', '琵琶', '琢磨',
    '琉璃', '玻璃', '玫瑰', '玄乎', '狐狸', '状元', '特务', '牲口', '牙碜', '牌楼', '爽快',
    '爱人', '热闹', '烧饼', '烟筒', '烂糊', '点心', '炊帚', '灯笼', '火候', '漂亮', '滑溜',
    '溜达', '温和', '清楚', '消息', '浪头', '活泼', '比方', '正经', '欺负', '模糊', '槟榔',
    '棺材', '棒槌', '棉花', '核桃', '栅栏', '柴火', '架势', '枕头', '枇杷', '机灵', '本事',
    '木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾',
    '收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼',
    '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打扮', '打听', '打发', '扎实', '扁担',
    '戒指', '懒得', '意识', '意思', '悟性', '怪物', '思量', '怎么', '念头', '念叨', '别人',
    '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', '干事',
    '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', '屁股',
    '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', '实在',
    '官司', '学问', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈', '姑娘', '姐夫',
    '妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方', '大意', '大夫',
    '多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴', '嘱咐', '嘟囔',
    '嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦', '咳嗽', '和尚',
    '告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝', '叫唤', '口袋',
    '厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹', '功夫', '力气',
    '前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息', '凑合', '凉快',
    '冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤', '佩服', '作坊',
    '体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家', '交情', '云彩',
    '事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故', '不由', '下水',
    '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨', '父亲', '母亲', '咕噜',
    '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', '幸福', '熟悉', '计划',
    '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', '凤凰', '拖沓', '寒碜',
    '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记'
}
|
||
# Words ending in 儿 for which merge_erhua keeps 儿 as its own syllable
# (checked against the whole word and against its last two characters).
not_erhua = {
    "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿",
    "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿",
    "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿",
    "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿",
    "狗儿", "少儿"
}
|
||
# Characters that get dedicated merge and sandhi handling below.
BU = '不'
YI = '一'
# POS tags whose segments bypass the pinyin pipeline ('x' and 'eng').
X_ENG = frozenset(['x', 'eng'])

# g2p
# Module-level side effect: registers the custom phrase pronunciations with
# pypinyin at import time.
load_phrases_dict(phrases_dict)
|
||
|
||
def get_initials_finals(word: str) -> Tuple[List[str], List[str]]:
    """Return the pinyin initials and tone-numbered finals of *word*.

    Both lists come from pypinyin's ``lazy_pinyin`` (``Style.INITIALS`` and
    ``Style.FINALS_TONE3`` with neutral tone written as 5) and are paired
    item by item.

    The final ``i`` is disambiguated by its initial:
    ``z/c/s`` + ``i`` becomes ``ii``; ``zh/ch/sh/r`` + ``i`` becomes ``iii``.

    Args:
        word: The segment to convert.

    Returns:
        ``(initials, finals)`` as two lists of equal length.
    """
    orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
    orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)

    # after pypinyin==0.44.0, '嗯' need to be n2, cause the initial and final
    # consonants cannot be empty at the same time.
    # The bounds check avoids an IndexError when pypinyin returns fewer items
    # than there are characters in the word.
    for i, char in enumerate(word):
        if char == "嗯" and i < len(orig_finals):
            orig_finals[i] = "n2"

    initials = []
    finals = []
    for c, v in zip(orig_initials, orig_finals):
        if re.match(r'i\d', v):
            if c in ('z', 'c', 's'):
                # zi, ci, si
                v = v.replace('i', 'ii')
            elif c in ('zh', 'ch', 'sh', 'r'):
                # zhi, chi, shi
                v = v.replace('i', 'iii')
        initials.append(c)
        finals.append(v)

    return initials, finals
|
||
|
||
def merge_erhua(initials: List[str], finals: List[str], word: str, pos: str) -> Tuple[List[str], List[str]]:
    """Apply erhua: fold a trailing 儿 into the preceding syllable.

    A trailing 儿 pronounced ``er1`` is first normalised to ``er2`` (in place
    on *finals*). The inputs are returned as-is when the word must keep 儿 as
    a separate syllable (listed in ``not_erhua`` or tagged a/j/nr, unless it
    is listed in ``must_erhua``) or when *finals* and *word* differ in length.
    Otherwise new lists are returned; if the last syllable is an ``er2``/``er5``
    儿 it is dropped and an ``R`` is inserted before the tone digit of the
    previous final.
    """
    tail = len(finals) - 1

    # fix er1
    if finals and word[tail] == "儿" and finals[tail] == 'er1':
        finals[tail] = 'er2'

    # 儿 is pronounced as its own syllable
    keep_separate = word in not_erhua or pos in {"a", "j", "nr"}
    if keep_separate and word not in must_erhua:
        return initials, finals

    # cases such as "……": syllables do not line up with characters
    if len(finals) != len(word):
        return initials, finals

    assert len(finals) == len(word)

    # 儿 is not pronounced separately: decide once whether the tail is merged
    fold_tail = (
        tail > 0
        and word[tail] == "儿"
        and finals[tail] in {"er2", "er5"}
        and word[-2:] not in not_erhua
    )

    new_initials = []
    new_finals = []
    for idx, phn in enumerate(finals):
        if fold_tail and idx == tail:
            previous = new_finals[-1]
            new_finals[-1] = previous[:-1] + "R" + previous[-1]
            continue
        new_initials.append(initials[idx])
        new_finals.append(phn)

    return new_initials, new_finals
|
||
|
||
# merge "不" and the word behind it
|
||
# if not merged, jieba sometimes leaves "不" as a standalone segment, which can cause tone-sandhi errors
|
||
def merge_bu(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Glue a standalone "不" segment onto the word that follows it.

    A segment whose tag is not in ``X_ENG`` and whose predecessor is "不" gets
    "不" prefixed. A bare "不" is kept only when it is the last segment or
    the next segment's tag is in ``X_ENG``.
    """
    merged = []
    final_index = len(seg) - 1
    for idx, (word, pos) in enumerate(seg):
        if idx > 0 and pos not in X_ENG:
            # Unpack rather than index: items may be jieba pair objects.
            previous_word, _ = seg[idx - 1]
            if previous_word == BU:
                word = previous_word + word

        following_pos = None
        if idx < final_index:
            _, following_pos = seg[idx + 1]

        # A bare "不" followed by a regular word is dropped here because it
        # is prefixed onto that word on the next iteration.
        if word == BU and following_pos is not None and following_pos not in X_ENG:
            continue
        merged.append((word, pos))
    return merged
|
||
|
||
# function 1: merge "一" with the reduplicated words on its left and right, e.g. "听","一","听" -> "听一听"
|
||
# function 2: merge single "一" and the word behind it
|
||
# if not merged, jieba sometimes leaves "一" as a standalone segment, which can cause tone-sandhi errors
|
||
# e.g.
|
||
# input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
|
||
# output seg: [['听一听', 'v']]
|
||
def merge_yi(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Merge "一" with its neighbours in two passes.

    Pass 1 joins ``X 一 X`` (first X tagged "v", second X not tagged x/eng)
    into a single segment carrying the tag of the segment it extends.
    Pass 2 appends any non-x/eng segment to a preceding bare "一".
    """
    # pass 1: X 一 X -> X一X
    first_pass = []
    total = len(seg)
    idx = 0
    while idx < total:
        word, pos = seg[idx]
        if word == YI and 0 < idx < total - 1:
            left, right = seg[idx - 1], seg[idx + 1]
            if left[0] == right[0] and left[1] == "v" and right[1] not in X_ENG:
                head_word, head_pos = first_pass[-1]
                first_pass[-1] = (head_word + YI + right[0], head_pos)
                # the right-hand word has been consumed as well
                idx += 2
                continue
        first_pass.append((word, pos))
        idx += 1

    # pass 2: bare 一 + following word
    second_pass = []
    for word, pos in first_pass:
        if second_pass and second_pass[-1][0] == YI and pos not in X_ENG:
            prev_word, prev_pos = second_pass[-1]
            second_pass[-1] = (prev_word + word, prev_pos)
        else:
            second_pass.append((word, pos))
    return second_pass
|
||
|
||
def merge_reduplication(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Concatenate a segment onto the previous one when both have the same text.

    The merge is skipped when the current tag is in ``X_ENG``. The result is
    a list of mutable ``[word, pos]`` pairs.
    """
    merged = []
    for word, pos in seg:
        repeats_previous = bool(merged) and merged[-1][0] == word
        if repeats_previous and pos not in X_ENG:
            merged[-1][0] += word
            continue
        merged.append([word, pos])
    return merged
|
||
|
||
def is_reduplication(word: str) -> bool:
    """Return True when *word* is exactly two identical characters."""
    if len(word) != 2:
        return False
    first, second = word
    return first == second
|
||
|
||
# the first and the second words are all_tone_three
|
||
def merge_continuous_three_tones(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Merge adjacent segments whose syllables are all third tone.

    A segment is appended to its predecessor when both consist solely of
    tone-3 finals, the predecessor was not itself just merged, the
    predecessor is not a reduplication, and the combined length is at most
    three characters. Returns a list of ``[word, pos]`` pairs.
    """
    # Tone-numbered finals per segment; x/eng segments get a placeholder.
    tone_finals = []
    for word, pos in seg:
        if pos in X_ENG:
            tone_finals.append(['0'])
        else:
            syllables = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
            # after pypinyin==0.44.0, '嗯' need to be n2, cause the initial and
            # final consonants cannot be empty at the same time
            for k, char in enumerate(word):
                if char == "嗯":
                    syllables[k] = "n2"
            tone_finals.append(syllables)
    assert len(tone_finals) == len(seg)

    result = []
    was_merged = [False] * len(seg)
    for idx, (word, pos) in enumerate(seg):
        # a reduplicated predecessor is left alone because it still has to go
        # through the neutral-tone sandhi
        do_merge = (
            pos not in X_ENG
            and idx >= 1
            and all_tone_three(tone_finals[idx - 1])
            and all_tone_three(tone_finals[idx])
            and not was_merged[idx - 1]
            and not is_reduplication(seg[idx - 1][0])
            and len(seg[idx - 1][0]) + len(seg[idx][0]) <= 3
        )
        if do_merge:
            result[-1][0] = result[-1][0] + seg[idx][0]
            was_merged[idx] = True
        else:
            result.append([word, pos])

    return result
|
||
|
||
# the last char of first word and the first char of second word is tone_three
|
||
def merge_continuous_three_tones_2(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Merge adjacent segments that meet on two third-tone syllables.

    A segment is appended to its predecessor when the predecessor's last
    final and this segment's first final are both tone 3, the predecessor
    was not itself just merged, the predecessor is not a reduplication, and
    the combined length is at most three characters. Returns a list of
    ``[word, pos]`` pairs.
    """
    # Tone-numbered finals per segment; x/eng segments get a placeholder.
    tone_finals = []
    for word, pos in seg:
        if pos in X_ENG:
            tone_finals.append(['0'])
        else:
            syllables = lazy_pinyin(
                word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
            # after pypinyin==0.44.0, '嗯' need to be n2, cause the initial and
            # final consonants cannot be empty at the same time
            for k, char in enumerate(word):
                if char == "嗯":
                    syllables[k] = "n2"
            tone_finals.append(syllables)
    assert len(tone_finals) == len(seg)

    result = []
    was_merged = [False] * len(seg)
    for idx, (word, pos) in enumerate(seg):
        # a reduplicated predecessor is left alone because it still has to go
        # through the neutral-tone sandhi
        do_merge = (
            pos not in X_ENG
            and idx >= 1
            and tone_finals[idx - 1][-1][-1] == "3"
            and tone_finals[idx][0][-1] == "3"
            and not was_merged[idx - 1]
            and not is_reduplication(seg[idx - 1][0])
            and len(seg[idx - 1][0]) + len(seg[idx][0]) <= 3
        )
        if do_merge:
            result[-1][0] = result[-1][0] + seg[idx][0]
            was_merged[idx] = True
        else:
            result.append([word, pos])
    return result
|
||
|
||
def merge_er(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Append a standalone "儿" segment to the segment before it.

    The merge is skipped for the very first segment and when the previous
    segment's tag is in ``X_ENG``. Returns a list of ``[word, pos]`` pairs.
    """
    out = []
    for word, pos in seg:
        attach = word == "儿" and bool(out) and out[-1][1] not in X_ENG
        if attach:
            out[-1][0] += word
        else:
            out.append([word, pos])
    return out
|
||
|
||
def pre_merge_for_modify(seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Run every segment-merging pass, in order, before tone sandhi.

    seg: [(word, pos), ...]
    """
    passes = (
        merge_bu,
        merge_yi,
        merge_reduplication,
        merge_continuous_three_tones,
        merge_continuous_three_tones_2,
        merge_er,
    )
    for merge_pass in passes:
        seg = merge_pass(seg)
    return seg
|
||
|
||
def bu_sandhi(word: str, finals: List[str]) -> List[str]:
    """Apply the tone rules for "不" to *finals* (modified in place).

    In a three-character word with "不" in the middle (e.g. 看不懂) it becomes
    tone 5; otherwise every "不" directly before a tone-4 syllable becomes
    tone 2 (e.g. 不怕).
    """
    if len(word) == 3 and word[1] == BU:
        # e.g. 看不懂
        finals[1] = finals[1][:-1] + "5"
        return finals

    # "不" before tone4 should be bu2, e.g. 不怕
    for idx in range(len(word) - 1):
        if word[idx] == BU and finals[idx + 1][-1] == "4":
            finals[idx] = finals[idx][:-1] + "2"
    return finals
|
||
|
||
def yi_sandhi(word: str, finals: List[str]) -> List[str]:
    """Apply the tone rules for "一" to *finals* (modified in place).

    Rules, checked in this order:
      * number sequences (一零零, 二一零): unchanged;
      * "一" between a repeated character (看一看): tone 5;
      * ordinals starting with 第一: tone 1;
      * otherwise "一" becomes tone 2 before a tone-4/5 syllable and tone 4
        before anything else, unless the next character is punctuation.
    """
    # "一" in number sequences, e.g. 一零零, 二一零
    if YI in word and all(ch.isnumeric() for ch in word if ch != YI):
        return finals

    # "一" between reduplication words should be yi5, e.g. 看一看
    if len(word) == 3 and word[1] == YI and word[0] == word[-1]:
        finals[1] = finals[1][:-1] + "5"
        return finals

    # when "一" is ordinal word, it should be yi1
    if word.startswith("第一"):
        finals[1] = finals[1][:-1] + "1"
        return finals

    for idx in range(len(word) - 1):
        if word[idx] != YI:
            continue
        if finals[idx + 1][-1] in {'4', '5'}:
            # "一" before tone4 should be yi2, e.g. 一段
            finals[idx] = finals[idx][:-1] + "2"
        elif word[idx + 1] not in punc:
            # "一" before non-tone4 should be yi4, e.g. 一天
            # (before punctuation it keeps the first tone)
            finals[idx] = finals[idx][:-1] + "4"
    return finals
|
||
|
||
def split_word(word: str) -> List[str]:
    """Split *word* in two around its shortest jieba search-mode sub-word.

    The shortest piece yielded by ``cut_for_search`` (the earliest one on
    ties) is taken as one part. If *word* starts with it, the remainder
    follows it; otherwise the piece is assumed to be the suffix and the
    leading remainder comes first.
    """
    shortest = sorted(cut_for_search(word), key=len)[0]
    if word.startswith(shortest):
        return [shortest, word[len(shortest):]]
    return [word[:-len(shortest)], shortest]
|
||
|
||
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
|
||
# e.g.
|
||
# word: "家里"
|
||
# pos: "s"
|
||
# finals: ['ia1', 'i3']
|
||
def neural_sandhi(word: str, pos: str, finals: List[str]) -> List[str]:
    """Rewrite finals to the neutral tone (5) where the rules call for it.

    Words in ``must_not_neural_tone_words`` are returned untouched. Otherwise
    the rules run in this order: the second character of a reduplication
    (for n/v/a tags), sentence-final and structural particles, aspect
    markers, the suffixes 们/子, locatives 上/下, directional 来/去, the
    measure word 个, and finally the ``must_neural_tone_words`` lexicon,
    which is checked on the whole word and on both halves from ``split_word``.
    """
    if word in must_not_neural_tone_words:
        return finals

    def neutral(final: str) -> str:
        # swap the trailing tone digit for 5
        return final[:-1] + "5"

    # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
    for j in range(1, len(word)):
        if word[j] == word[j - 1] and pos[0] in {"n", "v", "a"}:
            finals[j] = neutral(finals[j])

    ge_idx = word.find("个")
    size = len(word)
    last_goes_neutral = (
        (size >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒滴哩哟喽啰耶喔诶")
        or (size >= 1 and word[-1] in "的地得")
        # e.g. 走了, 看着, 去过
        or (size == 1 and word in "了着过" and pos in {"ul", "uz", "ug"})
        or (size > 1 and word[-1] in "们子" and pos in {"r", "n"})
        # e.g. 桌上, 地下
        or (size > 1 and word[-1] in "上下" and pos in {"s", "l", "f"})
        # e.g. 上来, 下去
        or (size > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开")
    )
    if last_goes_neutral:
        finals[-1] = neutral(finals[-1])
    elif (ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")) or word == '个':
        # 个 used as a measure word
        finals[ge_idx] = neutral(finals[ge_idx])
    elif word in must_neural_tone_words or word[-2:] in must_neural_tone_words:
        finals[-1] = neutral(finals[-1])

    head_word, tail_word = split_word(word)
    cut = len(head_word)
    halves = [finals[:cut], finals[cut:]]
    for sub_word, sub_finals in zip((head_word, tail_word), halves):
        # conventional neural in Chinese
        if sub_word in must_neural_tone_words or sub_word[-2:] in must_neural_tone_words:
            sub_finals[-1] = neutral(sub_finals[-1])
    return halves[0] + halves[1]
|
||
|
||
def all_tone_three(finals: List[str]) -> bool:
    """Return True when every final ends with the tone digit 3.

    An empty list yields True.
    """
    for final in finals:
        if final[-1] != "3":
            return False
    return True
|
||
|
||
def three_sandhi(word: str, finals: List[str]) -> List[str]:
    """Apply third-tone sandhi (3 + 3 -> 2 + 3) according to word length.

    * 2 characters: both tone 3 -> the first becomes tone 2.
    * 3 characters: the word is split with ``split_word`` and the rule is
      applied according to the 2+1 / 1+2 structure.
    * 4 characters: treated as two 2-character halves.
    Other lengths are returned unchanged.
    """
    size = len(word)

    if size == 2:
        if all_tone_three(finals):
            finals[0] = finals[0][:-1] + "2"
        return finals

    if size == 3:
        parts = split_word(word)
        cut = len(parts[0])
        if all_tone_three(finals):
            if cut == 2:
                # disyllabic + monosyllabic, e.g. 蒙古/包
                finals[0] = finals[0][:-1] + "2"
                finals[1] = finals[1][:-1] + "2"
            elif cut == 1:
                # monosyllabic + disyllabic, e.g. 纸/老虎
                finals[1] = finals[1][:-1] + "2"
            return finals

        head, tail = finals[:cut], finals[cut:]
        # e.g. 所有/人
        if all_tone_three(head) and len(head) == 2:
            head[0] = head[0][:-1] + "2"
        if all_tone_three(tail) and len(tail) == 2:
            tail[0] = tail[0][:-1] + "2"
        elif not all_tone_three(tail) and tail[0][-1] == "3" and head[-1][-1] == "3":
            # e.g. 好/喜欢
            head[-1] = head[-1][:-1] + "2"
        return head + tail

    if size == 4:
        # split idiom into two words whose length is 2
        first_half, second_half = finals[:2], finals[2:]
        for half in (first_half, second_half):
            if all_tone_three(half):
                half[0] = half[0][:-1] + "2"
        return first_half + second_half

    return finals
|
||
|
||
def modified_tone(word: str, pos: str, finals: List[str]) -> List[str]:
    """Run all tone-sandhi rules over one segment.

    word: the segmented word
    pos: its part-of-speech tag
    finals: tone-numbered finals, [final1, ..., finaln]

    The rules run in a fixed order: 不, 一, neutral tone, third tone.
    """
    after_bu = bu_sandhi(word, finals)
    after_yi = yi_sandhi(word, after_bu)
    after_neutral = neural_sandhi(word, pos, after_yi)
    return three_sandhi(word, after_neutral)
|
||
|
||
def g2p(text: str, with_erhua: bool = True) -> str:
    """Convert *text* to a phoneme string.

    The text is segmented with jieba, segments are merged for sandhi, and
    each Chinese segment is turned into initials/finals, passed through the
    tone-sandhi rules and (optionally) erhua, then mapped through ``ZH_MAP``.

    Args:
        text: Input text.
        with_erhua: When True, a trailing 儿 is folded into the previous
            syllable by ``merge_erhua``.

    Returns:
        String of phonemes, e.g. 'ㄋㄧ2ㄏㄠ3/ㄕ十4ㄐㄝ4'. Consecutive words are
        separated by '/', punctuation is passed through, and segments with
        no phonemes (or unmapped units) are rendered as the ``unk`` marker.
    """
    tokens = []
    # fix wordseg bad case for sandhi
    seg_cut = pre_merge_for_modify(posseg.lcut(text))

    for word, pos in seg_cut:
        if pos == 'x' and '\u4E00' <= min(word) and max(word) <= '\u9FFF':
            # An 'x' segment made only of CJK ideographs is retagged so it
            # still goes through the pinyin pipeline below.
            pos = 'X'
        elif pos != 'x' and word in punc:
            pos = 'x'
        tk = MToken(tag=pos, whitespace='')
        if pos in X_ENG:
            if not word.isspace():
                # Only punctuation gets phonemes; anything else stays None
                # and is rendered as `unk` in the final join.
                if pos == 'x' and word in punc:
                    tk.phonemes = word
                tokens.append(tk)
            elif tokens:
                # Whitespace is attached to the preceding token.
                tokens[-1].whitespace += word
            continue
        elif tokens and tokens[-1].tag not in X_ENG and not tokens[-1].whitespace:
            # Separate two directly adjacent words.
            tokens[-1].whitespace = '/'

        # g2p
        sub_initials, sub_finals = get_initials_finals(word)
        # tone sandhi
        sub_finals = modified_tone(word, pos, sub_finals)
        # er hua
        if with_erhua:
            sub_initials, sub_finals = merge_erhua(sub_initials, sub_finals, word, pos)

        phones = []
        for c, v in zip(sub_initials, sub_finals):
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c:
                phones.append(c)
            if v and (v not in punc or v != c):
                phones.append(v)

        # Split into ZH_MAP units: the erhua marker 'R' and every tone digit
        # become separate items ('eR' is restored to the final 'er').
        phones = '_'.join(phones).replace('_eR', '_er').replace('R', '_R')
        phones = re.sub(r'(?=\d)', '_', phones).split('_')
        tk.phonemes = ''.join(ZH_MAP.get(p, unk) for p in phones)
        tokens.append(tk)

    return ''.join((unk if tk.phonemes is None else tk.phonemes) + tk.whitespace for tk in tokens)
|
||
|
||
if __name__ == "__main__":
    # Manual smoke checks. They are guarded so that importing this module
    # does not run segmentation or print to stdout.
    print(g2p('时间为。Hello, world!你好,我们是一群追逐梦想的人。我正在使用qq。忽略卢驴'))
    seg = posseg.lcut('不好看', True)
    print(seg, merge_bu(seg))
    seg = merge_bu(posseg.lcut('听一听一个', True))
    print(seg, merge_yi(seg))
    seg = merge_bu(posseg.lcut('谢谢谢谢', True))
    print(seg, merge_reduplication(seg))
    seg = merge_bu(posseg.lcut('小美好', True))
    print(seg, merge_continuous_three_tones(seg))
    seg = merge_bu(posseg.lcut('风景好', True))
    print(seg, merge_continuous_three_tones_2(seg))
|