Skip to content

Commit 0f7e5c4

Browse files
authored
修复英文多音字,调整字典热加载,新增姓名匹配 (RVC-Boss#869)
* Fix homograph dict
* Add JSON in dict
* Adjust hot dict to hot reload
* Add English name dict
* Adjust get name dict logic
1 parent 41f0987 commit 0f7e5c4

File tree

3 files changed

+37
-6
lines changed

3 files changed

+37
-6
lines changed

GPT_SoVITS/text/engdict-hot.rep

+2-1
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
CHATGPT CH AE1 T JH IY1 P IY1 T IY1
1+
CHATGPT CH AE1 T JH IY1 P IY1 T IY1
2+
JSON JH EY1 S AH0 N

GPT_SoVITS/text/english.py

+35-5
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
CMU_DICT_FAST_PATH = os.path.join(current_file_path, "cmudict-fast.rep")
2121
CMU_DICT_HOT_PATH = os.path.join(current_file_path, "engdict-hot.rep")
2222
CACHE_PATH = os.path.join(current_file_path, "engdict_cache.pickle")
23+
NAMECACHE_PATH = os.path.join(current_file_path, "namedict_cache.pickle")
2324

2425
arpa = {
2526
"AH0",
@@ -162,6 +163,9 @@ def read_dict_new():
162163
line_index = line_index + 1
163164
line = f.readline()
164165

166+
return g2p_dict
167+
168+
def hot_reload_hot(g2p_dict):
165169
with open(CMU_DICT_HOT_PATH) as f:
166170
line = f.readline()
167171
line_index = 1
@@ -175,7 +179,7 @@ def read_dict_new():
175179

176180
line_index = line_index + 1
177181
line = f.readline()
178-
182+
179183
return g2p_dict
180184

181185

@@ -192,9 +196,21 @@ def get_dict():
192196
g2p_dict = read_dict_new()
193197
cache_dict(g2p_dict, CACHE_PATH)
194198

199+
g2p_dict = hot_reload_hot(g2p_dict)
200+
195201
return g2p_dict
196202

197203

204+
def get_namedict():
    """Load the cached name dictionary from ``NAMECACHE_PATH``.

    Returns:
        The object unpickled from ``NAMECACHE_PATH`` when that file exists,
        otherwise an empty dict.
    """
    # No cache file on disk: fall back to an empty dictionary so lookups
    # against it simply miss.
    if not os.path.exists(NAMECACHE_PATH):
        return {}

    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # this presumably relies on the cache being a trusted, repo-shipped
    # artifact -- confirm it is never replaced by user-supplied data.
    with open(NAMECACHE_PATH, "rb") as cache_file:
        return pickle.load(cache_file)
212+
213+
198214
def text_normalize(text):
199215
# todo: eng text normalize
200216
# 适配中文及 g2p_en 标点
@@ -227,13 +243,18 @@ def __init__(self):
227243
# 分词初始化
228244
wordsegment.load()
229245

230-
# 扩展过时字典
246+
# 扩展过时字典, 添加姓名字典
231247
self.cmu = get_dict()
248+
self.namedict = get_namedict()
232249

233250
# 剔除读音错误的几个缩写
234251
for word in ["AE", "AI", "AR", "IOS", "HUD", "OS"]:
235252
del self.cmu[word.lower()]
236253

254+
# 修正多音字
255+
self.homograph2features["read"] = (['R', 'IY1', 'D'], ['R', 'EH1', 'D'], 'VBP')
256+
self.homograph2features["complex"] = (['K', 'AH0', 'M', 'P', 'L', 'EH1', 'K', 'S'], ['K', 'AA1', 'M', 'P', 'L', 'EH0', 'K', 'S'], 'JJ')
257+
237258

238259
def __call__(self, text):
239260
# tokenization
@@ -260,25 +281,34 @@ def __call__(self, text):
260281
pron1, pron2, pos1 = self.homograph2features[word]
261282
if pos.startswith(pos1):
262283
pron = pron1
284+
# pos1比pos长仅出现在read
285+
elif len(pos) < len(pos1) and pos == pos1[:len(pos)]:
286+
pron = pron1
263287
else:
264288
pron = pron2
265289
else:
266290
# 递归查找预测
267-
pron = self.qryword(word)
291+
pron = self.qryword(o_word)
268292

269293
prons.extend(pron)
270294
prons.extend([" "])
271295

272296
return prons[:-1]
273297

274298

275-
def qryword(self, word):
299+
def qryword(self, o_word):
300+
word = o_word.lower()
301+
276302
# 查字典, 单字母除外
277303
if len(word) > 1 and word in self.cmu: # lookup CMU dict
278304
return self.cmu[word][0]
279305

306+
# 单词仅首字母大写时查找姓名字典
307+
if o_word.istitle() and word in self.namedict:
308+
return self.namedict[word][0]
309+
280310
# oov 长度小于等于 3 直接读字母
281-
if (len(word) <= 3):
311+
if len(word) <= 3:
282312
phones = []
283313
for w in word:
284314
# 单读 A 发音修正, 此处不存在大写的情况

GPT_SoVITS/text/namedict_cache.pickle

743 KB
Binary file not shown.

0 commit comments

Comments
 (0)