CMU_DICT_FAST_PATH = os.path.join(current_file_path, "cmudict-fast.rep")
CMU_DICT_HOT_PATH = os.path.join(current_file_path, "engdict-hot.rep")
CACHE_PATH = os.path.join(current_file_path, "engdict_cache.pickle")
+NAMECACHE_PATH = os.path.join(current_file_path, "namedict_cache.pickle")

arpa = {
    "AH0",
@@ -162,6 +163,9 @@ def read_dict_new():
            line_index = line_index + 1
            line = f.readline()

+    return g2p_dict
+
+def hot_reload_hot(g2p_dict):
    with open(CMU_DICT_HOT_PATH) as f:
        line = f.readline()
        line_index = 1
@@ -175,7 +179,7 @@ def read_dict_new():

            line_index = line_index + 1
            line = f.readline()
-
+
    return g2p_dict


@@ -192,9 +196,21 @@ def get_dict():
        g2p_dict = read_dict_new()
        cache_dict(g2p_dict, CACHE_PATH)

+    g2p_dict = hot_reload_hot(g2p_dict)
+
    return g2p_dict


+def get_namedict():
+    if os.path.exists(NAMECACHE_PATH):
+        with open(NAMECACHE_PATH, "rb") as pickle_file:
+            name_dict = pickle.load(pickle_file)
+    else:
+        name_dict = {}
+
+    return name_dict
+
+
def text_normalize(text):
    # todo: eng text normalize
    # adapt punctuation for Chinese and g2p_en
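Note (editor's sketch, not part of this diff): get_namedict() only reads namedict_cache.pickle and falls back to an empty dict when the file is missing, so the cache has to be produced separately. A minimal, hypothetical builder is shown below; the source file name "namedict.rep", its line format, and the helper name are assumptions — only the value layout (lowercased word mapped to a list of phone lists, matching the name_dict[word][0] access used later in qryword) follows the diff.

import pickle

# Hypothetical, illustration only: build namedict_cache.pickle from a plain-text
# name list. Each line is assumed to be "Name PH1 PH2 ..." (ARPAbet phones).
def build_name_cache(src_path, cache_path):
    name_dict = {}
    with open(src_path) as f:
        for line in f:
            parts = line.strip().split(" ")
            if len(parts) < 2:
                continue
            # lowercased key, value shaped like the CMU dict entries:
            # a list of phone lists, so callers can read name_dict[word][0]
            name_dict[parts[0].lower()] = [parts[1:]]
    with open(cache_path, "wb") as pickle_file:
        pickle.dump(name_dict, pickle_file)
    return name_dict

# e.g. build_name_cache("namedict.rep", NAMECACHE_PATH)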
@@ -227,13 +243,18 @@ def __init__(self):
        # initialize the word segmenter
        wordsegment.load()

-        # extend the outdated dictionary
+        # extend the outdated dictionary, and add the name dictionary
        self.cmu = get_dict()
+        self.namedict = get_namedict()

        # drop a few abbreviations whose pronunciations are wrong
        for word in ["AE", "AI", "AR", "IOS", "HUD", "OS"]:
            del self.cmu[word.lower()]

+        # fix homographs
+        self.homograph2features["read"] = (['R', 'IY1', 'D'], ['R', 'EH1', 'D'], 'VBP')
+        self.homograph2features["complex"] = (['K', 'AH0', 'M', 'P', 'L', 'EH1', 'K', 'S'], ['K', 'AA1', 'M', 'P', 'L', 'EH0', 'K', 'S'], 'JJ')
+

    def __call__(self, text):
        # tokenization
@@ -260,25 +281,34 @@ def __call__(self, text):
                pron1, pron2, pos1 = self.homograph2features[word]
                if pos.startswith(pos1):
                    pron = pron1
+                # pos1 being longer than pos only happens for "read"
+                elif len(pos) < len(pos1) and pos == pos1[:len(pos)]:
+                    pron = pron1
                else:
                    pron = pron2
            else:
                # fall back to recursive lookup / prediction
-                pron = self.qryword(word)
+                pron = self.qryword(o_word)

            prons.extend(pron)
            prons.extend([" "])

        return prons[:-1]


-    def qryword(self, word):
+    def qryword(self, o_word):
+        word = o_word.lower()
+
        # dictionary lookup, except for single letters
        if len(word) > 1 and word in self.cmu:  # lookup CMU dict
            return self.cmu[word][0]

+        # look up the name dictionary only when the word is title-cased
+        if o_word.istitle() and word in self.namedict:
+            return self.namedict[word][0]
+
        # OOV words of length <= 3 are spelled out letter by letter
-        if (len(word) <= 3):
+        if len(word) <= 3:
            phones = []
            for w in word:
                # fix the pronunciation of a standalone "a"; uppercase cannot occur here
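Note (editor's sketch): the homograph handling in the hunk above can be read as a standalone rule — pick pron1 when the observed POS tag either extends the stored tag or is a strict prefix of it (the new elif, which matters when "read" gets a bare VB tag against the stored VBP), otherwise pick pron2. A small self-contained restatement, using the "read" entry registered in __init__ above:

def select_homograph_pron(pron1, pron2, pos1, pos):
    # original rule: the observed tag must start with the stored tag
    if pos.startswith(pos1):
        return pron1
    # rule added in this diff: also accept an observed tag that is a
    # strict prefix of the stored tag, e.g. pos='VB' vs. pos1='VBP'
    if len(pos) < len(pos1) and pos == pos1[:len(pos)]:
        return pron1
    return pron2

present, past = ['R', 'IY1', 'D'], ['R', 'EH1', 'D']   # the "read" entry
assert select_homograph_pron(present, past, 'VBP', 'VB') == present   # imperative "read"
assert select_homograph_pron(present, past, 'VBP', 'VBD') == past     # past-tense "read"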