@@ -247,105 +247,28 @@ THAI_FONTS=( \
247
247
# Font-name list that the `kor` case of set_lang_specific_parameters copies into FONTS.
# Each quoted string is one array element; the trailing backslashes continue the
# array literal across lines.
KOREAN_FONTS=( \
248
248
" Arial Unicode MS" \
249
249
" Arial Unicode MS Bold" \
250
- " Ascender Uni" \
251
250
" Baekmuk Batang Patched" \
252
251
" Baekmuk Batang" \
253
252
" Baekmuk Dotum" \
254
253
" Baekmuk Gulim" \
255
254
" Baekmuk Headline" \
256
- " Bandal Medium" \
257
- " Bangwool Medium" \
258
- " Dotum" \
259
- " Eunjin Medium" \
260
- " EunjinNakseo Medium" \
261
- " FBHanGothicDB" \
262
- " Guseul Medium" \
263
- " JejuGothic" \
264
- " JejuHallasan" \
265
- " JejuMyeongjo" \
266
- " KoPub Batang Bold" \
267
- " KoPub Batang Light" \
268
- " KoPub Batang" \
269
- " Nanum Brush Script" \
270
- " NanumGothic Bold" \
271
- " NanumGothic Ultra-Bold" \
272
- " NanumGothic" \
273
- " NanumMyeongjo Bold" \
274
- " NanumMyeongjo Semi-Bold" \
275
- " NanumMyeongjo" \
276
- " Nanum Pen" \
277
- " WenQuanYi Zen Hei Medium" \
278
255
)
279
256
280
257
# Font-name list that the `chi_sim` case of set_lang_specific_parameters copies into FONTS.
# Each quoted string is one array element; the trailing backslashes continue the
# array literal across lines.
CHI_SIM_FONTS=( \
281
258
" AR PL UKai CN" \
282
259
" AR PL UMing Patched Light" \
283
260
" Arial Unicode MS" \
284
261
" Arial Unicode MS Bold" \
285
- " CFangSongPRC" \
286
- " CGuLi PRC" \
287
- " CGuYin PRC" \
288
- " CHei2 PRC" \
289
- " CHei3 PRC" \
290
- " CNganKai PRC" \
291
- " CPo3 PRC" \
292
- " CPo PRC" \
293
- " CSong3 PRC" \
294
- " CWeiBei PRC" \
295
- " CXLi PRC" \
296
- " CXYao PRC" \
297
- " CXing PRC" \
298
- " CYuen2 PRC" \
299
- " MComic PRC" \
300
- " MCute PRC" \
301
- " MElle PRC" \
302
- " MGentle PRC" \
303
- " MJNgai PRC" \
304
- " MKai PRC" \
305
- " MMarker PRC" \
306
- " MRocky PRC" \
307
- " MSung PRC" \
308
- " MWindy PRC" \
309
- " MYoung PRC" \
310
- " MYuen PRC" \
311
- " MYuppy PRC" \
312
262
" WenQuanYi Zen Hei Medium" \
313
263
)
314
264
315
- # The PRC fonts don't cover all the character set for chi_tra, but they
316
- # provide a broader view of the fonts for the characters they do cover.
317
265
# Font-name list that the `chi_tra` case of set_lang_specific_parameters copies into FONTS.
# Each quoted string is one array element; the trailing backslashes continue the
# array literal across lines.
CHI_TRA_FONTS=( \
266
+ " AR PL UKai TW" \
267
+ " AR PL UMing TW MBE Light" \
318
268
" AR PL UKai Patched" \
319
269
" AR PL UMing Patched Light" \
320
270
" Arial Unicode MS" \
321
271
" Arial Unicode MS Bold" \
322
- " CFangSongPRC" \
323
- " CGuLi PRC" \
324
- " CGuYin PRC" \
325
- " CHei2 PRC" \
326
- " CHei3 PRC" \
327
- " CNganKai PRC" \
328
- " CPo3 PRC" \
329
- " CPo PRC" \
330
- " CSong3 PRC" \
331
- " CWeiBei PRC" \
332
- " CXLi PRC" \
333
- " CXYao PRC" \
334
- " CXing PRC" \
335
- " CYuen2 PRC" \
336
- " MComic PRC" \
337
- " MCute PRC" \
338
- " MElle PRC" \
339
- " MGentle PRC" \
340
- " MJNgai PRC" \
341
- " MKai PRC" \
342
- " MMarker PRC" \
343
- " MRocky PRC" \
344
- " MSung PRC" \
345
- " MWindy PRC" \
346
- " MYoung PRC" \
347
- " MYuen PRC" \
348
- " MYuppy PRC" \
349
272
" WenQuanYi Zen Hei Medium" \
350
273
)
351
274
@@ -358,23 +281,8 @@ JPN_FONTS=( \
358
281
" TakaoPMincho" \
359
282
" VL Gothic" \
360
283
" VL PGothic" \
361
- " Noto Sans Japanese Black" \
362
284
" Noto Sans Japanese Bold" \
363
285
" Noto Sans Japanese Light" \
364
- " Noto Sans Japanese Medium" \
365
- " Noto Sans Japanese" \
366
- " Noto Sans Japanese Thin" \
367
- " IPAGothic" \
368
- " IPAPGothic" \
369
- " IPAUIGothic" \
370
- " IPAMincho" \
371
- " IPAPMincho" \
372
- " Kochi Gothic" \
373
- " Kochi Mincho" \
374
- " Monapo" \
375
- " UmePlus Gothic" \
376
- " UmePlus P Gothic" \
377
- " WenQuanYi Zen Hei Medium" \
378
286
)
379
287
380
288
RUSSIAN_FONTS=( \
@@ -889,7 +797,15 @@ set_lang_specific_parameters() {
889
797
FONTS=( " ${LATIN_FONTS[@]} " )
890
798
FILTER_ARGUMENTS=" "
891
799
WORDLIST2DAWG_ARGUMENTS=" "
892
- WORD_DAWG_SIZE=100000
800
+ # These dawg factors represent the fraction of the corpus not covered by the
801
+ # dawg, and seem like reasonable defaults, but the optimal values are likely
802
+ # to be highly corpus-dependent, as well as somewhat language-dependent.
803
+ # The number dawg factor is the fraction of all numeric strings that are not
804
+ # covered, which is why it is higher relative to the others.
805
+ PUNC_DAWG_FACTOR=
806
+ NUMBER_DAWG_FACTOR=0.125
807
+ WORD_DAWG_FACTOR=0.05
808
+ BIGRAM_DAWG_FACTOR=0.015
893
809
TRAINING_DATA_ARGUMENTS=" "
894
810
FRAGMENTS_DISABLED=" y"
895
811
RUN_SHAPE_CLUSTERING=0
@@ -935,17 +851,17 @@ set_lang_specific_parameters() {
935
851
bos ) ;;
936
852
cat ) ;;
937
853
ceb ) ;;
938
- ces ) ;;
854
+ ces ) PUNC_DAWG_FACTOR=0.004 ;;
939
855
cym ) ;;
940
856
dan ) ;;
941
- deu ) ;;
942
- eng ) ;;
857
+ deu ) WORD_DAWG_FACTOR=0.125 ;;
858
+ eng ) WORD_DAWG_FACTOR=0.03 ;;
943
859
epo ) ;;
944
860
est ) ;;
945
861
eus ) ;;
946
862
fil ) ;;
947
863
fin ) ;;
948
- fra ) ;;
864
+ fra ) WORD_DAWG_FACTOR=0.08 ;;
949
865
gle ) ;;
950
866
glg ) ;;
951
867
hat ) ;;
@@ -959,7 +875,7 @@ set_lang_specific_parameters() {
959
875
lit ) ;;
960
876
mlt ) ;;
961
877
msa ) ;;
962
- nld ) ;;
878
+ nld ) WORD_DAWG_FACTOR=0.02 ;;
963
879
nor ) ;;
964
880
por ) ;;
965
881
ron ) ;;
@@ -987,6 +903,7 @@ set_lang_specific_parameters() {
987
903
988
904
# Cyrillic script-based languages.
989
905
rus ) FONTS=( " ${RUSSIAN_FONTS[@]} " )
906
+ NUMBER_DAWG_FACTOR=0.05
990
907
WORD_DAWG_SIZE=1000000 ;;
991
908
aze_cyrl | bel | bul | kaz | mkd | srp | tgk | ukr | uzb_cyrl )
992
909
FONTS=( " ${RUSSIAN_FONTS[@]} " ) ;;
@@ -998,56 +915,73 @@ set_lang_specific_parameters() {
998
915
TEXT_CORPUS=${FLAGS_webtext_prefix} /cyr_lid.corpus.txt
999
916
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1000
917
GENERATE_WORD_BIGRAMS=0
1001
- FRAGMENTS_DISABLED=" y"
1002
918
WORD_DAWG_SIZE=1000000
1003
919
FONTS=( " ${RUSSIAN_FONTS[@]} " );;
1004
920
1005
921
# South Asian scripts mostly have a lot of different graphemes, so trim
1006
922
# down the MEAN_COUNT so as not to get a huge amount of text.
1007
923
asm | ben )
1008
924
MEAN_COUNT=" 15"
925
+ WORD_DAWG_FACTOR=0.15
1009
926
FONTS=( " ${BENGALI_FONTS[@]} " ) ;;
1010
927
bih | hin | mar | nep | san )
1011
928
MEAN_COUNT=" 15"
929
+ WORD_DAWG_FACTOR=0.15
1012
930
FONTS=( " ${DEVANAGARI_FONTS[@]} " ) ;;
1013
931
bod ) MEAN_COUNT=" 15"
932
+ WORD_DAWG_FACTOR=0.15
933
+ FONTS=( " ${TIBETAN_FONTS[@]} " ) ;;
934
+ dzo )
935
+ WORD_DAWG_FACTOR=0.01
1014
936
FONTS=( " ${TIBETAN_FONTS[@]} " ) ;;
1015
- dzo ) FONTS=( " ${TIBETAN_FONTS[@]} " ) ;;
1016
937
guj ) MEAN_COUNT=" 15"
938
+ WORD_DAWG_FACTOR=0.15
1017
939
FONTS=( " ${GUJARATI_FONTS[@]} " ) ;;
1018
940
kan ) MEAN_COUNT=" 15"
941
+ WORD_DAWG_FACTOR=0.15
1019
942
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
1020
943
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
1021
944
FONTS=( " ${KANNADA_FONTS[@]} " ) ;;
1022
945
mal ) MEAN_COUNT=" 15"
946
+ WORD_DAWG_FACTOR=0.15
1023
947
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
1024
948
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
1025
949
FONTS=( " ${MALAYALAM_FONTS[@]} " ) ;;
1026
- ori ) FONTS=( " ${ORIYA_FONTS[@]} " ) ;;
950
+ ori )
951
+ WORD_DAWG_FACTOR=0.01
952
+ FONTS=( " ${ORIYA_FONTS[@]} " ) ;;
1027
953
pan ) MEAN_COUNT=" 15"
954
+ WORD_DAWG_FACTOR=0.01
1028
955
FONTS=( " ${PUNJABI_FONTS[@]} " ) ;;
1029
956
sin ) MEAN_COUNT=" 15"
957
+ WORD_DAWG_FACTOR=0.01
1030
958
FONTS=( " ${SINHALA_FONTS[@]} " ) ;;
1031
959
tam ) MEAN_COUNT=" 30"
960
+ WORD_DAWG_FACTOR=0.15
1032
961
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
1033
962
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
1034
963
FONTS=( " ${TAMIL_FONTS[@]} " ) ;;
1035
964
tel ) MEAN_COUNT=" 15"
965
+ WORD_DAWG_FACTOR=0.15
1036
966
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
1037
967
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
1038
968
FONTS=( " ${TELUGU_FONTS[@]} " ) ;;
1039
969
1040
970
# SouthEast Asian scripts.
1041
971
khm ) MEAN_COUNT=" 15"
972
+ WORD_DAWG_FACTOR=0.15
1042
973
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1043
974
FONTS=( " ${KHMER_FONTS[@]} " ) ;;
1044
975
lao ) MEAN_COUNT=" 15"
976
+ WORD_DAWG_FACTOR=0.15
1045
977
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1046
978
FONTS=( " ${LAOTHIAN_FONTS[@]} " ) ;;
1047
979
mya ) MEAN_COUNT=" 12"
980
+ WORD_DAWG_FACTOR=0.15
1048
981
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1049
982
FONTS=( " ${BURMESE_FONTS[@]} " ) ;;
1050
983
tha ) MEAN_COUNT=" 30"
984
+ WORD_DAWG_FACTOR=0.01
1051
985
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1052
986
FILTER_ARGUMENTS=" --segmenter_lang=tha"
1053
987
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
@@ -1058,36 +992,35 @@ set_lang_specific_parameters() {
1058
992
# CJK
1059
993
chi_sim )
1060
994
MEAN_COUNT=" 15"
995
+ PUNC_DAWG_FACTOR=0.015
996
+ WORD_DAWG_FACTOR=0.015
1061
997
GENERATE_WORD_BIGRAMS=0
1062
998
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1063
999
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
1064
1000
FILTER_ARGUMENTS=" --charset_filter=chi_sim --segmenter_lang=chi_sim"
1065
- FRAGMENTS_DISABLED=" y"
1066
- GENERATE_DAWGS=0
1067
1001
FONTS=( " ${CHI_SIM_FONTS[@]} " ) ;;
1068
1002
chi_tra )
1069
1003
MEAN_COUNT=" 15"
1004
+ WORD_DAWG_FACTOR=0.015
1070
1005
GENERATE_WORD_BIGRAMS=0
1071
1006
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1072
1007
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
1073
1008
FILTER_ARGUMENTS=" --charset_filter=chi_tra --segmenter_lang=chi_tra"
1074
- FRAGMENTS_DISABLED=" y"
1075
- GENERATE_DAWGS=0
1076
1009
FONTS=( " ${CHI_TRA_FONTS[@]} " ) ;;
1077
1010
jpn ) MEAN_COUNT=" 15"
1011
+ WORD_DAWG_FACTOR=0.015
1078
1012
GENERATE_WORD_BIGRAMS=0
1079
1013
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1080
1014
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
1081
1015
FILTER_ARGUMENTS=" --charset_filter=jpn --segmenter_lang=jpn"
1082
- FRAGMENTS_DISABLED=" y"
1083
- GENERATE_DAWGS=0
1084
1016
FONTS=( " ${JPN_FONTS[@]} " ) ;;
1085
1017
kor ) MEAN_COUNT=" 20"
1018
+ WORD_DAWG_FACTOR=0.015
1019
+ NUMBER_DAWG_FACTOR=0.05
1086
1020
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1087
1021
TRAINING_DATA_ARGUMENTS+=" --desired_bigrams="
1088
1022
GENERATE_WORD_BIGRAMS=0
1089
1023
FILTER_ARGUMENTS=" --charset_filter=kor --segmenter_lang=kor"
1090
- FRAGMENTS_DISABLED=" y"
1091
1024
FONTS=( " ${KOREAN_FONTS[@]} " ) ;;
1092
1025
1093
1026
# Middle-Eastern scripts.
@@ -1096,6 +1029,8 @@ set_lang_specific_parameters() {
1096
1029
fas | pus | snd | uig | urd )
1097
1030
FONTS=( " ${PERSIAN_FONTS[@]} " ) ;;
1098
1031
heb | yid )
1032
+ NUMBER_DAWG_FACTOR=0.05
1033
+ WORD_DAWG_FACTOR=0.08
1099
1034
FONTS=( " ${HEBREW_FONTS[@]} " ) ;;
1100
1035
syr ) FONTS=( " ${SYRIAC_FONTS[@]} " ) ;;
1101
1036
@@ -1106,6 +1041,8 @@ set_lang_specific_parameters() {
1106
1041
" Noto Sans Cherokee" \
1107
1042
) ;;
1108
1043
ell | grc )
1044
+ NUMBER_DAWG_FACTOR=0.05
1045
+ WORD_DAWG_FACTOR=0.08
1109
1046
FONTS=( " ${GREEK_FONTS[@]} " ) ;;
1110
1047
hye ) FONTS=( " ${ARMENIAN_FONTS[@]} " ) ;;
1111
1048
iku ) FONTS=( " ${NORTH_AMERICAN_ABORIGINAL_FONTS[@]} " ) ;;
0 commit comments