Skip to content

Commit 4c7ab0c

Browse files
committed
Fixed font lists, improved wordlist management
1 parent ab0f4e2 commit 4c7ab0c

File tree

3 files changed

+69 -164 lines changed

3 files changed

+69 -164 lines changed

training/language-specific.sh

100644 → 100755
+47 -110
Original file line number | Diff line number | Diff line change
@@ -247,105 +247,28 @@ THAI_FONTS=( \
247247
KOREAN_FONTS=( \
248248
"Arial Unicode MS" \
249249
"Arial Unicode MS Bold" \
250-
"Ascender Uni" \
251250
"Baekmuk Batang Patched" \
252251
"Baekmuk Batang" \
253252
"Baekmuk Dotum" \
254253
"Baekmuk Gulim" \
255254
"Baekmuk Headline" \
256-
"Bandal Medium" \
257-
"Bangwool Medium" \
258-
"Dotum" \
259-
"Eunjin Medium" \
260-
"EunjinNakseo Medium" \
261-
"FBHanGothicDB" \
262-
"Guseul Medium" \
263-
"JejuGothic" \
264-
"JejuHallasan" \
265-
"JejuMyeongjo" \
266-
"KoPub Batang Bold" \
267-
"KoPub Batang Light" \
268-
"KoPub Batang" \
269-
"Nanum Brush Script" \
270-
"NanumGothic Bold" \
271-
"NanumGothic Ultra-Bold" \
272-
"NanumGothic" \
273-
"NanumMyeongjo Bold" \
274-
"NanumMyeongjo Semi-Bold" \
275-
"NanumMyeongjo" \
276-
"Nanum Pen" \
277-
"WenQuanYi Zen Hei Medium" \
278255
)
279256

280257
CHI_SIM_FONTS=( \
281258
"AR PL UKai CN" \
282259
"AR PL UMing Patched Light" \
283260
"Arial Unicode MS" \
284261
"Arial Unicode MS Bold" \
285-
"CFangSongPRC" \
286-
"CGuLi PRC" \
287-
"CGuYin PRC" \
288-
"CHei2 PRC" \
289-
"CHei3 PRC" \
290-
"CNganKai PRC" \
291-
"CPo3 PRC" \
292-
"CPo PRC" \
293-
"CSong3 PRC" \
294-
"CWeiBei PRC" \
295-
"CXLi PRC" \
296-
"CXYao PRC" \
297-
"CXing PRC" \
298-
"CYuen2 PRC" \
299-
"MComic PRC" \
300-
"MCute PRC" \
301-
"MElle PRC" \
302-
"MGentle PRC" \
303-
"MJNgai PRC" \
304-
"MKai PRC" \
305-
"MMarker PRC" \
306-
"MRocky PRC" \
307-
"MSung PRC" \
308-
"MWindy PRC" \
309-
"MYoung PRC" \
310-
"MYuen PRC" \
311-
"MYuppy PRC" \
312262
"WenQuanYi Zen Hei Medium" \
313263
)
314264

315-
# The PRC fonts don't cover all the character set for chi_tra, but they
316-
# provide a broader view of the fonts for the characters they do cover.
317265
CHI_TRA_FONTS=( \
266+
"AR PL UKai TW" \
267+
"AR PL UMing TW MBE Light" \
318268
"AR PL UKai Patched" \
319269
"AR PL UMing Patched Light" \
320270
"Arial Unicode MS" \
321271
"Arial Unicode MS Bold" \
322-
"CFangSongPRC" \
323-
"CGuLi PRC" \
324-
"CGuYin PRC" \
325-
"CHei2 PRC" \
326-
"CHei3 PRC" \
327-
"CNganKai PRC" \
328-
"CPo3 PRC" \
329-
"CPo PRC" \
330-
"CSong3 PRC" \
331-
"CWeiBei PRC" \
332-
"CXLi PRC" \
333-
"CXYao PRC" \
334-
"CXing PRC" \
335-
"CYuen2 PRC" \
336-
"MComic PRC" \
337-
"MCute PRC" \
338-
"MElle PRC" \
339-
"MGentle PRC" \
340-
"MJNgai PRC" \
341-
"MKai PRC" \
342-
"MMarker PRC" \
343-
"MRocky PRC" \
344-
"MSung PRC" \
345-
"MWindy PRC" \
346-
"MYoung PRC" \
347-
"MYuen PRC" \
348-
"MYuppy PRC" \
349272
"WenQuanYi Zen Hei Medium" \
350273
)
351274

@@ -358,23 +281,8 @@ JPN_FONTS=( \
358281
"TakaoPMincho" \
359282
"VL Gothic" \
360283
"VL PGothic" \
361-
"Noto Sans Japanese Black" \
362284
"Noto Sans Japanese Bold" \
363285
"Noto Sans Japanese Light" \
364-
"Noto Sans Japanese Medium" \
365-
"Noto Sans Japanese" \
366-
"Noto Sans Japanese Thin" \
367-
"IPAGothic" \
368-
"IPAPGothic" \
369-
"IPAUIGothic" \
370-
"IPAMincho" \
371-
"IPAPMincho" \
372-
"Kochi Gothic" \
373-
"Kochi Mincho" \
374-
"Monapo" \
375-
"UmePlus Gothic" \
376-
"UmePlus P Gothic" \
377-
"WenQuanYi Zen Hei Medium" \
378286
)
379287

380288
RUSSIAN_FONTS=( \
@@ -889,7 +797,15 @@ set_lang_specific_parameters() {
889797
FONTS=( "${LATIN_FONTS[@]}" )
890798
FILTER_ARGUMENTS=""
891799
WORDLIST2DAWG_ARGUMENTS=""
892-
WORD_DAWG_SIZE=100000
800+
# These dawg factors represent the fraction of the corpus not covered by the
801+
# dawg, and seem like reasonable defaults, but the optimal value is likely
802+
# to be highly corpus-dependent, as well as somewhat language-dependent.
803+
# Number dawg factor is the fraction of all numeric strings that are not
804+
# covered, which is why it is higher relative to the others.
805+
PUNC_DAWG_FACTOR=
806+
NUMBER_DAWG_FACTOR=0.125
807+
WORD_DAWG_FACTOR=0.05
808+
BIGRAM_DAWG_FACTOR=0.015
893809
TRAINING_DATA_ARGUMENTS=""
894810
FRAGMENTS_DISABLED="y"
895811
RUN_SHAPE_CLUSTERING=0
@@ -935,17 +851,17 @@ set_lang_specific_parameters() {
935851
bos ) ;;
936852
cat ) ;;
937853
ceb ) ;;
938-
ces ) ;;
854+
ces ) PUNC_DAWG_FACTOR=0.004 ;;
939855
cym ) ;;
940856
dan ) ;;
941-
deu ) ;;
942-
eng ) ;;
857+
deu ) WORD_DAWG_FACTOR=0.125 ;;
858+
eng ) WORD_DAWG_FACTOR=0.03 ;;
943859
epo ) ;;
944860
est ) ;;
945861
eus ) ;;
946862
fil ) ;;
947863
fin ) ;;
948-
fra ) ;;
864+
fra ) WORD_DAWG_FACTOR=0.08 ;;
949865
gle ) ;;
950866
glg ) ;;
951867
hat ) ;;
@@ -959,7 +875,7 @@ set_lang_specific_parameters() {
959875
lit ) ;;
960876
mlt ) ;;
961877
msa ) ;;
962-
nld ) ;;
878+
nld ) WORD_DAWG_FACTOR=0.02 ;;
963879
nor ) ;;
964880
por ) ;;
965881
ron ) ;;
@@ -987,6 +903,7 @@ set_lang_specific_parameters() {
987903

988904
# Cyrillic script-based languages.
989905
rus ) FONTS=( "${RUSSIAN_FONTS[@]}" )
906+
NUMBER_DAWG_FACTOR=0.05
990907
WORD_DAWG_SIZE=1000000 ;;
991908
aze_cyrl | bel | bul | kaz | mkd | srp | tgk | ukr | uzb_cyrl )
992909
FONTS=( "${RUSSIAN_FONTS[@]}" ) ;;
@@ -998,56 +915,73 @@ set_lang_specific_parameters() {
998915
TEXT_CORPUS=${FLAGS_webtext_prefix}/cyr_lid.corpus.txt
999916
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1000917
GENERATE_WORD_BIGRAMS=0
1001-
FRAGMENTS_DISABLED="y"
1002918
WORD_DAWG_SIZE=1000000
1003919
FONTS=( "${RUSSIAN_FONTS[@]}" );;
1004920

1005921
# South Asian scripts mostly have a lot of different graphemes, so trim
1006922
# down the MEAN_COUNT so as not to get a huge amount of text.
1007923
asm | ben )
1008924
MEAN_COUNT="15"
925+
WORD_DAWG_FACTOR=0.15
1009926
FONTS=( "${BENGALI_FONTS[@]}" ) ;;
1010927
bih | hin | mar | nep | san )
1011928
MEAN_COUNT="15"
929+
WORD_DAWG_FACTOR=0.15
1012930
FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;;
1013931
bod ) MEAN_COUNT="15"
932+
WORD_DAWG_FACTOR=0.15
933+
FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
934+
dzo )
935+
WORD_DAWG_FACTOR=0.01
1014936
FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
1015-
dzo ) FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
1016937
guj ) MEAN_COUNT="15"
938+
WORD_DAWG_FACTOR=0.15
1017939
FONTS=( "${GUJARATI_FONTS[@]}" ) ;;
1018940
kan ) MEAN_COUNT="15"
941+
WORD_DAWG_FACTOR=0.15
1019942
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
1020943
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
1021944
FONTS=( "${KANNADA_FONTS[@]}" ) ;;
1022945
mal ) MEAN_COUNT="15"
946+
WORD_DAWG_FACTOR=0.15
1023947
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
1024948
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
1025949
FONTS=( "${MALAYALAM_FONTS[@]}" ) ;;
1026-
ori ) FONTS=( "${ORIYA_FONTS[@]}" ) ;;
950+
ori )
951+
WORD_DAWG_FACTOR=0.01
952+
FONTS=( "${ORIYA_FONTS[@]}" ) ;;
1027953
pan ) MEAN_COUNT="15"
954+
WORD_DAWG_FACTOR=0.01
1028955
FONTS=( "${PUNJABI_FONTS[@]}" ) ;;
1029956
sin ) MEAN_COUNT="15"
957+
WORD_DAWG_FACTOR=0.01
1030958
FONTS=( "${SINHALA_FONTS[@]}" ) ;;
1031959
tam ) MEAN_COUNT="30"
960+
WORD_DAWG_FACTOR=0.15
1032961
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
1033962
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
1034963
FONTS=( "${TAMIL_FONTS[@]}" ) ;;
1035964
tel ) MEAN_COUNT="15"
965+
WORD_DAWG_FACTOR=0.15
1036966
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
1037967
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
1038968
FONTS=( "${TELUGU_FONTS[@]}" ) ;;
1039969

1040970
# SouthEast Asian scripts.
1041971
khm ) MEAN_COUNT="15"
972+
WORD_DAWG_FACTOR=0.15
1042973
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1043974
FONTS=( "${KHMER_FONTS[@]}" ) ;;
1044975
lao ) MEAN_COUNT="15"
976+
WORD_DAWG_FACTOR=0.15
1045977
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1046978
FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;;
1047979
mya ) MEAN_COUNT="12"
980+
WORD_DAWG_FACTOR=0.15
1048981
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1049982
FONTS=( "${BURMESE_FONTS[@]}" ) ;;
1050983
tha ) MEAN_COUNT="30"
984+
WORD_DAWG_FACTOR=0.01
1051985
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1052986
FILTER_ARGUMENTS="--segmenter_lang=tha"
1053987
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
@@ -1058,36 +992,35 @@ set_lang_specific_parameters() {
1058992
# CJK
1059993
chi_sim )
1060994
MEAN_COUNT="15"
995+
PUNC_DAWG_FACTOR=0.015
996+
WORD_DAWG_FACTOR=0.015
1061997
GENERATE_WORD_BIGRAMS=0
1062998
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1063999
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
10641000
FILTER_ARGUMENTS="--charset_filter=chi_sim --segmenter_lang=chi_sim"
1065-
FRAGMENTS_DISABLED="y"
1066-
GENERATE_DAWGS=0
10671001
FONTS=( "${CHI_SIM_FONTS[@]}" ) ;;
10681002
chi_tra )
10691003
MEAN_COUNT="15"
1004+
WORD_DAWG_FACTOR=0.015
10701005
GENERATE_WORD_BIGRAMS=0
10711006
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
10721007
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
10731008
FILTER_ARGUMENTS="--charset_filter=chi_tra --segmenter_lang=chi_tra"
1074-
FRAGMENTS_DISABLED="y"
1075-
GENERATE_DAWGS=0
10761009
FONTS=( "${CHI_TRA_FONTS[@]}" ) ;;
10771010
jpn ) MEAN_COUNT="15"
1011+
WORD_DAWG_FACTOR=0.015
10781012
GENERATE_WORD_BIGRAMS=0
10791013
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
10801014
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
10811015
FILTER_ARGUMENTS="--charset_filter=jpn --segmenter_lang=jpn"
1082-
FRAGMENTS_DISABLED="y"
1083-
GENERATE_DAWGS=0
10841016
FONTS=( "${JPN_FONTS[@]}" ) ;;
10851017
kor ) MEAN_COUNT="20"
1018+
WORD_DAWG_FACTOR=0.015
1019+
NUMBER_DAWG_FACTOR=0.05
10861020
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
10871021
TRAINING_DATA_ARGUMENTS+=" --desired_bigrams="
10881022
GENERATE_WORD_BIGRAMS=0
10891023
FILTER_ARGUMENTS="--charset_filter=kor --segmenter_lang=kor"
1090-
FRAGMENTS_DISABLED="y"
10911024
FONTS=( "${KOREAN_FONTS[@]}" ) ;;
10921025

10931026
# Middle-Eastern scripts.
@@ -1096,6 +1029,8 @@ set_lang_specific_parameters() {
10961029
fas | pus | snd | uig | urd )
10971030
FONTS=( "${PERSIAN_FONTS[@]}" ) ;;
10981031
heb | yid )
1032+
NUMBER_DAWG_FACTOR=0.05
1033+
WORD_DAWG_FACTOR=0.08
10991034
FONTS=( "${HEBREW_FONTS[@]}" ) ;;
11001035
syr ) FONTS=( "${SYRIAC_FONTS[@]}" ) ;;
11011036

@@ -1106,6 +1041,8 @@ set_lang_specific_parameters() {
11061041
"Noto Sans Cherokee" \
11071042
) ;;
11081043
ell | grc )
1044+
NUMBER_DAWG_FACTOR=0.05
1045+
WORD_DAWG_FACTOR=0.08
11091046
FONTS=( "${GREEK_FONTS[@]}" ) ;;
11101047
hye ) FONTS=( "${ARMENIAN_FONTS[@]}" ) ;;
11111048
iku ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;;

0 commit comments

Comments
 (0)