@@ -780,7 +780,7 @@ VERTICAL_FONTS=( \
780
780
# holds the text corpus file for the language, used in phase F
781
781
# ${FONTS[@]}
782
782
# holds a sequence of applicable fonts for the language, used in
783
- # phase F & I
783
+ # phase F & I. Only set if not already set (e.g. from the command line).
784
784
# ${TRAINING_DATA_ARGUMENTS}
785
785
# non-default arguments to the training_data program used in phase T
786
786
# ${FILTER_ARGUMENTS} -
@@ -794,7 +794,6 @@ set_lang_specific_parameters() {
794
794
local lang=$1
795
795
# The default text location is now given directly from the language code.
796
796
TEXT_CORPUS=" ${FLAGS_webtext_prefix} /${lang} .corpus.txt"
797
- test -z " $FONTS " && FONTS=( " ${LATIN_FONTS[@]} " )
798
797
FILTER_ARGUMENTS=" "
799
798
WORDLIST2DAWG_ARGUMENTS=" "
800
799
# These dawg factors represent the fraction of the corpus not covered by the
@@ -816,30 +815,30 @@ set_lang_specific_parameters() {
816
815
case ${lang} in
817
816
# Latin languages.
818
817
enm ) TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported
819
- FONTS=( " ${EARLY_LATIN_FONTS[@]} " );;
818
+ test -z " $FONTS " && FONTS=( " ${EARLY_LATIN_FONTS[@]} " );;
820
819
frm ) TEXT_CORPUS=" ${FLAGS_webtext_prefix} /fra.corpus.txt"
821
820
# Make long-s substitutions for Middle French text
822
821
FILTER_ARGUMENTS=" --make_early_language_variant=fra"
823
822
TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported.
824
- FONTS=( " ${EARLY_LATIN_FONTS[@]} " );;
823
+ test -z " $FONTS " && FONTS=( " ${EARLY_LATIN_FONTS[@]} " );;
825
824
frk ) TEXT_CORPUS=" ${FLAGS_webtext_prefix} /deu.corpus.txt"
826
- FONTS=( " ${FRAKTUR_FONTS[@]} " );;
825
+ test -z " $FONTS " && FONTS=( " ${FRAKTUR_FONTS[@]} " );;
827
826
ita_old )
828
827
TEXT_CORPUS=" ${FLAGS_webtext_prefix} /ita.corpus.txt"
829
828
# Make long-s substitutions for Early Italian text
830
829
FILTER_ARGUMENTS=" --make_early_language_variant=ita"
831
830
TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported.
832
- FONTS=( " ${EARLY_LATIN_FONTS[@]} " );;
831
+ test -z " $FONTS " && FONTS=( " ${EARLY_LATIN_FONTS[@]} " );;
833
832
spa_old )
834
833
TEXT_CORPUS=" ${FLAGS_webtext_prefix} /spa.corpus.txt"
835
834
# Make long-s substitutions for Early Spanish text
836
835
FILTER_ARGUMENTS=" --make_early_language_variant=spa"
837
836
TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported.
838
- FONTS=( " ${EARLY_LATIN_FONTS[@]} " );;
837
+ test -z " $FONTS " && FONTS=( " ${EARLY_LATIN_FONTS[@]} " );;
839
838
srp_latn )
840
839
TEXT_CORPUS=${FLAGS_webtext_prefix} /srp.corpus.txt ;;
841
840
vie ) TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
842
- FONTS=( " ${VIETNAMESE_FONTS[@]} " ) ;;
841
+ test -z " $FONTS " && FONTS=( " ${VIETNAMESE_FONTS[@]} " ) ;;
843
842
# Highly inflective languages get a bigger dawg size.
844
843
# TODO(rays) Add more here!
845
844
hun ) WORD_DAWG_SIZE=1000000 ;;
@@ -899,14 +898,14 @@ set_lang_specific_parameters() {
899
898
# Strip unrenderable words as not all fonts will render the extended
900
899
# latin symbols found in Vietnamese text.
901
900
WORD_DAWG_SIZE=1000000
902
- FONTS=( " ${EARLY_LATIN_FONTS[@]} " );;
901
+ test -z " $FONTS " && FONTS=( " ${EARLY_LATIN_FONTS[@]} " );;
903
902
904
903
# Cyrillic script-based languages.
905
- rus ) FONTS=( " ${RUSSIAN_FONTS[@]} " )
904
+ rus ) test -z " $FONTS " && FONTS=( " ${RUSSIAN_FONTS[@]} " )
906
905
NUMBER_DAWG_FACTOR=0.05
907
906
WORD_DAWG_SIZE=1000000 ;;
908
907
aze_cyrl | bel | bul | kaz | mkd | srp | tgk | ukr | uzb_cyrl )
909
- FONTS=( " ${RUSSIAN_FONTS[@]} " ) ;;
908
+ test -z " $FONTS " && FONTS=( " ${RUSSIAN_FONTS[@]} " ) ;;
910
909
911
910
# Special code for performing Cyrillic language-id that is trained on
912
911
# Russian, Serbian, Ukranian, Belarusian, Macedonian, Tajik and Mongolian
@@ -916,78 +915,78 @@ set_lang_specific_parameters() {
916
915
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
917
916
GENERATE_WORD_BIGRAMS=0
918
917
WORD_DAWG_SIZE=1000000
919
- FONTS=( " ${RUSSIAN_FONTS[@]} " );;
918
+ test -z " $FONTS " && FONTS=( " ${RUSSIAN_FONTS[@]} " );;
920
919
921
920
# South Asian scripts mostly have a lot of different graphemes, so trim
922
921
# down the MEAN_COUNT so as not to get a huge amount of text.
923
922
asm | ben )
924
923
MEAN_COUNT=" 15"
925
924
WORD_DAWG_FACTOR=0.15
926
- FONTS=( " ${BENGALI_FONTS[@]} " ) ;;
925
+ test -z " $FONTS " && FONTS=( " ${BENGALI_FONTS[@]} " ) ;;
927
926
bih | hin | mar | nep | san )
928
927
MEAN_COUNT=" 15"
929
928
WORD_DAWG_FACTOR=0.15
930
- FONTS=( " ${DEVANAGARI_FONTS[@]} " ) ;;
929
+ test -z " $FONTS " && FONTS=( " ${DEVANAGARI_FONTS[@]} " ) ;;
931
930
bod ) MEAN_COUNT=" 15"
932
931
WORD_DAWG_FACTOR=0.15
933
- FONTS=( " ${TIBETAN_FONTS[@]} " ) ;;
932
+ test -z " $FONTS " && FONTS=( " ${TIBETAN_FONTS[@]} " ) ;;
934
933
dzo )
935
934
WORD_DAWG_FACTOR=0.01
936
- FONTS=( " ${TIBETAN_FONTS[@]} " ) ;;
935
+ test -z " $FONTS " && FONTS=( " ${TIBETAN_FONTS[@]} " ) ;;
937
936
guj ) MEAN_COUNT=" 15"
938
937
WORD_DAWG_FACTOR=0.15
939
- FONTS=( " ${GUJARATI_FONTS[@]} " ) ;;
938
+ test -z " $FONTS " && FONTS=( " ${GUJARATI_FONTS[@]} " ) ;;
940
939
kan ) MEAN_COUNT=" 15"
941
940
WORD_DAWG_FACTOR=0.15
942
941
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
943
942
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
944
- FONTS=( " ${KANNADA_FONTS[@]} " ) ;;
943
+ test -z " $FONTS " && FONTS=( " ${KANNADA_FONTS[@]} " ) ;;
945
944
mal ) MEAN_COUNT=" 15"
946
945
WORD_DAWG_FACTOR=0.15
947
946
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
948
947
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
949
- FONTS=( " ${MALAYALAM_FONTS[@]} " ) ;;
948
+ test -z " $FONTS " && FONTS=( " ${MALAYALAM_FONTS[@]} " ) ;;
950
949
ori )
951
950
WORD_DAWG_FACTOR=0.01
952
- FONTS=( " ${ORIYA_FONTS[@]} " ) ;;
951
+ test -z " $FONTS " && FONTS=( " ${ORIYA_FONTS[@]} " ) ;;
953
952
pan ) MEAN_COUNT=" 15"
954
953
WORD_DAWG_FACTOR=0.01
955
- FONTS=( " ${PUNJABI_FONTS[@]} " ) ;;
954
+ test -z " $FONTS " && FONTS=( " ${PUNJABI_FONTS[@]} " ) ;;
956
955
sin ) MEAN_COUNT=" 15"
957
956
WORD_DAWG_FACTOR=0.01
958
- FONTS=( " ${SINHALA_FONTS[@]} " ) ;;
957
+ test -z " $FONTS " && FONTS=( " ${SINHALA_FONTS[@]} " ) ;;
959
958
tam ) MEAN_COUNT=" 30"
960
959
WORD_DAWG_FACTOR=0.15
961
960
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
962
961
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
963
- FONTS=( " ${TAMIL_FONTS[@]} " ) ;;
962
+ test -z " $FONTS " && FONTS=( " ${TAMIL_FONTS[@]} " ) ;;
964
963
tel ) MEAN_COUNT=" 15"
965
964
WORD_DAWG_FACTOR=0.15
966
965
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
967
966
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
968
- FONTS=( " ${TELUGU_FONTS[@]} " ) ;;
967
+ test -z " $FONTS " && FONTS=( " ${TELUGU_FONTS[@]} " ) ;;
969
968
970
969
# SouthEast Asian scripts.
971
970
khm ) MEAN_COUNT=" 15"
972
971
WORD_DAWG_FACTOR=0.15
973
972
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
974
- FONTS=( " ${KHMER_FONTS[@]} " ) ;;
973
+ test -z " $FONTS " && FONTS=( " ${KHMER_FONTS[@]} " ) ;;
975
974
lao ) MEAN_COUNT=" 15"
976
975
WORD_DAWG_FACTOR=0.15
977
976
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
978
- FONTS=( " ${LAOTHIAN_FONTS[@]} " ) ;;
977
+ test -z " $FONTS " && FONTS=( " ${LAOTHIAN_FONTS[@]} " ) ;;
979
978
mya ) MEAN_COUNT=" 12"
980
979
WORD_DAWG_FACTOR=0.15
981
980
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
982
- FONTS=( " ${BURMESE_FONTS[@]} " ) ;;
981
+ test -z " $FONTS " && FONTS=( " ${BURMESE_FONTS[@]} " ) ;;
983
982
tha ) MEAN_COUNT=" 30"
984
983
WORD_DAWG_FACTOR=0.01
985
984
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
986
985
FILTER_ARGUMENTS=" --segmenter_lang=tha"
987
986
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
988
987
AMBIGS_FILTER_DENOMINATOR=" 1000"
989
988
LEADING=48
990
- FONTS=( " ${THAI_FONTS[@]} " ) ;;
989
+ test -z " $FONTS " && FONTS=( " ${THAI_FONTS[@]} " ) ;;
991
990
992
991
# CJK
993
992
chi_sim )
@@ -998,61 +997,61 @@ set_lang_specific_parameters() {
998
997
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
999
998
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
1000
999
FILTER_ARGUMENTS=" --charset_filter=chi_sim --segmenter_lang=chi_sim"
1001
- FONTS=( " ${CHI_SIM_FONTS[@]} " ) ;;
1000
+ test -z " $FONTS " && FONTS=( " ${CHI_SIM_FONTS[@]} " ) ;;
1002
1001
chi_tra )
1003
1002
MEAN_COUNT=" 15"
1004
1003
WORD_DAWG_FACTOR=0.015
1005
1004
GENERATE_WORD_BIGRAMS=0
1006
1005
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1007
1006
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
1008
1007
FILTER_ARGUMENTS=" --charset_filter=chi_tra --segmenter_lang=chi_tra"
1009
- FONTS=( " ${CHI_TRA_FONTS[@]} " ) ;;
1008
+ test -z " $FONTS " && FONTS=( " ${CHI_TRA_FONTS[@]} " ) ;;
1010
1009
jpn ) MEAN_COUNT=" 15"
1011
1010
WORD_DAWG_FACTOR=0.015
1012
1011
GENERATE_WORD_BIGRAMS=0
1013
1012
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1014
1013
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
1015
1014
FILTER_ARGUMENTS=" --charset_filter=jpn --segmenter_lang=jpn"
1016
- FONTS=( " ${JPN_FONTS[@]} " ) ;;
1015
+ test -z " $FONTS " && FONTS=( " ${JPN_FONTS[@]} " ) ;;
1017
1016
kor ) MEAN_COUNT=" 20"
1018
1017
WORD_DAWG_FACTOR=0.015
1019
1018
NUMBER_DAWG_FACTOR=0.05
1020
1019
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1021
1020
TRAINING_DATA_ARGUMENTS+=" --desired_bigrams="
1022
1021
GENERATE_WORD_BIGRAMS=0
1023
1022
FILTER_ARGUMENTS=" --charset_filter=kor --segmenter_lang=kor"
1024
- FONTS=( " ${KOREAN_FONTS[@]} " ) ;;
1023
+ test -z " $FONTS " && FONTS=( " ${KOREAN_FONTS[@]} " ) ;;
1025
1024
1026
1025
# Middle-Eastern scripts.
1027
- ara ) FONTS=( " ${ARABIC_FONTS[@]} " ) ;;
1028
- div ) FONTS=( " ${THAANA_FONTS[@]} " ) ;;
1026
+ ara ) test -z " $FONTS " && FONTS=( " ${ARABIC_FONTS[@]} " ) ;;
1027
+ div ) test -z " $FONTS " && FONTS=( " ${THAANA_FONTS[@]} " ) ;;
1029
1028
fas | pus | snd | uig | urd )
1030
- FONTS=( " ${PERSIAN_FONTS[@]} " ) ;;
1029
+ test -z " $FONTS " && FONTS=( " ${PERSIAN_FONTS[@]} " ) ;;
1031
1030
heb | yid )
1032
1031
NUMBER_DAWG_FACTOR=0.05
1033
1032
WORD_DAWG_FACTOR=0.08
1034
- FONTS=( " ${HEBREW_FONTS[@]} " ) ;;
1035
- syr ) FONTS=( " ${SYRIAC_FONTS[@]} " ) ;;
1033
+ test -z " $FONTS " && FONTS=( " ${HEBREW_FONTS[@]} " ) ;;
1034
+ syr ) test -z " $FONTS " && FONTS=( " ${SYRIAC_FONTS[@]} " ) ;;
1036
1035
1037
1036
# Other scripts.
1038
1037
amh | tir)
1039
- FONTS=( " ${AMHARIC_FONTS[@]} " ) ;;
1040
- chr ) FONTS=( " ${NORTH_AMERICAN_ABORIGINAL_FONTS[@]} " \
1038
+ test -z " $FONTS " && FONTS=( " ${AMHARIC_FONTS[@]} " ) ;;
1039
+ chr ) test -z " $FONTS " && FONTS=( " ${NORTH_AMERICAN_ABORIGINAL_FONTS[@]} " \
1041
1040
" Noto Sans Cherokee" \
1042
1041
) ;;
1043
1042
ell | grc )
1044
1043
NUMBER_DAWG_FACTOR=0.05
1045
1044
WORD_DAWG_FACTOR=0.08
1046
- FONTS=( " ${GREEK_FONTS[@]} " ) ;;
1047
- hye ) FONTS=( " ${ARMENIAN_FONTS[@]} " ) ;;
1048
- iku ) FONTS=( " ${NORTH_AMERICAN_ABORIGINAL_FONTS[@]} " ) ;;
1049
- kat) FONTS=( " ${GEORGIAN_FONTS[@]} " ) ;;
1045
+ test -z " $FONTS " && FONTS=( " ${GREEK_FONTS[@]} " ) ;;
1046
+ hye ) test -z " $FONTS " && FONTS=( " ${ARMENIAN_FONTS[@]} " ) ;;
1047
+ iku ) test -z " $FONTS " && FONTS=( " ${NORTH_AMERICAN_ABORIGINAL_FONTS[@]} " ) ;;
1048
+ kat) test -z " $FONTS " && FONTS=( " ${GEORGIAN_FONTS[@]} " ) ;;
1050
1049
kat_old)
1051
1050
TEXT_CORPUS=" ${FLAGS_webtext_prefix} /kat.corpus.txt"
1052
- FONTS=( " ${OLD_GEORGIAN_FONTS[@]} " ) ;;
1053
- kir ) FONTS=( " ${KYRGYZ_FONTS[@]} " )
1051
+ test -z " $FONTS " && FONTS=( " ${OLD_GEORGIAN_FONTS[@]} " ) ;;
1052
+ kir ) test -z " $FONTS " && FONTS=( " ${KYRGYZ_FONTS[@]} " )
1054
1053
TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;;
1055
- kur ) FONTS=( " ${KURDISH_FONTS[@]} " ) ;;
1054
+ kur ) test -z " $FONTS " && FONTS=( " ${KURDISH_FONTS[@]} " ) ;;
1056
1055
1057
1056
* ) err " Error: ${lang} is not a valid language code"
1058
1057
esac
@@ -1061,6 +1060,8 @@ set_lang_specific_parameters() {
1061
1060
elif [[ ! -z ${MEAN_COUNT} ]]; then
1062
1061
TRAINING_DATA_ARGUMENTS+=" --mean_count=${MEAN_COUNT} "
1063
1062
fi
1063
+ # Default to Latin fonts if none have been set
1064
+ test -z " $FONTS " && FONTS=( " ${LATIN_FONTS[@]} " )
1064
1065
}
1065
1066
1066
1067
# =============================================================================
0 commit comments