Skip to content

Commit 422c424

Browse files
committed
tesstrain.sh: Only set FONTS if they weren't set on the command line
Previously the fonts specified in language-specific.sh would override any specified on the command line. This changes language-specific.sh from overriding a user request to just setting the default fonts if none are specified with --fontlist.
1 parent 8d0f59d commit 422c424

File tree

1 file changed

+47
-46
lines changed

1 file changed

+47
-46
lines changed

training/language-specific.sh

+47-46
Original file line numberDiff line numberDiff line change
@@ -780,7 +780,7 @@ VERTICAL_FONTS=( \
780780
# holds the text corpus file for the language, used in phase F
781781
# ${FONTS[@]}
782782
# holds a sequence of applicable fonts for the language, used in
783-
# phase F & I
783+
# phase F & I. Only set here if not already set (e.g. via the command line)
784784
# ${TRAINING_DATA_ARGUMENTS}
785785
# non-default arguments to the training_data program used in phase T
786786
# ${FILTER_ARGUMENTS} -
@@ -794,7 +794,6 @@ set_lang_specific_parameters() {
794794
local lang=$1
795795
# The default text location is now given directly from the language code.
796796
TEXT_CORPUS="${FLAGS_webtext_prefix}/${lang}.corpus.txt"
797-
test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" )
798797
FILTER_ARGUMENTS=""
799798
WORDLIST2DAWG_ARGUMENTS=""
800799
# These dawg factors represent the fraction of the corpus not covered by the
@@ -816,30 +815,30 @@ set_lang_specific_parameters() {
816815
case ${lang} in
817816
# Latin languages.
818817
enm ) TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported
819-
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
818+
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
820819
frm ) TEXT_CORPUS="${FLAGS_webtext_prefix}/fra.corpus.txt"
821820
# Make long-s substitutions for Middle French text
822821
FILTER_ARGUMENTS="--make_early_language_variant=fra"
823822
TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported.
824-
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
823+
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
825824
frk ) TEXT_CORPUS="${FLAGS_webtext_prefix}/deu.corpus.txt"
826-
FONTS=( "${FRAKTUR_FONTS[@]}" );;
825+
test -z "$FONTS" && FONTS=( "${FRAKTUR_FONTS[@]}" );;
827826
ita_old )
828827
TEXT_CORPUS="${FLAGS_webtext_prefix}/ita.corpus.txt"
829828
# Make long-s substitutions for Early Italian text
830829
FILTER_ARGUMENTS="--make_early_language_variant=ita"
831830
TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported.
832-
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
831+
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
833832
spa_old )
834833
TEXT_CORPUS="${FLAGS_webtext_prefix}/spa.corpus.txt"
835834
# Make long-s substitutions for Early Spanish text
836835
FILTER_ARGUMENTS="--make_early_language_variant=spa"
837836
TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported.
838-
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
837+
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
839838
srp_latn )
840839
TEXT_CORPUS=${FLAGS_webtext_prefix}/srp.corpus.txt ;;
841840
vie ) TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
842-
FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;;
841+
test -z "$FONTS" && FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;;
843842
# Highly inflective languages get a bigger dawg size.
844843
# TODO(rays) Add more here!
845844
hun ) WORD_DAWG_SIZE=1000000 ;;
@@ -899,14 +898,14 @@ set_lang_specific_parameters() {
899898
# Strip unrenderable words as not all fonts will render the extended
900899
# latin symbols found in Vietnamese text.
901900
WORD_DAWG_SIZE=1000000
902-
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
901+
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
903902

904903
# Cyrillic script-based languages.
905-
rus ) FONTS=( "${RUSSIAN_FONTS[@]}" )
904+
rus ) test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" )
906905
NUMBER_DAWG_FACTOR=0.05
907906
WORD_DAWG_SIZE=1000000 ;;
908907
aze_cyrl | bel | bul | kaz | mkd | srp | tgk | ukr | uzb_cyrl )
909-
FONTS=( "${RUSSIAN_FONTS[@]}" ) ;;
908+
test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) ;;
910909

911910
# Special code for performing Cyrillic language-id that is trained on
912911
# Russian, Serbian, Ukrainian, Belarusian, Macedonian, Tajik and Mongolian
@@ -916,78 +915,78 @@ set_lang_specific_parameters() {
916915
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
917916
GENERATE_WORD_BIGRAMS=0
918917
WORD_DAWG_SIZE=1000000
919-
FONTS=( "${RUSSIAN_FONTS[@]}" );;
918+
test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" );;
920919

921920
# South Asian scripts mostly have a lot of different graphemes, so trim
922921
# down the MEAN_COUNT so as not to get a huge amount of text.
923922
asm | ben )
924923
MEAN_COUNT="15"
925924
WORD_DAWG_FACTOR=0.15
926-
FONTS=( "${BENGALI_FONTS[@]}" ) ;;
925+
test -z "$FONTS" && FONTS=( "${BENGALI_FONTS[@]}" ) ;;
927926
bih | hin | mar | nep | san )
928927
MEAN_COUNT="15"
929928
WORD_DAWG_FACTOR=0.15
930-
FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;;
929+
test -z "$FONTS" && FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;;
931930
bod ) MEAN_COUNT="15"
932931
WORD_DAWG_FACTOR=0.15
933-
FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
932+
test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
934933
dzo )
935934
WORD_DAWG_FACTOR=0.01
936-
FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
935+
test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
937936
guj ) MEAN_COUNT="15"
938937
WORD_DAWG_FACTOR=0.15
939-
FONTS=( "${GUJARATI_FONTS[@]}" ) ;;
938+
test -z "$FONTS" && FONTS=( "${GUJARATI_FONTS[@]}" ) ;;
940939
kan ) MEAN_COUNT="15"
941940
WORD_DAWG_FACTOR=0.15
942941
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
943942
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
944-
FONTS=( "${KANNADA_FONTS[@]}" ) ;;
943+
test -z "$FONTS" && FONTS=( "${KANNADA_FONTS[@]}" ) ;;
945944
mal ) MEAN_COUNT="15"
946945
WORD_DAWG_FACTOR=0.15
947946
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
948947
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
949-
FONTS=( "${MALAYALAM_FONTS[@]}" ) ;;
948+
test -z "$FONTS" && FONTS=( "${MALAYALAM_FONTS[@]}" ) ;;
950949
ori )
951950
WORD_DAWG_FACTOR=0.01
952-
FONTS=( "${ORIYA_FONTS[@]}" ) ;;
951+
test -z "$FONTS" && FONTS=( "${ORIYA_FONTS[@]}" ) ;;
953952
pan ) MEAN_COUNT="15"
954953
WORD_DAWG_FACTOR=0.01
955-
FONTS=( "${PUNJABI_FONTS[@]}" ) ;;
954+
test -z "$FONTS" && FONTS=( "${PUNJABI_FONTS[@]}" ) ;;
956955
sin ) MEAN_COUNT="15"
957956
WORD_DAWG_FACTOR=0.01
958-
FONTS=( "${SINHALA_FONTS[@]}" ) ;;
957+
test -z "$FONTS" && FONTS=( "${SINHALA_FONTS[@]}" ) ;;
959958
tam ) MEAN_COUNT="30"
960959
WORD_DAWG_FACTOR=0.15
961960
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
962961
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
963-
FONTS=( "${TAMIL_FONTS[@]}" ) ;;
962+
test -z "$FONTS" && FONTS=( "${TAMIL_FONTS[@]}" ) ;;
964963
tel ) MEAN_COUNT="15"
965964
WORD_DAWG_FACTOR=0.15
966965
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
967966
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
968-
FONTS=( "${TELUGU_FONTS[@]}" ) ;;
967+
test -z "$FONTS" && FONTS=( "${TELUGU_FONTS[@]}" ) ;;
969968

970969
# SouthEast Asian scripts.
971970
khm ) MEAN_COUNT="15"
972971
WORD_DAWG_FACTOR=0.15
973972
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
974-
FONTS=( "${KHMER_FONTS[@]}" ) ;;
973+
test -z "$FONTS" && FONTS=( "${KHMER_FONTS[@]}" ) ;;
975974
lao ) MEAN_COUNT="15"
976975
WORD_DAWG_FACTOR=0.15
977976
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
978-
FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;;
977+
test -z "$FONTS" && FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;;
979978
mya ) MEAN_COUNT="12"
980979
WORD_DAWG_FACTOR=0.15
981980
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
982-
FONTS=( "${BURMESE_FONTS[@]}" ) ;;
981+
test -z "$FONTS" && FONTS=( "${BURMESE_FONTS[@]}" ) ;;
983982
tha ) MEAN_COUNT="30"
984983
WORD_DAWG_FACTOR=0.01
985984
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
986985
FILTER_ARGUMENTS="--segmenter_lang=tha"
987986
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
988987
AMBIGS_FILTER_DENOMINATOR="1000"
989988
LEADING=48
990-
FONTS=( "${THAI_FONTS[@]}" ) ;;
989+
test -z "$FONTS" && FONTS=( "${THAI_FONTS[@]}" ) ;;
991990

992991
# CJK
993992
chi_sim )
@@ -998,61 +997,61 @@ set_lang_specific_parameters() {
998997
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
999998
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
1000999
FILTER_ARGUMENTS="--charset_filter=chi_sim --segmenter_lang=chi_sim"
1001-
FONTS=( "${CHI_SIM_FONTS[@]}" ) ;;
1000+
test -z "$FONTS" && FONTS=( "${CHI_SIM_FONTS[@]}" ) ;;
10021001
chi_tra )
10031002
MEAN_COUNT="15"
10041003
WORD_DAWG_FACTOR=0.015
10051004
GENERATE_WORD_BIGRAMS=0
10061005
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
10071006
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
10081007
FILTER_ARGUMENTS="--charset_filter=chi_tra --segmenter_lang=chi_tra"
1009-
FONTS=( "${CHI_TRA_FONTS[@]}" ) ;;
1008+
test -z "$FONTS" && FONTS=( "${CHI_TRA_FONTS[@]}" ) ;;
10101009
jpn ) MEAN_COUNT="15"
10111010
WORD_DAWG_FACTOR=0.015
10121011
GENERATE_WORD_BIGRAMS=0
10131012
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
10141013
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
10151014
FILTER_ARGUMENTS="--charset_filter=jpn --segmenter_lang=jpn"
1016-
FONTS=( "${JPN_FONTS[@]}" ) ;;
1015+
test -z "$FONTS" && FONTS=( "${JPN_FONTS[@]}" ) ;;
10171016
kor ) MEAN_COUNT="20"
10181017
WORD_DAWG_FACTOR=0.015
10191018
NUMBER_DAWG_FACTOR=0.05
10201019
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
10211020
TRAINING_DATA_ARGUMENTS+=" --desired_bigrams="
10221021
GENERATE_WORD_BIGRAMS=0
10231022
FILTER_ARGUMENTS="--charset_filter=kor --segmenter_lang=kor"
1024-
FONTS=( "${KOREAN_FONTS[@]}" ) ;;
1023+
test -z "$FONTS" && FONTS=( "${KOREAN_FONTS[@]}" ) ;;
10251024

10261025
# Middle-Eastern scripts.
1027-
ara ) FONTS=( "${ARABIC_FONTS[@]}" ) ;;
1028-
div ) FONTS=( "${THAANA_FONTS[@]}" ) ;;
1026+
ara ) test -z "$FONTS" && FONTS=( "${ARABIC_FONTS[@]}" ) ;;
1027+
div ) test -z "$FONTS" && FONTS=( "${THAANA_FONTS[@]}" ) ;;
10291028
fas | pus | snd | uig | urd )
1030-
FONTS=( "${PERSIAN_FONTS[@]}" ) ;;
1029+
test -z "$FONTS" && FONTS=( "${PERSIAN_FONTS[@]}" ) ;;
10311030
heb | yid )
10321031
NUMBER_DAWG_FACTOR=0.05
10331032
WORD_DAWG_FACTOR=0.08
1034-
FONTS=( "${HEBREW_FONTS[@]}" ) ;;
1035-
syr ) FONTS=( "${SYRIAC_FONTS[@]}" ) ;;
1033+
test -z "$FONTS" && FONTS=( "${HEBREW_FONTS[@]}" ) ;;
1034+
syr ) test -z "$FONTS" && FONTS=( "${SYRIAC_FONTS[@]}" ) ;;
10361035

10371036
# Other scripts.
10381037
amh | tir)
1039-
FONTS=( "${AMHARIC_FONTS[@]}" ) ;;
1040-
chr ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \
1038+
test -z "$FONTS" && FONTS=( "${AMHARIC_FONTS[@]}" ) ;;
1039+
chr ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \
10411040
"Noto Sans Cherokee" \
10421041
) ;;
10431042
ell | grc )
10441043
NUMBER_DAWG_FACTOR=0.05
10451044
WORD_DAWG_FACTOR=0.08
1046-
FONTS=( "${GREEK_FONTS[@]}" ) ;;
1047-
hye ) FONTS=( "${ARMENIAN_FONTS[@]}" ) ;;
1048-
iku ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;;
1049-
kat) FONTS=( "${GEORGIAN_FONTS[@]}" ) ;;
1045+
test -z "$FONTS" && FONTS=( "${GREEK_FONTS[@]}" ) ;;
1046+
hye ) test -z "$FONTS" && FONTS=( "${ARMENIAN_FONTS[@]}" ) ;;
1047+
iku ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;;
1048+
kat) test -z "$FONTS" && FONTS=( "${GEORGIAN_FONTS[@]}" ) ;;
10501049
kat_old)
10511050
TEXT_CORPUS="${FLAGS_webtext_prefix}/kat.corpus.txt"
1052-
FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;;
1053-
kir ) FONTS=( "${KYRGYZ_FONTS[@]}" )
1051+
test -z "$FONTS" && FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;;
1052+
kir ) test -z "$FONTS" && FONTS=( "${KYRGYZ_FONTS[@]}" )
10541053
TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;;
1055-
kur ) FONTS=( "${KURDISH_FONTS[@]}" ) ;;
1054+
kur ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;;
10561055

10571056
*) err "Error: ${lang} is not a valid language code"
10581057
esac
@@ -1061,6 +1060,8 @@ set_lang_specific_parameters() {
10611060
elif [[ ! -z ${MEAN_COUNT} ]]; then
10621061
TRAINING_DATA_ARGUMENTS+=" --mean_count=${MEAN_COUNT}"
10631062
fi
1063+
# Default to Latin fonts if none have been set
1064+
test -z "$FONTS" && test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" )
10641065
}
10651066

10661067
#=============================================================================

0 commit comments

Comments
 (0)