Skip to content

Commit 8e71c79

Browse files
committed
Remove --bin_dir option from tesstrain.sh (should use $PATH instead)
The --bin_dir option to tesstrain.sh is redundant: $PATH already does the same job, and does it better, so rely on $PATH instead. This also makes the code a bit more readable, since binaries can now be referred to simply as command_name instead of through COMMAND_NAME_EXE variables.
1 parent e110b14 commit 8e71c79

File tree

2 files changed

+20
-45
lines changed

2 files changed

+20
-45
lines changed

training/tesstrain.sh

-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
# USAGE:
1818
#
1919
# tesstrain.sh
20-
# --bin_dir PATH # Location of training program.
2120
# --fontlist FONTS_STR # A plus-separated list of fontnames to train on.
2221
# --fonts_dir FONTS_PATH # Path to font files.
2322
# --lang LANG_CODE # ISO 639 code.

training/tesstrain_utils.sh

+20-44
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,11 @@ err_exit() {
4141
# if the program file is not found.
4242
# Usage: run_command CMD ARG1 ARG2...
4343
run_command() {
44-
local cmd=$1
45-
shift
46-
if [[ ! -x ${cmd} ]]; then
47-
err_exit "File ${cmd} not found"
44+
local cmd=`which $1`
45+
if [[ -z ${cmd} ]]; then
46+
err_exit "$1 not found"
4847
fi
48+
shift
4949
tlog "[$(date)] ${cmd} $@"
5050
${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
5151
# check completion status
@@ -65,22 +65,6 @@ check_file_readable() {
6565
done
6666
}
6767

68-
# Set global path variables that are based on parsed flags.
69-
set_prog_paths() {
70-
if [[ -z ${BINDIR} ]]; then
71-
err_exit "Need to specify location of program files"
72-
fi
73-
CN_TRAINING_EXE=${BINDIR}/cntraining
74-
COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata
75-
MF_TRAINING_EXE=${BINDIR}/mftraining
76-
SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties
77-
SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering
78-
TESSERACT_EXE=${BINDIR}/tesseract
79-
TEXT2IMAGE_EXE=${BINDIR}/text2image
80-
UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor
81-
WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg
82-
}
83-
8468
# Sets the named variable to given value. Aborts if the value is missing or
8569
# if it looks like a flag.
8670
# Usage: parse_value VAR_NAME VALUE
@@ -105,9 +89,6 @@ parse_flags() {
10589
case ${ARGV[$i]} in
10690
--)
10791
break;;
108-
--bin_dir)
109-
parse_value "BINDIR" ${ARGV[$j]}
110-
i=$j ;;
11192
--fontlist) # Expect a plus-separated list of names
11293
if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then
11394
err_exit "Invalid value passed to --fontlist"
@@ -152,9 +133,6 @@ parse_flags() {
152133
if [[ -z ${LANG_CODE} ]]; then
153134
err_exit "Need to specify a language --lang"
154135
fi
155-
if [[ -z ${BINDIR} ]]; then
156-
err_exit "Need to specify path to built binaries --bin_dir"
157-
fi
158136
if [[ -z ${LANGDATA_ROOT} ]]; then
159137
err_exit "Need to specify path to language files --langdata_dir"
160138
fi
@@ -167,8 +145,6 @@ parse_flags() {
167145
fi
168146
fi
169147

170-
set_prog_paths
171-
172148
# Location where intermediate files will be created.
173149
TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE}
174150
# Location of log file for the whole run.
@@ -196,7 +172,7 @@ initialize_fontconfig() {
196172
export FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX)
197173
local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt
198174
echo "Text" >${sample_path}
199-
run_command ${TEXT2IMAGE_EXE} --fonts_dir=${FONTS_DIR} \
175+
run_command text2image --fonts_dir=${FONTS_DIR} \
200176
--font="${FONTS[0]}" --outputbase=${sample_path} --text=${sample_path} \
201177
--fontconfig_tmpdir=${FONT_CONFIG_CACHE}
202178
}
@@ -224,14 +200,14 @@ generate_font_image() {
224200
fi
225201
done
226202

227-
run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
203+
run_command text2image ${common_args} --font="${font}" \
228204
--text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS}
229205
check_file_readable ${outbase}.box ${outbase}.tif
230206

231207
if (( ${EXTRACT_FONT_PROPERTIES} )) &&
232208
[[ -r ${TRAIN_NGRAMS_FILE} ]]; then
233209
tlog "Extracting font properties of ${font}"
234-
run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
210+
run_command text2image ${common_args} --font="${font}" \
235211
--ligatures=false --text=${TRAIN_NGRAMS_FILE} \
236212
--only_extract_font_properties --ptsize=32
237213
check_file_readable ${outbase}.fontinfo
@@ -287,15 +263,15 @@ phase_UP_generate_unicharset() {
287263
tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
288264

289265
local box_files=$(ls ${TRAINING_DIR}/*.box)
290-
run_command ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${box_files}
266+
run_command unicharset_extractor -D "${TRAINING_DIR}/" ${box_files}
291267
local outfile=${TRAINING_DIR}/unicharset
292268
UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
293269
check_file_readable ${outfile}
294270
mv ${outfile} ${UNICHARSET_FILE}
295271

296272
XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
297273
check_file_readable ${UNICHARSET_FILE}
298-
run_command ${SET_UNICHARSET_PROPERTIES_EXE} \
274+
run_command set_unicharset_properties \
299275
-U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
300276
--script_dir=${LANGDATA_ROOT}
301277
check_file_readable ${XHEIGHTS_FILE}
@@ -323,7 +299,7 @@ phase_D_generate_dawg() {
323299
if [[ -s ${WORDLIST_FILE} ]]; then
324300
tlog "Generating word Dawg"
325301
check_file_readable ${UNICHARSET_FILE}
326-
run_command ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
302+
run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
327303
${UNICHARSET_FILE}
328304
check_file_readable ${WORD_DAWG}
329305

@@ -335,13 +311,13 @@ phase_D_generate_dawg() {
335311
if [[ -s ${freq_wordlist_file} ]]; then
336312
check_file_readable ${UNICHARSET_FILE}
337313
tlog "Generating frequent-word Dawg"
338-
run_command ${WORDLIST2DAWG_EXE} -r 1 ${freq_wordlist_file} \
314+
run_command wordlist2dawg -r 1 ${freq_wordlist_file} \
339315
${FREQ_DAWG} ${UNICHARSET_FILE}
340316
check_file_readable ${FREQ_DAWG}
341317
fi
342318

343319
# Punctuation DAWG
344-
# -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy
320+
# -r arguments to wordlist2dawg denote RTL reverse policy
345321
# (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h).
346322
# We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
347323
# 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
@@ -356,20 +332,20 @@ phase_D_generate_dawg() {
356332
PUNC_FILE="${LANGDATA_ROOT}/common.punc"
357333
fi
358334
check_file_readable ${PUNC_FILE}
359-
run_command ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \
335+
run_command wordlist2dawg -r ${punc_reverse_policy} \
360336
${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE}
361337
check_file_readable ${PUNC_DAWG}
362338

363339
# Numbers DAWG
364340
if [[ -s ${NUMBERS_FILE} ]]; then
365-
run_command ${WORDLIST2DAWG_EXE} -r 0 \
341+
run_command wordlist2dawg -r 0 \
366342
${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE}
367343
check_file_readable ${NUMBER_DAWG}
368344
fi
369345

370346
# Bigram dawg
371347
if [[ -s ${WORD_BIGRAMS_FILE} ]]; then
372-
run_command ${WORDLIST2DAWG_EXE} -r 1 \
348+
run_command wordlist2dawg -r 1 \
373349
${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
374350
check_file_readable ${BIGRAM_DAWG}
375351
fi
@@ -401,7 +377,7 @@ phase_E_extract_features() {
401377
tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}"
402378
local counter=0
403379
for img_file in ${img_files}; do
404-
run_command ${TESSERACT_EXE} ${img_file} ${img_file%.*} \
380+
run_command tesseract ${img_file} ${img_file%.*} \
405381
${box_config} ${config} &
406382
let counter=counter+1
407383
let rem=counter%par_factor
@@ -423,7 +399,7 @@ phase_C_cluster_prototypes() {
423399
tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
424400
local out_normproto=$1
425401

426-
run_command ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" \
402+
run_command cntraining -D "${TRAINING_DIR}/" \
427403
$(ls ${TRAINING_DIR}/*.tr)
428404

429405
check_file_readable ${TRAINING_DIR}/normproto
@@ -443,7 +419,7 @@ phase_S_cluster_shapes() {
443419
font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
444420
fi
445421

446-
run_command ${SHAPE_TRAINING_EXE} \
422+
run_command shapeclustering \
447423
-D "${TRAINING_DIR}/" \
448424
-U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
449425
-O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
@@ -464,7 +440,7 @@ phase_M_cluster_microfeatures() {
464440
font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
465441
fi
466442

467-
run_command ${MF_TRAINING_EXE} \
443+
run_command mftraining \
468444
-D "${TRAINING_DIR}/" \
469445
-U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
470446
-O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
@@ -524,7 +500,7 @@ make__traineddata() {
524500
fi
525501

526502
# Compose the traineddata file.
527-
run_command ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}.
503+
run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}.
528504

529505
# Copy it to the output dir, overwriting only if allowed by the cmdline flag.
530506
if [[ ! -d ${OUTPUT_DIR} ]]; then

0 commit comments

Comments
 (0)