@@ -41,11 +41,11 @@ err_exit() {
41
41
# if the program file is not found.
42
42
# Usage: run_command CMD ARG1 ARG2...
43
43
run_command () {
44
- local cmd=$1
45
- shift
46
- if [[ ! -x ${cmd} ]]; then
47
- err_exit "File ${cmd} not found"
44
+ local cmd=`which $1`
45
+ if [[ -z ${cmd} ]]; then
46
+ err_exit "$1 not found"
48
47
fi
48
+ shift
49
49
tlog "[$(date)] ${cmd} $@"
50
50
${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
51
51
# check completion status
@@ -65,22 +65,6 @@ check_file_readable() {
65
65
done
66
66
}
67
67
68
- # Set global path variables that are based on parsed flags.
69
- set_prog_paths() {
70
- if [[ -z ${BINDIR} ]]; then
71
- err_exit "Need to specify location of program files"
72
- fi
73
- CN_TRAINING_EXE=${BINDIR}/cntraining
74
- COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata
75
- MF_TRAINING_EXE=${BINDIR}/mftraining
76
- SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties
77
- SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering
78
- TESSERACT_EXE=${BINDIR}/tesseract
79
- TEXT2IMAGE_EXE=${BINDIR}/text2image
80
- UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor
81
- WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg
82
- }
83
-
84
68
# Sets the named variable to given value. Aborts if the value is missing or
85
69
# if it looks like a flag.
86
70
# Usage: parse_value VAR_NAME VALUE
@@ -105,9 +89,6 @@ parse_flags() {
105
89
case ${ARGV[$i]} in
106
90
--)
107
91
break ;;
108
- --bin_dir)
109
- parse_value "BINDIR" ${ARGV[$j]}
110
- i=$j ;;
111
92
--fontlist) # Expect a plus-separated list of names
112
93
if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then
113
94
err_exit "Invalid value passed to --fontlist"
@@ -152,9 +133,6 @@ parse_flags() {
152
133
if [[ -z ${LANG_CODE} ]]; then
153
134
err_exit "Need to specify a language --lang"
154
135
fi
155
- if [[ -z ${BINDIR} ]]; then
156
- err_exit "Need to specify path to built binaries --bin_dir"
157
- fi
158
136
if [[ -z ${LANGDATA_ROOT} ]]; then
159
137
err_exit "Need to specify path to language files --langdata_dir"
160
138
fi
@@ -167,8 +145,6 @@ parse_flags() {
167
145
fi
168
146
fi
169
147
170
- set_prog_paths
171
-
172
148
# Location where intermediate files will be created.
173
149
TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE}
174
150
# Location of log file for the whole run.
@@ -196,7 +172,7 @@ initialize_fontconfig() {
196
172
export FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX)
197
173
local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt
198
174
echo "Text" > ${sample_path}
199
- run_command ${TEXT2IMAGE_EXE} --fonts_dir=${FONTS_DIR} \
175
+ run_command text2image --fonts_dir=${FONTS_DIR} \
200
176
--font="${FONTS[0]}" --outputbase=${sample_path} --text=${sample_path} \
201
177
--fontconfig_tmpdir=${FONT_CONFIG_CACHE}
202
178
}
@@ -224,14 +200,14 @@ generate_font_image() {
224
200
fi
225
201
done
226
202
227
- run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
203
+ run_command text2image ${common_args} --font="${font}" \
228
204
--text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS}
229
205
check_file_readable ${outbase}.box ${outbase}.tif
230
206
231
207
if (( ${EXTRACT_FONT_PROPERTIES} )) &&
232
208
[[ -r ${TRAIN_NGRAMS_FILE} ]]; then
233
209
tlog "Extracting font properties of ${font}"
234
- run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
210
+ run_command text2image ${common_args} --font="${font}" \
235
211
--ligatures=false --text=${TRAIN_NGRAMS_FILE} \
236
212
--only_extract_font_properties --ptsize=32
237
213
check_file_readable ${outbase}.fontinfo
@@ -287,15 +263,15 @@ phase_UP_generate_unicharset() {
287
263
tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
288
264
289
265
local box_files=$(ls ${TRAINING_DIR}/*.box)
290
- run_command ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${box_files}
266
+ run_command unicharset_extractor -D "${TRAINING_DIR}/" ${box_files}
291
267
local outfile=${TRAINING_DIR}/unicharset
292
268
UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
293
269
check_file_readable ${outfile}
294
270
mv ${outfile} ${UNICHARSET_FILE}
295
271
296
272
XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
297
273
check_file_readable ${UNICHARSET_FILE}
298
- run_command ${SET_UNICHARSET_PROPERTIES_EXE} \
274
+ run_command set_unicharset_properties \
299
275
-U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
300
276
--script_dir=${LANGDATA_ROOT}
301
277
check_file_readable ${XHEIGHTS_FILE}
@@ -323,7 +299,7 @@ phase_D_generate_dawg() {
323
299
if [[ -s ${WORDLIST_FILE} ]]; then
324
300
tlog "Generating word Dawg"
325
301
check_file_readable ${UNICHARSET_FILE}
326
- run_command ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
302
+ run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
327
303
${UNICHARSET_FILE}
328
304
check_file_readable ${WORD_DAWG}
329
305
@@ -335,13 +311,13 @@ phase_D_generate_dawg() {
335
311
if [[ -s ${freq_wordlist_file} ]]; then
336
312
check_file_readable ${UNICHARSET_FILE}
337
313
tlog "Generating frequent-word Dawg"
338
- run_command ${WORDLIST2DAWG_EXE} -r 1 ${freq_wordlist_file} \
314
+ run_command wordlist2dawg -r 1 ${freq_wordlist_file} \
339
315
${FREQ_DAWG} ${UNICHARSET_FILE}
340
316
check_file_readable ${FREQ_DAWG}
341
317
fi
342
318
343
319
# Punctuation DAWG
344
- # -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy
320
+ # -r arguments to wordlist2dawg denote RTL reverse policy
345
321
# (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h).
346
322
# We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
347
323
# 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
@@ -356,20 +332,20 @@ phase_D_generate_dawg() {
356
332
PUNC_FILE="${LANGDATA_ROOT}/common.punc"
357
333
fi
358
334
check_file_readable ${PUNC_FILE}
359
- run_command ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \
335
+ run_command wordlist2dawg -r ${punc_reverse_policy} \
360
336
${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE}
361
337
check_file_readable ${PUNC_DAWG}
362
338
363
339
# Numbers DAWG
364
340
if [[ -s ${NUMBERS_FILE} ]]; then
365
- run_command ${WORDLIST2DAWG_EXE} -r 0 \
341
+ run_command wordlist2dawg -r 0 \
366
342
${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE}
367
343
check_file_readable ${NUMBER_DAWG}
368
344
fi
369
345
370
346
# Bigram dawg
371
347
if [[ -s ${WORD_BIGRAMS_FILE} ]]; then
372
- run_command ${WORDLIST2DAWG_EXE} -r 1 \
348
+ run_command wordlist2dawg -r 1 \
373
349
${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
374
350
check_file_readable ${BIGRAM_DAWG}
375
351
fi
@@ -401,7 +377,7 @@ phase_E_extract_features() {
401
377
tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}"
402
378
local counter=0
403
379
for img_file in ${img_files} ; do
404
- run_command ${TESSERACT_EXE} ${img_file} ${img_file%.*} \
380
+ run_command tesseract ${img_file} ${img_file%.*} \
405
381
${box_config} ${config} &
406
382
let counter=counter+1
407
383
let rem=counter%par_factor
@@ -423,7 +399,7 @@ phase_C_cluster_prototypes() {
423
399
tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
424
400
local out_normproto=$1
425
401
426
- run_command ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" \
402
+ run_command cntraining -D "${TRAINING_DIR}/" \
427
403
$(ls ${TRAINING_DIR}/*.tr)
428
404
429
405
check_file_readable ${TRAINING_DIR}/normproto
@@ -443,7 +419,7 @@ phase_S_cluster_shapes() {
443
419
font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
444
420
fi
445
421
446
- run_command ${SHAPE_TRAINING_EXE} \
422
+ run_command shapeclustering \
447
423
-D "${TRAINING_DIR}/" \
448
424
-U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
449
425
-O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
@@ -464,7 +440,7 @@ phase_M_cluster_microfeatures() {
464
440
font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
465
441
fi
466
442
467
- run_command ${MF_TRAINING_EXE} \
443
+ run_command mftraining \
468
444
-D "${TRAINING_DIR}/" \
469
445
-U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
470
446
-O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
@@ -524,7 +500,7 @@ make__traineddata() {
524
500
fi
525
501
526
502
# Compose the traineddata file.
527
- run_command ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}.
503
+ run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}.
528
504
529
505
# Copy it to the output dir, overwriting only if allowed by the cmdline flag.
530
506
if [[ ! -d ${OUTPUT_DIR} ]]; then
0 commit comments