Skip to content

Commit 8aa2523

Browse files
Author: James R. Barlow
Commit message: Fix some of Codacy's complaints
Parent: 9122e62 · Commit: 8aa2523

File tree

3 files changed: +51 -25 lines changed

3 files changed: +51 -25 lines changed

src/training/language_specific.py

+36-11
Original file line numberDiff line numberDiff line change
@@ -916,6 +916,9 @@ def set_lang_specific_parameters(ctx, lang):
916916
TEXT2IMAGE_EXTRA_ARGS = []
917917
EXPOSURES = []
918918

919+
GENERATE_WORD_BIGRAMS = None
920+
WORD_DAWG_SIZE = None
921+
919922
# Latin languages.
920923
if lang == "enm":
921924
TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"] # Add ligatures when supported
@@ -1364,18 +1367,40 @@ def set_lang_specific_parameters(ctx, lang):
13641367
LANG_IS_RTL = False
13651368
NORM_MODE = 1
13661369

1367-
for var in [v for v in locals()]:
1368-
if var.isupper():
1369-
value = locals()[var]
1370-
lowervar = var.lower()
1371-
if hasattr(ctx, lowervar) and getattr(ctx, lowervar) != value:
1372-
log.debug(f"{lowervar} = {value} (was {getattr(ctx, lowervar)})")
1373-
setattr(ctx, lowervar, value)
1374-
elif hasattr(ctx, lowervar):
1375-
log.debug(f"{lowervar} = {value} (set on cmdline)")
1370+
vars_to_transfer = {
1371+
'ambigs_filter_denominator': AMBIGS_FILTER_DENOMINATOR,
1372+
'bigram_dawg_factor': BIGRAM_DAWG_FACTOR,
1373+
'exposures': EXPOSURES,
1374+
'filter_arguments': FILTER_ARGUMENTS,
1375+
'fonts': FONTS,
1376+
'fragments_disabled': FRAGMENTS_DISABLED,
1377+
'generate_word_bigrams': GENERATE_WORD_BIGRAMS,
1378+
'lang_is_rtl': LANG_IS_RTL,
1379+
'leading': LEADING,
1380+
'mean_count': MEAN_COUNT,
1381+
'mix_lang': MIX_LANG,
1382+
'norm_mode': NORM_MODE,
1383+
'number_dawg_factor': NUMBER_DAWG_FACTOR,
1384+
'punc_dawg_factor': PUNC_DAWG_FACTOR,
1385+
'run_shape_clustering': RUN_SHAPE_CLUSTERING,
1386+
'text2image_extra_args': TEXT2IMAGE_EXTRA_ARGS,
1387+
'text_corpus': TEXT_CORPUS,
1388+
'training_data_arguments': TRAINING_DATA_ARGUMENTS,
1389+
'word_dawg_factor': WORD_DAWG_FACTOR,
1390+
'word_dawg_size': WORD_DAWG_SIZE,
1391+
'wordlist2dawg_arguments': WORDLIST2DAWG_ARGUMENTS,
1392+
}
1393+
1394+
for attr, value in vars_to_transfer.items():
1395+
if hasattr(ctx, attr):
1396+
if getattr(ctx, attr) != value:
1397+
log.debug(f"{attr} = {value} (was {getattr(ctx, attr)})")
1398+
setattr(ctx, attr, value)
13761399
else:
1377-
log.debug(f"{lowervar} = {value}")
1378-
setattr(ctx, lowervar, value)
1400+
log.debug(f"{attr} = {value} (set on cmdline)")
1401+
else:
1402+
log.debug(f"{attr} = {value}")
1403+
setattr(ctx, attr, value)
13791404

13801405
return ctx
13811406

src/training/tesstrain.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# Tesseract. For a detailed description of the phases, see
1515
# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
1616
#
17-
import sys, os, subprocess, logging
17+
import sys, os, logging
1818

1919

2020
sys.path.insert(0, os.path.dirname(__file__))
@@ -32,7 +32,7 @@
3232
log = logging.getLogger()
3333

3434

35-
def setup_logging(logfile):
35+
def setup_logging_console():
3636
log.setLevel(logging.DEBUG)
3737
console = logging.StreamHandler()
3838
console.setLevel(logging.INFO)
@@ -42,6 +42,8 @@ def setup_logging(logfile):
4242
console.setFormatter(console_formatter)
4343
log.addHandler(console)
4444

45+
46+
def setup_logging_logfile(logfile):
4547
logfile = logging.FileHandler(logfile)
4648
logfile.setLevel(logging.DEBUG)
4749
logfile_formatter = logging.Formatter(
@@ -52,8 +54,9 @@ def setup_logging(logfile):
5254

5355

5456
def main():
57+
setup_logging_console()
5558
ctx = parse_flags()
56-
setup_logging(ctx.log_file)
59+
setup_logging_logfile(ctx.log_file)
5760
if not ctx.linedata:
5861
log.error("--linedata_only is required since only LSTM is supported")
5962
sys.exit(1)

src/training/tesstrain_utils.py

+9-11
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,10 @@ def __init__(self):
4949

5050
self.max_pages = 0
5151
self.save_box_tiff = False
52-
self.output_dir = "/tmp/tesstrain/tessdata"
5352
self.overwrite = False
5453
self.linedata = False
5554
self.run_shape_clustering = False
5655
self.extract_font_properties = True
57-
self._workspace_dir = TemporaryDirectory(prefix="tesstrain")
58-
self.workspace_dir = self._workspace_dir.name
5956

6057

6158
def err_exit(msg):
@@ -88,8 +85,8 @@ def run_command(cmd, *args, env=None):
8885
else:
8986
try:
9087
proclog.error(proc.stdout.decode("utf-8", errors="replace"))
91-
except Exception:
92-
pass
88+
except Exception as e:
89+
proclog.error(e)
9390
err_exit(f"Program {cmd} failed with return code {proc.returncode}. Abort.")
9491

9592

@@ -101,10 +98,10 @@ def check_file_readable(*filenames):
10198
filenames = [filenames]
10299
for filename in filenames:
103100
try:
104-
with Path(filename).open() as f:
101+
with Path(filename).open():
105102
pass
106103
except FileNotFoundError:
107-
err_exit(f"Expected file {filename} does not exist")
104+
err_exit(f"Required/expected file '{filename}' does not exist")
108105
except PermissionError:
109106
err_exit(f"{filename} is not readable")
110107
except IOError as e:
@@ -191,7 +188,6 @@ def check_file_readable(*filenames):
191188
nargs="+",
192189
help="A list of exposure levels to use (e.g. -1,0,1).",
193190
)
194-
parser.add_argument("--workspace_dir")
195191

196192

197193
# Does simple command-line parsing and initialization.
@@ -200,7 +196,6 @@ def parse_flags(argv=None):
200196
log.debug(ctx)
201197
parser.parse_args(args=argv, namespace=ctx)
202198
log.debug(ctx)
203-
log.info("Parsing")
204199

205200
if not ctx.lang_code:
206201
err_exit("Need to specify a language --lang")
@@ -215,12 +210,15 @@ def parse_flags(argv=None):
215210
)
216211
else:
217212
ctx.tessdata_dir = tessdata_prefix
213+
if not ctx.output_dir:
214+
ctx.output_dir = mkdtemp(prefix=f"trained-{ctx.lang_code}-{ctx.timestamp}")
215+
log.info(f"Output directory set to: {ctx.output_dir}")
218216

219217
# Location where intermediate files will be created.
220218
ctx.training_dir = mkdtemp(prefix=f"{ctx.lang_code}-{ctx.timestamp}")
221219
# Location of log file for the whole run.
222220
ctx.log_file = Path(ctx.training_dir) / "tesstrain.log"
223-
log.info(f"Log file {ctx.log_file}")
221+
log.info(f"Log file location: {ctx.log_file}")
224222

225223
def show_tmpdir_location(training_dir):
226224
# On successful exit we will delete this first; on failure we want to let the user
@@ -356,7 +354,7 @@ def phase_I_generate_image(ctx, par_factor):
356354
# for tesseract to recognize during training. Take only the ngrams whose
357355
# combined weight accounts for 95% of all the bigrams in the language.
358356
lines = Path(ctx.bigram_freqs_file).read_text(encoding="utf-8").split("\n")
359-
records = (line.split(" ") for line in splittable_lines)
357+
records = (line.split(" ") for line in lines)
360358
p = 0.99
361359
ngram_frac = p * sum(int(rec[1]) for rec in records if len(rec) >= 2)
362360

0 commit comments

Comments
 (0)