@@ -49,13 +49,10 @@ def __init__(self):
49
49
50
50
self .max_pages = 0
51
51
self .save_box_tiff = False
52
- self .output_dir = "/tmp/tesstrain/tessdata"
53
52
self .overwrite = False
54
53
self .linedata = False
55
54
self .run_shape_clustering = False
56
55
self .extract_font_properties = True
57
- self ._workspace_dir = TemporaryDirectory (prefix = "tesstrain" )
58
- self .workspace_dir = self ._workspace_dir .name
59
56
60
57
61
58
def err_exit (msg ):
@@ -88,8 +85,8 @@ def run_command(cmd, *args, env=None):
88
85
else :
89
86
try :
90
87
proclog .error (proc .stdout .decode ("utf-8" , errors = "replace" ))
91
- except Exception :
92
- pass
88
+ except Exception as e :
89
+ proclog . error ( e )
93
90
err_exit (f"Program { cmd } failed with return code { proc .returncode } . Abort." )
94
91
95
92
@@ -101,10 +98,10 @@ def check_file_readable(*filenames):
101
98
filenames = [filenames ]
102
99
for filename in filenames :
103
100
try :
104
- with Path (filename ).open () as f :
101
+ with Path (filename ).open ():
105
102
pass
106
103
except FileNotFoundError :
107
- err_exit (f"Expected file { filename } does not exist" )
104
+ err_exit (f"Required/expected file ' { filename } ' does not exist" )
108
105
except PermissionError :
109
106
err_exit (f"{ filename } is not readable" )
110
107
except IOError as e :
@@ -191,7 +188,6 @@ def check_file_readable(*filenames):
191
188
nargs = "+" ,
192
189
help = "A list of exposure levels to use (e.g. -1,0,1)." ,
193
190
)
194
- parser .add_argument ("--workspace_dir" )
195
191
196
192
197
193
# Does simple command-line parsing and initialization.
@@ -200,7 +196,6 @@ def parse_flags(argv=None):
200
196
log .debug (ctx )
201
197
parser .parse_args (args = argv , namespace = ctx )
202
198
log .debug (ctx )
203
- log .info ("Parsing" )
204
199
205
200
if not ctx .lang_code :
206
201
err_exit ("Need to specify a language --lang" )
@@ -215,12 +210,15 @@ def parse_flags(argv=None):
215
210
)
216
211
else :
217
212
ctx .tessdata_dir = tessdata_prefix
213
+ if not ctx .output_dir :
214
+ ctx .output_dir = mkdtemp (prefix = f"trained-{ ctx .lang_code } -{ ctx .timestamp } " )
215
+ log .info (f"Output directory set to: { ctx .output_dir } " )
218
216
219
217
# Location where intermediate files will be created.
220
218
ctx .training_dir = mkdtemp (prefix = f"{ ctx .lang_code } -{ ctx .timestamp } " )
221
219
# Location of log file for the whole run.
222
220
ctx .log_file = Path (ctx .training_dir ) / "tesstrain.log"
223
- log .info (f"Log file { ctx .log_file } " )
221
+ log .info (f"Log file location: { ctx .log_file } " )
224
222
225
223
def show_tmpdir_location (training_dir ):
226
224
# On successful exit we will delete this first; on failure we want to let the user
@@ -356,7 +354,7 @@ def phase_I_generate_image(ctx, par_factor):
356
354
# for tesseract to recognize during training. Take only the ngrams whose
357
355
# combined weight accounts for 99% of all the bigrams in the language.
358
356
lines = Path (ctx .bigram_freqs_file ).read_text (encoding = "utf-8" ).split ("\n " )
359
- records = (line .split (" " ) for line in splittable_lines )
357
+ records = (line .split (" " ) for line in lines )
360
358
p = 0.99
361
359
ngram_frac = p * sum (int (rec [1 ]) for rec in records if len (rec ) >= 2 )
362
360
0 commit comments