Skip to content

Commit 5f06402

Browse files
committed
python: optimize imports, reformat code
1 parent 2e9fd69 commit 5f06402

File tree

3 files changed

+63
-67
lines changed

3 files changed

+63
-67
lines changed

src/training/language_specific.py

+45-45
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
# Language specific info
2020
# =============================================================================
2121

22-
import os
2322
import logging
23+
import os
2424

2525
log = logging.getLogger(__name__)
2626

@@ -875,6 +875,7 @@
875875

876876
FLAGS_webtext_prefix = os.environ.get("FLAGS_webtext_prefix", "")
877877

878+
878879
# Set language-specific values for several global variables, including
879880
# ${TEXT_CORPUS}
880881
# holds the text corpus file for the language, used in phase F
@@ -1079,15 +1080,15 @@ def set_lang_specific_parameters(ctx, lang):
10791080
NUMBER_DAWG_FACTOR = 0.05
10801081
WORD_DAWG_SIZE = 1_000_000
10811082
elif lang in (
1082-
"aze_cyrl",
1083-
"bel",
1084-
"bul",
1085-
"kaz",
1086-
"mkd",
1087-
"srp",
1088-
"tgk",
1089-
"ukr",
1090-
"uzb_cyrl",
1083+
"aze_cyrl",
1084+
"bel",
1085+
"bul",
1086+
"kaz",
1087+
"mkd",
1088+
"srp",
1089+
"tgk",
1090+
"ukr",
1091+
"uzb_cyrl",
10911092
):
10921093
MIX_LANG = f"{lang}"
10931094
if not FONTS:
@@ -1326,44 +1327,44 @@ def set_lang_specific_parameters(ctx, lang):
13261327
EXPOSURES = [0]
13271328
# Set right-to-left and normalization mode.
13281329
if lang in (
1329-
"ara",
1330-
"div",
1331-
"fas",
1332-
"pus",
1333-
"snd",
1334-
"syr",
1335-
"uig",
1336-
"urd",
1337-
"kur_ara",
1338-
"heb",
1339-
"yid",
1330+
"ara",
1331+
"div",
1332+
"fas",
1333+
"pus",
1334+
"snd",
1335+
"syr",
1336+
"uig",
1337+
"urd",
1338+
"kur_ara",
1339+
"heb",
1340+
"yid",
13401341
):
13411342
LANG_IS_RTL = True
13421343
NORM_MODE = 2
13431344
elif lang in (
1344-
"asm",
1345-
"ben",
1346-
"bih",
1347-
"hin",
1348-
"mar",
1349-
"nep",
1350-
"guj",
1351-
"kan",
1352-
"mal",
1353-
"tam",
1354-
"tel",
1355-
"pan",
1356-
"dzo",
1357-
"sin",
1358-
"san",
1359-
"bod",
1360-
"ori",
1361-
"khm",
1362-
"mya",
1363-
"tha",
1364-
"lao",
1365-
"jav ",
1366-
"jav_java",
1345+
"asm",
1346+
"ben",
1347+
"bih",
1348+
"hin",
1349+
"mar",
1350+
"nep",
1351+
"guj",
1352+
"kan",
1353+
"mal",
1354+
"tam",
1355+
"tel",
1356+
"pan",
1357+
"dzo",
1358+
"sin",
1359+
"san",
1360+
"bod",
1361+
"ori",
1362+
"khm",
1363+
"mya",
1364+
"tha",
1365+
"lao",
1366+
"jav ",
1367+
"jav_java",
13671368
):
13681369
LANG_IS_RTL = False
13691370
NORM_MODE = 2
@@ -1408,7 +1409,6 @@ def set_lang_specific_parameters(ctx, lang):
14081409

14091410
return ctx
14101411

1411-
14121412
# =============================================================================
14131413
# END of Language specific info
14141414
# =============================================================================

src/training/tesstrain.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
# This script provides an easy way to execute various phases of training
1616
# Tesseract. For a detailed description of the phases, see
1717
# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
18-
#
19-
import sys
20-
import os
18+
2119
import logging
20+
import os
21+
import sys
2222

2323
if (sys.version_info.major < 3) or (sys.version_info.major == 3 and sys.version_info.minor < 6):
2424
raise Exception("Must be using Python minimum version 3.6!")
@@ -86,7 +86,6 @@ def main():
8686
if __name__ == "__main__":
8787
main()
8888

89-
9089
# _rc0 = subprocess.call(["tlog","\n=== Starting training for language '"+str(LANG_CODE.val)+"'"],shell=True)
9190
# _rc0 = subprocess.call(["source",os.popen("dirname "+__file__).read().rstrip("\n")+"/language-specific.sh"],shell=True)
9291
# _rc0 = subprocess.call(["set_lang_specific_parameters",str(LANG_CODE.val)],shell=True)

src/training/tesstrain_utils.py

+15-18
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,19 @@
1414
# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
1515
#
1616

17+
import argparse
18+
import atexit
19+
import concurrent.futures
20+
import logging
1721
import os
22+
import pathlib
1823
import platform
24+
import shutil
25+
import subprocess
1926
import sys
2027
from datetime import date
21-
from tempfile import TemporaryDirectory, mkdtemp
22-
import pathlib
23-
import logging
24-
import subprocess
25-
import argparse
2628
from operator import itemgetter
27-
import concurrent.futures
28-
import shutil
29-
import atexit
29+
from tempfile import TemporaryDirectory, mkdtemp
3030

3131
from tqdm import tqdm
3232

@@ -247,18 +247,18 @@ def show_tmpdir_location(training_dir):
247247
# specified in the command-line.
248248
if not ctx.training_text:
249249
ctx.training_text = (
250-
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
250+
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
251251
)
252252
if not ctx.wordlist_file:
253253
ctx.wordlist_file = (
254-
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
254+
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
255255
)
256256

257257
ctx.word_bigrams_file = (
258-
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
258+
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
259259
)
260260
ctx.numbers_file = (
261-
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
261+
pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
262262
)
263263
ctx.punc_file = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc"
264264
ctx.bigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
@@ -307,7 +307,6 @@ def make_outbase(ctx, fontname, exposure):
307307
# Helper function for phase_I_generate_image. Generates the image for a single
308308
# language/font combination in a way that can be run in parallel.
309309
def generate_font_image(ctx, font, exposure, char_spacing):
310-
311310
log.info(f"Rendering using {font}")
312311
fontname = make_fontname(font)
313312
outbase = make_outbase(ctx, fontname, exposure)
@@ -358,7 +357,6 @@ def generate_font_image(ctx, font, exposure, char_spacing):
358357

359358
# Phase I : Generate (I)mages from training text for each font.
360359
def phase_I_generate_image(ctx, par_factor):
361-
362360
if not par_factor or par_factor <= 0:
363361
par_factor = 1
364362

@@ -387,8 +385,8 @@ def phase_I_generate_image(ctx, par_factor):
387385
check_file_readable(ctx.train_ngrams_file)
388386

389387
with tqdm(
390-
total=len(ctx.fonts)
391-
) as pbar, concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
388+
total=len(ctx.fonts)
389+
) as pbar, concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
392390
futures = [
393391
executor.submit(generate_font_image, ctx, font, exposure, char_spacing)
394392
for font in ctx.fonts
@@ -533,7 +531,7 @@ def phase_E_extract_features(ctx, box_config, ext):
533531
log.info(f"Using TESSDATA_PREFIX={tessdata_environ['TESSDATA_PREFIX']}")
534532

535533
with tqdm(total=len(img_files)) as pbar, concurrent.futures.ThreadPoolExecutor(
536-
max_workers=2
534+
max_workers=2
537535
) as executor:
538536
futures = []
539537
for img_file in img_files:
@@ -693,7 +691,6 @@ def get_file_list():
693691
dir_listing = (str(p) for p in path_output.glob(f"{ctx.lang_code}.*.lstmf"))
694692
pathlib.Path(lstm_list).write_text("\n".join(dir_listing))
695693

696-
697694
# make__traineddata() {
698695
# tlog "\n=== Making final traineddata file ==="
699696
# local lang_prefix={ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}

0 commit comments

Comments
 (0)