Skip to content

Commit d385269

Browse files
committed
Add support for Tesseract version 3.05.00
This is a bit more involved, because Tesseract 3.05.00 comes not only with improvements but also with a few quirks we need to deal with. The first quirk is that the order of the arguments of the `tesseract' command now matters, and the list of configurations has to be at the end of the command line. So we add a new attribute tesseract_flags to the BaseBuilder class that contains a list of all the flags to pass to `tesseract'. The tesseract_configs attribute, however, remains pretty much the same, but now only really contains a list of configs instead of being mixed with flag arguments. Another quirk has to do with Leptonica >= 1.74, which Tesseract 3.05.00 now requires. Leptonica has special handling of files that reside in /tmp and assumes that they are internal temporary files of Leptonica. In order to deal with this, we now run Tesseract in a temporary directory which contains the input/output files, and use the relative names of these files, because Leptonica only searches for path names beginning with /tmp. Fortunately, the last item we need to address is not really a quirk, but an API change. In Tesseract 3.05.00 there is now a new function called TessBaseAPIDetectOrientationScript(), which doesn't fill the OSResults object anymore but instead returns the values we're interested in through arguments passed directly by reference. We need to use this new function because the old function TessBaseAPIDetectOS() now *always* returns false. 
Ran the test suite successfully with Python 3.5 and both Tesseract 3.04.01 and 3.05.00 except the following tests, which also didn't succeed prior to this commit: * cuneiform:TestTxt.test_basic * cuneiform:TestTxt.test_european * cuneiform:TestTxt.test_french * cuneiform:TestWordBox.test_basic * cuneiform:TestWordBox.test_european * cuneiform:TestWordBox.test_french * libtesseract:TestBasicDoc.test_basic * libtesseract:TestDigitLineBox.test_digits * libtesseract:TestLineBox.test_japanese * libtesseract:TestTxt.test_japanese * libtesseract:TestWordBox.test_japanese * tesseract:TestDigitLineBox.test_digits * tesseract:TestTxt.test_japanese The failure of these test cases is probably related to issue openpaperwork#52, but from looking at the failures it doesn't seem to be related to this change anyway. Signed-off-by: aszlig <[email protected]>
1 parent f232402 commit d385269

File tree

5 files changed

+101
-46
lines changed

5 files changed

+101
-46
lines changed

src/pyocr/builders.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,10 @@ class BaseBuilder(object):
240240
cuneiform_args : Arguments passed to the Cuneiform command line.
241241
"""
242242

243-
def __init__(self, file_extensions, tesseract_configs, cuneiform_args):
243+
def __init__(self, file_extensions, tesseract_flags, tesseract_configs,
244+
cuneiform_args):
244245
self.file_extensions = file_extensions
246+
self.tesseract_flags = tesseract_flags
245247
self.tesseract_configs = tesseract_configs
246248
self.cuneiform_args = cuneiform_args
247249

@@ -298,15 +300,15 @@ class TextBuilder(BaseBuilder):
298300
def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False,
299301
cuneiform_fax=False, cuneiform_singlecolumn=False):
300302
file_ext = ["txt"]
301-
tess_conf = ["-psm", str(tesseract_layout)]
303+
tess_flags = ["-psm", str(tesseract_layout)]
302304
cun_args = ["-f", "text"]
303305
# Add custom cuneiform parameters if needed
304306
for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"),
305307
(cuneiform_fax, "--fax"),
306308
(cuneiform_singlecolumn, "--singlecolumn")]:
307309
if par:
308310
cun_args.append(arg)
309-
super(TextBuilder, self).__init__(file_ext, tess_conf, cun_args)
311+
super(TextBuilder, self).__init__(file_ext, tess_flags, [], cun_args)
310312
self.tesseract_layout = tesseract_layout
311313
self.built_text = []
312314

@@ -540,9 +542,11 @@ class WordBoxBuilder(BaseBuilder):
540542

541543
def __init__(self, tesseract_layout=1):
542544
file_ext = ["html", "hocr"]
543-
tess_conf = ["hocr", "-psm", str(tesseract_layout)]
545+
tess_flags = ["-psm", str(tesseract_layout)]
546+
tess_conf = ["hocr"]
544547
cun_args = ["-f", "hocr"]
545-
super(WordBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
548+
super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
549+
cun_args)
546550
self.word_boxes = []
547551
self.tesseract_layout = tesseract_layout
548552

@@ -614,9 +618,11 @@ class LineBoxBuilder(BaseBuilder):
614618

615619
def __init__(self, tesseract_layout=1):
616620
file_ext = ["html", "hocr"]
617-
tess_conf = ["hocr", "-psm", str(tesseract_layout)]
621+
tess_flags = ["-psm", str(tesseract_layout)]
622+
tess_conf = ["hocr"]
618623
cun_args = ["-f", "hocr"]
619-
super(LineBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
624+
super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
625+
cun_args)
620626
self.lines = []
621627
self.tesseract_layout = tesseract_layout
622628

src/pyocr/libtesseract/tesseract_raw.py

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -263,11 +263,22 @@ class OSResults(ctypes.Structure):
263263
]
264264
g_libtesseract.TessDeleteText.restype = None
265265

266-
g_libtesseract.TessBaseAPIDetectOS.argtypes = [
267-
ctypes.c_void_p, # TessBaseAPI*
268-
ctypes.POINTER(OSResults),
269-
]
270-
g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool
266+
if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
267+
g_libtesseract.TessBaseAPIDetectOrientationScript.argtypes = [
268+
ctypes.c_void_p, # TessBaseAPI*
269+
ctypes.POINTER(ctypes.c_int), # orient_deg
270+
ctypes.POINTER(ctypes.c_float), # orient_conf
271+
ctypes.POINTER(ctypes.c_char_p), # script_name
272+
ctypes.POINTER(ctypes.c_float), # script_conf
273+
]
274+
g_libtesseract.TessBaseAPIDetectOrientationScript.restype = \
275+
ctypes.c_bool
276+
else:
277+
g_libtesseract.TessBaseAPIDetectOS.argtypes = [
278+
ctypes.c_void_p, # TessBaseAPI*
279+
ctypes.POINTER(OSResults),
280+
]
281+
g_libtesseract.TessBaseAPIDetectOS.restype = ctypes.c_bool
271282

272283

273284
def init(lang=None):
@@ -526,15 +537,37 @@ def detect_os(handle):
526537
global g_libtesseract
527538
assert(g_libtesseract)
528539

529-
results = OSResults()
530-
r = g_libtesseract.TessBaseAPIDetectOS(
531-
ctypes.c_void_p(handle),
532-
ctypes.pointer(results)
533-
)
534-
if not r:
535-
raise TesseractError("detect_orientation failed",
536-
"TessBaseAPIDetectOS() failed")
537-
return {
538-
"orientation": results.best_orientation_id,
539-
"confidence": results.best_oconfidence,
540-
}
540+
# Use the new API function if it is available, because since Tesseract
541+
# 3.05.00 the old API function _always_ returns False.
542+
if hasattr(g_libtesseract, 'TessBaseAPIDetectOrientationScript'):
543+
orientation_deg = ctypes.c_int(0)
544+
orientation_confidence = ctypes.c_float(0.0)
545+
546+
r = g_libtesseract.TessBaseAPIDetectOrientationScript(
547+
ctypes.c_void_p(handle),
548+
ctypes.byref(orientation_deg),
549+
ctypes.byref(orientation_confidence),
550+
None, # script_name
551+
None # script_confidence
552+
)
553+
554+
if not r:
555+
raise TesseractError("detect_orientation failed",
556+
"TessBaseAPIDetectOrientationScript() failed")
557+
return {
558+
"orientation": round(orientation_deg.value / 90),
559+
"confidence": orientation_confidence.value,
560+
}
561+
else: # old API (before Tesseract 3.05.00)
562+
results = OSResults()
563+
r = g_libtesseract.TessBaseAPIDetectOS(
564+
ctypes.c_void_p(handle),
565+
ctypes.pointer(results)
566+
)
567+
if not r:
568+
raise TesseractError("detect_orientation failed",
569+
"TessBaseAPIDetectOS() failed")
570+
return {
571+
"orientation": results.best_orientation_id,
572+
"confidence": results.best_oconfidence,
573+
}

src/pyocr/tesseract.py

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
import subprocess
2323
import sys
2424
import tempfile
25+
import contextlib
26+
import shutil
2527

2628
from . import builders
2729
from . import error
@@ -62,9 +64,11 @@ class CharBoxBuilder(builders.BaseBuilder):
6264

6365
def __init__(self):
6466
file_ext = ["box"]
67+
tess_flags = []
6568
tess_conf = ["batch.nochop", "makebox"]
6669
cun_args = []
67-
super(CharBoxBuilder, self).__init__(file_ext, tess_conf, cun_args)
70+
super(CharBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
71+
cun_args)
6872
self.tesseract_layout = 1
6973

7074
@staticmethod
@@ -173,18 +177,19 @@ def detect_orientation(image, lang=None):
173177
TesseractError --- if no script detected on the image
174178
"""
175179
_set_environment()
176-
with temp_file(".bmp") as input_file:
177-
command = [TESSERACT_CMD, input_file.name, 'stdout', "-psm", "0"]
180+
with temp_dir() as tmpdir:
181+
command = [TESSERACT_CMD, "input.bmp", 'stdout', "-psm", "0"]
178182
if lang is not None:
179183
command += ['-l', lang]
180184

181185
if image.mode != "RGB":
182186
image = image.convert("RGB")
183-
image.save(input_file.name)
187+
image.save(os.path.join(tmpdir, "input.bmp"))
184188

185189
proc = subprocess.Popen(command, stdin=subprocess.PIPE, shell=False,
186190
startupinfo=g_subprocess_startup_info,
187191
creationflags=g_creation_flags,
192+
cwd=tmpdir,
188193
stdout=subprocess.PIPE,
189194
stderr=subprocess.STDOUT)
190195
proc.stdin.close()
@@ -224,8 +229,8 @@ def get_available_builders():
224229
]
225230

226231

227-
def run_tesseract(input_filename, output_filename_base, lang=None,
228-
configs=None):
232+
def run_tesseract(input_filename, output_filename_base, cwd=None, lang=None,
233+
flags=None, configs=None):
229234
'''
230235
Runs Tesseract:
231236
`TESSERACT_CMD` \
@@ -238,6 +243,8 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
238243
input_filename --- image to read
239244
output_filename_base --- file name in which must be stored the result
240245
(without the extension)
246+
cwd --- Run Tesseract in the specified working directory or use current
247+
one if None
241248
lang --- Tesseract language to use (if None, none will be specified)
242249
config --- List of Tesseract configs to use (if None, none will be
243250
specified)
@@ -252,10 +259,13 @@ def run_tesseract(input_filename, output_filename_base, lang=None,
252259
if lang is not None:
253260
command += ['-l', lang]
254261

262+
if flags is not None:
263+
command += flags
264+
255265
if configs is not None:
256266
command += configs
257267

258-
proc = subprocess.Popen(command,
268+
proc = subprocess.Popen(command, cwd=cwd,
259269
startupinfo=g_subprocess_startup_info,
260270
creationflags=g_creation_flags,
261271
stdout=subprocess.PIPE,
@@ -301,11 +311,18 @@ def close(self):
301311
self.name = None
302312

303313

304-
def temp_file(suffix):
305-
''' Returns a temporary file '''
306-
if os.name == 'nt': # Windows
307-
return ReOpenableTempfile(suffix)
308-
return tempfile.NamedTemporaryFile(prefix='tess_', suffix=suffix)
314+
@contextlib.contextmanager
315+
def temp_dir():
316+
"""
317+
A context manager for maintaining a temporary directory
318+
"""
319+
# NOTE: Drop this as soon as we don't support Python 2.7 anymore, because
320+
# since Python 3.2 there is a context manager called TemporaryDirectory().
321+
path = tempfile.mkdtemp(prefix='tess_')
322+
try:
323+
yield path
324+
finally:
325+
shutil.rmtree(path)
309326

310327

311328
def image_to_string(image, lang=None, builder=None):
@@ -329,23 +346,20 @@ def image_to_string(image, lang=None, builder=None):
329346

330347
if builder is None:
331348
builder = builders.TextBuilder()
332-
with temp_file(".bmp") as input_file:
333-
with temp_file('') as output_file:
334-
output_file_name_base = output_file.name
335-
349+
with temp_dir() as tmpdir:
336350
if image.mode != "RGB":
337351
image = image.convert("RGB")
338-
image.save(input_file.name)
339-
(status, errors) = run_tesseract(input_file.name,
340-
output_file_name_base,
352+
image.save(os.path.join(tmpdir, "input.bmp"))
353+
(status, errors) = run_tesseract("input.bmp", "output", cwd=tmpdir,
341354
lang=lang,
355+
flags=builder.tesseract_flags,
342356
configs=builder.tesseract_configs)
343357
if status:
344358
raise TesseractError(status, errors)
345359

346360
output_file_name = "ERROR"
347361
for file_extension in builder.file_extensions:
348-
output_file_name = ('%s.%s' % (output_file_name_base,
362+
output_file_name = ('%s.%s' % (os.path.join(tmpdir, "output"),
349363
file_extension))
350364
if not os.access(output_file_name, os.F_OK):
351365
continue

tests/tests_libtesseract.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,9 @@ def test_version(self):
3333
(3, 3, 0),
3434
(3, 4, 0),
3535
(3, 4, 1),
36+
(3, 5, 0),
3637
), ("Tesseract does not have the expected version"
37-
" (3.4.0) ! Some tests will be skipped !"))
38+
" (3.5.0) ! Some tests will be skipped !"))
3839

3940
def test_langs(self):
4041
langs = libtesseract.get_available_languages()

tests/tests_tesseract.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,9 @@ def test_version(self):
2727
(3, 3, 0),
2828
(3, 4, 0),
2929
(3, 4, 1),
30+
(3, 5, 0),
3031
), ("Tesseract does not have the expected version"
31-
" (3.4.0) ! Some tests will be skipped !"))
32+
" (3.5.0) ! Some tests will be skipped !"))
3233

3334
def test_langs(self):
3435
langs = tesseract.get_available_languages()

0 commit comments

Comments
 (0)