Skip to content

Commit c2f5e9b

Browse files
committed
If there is no explicit renderer(s), default to TessTextRenderer
Revert fd429c3, 43834da, 05de195. See #49, #59. The code in this commit solves the issue in a more elegant way, IMHO.

Now you can use:
* `tesseract eurotext.tif eurotext txt pdf`
* `tesseract eurotext.tif eurotext txt hocr`
* `tesseract eurotext.tif eurotext txt hocr pdf`

NOTE: With `tesseract eurotext.tif eurotext` or `tesseract eurotext.tif eurotext txt` the psm will be set to '3', but with `tesseract eurotext.tif eurotext txt pdf` or `tesseract eurotext.tif eurotext txt hocr` the psm will be set to '1'.
1 parent d4e0c64 commit c2f5e9b

File tree

8 files changed

+38
-21
lines changed

8 files changed

+38
-21
lines changed

api/tesseractmain.cpp

+33-15
Original file line numberDiff line numberDiff line change
@@ -176,16 +176,16 @@ void PrintLangsList(tesseract::TessBaseAPI* api) {
176176
/**
177177
* We have 2 possible sources of pagesegmode: a config file and
178178
* the command line. For backwards compatibility reasons, the
179-
* default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
180-
* default for this program is tesseract::PSM_AUTO. We will let
181-
* the config file take priority, so the command-line default
182-
* can take priority over the tesseract default, so we use the
183-
* value from the command line only if the retrieved mode
184-
* is still tesseract::PSM_SINGLE_BLOCK, indicating no change
185-
* in any config file. Therefore the only way to force
186-
* tesseract::PSM_SINGLE_BLOCK is from the command line.
187-
* It would be simpler if we could set the value before Init,
188-
* but that doesn't work.
179+
* default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
180+
* default for this program is tesseract::PSM_AUTO. We will let
181+
* the config file take priority, so the command-line default
182+
* can take priority over the tesseract default, so we use the
183+
* value from the command line only if the retrieved mode
184+
* is still tesseract::PSM_SINGLE_BLOCK, indicating no change
185+
* in any config file. Therefore the only way to force
186+
* tesseract::PSM_SINGLE_BLOCK is from the command line.
187+
* It would be simpler if we could set the value before Init,
188+
* but that doesn't work.
189189
*/
190190
void FixPageSegMode(tesseract::TessBaseAPI* api,
191191
tesseract::PageSegMode pagesegmode) {
@@ -295,19 +295,37 @@ void PreloadRenderers(tesseract::TessBaseAPI* api,
295295
if (b) {
296296
bool font_info;
297297
api->GetBoolVariable("hocr_font_info", &font_info);
298-
renderers->push_back(new tesseract::TessHOcrRenderer(outputbase, font_info));
298+
renderers->push_back(
299+
new tesseract::TessHOcrRenderer(outputbase, font_info));
299300
}
301+
300302
api->GetBoolVariable("tessedit_create_pdf", &b);
301303
if (b) {
302304
renderers->push_back(new tesseract::TessPDFRenderer(outputbase,
303-
api->GetDatapath()));
305+
api->GetDatapath()));
304306
}
307+
305308
api->GetBoolVariable("tessedit_write_unlv", &b);
306-
if (b) renderers->push_back(new tesseract::TessUnlvRenderer(outputbase));
309+
if (b) {
310+
renderers->push_back(new tesseract::TessUnlvRenderer(outputbase));
311+
}
312+
307313
api->GetBoolVariable("tessedit_create_boxfile", &b);
308-
if (b) renderers->push_back(new tesseract::TessBoxTextRenderer(outputbase));
314+
if (b) {
315+
renderers->push_back(new tesseract::TessBoxTextRenderer(outputbase));
316+
}
317+
318+
// disable text renderer when using one of these configs:
319+
// ambigs.train, box.train, box.train.stderr, linebox, rebox
320+
bool disable_text_renderer =
321+
(api->GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
322+
(api->GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
323+
(api->GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b);
324+
309325
api->GetBoolVariable("tessedit_create_txt", &b);
310-
if (b) renderers->push_back(new tesseract::TessTextRenderer(outputbase));
326+
if (b || (renderers->empty() && !disable_text_renderer) {
327+
renderers->push_back(new tesseract::TessTextRenderer(outputbase));
328+
}
311329
}
312330

313331
if (!renderers->empty()) {

ccmain/tesseractclass.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ Tesseract::Tesseract()
381381
this->params()),
382382
BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
383383
this->params()),
384-
BOOL_MEMBER(tessedit_create_txt, true, "Write .txt output file",
384+
BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
385385
this->params()),
386386
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
387387
this->params()),

ccmain/tesseractclass.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1001,7 +1001,7 @@ class Tesseract : public Wordrec {
10011001
BOOL_VAR_H(tessedit_write_rep_codes, false,
10021002
"Write repetition char code");
10031003
BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
1004-
BOOL_VAR_H(tessedit_create_txt, true, "Write .txt output file");
1004+
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
10051005
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
10061006
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
10071007
STRING_VAR_H(unrecognised_char, "|",

tessdata/configs/hocr

-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
1-
tessedit_create_txt 0
21
tessedit_create_hocr 1
32
tessedit_pageseg_mode 1

tessdata/configs/makebox

-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
1-
tessedit_create_txt 0
21
tessedit_create_boxfile 1

tessdata/configs/pdf

-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
1-
tessedit_create_txt 0
21
tessedit_create_pdf 1
32
tessedit_pageseg_mode 1

tessdata/configs/txt

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# This config file should be used together with other config files that create renderers.
2+
# usage example: tesseract eurotext.tif eurotext txt hocr pdf
3+
tessedit_create_txt 1

tessdata/configs/unlv

-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
1-
tessedit_create_txt 0
21
tessedit_write_unlv 1
32
tessedit_pageseg_mode 6

0 commit comments

Comments
 (0)