Skip to content

Commit 660dbaa

Browse files
committed
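Add the min_characters_to_try parameter (default 50), replacing the hard-coded kMinCharactersToTry constant, so the character count below which orientation/script detection skips a page entirely is configurable.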
fixes #1729
1 parent 2cb609d commit 660dbaa

File tree

3 files changed

+10
-6
lines changed

3 files changed

+10
-6
lines changed

src/ccmain/osdetect.cpp

+5-6
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,6 @@
3636
#include <algorithm>
3737
#include <memory>
3838

39-
const int kMinCharactersToTry = 50;
40-
const int kMaxCharactersToTry = 5 * kMinCharactersToTry;
41-
4239
const float kSizeRatioToReject = 2.0;
4340
const int kMinAcceptableBlobHeight = 10;
4441

@@ -278,6 +275,8 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
278275
BLOBNBOX_CLIST* blob_list, OSResults* osr,
279276
tesseract::Tesseract* tess) {
280277
OSResults osr_;
278+
int minCharactersToTry = tess->min_characters_to_try;
279+
int maxCharactersToTry = 5 * minCharactersToTry;
281280
if (osr == nullptr)
282281
osr = &osr_;
283282

@@ -286,13 +285,13 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
286285
ScriptDetector s(allowed_scripts, osr, tess);
287286

288287
BLOBNBOX_C_IT filtered_it(blob_list);
289-
int real_max = std::min(filtered_it.length(), kMaxCharactersToTry);
288+
int real_max = std::min(filtered_it.length(), maxCharactersToTry);
290289
// tprintf("Total blobs found = %d\n", blobs_total);
291290
// tprintf("Number of blobs post-filtering = %d\n", filtered_it.length());
292291
// tprintf("Number of blobs to try = %d\n", real_max);
293292

294293
// If there are too few characters, skip this page entirely.
295-
if (real_max < kMinCharactersToTry / 2) {
294+
if (real_max < minCharactersToTry / 2) {
296295
tprintf("Too few characters. Skipping this page\n");
297296
return 0;
298297
}
@@ -307,7 +306,7 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
307306
int num_blobs_evaluated = 0;
308307
for (int i = 0; i < real_max; ++i) {
309308
if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess)
310-
&& i > kMinCharactersToTry) {
309+
&& i > minCharactersToTry) {
311310
break;
312311
}
313312
++num_blobs_evaluated;

src/ccmain/tesseractclass.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,9 @@ Tesseract::Tesseract()
397397
INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params()),
398398
INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image",
399399
this->params()),
400+
INT_MEMBER(min_characters_to_try, 50,
401+
"Specify minimum characters to try to skip page entirely",
402+
this->params()),
400403
STRING_MEMBER(unrecognised_char, "|",
401404
"Output char for unidentified blobs", this->params()),
402405
INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),

src/ccmain/tesseractclass.h

+2
Original file line numberDiff line numberDiff line change
@@ -1043,6 +1043,8 @@ class Tesseract : public Wordrec {
10431043
"Create PDF with only one invisible text layer");
10441044
INT_VAR_H(jpg_quality, 85, "Set JPEG quality level");
10451045
INT_VAR_H(user_defined_dpi, 0, "Specify DPI for input image");
1046+
INT_VAR_H(min_characters_to_try, 50,
1047+
"Specify minimum characters to try to skip page entirely");
10461048
STRING_VAR_H(unrecognised_char, "|",
10471049
"Output char for unidentified blobs");
10481050
INT_VAR_H(suspect_level, 99, "Suspect marker level");

0 commit comments

Comments
 (0)