Skip to content

Commit b453f74

Browse files
committed
Fixed issue #633 (multi-language mode)
1 parent ca16a08 commit b453f74

File tree

5 files changed

+70
-43
lines changed

5 files changed

+70
-43
lines changed

ccmain/control.cpp

+46-35
Original file line numberDiff line numberDiff line change
@@ -754,29 +754,39 @@ void Tesseract::script_pos_pass(PAGE_RES* page_res) {
754754
}
755755
}
756756

757-
// Factored helper considers the indexed word and updates all the pointed
758-
// values.
759-
static void EvaluateWord(const PointerVector<WERD_RES>& words, int index,
760-
float* rating, float* certainty, bool* bad,
761-
bool* valid_permuter, int* right, int* next_left) {
757+
// Helper finds the gap between the indexed word and the next word.
758+
static void WordGap(const PointerVector<WERD_RES>& words, int index, int* right,
759+
int* next_left) {
762760
*right = -MAX_INT32;
763761
*next_left = MAX_INT32;
764762
if (index < words.size()) {
763+
*right = words[index]->word->bounding_box().right();
764+
if (index + 1 < words.size())
765+
*next_left = words[index + 1]->word->bounding_box().left();
766+
}
767+
}
768+
769+
// Factored helper computes the rating, certainty, badness and validity of
770+
// the permuter of the words in [first_index, end_index).
771+
static void EvaluateWordSpan(const PointerVector<WERD_RES>& words,
772+
int first_index, int end_index, float* rating,
773+
float* certainty, bool* bad,
774+
bool* valid_permuter) {
775+
if (end_index <= first_index) {
776+
*bad = true;
777+
*valid_permuter = false;
778+
}
779+
for (int index = first_index; index < end_index && index < words.size();
780+
++index) {
765781
WERD_CHOICE* choice = words[index]->best_choice;
766-
if (choice == NULL) {
782+
if (choice == nullptr) {
767783
*bad = true;
768784
} else {
769785
*rating += choice->rating();
770786
*certainty = MIN(*certainty, choice->certainty());
771787
if (!Dict::valid_word_permuter(choice->permuter(), false))
772788
*valid_permuter = false;
773789
}
774-
*right = words[index]->word->bounding_box().right();
775-
if (index + 1 < words.size())
776-
*next_left = words[index + 1]->word->bounding_box().left();
777-
} else {
778-
*valid_permuter = false;
779-
*bad = true;
780790
}
781791
}
782792

@@ -801,24 +811,13 @@ static int SelectBestWords(double rating_ratio,
801811
while (b < best_words->size() || n < new_words->size()) {
802812
// Start of the current run in each.
803813
int start_b = b, start_n = n;
804-
// Rating of the current run in each.
805-
float b_rating = 0.0f, n_rating = 0.0f;
806-
// Certainty of the current run in each.
807-
float b_certainty = 0.0f, n_certainty = 0.0f;
808-
// True if any word is missing its best choice.
809-
bool b_bad = false, n_bad = false;
810-
// True if all words have a valid permuter.
811-
bool b_valid_permuter = true, n_valid_permuter = true;
812-
813814
while (b < best_words->size() || n < new_words->size()) {
814815
int b_right = -MAX_INT32;
815816
int next_b_left = MAX_INT32;
816-
EvaluateWord(*best_words, b, &b_rating, &b_certainty, &b_bad,
817-
&b_valid_permuter, &b_right, &next_b_left);
817+
WordGap(*best_words, b, &b_right, &next_b_left);
818818
int n_right = -MAX_INT32;
819819
int next_n_left = MAX_INT32;
820-
EvaluateWord(*new_words, n, &n_rating, &n_certainty, &n_bad,
821-
&n_valid_permuter, &n_right, &next_n_left);
820+
WordGap(*new_words, n, &n_right, &next_n_left);
822821
if (MAX(b_right, n_right) < MIN(next_b_left, next_n_left)) {
823822
// The word breaks overlap. [start_b,b] and [start_n, n] match.
824823
break;
@@ -830,29 +829,41 @@ static int SelectBestWords(double rating_ratio,
830829
else
831830
++n;
832831
}
832+
// Rating of the current run in each.
833+
float b_rating = 0.0f, n_rating = 0.0f;
834+
// Certainty of the current run in each.
835+
float b_certainty = 0.0f, n_certainty = 0.0f;
836+
// True if any word is missing its best choice.
837+
bool b_bad = false, n_bad = false;
838+
// True if all words have a valid permuter.
839+
bool b_valid_permuter = true, n_valid_permuter = true;
840+
int end_b = b < best_words->size() ? b + 1 : b;
841+
int end_n = n < new_words->size() ? n + 1 : n;
842+
EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty,
843+
&b_bad, &b_valid_permuter);
844+
EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty,
845+
&n_bad, &n_valid_permuter);
833846
bool new_better = false;
834847
if (!n_bad && (b_bad || (n_certainty > b_certainty &&
835848
n_rating < b_rating) ||
836849
(!b_valid_permuter && n_valid_permuter &&
837850
n_rating < b_rating * rating_ratio &&
838851
n_certainty > b_certainty - certainty_margin))) {
839852
// New is better.
840-
for (int i = start_n; i <= n; ++i) {
853+
for (int i = start_n; i < end_n; ++i) {
841854
out_words.push_back((*new_words)[i]);
842855
(*new_words)[i] = NULL;
843856
++num_new;
844857
}
845858
new_better = true;
846859
} else if (!b_bad) {
847860
// Current best is better.
848-
for (int i = start_b; i <= b; ++i) {
861+
for (int i = start_b; i < end_b; ++i) {
849862
out_words.push_back((*best_words)[i]);
850863
(*best_words)[i] = NULL;
851864
++num_best;
852865
}
853866
}
854-
int end_b = b < best_words->size() ? b + 1 : b;
855-
int end_n = n < new_words->size() ? n + 1 : n;
856867
if (debug) {
857868
tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
858869
" valid dict: %d v %d\n",
@@ -875,10 +886,9 @@ static int SelectBestWords(double rating_ratio,
875886
// Returns positive if this recognizer found more new best words than the
876887
// number kept from best_words.
877888
int Tesseract::RetryWithLanguage(const WordData& word_data,
878-
WordRecognizer recognizer,
889+
WordRecognizer recognizer, bool debug,
879890
WERD_RES** in_word,
880891
PointerVector<WERD_RES>* best_words) {
881-
bool debug = classify_debug_level;
882892
if (debug) {
883893
tprintf("Trying word using lang %s, oem %d\n",
884894
lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
@@ -1281,7 +1291,8 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
12811291
// Points to the best result. May be word or in lang_words.
12821292
WERD_RES* word = word_data->word;
12831293
clock_t start_t = clock();
1284-
if (classify_debug_level) {
1294+
bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1295+
if (debug) {
12851296
tprintf("%s word with lang %s at:",
12861297
word->done ? "Already done" : "Processing",
12871298
most_recently_used_->lang.string());
@@ -1300,20 +1311,20 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
13001311
most_recently_used_ != sub_langs_[sub]; ++sub) {}
13011312
}
13021313
most_recently_used_->RetryWithLanguage(
1303-
*word_data, recognizer, &word_data->lang_words[sub], &best_words);
1314+
*word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
13041315
Tesseract* best_lang_tess = most_recently_used_;
13051316
if (!WordsAcceptable(best_words)) {
13061317
// Try all the other languages to see if they are any better.
13071318
if (most_recently_used_ != this &&
1308-
this->RetryWithLanguage(*word_data, recognizer,
1319+
this->RetryWithLanguage(*word_data, recognizer, debug,
13091320
&word_data->lang_words[sub_langs_.size()],
13101321
&best_words) > 0) {
13111322
best_lang_tess = this;
13121323
}
13131324
for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
13141325
++i) {
13151326
if (most_recently_used_ != sub_langs_[i] &&
1316-
sub_langs_[i]->RetryWithLanguage(*word_data, recognizer,
1327+
sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
13171328
&word_data->lang_words[i],
13181329
&best_words) > 0) {
13191330
best_lang_tess = sub_langs_[i];

ccmain/linerec.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,7 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
309309
word_certainty);
310310
word->best_choice->print();
311311
}
312+
word->best_choice->set_certainty(word_certainty);
312313
// Discard words that are impossibly bad, but allow a bit more for
313314
// dictionary words, and keep bad words in non-space-delimited langs.
314315
if (word_certainty >= RecodeBeamSearch::kMinCertainty ||
@@ -324,7 +325,6 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
324325
// It is a dud.
325326
word->SetupFake(lstm_recognizer_->GetUnicharset());
326327
}
327-
word->best_choice->set_certainty(word_certainty);
328328
}
329329
}
330330
}

ccmain/tesseractclass.cpp

+6-2
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,8 @@ Tesseract::Tesseract()
214214
BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
215215
double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
216216
double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
217+
INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.",
218+
this->params()),
217219
INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
218220
this->params()),
219221
BOOL_MEMBER(paragraph_text_based, true,
@@ -636,6 +638,8 @@ Tesseract::~Tesseract() {
636638
}
637639

638640
void Tesseract::Clear() {
641+
STRING debug_name = imagebasename + "_debug.pdf";
642+
pixa_debug_.WritePDF(debug_name.string());
639643
pixDestroy(&pix_binary_);
640644
pixDestroy(&pix_grey_);
641645
pixDestroy(&pix_thresholds_);
@@ -703,7 +707,7 @@ void Tesseract::PrepareForPageseg() {
703707
// the newly split image.
704708
splitter_.set_orig_pix(pix_binary());
705709
splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
706-
if (splitter_.Split(true)) {
710+
if (splitter_.Split(true, &pixa_debug_)) {
707711
ASSERT_HOST(splitter_.splitted_image());
708712
pixDestroy(&pix_binary_);
709713
pix_binary_ = pixClone(splitter_.splitted_image());
@@ -732,7 +736,7 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST* block_list,
732736
splitter_.set_segmentation_block_list(block_list);
733737
splitter_.set_ocr_split_strategy(max_ocr_strategy);
734738
// Run the splitter for OCR
735-
bool split_for_ocr = splitter_.Split(false);
739+
bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
736740
// Restore pix_binary to the binarized original pix for future reference.
737741
ASSERT_HOST(splitter_.orig_pix());
738742
pixDestroy(&pix_binary_);

ccmain/tesseractclass.h

+8-5
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,12 @@
2828

2929
#include "allheaders.h"
3030
#include "control.h"
31-
#include "docqual.h"
31+
#include "debugpixa.h"
3232
#include "devanagari_processing.h"
33+
#include "docqual.h"
3334
#include "genericvector.h"
34-
#include "params.h"
3535
#include "ocrclass.h"
36+
#include "params.h"
3637
#include "textord.h"
3738
#include "wordrec.h"
3839

@@ -372,9 +373,8 @@ class Tesseract : public Wordrec {
372373
// Helper to recognize the word using the given (language-specific) tesseract.
373374
// Returns positive if this recognizer found more new best words than the
374375
// number kept from best_words.
375-
int RetryWithLanguage(const WordData& word_data,
376-
WordRecognizer recognizer,
377-
WERD_RES** in_word,
376+
int RetryWithLanguage(const WordData& word_data, WordRecognizer recognizer,
377+
bool debug, WERD_RES** in_word,
378378
PointerVector<WERD_RES>* best_words);
379379
// Moves good-looking "noise"/diacritics from the reject list to the main
380380
// blob list on the current word. Returns true if anything was done, and
@@ -907,6 +907,7 @@ class Tesseract : public Wordrec {
907907
BOOL_VAR_H(test_pt, false, "Test for point");
908908
double_VAR_H(test_pt_x, 99999.99, "xcoord");
909909
double_VAR_H(test_pt_y, 99999.99, "ycoord");
910+
INT_VAR_H(multilang_debug_level, 0, "Print multilang debug info.");
910911
INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
911912
BOOL_VAR_H(paragraph_text_based, true,
912913
"Run paragraph detection on the post-text-recognition "
@@ -1194,6 +1195,8 @@ class Tesseract : public Wordrec {
11941195
Pix* pix_original_;
11951196
// Thresholds that were used to generate the thresholded image from grey.
11961197
Pix* pix_thresholds_;
1198+
// Debug images. If non-empty, will be written to a PDF when Clear() is called.
1199+
DebugPixa pixa_debug_;
11971200
// Input image resolution after any scaling. The resolution is not well
11981201
// transmitted by operations on Pix, so we keep an independent record here.
11991202
int source_resolution_;

lstm/recodebeam.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,15 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
276276
}
277277
if (t < width) {
278278
int unichar_id = best_nodes[t]->unichar_id;
279+
if (unichar_id == UNICHAR_SPACE && !certs->empty() &&
280+
best_nodes[t]->permuter != NO_PERM) {
281+
// All the rating and certainty go on the previous character except
282+
// for the space itself.
283+
if (certainty < certs->back()) certs->back() = certainty;
284+
ratings->back() += rating;
285+
certainty = 0.0;
286+
rating = 0.0;
287+
}
279288
unichar_ids->push_back(unichar_id);
280289
xcoords->push_back(t);
281290
do {

0 commit comments

Comments
 (0)