@@ -754,29 +754,39 @@ void Tesseract::script_pos_pass(PAGE_RES* page_res) {
754
754
}
755
755
}
756
756
757
- // Factored helper considers the indexed word and updates all the pointed
758
- // values.
759
- static void EvaluateWord (const PointerVector<WERD_RES>& words, int index,
760
- float * rating, float * certainty, bool * bad,
761
- bool * valid_permuter, int * right, int * next_left) {
757
+ // Helper finds the gap between the index word and the next.
758
+ static void WordGap (const PointerVector<WERD_RES>& words, int index, int * right,
759
+ int * next_left) {
762
760
*right = -MAX_INT32;
763
761
*next_left = MAX_INT32;
764
762
if (index < words.size ()) {
763
+ *right = words[index ]->word ->bounding_box ().right ();
764
+ if (index + 1 < words.size ())
765
+ *next_left = words[index + 1 ]->word ->bounding_box ().left ();
766
+ }
767
+ }
768
+
769
+ // Factored helper computes the rating, certainty, badness and validity of
770
+ // the permuter of the words in [first_index, end_index).
771
+ static void EvaluateWordSpan (const PointerVector<WERD_RES>& words,
772
+ int first_index, int end_index, float * rating,
773
+ float * certainty, bool * bad,
774
+ bool * valid_permuter) {
775
+ if (end_index <= first_index) {
776
+ *bad = true ;
777
+ *valid_permuter = false ;
778
+ }
779
+ for (int index = first_index; index < end_index && index < words.size ();
780
+ ++index ) {
765
781
WERD_CHOICE* choice = words[index ]->best_choice ;
766
- if (choice == NULL ) {
782
+ if (choice == nullptr ) {
767
783
*bad = true ;
768
784
} else {
769
785
*rating += choice->rating ();
770
786
*certainty = MIN (*certainty, choice->certainty ());
771
787
if (!Dict::valid_word_permuter (choice->permuter (), false ))
772
788
*valid_permuter = false ;
773
789
}
774
- *right = words[index ]->word ->bounding_box ().right ();
775
- if (index + 1 < words.size ())
776
- *next_left = words[index + 1 ]->word ->bounding_box ().left ();
777
- } else {
778
- *valid_permuter = false ;
779
- *bad = true ;
780
790
}
781
791
}
782
792
@@ -801,24 +811,13 @@ static int SelectBestWords(double rating_ratio,
801
811
while (b < best_words->size () || n < new_words->size ()) {
802
812
// Start of the current run in each.
803
813
int start_b = b, start_n = n;
804
- // Rating of the current run in each.
805
- float b_rating = 0 .0f , n_rating = 0 .0f ;
806
- // Certainty of the current run in each.
807
- float b_certainty = 0 .0f , n_certainty = 0 .0f ;
808
- // True if any word is missing its best choice.
809
- bool b_bad = false , n_bad = false ;
810
- // True if all words have a valid permuter.
811
- bool b_valid_permuter = true , n_valid_permuter = true ;
812
-
813
814
while (b < best_words->size () || n < new_words->size ()) {
814
815
int b_right = -MAX_INT32;
815
816
int next_b_left = MAX_INT32;
816
- EvaluateWord (*best_words, b, &b_rating, &b_certainty, &b_bad,
817
- &b_valid_permuter, &b_right, &next_b_left);
817
+ WordGap (*best_words, b, &b_right, &next_b_left);
818
818
int n_right = -MAX_INT32;
819
819
int next_n_left = MAX_INT32;
820
- EvaluateWord (*new_words, n, &n_rating, &n_certainty, &n_bad,
821
- &n_valid_permuter, &n_right, &next_n_left);
820
+ WordGap (*new_words, n, &n_right, &next_n_left);
822
821
if (MAX (b_right, n_right) < MIN (next_b_left, next_n_left)) {
823
822
// The word breaks overlap. [start_b,b] and [start_n, n] match.
824
823
break ;
@@ -830,29 +829,41 @@ static int SelectBestWords(double rating_ratio,
830
829
else
831
830
++n;
832
831
}
832
+ // Rating of the current run in each.
833
+ float b_rating = 0 .0f , n_rating = 0 .0f ;
834
+ // Certainty of the current run in each.
835
+ float b_certainty = 0 .0f , n_certainty = 0 .0f ;
836
+ // True if any word is missing its best choice.
837
+ bool b_bad = false , n_bad = false ;
838
+ // True if all words have a valid permuter.
839
+ bool b_valid_permuter = true , n_valid_permuter = true ;
840
+ int end_b = b < best_words->size () ? b + 1 : b;
841
+ int end_n = n < new_words->size () ? n + 1 : n;
842
+ EvaluateWordSpan (*best_words, start_b, end_b, &b_rating, &b_certainty,
843
+ &b_bad, &b_valid_permuter);
844
+ EvaluateWordSpan (*new_words, start_n, end_n, &n_rating, &n_certainty,
845
+ &n_bad, &n_valid_permuter);
833
846
bool new_better = false ;
834
847
if (!n_bad && (b_bad || (n_certainty > b_certainty &&
835
848
n_rating < b_rating) ||
836
849
(!b_valid_permuter && n_valid_permuter &&
837
850
n_rating < b_rating * rating_ratio &&
838
851
n_certainty > b_certainty - certainty_margin))) {
839
852
// New is better.
840
- for (int i = start_n; i <= n ; ++i) {
853
+ for (int i = start_n; i < end_n ; ++i) {
841
854
out_words.push_back ((*new_words)[i]);
842
855
(*new_words)[i] = NULL ;
843
856
++num_new;
844
857
}
845
858
new_better = true ;
846
859
} else if (!b_bad) {
847
860
// Current best is better.
848
- for (int i = start_b; i <= b ; ++i) {
861
+ for (int i = start_b; i < end_b ; ++i) {
849
862
out_words.push_back ((*best_words)[i]);
850
863
(*best_words)[i] = NULL ;
851
864
++num_best;
852
865
}
853
866
}
854
- int end_b = b < best_words->size () ? b + 1 : b;
855
- int end_n = n < new_words->size () ? n + 1 : n;
856
867
if (debug) {
857
868
tprintf (" %d new words %s than %d old words: r: %g v %g c: %g v %g"
858
869
" valid dict: %d v %d\n " ,
@@ -875,10 +886,9 @@ static int SelectBestWords(double rating_ratio,
875
886
// Returns positive if this recognizer found more new best words than the
876
887
// number kept from best_words.
877
888
int Tesseract::RetryWithLanguage (const WordData& word_data,
878
- WordRecognizer recognizer,
889
+ WordRecognizer recognizer, bool debug,
879
890
WERD_RES** in_word,
880
891
PointerVector<WERD_RES>* best_words) {
881
- bool debug = classify_debug_level;
882
892
if (debug) {
883
893
tprintf (" Trying word using lang %s, oem %d\n " ,
884
894
lang.string (), static_cast <int >(tessedit_ocr_engine_mode));
@@ -1281,7 +1291,8 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
1281
1291
// Points to the best result. May be word or in lang_words.
1282
1292
WERD_RES* word = word_data->word ;
1283
1293
clock_t start_t = clock ();
1284
- if (classify_debug_level) {
1294
+ bool debug = classify_debug_level > 0 || multilang_debug_level > 0 ;
1295
+ if (debug) {
1285
1296
tprintf (" %s word with lang %s at:" ,
1286
1297
word->done ? " Already done" : " Processing" ,
1287
1298
most_recently_used_->lang .string ());
@@ -1300,20 +1311,20 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
1300
1311
most_recently_used_ != sub_langs_[sub]; ++sub) {}
1301
1312
}
1302
1313
most_recently_used_->RetryWithLanguage (
1303
- *word_data, recognizer, &word_data->lang_words [sub], &best_words);
1314
+ *word_data, recognizer, debug, &word_data->lang_words [sub], &best_words);
1304
1315
Tesseract* best_lang_tess = most_recently_used_;
1305
1316
if (!WordsAcceptable (best_words)) {
1306
1317
// Try all the other languages to see if they are any better.
1307
1318
if (most_recently_used_ != this &&
1308
- this ->RetryWithLanguage (*word_data, recognizer,
1319
+ this ->RetryWithLanguage (*word_data, recognizer, debug,
1309
1320
&word_data->lang_words [sub_langs_.size ()],
1310
1321
&best_words) > 0 ) {
1311
1322
best_lang_tess = this ;
1312
1323
}
1313
1324
for (int i = 0 ; !WordsAcceptable (best_words) && i < sub_langs_.size ();
1314
1325
++i) {
1315
1326
if (most_recently_used_ != sub_langs_[i] &&
1316
- sub_langs_[i]->RetryWithLanguage (*word_data, recognizer,
1327
+ sub_langs_[i]->RetryWithLanguage (*word_data, recognizer, debug,
1317
1328
&word_data->lang_words [i],
1318
1329
&best_words) > 0 ) {
1319
1330
best_lang_tess = sub_langs_[i];
0 commit comments