@@ -27,61 +27,59 @@ namespace tesseract {
27
27
* page_number is a 0-based page index that will appear in the box file.
28
28
* Returned string must be freed with the delete [] operator.
29
29
*/
30
+ static void AddBoxToLSTM (int right, int bottom, int top,
31
+ int image_height_, int page_num,
32
+ STRING* text) {
33
+ text->add_str_int (" " , image_height_ - bottom);
34
+ text->add_str_int (" " , right + 5 );
35
+ text->add_str_int (" " , image_height_ - top);
36
+ text->add_str_int (" " , page_num);
37
+ }
30
38
31
39
char * TessBaseAPI::GetLSTMBOXText (int page_number) {
32
40
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize (nullptr ) < 0 ))
33
41
return nullptr ;
34
42
35
43
STRING lstm_box_str (" " );
36
-
37
44
int page_num = page_number;
38
45
bool first_word = true ;
39
-
46
+ int left, top, right, bottom;
47
+
40
48
LTRResultIterator* res_it = GetLTRIterator ();
41
49
while (!res_it->Empty (RIL_BLOCK)) {
42
50
if (res_it->Empty (RIL_SYMBOL)) {
43
51
res_it->Next (RIL_SYMBOL);
44
52
continue ;
45
53
}
46
-
47
- int left, top, right, bottom;
48
-
49
54
if (!first_word) {
55
+ if (!(res_it->IsAtBeginningOf (RIL_TEXTLINE))) {
50
56
if (res_it->IsAtBeginningOf (RIL_WORD)) {
51
57
lstm_box_str.add_str_int (" " , left);
52
- lstm_box_str.add_str_int (" " , image_height_ - bottom);
53
- lstm_box_str.add_str_int (" " , right + 5 );
54
- lstm_box_str.add_str_int (" " , image_height_ - top);
55
- lstm_box_str.add_str_int (" " , page_num); // - word
58
+ AddBoxToLSTM (right, bottom, top, image_height_, page_num, &lstm_box_str);
56
59
lstm_box_str += " \n " ; // end of row for word
57
- }
60
+ } // word
61
+ } else {
58
62
if (res_it->IsAtBeginningOf (RIL_TEXTLINE)) {
59
63
lstm_box_str.add_str_int (" \t " , left);
60
- lstm_box_str.add_str_int (" " , image_height_ - bottom);
61
- lstm_box_str.add_str_int (" " , right + 5 );
62
- lstm_box_str.add_str_int (" " , image_height_ - top);
63
- lstm_box_str.add_str_int (" " , page_num); // - line
64
+ AddBoxToLSTM (right, bottom, top, image_height_, page_num, &lstm_box_str);
64
65
lstm_box_str += " \n " ; // end of row for line
65
- }
66
- }
66
+ } // line
67
+ }
68
+ } // not first word
67
69
first_word=false ;
68
- // Use bounding box for whole line for every character
70
+ // Use bounding box for whole line for everything
69
71
res_it->BoundingBox (RIL_TEXTLINE, &left, &top, &right, &bottom);
70
-
71
- do {
72
- lstm_box_str +=
72
+ do { lstm_box_str +=
73
73
std::unique_ptr<const char []>(res_it->GetUTF8Text (RIL_SYMBOL)).get ();
74
74
res_it->Next (RIL_SYMBOL);
75
75
} while (!res_it->Empty (RIL_BLOCK) && !res_it->IsAtBeginningOf (RIL_SYMBOL));
76
-
77
76
lstm_box_str.add_str_int (" " , left);
78
- lstm_box_str.add_str_int (" " , image_height_ - bottom);
79
- lstm_box_str.add_str_int (" " , right + 5 );
80
- lstm_box_str.add_str_int (" " , image_height_ - top);
81
- lstm_box_str.add_str_int (" " , page_num); // symbol
82
- lstm_box_str += " \n " ; // end of row
83
-
77
+ AddBoxToLSTM (right, bottom, top, image_height_, page_num, &lstm_box_str);
78
+ lstm_box_str += " \n " ; // end of row for symbol
84
79
}
80
+ lstm_box_str.add_str_int (" \t " , left);
81
+ AddBoxToLSTM (right, bottom, top, image_height_, page_num, &lstm_box_str);
82
+ lstm_box_str += " \n " ; // end of PAGE
85
83
char * ret = new char [lstm_box_str.length () + 1 ];
86
84
strcpy (ret, lstm_box_str.string ());
87
85
delete res_it;
0 commit comments