14
14
// limitations under the License.
15
15
16
16
#include < memory>
17
+ #include < sstream> // for std::stringstream
17
18
#include " baseapi.h"
18
19
#include " renderer.h"
19
20
20
21
namespace tesseract {
21
22
22
- // /
23
- // / Add coordinates to specified TextBlock, TextLine, or String bounding box
24
- // / Add word confidence if adding to a String bounding box
23
+ // / Add coordinates to specified TextBlock, TextLine or String bounding box.
24
+ // / Add word confidence if adding to a String bounding box.
25
25
// /
26
26
static void AddBoxToAlto (const ResultIterator* it, PageIteratorLevel level,
27
- STRING* alto_str) {
27
+ std::stringstream& alto_str) {
28
28
int left, top, right, bottom;
29
29
it->BoundingBox (level, &left, &top, &right, &bottom);
30
30
@@ -33,43 +33,19 @@ static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level,
33
33
int height = bottom - top;
34
34
int width = right - left;
35
35
36
- *alto_str += " HPOS=\" " ;
37
- alto_str->add_str_int (" " , hpos);
38
- *alto_str += " \" " ;
39
- *alto_str += " VPOS=\" " ;
40
- alto_str->add_str_int (" " , vpos);
41
- *alto_str += " \" " ;
42
- *alto_str += " WIDTH=\" " ;
43
- alto_str->add_str_int (" " , width);
44
- *alto_str += " \" " ;
45
- *alto_str += " HEIGHT=\" " ;
46
- alto_str->add_str_int (" " , height);
47
- *alto_str += " \" " ;
36
+ alto_str << " HPOS=\" " << hpos << " \" " ;
37
+ alto_str << " VPOS=\" " << vpos << " \" " ;
38
+ alto_str << " WIDTH=\" " << width << " \" " ;
39
+ alto_str << " HEIGHT=\" " << height << " \" " ;
48
40
49
41
if (level == RIL_WORD) {
50
42
int wc = it->Confidence (RIL_WORD);
51
- *alto_str += " WC=\" 0." ;
52
- alto_str->add_str_int (" " , wc);
53
- *alto_str += " \" " ;
54
- }
55
- if (level != RIL_WORD) {
56
- *alto_str += " >" ;
43
+ alto_str << " WC=\" 0." << wc << " \" " ;
44
+ } else {
45
+ alto_str << " >" ;
57
46
}
58
47
}
59
48
60
- // /
61
- // / Add a unique ID to an ALTO element
62
- // /
63
- static void AddIdToAlto (STRING* alto_str, const std::string base, int num1) {
64
- const size_t BUFSIZE = 64 ;
65
- char id_buffer[BUFSIZE];
66
- snprintf (id_buffer, BUFSIZE - 1 , " %s_%d" , base.c_str (), num1);
67
- id_buffer[BUFSIZE - 1 ] = ' \0 ' ;
68
- *alto_str += " ID=\" " ;
69
- *alto_str += id_buffer;
70
- *alto_str += " \" " ;
71
- }
72
-
73
49
// /
74
50
// / Append the ALTO XML for the beginning of the document
75
51
// /
@@ -111,10 +87,10 @@ bool TessAltoRenderer::BeginDocumentHandler() {
111
87
// / Append the ALTO XML for the layout of the image
112
88
// /
113
89
bool TessAltoRenderer::AddImageHandler (TessBaseAPI* api) {
114
- const std::unique_ptr<const char []> hocr (api->GetAltoText (imagenum ()));
115
- if (hocr == nullptr ) return false ;
90
+ const std::unique_ptr<const char []> text (api->GetAltoText (imagenum ()));
91
+ if (text == nullptr ) return false ;
116
92
117
- AppendString (hocr .get ());
93
+ AppendString (text .get ());
118
94
119
95
return true ;
120
96
}
@@ -150,8 +126,6 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
150
126
int lcnt = 0 , bcnt = 0 , wcnt = 0 ;
151
127
int page_id = page_number;
152
128
153
- STRING alto_str (" " );
154
-
155
129
if (input_file_ == nullptr ) SetInputName (nullptr );
156
130
157
131
#ifdef _WIN32
@@ -171,23 +145,16 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
171
145
delete[] utf8_str;
172
146
#endif
173
147
174
- alto_str += " \t\t <Page WIDTH=\" " ;
175
- alto_str.add_str_int (" " , rect_width_);
176
- alto_str += " \" HEIGHT=\" " ;
177
- alto_str.add_str_int (" " , rect_height_);
178
- alto_str += " \" PHYSICAL_IMG_NR=\" " ;
179
- alto_str.add_str_int (" " , rect_height_);
180
- alto_str += " \" " ;
181
- AddIdToAlto (&alto_str, " page" , page_id);
182
- alto_str += " >\n " ;
183
- alto_str +=
184
- (" \t\t\t <PrintSpace HPOS=\" 0\" "
185
- " VPOS=\" 0\" "
186
- " WIDTH=\" " );
187
- alto_str.add_str_int (" " , rect_width_);
188
- alto_str += " \" HEIGHT=\" " ;
189
- alto_str.add_str_int (" " , rect_height_);
190
- alto_str += " \" >\n " ;
148
+ std::stringstream alto_str;
149
+ alto_str
150
+ << " \t\t <Page WIDTH=\" " << rect_width_ << " \" HEIGHT=\" "
151
+ << rect_height_
152
+ // TODO: next line is buggy because rect_height is not an image number.
153
+ << " \" PHYSICAL_IMG_NR=\" " << rect_height_ << " \" "
154
+ << " ID=\" page_" << page_id << " \" >\n "
155
+ << " \t\t\t <PrintSpace HPOS=\" 0\" VPOS=\" 0\" "
156
+ << " WIDTH=\" " << rect_width_ << " \" "
157
+ << " HEIGHT=\" " << rect_height_ << " \" >\n " ;
191
158
192
159
ResultIterator* res_it = GetIterator ();
193
160
while (!res_it->Empty (RIL_BLOCK)) {
@@ -197,58 +164,66 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
197
164
}
198
165
199
166
if (res_it->IsAtBeginningOf (RIL_BLOCK)) {
200
- alto_str += " \t\t\t\t <TextBlock " ;
201
- AddIdToAlto (&alto_str, " block" , bcnt);
202
- AddBoxToAlto (res_it, RIL_BLOCK, &alto_str);
203
- alto_str += " \n " ;
167
+ alto_str << " \t\t\t\t <TextBlock ID=\" block_" << bcnt << " \" " ;
168
+ AddBoxToAlto (res_it, RIL_BLOCK, alto_str);
169
+ alto_str << " \n " ;
204
170
}
205
171
206
172
if (res_it->IsAtBeginningOf (RIL_TEXTLINE)) {
207
- alto_str += " \t\t\t\t\t <TextLine " ;
208
- AddIdToAlto (&alto_str, " line" , lcnt);
209
- AddBoxToAlto (res_it, RIL_TEXTLINE, &alto_str);
210
- alto_str += " \n " ;
173
+ alto_str << " \t\t\t\t\t <TextLine ID=\" line_" << lcnt << " \" " ;
174
+ AddBoxToAlto (res_it, RIL_TEXTLINE, alto_str);
175
+ alto_str << " \n " ;
211
176
}
212
177
213
- alto_str += " \t\t\t\t\t\t <String " ;
214
- AddIdToAlto (&alto_str, " string" , wcnt);
215
- AddBoxToAlto (res_it, RIL_WORD, &alto_str);
216
- alto_str += " CONTENT=\" " ;
178
+ alto_str << " \t\t\t\t\t\t <String ID=\" string_" << wcnt << " \" " ;
179
+ AddBoxToAlto (res_it, RIL_WORD, alto_str);
180
+ alto_str << " CONTENT=\" " ;
217
181
218
182
bool last_word_in_line = res_it->IsAtFinalElement (RIL_TEXTLINE, RIL_WORD);
219
183
bool last_word_in_block = res_it->IsAtFinalElement (RIL_BLOCK, RIL_WORD);
220
184
185
+ int left, top, right, bottom;
186
+ res_it->BoundingBox (RIL_WORD, &left, &top, &right, &bottom);
187
+
221
188
do {
222
189
const std::unique_ptr<const char []> grapheme (
223
190
res_it->GetUTF8Text (RIL_SYMBOL));
224
191
if (grapheme && grapheme[0 ] != 0 ) {
225
- alto_str += HOcrEscape (grapheme.get ());
192
+ alto_str << HOcrEscape (grapheme.get ()). c_str ( );
226
193
}
227
194
res_it->Next (RIL_SYMBOL);
228
195
} while (!res_it->Empty (RIL_BLOCK) && !res_it->IsAtBeginningOf (RIL_WORD));
229
196
230
- alto_str += " \" />\n " ;
197
+ alto_str << " \" />" ;
231
198
232
199
wcnt++;
233
200
234
201
if (last_word_in_line) {
235
- alto_str += " \t\t\t\t\t </TextLine>\n " ;
202
+ alto_str << " \n \t\t\t\t\t </TextLine>\n " ;
236
203
lcnt++;
204
+ } else {
205
+ int hpos = right;
206
+ int vpos = top;
207
+ res_it->BoundingBox (RIL_WORD, &left, &top, &right, &bottom);
208
+ int width = left - hpos;
209
+ alto_str << " <SP WIDTH=\" " << width << " \" VPOS=\" " << vpos
210
+ << " \" HPOS=\" " << hpos << " \" />\n " ;
237
211
}
238
212
239
213
if (last_word_in_block) {
240
- alto_str += " \t\t\t\t </TextBlock>\n " ;
214
+ alto_str << " \t\t\t\t </TextBlock>\n " ;
241
215
bcnt++;
242
216
}
243
217
}
244
218
245
- alto_str += " \t\t\t </PrintSpace>\n " ;
246
- alto_str += " \t\t </Page>\n " ;
219
+ alto_str << " \t\t\t </PrintSpace>\n "
220
+ << " \t\t </Page>\n " ;
221
+ const std::string& text = alto_str.str ();
247
222
248
- char * ret = new char [alto_str .length () + 1 ];
249
- strcpy (ret, alto_str. string ());
223
+ char * result = new char [text .length () + 1 ];
224
+ strcpy (result, text. c_str ());
250
225
delete res_it;
251
- return ret ;
226
+ return result ;
252
227
}
253
228
254
229
} // namespace tesseract
0 commit comments