17
17
*
18
18
**********************************************************************/
19
19
20
+ #include < locale> // for std::locale::classic
20
21
#include < memory> // for std::unique_ptr
22
+ #include < sstream> // for std::stringstream
21
23
#include " baseapi.h" // for TessBaseAPI
22
24
#include " renderer.h"
23
25
#include " tesseractclass.h" // for Tesseract
@@ -46,10 +48,11 @@ static tesseract::Orientation GetBlockTextOrientation(const PageIterator* it) {
46
48
* direction and does not add any baseline information to the hocr string.
47
49
*/
48
50
static void AddBaselineCoordsTohOCR (const PageIterator* it,
49
- PageIteratorLevel level, STRING* hocr_str) {
51
+ PageIteratorLevel level,
52
+ std::stringstream& hocr_str) {
50
53
tesseract::Orientation orientation = GetBlockTextOrientation (it);
51
54
if (orientation != ORIENTATION_PAGE_UP) {
52
- hocr_str-> add_str_int ( " ; textangle " , 360 - orientation * 90 ) ;
55
+ hocr_str << " ; textangle " << 360 - orientation * 90 ;
53
56
return ;
54
57
}
55
58
@@ -69,68 +72,36 @@ static void AddBaselineCoordsTohOCR(const PageIterator* it,
69
72
70
73
// Now fit a line through the points so we can extract coefficients for the
71
74
// equation: y = p1 x + p0
72
- double p1 = 0 ;
73
- double p0 = 0 ;
74
75
if (x1 == x2) {
75
76
// Problem computing the polynomial coefficients.
76
77
return ;
77
78
}
78
- p1 = (y2 - y1 ) / static_cast <double >(x2 - x1);
79
- p0 = y1 - static_cast < double >( p1 * x1) ;
79
+ double p1 = (y2 - y1 ) / static_cast <double >(x2 - x1);
80
+ double p0 = y1 - p1 * x1;
80
81
81
- hocr_str->add_str_double (" ; baseline " , round (p1 * 1000.0 ) / 1000.0 );
82
- hocr_str->add_str_double (" " , round (p0 * 1000.0 ) / 1000.0 );
83
- }
84
-
85
- static void AddIdTohOCR (STRING* hocr_str, const std::string base, int num1,
86
- int num2) {
87
- const size_t BUFSIZE = 64 ;
88
- char id_buffer[BUFSIZE];
89
- if (num2 >= 0 ) {
90
- snprintf (id_buffer, BUFSIZE - 1 , " %s_%d_%d" , base.c_str (), num1, num2);
91
- } else {
92
- snprintf (id_buffer, BUFSIZE - 1 , " %s_%d" , base.c_str (), num1);
93
- }
94
- id_buffer[BUFSIZE - 1 ] = ' \0 ' ;
95
- *hocr_str += " id='" ;
96
- *hocr_str += id_buffer;
97
- *hocr_str += " '" ;
98
- }
99
-
100
- static void AddIdTohOCR (STRING* hocr_str, const std::string base, int num1,
101
- int num2, int num3) {
102
- const size_t BUFSIZE = 64 ;
103
- char id_buffer[BUFSIZE];
104
- snprintf (id_buffer, BUFSIZE - 1 , " %s_%d_%d_%d" , base.c_str (), num1, num2,
105
- num3);
106
- id_buffer[BUFSIZE - 1 ] = ' \0 ' ;
107
- *hocr_str += " id='" ;
108
- *hocr_str += id_buffer;
109
- *hocr_str += " '" ;
82
+ hocr_str << " ; baseline " << round (p1 * 1000.0 ) / 1000.0 << " "
83
+ << round (p0 * 1000.0 ) / 1000.0 ;
110
84
}
111
85
112
86
static void AddBoxTohOCR (const ResultIterator* it, PageIteratorLevel level,
113
- STRING* hocr_str) {
87
+ std::stringstream& hocr_str) {
114
88
int left, top, right, bottom;
115
89
it->BoundingBox (level, &left, &top, &right, &bottom);
116
90
// This is the only place we use double quotes instead of single quotes,
117
91
// but it may be too late to change for consistency
118
- hocr_str->add_str_int (" title=\" bbox " , left);
119
- hocr_str->add_str_int (" " , top);
120
- hocr_str->add_str_int (" " , right);
121
- hocr_str->add_str_int (" " , bottom);
92
+ hocr_str << " title=\" bbox " << left << " " << top << " " << right << " "
93
+ << bottom;
122
94
// Add baseline coordinates & heights for textlines only.
123
95
if (level == RIL_TEXTLINE) {
124
96
AddBaselineCoordsTohOCR (it, level, hocr_str);
125
97
// add custom height measures
126
98
float row_height, descenders, ascenders; // row attributes
127
99
it->RowAttributes (&row_height, &descenders, &ascenders);
128
100
// TODO(rays): Do we want to limit these to a single decimal place?
129
- hocr_str->add_str_double (" ; x_size " , row_height);
130
- hocr_str->add_str_double (" ; x_descenders " , descenders * -1 );
131
- hocr_str->add_str_double (" ; x_ascenders " , ascenders);
101
+ hocr_str << " ; x_size " << row_height << " ; x_descenders " << -descenders
102
+ << " ; x_ascenders " << ascenders;
132
103
}
133
- * hocr_str += " \" >" ;
104
+ hocr_str << " \" >" ;
134
105
}
135
106
136
107
/* *
@@ -166,8 +137,6 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
166
137
bool font_info = false ;
167
138
GetBoolVariable (" hocr_font_info" , &font_info);
168
139
169
- STRING hocr_str (" " );
170
-
171
140
if (input_file_ == nullptr ) SetInputName (nullptr );
172
141
173
142
#ifdef _WIN32
@@ -187,22 +156,25 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
187
156
delete[] utf8_str;
188
157
#endif
189
158
190
- hocr_str += " <div class='ocr_page'" ;
191
- AddIdTohOCR (&hocr_str, " page" , page_id, -1 );
192
- hocr_str += " title='image \" " ;
159
+ std::stringstream hocr_str;
160
+ // Use "C" locale (needed for double values x_size and x_descenders).
161
+ hocr_str.imbue (std::locale::classic ());
162
+ // Use 8 digits for double values.
163
+ hocr_str.precision (8 );
164
+ hocr_str << " <div class='ocr_page'" ;
165
+ hocr_str << " id='"
166
+ << " page_" << page_id << " '" ;
167
+ hocr_str << " title='image \" " ;
193
168
if (input_file_) {
194
- hocr_str += HOcrEscape (input_file_->string ());
169
+ hocr_str << HOcrEscape (input_file_->string ()). c_str ( );
195
170
} else {
196
- hocr_str += " unknown" ;
171
+ hocr_str << " unknown" ;
197
172
}
198
- hocr_str.add_str_int (" \" ; bbox " , rect_left_);
199
- hocr_str.add_str_int (" " , rect_top_);
200
- hocr_str.add_str_int (" " , rect_width_);
201
- hocr_str.add_str_int (" " , rect_height_);
202
- hocr_str.add_str_int (" ; ppageno " , page_number);
203
- hocr_str += " '>\n " ;
173
+ hocr_str << " \" ; bbox " << rect_left_ << " " << rect_top_ << " "
174
+ << rect_width_ << " " << rect_height_ << " ; ppageno " << page_number
175
+ << " '>\n " ;
204
176
205
- ResultIterator* res_it = GetIterator ();
177
+ std::unique_ptr< ResultIterator> res_it ( GetIterator () );
206
178
while (!res_it->Empty (RIL_BLOCK)) {
207
179
if (res_it->Empty (RIL_WORD)) {
208
180
res_it->Next (RIL_WORD);
@@ -212,29 +184,30 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
212
184
// Open any new block/paragraph/textline.
213
185
if (res_it->IsAtBeginningOf (RIL_BLOCK)) {
214
186
para_is_ltr = true ; // reset to default direction
215
- hocr_str += " <div class='ocr_carea'" ;
216
- AddIdTohOCR (&hocr_str, " block" , page_id, bcnt);
217
- AddBoxTohOCR (res_it, RIL_BLOCK, &hocr_str);
187
+ hocr_str << " <div class='ocr_carea'"
188
+ << " id='"
189
+ << " block_" << page_id << " _" << bcnt << " '" ;
190
+ AddBoxTohOCR (res_it.get (), RIL_BLOCK, hocr_str);
218
191
}
219
192
if (res_it->IsAtBeginningOf (RIL_PARA)) {
220
- hocr_str += " \n <p class='ocr_par'" ;
193
+ hocr_str << " \n <p class='ocr_par'" ;
221
194
para_is_ltr = res_it->ParagraphIsLtr ();
222
195
if (!para_is_ltr) {
223
- hocr_str += " dir='rtl'" ;
196
+ hocr_str << " dir='rtl'" ;
224
197
}
225
- AddIdTohOCR (&hocr_str, " par" , page_id, pcnt);
198
+ hocr_str << " id='"
199
+ << " par_" << page_id << " _" << pcnt << " '" ;
226
200
paragraph_lang = res_it->WordRecognitionLanguage ();
227
201
if (paragraph_lang) {
228
- hocr_str += " lang='" ;
229
- hocr_str += paragraph_lang;
230
- hocr_str += " '" ;
202
+ hocr_str << " lang='" << paragraph_lang << " '" ;
231
203
}
232
- AddBoxTohOCR (res_it, RIL_PARA, & hocr_str);
204
+ AddBoxTohOCR (res_it. get () , RIL_PARA, hocr_str);
233
205
}
234
206
if (res_it->IsAtBeginningOf (RIL_TEXTLINE)) {
235
- hocr_str += " \n <span class='ocr_line'" ;
236
- AddIdTohOCR (&hocr_str, " line" , page_id, lcnt);
237
- AddBoxTohOCR (res_it, RIL_TEXTLINE, &hocr_str);
207
+ hocr_str << " \n <span class='ocr_line'"
208
+ << " id='"
209
+ << " line_" << page_id << " _" << lcnt << " '" ;
210
+ AddBoxTohOCR (res_it.get (), RIL_TEXTLINE, hocr_str);
238
211
}
239
212
240
213
// Now, process the word...
@@ -243,8 +216,9 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
243
216
if (tesseract_->lstm_choice_mode ) {
244
217
confidencemap = res_it->GetBestLSTMSymbolChoices ();
245
218
}
246
- hocr_str += " \n <span class='ocrx_word'" ;
247
- AddIdTohOCR (&hocr_str, " word" , page_id, wcnt);
219
+ hocr_str << " \n <span class='ocrx_word'"
220
+ << " id='"
221
+ << " word_" << page_id << " _" << wcnt << " '" ;
248
222
int left, top, right, bottom;
249
223
bool bold , italic , underlined, monospace, serif, smallcaps;
250
224
int pointsize, font_id;
@@ -253,126 +227,118 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
253
227
font_name =
254
228
res_it->WordFontAttributes (&bold , &italic , &underlined, &monospace,
255
229
&serif, &smallcaps, &pointsize, &font_id);
256
- hocr_str.add_str_int (" title='bbox " , left);
257
- hocr_str.add_str_int (" " , top);
258
- hocr_str.add_str_int (" " , right);
259
- hocr_str.add_str_int (" " , bottom);
260
- hocr_str.add_str_int (" ; x_wconf " , res_it->Confidence (RIL_WORD));
230
+ hocr_str << " title='bbox " << left << " " << top << " " << right << " "
231
+ << bottom << " ; x_wconf "
232
+ << static_cast <int >(res_it->Confidence (RIL_WORD));
261
233
if (font_info) {
262
234
if (font_name) {
263
- hocr_str += " ; x_font " ;
264
- hocr_str += HOcrEscape (font_name);
235
+ hocr_str << " ; x_font " << HOcrEscape (font_name).c_str ();
265
236
}
266
- hocr_str. add_str_int ( " ; x_fsize " , pointsize) ;
237
+ hocr_str << " ; x_fsize " << pointsize;
267
238
}
268
- hocr_str += " '" ;
239
+ hocr_str << " '" ;
269
240
const char * lang = res_it->WordRecognitionLanguage ();
270
241
if (lang && (!paragraph_lang || strcmp (lang, paragraph_lang))) {
271
- hocr_str += " lang='" ;
272
- hocr_str += lang;
273
- hocr_str += " '" ;
242
+ hocr_str << " lang='" << lang << " '" ;
274
243
}
275
244
switch (res_it->WordDirection ()) {
276
245
// Only emit direction if different from current paragraph direction
277
246
case DIR_LEFT_TO_RIGHT:
278
- if (!para_is_ltr) hocr_str += " dir='ltr'" ;
247
+ if (!para_is_ltr) hocr_str << " dir='ltr'" ;
279
248
break ;
280
249
case DIR_RIGHT_TO_LEFT:
281
- if (para_is_ltr) hocr_str += " dir='rtl'" ;
250
+ if (para_is_ltr) hocr_str << " dir='rtl'" ;
282
251
break ;
283
252
case DIR_MIX:
284
253
case DIR_NEUTRAL:
285
254
default : // Do nothing.
286
255
break ;
287
256
}
288
- hocr_str += " >" ;
257
+ hocr_str << " >" ;
289
258
bool last_word_in_line = res_it->IsAtFinalElement (RIL_TEXTLINE, RIL_WORD);
290
259
bool last_word_in_para = res_it->IsAtFinalElement (RIL_PARA, RIL_WORD);
291
260
bool last_word_in_block = res_it->IsAtFinalElement (RIL_BLOCK, RIL_WORD);
292
- if (bold ) hocr_str += " <strong>" ;
293
- if (italic ) hocr_str += " <em>" ;
261
+ if (bold ) hocr_str << " <strong>" ;
262
+ if (italic ) hocr_str << " <em>" ;
294
263
do {
295
264
const std::unique_ptr<const char []> grapheme (
296
265
res_it->GetUTF8Text (RIL_SYMBOL));
297
266
if (grapheme && grapheme[0 ] != 0 ) {
298
- hocr_str += HOcrEscape (grapheme.get ());
267
+ hocr_str << HOcrEscape (grapheme.get ()). c_str ( );
299
268
}
300
269
res_it->Next (RIL_SYMBOL);
301
270
} while (!res_it->Empty (RIL_BLOCK) && !res_it->IsAtBeginningOf (RIL_WORD));
302
- if (italic ) hocr_str += " </em>" ;
303
- if (bold ) hocr_str += " </strong>" ;
271
+ if (italic ) hocr_str << " </em>" ;
272
+ if (bold ) hocr_str << " </strong>" ;
304
273
// If the lstm choice mode is required it is added here
305
274
if (tesseract_->lstm_choice_mode == 1 && confidencemap != nullptr ) {
306
275
for (size_t i = 0 ; i < confidencemap->size (); i++) {
307
- hocr_str += " \n <span class='ocrx_cinfo'" ;
308
- AddIdTohOCR (&hocr_str, " timestep" , page_id, wcnt, tcnt);
309
- hocr_str += " >" ;
276
+ hocr_str << " \n <span class='ocrx_cinfo'"
277
+ << " id='"
278
+ << " timestep_" << page_id << " _" << wcnt << " _" << tcnt << " '"
279
+ << " >" ;
310
280
std::vector<std::pair<const char *, float >> timestep =
311
281
(*confidencemap)[i];
312
282
for (std::pair<const char *, float > conf : timestep) {
313
- hocr_str += " <span class='ocr_glyph'" ;
314
- AddIdTohOCR (&hocr_str, " choice" , page_id, wcnt, gcnt);
315
- hocr_str.add_str_int (" title='x_confs " , int (conf.second * 100 ));
316
- hocr_str += " '" ;
317
- hocr_str += " >" ;
318
- hocr_str += conf.first ;
319
- hocr_str += " </span>" ;
283
+ hocr_str << " <span class='ocr_glyph'"
284
+ << " id='"
285
+ << " choice_" << page_id << " _" << wcnt << " _" << gcnt << " '"
286
+ << " title='x_confs " << int (conf.second * 100 ) << " '>"
287
+ << conf.first << " </span>" ;
320
288
gcnt++;
321
289
}
322
- hocr_str += " </span>" ;
290
+ hocr_str << " </span>" ;
323
291
tcnt++;
324
292
}
325
293
} else if (tesseract_->lstm_choice_mode == 2 && confidencemap != nullptr ) {
326
294
for (size_t i = 0 ; i < confidencemap->size (); i++) {
327
295
std::vector<std::pair<const char *, float >> timestep =
328
296
(*confidencemap)[i];
329
297
if (timestep.size () > 0 ) {
330
- hocr_str += " \n <span class='ocrx_cinfo'" ;
331
- AddIdTohOCR (&hocr_str, " lstm_choices " , page_id, wcnt, tcnt);
332
- hocr_str += " chosen=' " ;
333
- hocr_str += timestep[ 0 ]. first ;
334
- hocr_str += " '>" ;
298
+ hocr_str << " \n <span class='ocrx_cinfo'"
299
+ << " id=' "
300
+ << " lstm_choices_ " << page_id << " _ " << wcnt << " _ " << tcnt
301
+ << " ' "
302
+ << " chosen=' " << timestep[ 0 ]. first << " '>" ;
335
303
for (size_t j = 1 ; j < timestep.size (); j++) {
336
- hocr_str += " <span class='ocr_glyph'" ;
337
- AddIdTohOCR (&hocr_str, " choice" , page_id, wcnt, gcnt);
338
- hocr_str.add_str_int (" title='x_confs " ,
339
- int (timestep[j].second * 100 ));
340
- hocr_str += " '" ;
341
- hocr_str += " >" ;
342
- hocr_str += timestep[j].first ;
343
- hocr_str += " </span>" ;
304
+ hocr_str << " <span class='ocr_glyph'"
305
+ << " id='"
306
+ << " choice_" << page_id << " _" << wcnt << " _" << gcnt
307
+ << " '"
308
+ << " title='x_confs " << int (timestep[j].second * 100 )
309
+ << " '>" << timestep[j].first << " </span>" ;
344
310
gcnt++;
345
311
}
346
- hocr_str += " </span>" ;
312
+ hocr_str << " </span>" ;
347
313
tcnt++;
348
314
}
349
315
}
350
316
}
351
- hocr_str += " </span>" ;
317
+ hocr_str << " </span>" ;
352
318
tcnt = 1 ;
353
319
gcnt = 1 ;
354
320
wcnt++;
355
321
// Close any ending block/paragraph/textline.
356
322
if (last_word_in_line) {
357
- hocr_str += " \n </span>" ;
323
+ hocr_str << " \n </span>" ;
358
324
lcnt++;
359
325
}
360
326
if (last_word_in_para) {
361
- hocr_str += " \n </p>\n " ;
327
+ hocr_str << " \n </p>\n " ;
362
328
pcnt++;
363
329
para_is_ltr = true ; // back to default direction
364
330
}
365
331
if (last_word_in_block) {
366
- hocr_str += " </div>\n " ;
332
+ hocr_str << " </div>\n " ;
367
333
bcnt++;
368
334
}
369
335
}
370
- hocr_str += " </div>\n " ;
336
+ hocr_str << " </div>\n " ;
371
337
372
- char * ret = new char [ hocr_str.length () + 1 ] ;
373
- strcpy (ret, hocr_str. string ()) ;
374
- delete res_it ;
375
- return ret ;
338
+ const std::string& text = hocr_str.str () ;
339
+ char * result = new char [text. length () + 1 ] ;
340
+ strcpy (result, text. c_str ()) ;
341
+ return result ;
376
342
}
377
343
378
344
/* *********************************************************************
0 commit comments