Skip to content

Commit db9c7e0

Browse files
committed
Use std::stringstream to generate hOCR output
Using std::stringstream simplifies the code and allows conversion of double to string independent of the current locale setting. Signed-off-by: Stefan Weil <[email protected]>
1 parent 72d8df5 commit db9c7e0

File tree

1 file changed

+93
-127
lines changed

1 file changed

+93
-127
lines changed

src/api/hocrrenderer.cpp

+93-127
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
*
1818
**********************************************************************/
1919

20+
#include <locale> // for std::locale::classic
2021
#include <memory> // for std::unique_ptr
22+
#include <sstream> // for std::stringstream
2123
#include "baseapi.h" // for TessBaseAPI
2224
#include "renderer.h"
2325
#include "tesseractclass.h" // for Tesseract
@@ -46,10 +48,11 @@ static tesseract::Orientation GetBlockTextOrientation(const PageIterator* it) {
4648
* direction and does not add any baseline information to the hocr string.
4749
*/
4850
static void AddBaselineCoordsTohOCR(const PageIterator* it,
49-
PageIteratorLevel level, STRING* hocr_str) {
51+
PageIteratorLevel level,
52+
std::stringstream& hocr_str) {
5053
tesseract::Orientation orientation = GetBlockTextOrientation(it);
5154
if (orientation != ORIENTATION_PAGE_UP) {
52-
hocr_str->add_str_int("; textangle ", 360 - orientation * 90);
55+
hocr_str << "; textangle " << 360 - orientation * 90;
5356
return;
5457
}
5558

@@ -69,68 +72,36 @@ static void AddBaselineCoordsTohOCR(const PageIterator* it,
6972

7073
// Now fit a line through the points so we can extract coefficients for the
7174
// equation: y = p1 x + p0
72-
double p1 = 0;
73-
double p0 = 0;
7475
if (x1 == x2) {
7576
// Problem computing the polynomial coefficients.
7677
return;
7778
}
78-
p1 = (y2 - y1) / static_cast<double>(x2 - x1);
79-
p0 = y1 - static_cast<double>(p1 * x1);
79+
double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
80+
double p0 = y1 - p1 * x1;
8081

81-
hocr_str->add_str_double("; baseline ", round(p1 * 1000.0) / 1000.0);
82-
hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0);
83-
}
84-
85-
static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
86-
int num2) {
87-
const size_t BUFSIZE = 64;
88-
char id_buffer[BUFSIZE];
89-
if (num2 >= 0) {
90-
snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d", base.c_str(), num1, num2);
91-
} else {
92-
snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1);
93-
}
94-
id_buffer[BUFSIZE - 1] = '\0';
95-
*hocr_str += " id='";
96-
*hocr_str += id_buffer;
97-
*hocr_str += "'";
98-
}
99-
100-
static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
101-
int num2, int num3) {
102-
const size_t BUFSIZE = 64;
103-
char id_buffer[BUFSIZE];
104-
snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d_%d", base.c_str(), num1, num2,
105-
num3);
106-
id_buffer[BUFSIZE - 1] = '\0';
107-
*hocr_str += " id='";
108-
*hocr_str += id_buffer;
109-
*hocr_str += "'";
82+
hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " "
83+
<< round(p0 * 1000.0) / 1000.0;
11084
}
11185

11286
static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
113-
STRING* hocr_str) {
87+
std::stringstream& hocr_str) {
11488
int left, top, right, bottom;
11589
it->BoundingBox(level, &left, &top, &right, &bottom);
11690
// This is the only place we use double quotes instead of single quotes,
11791
// but it may be too late to change for consistency
118-
hocr_str->add_str_int(" title=\"bbox ", left);
119-
hocr_str->add_str_int(" ", top);
120-
hocr_str->add_str_int(" ", right);
121-
hocr_str->add_str_int(" ", bottom);
92+
hocr_str << " title=\"bbox " << left << " " << top << " " << right << " "
93+
<< bottom;
12294
// Add baseline coordinates & heights for textlines only.
12395
if (level == RIL_TEXTLINE) {
12496
AddBaselineCoordsTohOCR(it, level, hocr_str);
12597
// add custom height measures
12698
float row_height, descenders, ascenders; // row attributes
12799
it->RowAttributes(&row_height, &descenders, &ascenders);
128100
// TODO(rays): Do we want to limit these to a single decimal place?
129-
hocr_str->add_str_double("; x_size ", row_height);
130-
hocr_str->add_str_double("; x_descenders ", descenders * -1);
131-
hocr_str->add_str_double("; x_ascenders ", ascenders);
101+
hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders
102+
<< "; x_ascenders " << ascenders;
132103
}
133-
*hocr_str += "\">";
104+
hocr_str << "\">";
134105
}
135106

136107
/**
@@ -166,8 +137,6 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
166137
bool font_info = false;
167138
GetBoolVariable("hocr_font_info", &font_info);
168139

169-
STRING hocr_str("");
170-
171140
if (input_file_ == nullptr) SetInputName(nullptr);
172141

173142
#ifdef _WIN32
@@ -187,22 +156,25 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
187156
delete[] utf8_str;
188157
#endif
189158

190-
hocr_str += " <div class='ocr_page'";
191-
AddIdTohOCR(&hocr_str, "page", page_id, -1);
192-
hocr_str += " title='image \"";
159+
std::stringstream hocr_str;
160+
// Use "C" locale (needed for double values x_size and x_descenders).
161+
hocr_str.imbue(std::locale::classic());
162+
// Use 8 digits for double values.
163+
hocr_str.precision(8);
164+
hocr_str << " <div class='ocr_page'";
165+
hocr_str << " id='"
166+
<< "page_" << page_id << "'";
167+
hocr_str << " title='image \"";
193168
if (input_file_) {
194-
hocr_str += HOcrEscape(input_file_->string());
169+
hocr_str << HOcrEscape(input_file_->string()).c_str();
195170
} else {
196-
hocr_str += "unknown";
171+
hocr_str << "unknown";
197172
}
198-
hocr_str.add_str_int("\"; bbox ", rect_left_);
199-
hocr_str.add_str_int(" ", rect_top_);
200-
hocr_str.add_str_int(" ", rect_width_);
201-
hocr_str.add_str_int(" ", rect_height_);
202-
hocr_str.add_str_int("; ppageno ", page_number);
203-
hocr_str += "'>\n";
173+
hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " "
174+
<< rect_width_ << " " << rect_height_ << "; ppageno " << page_number
175+
<< "'>\n";
204176

205-
ResultIterator* res_it = GetIterator();
177+
std::unique_ptr<ResultIterator> res_it(GetIterator());
206178
while (!res_it->Empty(RIL_BLOCK)) {
207179
if (res_it->Empty(RIL_WORD)) {
208180
res_it->Next(RIL_WORD);
@@ -212,29 +184,30 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
212184
// Open any new block/paragraph/textline.
213185
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
214186
para_is_ltr = true; // reset to default direction
215-
hocr_str += " <div class='ocr_carea'";
216-
AddIdTohOCR(&hocr_str, "block", page_id, bcnt);
217-
AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
187+
hocr_str << " <div class='ocr_carea'"
188+
<< " id='"
189+
<< "block_" << page_id << "_" << bcnt << "'";
190+
AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
218191
}
219192
if (res_it->IsAtBeginningOf(RIL_PARA)) {
220-
hocr_str += "\n <p class='ocr_par'";
193+
hocr_str << "\n <p class='ocr_par'";
221194
para_is_ltr = res_it->ParagraphIsLtr();
222195
if (!para_is_ltr) {
223-
hocr_str += " dir='rtl'";
196+
hocr_str << " dir='rtl'";
224197
}
225-
AddIdTohOCR(&hocr_str, "par", page_id, pcnt);
198+
hocr_str << " id='"
199+
<< "par_" << page_id << "_" << pcnt << "'";
226200
paragraph_lang = res_it->WordRecognitionLanguage();
227201
if (paragraph_lang) {
228-
hocr_str += " lang='";
229-
hocr_str += paragraph_lang;
230-
hocr_str += "'";
202+
hocr_str << " lang='" << paragraph_lang << "'";
231203
}
232-
AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
204+
AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
233205
}
234206
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
235-
hocr_str += "\n <span class='ocr_line'";
236-
AddIdTohOCR(&hocr_str, "line", page_id, lcnt);
237-
AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
207+
hocr_str << "\n <span class='ocr_line'"
208+
<< " id='"
209+
<< "line_" << page_id << "_" << lcnt << "'";
210+
AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
238211
}
239212

240213
// Now, process the word...
@@ -243,8 +216,9 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
243216
if (tesseract_->lstm_choice_mode) {
244217
confidencemap = res_it->GetBestLSTMSymbolChoices();
245218
}
246-
hocr_str += "\n <span class='ocrx_word'";
247-
AddIdTohOCR(&hocr_str, "word", page_id, wcnt);
219+
hocr_str << "\n <span class='ocrx_word'"
220+
<< " id='"
221+
<< "word_" << page_id << "_" << wcnt << "'";
248222
int left, top, right, bottom;
249223
bool bold, italic, underlined, monospace, serif, smallcaps;
250224
int pointsize, font_id;
@@ -253,126 +227,118 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
253227
font_name =
254228
res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
255229
&serif, &smallcaps, &pointsize, &font_id);
256-
hocr_str.add_str_int(" title='bbox ", left);
257-
hocr_str.add_str_int(" ", top);
258-
hocr_str.add_str_int(" ", right);
259-
hocr_str.add_str_int(" ", bottom);
260-
hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
230+
hocr_str << " title='bbox " << left << " " << top << " " << right << " "
231+
<< bottom << "; x_wconf "
232+
<< static_cast<int>(res_it->Confidence(RIL_WORD));
261233
if (font_info) {
262234
if (font_name) {
263-
hocr_str += "; x_font ";
264-
hocr_str += HOcrEscape(font_name);
235+
hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
265236
}
266-
hocr_str.add_str_int("; x_fsize ", pointsize);
237+
hocr_str << "; x_fsize " << pointsize;
267238
}
268-
hocr_str += "'";
239+
hocr_str << "'";
269240
const char* lang = res_it->WordRecognitionLanguage();
270241
if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
271-
hocr_str += " lang='";
272-
hocr_str += lang;
273-
hocr_str += "'";
242+
hocr_str << " lang='" << lang << "'";
274243
}
275244
switch (res_it->WordDirection()) {
276245
// Only emit direction if different from current paragraph direction
277246
case DIR_LEFT_TO_RIGHT:
278-
if (!para_is_ltr) hocr_str += " dir='ltr'";
247+
if (!para_is_ltr) hocr_str << " dir='ltr'";
279248
break;
280249
case DIR_RIGHT_TO_LEFT:
281-
if (para_is_ltr) hocr_str += " dir='rtl'";
250+
if (para_is_ltr) hocr_str << " dir='rtl'";
282251
break;
283252
case DIR_MIX:
284253
case DIR_NEUTRAL:
285254
default: // Do nothing.
286255
break;
287256
}
288-
hocr_str += ">";
257+
hocr_str << ">";
289258
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
290259
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
291260
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
292-
if (bold) hocr_str += "<strong>";
293-
if (italic) hocr_str += "<em>";
261+
if (bold) hocr_str << "<strong>";
262+
if (italic) hocr_str << "<em>";
294263
do {
295264
const std::unique_ptr<const char[]> grapheme(
296265
res_it->GetUTF8Text(RIL_SYMBOL));
297266
if (grapheme && grapheme[0] != 0) {
298-
hocr_str += HOcrEscape(grapheme.get());
267+
hocr_str << HOcrEscape(grapheme.get()).c_str();
299268
}
300269
res_it->Next(RIL_SYMBOL);
301270
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
302-
if (italic) hocr_str += "</em>";
303-
if (bold) hocr_str += "</strong>";
271+
if (italic) hocr_str << "</em>";
272+
if (bold) hocr_str << "</strong>";
304273
// If the lstm choice mode is required it is added here
305274
if (tesseract_->lstm_choice_mode == 1 && confidencemap != nullptr) {
306275
for (size_t i = 0; i < confidencemap->size(); i++) {
307-
hocr_str += "\n <span class='ocrx_cinfo'";
308-
AddIdTohOCR(&hocr_str, "timestep", page_id, wcnt, tcnt);
309-
hocr_str += ">";
276+
hocr_str << "\n <span class='ocrx_cinfo'"
277+
<< " id='"
278+
<< "timestep_" << page_id << "_" << wcnt << "_" << tcnt << "'"
279+
<< ">";
310280
std::vector<std::pair<const char*, float>> timestep =
311281
(*confidencemap)[i];
312282
for (std::pair<const char*, float> conf : timestep) {
313-
hocr_str += "<span class='ocr_glyph'";
314-
AddIdTohOCR(&hocr_str, "choice", page_id, wcnt, gcnt);
315-
hocr_str.add_str_int(" title='x_confs ", int(conf.second * 100));
316-
hocr_str += "'";
317-
hocr_str += ">";
318-
hocr_str += conf.first;
319-
hocr_str += "</span>";
283+
hocr_str << "<span class='ocr_glyph'"
284+
<< " id='"
285+
<< "choice_" << page_id << "_" << wcnt << "_" << gcnt << "'"
286+
<< " title='x_confs " << int(conf.second * 100) << "'>"
287+
<< conf.first << "</span>";
320288
gcnt++;
321289
}
322-
hocr_str += "</span>";
290+
hocr_str << "</span>";
323291
tcnt++;
324292
}
325293
} else if (tesseract_->lstm_choice_mode == 2 && confidencemap != nullptr) {
326294
for (size_t i = 0; i < confidencemap->size(); i++) {
327295
std::vector<std::pair<const char*, float>> timestep =
328296
(*confidencemap)[i];
329297
if (timestep.size() > 0) {
330-
hocr_str += "\n <span class='ocrx_cinfo'";
331-
AddIdTohOCR(&hocr_str, "lstm_choices", page_id, wcnt, tcnt);
332-
hocr_str += " chosen='";
333-
hocr_str += timestep[0].first;
334-
hocr_str += "'>";
298+
hocr_str << "\n <span class='ocrx_cinfo'"
299+
<< " id='"
300+
<< "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
301+
<< "'"
302+
<< " chosen='" << timestep[0].first << "'>";
335303
for (size_t j = 1; j < timestep.size(); j++) {
336-
hocr_str += "<span class='ocr_glyph'";
337-
AddIdTohOCR(&hocr_str, "choice", page_id, wcnt, gcnt);
338-
hocr_str.add_str_int(" title='x_confs ",
339-
int(timestep[j].second * 100));
340-
hocr_str += "'";
341-
hocr_str += ">";
342-
hocr_str += timestep[j].first;
343-
hocr_str += "</span>";
304+
hocr_str << "<span class='ocr_glyph'"
305+
<< " id='"
306+
<< "choice_" << page_id << "_" << wcnt << "_" << gcnt
307+
<< "'"
308+
<< " title='x_confs " << int(timestep[j].second * 100)
309+
<< "'>" << timestep[j].first << "</span>";
344310
gcnt++;
345311
}
346-
hocr_str += "</span>";
312+
hocr_str << "</span>";
347313
tcnt++;
348314
}
349315
}
350316
}
351-
hocr_str += "</span>";
317+
hocr_str << "</span>";
352318
tcnt = 1;
353319
gcnt = 1;
354320
wcnt++;
355321
// Close any ending block/paragraph/textline.
356322
if (last_word_in_line) {
357-
hocr_str += "\n </span>";
323+
hocr_str << "\n </span>";
358324
lcnt++;
359325
}
360326
if (last_word_in_para) {
361-
hocr_str += "\n </p>\n";
327+
hocr_str << "\n </p>\n";
362328
pcnt++;
363329
para_is_ltr = true; // back to default direction
364330
}
365331
if (last_word_in_block) {
366-
hocr_str += " </div>\n";
332+
hocr_str << " </div>\n";
367333
bcnt++;
368334
}
369335
}
370-
hocr_str += " </div>\n";
336+
hocr_str << " </div>\n";
371337

372-
char* ret = new char[hocr_str.length() + 1];
373-
strcpy(ret, hocr_str.string());
374-
delete res_it;
375-
return ret;
338+
const std::string& text = hocr_str.str();
339+
char* result = new char[text.length() + 1];
340+
strcpy(result, text.c_str());
341+
return result;
376342
}
377343

378344
/**********************************************************************

0 commit comments

Comments
 (0)