Skip to content

Commit fbbbdb4

Browse files
committed
Use std::stringstream to generate ALTO output and add <SP> element
Using std::stringstream simplifies the code. The <SP> element is needed between two <String> elements. Also remove some unneeded spaces in the ALTO output. Signed-off-by: Stefan Weil <[email protected]>
1 parent 5307b2c commit fbbbdb4

File tree

1 file changed

+53
-78
lines changed

1 file changed

+53
-78
lines changed

src/api/altorenderer.cpp

+53-78
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,17 @@
1414
// limitations under the License.
1515

1616
#include <memory>
17+
#include <sstream> // for std::stringstream
1718
#include "baseapi.h"
1819
#include "renderer.h"
1920

2021
namespace tesseract {
2122

22-
///
23-
/// Add coordinates to specified TextBlock, TextLine, or String bounding box
24-
/// Add word confidence if adding to a String bounding box
23+
/// Add coordinates to specified TextBlock, TextLine or String bounding box.
24+
/// Add word confidence if adding to a String bounding box.
2525
///
2626
static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level,
27-
STRING* alto_str) {
27+
std::stringstream& alto_str) {
2828
int left, top, right, bottom;
2929
it->BoundingBox(level, &left, &top, &right, &bottom);
3030

@@ -33,43 +33,19 @@ static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level,
3333
int height = bottom - top;
3434
int width = right - left;
3535

36-
*alto_str += " HPOS=\"";
37-
alto_str->add_str_int("", hpos);
38-
*alto_str += "\"";
39-
*alto_str += " VPOS=\"";
40-
alto_str->add_str_int("", vpos);
41-
*alto_str += "\"";
42-
*alto_str += " WIDTH=\"";
43-
alto_str->add_str_int("", width);
44-
*alto_str += "\"";
45-
*alto_str += " HEIGHT=\"";
46-
alto_str->add_str_int("", height);
47-
*alto_str += "\"";
36+
alto_str << " HPOS=\"" << hpos << "\"";
37+
alto_str << " VPOS=\"" << vpos << "\"";
38+
alto_str << " WIDTH=\"" << width << "\"";
39+
alto_str << " HEIGHT=\"" << height << "\"";
4840

4941
if (level == RIL_WORD) {
5042
int wc = it->Confidence(RIL_WORD);
51-
*alto_str += " WC=\"0.";
52-
alto_str->add_str_int("", wc);
53-
*alto_str += "\"";
54-
}
55-
if (level != RIL_WORD) {
56-
*alto_str += ">";
43+
alto_str << " WC=\"0." << wc << "\"";
44+
} else {
45+
alto_str << ">";
5746
}
5847
}
5948

60-
///
61-
/// Add a unique ID to an ALTO element
62-
///
63-
static void AddIdToAlto(STRING* alto_str, const std::string base, int num1) {
64-
const size_t BUFSIZE = 64;
65-
char id_buffer[BUFSIZE];
66-
snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1);
67-
id_buffer[BUFSIZE - 1] = '\0';
68-
*alto_str += " ID=\"";
69-
*alto_str += id_buffer;
70-
*alto_str += "\"";
71-
}
72-
7349
///
7450
/// Append the ALTO XML for the beginning of the document
7551
///
@@ -111,10 +87,10 @@ bool TessAltoRenderer::BeginDocumentHandler() {
11187
/// Append the ALTO XML for the layout of the image
11288
///
11389
bool TessAltoRenderer::AddImageHandler(TessBaseAPI* api) {
114-
const std::unique_ptr<const char[]> hocr(api->GetAltoText(imagenum()));
115-
if (hocr == nullptr) return false;
90+
const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
91+
if (text == nullptr) return false;
11692

117-
AppendString(hocr.get());
93+
AppendString(text.get());
11894

11995
return true;
12096
}
@@ -150,8 +126,6 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
150126
int lcnt = 0, bcnt = 0, wcnt = 0;
151127
int page_id = page_number;
152128

153-
STRING alto_str("");
154-
155129
if (input_file_ == nullptr) SetInputName(nullptr);
156130

157131
#ifdef _WIN32
@@ -171,23 +145,16 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
171145
delete[] utf8_str;
172146
#endif
173147

174-
alto_str += "\t\t<Page WIDTH=\"";
175-
alto_str.add_str_int("", rect_width_);
176-
alto_str += "\" HEIGHT=\"";
177-
alto_str.add_str_int("", rect_height_);
178-
alto_str += "\" PHYSICAL_IMG_NR=\"";
179-
alto_str.add_str_int("", rect_height_);
180-
alto_str += "\"";
181-
AddIdToAlto(&alto_str, "page", page_id);
182-
alto_str += ">\n";
183-
alto_str +=
184-
("\t\t\t<PrintSpace HPOS=\"0\" "
185-
"VPOS=\"0\""
186-
" WIDTH=\"");
187-
alto_str.add_str_int("", rect_width_);
188-
alto_str += "\" HEIGHT=\"";
189-
alto_str.add_str_int("", rect_height_);
190-
alto_str += "\">\n";
148+
std::stringstream alto_str;
149+
alto_str
150+
<< "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\""
151+
<< rect_height_
152+
// TODO: next line is buggy because rect_height is not an image number.
153+
<< "\" PHYSICAL_IMG_NR=\"" << rect_height_ << "\""
154+
<< " ID=\"page_" << page_id << "\">\n"
155+
<< "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
156+
<< " WIDTH=\"" << rect_width_ << "\""
157+
<< " HEIGHT=\"" << rect_height_ << "\">\n";
191158

192159
ResultIterator* res_it = GetIterator();
193160
while (!res_it->Empty(RIL_BLOCK)) {
@@ -197,58 +164,66 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
197164
}
198165

199166
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
200-
alto_str += "\t\t\t\t<TextBlock ";
201-
AddIdToAlto(&alto_str, "block", bcnt);
202-
AddBoxToAlto(res_it, RIL_BLOCK, &alto_str);
203-
alto_str += "\n";
167+
alto_str << "\t\t\t\t<TextBlock ID=\"block_" << bcnt << "\"";
168+
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
169+
alto_str << "\n";
204170
}
205171

206172
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
207-
alto_str += "\t\t\t\t\t<TextLine ";
208-
AddIdToAlto(&alto_str, "line", lcnt);
209-
AddBoxToAlto(res_it, RIL_TEXTLINE, &alto_str);
210-
alto_str += "\n";
173+
alto_str << "\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
174+
AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
175+
alto_str << "\n";
211176
}
212177

213-
alto_str += "\t\t\t\t\t\t<String ";
214-
AddIdToAlto(&alto_str, "string", wcnt);
215-
AddBoxToAlto(res_it, RIL_WORD, &alto_str);
216-
alto_str += " CONTENT=\"";
178+
alto_str << "\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
179+
AddBoxToAlto(res_it, RIL_WORD, alto_str);
180+
alto_str << " CONTENT=\"";
217181

218182
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
219183
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
220184

185+
int left, top, right, bottom;
186+
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
187+
221188
do {
222189
const std::unique_ptr<const char[]> grapheme(
223190
res_it->GetUTF8Text(RIL_SYMBOL));
224191
if (grapheme && grapheme[0] != 0) {
225-
alto_str += HOcrEscape(grapheme.get());
192+
alto_str << HOcrEscape(grapheme.get()).c_str();
226193
}
227194
res_it->Next(RIL_SYMBOL);
228195
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
229196

230-
alto_str += "\"/>\n";
197+
alto_str << "\"/>";
231198

232199
wcnt++;
233200

234201
if (last_word_in_line) {
235-
alto_str += "\t\t\t\t\t</TextLine>\n";
202+
alto_str << "\n\t\t\t\t\t</TextLine>\n";
236203
lcnt++;
204+
} else {
205+
int hpos = right;
206+
int vpos = top;
207+
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
208+
int width = left - hpos;
209+
alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos
210+
<< "\" HPOS=\"" << hpos << "\"/>\n";
237211
}
238212

239213
if (last_word_in_block) {
240-
alto_str += "\t\t\t\t</TextBlock>\n";
214+
alto_str << "\t\t\t\t</TextBlock>\n";
241215
bcnt++;
242216
}
243217
}
244218

245-
alto_str += "\t\t\t</PrintSpace>\n";
246-
alto_str += "\t\t</Page>\n";
219+
alto_str << "\t\t\t</PrintSpace>\n"
220+
<< "\t\t</Page>\n";
221+
const std::string& text = alto_str.str();
247222

248-
char* ret = new char[alto_str.length() + 1];
249-
strcpy(ret, alto_str.string());
223+
char* result = new char[text.length() + 1];
224+
strcpy(result, text.c_str());
250225
delete res_it;
251-
return ret;
226+
return result;
252227
}
253228

254229
} // namespace tesseract

0 commit comments

Comments
 (0)