Skip to content

Commit 438edd6

Browse files
committed
added row attributes to hocr output
1 parent 917e994 commit 438edd6

File tree

3 files changed

+20
-0
lines changed

3 files changed

+20
-0
lines changed

api/baseapi.cpp

+6
Original file line numberDiff line numberDiff line change
@@ -1415,6 +1415,7 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
14151415

14161416
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
14171417
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
1418+
float row_height, descenders, ascenders; // row attributes
14181419
bool font_info = false;
14191420
GetBoolVariable("hocr_font_info", &font_info);
14201421

@@ -1480,7 +1481,12 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
14801481
AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
14811482
}
14821483
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1484+
int fontsize;
14831485
hocr_str.add_str_int("\n <span class='ocr_line' id='line_", page_id);
1486+
res_it->RowAttributes(&row_height, &descenders, &ascenders);
1487+
hocr_str.add_str_int("' size='", row_height);
1488+
hocr_str.add_str_int("' descenders='", descenders * -1);
1489+
hocr_str.add_str_int("' ascenders='", ascenders);
14841490
hocr_str.add_str_int("_", lcnt);
14851491
AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
14861492
}

ccmain/ltrresultiterator.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,15 @@ float LTRResultIterator::Confidence(PageIteratorLevel level) const {
145145
return 0.0f;
146146
}
147147

148+
void LTRResultIterator::RowAttributes(float* row_height,
149+
float* descenders,
150+
float* ascenders) const {
151+
*row_height = it_->row()->row->x_height() + it_->row()-> row->ascenders()
152+
- it_->row()->row->descenders();
153+
*descenders = it_->row()->row->descenders();
154+
*ascenders = it_->row()->row->ascenders();
155+
}
156+
148157
// Returns the font attributes of the current word. If iterating at a higher
149158
// level object than words, eg textlines, then this will return the
150159
// attributes of the first word in that textline.

ccmain/ltrresultiterator.h

+5
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,11 @@ class TESS_API LTRResultIterator : public PageIterator {
9191
// The number should be interpreted as a percent probability. (0.0f-100.0f)
9292
float Confidence(PageIteratorLevel level) const;
9393

94+
// Writes the current row's height, descenders and ascenders to the out-params.
95+
void RowAttributes(float* row_height,
96+
float* descenders,
97+
float* ascenders) const;
98+
9499
// ============= Functions that refer to words only ============.
95100

96101
// Returns the font attributes of the current word. If iterating at a higher

0 commit comments

Comments
 (0)