Skip to content

Commit d04e325

Browse files
sundarcftfmorris
authored and committed
Adds char* GetHOCRTSVText(int) as placeholder. Copy of char* GetHOCRText(int).
1 parent 2597296 commit d04e325

File tree

2 files changed

+170
-0
lines changed

2 files changed

+170
-0
lines changed

api/baseapi.cpp

+162
Original file line numberDiff line numberDiff line change
@@ -1601,6 +1601,168 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
16011601
return ret;
16021602
}
16031603

1604+
/**
1605+
* Make a TSV-formatted string with hOCR markup from the internal
1606+
* data structures.
1607+
* page_number is 0-based but will appear in the output as 1-based.
1608+
* Image name/input_file_ can be set by SetInputName before calling
1609+
* GetHOCRText
1610+
* STL removed from original patch submission and refactored by rays.
1611+
*/
1612+
char* TessBaseAPI::GetHOCRTSVText(int page_number) {
1613+
if (tesseract_ == NULL ||
1614+
(page_res_ == NULL && Recognize(NULL) < 0))
1615+
return NULL;
1616+
1617+
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1618+
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
1619+
bool font_info = false;
1620+
GetBoolVariable("hocr_font_info", &font_info);
1621+
1622+
STRING hocr_str("");
1623+
1624+
if (input_file_ == NULL)
1625+
SetInputName(NULL);
1626+
1627+
#ifdef _WIN32
1628+
// convert input name from ANSI encoding to utf-8
1629+
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
1630+
NULL, NULL);
1631+
wchar_t *uni16_str = new WCHAR[str16_len];
1632+
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
1633+
uni16_str, str16_len);
1634+
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL,
1635+
NULL, NULL, NULL);
1636+
char *utf8_str = new char[utf8_len];
1637+
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
1638+
utf8_len, NULL, NULL);
1639+
*input_file_ = utf8_str;
1640+
delete[] uni16_str;
1641+
delete[] utf8_str;
1642+
#endif
1643+
1644+
hocr_str.add_str_int(" <div class='ocr_page' id='page_", page_id);
1645+
hocr_str += "' title='image \"";
1646+
if (input_file_) {
1647+
hocr_str += HOcrEscape(input_file_->string());
1648+
} else {
1649+
hocr_str += "unknown";
1650+
}
1651+
hocr_str.add_str_int("\"; bbox ", rect_left_);
1652+
hocr_str.add_str_int(" ", rect_top_);
1653+
hocr_str.add_str_int(" ", rect_width_);
1654+
hocr_str.add_str_int(" ", rect_height_);
1655+
hocr_str.add_str_int("; ppageno ", page_number);
1656+
hocr_str += "'>\n";
1657+
1658+
ResultIterator *res_it = GetIterator();
1659+
while (!res_it->Empty(RIL_BLOCK)) {
1660+
if (res_it->Empty(RIL_WORD)) {
1661+
res_it->Next(RIL_WORD);
1662+
continue;
1663+
}
1664+
1665+
// Open any new block/paragraph/textline.
1666+
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1667+
hocr_str.add_str_int(" <div class='ocr_carea' id='block_", page_id);
1668+
hocr_str.add_str_int("_", bcnt);
1669+
AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
1670+
}
1671+
if (res_it->IsAtBeginningOf(RIL_PARA)) {
1672+
if (res_it->ParagraphIsLtr()) {
1673+
hocr_str.add_str_int("\n <p class='ocr_par' dir='ltr' id='par_",
1674+
page_id);
1675+
hocr_str.add_str_int("_", pcnt);
1676+
} else {
1677+
hocr_str.add_str_int("\n <p class='ocr_par' dir='rtl' id='par_",
1678+
page_id);
1679+
hocr_str.add_str_int("_", pcnt);
1680+
}
1681+
AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
1682+
}
1683+
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1684+
hocr_str.add_str_int("\n <span class='ocr_line' id='line_", page_id);
1685+
hocr_str.add_str_int("_", lcnt);
1686+
AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
1687+
}
1688+
1689+
// Now, process the word...
1690+
hocr_str.add_str_int("<span class='ocrx_word' id='word_", page_id);
1691+
hocr_str.add_str_int("_", wcnt);
1692+
int left, top, right, bottom;
1693+
bool bold, italic, underlined, monospace, serif, smallcaps;
1694+
int pointsize, font_id;
1695+
const char *font_name;
1696+
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1697+
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
1698+
&monospace, &serif, &smallcaps,
1699+
&pointsize, &font_id);
1700+
hocr_str.add_str_int("' title='bbox ", left);
1701+
hocr_str.add_str_int(" ", top);
1702+
hocr_str.add_str_int(" ", right);
1703+
hocr_str.add_str_int(" ", bottom);
1704+
hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
1705+
if (font_info) {
1706+
hocr_str += "; x_font ";
1707+
hocr_str += HOcrEscape(font_name);
1708+
hocr_str.add_str_int("; x_fsize ", pointsize);
1709+
}
1710+
hocr_str += "'";
1711+
if (res_it->WordRecognitionLanguage()) {
1712+
hocr_str += " lang='";
1713+
hocr_str += res_it->WordRecognitionLanguage();
1714+
hocr_str += "'";
1715+
}
1716+
switch (res_it->WordDirection()) {
1717+
case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break;
1718+
case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break;
1719+
default: // Do nothing.
1720+
break;
1721+
}
1722+
hocr_str += ">";
1723+
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
1724+
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
1725+
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
1726+
if (bold) hocr_str += "<strong>";
1727+
if (italic) hocr_str += "<em>";
1728+
do {
1729+
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
1730+
if (grapheme && grapheme[0] != 0) {
1731+
if (grapheme[1] == 0) {
1732+
hocr_str += HOcrEscape(grapheme);
1733+
} else {
1734+
hocr_str += grapheme;
1735+
}
1736+
}
1737+
delete []grapheme;
1738+
res_it->Next(RIL_SYMBOL);
1739+
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1740+
if (italic) hocr_str += "</em>";
1741+
if (bold) hocr_str += "</strong>";
1742+
hocr_str += "</span> ";
1743+
wcnt++;
1744+
// Close any ending block/paragraph/textline.
1745+
if (last_word_in_line) {
1746+
hocr_str += "\n </span>";
1747+
lcnt++;
1748+
}
1749+
if (last_word_in_para) {
1750+
hocr_str += "\n </p>\n";
1751+
pcnt++;
1752+
}
1753+
if (last_word_in_block) {
1754+
hocr_str += " </div>\n";
1755+
bcnt++;
1756+
}
1757+
}
1758+
hocr_str += " </div>\n";
1759+
1760+
char *ret = new char[hocr_str.length() + 1];
1761+
strcpy(ret, hocr_str.string());
1762+
delete res_it;
1763+
return ret;
1764+
}
1765+
16041766
/** The 5 numbers output for each box (the usual 4 and a page number.) */
16051767
const int kNumbersPerBlob = 5;
16061768
/**

api/baseapi.h

+8
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,14 @@ class TESS_API TessBaseAPI {
602602
*/
603603
char* GetHOCRText(int page_number);
604604

605+
/**
 * Make a TSV-formatted string with hOCR markup from the internal
 * data structures.
 * page_number is 0-based but will appear in the output as 1-based.
 * Returns NULL when there is no tesseract instance or recognition fails.
 * The definition in baseapi.cpp allocates the result with new char[], so
 * the returned string must be freed with the delete [] operator.
 * NOTE(review): the definition is currently a placeholder copy of
 * GetHOCRText and emits plain hOCR markup -- confirm the intended TSV
 * layout before relying on the output format.
 */
610+
char* GetHOCRTSVText(int page_number);
611+
612+
605613
/**
606614
* The recognized text is returned as a char* which is coded in the same
607615
* format as a box file used in training. Returned string must be freed with

0 commit comments

Comments
 (0)