Skip to content

Commit 6700edd

Browse files
committed
Cleanup TSV renderer
Remove all references to hocr, hocr.tsv, etc. Remove dead code for font info, input filename, HTML escapes. Improve comments. Fix indentation.
1 parent 858f4b7 commit 6700edd

File tree

9 files changed

+83
-125
lines changed

9 files changed

+83
-125
lines changed

api/baseapi.cpp

+59-100
Original file line numberDiff line numberDiff line change
@@ -1417,7 +1417,7 @@ static void AddBoxTohOCR(const ResultIterator *it,
14171417
*hocr_str += "\">";
14181418
}
14191419

1420-
static void AddBoxTohOCRTSV(const PageIterator *it,
1420+
static void AddBoxToTSV(const PageIterator *it,
14211421
PageIteratorLevel level,
14221422
STRING* hocr_str) {
14231423
int left, top, right, bottom;
@@ -1615,57 +1615,31 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
16151615
}
16161616

16171617
/**
1618-
* Make a TSV-formatted string with hOCR markup from the internal
1619-
* data structures.
1618+
* Make a TSV-formatted string from the internal data structures.
16201619
* page_number is 0-based but will appear in the output as 1-based.
1621-
* Image name/input_file_ can be set by SetInputName before calling
1622-
* GetHOCRText
1623-
* STL removed from original patch submission and refactored by rays.
16241620
*/
1625-
char* TessBaseAPI::GetHOCRTSVText(int page_number) {
1621+
char* TessBaseAPI::GetTSVText(int page_number) {
16261622
if (tesseract_ == NULL ||
16271623
(page_res_ == NULL && Recognize(NULL) < 0))
16281624
return NULL;
16291625

16301626
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1631-
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
1632-
bool font_info = false;
1633-
GetBoolVariable("hocr_font_info", &font_info);
1634-
1635-
STRING hocr_str("");
1627+
int page_id = page_number + 1; // we use 1-based page numbers.
16361628

1637-
if (input_file_ == NULL)
1638-
SetInputName(NULL);
1639-
1640-
#ifdef _WIN32
1641-
// convert input name from ANSI encoding to utf-8
1642-
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
1643-
NULL, NULL);
1644-
wchar_t *uni16_str = new WCHAR[str16_len];
1645-
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
1646-
uni16_str, str16_len);
1647-
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL,
1648-
NULL, NULL, NULL);
1649-
char *utf8_str = new char[utf8_len];
1650-
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
1651-
utf8_len, NULL, NULL);
1652-
*input_file_ = utf8_str;
1653-
delete[] uni16_str;
1654-
delete[] utf8_str;
1655-
#endif
1629+
STRING tsv_str("");
16561630

16571631
int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, word_num = 0;
16581632

1659-
hocr_str.add_str_int("1\t", page_num);
1660-
hocr_str.add_str_int("\t", block_num);
1661-
hocr_str.add_str_int("\t", par_num);
1662-
hocr_str.add_str_int("\t", line_num);
1663-
hocr_str.add_str_int("\t", word_num);
1664-
hocr_str.add_str_int("\t", rect_left_);
1665-
hocr_str.add_str_int("\t", rect_top_);
1666-
hocr_str.add_str_int("\t", rect_width_);
1667-
hocr_str.add_str_int("\t", rect_height_);
1668-
hocr_str += "\t-1\t\n";
1633+
tsv_str.add_str_int("1\t", page_num); // level 1 - page
1634+
tsv_str.add_str_int("\t", block_num);
1635+
tsv_str.add_str_int("\t", par_num);
1636+
tsv_str.add_str_int("\t", line_num);
1637+
tsv_str.add_str_int("\t", word_num);
1638+
tsv_str.add_str_int("\t", rect_left_);
1639+
tsv_str.add_str_int("\t", rect_top_);
1640+
tsv_str.add_str_int("\t", rect_width_);
1641+
tsv_str.add_str_int("\t", rect_height_);
1642+
tsv_str += "\t-1\t\n";
16691643

16701644
ResultIterator *res_it = GetIterator();
16711645
while (!res_it->Empty(RIL_BLOCK)) {
@@ -1674,36 +1648,36 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
16741648
continue;
16751649
}
16761650

1677-
// Open any new block/paragraph/textline.
1651+
// Add rows for any new block/paragraph/textline.
16781652
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
16791653
block_num++, par_num = 0, line_num = 0, word_num = 0;
1680-
hocr_str.add_str_int("2\t", page_num);
1681-
hocr_str.add_str_int("\t", block_num);
1682-
hocr_str.add_str_int("\t", par_num);
1683-
hocr_str.add_str_int("\t", line_num);
1684-
hocr_str.add_str_int("\t", word_num);
1685-
AddBoxTohOCRTSV(res_it, RIL_BLOCK, &hocr_str);
1686-
hocr_str += "\t-1\t\n";
1654+
tsv_str.add_str_int("2\t", page_num); // level 2 - block
1655+
tsv_str.add_str_int("\t", block_num);
1656+
tsv_str.add_str_int("\t", par_num);
1657+
tsv_str.add_str_int("\t", line_num);
1658+
tsv_str.add_str_int("\t", word_num);
1659+
AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str);
1660+
tsv_str += "\t-1\t\n"; // end of row for block
16871661
}
16881662
if (res_it->IsAtBeginningOf(RIL_PARA)) {
16891663
par_num++, line_num = 0, word_num = 0;
1690-
hocr_str.add_str_int("3\t", page_num);
1691-
hocr_str.add_str_int("\t", block_num);
1692-
hocr_str.add_str_int("\t", par_num);
1693-
hocr_str.add_str_int("\t", line_num);
1694-
hocr_str.add_str_int("\t", word_num);
1695-
AddBoxTohOCRTSV(res_it, RIL_PARA, &hocr_str);
1696-
hocr_str += "\t-1\t\n";
1664+
tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph
1665+
tsv_str.add_str_int("\t", block_num);
1666+
tsv_str.add_str_int("\t", par_num);
1667+
tsv_str.add_str_int("\t", line_num);
1668+
tsv_str.add_str_int("\t", word_num);
1669+
AddBoxToTSV(res_it, RIL_PARA, &tsv_str);
1670+
tsv_str += "\t-1\t\n"; // end of row for para
16971671
}
16981672
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
16991673
line_num++, word_num = 0;
1700-
hocr_str.add_str_int("4\t", page_num);
1701-
hocr_str.add_str_int("\t", block_num);
1702-
hocr_str.add_str_int("\t", par_num);
1703-
hocr_str.add_str_int("\t", line_num);
1704-
hocr_str.add_str_int("\t", word_num);
1705-
AddBoxTohOCRTSV(res_it, RIL_TEXTLINE, &hocr_str);
1706-
hocr_str += "\t-1\t\n";
1674+
tsv_str.add_str_int("4\t", page_num); // level 4 - line
1675+
tsv_str.add_str_int("\t", block_num);
1676+
tsv_str.add_str_int("\t", par_num);
1677+
tsv_str.add_str_int("\t", line_num);
1678+
tsv_str.add_str_int("\t", word_num);
1679+
AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str);
1680+
tsv_str += "\t-1\t\n"; // end of row for line
17071681
}
17081682

17091683
// Now, process the word...
@@ -1715,49 +1689,34 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
17151689
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
17161690
&monospace, &serif, &smallcaps,
17171691
&pointsize, &font_id);
1718-
word_num++;
1719-
hocr_str.add_str_int("5\t", page_num);
1720-
hocr_str.add_str_int("\t", block_num);
1721-
hocr_str.add_str_int("\t", par_num);
1722-
hocr_str.add_str_int("\t", line_num);
1723-
hocr_str.add_str_int("\t", word_num);
1724-
hocr_str.add_str_int("\t", left);
1725-
hocr_str.add_str_int("\t", top);
1726-
hocr_str.add_str_int("\t", right - left + 1);
1727-
hocr_str.add_str_int("\t", bottom - top + 1);
1728-
hocr_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
1729-
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
1730-
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
1731-
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
1732-
hocr_str += "\t";
1692+
word_num++;
1693+
tsv_str.add_str_int("5\t", page_num); // level 5 - word
1694+
tsv_str.add_str_int("\t", block_num);
1695+
tsv_str.add_str_int("\t", par_num);
1696+
tsv_str.add_str_int("\t", line_num);
1697+
tsv_str.add_str_int("\t", word_num);
1698+
tsv_str.add_str_int("\t", left);
1699+
tsv_str.add_str_int("\t", top);
1700+
tsv_str.add_str_int("\t", right - left + 1);
1701+
tsv_str.add_str_int("\t", bottom - top + 1);
1702+
tsv_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
1703+
tsv_str += "\t";
1704+
1705+
// Increment counts if at end of block/paragraph/textline.
1706+
if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) lcnt++;
1707+
if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) pcnt++;
1708+
if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++;
1709+
17331710
do {
1734-
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
1735-
// if (grapheme && grapheme[0] != 0) {
1736-
// if (grapheme[1] == 0) {
1737-
// hocr_str += HOcrEscape(grapheme);
1738-
// } else {
1739-
hocr_str += grapheme;
1740-
// }
1741-
// }
1742-
delete []grapheme;
1711+
tsv_str += res_it->GetUTF8Text(RIL_SYMBOL);
17431712
res_it->Next(RIL_SYMBOL);
17441713
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1745-
hocr_str += "\n";
1714+
tsv_str += "\n"; // end of row
17461715
wcnt++;
1747-
// Close any ending block/paragraph/textline.
1748-
if (last_word_in_line) {
1749-
lcnt++;
1750-
}
1751-
if (last_word_in_para) {
1752-
pcnt++;
1753-
}
1754-
if (last_word_in_block) {
1755-
bcnt++;
1756-
}
17571716
}
17581717

1759-
char *ret = new char[hocr_str.length() + 1];
1760-
strcpy(ret, hocr_str.string());
1718+
char *ret = new char[tsv_str.length() + 1];
1719+
strcpy(ret, tsv_str.string());
17611720
delete res_it;
17621721
return ret;
17631722
}

api/baseapi.h

+2-4
Original file line numberDiff line numberDiff line change
@@ -603,12 +603,10 @@ class TESS_API TessBaseAPI {
603603
char* GetHOCRText(int page_number);
604604

605605
/**
606-
* Make a TSV-formatted string with hOCR markup from the internal
607-
* data structures.
606+
* Make a TSV-formatted string from the internal data structures.
608607
* page_number is 0-based but will appear in the output as 1-based.
609608
*/
610-
char* GetHOCRTSVText(int page_number);
611-
609+
char* GetTSVText(int page_number);
612610

613611
/**
614612
* The recognized text is returned as a char* which is coded in the same

api/renderer.cpp

+12-11
Original file line numberDiff line numberDiff line change
@@ -182,31 +182,32 @@ bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
182182
/**********************************************************************
183183
* TSV Text Renderer interface implementation
184184
**********************************************************************/
185-
TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase)
186-
: TessResultRenderer(outputbase, "hocr.tsv") {
185+
TessTsvRenderer::TessTsvRenderer(const char *outputbase)
186+
: TessResultRenderer(outputbase, "tsv") {
187187
font_info_ = false;
188188
}
189189

190-
TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase, bool font_info)
191-
: TessResultRenderer(outputbase, "hocr.tsv") {
190+
TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
191+
: TessResultRenderer(outputbase, "tsv") {
192192
font_info_ = font_info;
193193
}
194194

195-
bool TessHOcrTsvRenderer::BeginDocumentHandler() {
195+
bool TessTsvRenderer::BeginDocumentHandler() {
196+
// Output TSV column headings
196197
AppendString("level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n");
197198
return true;
198199
}
199200

200-
bool TessHOcrTsvRenderer::EndDocumentHandler() {
201+
bool TessTsvRenderer::EndDocumentHandler() {
201202
return true;
202203
}
203204

204-
bool TessHOcrTsvRenderer::AddImageHandler(TessBaseAPI* api) {
205-
char* hocrtsv = api->GetHOCRTSVText(imagenum());
206-
if (hocrtsv == NULL) return false;
205+
bool TessTsvRenderer::AddImageHandler(TessBaseAPI* api) {
206+
char* tsv = api->GetTSVText(imagenum());
207+
if (tsv == NULL) return false;
207208

208-
AppendString(hocrtsv);
209-
delete[] hocrtsv;
209+
AppendString(tsv);
210+
delete[] tsv;
210211

211212
return true;
212213
}

api/renderer.h

+4-4
Original file line numberDiff line numberDiff line change
@@ -163,12 +163,12 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer {
163163
};
164164

165165
/**
166-
* Renders tesseract output into an hocr tsv string
166+
* Renders Tesseract output into a TSV string
167167
*/
168-
class TESS_API TessHOcrTsvRenderer : public TessResultRenderer {
168+
class TESS_API TessTsvRenderer : public TessResultRenderer {
169169
public:
170-
explicit TessHOcrTsvRenderer(const char *outputbase, bool font_info);
171-
explicit TessHOcrTsvRenderer(const char *outputbase);
170+
explicit TessTsvRenderer(const char *outputbase, bool font_info);
171+
explicit TessTsvRenderer(const char *outputbase);
172172

173173
protected:
174174
virtual bool BeginDocumentHandler();

api/tesseractmain.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -299,12 +299,12 @@ void PreloadRenderers(tesseract::TessBaseAPI* api,
299299
new tesseract::TessHOcrRenderer(outputbase, font_info));
300300
}
301301

302-
api->GetBoolVariable("tessedit_create_hocrtsv", &b);
302+
api->GetBoolVariable("tessedit_create_tsv", &b);
303303
if (b) {
304304
bool font_info;
305305
api->GetBoolVariable("hocr_font_info", &font_info);
306306
renderers->push_back(
307-
new tesseract::TessHOcrTsvRenderer(outputbase, font_info));
307+
new tesseract::TessTsvRenderer(outputbase, font_info));
308308
}
309309

310310
api->GetBoolVariable("tessedit_create_pdf", &b);

ccmain/tesseractclass.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,7 @@ Tesseract::Tesseract()
385385
this->params()),
386386
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
387387
this->params()),
388-
BOOL_MEMBER(tessedit_create_hocrtsv, false, "Write .hocr.tsv TSV output file",
388+
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
389389
this->params()),
390390
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
391391
this->params()),

ccmain/tesseractclass.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1003,7 +1003,7 @@ class Tesseract : public Wordrec {
10031003
BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
10041004
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
10051005
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
1006-
BOOL_VAR_H(tessedit_create_hocrtsv, false, "Write .hocr.tsv hOCR-tsv output file");
1006+
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
10071007
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
10081008
STRING_VAR_H(unrecognised_char, "|",
10091009
"Output char for unidentified blobs");

tessdata/configs/hocrtsv

-2
This file was deleted.

tessdata/configs/tsv

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
tessedit_create_tsv 1
2+
tessedit_pageseg_mode 1

0 commit comments

Comments
 (0)