@@ -1417,7 +1417,7 @@ static void AddBoxTohOCR(const ResultIterator *it,
1417
1417
*hocr_str += " \" >" ;
1418
1418
}
1419
1419
1420
- static void AddBoxTohOCRTSV (const PageIterator *it,
1420
+ static void AddBoxToTSV (const PageIterator *it,
1421
1421
PageIteratorLevel level,
1422
1422
STRING* hocr_str) {
1423
1423
int left, top, right, bottom;
@@ -1615,57 +1615,31 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
1615
1615
}
1616
1616
1617
1617
/* *
1618
- * Make a TSV-formatted string with hOCR markup from the internal
1619
- * data structures.
1618
+ * Make a TSV-formatted string from the internal data structures.
1620
1619
* page_number is 0-based but will appear in the output as 1-based.
1621
- * Image name/input_file_ can be set by SetInputName before calling
1622
- * GetHOCRText
1623
- * STL removed from original patch submission and refactored by rays.
1624
1620
*/
1625
- char * TessBaseAPI::GetHOCRTSVText (int page_number) {
1621
+ char * TessBaseAPI::GetTSVText (int page_number) {
1626
1622
if (tesseract_ == NULL ||
1627
1623
(page_res_ == NULL && Recognize (NULL ) < 0 ))
1628
1624
return NULL ;
1629
1625
1630
1626
int lcnt = 1 , bcnt = 1 , pcnt = 1 , wcnt = 1 ;
1631
- int page_id = page_number + 1 ; // hOCR uses 1-based page numbers.
1632
- bool font_info = false ;
1633
- GetBoolVariable (" hocr_font_info" , &font_info);
1634
-
1635
- STRING hocr_str (" " );
1627
+ int page_id = page_number + 1 ; // we use 1-based page numbers.
1636
1628
1637
- if (input_file_ == NULL )
1638
- SetInputName (NULL );
1639
-
1640
- #ifdef _WIN32
1641
- // convert input name from ANSI encoding to utf-8
1642
- int str16_len = MultiByteToWideChar (CP_ACP, 0 , input_file_->string (), -1 ,
1643
- NULL , NULL );
1644
- wchar_t *uni16_str = new WCHAR[str16_len];
1645
- str16_len = MultiByteToWideChar (CP_ACP, 0 , input_file_->string (), -1 ,
1646
- uni16_str, str16_len);
1647
- int utf8_len = WideCharToMultiByte (CP_UTF8, 0 , uni16_str, str16_len, NULL ,
1648
- NULL , NULL , NULL );
1649
- char *utf8_str = new char [utf8_len];
1650
- WideCharToMultiByte (CP_UTF8, 0 , uni16_str, str16_len, utf8_str,
1651
- utf8_len, NULL , NULL );
1652
- *input_file_ = utf8_str;
1653
- delete[] uni16_str;
1654
- delete[] utf8_str;
1655
- #endif
1629
+ STRING tsv_str (" " );
1656
1630
1657
1631
int page_num = page_id, block_num = 0 , par_num = 0 , line_num = 0 , word_num = 0 ;
1658
1632
1659
- hocr_str .add_str_int (" 1\t " , page_num);
1660
- hocr_str .add_str_int (" \t " , block_num);
1661
- hocr_str .add_str_int (" \t " , par_num);
1662
- hocr_str .add_str_int (" \t " , line_num);
1663
- hocr_str .add_str_int (" \t " , word_num);
1664
- hocr_str .add_str_int (" \t " , rect_left_);
1665
- hocr_str .add_str_int (" \t " , rect_top_);
1666
- hocr_str .add_str_int (" \t " , rect_width_);
1667
- hocr_str .add_str_int (" \t " , rect_height_);
1668
- hocr_str += " \t -1\t\n " ;
1633
+ tsv_str .add_str_int (" 1\t " , page_num); // level 1 - page
1634
+ tsv_str .add_str_int (" \t " , block_num);
1635
+ tsv_str .add_str_int (" \t " , par_num);
1636
+ tsv_str .add_str_int (" \t " , line_num);
1637
+ tsv_str .add_str_int (" \t " , word_num);
1638
+ tsv_str .add_str_int (" \t " , rect_left_);
1639
+ tsv_str .add_str_int (" \t " , rect_top_);
1640
+ tsv_str .add_str_int (" \t " , rect_width_);
1641
+ tsv_str .add_str_int (" \t " , rect_height_);
1642
+ tsv_str += " \t -1\t\n " ;
1669
1643
1670
1644
ResultIterator *res_it = GetIterator ();
1671
1645
while (!res_it->Empty (RIL_BLOCK)) {
@@ -1674,36 +1648,36 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
1674
1648
continue ;
1675
1649
}
1676
1650
1677
- // Open any new block/paragraph/textline.
1651
+ // Add rows for any new block/paragraph/textline.
1678
1652
if (res_it->IsAtBeginningOf (RIL_BLOCK)) {
1679
1653
block_num++, par_num = 0 , line_num = 0 , word_num = 0 ;
1680
- hocr_str .add_str_int (" 2\t " , page_num);
1681
- hocr_str .add_str_int (" \t " , block_num);
1682
- hocr_str .add_str_int (" \t " , par_num);
1683
- hocr_str .add_str_int (" \t " , line_num);
1684
- hocr_str .add_str_int (" \t " , word_num);
1685
- AddBoxTohOCRTSV (res_it, RIL_BLOCK, &hocr_str );
1686
- hocr_str += " \t -1\t\n " ;
1654
+ tsv_str .add_str_int (" 2\t " , page_num); // level 2 - block
1655
+ tsv_str .add_str_int (" \t " , block_num);
1656
+ tsv_str .add_str_int (" \t " , par_num);
1657
+ tsv_str .add_str_int (" \t " , line_num);
1658
+ tsv_str .add_str_int (" \t " , word_num);
1659
+ AddBoxToTSV (res_it, RIL_BLOCK, &tsv_str );
1660
+ tsv_str += " \t -1\t\n " ; // end of row for block
1687
1661
}
1688
1662
if (res_it->IsAtBeginningOf (RIL_PARA)) {
1689
1663
par_num++, line_num = 0 , word_num = 0 ;
1690
- hocr_str .add_str_int (" 3\t " , page_num);
1691
- hocr_str .add_str_int (" \t " , block_num);
1692
- hocr_str .add_str_int (" \t " , par_num);
1693
- hocr_str .add_str_int (" \t " , line_num);
1694
- hocr_str .add_str_int (" \t " , word_num);
1695
- AddBoxTohOCRTSV (res_it, RIL_PARA, &hocr_str );
1696
- hocr_str += " \t -1\t\n " ;
1664
+ tsv_str .add_str_int (" 3\t " , page_num); // level 3 - paragraph
1665
+ tsv_str .add_str_int (" \t " , block_num);
1666
+ tsv_str .add_str_int (" \t " , par_num);
1667
+ tsv_str .add_str_int (" \t " , line_num);
1668
+ tsv_str .add_str_int (" \t " , word_num);
1669
+ AddBoxToTSV (res_it, RIL_PARA, &tsv_str );
1670
+ tsv_str += " \t -1\t\n " ; // end of row for para
1697
1671
}
1698
1672
if (res_it->IsAtBeginningOf (RIL_TEXTLINE)) {
1699
1673
line_num++, word_num = 0 ;
1700
- hocr_str .add_str_int (" 4\t " , page_num);
1701
- hocr_str .add_str_int (" \t " , block_num);
1702
- hocr_str .add_str_int (" \t " , par_num);
1703
- hocr_str .add_str_int (" \t " , line_num);
1704
- hocr_str .add_str_int (" \t " , word_num);
1705
- AddBoxTohOCRTSV (res_it, RIL_TEXTLINE, &hocr_str );
1706
- hocr_str += " \t -1\t\n " ;
1674
+ tsv_str .add_str_int (" 4\t " , page_num); // level 4 - line
1675
+ tsv_str .add_str_int (" \t " , block_num);
1676
+ tsv_str .add_str_int (" \t " , par_num);
1677
+ tsv_str .add_str_int (" \t " , line_num);
1678
+ tsv_str .add_str_int (" \t " , word_num);
1679
+ AddBoxToTSV (res_it, RIL_TEXTLINE, &tsv_str );
1680
+ tsv_str += " \t -1\t\n " ; // end of row for line
1707
1681
}
1708
1682
1709
1683
// Now, process the word...
@@ -1715,49 +1689,34 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
1715
1689
font_name = res_it->WordFontAttributes (&bold , &italic , &underlined,
1716
1690
&monospace, &serif, &smallcaps,
1717
1691
&pointsize, &font_id);
1718
- word_num++;
1719
- hocr_str.add_str_int (" 5\t " , page_num);
1720
- hocr_str.add_str_int (" \t " , block_num);
1721
- hocr_str.add_str_int (" \t " , par_num);
1722
- hocr_str.add_str_int (" \t " , line_num);
1723
- hocr_str.add_str_int (" \t " , word_num);
1724
- hocr_str.add_str_int (" \t " , left);
1725
- hocr_str.add_str_int (" \t " , top);
1726
- hocr_str.add_str_int (" \t " , right - left + 1 );
1727
- hocr_str.add_str_int (" \t " , bottom - top + 1 );
1728
- hocr_str.add_str_int (" \t " , res_it->Confidence (RIL_WORD));
1729
- bool last_word_in_line = res_it->IsAtFinalElement (RIL_TEXTLINE, RIL_WORD);
1730
- bool last_word_in_para = res_it->IsAtFinalElement (RIL_PARA, RIL_WORD);
1731
- bool last_word_in_block = res_it->IsAtFinalElement (RIL_BLOCK, RIL_WORD);
1732
- hocr_str += " \t " ;
1692
+ word_num++;
1693
+ tsv_str.add_str_int (" 5\t " , page_num); // level 5 - word
1694
+ tsv_str.add_str_int (" \t " , block_num);
1695
+ tsv_str.add_str_int (" \t " , par_num);
1696
+ tsv_str.add_str_int (" \t " , line_num);
1697
+ tsv_str.add_str_int (" \t " , word_num);
1698
+ tsv_str.add_str_int (" \t " , left);
1699
+ tsv_str.add_str_int (" \t " , top);
1700
+ tsv_str.add_str_int (" \t " , right - left + 1 );
1701
+ tsv_str.add_str_int (" \t " , bottom - top + 1 );
1702
+ tsv_str.add_str_int (" \t " , res_it->Confidence (RIL_WORD));
1703
+ tsv_str += " \t " ;
1704
+
1705
+ // Increment counts if at end of block/paragraph/textline.
1706
+ if (res_it->IsAtFinalElement (RIL_TEXTLINE, RIL_WORD)) lcnt++;
1707
+ if (res_it->IsAtFinalElement (RIL_PARA, RIL_WORD)) pcnt++;
1708
+ if (res_it->IsAtFinalElement (RIL_BLOCK, RIL_WORD)) bcnt++;
1709
+
1733
1710
do {
1734
- const char *grapheme = res_it->GetUTF8Text (RIL_SYMBOL);
1735
- // if (grapheme && grapheme[0] != 0) {
1736
- // if (grapheme[1] == 0) {
1737
- // hocr_str += HOcrEscape(grapheme);
1738
- // } else {
1739
- hocr_str += grapheme;
1740
- // }
1741
- // }
1742
- delete [] grapheme;
1711
+ // GetUTF8Text hands back a heap-allocated buffer that the caller must
+ // release; append its contents to the row, then free it to avoid leaking
+ // one allocation per symbol.
+ const char *grapheme = res_it->GetUTF8Text (RIL_SYMBOL);
+ tsv_str += grapheme;
+ delete [] grapheme;
1743
1712
res_it->Next (RIL_SYMBOL);
1744
1713
} while (!res_it->Empty (RIL_BLOCK) && !res_it->IsAtBeginningOf (RIL_WORD));
1745
- hocr_str += " \n " ;
1714
+ tsv_str += " \n " ; // end of row
1746
1715
wcnt++;
1747
- // Close any ending block/paragraph/textline.
1748
- if (last_word_in_line) {
1749
- lcnt++;
1750
- }
1751
- if (last_word_in_para) {
1752
- pcnt++;
1753
- }
1754
- if (last_word_in_block) {
1755
- bcnt++;
1756
- }
1757
1716
}
1758
1717
1759
- char *ret = new char [hocr_str .length () + 1 ];
1760
- strcpy (ret, hocr_str .string ());
1718
+ char *ret = new char [tsv_str .length () + 1 ];
1719
+ strcpy (ret, tsv_str .string ());
1761
1720
delete res_it;
1762
1721
return ret;
1763
1722
}
0 commit comments