@@ -1417,6 +1417,19 @@ static void AddBoxTohOCR(const ResultIterator *it,
1417
1417
*hocr_str += " \" >" ;
1418
1418
}
1419
1419
1420
/**
 * Append the bounding box of the iterator's current element to hocr_str
 * as four separator-prefixed integer fields: left, top, width, height.
 *
 * @param it        Iterator whose current element is queried via
 *                  BoundingBox(); it is dereferenced without a null check.
 * @param level     Element level passed straight through to BoundingBox().
 * @param hocr_str  Output string the four fields are appended to via
 *                  add_str_int(); it is dereferenced without a null check.
 *
 * Width and height are emitted as (right - left + 1) and (bottom - top + 1).
 */
+ static void AddBoxTohOCRTSV (const PageIterator *it,
1421
+ PageIteratorLevel level,
1422
+ STRING* hocr_str) {
1423
// The four coordinates are declared without initializers; they are only
// given values by the BoundingBox() call below.
+ int left, top, right, bottom;
1424
// NOTE(review): whatever BoundingBox() returns is not inspected here. If it
// can report failure without writing the out-parameters, the fields appended
// below would be indeterminate -- confirm against the iterator's contract.
+ it->BoundingBox (level, &left, &top, &right, &bottom);
1425
// Each field is appended with the separator literal (containing a tab)
// placed before the number.
+ hocr_str->add_str_int (" \t " , left);
1426
+ hocr_str->add_str_int (" \t " , top);
1427
// NOTE(review): the "+ 1" treats right/bottom as inclusive pixel
// coordinates -- TODO confirm this matches BoundingBox()'s convention.
+ hocr_str->add_str_int (" \t " , right - left + 1 );
1428
+ hocr_str->add_str_int (" \t " , bottom - top + 1 );
1429
+ }
1430
+
1431
+
1432
+
1420
1433
/* *
1421
1434
* Make a HTML-formatted string with hOCR markup from the internal
1422
1435
* data structures.
@@ -1641,19 +1654,18 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
1641
1654
delete[] utf8_str;
1642
1655
#endif
1643
1656
1644
- hocr_str.add_str_int (" <div class='ocr_page' id='page_" , page_id);
1645
- hocr_str += " ' title='image \" " ;
1646
- if (input_file_) {
1647
- hocr_str += HOcrEscape (input_file_->string ());
1648
- } else {
1649
- hocr_str += " unknown" ;
1650
- }
1651
- hocr_str.add_str_int (" \" ; bbox " , rect_left_);
1652
- hocr_str.add_str_int (" " , rect_top_);
1653
- hocr_str.add_str_int (" " , rect_width_);
1654
- hocr_str.add_str_int (" " , rect_height_);
1655
- hocr_str.add_str_int (" ; ppageno " , page_number);
1656
- hocr_str += " '>\n " ;
1657
+ int page_num = page_id, block_num = 0 , par_num = 0 , line_num = 0 , word_num = 0 ;
1658
+
1659
+ hocr_str.add_str_int (" 1\t " , page_num);
1660
+ hocr_str.add_str_int (" \t " , block_num);
1661
+ hocr_str.add_str_int (" \t " , par_num);
1662
+ hocr_str.add_str_int (" \t " , line_num);
1663
+ hocr_str.add_str_int (" \t " , word_num);
1664
+ hocr_str.add_str_int (" \t " , rect_left_);
1665
+ hocr_str.add_str_int (" \t " , rect_top_);
1666
+ hocr_str.add_str_int (" \t " , rect_width_);
1667
+ hocr_str.add_str_int (" \t " , rect_height_);
1668
+ hocr_str += " \t -1\t\n " ;
1657
1669
1658
1670
ResultIterator *res_it = GetIterator ();
1659
1671
while (!res_it->Empty (RIL_BLOCK)) {
@@ -1664,31 +1676,37 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
1664
1676
1665
1677
// Open any new block/paragraph/textline.
1666
1678
if (res_it->IsAtBeginningOf (RIL_BLOCK)) {
1667
- hocr_str.add_str_int (" <div class='ocr_carea' id='block_" , page_id);
1668
- hocr_str.add_str_int (" _" , bcnt);
1669
- AddBoxTohOCR (res_it, RIL_BLOCK, &hocr_str);
1679
+ block_num++, par_num = 0 , line_num = 0 , word_num = 0 ;
1680
+ hocr_str.add_str_int (" 2\t " , page_num);
1681
+ hocr_str.add_str_int (" \t " , block_num);
1682
+ hocr_str.add_str_int (" \t " , par_num);
1683
+ hocr_str.add_str_int (" \t " , line_num);
1684
+ hocr_str.add_str_int (" \t " , word_num);
1685
+ AddBoxTohOCRTSV (res_it, RIL_BLOCK, &hocr_str);
1686
+ hocr_str += " \t -1\t\n " ;
1670
1687
}
1671
1688
if (res_it->IsAtBeginningOf (RIL_PARA)) {
1672
- if (res_it->ParagraphIsLtr ()) {
1673
- hocr_str.add_str_int (" \n <p class='ocr_par' dir='ltr' id='par_" ,
1674
- page_id);
1675
- hocr_str.add_str_int (" _" , pcnt);
1676
- } else {
1677
- hocr_str.add_str_int (" \n <p class='ocr_par' dir='rtl' id='par_" ,
1678
- page_id);
1679
- hocr_str.add_str_int (" _" , pcnt);
1680
- }
1681
- AddBoxTohOCR (res_it, RIL_PARA, &hocr_str);
1689
+ par_num++, line_num = 0 , word_num = 0 ;
1690
+ hocr_str.add_str_int (" 3\t " , page_num);
1691
+ hocr_str.add_str_int (" \t " , block_num);
1692
+ hocr_str.add_str_int (" \t " , par_num);
1693
+ hocr_str.add_str_int (" \t " , line_num);
1694
+ hocr_str.add_str_int (" \t " , word_num);
1695
+ AddBoxTohOCRTSV (res_it, RIL_PARA, &hocr_str);
1696
+ hocr_str += " \t -1\t\n " ;
1682
1697
}
1683
1698
if (res_it->IsAtBeginningOf (RIL_TEXTLINE)) {
1684
- hocr_str.add_str_int (" \n <span class='ocr_line' id='line_" , page_id);
1685
- hocr_str.add_str_int (" _" , lcnt);
1686
- AddBoxTohOCR (res_it, RIL_TEXTLINE, &hocr_str);
1699
+ line_num++, word_num = 0 ;
1700
+ hocr_str.add_str_int (" 4\t " , page_num);
1701
+ hocr_str.add_str_int (" \t " , block_num);
1702
+ hocr_str.add_str_int (" \t " , par_num);
1703
+ hocr_str.add_str_int (" \t " , line_num);
1704
+ hocr_str.add_str_int (" \t " , word_num);
1705
+ AddBoxTohOCRTSV (res_it, RIL_TEXTLINE, &hocr_str);
1706
+ hocr_str += " \t -1\t\n " ;
1687
1707
}
1688
1708
1689
1709
// Now, process the word...
1690
- hocr_str.add_str_int (" <span class='ocrx_word' id='word_" , page_id);
1691
- hocr_str.add_str_int (" _" , wcnt);
1692
1710
int left, top, right, bottom;
1693
1711
bool bold , italic , underlined, monospace, serif, smallcaps;
1694
1712
int pointsize, font_id;
@@ -1697,34 +1715,21 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
1697
1715
font_name = res_it->WordFontAttributes (&bold , &italic , &underlined,
1698
1716
&monospace, &serif, &smallcaps,
1699
1717
&pointsize, &font_id);
1700
- hocr_str.add_str_int (" ' title='bbox " , left);
1701
- hocr_str.add_str_int (" " , top);
1702
- hocr_str.add_str_int (" " , right);
1703
- hocr_str.add_str_int (" " , bottom);
1704
- hocr_str.add_str_int (" ; x_wconf " , res_it->Confidence (RIL_WORD));
1705
- if (font_info) {
1706
- hocr_str += " ; x_font " ;
1707
- hocr_str += HOcrEscape (font_name);
1708
- hocr_str.add_str_int (" ; x_fsize " , pointsize);
1709
- }
1710
- hocr_str += " '" ;
1711
- if (res_it->WordRecognitionLanguage ()) {
1712
- hocr_str += " lang='" ;
1713
- hocr_str += res_it->WordRecognitionLanguage ();
1714
- hocr_str += " '" ;
1715
- }
1716
- switch (res_it->WordDirection ()) {
1717
- case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'" ; break ;
1718
- case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'" ; break ;
1719
- default : // Do nothing.
1720
- break ;
1721
- }
1722
- hocr_str += " >" ;
1718
+ word_num++;
1719
+ hocr_str.add_str_int (" 5\t " , page_num);
1720
+ hocr_str.add_str_int (" \t " , block_num);
1721
+ hocr_str.add_str_int (" \t " , par_num);
1722
+ hocr_str.add_str_int (" \t " , line_num);
1723
+ hocr_str.add_str_int (" \t " , word_num);
1724
+ hocr_str.add_str_int (" \t " , left);
1725
+ hocr_str.add_str_int (" \t " , top);
1726
+ hocr_str.add_str_int (" \t " , right - left + 1 );
1727
+ hocr_str.add_str_int (" \t " , bottom - top + 1 );
1728
+ hocr_str.add_str_int (" \t " , res_it->Confidence (RIL_WORD));
1723
1729
bool last_word_in_line = res_it->IsAtFinalElement (RIL_TEXTLINE, RIL_WORD);
1724
1730
bool last_word_in_para = res_it->IsAtFinalElement (RIL_PARA, RIL_WORD);
1725
1731
bool last_word_in_block = res_it->IsAtFinalElement (RIL_BLOCK, RIL_WORD);
1726
- if (bold ) hocr_str += " <strong>" ;
1727
- if (italic ) hocr_str += " <em>" ;
1732
+ hocr_str += " \t " ;
1728
1733
do {
1729
1734
const char *grapheme = res_it->GetUTF8Text (RIL_SYMBOL);
1730
1735
if (grapheme && grapheme[0 ] != 0 ) {
@@ -1737,25 +1742,19 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
1737
1742
delete [] grapheme;
1738
1743
res_it->Next (RIL_SYMBOL);
1739
1744
} while (!res_it->Empty (RIL_BLOCK) && !res_it->IsAtBeginningOf (RIL_WORD));
1740
- if (italic ) hocr_str += " </em>" ;
1741
- if (bold ) hocr_str += " </strong>" ;
1742
- hocr_str += " </span> " ;
1745
+ hocr_str += " \n " ;
1743
1746
wcnt++;
1744
1747
// Close any ending block/paragraph/textline.
1745
1748
if (last_word_in_line) {
1746
- hocr_str += " \n </span>" ;
1747
1749
lcnt++;
1748
1750
}
1749
1751
if (last_word_in_para) {
1750
- hocr_str += " \n </p>\n " ;
1751
1752
pcnt++;
1752
1753
}
1753
1754
if (last_word_in_block) {
1754
- hocr_str += " </div>\n " ;
1755
1755
bcnt++;
1756
1756
}
1757
1757
}
1758
- hocr_str += " </div>\n " ;
1759
1758
1760
1759
char *ret = new char [hocr_str.length () + 1 ];
1761
1760
strcpy (ret, hocr_str.string ());
0 commit comments