@@ -1601,6 +1601,168 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
1601
1601
return ret;
1602
1602
}
1603
1603
1604
+ /* *
1605
+ * Make a TSV-formatted string with hOCR markup from the internal
1606
+ * data structures.
1607
+ * page_number is 0-based but will appear in the output as 1-based.
1608
+ * Image name/input_file_ can be set by SetInputName before calling
1609
+ * GetHOCRText
1610
+ * STL removed from original patch submission and refactored by rays.
1611
+ */
1612
+ char * TessBaseAPI::GetHOCRTSVText (int page_number) {
1613
+ if (tesseract_ == NULL ||
1614
+ (page_res_ == NULL && Recognize (NULL ) < 0 ))
1615
+ return NULL ;
1616
+
1617
+ int lcnt = 1 , bcnt = 1 , pcnt = 1 , wcnt = 1 ;
1618
+ int page_id = page_number + 1 ; // hOCR uses 1-based page numbers.
1619
+ bool font_info = false ;
1620
+ GetBoolVariable (" hocr_font_info" , &font_info);
1621
+
1622
+ STRING hocr_str (" " );
1623
+
1624
+ if (input_file_ == NULL )
1625
+ SetInputName (NULL );
1626
+
1627
+ #ifdef _WIN32
1628
+ // convert input name from ANSI encoding to utf-8
1629
+ int str16_len = MultiByteToWideChar (CP_ACP, 0 , input_file_->string (), -1 ,
1630
+ NULL , NULL );
1631
+ wchar_t *uni16_str = new WCHAR[str16_len];
1632
+ str16_len = MultiByteToWideChar (CP_ACP, 0 , input_file_->string (), -1 ,
1633
+ uni16_str, str16_len);
1634
+ int utf8_len = WideCharToMultiByte (CP_UTF8, 0 , uni16_str, str16_len, NULL ,
1635
+ NULL , NULL , NULL );
1636
+ char *utf8_str = new char [utf8_len];
1637
+ WideCharToMultiByte (CP_UTF8, 0 , uni16_str, str16_len, utf8_str,
1638
+ utf8_len, NULL , NULL );
1639
+ *input_file_ = utf8_str;
1640
+ delete[] uni16_str;
1641
+ delete[] utf8_str;
1642
+ #endif
1643
+
1644
+ hocr_str.add_str_int (" <div class='ocr_page' id='page_" , page_id);
1645
+ hocr_str += " ' title='image \" " ;
1646
+ if (input_file_) {
1647
+ hocr_str += HOcrEscape (input_file_->string ());
1648
+ } else {
1649
+ hocr_str += " unknown" ;
1650
+ }
1651
+ hocr_str.add_str_int (" \" ; bbox " , rect_left_);
1652
+ hocr_str.add_str_int (" " , rect_top_);
1653
+ hocr_str.add_str_int (" " , rect_width_);
1654
+ hocr_str.add_str_int (" " , rect_height_);
1655
+ hocr_str.add_str_int (" ; ppageno " , page_number);
1656
+ hocr_str += " '>\n " ;
1657
+
1658
+ ResultIterator *res_it = GetIterator ();
1659
+ while (!res_it->Empty (RIL_BLOCK)) {
1660
+ if (res_it->Empty (RIL_WORD)) {
1661
+ res_it->Next (RIL_WORD);
1662
+ continue ;
1663
+ }
1664
+
1665
+ // Open any new block/paragraph/textline.
1666
+ if (res_it->IsAtBeginningOf (RIL_BLOCK)) {
1667
+ hocr_str.add_str_int (" <div class='ocr_carea' id='block_" , page_id);
1668
+ hocr_str.add_str_int (" _" , bcnt);
1669
+ AddBoxTohOCR (res_it, RIL_BLOCK, &hocr_str);
1670
+ }
1671
+ if (res_it->IsAtBeginningOf (RIL_PARA)) {
1672
+ if (res_it->ParagraphIsLtr ()) {
1673
+ hocr_str.add_str_int (" \n <p class='ocr_par' dir='ltr' id='par_" ,
1674
+ page_id);
1675
+ hocr_str.add_str_int (" _" , pcnt);
1676
+ } else {
1677
+ hocr_str.add_str_int (" \n <p class='ocr_par' dir='rtl' id='par_" ,
1678
+ page_id);
1679
+ hocr_str.add_str_int (" _" , pcnt);
1680
+ }
1681
+ AddBoxTohOCR (res_it, RIL_PARA, &hocr_str);
1682
+ }
1683
+ if (res_it->IsAtBeginningOf (RIL_TEXTLINE)) {
1684
+ hocr_str.add_str_int (" \n <span class='ocr_line' id='line_" , page_id);
1685
+ hocr_str.add_str_int (" _" , lcnt);
1686
+ AddBoxTohOCR (res_it, RIL_TEXTLINE, &hocr_str);
1687
+ }
1688
+
1689
+ // Now, process the word...
1690
+ hocr_str.add_str_int (" <span class='ocrx_word' id='word_" , page_id);
1691
+ hocr_str.add_str_int (" _" , wcnt);
1692
+ int left, top, right, bottom;
1693
+ bool bold , italic , underlined, monospace, serif, smallcaps;
1694
+ int pointsize, font_id;
1695
+ const char *font_name;
1696
+ res_it->BoundingBox (RIL_WORD, &left, &top, &right, &bottom);
1697
+ font_name = res_it->WordFontAttributes (&bold , &italic , &underlined,
1698
+ &monospace, &serif, &smallcaps,
1699
+ &pointsize, &font_id);
1700
+ hocr_str.add_str_int (" ' title='bbox " , left);
1701
+ hocr_str.add_str_int (" " , top);
1702
+ hocr_str.add_str_int (" " , right);
1703
+ hocr_str.add_str_int (" " , bottom);
1704
+ hocr_str.add_str_int (" ; x_wconf " , res_it->Confidence (RIL_WORD));
1705
+ if (font_info) {
1706
+ hocr_str += " ; x_font " ;
1707
+ hocr_str += HOcrEscape (font_name);
1708
+ hocr_str.add_str_int (" ; x_fsize " , pointsize);
1709
+ }
1710
+ hocr_str += " '" ;
1711
+ if (res_it->WordRecognitionLanguage ()) {
1712
+ hocr_str += " lang='" ;
1713
+ hocr_str += res_it->WordRecognitionLanguage ();
1714
+ hocr_str += " '" ;
1715
+ }
1716
+ switch (res_it->WordDirection ()) {
1717
+ case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'" ; break ;
1718
+ case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'" ; break ;
1719
+ default : // Do nothing.
1720
+ break ;
1721
+ }
1722
+ hocr_str += " >" ;
1723
+ bool last_word_in_line = res_it->IsAtFinalElement (RIL_TEXTLINE, RIL_WORD);
1724
+ bool last_word_in_para = res_it->IsAtFinalElement (RIL_PARA, RIL_WORD);
1725
+ bool last_word_in_block = res_it->IsAtFinalElement (RIL_BLOCK, RIL_WORD);
1726
+ if (bold ) hocr_str += " <strong>" ;
1727
+ if (italic ) hocr_str += " <em>" ;
1728
+ do {
1729
+ const char *grapheme = res_it->GetUTF8Text (RIL_SYMBOL);
1730
+ if (grapheme && grapheme[0 ] != 0 ) {
1731
+ if (grapheme[1 ] == 0 ) {
1732
+ hocr_str += HOcrEscape (grapheme);
1733
+ } else {
1734
+ hocr_str += grapheme;
1735
+ }
1736
+ }
1737
+ delete [] grapheme;
1738
+ res_it->Next (RIL_SYMBOL);
1739
+ } while (!res_it->Empty (RIL_BLOCK) && !res_it->IsAtBeginningOf (RIL_WORD));
1740
+ if (italic ) hocr_str += " </em>" ;
1741
+ if (bold ) hocr_str += " </strong>" ;
1742
+ hocr_str += " </span> " ;
1743
+ wcnt++;
1744
+ // Close any ending block/paragraph/textline.
1745
+ if (last_word_in_line) {
1746
+ hocr_str += " \n </span>" ;
1747
+ lcnt++;
1748
+ }
1749
+ if (last_word_in_para) {
1750
+ hocr_str += " \n </p>\n " ;
1751
+ pcnt++;
1752
+ }
1753
+ if (last_word_in_block) {
1754
+ hocr_str += " </div>\n " ;
1755
+ bcnt++;
1756
+ }
1757
+ }
1758
+ hocr_str += " </div>\n " ;
1759
+
1760
+ char *ret = new char [hocr_str.length () + 1 ];
1761
+ strcpy (ret, hocr_str.string ());
1762
+ delete res_it;
1763
+ return ret;
1764
+ }
1765
+
1604
1766
/** Count of numbers output for each box: the usual 4 plus a page number (5 in total). */
1605
1767
const int kNumbersPerBlob = 5 ;
1606
1768
/* *
0 commit comments