Adds char* GetHOCRTSVText(int) as placeholder. Copy of char* GetHOCRText(int).

sundarcf · tfmorris · commit d04e3259afa7 · 2016-03-01T12:13:42.000-05:00
diff --git a/api/baseapi.cpp b/api/baseapi.cpp
@@ -1601,6 +1601,168 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
   return ret;
 }
 
+/**
+ * Make a TSV-formatted string with hOCR markup from the internal
+ * data structures.
+ * page_number is 0-based but will appear in the output as 1-based.
+ * Image name/input_file_ can be set by SetInputName before calling
+ * GetHOCRText
+ * STL removed from original patch submission and refactored by rays.
+ */
+char* TessBaseAPI::GetHOCRTSVText(int page_number) {
+  if (tesseract_ == NULL ||
+      (page_res_ == NULL && Recognize(NULL) < 0))
+    return NULL;
+
+  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
+  int page_id = page_number + 1;  // hOCR uses 1-based page numbers.
+  bool font_info = false;
+  GetBoolVariable("hocr_font_info", &font_info);
+
+  STRING hocr_str("");
+
+  if (input_file_ == NULL)
+      SetInputName(NULL);
+
+#ifdef _WIN32
+  // convert input name from ANSI encoding to utf-8
+  int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
+                                      NULL, NULL);
+  wchar_t *uni16_str = new WCHAR[str16_len];
+  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
+                                  uni16_str, str16_len);
+  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL,
+                                     NULL, NULL, NULL);
+  char *utf8_str = new char[utf8_len];
+  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
+                      utf8_len, NULL, NULL);
+  *input_file_ = utf8_str;
+  delete[] uni16_str;
+  delete[] utf8_str;
+#endif
+
+  hocr_str.add_str_int("  <div class='ocr_page' id='page_", page_id);
+  hocr_str += "' title='image \"";
+  if (input_file_) {
+    hocr_str += HOcrEscape(input_file_->string());
+  } else {
+    hocr_str += "unknown";
+  }
+  hocr_str.add_str_int("\"; bbox ", rect_left_);
+  hocr_str.add_str_int(" ", rect_top_);
+  hocr_str.add_str_int(" ", rect_width_);
+  hocr_str.add_str_int(" ", rect_height_);
+  hocr_str.add_str_int("; ppageno ", page_number);
+  hocr_str += "'>\n";
+
+  ResultIterator *res_it = GetIterator();
+  while (!res_it->Empty(RIL_BLOCK)) {
+    if (res_it->Empty(RIL_WORD)) {
+      res_it->Next(RIL_WORD);
+      continue;
+    }
+
+    // Open any new block/paragraph/textline.
+    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
+      hocr_str.add_str_int("   <div class='ocr_carea' id='block_", page_id);
+      hocr_str.add_str_int("_", bcnt);
+      AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
+    }
+    if (res_it->IsAtBeginningOf(RIL_PARA)) {
+      if (res_it->ParagraphIsLtr()) {
+        hocr_str.add_str_int("\n    <p class='ocr_par' dir='ltr' id='par_",
+                             page_id);
+        hocr_str.add_str_int("_", pcnt);
+      } else {
+        hocr_str.add_str_int("\n    <p class='ocr_par' dir='rtl' id='par_",
+                             page_id);
+        hocr_str.add_str_int("_", pcnt);
+      }
+      AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
+    }
+    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+      hocr_str.add_str_int("\n     <span class='ocr_line' id='line_", page_id);
+      hocr_str.add_str_int("_", lcnt);
+      AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
+    }
+
+    // Now, process the word...
+    hocr_str.add_str_int("<span class='ocrx_word' id='word_", page_id);
+    hocr_str.add_str_int("_", wcnt);
+    int left, top, right, bottom;
+    bool bold, italic, underlined, monospace, serif, smallcaps;
+    int pointsize, font_id;
+    const char *font_name;
+    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
+    font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
+                                           &monospace, &serif, &smallcaps,
+                                           &pointsize, &font_id);
+    hocr_str.add_str_int("' title='bbox ", left);
+    hocr_str.add_str_int(" ", top);
+    hocr_str.add_str_int(" ", right);
+    hocr_str.add_str_int(" ", bottom);
+    hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
+    if (font_info) {
+      hocr_str += "; x_font ";
+      hocr_str += HOcrEscape(font_name);
+      hocr_str.add_str_int("; x_fsize ", pointsize);
+    }
+    hocr_str += "'";
+    if (res_it->WordRecognitionLanguage()) {
+      hocr_str += " lang='";
+      hocr_str += res_it->WordRecognitionLanguage();
+      hocr_str += "'";
+    }
+    switch (res_it->WordDirection()) {
+      case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break;
+      case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break;
+      default:  // Do nothing.
+        break;
+    }
+    hocr_str += ">";
+    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
+    bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
+    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
+    if (bold) hocr_str += "<strong>";
+    if (italic) hocr_str += "<em>";
+    do {
+      const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
+      if (grapheme && grapheme[0] != 0) {
+        if (grapheme[1] == 0) {
+          hocr_str += HOcrEscape(grapheme);
+        } else {
+          hocr_str += grapheme;
+        }
+      }
+      delete []grapheme;
+      res_it->Next(RIL_SYMBOL);
+    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
+    if (italic) hocr_str += "</em>";
+    if (bold) hocr_str += "</strong>";
+    hocr_str += "</span> ";
+    wcnt++;
+    // Close any ending block/paragraph/textline.
+    if (last_word_in_line) {
+      hocr_str += "\n     </span>";
+      lcnt++;
+    }
+    if (last_word_in_para) {
+      hocr_str += "\n    </p>\n";
+      pcnt++;
+    }
+    if (last_word_in_block) {
+      hocr_str += "   </div>\n";
+      bcnt++;
+    }
+  }
+  hocr_str += "  </div>\n";
+
+  char *ret = new char[hocr_str.length() + 1];
+  strcpy(ret, hocr_str.string());
+  delete res_it;
+  return ret;
+}
+
 /** The 5 numbers output for each box (the usual 4 and a page number.) */
 const int kNumbersPerBlob = 5;
 /**
diff --git a/api/baseapi.h b/api/baseapi.h
@@ -602,6 +602,14 @@ class TESS_API TessBaseAPI {
    */
   char* GetHOCRText(int page_number);
 
+  /**
+   * Make a TSV-formatted string with hOCR markup from the internal
+   * data structures.
+   * page_number is 0-based but will appear in the output as 1-based.
+   */
+  char* GetHOCRTSVText(int page_number);
+
+
   /**
    * The recognized text is returned as a char* which is coded in the same
    * format as a box file used in training. Returned string must be freed with