Clean up TSV renderer

tfmorris · tfmorris · commit 6700edd8bcdd · 2016-03-01T13:41:19.000-05:00
Remove all references to hocr, hocr.tsv, etc. Remove dead code for font
info, input filename, and HTML escapes. Improve comments. Fix
indentation.
diff --git a/api/baseapi.cpp b/api/baseapi.cpp
@@ -1417,7 +1417,7 @@ static void AddBoxTohOCR(const ResultIterator *it,
   *hocr_str += "\">";
 }
 
-static void AddBoxTohOCRTSV(const PageIterator *it,
+static void AddBoxToTSV(const PageIterator *it,
                          PageIteratorLevel level,
                          STRING* hocr_str) {
   int left, top, right, bottom;
@@ -1615,57 +1615,31 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
 }
 
 /**
- * Make a TSV-formatted string with hOCR markup from the internal
- * data structures.
+ * Make a TSV-formatted string from the internal data structures.
  * page_number is 0-based but will appear in the output as 1-based.
- * Image name/input_file_ can be set by SetInputName before calling
- * GetHOCRText
- * STL removed from original patch submission and refactored by rays.
  */
-char* TessBaseAPI::GetHOCRTSVText(int page_number) {
+char* TessBaseAPI::GetTSVText(int page_number) {
   if (tesseract_ == NULL ||
       (page_res_ == NULL && Recognize(NULL) < 0))
     return NULL;
 
   int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
-  int page_id = page_number + 1;  // hOCR uses 1-based page numbers.
-  bool font_info = false;
-  GetBoolVariable("hocr_font_info", &font_info);
-
-  STRING hocr_str("");
+  int page_id = page_number + 1;  // we use 1-based page numbers.
 
-  if (input_file_ == NULL)
-      SetInputName(NULL);
-
-#ifdef _WIN32
-  // convert input name from ANSI encoding to utf-8
-  int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
-                                      NULL, NULL);
-  wchar_t *uni16_str = new WCHAR[str16_len];
-  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
-                                  uni16_str, str16_len);
-  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL,
-                                     NULL, NULL, NULL);
-  char *utf8_str = new char[utf8_len];
-  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
-                      utf8_len, NULL, NULL);
-  *input_file_ = utf8_str;
-  delete[] uni16_str;
-  delete[] utf8_str;
-#endif
+  STRING tsv_str("");
 
   int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, word_num = 0;
 
-  hocr_str.add_str_int("1\t", page_num);
-  hocr_str.add_str_int("\t", block_num);
-  hocr_str.add_str_int("\t", par_num);
-  hocr_str.add_str_int("\t", line_num);
-  hocr_str.add_str_int("\t", word_num);
-  hocr_str.add_str_int("\t", rect_left_);
-  hocr_str.add_str_int("\t", rect_top_);
-  hocr_str.add_str_int("\t", rect_width_);
-  hocr_str.add_str_int("\t", rect_height_);
-  hocr_str += "\t-1\t\n";
+  tsv_str.add_str_int("1\t", page_num); // level 1 - page
+  tsv_str.add_str_int("\t", block_num);
+  tsv_str.add_str_int("\t", par_num);
+  tsv_str.add_str_int("\t", line_num);
+  tsv_str.add_str_int("\t", word_num);
+  tsv_str.add_str_int("\t", rect_left_);
+  tsv_str.add_str_int("\t", rect_top_);
+  tsv_str.add_str_int("\t", rect_width_);
+  tsv_str.add_str_int("\t", rect_height_);
+  tsv_str += "\t-1\t\n";
 
   ResultIterator *res_it = GetIterator();
   while (!res_it->Empty(RIL_BLOCK)) {
@@ -1674,36 +1648,36 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
       continue;
     }
 
-    // Open any new block/paragraph/textline.
+    // Add rows for any new block/paragraph/textline.
     if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
       block_num++, par_num = 0, line_num = 0, word_num = 0;
-      hocr_str.add_str_int("2\t", page_num);
-      hocr_str.add_str_int("\t", block_num);
-      hocr_str.add_str_int("\t", par_num);
-      hocr_str.add_str_int("\t", line_num);
-      hocr_str.add_str_int("\t", word_num);
-      AddBoxTohOCRTSV(res_it, RIL_BLOCK, &hocr_str);
-      hocr_str += "\t-1\t\n";
+      tsv_str.add_str_int("2\t", page_num); // level 2 - block
+      tsv_str.add_str_int("\t", block_num);
+      tsv_str.add_str_int("\t", par_num);
+      tsv_str.add_str_int("\t", line_num);
+      tsv_str.add_str_int("\t", word_num);
+      AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str);
+      tsv_str += "\t-1\t\n"; // end of row for block
     }
     if (res_it->IsAtBeginningOf(RIL_PARA)) {
       par_num++, line_num = 0, word_num = 0;
-      hocr_str.add_str_int("3\t", page_num);
-      hocr_str.add_str_int("\t", block_num);
-      hocr_str.add_str_int("\t", par_num);
-      hocr_str.add_str_int("\t", line_num);
-      hocr_str.add_str_int("\t", word_num);
-      AddBoxTohOCRTSV(res_it, RIL_PARA, &hocr_str);
-      hocr_str += "\t-1\t\n";
+      tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph
+      tsv_str.add_str_int("\t", block_num);
+      tsv_str.add_str_int("\t", par_num);
+      tsv_str.add_str_int("\t", line_num);
+      tsv_str.add_str_int("\t", word_num);
+      AddBoxToTSV(res_it, RIL_PARA, &tsv_str);
+      tsv_str += "\t-1\t\n"; // end of row for para
     }
     if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
       line_num++, word_num = 0;
-      hocr_str.add_str_int("4\t", page_num);
-      hocr_str.add_str_int("\t", block_num);
-      hocr_str.add_str_int("\t", par_num);
-      hocr_str.add_str_int("\t", line_num);
-      hocr_str.add_str_int("\t", word_num);
-      AddBoxTohOCRTSV(res_it, RIL_TEXTLINE, &hocr_str);
-      hocr_str += "\t-1\t\n";
+      tsv_str.add_str_int("4\t", page_num); // level 4 - line
+      tsv_str.add_str_int("\t", block_num);
+      tsv_str.add_str_int("\t", par_num);
+      tsv_str.add_str_int("\t", line_num);
+      tsv_str.add_str_int("\t", word_num);
+      AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str);
+      tsv_str += "\t-1\t\n"; // end of row for line
     }
 
     // Now, process the word...
@@ -1715,49 +1689,34 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
     font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
                                            &monospace, &serif, &smallcaps,
                                            &pointsize, &font_id);
-      word_num++;
-      hocr_str.add_str_int("5\t", page_num);
-      hocr_str.add_str_int("\t", block_num);
-      hocr_str.add_str_int("\t", par_num);
-      hocr_str.add_str_int("\t", line_num);
-      hocr_str.add_str_int("\t", word_num);
-      hocr_str.add_str_int("\t", left);
-      hocr_str.add_str_int("\t", top);
-      hocr_str.add_str_int("\t", right - left + 1);
-      hocr_str.add_str_int("\t", bottom - top + 1);
-      hocr_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
-    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
-    bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
-    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
-    hocr_str += "\t";
+    word_num++;
+    tsv_str.add_str_int("5\t", page_num); // level 5 - word
+    tsv_str.add_str_int("\t", block_num);
+    tsv_str.add_str_int("\t", par_num);
+    tsv_str.add_str_int("\t", line_num);
+    tsv_str.add_str_int("\t", word_num);
+    tsv_str.add_str_int("\t", left);
+    tsv_str.add_str_int("\t", top);
+    tsv_str.add_str_int("\t", right - left + 1);
+    tsv_str.add_str_int("\t", bottom - top + 1);
+    tsv_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
+    tsv_str += "\t";
+
+    // Increment counts if at end of block/paragraph/textline.
+    if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) lcnt++;
+    if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) pcnt++;
+    if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++;
+
     do {
-      const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
-//      if (grapheme && grapheme[0] != 0) {
-//        if (grapheme[1] == 0) {
-//          hocr_str += HOcrEscape(grapheme);
-//        } else {
-          hocr_str += grapheme;
-//        }
-//      }
-      delete []grapheme;
+      tsv_str += res_it->GetUTF8Text(RIL_SYMBOL);
       res_it->Next(RIL_SYMBOL);
     } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
-    hocr_str += "\n";
+    tsv_str += "\n"; // end of row
     wcnt++;
-    // Close any ending block/paragraph/textline.
-    if (last_word_in_line) {
-      lcnt++;
-    }
-    if (last_word_in_para) {
-      pcnt++;
-    }
-    if (last_word_in_block) {
-      bcnt++;
-    }
   }
 
-  char *ret = new char[hocr_str.length() + 1];
-  strcpy(ret, hocr_str.string());
+  char *ret = new char[tsv_str.length() + 1];
+  strcpy(ret, tsv_str.string());
   delete res_it;
   return ret;
 }
diff --git a/api/baseapi.h b/api/baseapi.h
@@ -603,12 +603,10 @@ class TESS_API TessBaseAPI {
   char* GetHOCRText(int page_number);
 
   /**
-   * Make a TSV-formatted string with hOCR markup from the internal
-   * data structures.
+   * Make a TSV-formatted string from the internal data structures.
    * page_number is 0-based but will appear in the output as 1-based.
    */
-  char* GetHOCRTSVText(int page_number);
-
+  char* GetTSVText(int page_number);
 
   /**
    * The recognized text is returned as a char* which is coded in the same
diff --git a/api/renderer.cpp b/api/renderer.cpp
@@ -182,31 +182,32 @@ bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
 /**********************************************************************
  * HOcr Text Renderer interface implementation
  **********************************************************************/
-TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase)
-    : TessResultRenderer(outputbase, "hocr.tsv") {
+TessTsvRenderer::TessTsvRenderer(const char *outputbase)
+    : TessResultRenderer(outputbase, "tsv") {
     font_info_ = false;
 }
 
-TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase, bool font_info)
-    : TessResultRenderer(outputbase, "hocr.tsv") {
+TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
+    : TessResultRenderer(outputbase, "tsv") {
     font_info_ = font_info;
 }
 
-bool TessHOcrTsvRenderer::BeginDocumentHandler() {
+bool TessTsvRenderer::BeginDocumentHandler() {
+  // Output TSV column headings
   AppendString("level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n");
   return true;
 }
 
-bool TessHOcrTsvRenderer::EndDocumentHandler() {
+bool TessTsvRenderer::EndDocumentHandler() {
   return true;
 }
 
-bool TessHOcrTsvRenderer::AddImageHandler(TessBaseAPI* api) {
-  char* hocrtsv = api->GetHOCRTSVText(imagenum());
-  if (hocrtsv == NULL) return false;
+bool TessTsvRenderer::AddImageHandler(TessBaseAPI* api) {
+  char* tsv = api->GetTSVText(imagenum());
+  if (tsv == NULL) return false;
 
-  AppendString(hocrtsv);
-  delete[] hocrtsv;
+  AppendString(tsv);
+  delete[] tsv;
 
   return true;
 }
diff --git a/api/renderer.h b/api/renderer.h
@@ -163,12 +163,12 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer {
 };
 
 /**
- * Renders tesseract output into an hocr tsv string
+ * Renders Tesseract output into a TSV string
  */
-class TESS_API TessHOcrTsvRenderer : public TessResultRenderer {
+class TESS_API TessTsvRenderer : public TessResultRenderer {
  public:
-  explicit TessHOcrTsvRenderer(const char *outputbase, bool font_info);
-  explicit TessHOcrTsvRenderer(const char *outputbase);
+  explicit TessTsvRenderer(const char *outputbase, bool font_info);
+  explicit TessTsvRenderer(const char *outputbase);
 
 protected:
   virtual bool BeginDocumentHandler();
diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp
@@ -299,12 +299,12 @@ void PreloadRenderers(tesseract::TessBaseAPI* api,
                      new tesseract::TessHOcrRenderer(outputbase, font_info));
     }
 
-    api->GetBoolVariable("tessedit_create_hocrtsv", &b);
+    api->GetBoolVariable("tessedit_create_tsv", &b);
     if (b) {
       bool font_info;
       api->GetBoolVariable("hocr_font_info", &font_info);
       renderers->push_back(
-          new tesseract::TessHOcrTsvRenderer(outputbase, font_info));
+          new tesseract::TessTsvRenderer(outputbase, font_info));
     }
 
     api->GetBoolVariable("tessedit_create_pdf", &b);
diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp
@@ -385,7 +385,7 @@ Tesseract::Tesseract()
                   this->params()),
       BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
                   this->params()),
-      BOOL_MEMBER(tessedit_create_hocrtsv, false, "Write .hocr.tsv TSV output file",
+      BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
                   this->params()),
       BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
                   this->params()),
diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h
@@ -1003,7 +1003,7 @@ class Tesseract : public Wordrec {
   BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
   BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
   BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
-  BOOL_VAR_H(tessedit_create_hocrtsv, false, "Write .hocr.tsv hOCR-tsv output file");
+  BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
   BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
   STRING_VAR_H(unrecognised_char, "|",
                "Output char for unidentified blobs");
diff --git a/tessdata/configs/hocrtsv b/tessdata/configs/hocrtsv
diff --git a/tessdata/configs/tsv b/tessdata/configs/tsv
@@ -0,0 +1,2 @@
+tessedit_create_tsv 1
+tessedit_pageseg_mode 1

Original file line number	Diff line number	Diff line change
`@@ -299,12 +299,12 @@ void PreloadRenderers(tesseract::TessBaseAPI* api,`
`299`	`299`	`new tesseract::TessHOcrRenderer(outputbase, font_info));`
`300`	`300`	`}`
`301`	`301`
`302`		`- api->GetBoolVariable("tessedit_create_hocrtsv", &b);`
	`302`	`+ api->GetBoolVariable("tessedit_create_tsv", &b);`
`303`	`303`	`if (b) {`
`304`	`304`	`bool font_info;`
`305`	`305`	`api->GetBoolVariable("hocr_font_info", &font_info);`
`306`	`306`	`renderers->push_back(`
`307`		`- new tesseract::TessHOcrTsvRenderer(outputbase, font_info));`
	`307`	`+ new tesseract::TessTsvRenderer(outputbase, font_info));`
`308`	`308`	`}`
`309`	`309`
`310`	`310`	`api->GetBoolVariable("tessedit_create_pdf", &b);`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+tessedit_create_tsv 1`
	`2`	`+tessedit_pageseg_mode 1`