Removed debug messages, forward compatibility of traineddata files, further bug fix.

theraysmith · theraysmith · commit 44122698d746 · 2015-07-09T14:50:25.000-07:00
diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp
@@ -1,7 +1,12 @@
 /**********************************************************************
  * File:        pageres.cpp  (Formerly page_res.c)
- * Description: Results classes used by control.c
- * Author:		Phil Cheatle
+ * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
+ *              and an iterator class to iterate over the words.
+ * Main purposes:
+ *              Easy way to iterate over the words without a 3-nested loop.
+ *              Holds data used during word recognition.
+ *              Holds information about alternative spacing paths.
+ * Author:      Phil Cheatle
  * Created:     Tue Sep 22 08:42:49 BST 1992
  *
  * (C) Copyright 1992, Hewlett-Packard Ltd.
@@ -1478,8 +1483,6 @@ void PAGE_RES_IT::MakeCurrentWordFuzzy() {
   WERD* real_word = word_res->word;
   if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
     real_word->set_flag(W_FUZZY_SP, true);
-    tprintf("Made word fuzzy at:");
-    real_word->bounding_box().print();
     if (word_res->combination) {
       // The next word should be the corresponding part of combo, but we have
       // already stepped past it, so find it by search.
@@ -1493,8 +1496,6 @@ void PAGE_RES_IT::MakeCurrentWordFuzzy() {
       ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
                   !real_word->flag(W_FUZZY_NON));
       real_word->set_flag(W_FUZZY_SP, true);
-      tprintf("Made part of combo word fuzzy at:");
-      real_word->bounding_box().print();
     }
   }
 }
diff --git a/ccutil/tessdatamanager.cpp b/ccutil/tessdatamanager.cpp
@@ -50,7 +50,10 @@ bool TessdataManager::Init(const char *data_file_name, int debug_level) {
     ReverseN(&actual_tessdata_num_entries_,
              sizeof(actual_tessdata_num_entries_));
   }
-  ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
+  if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
+    // For forward compatibility, truncate to the number we can handle.
+    actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
+  }
   fread(offset_table_, sizeof(inT64),
         actual_tessdata_num_entries_, data_file_);
   if (swap_) {
diff --git a/ccutil/unicharset.cpp b/ccutil/unicharset.cpp
@@ -215,34 +215,6 @@ int UNICHARSET::step(const char* str) const {
   if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
   return lengths[0];
 }
-// As step except constraining the search to unichar-ids that are
-// self-normalized. Unlike step, does not encode the whole string, therefore
-// should be used on short strings (like those obtained from
-// get_normed_unichar.)
-int UNICHARSET::normed_step(const char* str) const {
-  // Find the length of the first matching unicharset member.
-  int length = ids.minmatch(str);
-  if (length == 0)
-    return 0;  // Empty string or illegal char.
-
-  while (length <= UNICHAR_LEN) {
-    if (ids.contains(str, length)) {
-      int matched_id = unichar_to_id(str, length);
-      const GenericVector<UNICHAR_ID>& matched_norms = normed_ids(matched_id);
-      bool good_start = matched_norms.size() == 1 &&
-                        matched_norms[0] == matched_id;
-      if (str[length] == '\0') {
-        return good_start ? length : 0;
-      }
-      if (normed_step(str + length) > 0)
-        return length;  // This length works!
-    } else if (str[length] == '\0') {
-      return 0;  // Ran out of string.
-    }
-    ++length;
-  }
-  return 0;
-}
 
 // Return whether the given UTF-8 string is encodable with this UNICHARSET.
 // If not encodable, write the first byte offset which cannot be converted
@@ -375,19 +347,13 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
 // stored in the file, and needs to be set when the UNICHARSET is loaded.
 void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
   unichars[unichar_id].properties.normed_ids.truncate(0);
-  int length = unichars[unichar_id].properties.normed.length();
-  const char* normed_str = unichars[unichar_id].properties.normed.string();
-  int step = 0;
-  for (int offset = 0; offset < length; offset+= step) {
-    step = normed_step(normed_str + offset);
-    if (step == 0) {
-      unichars[unichar_id].properties.normed_ids.truncate(0);
-      unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
-      break;
-    }
-    int normed_id = unichar_to_id(normed_str + offset, step);
-    ASSERT_HOST(normed_id >= 0);
-    unichars[unichar_id].properties.normed_ids.push_back(normed_id);
+  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
+    unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
+  } else if (!encode_string(unichars[unichar_id].properties.normed.string(),
+                            true, &unichars[unichar_id].properties.normed_ids,
+                            NULL, NULL)) {
+    unichars[unichar_id].properties.normed_ids.truncate(0);
+    unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
   }
 }
 
@@ -1015,6 +981,24 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
   }
 }
 
+// Returns true if the normalized text of any unichar-id in the unicharset
+// contains the same unicode twice in a row (adjacent pairs only).
+bool UNICHARSET::AnyRepeatedUnicodes() const {
+  int start_id = 0;
+  if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT;
+  for (int id = start_id; id < size_used; ++id) {
+    // Convert to unicodes; an id is skipped if the conversion returns false.
+    GenericVector<int> unicodes;
+    if (UNICHAR::UTF8ToUnicode(get_normed_unichar(id), &unicodes) &&
+        unicodes.size() > 1) {
+      for (int u = 1; u < unicodes.size(); ++u) {
+        if (unicodes[u - 1] == unicodes[u]) return true;
+      }
+    }
+  }
+  return false;
+}
+
 int UNICHARSET::add_script(const char* script) {
   for (int i = 0; i < script_table_size_used; ++i) {
     if (strcmp(script, script_table[i]) == 0)
diff --git a/ccutil/unicharset.h b/ccutil/unicharset.h
@@ -190,11 +190,6 @@ class UNICHARSET {
   // WARNING: this function now encodes the whole string for precision.
   // Use encode_string in preference to repeatedly calling step.
   int step(const char* str) const;
-  // As step except constraining the search to unichar-ids that are
-  // self-normalized. Unlike step, does not encode the whole string, therefore
-  // should be used on short strings (like those obtained from
-  // get_normed_unichar.)
-  int normed_step(const char* str) const;
 
   // Return whether the given UTF-8 string is encodable with this UNICHARSET.
   // If not encodable, write the first byte offset which cannot be converted
@@ -678,6 +673,10 @@ class UNICHARSET {
                kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
   }
 
+  // Returns true if the normalized text of any unichar-id in the unicharset
+  // contains the same unicode twice in a row (adjacent pairs only).
+  bool AnyRepeatedUnicodes() const;
+
   // Return a pointer to the CHAR_FRAGMENT class if the given
   // unichar id represents a character fragment.
   const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
@@ -775,6 +774,7 @@ class UNICHARSET {
 
   // Returns normalized version of unichar with the given unichar_id.
   const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
+    if (unichar_id == UNICHAR_SPACE && has_special_codes()) return " ";
     return unichars[unichar_id].properties.normed.string();
   }
   // Returns a vector of UNICHAR_IDs that represent the ids of the normalized