Skip to content

Commit 4412269

Browse files
committed
Removed debug messages, forward compatibility of traineddata files, further bug fix.
1 parent a303ab9 commit 4412269

File tree

4 files changed

+41
-53
lines changed

4 files changed

+41
-53
lines changed

ccstruct/pageres.cpp

+7-6
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,12 @@
11
/**********************************************************************
22
* File: pageres.cpp (Formerly page_res.c)
3-
* Description: Results classes used by control.c
4-
* Author: Phil Cheatle
3+
* Description: Hierarchy of results classes from PAGE_RES to WERD_RES
4+
* and an iterator class to iterate over the words.
5+
* Main purposes:
6+
* Easy way to iterate over the words without a 3-nested loop.
7+
* Holds data used during word recognition.
8+
* Holds information about alternative spacing paths.
9+
* Author: Phil Cheatle
510
* Created: Tue Sep 22 08:42:49 BST 1992
611
*
712
* (C) Copyright 1992, Hewlett-Packard Ltd.
@@ -1478,8 +1483,6 @@ void PAGE_RES_IT::MakeCurrentWordFuzzy() {
14781483
WERD* real_word = word_res->word;
14791484
if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
14801485
real_word->set_flag(W_FUZZY_SP, true);
1481-
tprintf("Made word fuzzy at:");
1482-
real_word->bounding_box().print();
14831486
if (word_res->combination) {
14841487
// The next word should be the corresponding part of combo, but we have
14851488
// already stepped past it, so find it by search.
@@ -1493,8 +1496,6 @@ void PAGE_RES_IT::MakeCurrentWordFuzzy() {
14931496
ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
14941497
!real_word->flag(W_FUZZY_NON));
14951498
real_word->set_flag(W_FUZZY_SP, true);
1496-
tprintf("Made part of combo word fuzzy at:");
1497-
real_word->bounding_box().print();
14981499
}
14991500
}
15001501
}

ccutil/tessdatamanager.cpp

+4-1
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,10 @@ bool TessdataManager::Init(const char *data_file_name, int debug_level) {
5050
ReverseN(&actual_tessdata_num_entries_,
5151
sizeof(actual_tessdata_num_entries_));
5252
}
53-
ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
53+
if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
54+
// For forward compatibility, truncate to the number we can handle.
55+
actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
56+
}
5457
fread(offset_table_, sizeof(inT64),
5558
actual_tessdata_num_entries_, data_file_);
5659
if (swap_) {

ccutil/unicharset.cpp

+25-41
Original file line number | Diff line number | Diff line change
@@ -215,34 +215,6 @@ int UNICHARSET::step(const char* str) const {
215215
if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
216216
return lengths[0];
217217
}
218-
// As step except constraining the search to unichar-ids that are
219-
// self-normalized. Unlike step, does not encode the whole string, therefore
220-
// should be used on short strings (like those obtained from
221-
// get_normed_unichar.)
222-
int UNICHARSET::normed_step(const char* str) const {
223-
// Find the length of the first matching unicharset member.
224-
int length = ids.minmatch(str);
225-
if (length == 0)
226-
return 0; // Empty string or illegal char.
227-
228-
while (length <= UNICHAR_LEN) {
229-
if (ids.contains(str, length)) {
230-
int matched_id = unichar_to_id(str, length);
231-
const GenericVector<UNICHAR_ID>& matched_norms = normed_ids(matched_id);
232-
bool good_start = matched_norms.size() == 1 &&
233-
matched_norms[0] == matched_id;
234-
if (str[length] == '\0') {
235-
return good_start ? length : 0;
236-
}
237-
if (normed_step(str + length) > 0)
238-
return length; // This length works!
239-
} else if (str[length] == '\0') {
240-
return 0; // Ran out of string.
241-
}
242-
++length;
243-
}
244-
return 0;
245-
}
246218

247219
// Return whether the given UTF-8 string is encodable with this UNICHARSET.
248220
// If not encodable, write the first byte offset which cannot be converted
@@ -375,19 +347,13 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
375347
// stored in the file, and needs to be set when the UNICHARSET is loaded.
376348
void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
377349
unichars[unichar_id].properties.normed_ids.truncate(0);
378-
int length = unichars[unichar_id].properties.normed.length();
379-
const char* normed_str = unichars[unichar_id].properties.normed.string();
380-
int step = 0;
381-
for (int offset = 0; offset < length; offset+= step) {
382-
step = normed_step(normed_str + offset);
383-
if (step == 0) {
384-
unichars[unichar_id].properties.normed_ids.truncate(0);
385-
unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
386-
break;
387-
}
388-
int normed_id = unichar_to_id(normed_str + offset, step);
389-
ASSERT_HOST(normed_id >= 0);
390-
unichars[unichar_id].properties.normed_ids.push_back(normed_id);
350+
if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
351+
unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
352+
} else if (!encode_string(unichars[unichar_id].properties.normed.string(),
353+
true, &unichars[unichar_id].properties.normed_ids,
354+
NULL, NULL)) {
355+
unichars[unichar_id].properties.normed_ids.truncate(0);
356+
unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
391357
}
392358
}
393359

@@ -1015,6 +981,24 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
1015981
}
1016982
}
1017983

984+
// Returns true if there are any repeated unicodes in the normalized
985+
// text of any unichar-id in the unicharset.
986+
bool UNICHARSET::AnyRepeatedUnicodes() const {
987+
int start_id = 0;
988+
if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT;
989+
for (int id = start_id; id < size_used; ++id) {
990+
// Convert to unicodes.
991+
GenericVector<int> unicodes;
992+
if (UNICHAR::UTF8ToUnicode(get_normed_unichar(id), &unicodes) &&
993+
unicodes.size() > 1) {
994+
for (int u = 1; u < unicodes.size(); ++u) {
995+
if (unicodes[u - 1] == unicodes[u]) return true;
996+
}
997+
}
998+
}
999+
return false;
1000+
}
1001+
10181002
int UNICHARSET::add_script(const char* script) {
10191003
for (int i = 0; i < script_table_size_used; ++i) {
10201004
if (strcmp(script, script_table[i]) == 0)

ccutil/unicharset.h

+5-5
Original file line number | Diff line number | Diff line change
@@ -190,11 +190,6 @@ class UNICHARSET {
190190
// WARNING: this function now encodes the whole string for precision.
191191
// Use encode_string in preference to repeatedly calling step.
192192
int step(const char* str) const;
193-
// As step except constraining the search to unichar-ids that are
194-
// self-normalized. Unlike step, does not encode the whole string, therefore
195-
// should be used on short strings (like those obtained from
196-
// get_normed_unichar.)
197-
int normed_step(const char* str) const;
198193

199194
// Return whether the given UTF-8 string is encodable with this UNICHARSET.
200195
// If not encodable, write the first byte offset which cannot be converted
@@ -678,6 +673,10 @@ class UNICHARSET {
678673
kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
679674
}
680675

676+
// Returns true if there are any repeated unicodes in the normalized
677+
// text of any unichar-id in the unicharset.
678+
bool AnyRepeatedUnicodes() const;
679+
681680
// Return a pointer to the CHAR_FRAGMENT class if the given
682681
// unichar id represents a character fragment.
683682
const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
@@ -775,6 +774,7 @@ class UNICHARSET {
775774

776775
// Returns normalized version of unichar with the given unichar_id.
777776
const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
777+
if (unichar_id == UNICHAR_SPACE && has_special_codes()) return " ";
778778
return unichars[unichar_id].properties.normed.string();
779779
}
780780
// Returns a vector of UNICHAR_IDs that represent the ids of the normalized

0 commit comments

Comments
 (0)