Skip to content

Commit 297d7d8

Browse files
committed
trying to add user words/patterns again:
- pass in ParamsVectors from Tesseract (carrying values from langdata/config/api) into LSTMRecognizer::Load and LoadDictionary
- after LSTMRecognizer's Dict is initialised (with default values), reset the variables user_{words,patterns}_{suffix,file} from the corresponding entries in the passed vector
1 parent 0a36b38 commit 297d7d8

File tree

5 files changed

+86
-7
lines changed

5 files changed

+86
-7
lines changed

src/ccmain/tessedit.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ bool Tesseract::init_tesseract_lang_data(
186186
if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
187187
lstm_recognizer_ = new LSTMRecognizer;
188188
ASSERT_HOST(
189-
lstm_recognizer_->Load(lstm_use_matrix ? language : nullptr, mgr));
189+
lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : nullptr, mgr));
190190
} else {
191191
tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
192192
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);

src/ccutil/params.h

+33-1
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,15 @@ class IntParam : public Param {
155155
void ResetToDefault() {
156156
value_ = default_;
157157
}
158-
158+
void ResetFrom(const ParamsVectors* vec) {
159+
for (int i = 0; i < vec->int_params.size(); ++i) {
160+
if (strcmp(vec->int_params[i]->name_str(), name_) == 0) {
161+
//printf("overriding param %s=%d by =%d\n", name_, value_, *vec->int_params[i]);
162+
value_ = *vec->int_params[i];
163+
}
164+
}
165+
}
166+
159167
private:
160168
int32_t value_;
161169
int32_t default_;
@@ -179,6 +187,14 @@ class BoolParam : public Param {
179187
void ResetToDefault() {
180188
value_ = default_;
181189
}
190+
void ResetFrom(const ParamsVectors* vec) {
191+
for (int i = 0; i < vec->bool_params.size(); ++i) {
192+
if (strcmp(vec->bool_params[i]->name_str(), name_) == 0) {
193+
//printf("overriding param %s=%s by =%s\n", name_, value_ ? "true" : "false", *vec->bool_params[i] ? "true" : "false");
194+
value_ = *vec->bool_params[i];
195+
}
196+
}
197+
}
182198

183199
private:
184200
BOOL8 value_;
@@ -208,6 +224,14 @@ class StringParam : public Param {
208224
void ResetToDefault() {
209225
value_ = default_;
210226
}
227+
void ResetFrom(const ParamsVectors* vec) {
228+
for (int i = 0; i < vec->string_params.size(); ++i) {
229+
if (strcmp(vec->string_params[i]->name_str(), name_) == 0) {
230+
//printf("overriding param %s=%s by =%s\n", name_, value_, vec->string_params[i]->c_str());
231+
value_ = *vec->string_params[i];
232+
}
233+
}
234+
}
211235

212236
private:
213237
STRING value_;
@@ -232,6 +256,14 @@ class DoubleParam : public Param {
232256
void ResetToDefault() {
233257
value_ = default_;
234258
}
259+
void ResetFrom(const ParamsVectors* vec) {
260+
for (int i = 0; i < vec->double_params.size(); ++i) {
261+
if (strcmp(vec->double_params[i]->name_str(), name_) == 0) {
262+
//printf("overriding param %s=%f by =%f\n", name_, value_, *vec->double_params[i]);
263+
value_ = *vec->double_params[i];
264+
}
265+
}
266+
}
235267

236268
private:
237269
double value_;

src/dict/dict.cpp

+41
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,47 @@ void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) {
316316
lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
317317
if (number_dawg) dawgs_ += number_dawg;
318318
}
319+
320+
// stolen from Dict::Load (but needs params_ from Tesseract langdata/config/api):
321+
STRING name;
322+
if (((STRING &)user_words_suffix).length() > 0 ||
323+
((STRING &)user_words_file).length() > 0) {
324+
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
325+
getUnicharset().size(), dawg_debug_level);
326+
if (((STRING &)user_words_file).length() > 0) {
327+
name = user_words_file;
328+
} else {
329+
name = getCCUtil()->language_data_path_prefix;
330+
name += user_words_suffix;
331+
}
332+
if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
333+
Trie::RRP_REVERSE_IF_HAS_RTL)) {
334+
tprintf("Error: failed to load %s\n", name.string());
335+
delete trie_ptr;
336+
} else {
337+
dawgs_ += trie_ptr;
338+
}
339+
}
340+
341+
if (((STRING &)user_patterns_suffix).length() > 0 ||
342+
((STRING &)user_patterns_file).length() > 0) {
343+
Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
344+
getUnicharset().size(), dawg_debug_level);
345+
trie_ptr->initialize_patterns(&(getUnicharset()));
346+
if (((STRING &)user_patterns_file).length() > 0) {
347+
name = user_patterns_file;
348+
} else {
349+
name = getCCUtil()->language_data_path_prefix;
350+
name += user_patterns_suffix;
351+
}
352+
if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
353+
tprintf("Error: failed to load %s\n", name.string());
354+
delete trie_ptr;
355+
} else {
356+
dawgs_ += trie_ptr;
357+
}
358+
}
359+
319360
}
320361

321362
// Completes the loading process after Load() and/or LoadLSTM().

src/lstm/lstmrecognizer.cpp

+8-3
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,13 @@ LSTMRecognizer::~LSTMRecognizer() {
6666
}
6767

6868
// Loads a model from mgr, including the dictionary only if lang is not null.
69-
bool LSTMRecognizer::Load(const char* lang, TessdataManager* mgr) {
69+
bool LSTMRecognizer::Load(const ParamsVectors* params, const char* lang, TessdataManager* mgr) {
7070
TFile fp;
7171
if (!mgr->GetComponent(TESSDATA_LSTM, &fp)) return false;
7272
if (!DeSerialize(mgr, &fp)) return false;
7373
if (lang == nullptr) return true;
7474
// Allow it to run without a dictionary.
75-
LoadDictionary(lang, mgr);
75+
LoadDictionary(params, lang, mgr);
7676
return true;
7777
}
7878

@@ -154,9 +154,14 @@ bool LSTMRecognizer::LoadRecoder(TFile* fp) {
154154
// on the unicharset matching. This enables training to deserialize a model
155155
// from checkpoint or restore without having to go back and reload the
156156
// dictionary.
157-
bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) {
157+
// Some parameters have to be passed in (from langdata/config/api via Tesseract)
158+
bool LSTMRecognizer::LoadDictionary(const ParamsVectors* params, const char* lang, TessdataManager* mgr) {
158159
delete dict_;
159160
dict_ = new Dict(&ccutil_);
161+
dict_->user_words_file.ResetFrom(params);
162+
dict_->user_words_suffix.ResetFrom(params);
163+
dict_->user_patterns_file.ResetFrom(params);
164+
dict_->user_patterns_suffix.ResetFrom(params);
160165
dict_->SetupForLoad(Dict::GlobalDawgCache());
161166
dict_->LoadLSTM(lang, mgr);
162167
if (dict_->FinishLoad()) return true; // Success.

src/lstm/lstmrecognizer.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "matrix.h"
2626
#include "network.h"
2727
#include "networkscratch.h"
28+
#include "params.h"
2829
#include "recodebeam.h"
2930
#include "series.h"
3031
#include "strngs.h"
@@ -154,7 +155,7 @@ class LSTMRecognizer {
154155
int null_char() const { return null_char_; }
155156

156157
// Loads a model from mgr, including the dictionary only if lang is not null.
157-
bool Load(const char* lang, TessdataManager* mgr);
158+
bool Load(const ParamsVectors* params, const char* lang, TessdataManager* mgr);
158159

159160
// Writes to the given file. Returns false in case of error.
160161
// If mgr contains a unicharset and recoder, then they are not encoded to fp.
@@ -174,7 +175,7 @@ class LSTMRecognizer {
174175
// on the unicharset matching. This enables training to deserialize a model
175176
// from checkpoint or restore without having to go back and reload the
176177
// dictionary.
177-
bool LoadDictionary(const char* lang, TessdataManager* mgr);
178+
bool LoadDictionary(const ParamsVectors* params, const char* lang, TessdataManager* mgr);
178179

179180
// Recognizes the line image, contained within image_data, returning the
180181
// recognized tesseract WERD_RES for the words.

0 commit comments

Comments
 (0)