Skip to content

Commit 1c7e006

Browse files
committed
Add initial support for traineddata files in standard archive formats
This requires libarchive-dev.

Tesseract can now load traineddata files in any of the archive formats
which are supported by libarchive.

Example of a zipped BagIt archive:

    $ unzip -l /usr/local/share/tessdata/zip.traineddata
    Archive:  /usr/local/share/tessdata/zip.traineddata
      Length      Date    Time    Name
    ---------  ---------- -----   ----
           55  2019-03-05 15:27   bagit.txt
            0  2019-03-05 15:25   data/
         1557  2019-03-05 15:28   manifest-sha256.txt
      1082890  2019-03-05 15:25   data/eng.word-dawg
      1487588  2019-03-05 15:25   data/eng.lstm
         7477  2019-03-05 15:25   data/eng.unicharset
        63346  2019-03-05 15:25   data/eng.shapetable
       976552  2019-03-05 15:25   data/eng.inttemp
        13408  2019-03-05 15:25   data/eng.normproto
         4322  2019-03-05 15:25   data/eng.punc-dawg
         4738  2019-03-05 15:25   data/eng.lstm-number-dawg
         1410  2019-03-05 15:25   data/eng.freq-dawg
          844  2019-03-05 15:25   data/eng.pffmtable
         6360  2019-03-05 15:25   data/eng.lstm-unicharset
         1012  2019-03-05 15:25   data/eng.lstm-recoder
         1047  2019-03-05 15:25   data/eng.unicharambigs
         4322  2019-03-05 15:25   data/eng.lstm-punc-dawg
     16109842  2019-03-05 15:25   data/eng.bigram-dawg
           80  2019-03-05 15:25   data/eng.version
         6426  2019-03-05 15:25   data/eng.number-dawg
      3694794  2019-03-05 15:25   data/eng.lstm-word-dawg
    ---------                     -------
     23468070                     21 files

`combine_tessdata -d` and `combine_tessdata -u` also work.

The traineddata files in the new format can be generated with
standard tools like zip or tar.

More work is needed for other training tools and big endian support.

Signed-off-by: Stefan Weil <[email protected]>
1 parent 7fbde96 commit 1c7e006

File tree

7 files changed

+95
-10
lines changed

7 files changed

+95
-10
lines changed

.travis.yml

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ addons:
2121
sources:
2222
#- ubuntu-toolchain-r-test
2323
packages:
24+
- libarchive-dev
2425
#- g++-6
2526

2627
#matrix:

configure.ac

+6
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,12 @@ else
422422
AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.])
423423
fi
424424
425+
PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false])
426+
AM_CONDITIONAL([HAVE_LIBARCHIVE], [$have_libarchive])
427+
if $have_libarchive; then
428+
AC_DEFINE([HAVE_LIBARCHIVE], [], [Enable libarchive])
429+
fi
430+
425431
AM_CONDITIONAL([ENABLE_TRAINING], true)
426432
427433
# Check availability of ICU packages.

src/api/Makefile.am

+1
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ tesseract_LDFLAGS = $(OPENCL_LDFLAGS)
8888

8989
tesseract_LDADD += $(LEPTONICA_LIBS)
9090
tesseract_LDADD += $(OPENMP_CXXFLAGS)
91+
tesseract_LDADD += $(libarchive_LIBS)
9192

9293
if T_WIN
9394
tesseract_LDADD += -ltiff

src/ccutil/Makefile.am

+2
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ libtesseract_ccutil_la_SOURCES = \
4040
unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
4141
params.cpp universalambigs.cpp
4242

43+
AM_CPPFLAGS += $(libarchive_CFLAGS)
44+
4345
if T_WIN
4446
AM_CPPFLAGS += -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
4547
endif

src/ccutil/tessdatamanager.cpp

+60-9
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
// File: tessdatamanager.cpp
33
// Description: Functions to handle loading/combining tesseract data files.
44
// Author: Daria Antonova
5-
// Created: Wed Jun 03 11:26:43 PST 2009
65
//
76
// (C) Copyright 2009, Google Inc.
87
// Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,6 +23,12 @@
2423
#include "tessdatamanager.h"
2524

2625
#include <cstdio>
26+
#include <string>
27+
28+
#if defined(HAVE_LIBARCHIVE)
29+
#include <archive.h>
30+
#include <archive_entry.h>
31+
#endif
2732

2833
#include "errcode.h"
2934
#include "helpers.h"
@@ -52,9 +57,49 @@ void TessdataManager::LoadFileLater(const char *data_file_name) {
5257
data_file_name_ = data_file_name;
5358
}
5459

60+
#if defined(HAVE_LIBARCHIVE)
// Loads tessdata components from an archive file using libarchive.
//
// All filters and formats known to libarchive are enabled. Each archive
// entry whose path name is recognized by TessdataTypeFromFileName() and
// whose size is positive is read into the matching slot of entries_.
//
// Returns true only if at least one component was read completely from
// this archive; is_loaded_ is set in that case. Returns false if the
// reader cannot be created, the file cannot be opened, or no usable
// component was found, so that the caller can try another loader.
bool TessdataManager::LoadArchiveFile(const char *filename) {
  archive *a = archive_read_new();
  if (a == nullptr) {
    return false;
  }
  archive_read_support_filter_all(a);
  archive_read_support_format_all(a);

  // Success is tracked locally. is_loaded_ may still be true from an
  // earlier load and must not be mistaken for the result of this call.
  bool loaded_any = false;
  if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
    archive_entry *ae;
    while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
      const char *component = archive_entry_pathname(ae);
      if (component == nullptr) {
        continue;
      }
      TessdataType type;
      if (!TessdataTypeFromFileName(component, &type)) {
        // Not a tessdata component (for example a manifest); ignore it.
        continue;
      }
      const int64_t size = archive_entry_size(ae);
      if (size <= 0) {
        // Directories and empty entries carry no component data.
        continue;
      }
      entries_[type].resize_no_init(size);
      if (archive_read_data(a, &entries_[type][0], size) == size) {
        loaded_any = true;
      } else {
        // The buffer was resized without initialization. Drop it so a
        // failed or short read does not look like a valid component.
        entries_[type].clear();
      }
    }
  }
#if defined(DEBUG)
  else {
    // archive_error_string() describes libarchive failures; the codes
    // returned by archive_errno() are not always valid errno values.
    const char *message = archive_error_string(a);
    tprintf("archive_read_open_filename(...,%s,...) failed, %s\n",
            filename, message != nullptr ? message : "unknown error");
  }
#endif
  archive_read_free(a);

  if (loaded_any) {
    is_loaded_ = true;
  }
  return loaded_any;
}
#endif
96+
5597
bool TessdataManager::Init(const char *data_file_name) {
5698
GenericVector<char> data;
5799
if (reader_ == nullptr) {
100+
#if defined(HAVE_LIBARCHIVE)
101+
if (LoadArchiveFile(data_file_name)) return true;
102+
#endif
58103
if (!LoadDataFromFile(data_file_name, &data)) return false;
59104
} else {
60105
if (!(*reader_)(data_file_name, &data)) return false;
@@ -65,6 +110,7 @@ bool TessdataManager::Init(const char *data_file_name) {
65110
// Loads from the given memory buffer as if a file.
66111
bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
67112
int size) {
113+
// TODO: This method supports only the proprietary file format.
68114
Clear();
69115
data_file_name_ = name;
70116
TFile fp;
@@ -78,10 +124,10 @@ bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
78124
GenericVector<int64_t> offset_table;
79125
offset_table.resize_no_init(num_entries);
80126
if (!fp.DeSerialize(&offset_table[0], num_entries)) return false;
81-
for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
127+
for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
82128
if (offset_table[i] >= 0) {
83129
int64_t entry_size = size - offset_table[i];
84-
int j = i + 1;
130+
unsigned j = i + 1;
85131
while (j < num_entries && offset_table[j] == -1) ++j;
86132
if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
87133
entries_[i].resize_no_init(entry_size);
@@ -106,6 +152,7 @@ void TessdataManager::OverwriteEntry(TessdataType type, const char *data,
106152
// Saves to the given filename.
107153
bool TessdataManager::SaveFile(const STRING &filename,
108154
FileWriter writer) const {
155+
// TODO: This method supports only the proprietary file format.
109156
ASSERT_HOST(is_loaded_);
110157
GenericVector<char> data;
111158
Serialize(&data);
@@ -117,11 +164,12 @@ bool TessdataManager::SaveFile(const STRING &filename,
117164

118165
// Serializes to the given vector.
119166
void TessdataManager::Serialize(GenericVector<char> *data) const {
167+
// TODO: This method supports only the proprietary file format.
120168
ASSERT_HOST(is_loaded_);
121169
// Compute the offset_table and total size.
122170
int64_t offset_table[TESSDATA_NUM_ENTRIES];
123171
int64_t offset = sizeof(int32_t) + sizeof(offset_table);
124-
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
172+
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
125173
if (entries_[i].empty()) {
126174
offset_table[i] = -1;
127175
} else {
@@ -135,7 +183,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {
135183
fp.OpenWrite(data);
136184
fp.Serialize(&num_entries);
137185
fp.Serialize(&offset_table[0], countof(offset_table));
138-
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
186+
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
139187
if (!entries_[i].empty()) {
140188
fp.Serialize(&entries_[i][0], entries_[i].size());
141189
}
@@ -144,7 +192,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {
144192

145193
// Resets to the initial state, keeping the reader.
146194
void TessdataManager::Clear() {
147-
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
195+
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
148196
entries_[i].clear();
149197
}
150198
is_loaded_ = false;
@@ -154,7 +202,7 @@ void TessdataManager::Clear() {
154202
void TessdataManager::Directory() const {
155203
tprintf("Version string:%s\n", VersionString().c_str());
156204
int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
157-
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
205+
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
158206
if (!entries_[i].empty()) {
159207
tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
160208
entries_[i].size(), offset);
@@ -197,7 +245,7 @@ bool TessdataManager::CombineDataFiles(
197245
const char *language_data_path_prefix,
198246
const char *output_filename) {
199247
// Load individual tessdata components from files.
200-
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
248+
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
201249
TessdataType type;
202250
ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
203251
STRING filename = language_data_path_prefix;
@@ -229,6 +277,7 @@ bool TessdataManager::OverwriteComponents(
229277
char **component_filenames,
230278
int num_new_components) {
231279
// Open the files with the new components.
280+
// TODO: This method supports only the proprietary file format.
232281
for (int i = 0; i < num_new_components; ++i) {
233282
TessdataType type;
234283
if (TessdataTypeFromFileName(component_filenames[i], &type)) {
@@ -253,14 +302,16 @@ bool TessdataManager::ExtractToFile(const char *filename) {
253302

254303
bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix,
255304
TessdataType *type) {
256-
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
305+
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
257306
if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
258307
*type = static_cast<TessdataType>(i);
259308
return true;
260309
}
261310
}
311+
#if defined(DEBUG)
262312
tprintf("TessdataManager can't determine which tessdata"
263313
" component is represented by %s\n", suffix);
314+
#endif
264315
return false;
265316
}
266317

src/ccutil/tessdatamanager.h

+5-1
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,11 @@ class TessdataManager {
214214
*/
215215
bool ExtractToFile(const char *filename);
216216

217+
private:
218+
219+
// Use libarchive.
220+
bool LoadArchiveFile(const char *filename);
221+
217222
/**
218223
* Fills type with TessdataType of the tessdata component represented by the
219224
* given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET.
@@ -230,7 +235,6 @@ class TessdataManager {
230235
static bool TessdataTypeFromFileName(const char *filename,
231236
TessdataType *type);
232237

233-
private:
234238
// Name of file it came from.
235239
STRING data_file_name_;
236240
// Function to load the file when we need it.

src/training/Makefile.am

+20
Original file line numberDiff line numberDiff line change
@@ -280,3 +280,23 @@ set_unicharset_properties_LDADD += $(LEPTONICA_LIBS)
280280
text2image_LDADD += $(LEPTONICA_LIBS)
281281
unicharset_extractor_LDADD += $(LEPTONICA_LIBS)
282282
wordlist2dawg_LDADD += $(LEPTONICA_LIBS)
283+
284+
extralib = $(libarchive_LIBS)
285+
286+
if !DISABLED_LEGACY_ENGINE
287+
ambiguous_words_LDADD += $(extralib)
288+
classifier_tester_LDADD += $(extralib)
289+
cntraining_LDADD += $(extralib)
290+
mftraining_LDADD += $(extralib)
291+
shapeclustering_LDADD += $(extralib)
292+
endif
293+
combine_lang_model_LDADD += $(extralib)
294+
combine_tessdata_LDADD += $(extralib)
295+
dawg2wordlist_LDADD += $(extralib)
296+
lstmeval_LDADD += $(extralib)
297+
lstmtraining_LDADD += $(extralib)
298+
merge_unicharsets_LDADD += $(extralib)
299+
set_unicharset_properties_LDADD += $(extralib)
300+
text2image_LDADD += $(extralib)
301+
unicharset_extractor_LDADD += $(extralib)
302+
wordlist2dawg_LDADD += $(extralib)

0 commit comments

Comments
 (0)