Skip to content

Commit f93fb9d

Browse files
committed
unittest: Add lang_model_test (only works partially)
The test currently has subtests which fail because of missing files. Signed-off-by: Stefan Weil <[email protected]>
1 parent de6a759 commit f93fb9d

File tree

3 files changed

+52
-18
lines changed

3 files changed

+52
-18
lines changed

unittest/Makefile.am

+10-3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Absolute path of directory 'langdata'.
2+
LANGDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/langdata
3+
14
# Absolute path of directory 'tessdata' with traineddata files
25
# (must be on same level as top source directory).
36
TESSDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/tessdata
@@ -6,6 +9,7 @@ TESSDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/tessdata
69
# (using submodule test).
710
TESTING_DIR=$(shell cd $(top_srcdir) && pwd)/test/testing
811

12+
AM_CPPFLAGS += -DLANGDATA_DIR="\"$(LANGDATA_DIR)\""
913
AM_CPPFLAGS += -DTESSDATA_DIR="\"$(TESSDATA_DIR)\""
1014
AM_CPPFLAGS += -DTESTING_DIR="\"$(TESTING_DIR)\""
1115
AM_CPPFLAGS += -DPANGO_ENABLE_ENGINE
@@ -83,7 +87,6 @@ GMOCK_LIBS = libgmock.la libgmock_main.la
8387
TESS_LIBS = $(top_builddir)/src/api/libtesseract.la
8488
TRAINING_LIBS = $(top_builddir)/src/training/libtesseract_training.la
8589
TRAINING_LIBS += $(top_builddir)/src/training/libtesseract_tessopt.la
86-
TRAINING_LIBS += $(ICU_UC_LIBS)
8790
AM_CPPFLAGS += -isystem $(top_srcdir)/googletest/googletest/include \
8891
-isystem $(top_srcdir)/googletest/googlemock/include
8992

@@ -101,6 +104,7 @@ check_PROGRAMS = \
101104
indexmapbidi_test \
102105
intfeaturemap_test \
103106
intsimdmatrix_test \
107+
lang_model_test \
104108
linlsq_test \
105109
loadlang_test \
106110
matrix_test \
@@ -149,7 +153,7 @@ colpartition_test_SOURCES = colpartition_test.cc
149153
colpartition_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
150154

151155
commandlineflags_test_SOURCES = commandlineflags_test.cc
152-
commandlineflags_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS)
156+
commandlineflags_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)
153157

154158
denorm_test_SOURCES = denorm_test.cc
155159
denorm_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
@@ -172,6 +176,9 @@ intfeaturemap_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
172176
intsimdmatrix_test_SOURCES = intsimdmatrix_test.cc
173177
intsimdmatrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
174178

179+
lang_model_test_SOURCES = lang_model_test.cc
180+
lang_model_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)
181+
175182
linlsq_test_SOURCES = linlsq_test.cc
176183
linlsq_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
177184

@@ -222,7 +229,7 @@ tfile_test_SOURCES = tfile_test.cc
222229
tfile_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
223230

224231
validator_test_SOURCES = validator_test.cc
225-
validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS)
232+
validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)
226233

227234
# for windows
228235
if T_WIN

unittest/include_gunit.h

+9
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,15 @@ class file : public tesseract::File {
2828
static int Defaults() {
2929
return 0;
3030
}
31+
32+
static std::string JoinPath(const std::string& s1, const std::string& s2) {
33+
return tesseract::File::JoinPath(s1, s2);
34+
}
35+
36+
static std::string JoinPath(const std::string& s1, const std::string& s2,
37+
const std::string& s3) {
38+
return JoinPath(JoinPath(s1, s2), s3);
39+
}
3140
};
3241

3342
#if !defined(ABSL_ARRAYSIZE)

unittest/lang_model_test.cc

+33-15
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,31 @@
1-
#include "tesseract/training/lang_model_helpers.h"
1+
// (C) Copyright 2017, Google Inc.
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
// http://www.apache.org/licenses/LICENSE-2.0
6+
// Unless required by applicable law or agreed to in writing, software
7+
// distributed under the License is distributed on an "AS IS" BASIS,
8+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
// See the License for the specific language governing permissions and
10+
// limitations under the License.
211

3-
#include "tesseract/lstm/lstmtrainer.h"
4-
#include "tesseract/training/unicharset_training_utils.h"
12+
#include <string> // for std::string
13+
14+
#include "absl/strings/str_cat.h"
15+
16+
#include "gmock/gmock.h" // for testing::ElementsAreArray
17+
18+
#include "include_gunit.h"
19+
#include "lang_model_helpers.h"
20+
#include "log.h" // for LOG
21+
#include "lstmtrainer.h"
22+
#include "unicharset_training_utils.h"
523

624
namespace tesseract {
725
namespace {
826

9-
string TestDataNameToPath(const string& name) {
10-
return file::JoinPath(FLAGS_test_srcdir, "testdata", name);
27+
std::string TestDataNameToPath(const std::string& name) {
28+
return file::JoinPath(TESTING_DIR, name);
1129
}
1230

1331
// This is an integration test that verifies that CombineLangModel works to
@@ -18,15 +36,15 @@ TEST(LangModelTest, AddACharacter) {
1836
constexpr char kTestString[] = "Simple ASCII string to encode !@#$%&";
1937
constexpr char kTestStringRupees[] = "ASCII string with Rupee symbol ₹";
2038
// Setup the arguments.
21-
string script_dir = file::JoinPath(FLAGS_test_srcdir, "langdata");
22-
string eng_dir = file::JoinPath(script_dir, "eng");
23-
string unicharset_path = TestDataNameToPath("eng_beam.unicharset");
39+
std::string script_dir = LANGDATA_DIR;
40+
std::string eng_dir = file::JoinPath(script_dir, "eng");
41+
std::string unicharset_path = TestDataNameToPath("eng_beam.unicharset");
2442
UNICHARSET unicharset;
2543
EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));
26-
string version_str = "TestVersion";
27-
string output_dir = FLAGS_test_tmpdir;
44+
std::string version_str = "TestVersion";
45+
std::string output_dir = FLAGS_test_tmpdir;
2846
LOG(INFO) << "Output dir=" << output_dir;
29-
string lang1 = "eng";
47+
std::string lang1 = "eng";
3048
bool pass_through_recoder = false;
3149
GenericVector<STRING> words, puncs, numbers;
3250
// If these reads fail, we get a warning message and an empty list of words.
@@ -44,7 +62,7 @@ TEST(LangModelTest, AddACharacter) {
4462
lang1, pass_through_recoder, words, puncs,
4563
numbers, lang_is_rtl, nullptr, nullptr));
4664
// Init a trainer with it, and encode a string.
47-
string traineddata1 =
65+
std::string traineddata1 =
4866
file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
4967
LSTMTrainer trainer1;
5068
trainer1.InitCharSet(traineddata1);
@@ -58,13 +76,13 @@ TEST(LangModelTest, AddACharacter) {
5876
&unicharset);
5977
EXPECT_EQ(size_before + 1, unicharset.size());
6078
// Generate the traineddata file.
61-
string lang2 = "extended";
79+
std::string lang2 = "extended";
6280
EXPECT_EQ(EXIT_SUCCESS,
6381
CombineLangModel(unicharset, script_dir, version_str, output_dir,
6482
lang2, pass_through_recoder, words, puncs, numbers,
6583
lang_is_rtl, nullptr, nullptr));
6684
// Init a trainer with it, and encode a string.
67-
string traineddata2 =
85+
std::string traineddata2 =
6886
file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
6987
LSTMTrainer trainer2;
7088
trainer2.InitCharSet(traineddata2);
@@ -86,7 +104,7 @@ TEST(LangModelTest, AddACharacter) {
86104
}
87105
EXPECT_THAT(labels1_v,
88106
testing::ElementsAreArray(&labels2[0], labels2.size()));
89-
// To make sure we weren't cheating somehow, we can now encode the Rupee
107+
// To make sure we are not cheating somehow, we can now encode the Rupee
90108
// symbol, which we could not do before.
91109
EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
92110
EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));

0 commit comments

Comments
 (0)