Skip to content

Commit d97f67d

Browse files
committed
unittest: Fix and enable validate_grapheme_test
Signed-off-by: Stefan Weil <[email protected]>
1 parent a702f2d commit d97f67d

File tree

2 files changed

+52
-37
lines changed

2 files changed

+52
-37
lines changed

unittest/Makefile.am

+4
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ check_PROGRAMS += commandlineflags_test
135135
check_PROGRAMS += unichar_test
136136
check_PROGRAMS += unicharset_test
137137
check_PROGRAMS += unicharcompress_test
138+
check_PROGRAMS += validate_grapheme_test
138139
check_PROGRAMS += validator_test
139140
endif
140141

@@ -262,6 +263,9 @@ unicharcompress_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TES
262263
unicharset_test_SOURCES = unicharset_test.cc
263264
unicharset_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)
264265

266+
validate_grapheme_test_SOURCES = validate_grapheme_test.cc
267+
validate_grapheme_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)
268+
265269
validator_test_SOURCES = validator_test.cc
266270
validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)
267271

unittest/validate_grapheme_test.cc

+48-37
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,38 @@
1-
#include "tesseract/training/normstrngs.h"
1+
// (C) Copyright 2017, Google Inc.
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
// http://www.apache.org/licenses/LICENSE-2.0
6+
// Unless required by applicable law or agreed to in writing, software
7+
// distributed under the License is distributed on an "AS IS" BASIS,
8+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
// See the License for the specific language governing permissions and
10+
// limitations under the License.
211

3-
#include "tesseract/unittest/normstrngs_test.h"
12+
#include "include_gunit.h"
13+
#include "normstrngs.h"
14+
#include "normstrngs_test.h"
415

516
namespace tesseract {
617
namespace {
718

819
TEST(ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) {
9-
string str = "\u0c15\u0c3f\u0c15\u0c0e"; // KA - dep I - KA - ind E.
10-
std::vector<string> glyphs;
20+
std::string str = "\u0c15\u0c3f\u0c15\u0c0e"; // KA - dep I - KA - ind E.
21+
std::vector<std::string> glyphs;
1122
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
1223
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
1324
str.c_str(), &glyphs))
1425
<< PrintString32WithUnicodes(str);
1526
// It made 3 graphemes.
1627
EXPECT_EQ(glyphs.size(), 3);
17-
EXPECT_EQ(glyphs[0], string("\u0c15\u0c3f"));
18-
EXPECT_EQ(glyphs[1], string("\u0c15"));
19-
EXPECT_EQ(glyphs[2], string("\u0c0e"));
28+
EXPECT_EQ(glyphs[0], std::string("\u0c15\u0c3f"));
29+
EXPECT_EQ(glyphs[1], std::string("\u0c15"));
30+
EXPECT_EQ(glyphs[2], std::string("\u0c0e"));
2031
}
2132

2233
TEST(ValidateGraphemeTest, SingleConsonantOK) {
23-
string str = "\u0cb9"; // HA
24-
std::vector<string> glyphs;
34+
std::string str = "\u0cb9"; // HA
35+
std::vector<std::string> glyphs;
2536
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
2637
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
2738
str.c_str(), &glyphs))
@@ -31,8 +42,8 @@ TEST(ValidateGraphemeTest, SingleConsonantOK) {
3142
}
3243

3344
TEST(ValidateGraphemeTest, SimpleCV) {
34-
string str = "\u0cb9\u0cbf"; // HA I
35-
std::vector<string> glyphs;
45+
std::string str = "\u0cb9\u0cbf"; // HA I
46+
std::vector<std::string> glyphs;
3647
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
3748
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
3849
str.c_str(), &glyphs))
@@ -42,8 +53,8 @@ TEST(ValidateGraphemeTest, SimpleCV) {
4253
}
4354

4455
TEST(ValidateGraphemeTest, SubscriptConjunct) {
45-
string str = "\u0cb9\u0ccd\u0c95\u0cbf"; // HA Virama KA I
46-
std::vector<string> glyphs;
56+
std::string str = "\u0cb9\u0ccd\u0c95\u0cbf"; // HA Virama KA I
57+
std::vector<std::string> glyphs;
4758
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
4859
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
4960
str.c_str(), &glyphs))
@@ -55,12 +66,12 @@ TEST(ValidateGraphemeTest, SubscriptConjunct) {
5566
true, str.c_str(), &glyphs))
5667
<< PrintString32WithUnicodes(str);
5768
EXPECT_EQ(glyphs.size(), 3);
58-
EXPECT_EQ(glyphs[1], string("\u0ccd\u0c95"));
69+
EXPECT_EQ(glyphs[1], std::string("\u0ccd\u0c95"));
5970
}
6071

6172
TEST(ValidateGraphemeTest, HalfFormJoiner) {
62-
string str = "\u0d15\u0d4d\u200d\u0d24"; // KA Virama ZWJ Ta
63-
std::vector<string> glyphs;
73+
std::string str = "\u0d15\u0d4d\u200d\u0d24"; // KA Virama ZWJ Ta
74+
std::vector<std::string> glyphs;
6475
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
6576
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
6677
str.c_str(), &glyphs))
@@ -72,12 +83,12 @@ TEST(ValidateGraphemeTest, HalfFormJoiner) {
7283
true, str.c_str(), &glyphs))
7384
<< PrintString32WithUnicodes(str);
7485
EXPECT_EQ(glyphs.size(), 2) << PrintStringVectorWithUnicodes(glyphs);
75-
EXPECT_EQ(glyphs[0], string("\u0d15\u0d4d\u200d"));
86+
EXPECT_EQ(glyphs[0], std::string("\u0d15\u0d4d\u200d"));
7687
}
7788

7889
TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) {
79-
string str = "\u0d15\u200d\u0d4d\u0d24"; // KA ZWI Virama Ta
80-
std::vector<string> glyphs;
90+
std::string str = "\u0d15\u200d\u0d4d\u0d24";  // KA ZWJ Virama Ta
91+
std::vector<std::string> glyphs;
8192
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
8293
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
8394
str.c_str(), &glyphs))
@@ -89,12 +100,12 @@ TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) {
89100
true, str.c_str(), &glyphs))
90101
<< PrintString32WithUnicodes(str);
91102
EXPECT_EQ(glyphs.size(), 3);
92-
EXPECT_EQ(glyphs[1], string("\u200d\u0d4d"));
103+
EXPECT_EQ(glyphs[1], std::string("\u200d\u0d4d"));
93104
}
94105

95106
TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {
96-
string str = "\u0d15\u200c\u0d4d\u0d24"; // KA ZWNJ Virama Ta
97-
std::vector<string> glyphs;
107+
std::string str = "\u0d15\u200c\u0d4d\u0d24"; // KA ZWNJ Virama Ta
108+
std::vector<std::string> glyphs;
98109
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
99110
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
100111
str.c_str(), &glyphs))
@@ -106,7 +117,7 @@ TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {
106117
true, str.c_str(), &glyphs))
107118
<< PrintString32WithUnicodes(str);
108119
EXPECT_EQ(glyphs.size(), 3);
109-
EXPECT_EQ(glyphs[1], string("\u200c\u0d4d"));
120+
EXPECT_EQ(glyphs[1], std::string("\u200c\u0d4d"));
110121
// Malayalam only, so not allowed in Telugu.
111122
str = "\u0c15\u200c\u0c4d\u0c24"; // KA ZWNJ Virama Ta
112123
EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(
@@ -116,26 +127,26 @@ TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {
116127
}
117128

118129
TEST(ValidateGraphemeTest, ExplicitViramaNonJoiner) {
119-
string str = "\u0d15\u0d4d\u200c\u0d24"; // KA Virama ZWNJ Ta
120-
std::vector<string> glyphs;
130+
std::string str = "\u0d15\u0d4d\u200c\u0d24"; // KA Virama ZWNJ Ta
131+
std::vector<std::string> glyphs;
121132
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
122133
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
123134
str.c_str(), &glyphs))
124135
<< PrintString32WithUnicodes(str);
125136
EXPECT_EQ(glyphs.size(), 2);
126-
EXPECT_EQ(glyphs[1], string("\u0d24"));
137+
EXPECT_EQ(glyphs[1], std::string("\u0d24"));
127138
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
128139
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
129140
true, str.c_str(), &glyphs))
130141
<< PrintString32WithUnicodes(str);
131142
EXPECT_EQ(glyphs.size(), 3);
132-
EXPECT_EQ(glyphs[1], string("\u0d4d\u200c"));
143+
EXPECT_EQ(glyphs[1], std::string("\u0d4d\u200c"));
133144
}
134145

135146
TEST(ValidateGraphemeTest, ThaiGraphemes) {
136147
// This is a single grapheme unless in glyph split mode
137-
string str = "\u0e14\u0e38\u0e4a";
138-
std::vector<string> glyphs;
148+
std::string str = "\u0e14\u0e38\u0e4a";
149+
std::vector<std::string> glyphs;
139150
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
140151
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
141152
str.c_str(), &glyphs))
@@ -147,23 +158,23 @@ TEST(ValidateGraphemeTest, ThaiGraphemes) {
147158
true, str.c_str(), &glyphs))
148159
<< PrintString32WithUnicodes(str);
149160
EXPECT_EQ(glyphs.size(), 3);
150-
EXPECT_EQ(glyphs[0], string("\u0e14"));
161+
EXPECT_EQ(glyphs[0], std::string("\u0e14"));
151162
}
152163

153164
TEST(ValidateGraphemeTest, NoLonelyJoinersQuote) {
154-
string str = "'\u0d24\u0d23\u0d32\u0d4d'\u200d";
155-
std::vector<string> glyphs;
165+
std::string str = "'\u0d24\u0d23\u0d32\u0d4d'\u200d";
166+
std::vector<std::string> glyphs;
156167
// Returns true, but the joiner is gone.
157168
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
158169
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
159170
str.c_str(), &glyphs))
160171
<< PrintString32WithUnicodes(str);
161172
EXPECT_EQ(glyphs.size(), 5);
162-
EXPECT_EQ(glyphs[0], string("'"));
163-
EXPECT_EQ(glyphs[1], string("\u0d24"));
164-
EXPECT_EQ(glyphs[2], string("\u0d23"));
165-
EXPECT_EQ(glyphs[3], string("\u0d32\u0d4d\u200c"));
166-
EXPECT_EQ(glyphs[4], string("'"));
173+
EXPECT_EQ(glyphs[0], std::string("'"));
174+
EXPECT_EQ(glyphs[1], std::string("\u0d24"));
175+
EXPECT_EQ(glyphs[2], std::string("\u0d23"));
176+
EXPECT_EQ(glyphs[3], std::string("\u0d32\u0d4d\u200c"));
177+
EXPECT_EQ(glyphs[4], std::string("'"));
167178
}
168179

169180
} // namespace

0 commit comments

Comments
 (0)