Skip to content

Commit bbd3626

Browse files
committed
unittest: Fix and enable normstrngs_test
Signed-off-by: Stefan Weil <[email protected]>
1 parent 73e5241 commit bbd3626

File tree

2 files changed

+78
-51
lines changed

2 files changed

+78
-51
lines changed

unittest/Makefile.am

+4-1
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ check_PROGRAMS += loadlang_test
128128
check_PROGRAMS += mastertrainer_test
129129
check_PROGRAMS += matrix_test
130130
# check_PROGRAMS += networkio_test
131-
# check_PROGRAMS += normstrngs_test
132131
check_PROGRAMS += nthitem_test
133132
check_PROGRAMS += osd_test
134133
# check_PROGRAMS += pagesegmode_test
@@ -158,6 +157,7 @@ check_PROGRAMS += lstm_recode_test
158157
check_PROGRAMS += lstm_squashed_test
159158
check_PROGRAMS += lstm_test
160159
check_PROGRAMS += lstmtrainer_test
160+
check_PROGRAMS += normstrngs_test
161161
check_PROGRAMS += unichar_test
162162
check_PROGRAMS += unicharcompress_test
163163
check_PROGRAMS += unicharset_test
@@ -271,6 +271,9 @@ mastertrainer_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_
271271
matrix_test_SOURCES = matrix_test.cc
272272
matrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
273273

274+
normstrngs_test_SOURCES = normstrngs_test.cc
275+
normstrngs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)
276+
274277
nthitem_test_SOURCES = nthitem_test.cc
275278
nthitem_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
276279

unittest/normstrngs_test.cc

+74-50
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,39 @@
1-
#include "tesseract/training/normstrngs.h"
2-
3-
#include "tesseract/ccutil/strngs.h"
4-
#include "tesseract/ccutil/unichar.h"
5-
#include "tesseract/unittest/normstrngs_test.h"
6-
#include "util/utf8/public/unilib.h"
1+
// (C) Copyright 2017, Google Inc.
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
// http://www.apache.org/licenses/LICENSE-2.0
6+
// Unless required by applicable law or agreed to in writing, software
7+
// distributed under the License is distributed on an "AS IS" BASIS,
8+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
// See the License for the specific language governing permissions and
10+
// limitations under the License.
11+
12+
#include "absl/strings/str_format.h" // for absl::StrFormat
13+
#include "include_gunit.h"
14+
#include "normstrngs.h"
15+
#include "normstrngs_test.h"
16+
#include "strngs.h"
17+
#include "unichar.h"
18+
#if defined(HAS_UNILIB_H)
19+
#include "unilib.h"
20+
#endif
21+
22+
#include "include_gunit.h"
723

824
namespace tesseract {
925
namespace {
1026

11-
static string EncodeAsUTF8(const char32 ch32) {
27+
#if defined(HAS_UNILIB_H)
28+
static std::string EncodeAsUTF8(const char32 ch32) {
1229
UNICHAR uni_ch(ch32);
13-
return string(uni_ch.utf8(), uni_ch.utf8_len());
30+
return std::string(uni_ch.utf8(), uni_ch.utf8_len());
1431
}
32+
#endif
1533

1634
TEST(NormstrngsTest, BasicText) {
1735
const char* kBasicText = "AbCd Ef";
18-
string result;
36+
std::string result;
1937
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
2038
GraphemeNorm::kNormalize, kBasicText,
2139
&result));
@@ -24,7 +42,7 @@ TEST(NormstrngsTest, BasicText) {
2442

2543
TEST(NormstrngsTest, LigatureText) {
2644
const char* kTwoByteLigText = "ij"; // U+0133 (ij) -> ij
27-
string result;
45+
std::string result;
2846
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
2947
GraphemeNorm::kNormalize, kTwoByteLigText,
3048
&result));
@@ -39,7 +57,7 @@ TEST(NormstrngsTest, LigatureText) {
3957

4058
TEST(NormstrngsTest, OcrSpecificNormalization) {
4159
const char* kSingleQuoteText = "‘Hi"; // U+2018 (‘) -> U+0027 (')
42-
string result;
60+
std::string result;
4361
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
4462
GraphemeNorm::kNormalize, kSingleQuoteText,
4563
&result));
@@ -80,7 +98,7 @@ const char* kBadlyFormedHinWords[] = {"उपयोक्ताो", "नही
8098
const char* kBadlyFormedThaiWords[] = {"ฤิ", "กา้ํ", "กิำ", "นำ้", "เเก"};
8199

82100
TEST(NormstrngsTest, DetectsCorrectText) {
83-
string chars;
101+
std::string chars;
84102
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
85103
GraphemeNorm::kNormalize, kEngText, &chars));
86104
EXPECT_STREQ(kEngText, chars.c_str());
@@ -96,13 +114,13 @@ TEST(NormstrngsTest, DetectsCorrectText) {
96114
}
97115

98116
TEST(NormstrngsTest, DetectsIncorrectText) {
99-
for (int i = 0; i < ARRAYSIZE(kBadlyFormedHinWords); ++i) {
117+
for (size_t i = 0; i < ARRAYSIZE(kBadlyFormedHinWords); ++i) {
100118
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
101119
GraphemeNorm::kNormalize,
102120
kBadlyFormedHinWords[i], nullptr))
103121
<< kBadlyFormedHinWords[i];
104122
}
105-
for (int i = 0; i < ARRAYSIZE(kBadlyFormedThaiWords); ++i) {
123+
for (size_t i = 0; i < ARRAYSIZE(kBadlyFormedThaiWords); ++i) {
106124
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
107125
GraphemeNorm::kNormalize,
108126
kBadlyFormedThaiWords[i], nullptr))
@@ -111,8 +129,8 @@ TEST(NormstrngsTest, DetectsIncorrectText) {
111129
}
112130

113131
TEST(NormstrngsTest, NonIndicTextDoesntBreakIndicRules) {
114-
string nonindic = "Here's some latin text.";
115-
string dest;
132+
std::string nonindic = "Here's some latin text.";
133+
std::string dest;
116134
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
117135
GraphemeNorm::kNormalize, nonindic.c_str(),
118136
&dest))
@@ -121,59 +139,59 @@ TEST(NormstrngsTest, NonIndicTextDoesntBreakIndicRules) {
121139
}
122140

123141
TEST(NormstrngsTest, NoLonelyJoiners) {
124-
string str = "x\u200d\u0d06\u0d34\u0d02";
125-
std::vector<string> glyphs;
142+
std::string str = "x\u200d\u0d06\u0d34\u0d02";
143+
std::vector<std::string> glyphs;
126144
// Returns true, but the joiner is gone.
127145
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
128146
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
129147
str.c_str(), &glyphs))
130148
<< PrintString32WithUnicodes(str);
131149
EXPECT_EQ(glyphs.size(), 3);
132-
EXPECT_EQ(glyphs[0], string("x"));
133-
EXPECT_EQ(glyphs[1], string("\u0d06"));
134-
EXPECT_EQ(glyphs[2], string("\u0d34\u0d02"));
150+
EXPECT_EQ(glyphs[0], std::string("x"));
151+
EXPECT_EQ(glyphs[1], std::string("\u0d06"));
152+
EXPECT_EQ(glyphs[2], std::string("\u0d34\u0d02"));
135153
}
136154

137155
TEST(NormstrngsTest, NoLonelyJoinersPlus) {
138-
string str = "\u0d2a\u200d+\u0d2a\u0d4b";
139-
std::vector<string> glyphs;
156+
std::string str = "\u0d2a\u200d+\u0d2a\u0d4b";
157+
std::vector<std::string> glyphs;
140158
// Returns true, but the joiner is gone.
141159
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
142160
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
143161
str.c_str(), &glyphs))
144162
<< PrintString32WithUnicodes(str);
145163
EXPECT_EQ(glyphs.size(), 3);
146-
EXPECT_EQ(glyphs[0], string("\u0d2a"));
147-
EXPECT_EQ(glyphs[1], string("+"));
148-
EXPECT_EQ(glyphs[2], string("\u0d2a\u0d4b"));
164+
EXPECT_EQ(glyphs[0], std::string("\u0d2a"));
165+
EXPECT_EQ(glyphs[1], std::string("+"));
166+
EXPECT_EQ(glyphs[2], std::string("\u0d2a\u0d4b"));
149167
}
150168

151169
TEST(NormstrngsTest, NoLonelyJoinersNonAlpha) {
152-
string str = "\u200d+\u200c\u200d";
170+
std::string str = "\u200d+\u200c\u200d";
153171
// Returns true, but the joiners are gone.
154-
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, string("+"));
172+
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, std::string("+"));
155173
str = "\u200d\u200c\u200d";
156174
// Without the plus, the string is invalid.
157-
string result;
175+
std::string result;
158176
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
159177
GraphemeNorm::kNormalize, str.c_str(),
160178
&result))
161179
<< PrintString32WithUnicodes(result);
162180
}
163181

164182
TEST(NormstrngsTest, JoinersStayInArabic) {
165-
string str = "\u0628\u200c\u0628\u200d\u0628";
183+
std::string str = "\u0628\u200c\u0628\u200d\u0628";
166184
// Returns true, string untouched.
167185
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 5, 5, 2, str);
168186
}
169187

170188
TEST(NormstrngsTest, DigitOK) {
171-
string str = "\u0cea"; // Digit 4.
189+
std::string str = "\u0cea"; // Digit 4.
172190
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str);
173191
}
174192

175193
TEST(NormstrngsTest, DandaOK) {
176-
string str = "\u0964"; // Single danda.
194+
std::string str = "\u0964"; // Single danda.
177195
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str);
178196
str = "\u0965"; // Double danda.
179197
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str);
@@ -182,7 +200,7 @@ TEST(NormstrngsTest, DandaOK) {
182200
TEST(NormstrngsTest, AllScriptsRegtest) {
183201
// Tests some valid text in a large number of scripts, some of which were
184202
// found to be rejected by an earlier version.
185-
const std::vector<std::pair<string, string>> kScriptText(
203+
const std::vector<std::pair<std::string, std::string>> kScriptText(
186204
{{"Arabic",
187205
" فكان منهم علقمة بن قيس ، و إبراهيم النخعي ، و الأسود بن"
188206
"توفي بالمدينة في هذه السنة وهو ابن مائة وعشرين سنة "
@@ -297,7 +315,7 @@ TEST(NormstrngsTest, AllScriptsRegtest) {
297315
"Cặp câu đói súc tích mà sâu sắc, là lời chúc lời"}});
298316

299317
for (const auto& p : kScriptText) {
300-
string normalized;
318+
std::string normalized;
301319
EXPECT_TRUE(tesseract::NormalizeUTF8String(
302320
tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize,
303321
tesseract::GraphemeNorm::kNormalize, p.second.c_str(), &normalized))
@@ -313,7 +331,7 @@ TEST(NormstrngsTest, IsWhitespace) {
313331
EXPECT_TRUE(IsWhitespace('\n'));
314332
// U+2000 through U+200A
315333
for (char32 ch = 0x2000; ch <= 0x200A; ++ch) {
316-
SCOPED_TRACE(StringPrintf("Failed at U+%x", ch));
334+
SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
317335
EXPECT_TRUE(IsWhitespace(ch));
318336
}
319337
// U+3000 is whitespace
@@ -345,29 +363,33 @@ TEST(NormstrngsTest, SpanUTF8NotWhitespace) {
345363
EXPECT_EQ(12, SpanUTF8NotWhitespace(kMixedText));
346364
}
347365

366+
#if defined(HAS_UNILIB_H)
348367
// Test that the method clones the util/utf8/public/unilib definition of
349368
// interchange validity.
350369
TEST(NormstrngsTest, IsInterchangeValid) {
351-
const int32 kMinUnicodeValue = 33;
352-
const int32 kMaxUnicodeValue = 0x10FFFF;
353-
for (int32 ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
354-
SCOPED_TRACE(StringPrintf("Failed at U+%x", ch));
370+
const int32_t kMinUnicodeValue = 33;
371+
const int32_t kMaxUnicodeValue = 0x10FFFF;
372+
for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
373+
SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
355374
EXPECT_EQ(UniLib::IsInterchangeValid(ch), IsInterchangeValid(ch));
356375
}
357376
}
377+
#endif
358378

379+
#if defined(HAS_UNILIB_H)
359380
// Test that the method clones the util/utf8/public/unilib definition of
360381
// 7-bit ASCII interchange validity.
361382
TEST(NormstrngsTest, IsInterchangeValid7BitAscii) {
362-
const int32 kMinUnicodeValue = 33;
363-
const int32 kMaxUnicodeValue = 0x10FFFF;
364-
for (int32 ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
365-
SCOPED_TRACE(StringPrintf("Failed at U+%x", ch));
366-
string str = EncodeAsUTF8(ch);
383+
const int32_t kMinUnicodeValue = 33;
384+
const int32_t kMaxUnicodeValue = 0x10FFFF;
385+
for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
386+
SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
387+
std::string str = EncodeAsUTF8(ch);
367388
EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str),
368389
IsInterchangeValid7BitAscii(ch));
369390
}
370391
}
392+
#endif
371393

372394
// Test that the method clones the util/utf8/public/unilib definition of
373395
// fullwidth-to-halfwidth conversion.
@@ -379,16 +401,18 @@ TEST(NormstrngsTest, FullwidthToHalfwidth) {
379401
// U+FFE6 -> U+20A9 (won sign)
380402
EXPECT_EQ(0x20A9, FullwidthToHalfwidth(0xFFE6));
381403

382-
const int32 kMinUnicodeValue = 33;
383-
const int32 kMaxUnicodeValue = 0x10FFFF;
384-
for (int32 ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
404+
#if defined(HAS_UNILIB_H)
405+
const int32_t kMinUnicodeValue = 33;
406+
const int32_t kMaxUnicodeValue = 0x10FFFF;
407+
for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
385408
if (!IsValidCodepoint(ch)) continue;
386-
SCOPED_TRACE(StringPrintf("Failed at U+%x", ch));
387-
string str = EncodeAsUTF8(ch);
388-
const string expected_half_str =
409+
SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
410+
std::string str = EncodeAsUTF8(ch);
411+
const std::string expected_half_str =
389412
UniLib::FullwidthToHalfwidth(str.c_str(), str.length(), true);
390413
EXPECT_EQ(expected_half_str, EncodeAsUTF8(FullwidthToHalfwidth(ch)));
391414
}
415+
#endif
392416
}
393417

394418
} // namespace

0 commit comments

Comments
 (0)