1
- #include " tesseract/training/normstrngs.h"
2
-
3
- #include " tesseract/ccutil/strngs.h"
4
- #include " tesseract/ccutil/unichar.h"
5
- #include " tesseract/unittest/normstrngs_test.h"
6
- #include " util/utf8/public/unilib.h"
1
+ // (C) Copyright 2017, Google Inc.
2
+ // Licensed under the Apache License, Version 2.0 (the "License");
3
+ // you may not use this file except in compliance with the License.
4
+ // You may obtain a copy of the License at
5
+ // http://www.apache.org/licenses/LICENSE-2.0
6
+ // Unless required by applicable law or agreed to in writing, software
7
+ // distributed under the License is distributed on an "AS IS" BASIS,
8
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ // See the License for the specific language governing permissions and
10
+ // limitations under the License.
11
+
12
+ #include " absl/strings/str_format.h" // for absl::StrFormat
13
+ #include " include_gunit.h"
14
+ #include " normstrngs.h"
15
+ #include " normstrngs_test.h"
16
+ #include " strngs.h"
17
+ #include " unichar.h"
18
+ #if defined(HAS_UNILIB_H)
19
+ #include " unilib.h"
20
+ #endif
21
+
22
+ #include " include_gunit.h"
7
23
8
24
namespace tesseract {
9
25
namespace {
10
26
11
- static string EncodeAsUTF8 (const char32 ch32) {
27
+ #if defined(HAS_UNILIB_H)
28
+ static std::string EncodeAsUTF8 (const char32 ch32) {
12
29
UNICHAR uni_ch (ch32);
13
- return string (uni_ch.utf8 (), uni_ch.utf8_len ());
30
+ return std:: string (uni_ch.utf8 (), uni_ch.utf8_len ());
14
31
}
32
+ #endif
15
33
16
34
TEST (NormstrngsTest, BasicText) {
17
35
const char * kBasicText = " AbCd Ef" ;
18
- string result;
36
+ std:: string result;
19
37
EXPECT_TRUE (NormalizeUTF8String (UnicodeNormMode::kNFKC , OCRNorm::kNormalize ,
20
38
GraphemeNorm::kNormalize , kBasicText ,
21
39
&result));
@@ -24,7 +42,7 @@ TEST(NormstrngsTest, BasicText) {
24
42
25
43
TEST (NormstrngsTest, LigatureText) {
26
44
const char * kTwoByteLigText = " ij" ; // U+0133 (ij) -> ij
27
- string result;
45
+ std:: string result;
28
46
EXPECT_TRUE (NormalizeUTF8String (UnicodeNormMode::kNFKC , OCRNorm::kNormalize ,
29
47
GraphemeNorm::kNormalize , kTwoByteLigText ,
30
48
&result));
@@ -39,7 +57,7 @@ TEST(NormstrngsTest, LigatureText) {
39
57
40
58
TEST (NormstrngsTest, OcrSpecificNormalization) {
41
59
const char * kSingleQuoteText = " ‘Hi" ; // U+2018 (‘) -> U+027 (')
42
- string result;
60
+ std:: string result;
43
61
EXPECT_TRUE (NormalizeUTF8String (UnicodeNormMode::kNFKC , OCRNorm::kNormalize ,
44
62
GraphemeNorm::kNormalize , kSingleQuoteText ,
45
63
&result));
@@ -80,7 +98,7 @@ const char* kBadlyFormedHinWords[] = {"उपयोक्ताो", "नही
80
98
const char * kBadlyFormedThaiWords [] = {" ฤิ" , " กา้ํ" , " กิำ" , " นำ้" , " เเก" };
81
99
82
100
TEST (NormstrngsTest, DetectsCorrectText) {
83
- string chars;
101
+ std:: string chars;
84
102
EXPECT_TRUE (NormalizeUTF8String (UnicodeNormMode::kNFKC , OCRNorm::kNone ,
85
103
GraphemeNorm::kNormalize , kEngText , &chars));
86
104
EXPECT_STREQ (kEngText , chars.c_str ());
@@ -96,13 +114,13 @@ TEST(NormstrngsTest, DetectsCorrectText) {
96
114
}
97
115
98
116
TEST (NormstrngsTest, DetectsIncorrectText) {
99
- for (int i = 0 ; i < ARRAYSIZE (kBadlyFormedHinWords ); ++i) {
117
+ for (size_t i = 0 ; i < ARRAYSIZE (kBadlyFormedHinWords ); ++i) {
100
118
EXPECT_FALSE (NormalizeUTF8String (UnicodeNormMode::kNFKC , OCRNorm::kNone ,
101
119
GraphemeNorm::kNormalize ,
102
120
kBadlyFormedHinWords [i], nullptr ))
103
121
<< kBadlyFormedHinWords [i];
104
122
}
105
- for (int i = 0 ; i < ARRAYSIZE (kBadlyFormedThaiWords ); ++i) {
123
+ for (size_t i = 0 ; i < ARRAYSIZE (kBadlyFormedThaiWords ); ++i) {
106
124
EXPECT_FALSE (NormalizeUTF8String (UnicodeNormMode::kNFKC , OCRNorm::kNone ,
107
125
GraphemeNorm::kNormalize ,
108
126
kBadlyFormedThaiWords [i], nullptr ))
@@ -111,8 +129,8 @@ TEST(NormstrngsTest, DetectsIncorrectText) {
111
129
}
112
130
113
131
TEST (NormstrngsTest, NonIndicTextDoesntBreakIndicRules) {
114
- string nonindic = " Here's some latin text." ;
115
- string dest;
132
+ std:: string nonindic = " Here's some latin text." ;
133
+ std:: string dest;
116
134
EXPECT_TRUE (NormalizeUTF8String (UnicodeNormMode::kNFC , OCRNorm::kNone ,
117
135
GraphemeNorm::kNormalize , nonindic.c_str (),
118
136
&dest))
@@ -121,59 +139,59 @@ TEST(NormstrngsTest, NonIndicTextDoesntBreakIndicRules) {
121
139
}
122
140
123
141
TEST (NormstrngsTest, NoLonelyJoiners) {
124
- string str = " x\u200d\u0d06\u0d34\u0d02 " ;
125
- std::vector<string> glyphs;
142
+ std:: string str = " x\u200d\u0d06\u0d34\u0d02 " ;
143
+ std::vector<std:: string> glyphs;
126
144
// Returns true, but the joiner is gone.
127
145
EXPECT_TRUE (NormalizeCleanAndSegmentUTF8 (
128
146
UnicodeNormMode::kNFC , OCRNorm::kNone , GraphemeNormMode::kCombined , true ,
129
147
str.c_str (), &glyphs))
130
148
<< PrintString32WithUnicodes (str);
131
149
EXPECT_EQ (glyphs.size (), 3 );
132
- EXPECT_EQ (glyphs[0 ], string (" x" ));
133
- EXPECT_EQ (glyphs[1 ], string (" \u0d06 " ));
134
- EXPECT_EQ (glyphs[2 ], string (" \u0d34\u0d02 " ));
150
+ EXPECT_EQ (glyphs[0 ], std:: string (" x" ));
151
+ EXPECT_EQ (glyphs[1 ], std:: string (" \u0d06 " ));
152
+ EXPECT_EQ (glyphs[2 ], std:: string (" \u0d34\u0d02 " ));
135
153
}
136
154
137
155
TEST (NormstrngsTest, NoLonelyJoinersPlus) {
138
- string str = " \u0d2a\u200d +\u0d2a\u0d4b " ;
139
- std::vector<string> glyphs;
156
+ std:: string str = " \u0d2a\u200d +\u0d2a\u0d4b " ;
157
+ std::vector<std:: string> glyphs;
140
158
// Returns true, but the joiner is gone.
141
159
EXPECT_TRUE (NormalizeCleanAndSegmentUTF8 (
142
160
UnicodeNormMode::kNFC , OCRNorm::kNone , GraphemeNormMode::kCombined , true ,
143
161
str.c_str (), &glyphs))
144
162
<< PrintString32WithUnicodes (str);
145
163
EXPECT_EQ (glyphs.size (), 3 );
146
- EXPECT_EQ (glyphs[0 ], string (" \u0d2a " ));
147
- EXPECT_EQ (glyphs[1 ], string (" +" ));
148
- EXPECT_EQ (glyphs[2 ], string (" \u0d2a\u0d4b " ));
164
+ EXPECT_EQ (glyphs[0 ], std:: string (" \u0d2a " ));
165
+ EXPECT_EQ (glyphs[1 ], std:: string (" +" ));
166
+ EXPECT_EQ (glyphs[2 ], std:: string (" \u0d2a\u0d4b " ));
149
167
}
150
168
151
169
TEST (NormstrngsTest, NoLonelyJoinersNonAlpha) {
152
- string str = " \u200d +\u200c\u200d " ;
170
+ std:: string str = " \u200d +\u200c\u200d " ;
153
171
// Returns true, but the joiners are gone.
154
- ExpectGraphemeModeResults (str, UnicodeNormMode::kNFC , 1 , 1 , 1 , string (" +" ));
172
+ ExpectGraphemeModeResults (str, UnicodeNormMode::kNFC , 1 , 1 , 1 , std:: string (" +" ));
155
173
str = " \u200d\u200c\u200d " ;
156
174
// Without the plus, the string is invalid.
157
- string result;
175
+ std:: string result;
158
176
EXPECT_FALSE (NormalizeUTF8String (UnicodeNormMode::kNFC , OCRNorm::kNone ,
159
177
GraphemeNorm::kNormalize , str.c_str (),
160
178
&result))
161
179
<< PrintString32WithUnicodes (result);
162
180
}
163
181
164
182
TEST (NormstrngsTest, JoinersStayInArabic) {
165
- string str = " \u0628\u200c\u0628\u200d\u0628 " ;
183
+ std:: string str = " \u0628\u200c\u0628\u200d\u0628 " ;
166
184
// Returns true, string untouched.
167
185
ExpectGraphemeModeResults (str, UnicodeNormMode::kNFC , 5 , 5 , 2 , str);
168
186
}
169
187
170
188
TEST (NormstrngsTest, DigitOK) {
171
- string str = " \u0cea " ; // Digit 4.
189
+ std:: string str = " \u0cea " ; // Digit 4.
172
190
ExpectGraphemeModeResults (str, UnicodeNormMode::kNFC , 1 , 1 , 1 , str);
173
191
}
174
192
175
193
TEST (NormstrngsTest, DandaOK) {
176
- string str = " \u0964 " ; // Single danda.
194
+ std:: string str = " \u0964 " ; // Single danda.
177
195
ExpectGraphemeModeResults (str, UnicodeNormMode::kNFC , 1 , 1 , 1 , str);
178
196
str = " \u0965 " ; // Double danda.
179
197
ExpectGraphemeModeResults (str, UnicodeNormMode::kNFC , 1 , 1 , 1 , str);
@@ -182,7 +200,7 @@ TEST(NormstrngsTest, DandaOK) {
182
200
TEST (NormstrngsTest, AllScriptsRegtest) {
183
201
// Tests some valid text in a large number of scripts, some of which were
184
202
// found to be rejected by an earlier version.
185
- const std::vector<std::pair<string, string>> kScriptText (
203
+ const std::vector<std::pair<std:: string, std:: string>> kScriptText (
186
204
{{" Arabic" ,
187
205
" فكان منهم علقمة بن قيس ، و إبراهيم النخعي ، و الأسود بن"
188
206
" توفي بالمدينة في هذه السنة وهو ابن مائة وعشرين سنة "
@@ -297,7 +315,7 @@ TEST(NormstrngsTest, AllScriptsRegtest) {
297
315
" Cặp câu đói súc tích mà sâu sắc, là lời chúc lời" }});
298
316
299
317
for (const auto & p : kScriptText ) {
300
- string normalized;
318
+ std:: string normalized;
301
319
EXPECT_TRUE (tesseract::NormalizeUTF8String (
302
320
tesseract::UnicodeNormMode::kNFKC , tesseract::OCRNorm::kNormalize ,
303
321
tesseract::GraphemeNorm::kNormalize , p.second .c_str (), &normalized))
@@ -313,7 +331,7 @@ TEST(NormstrngsTest, IsWhitespace) {
313
331
EXPECT_TRUE (IsWhitespace (' \n ' ));
314
332
// U+2000 through U+200A
315
333
for (char32 ch = 0x2000 ; ch <= 0x200A ; ++ch) {
316
- SCOPED_TRACE (StringPrintf (" Failed at U+%x" , ch));
334
+ SCOPED_TRACE (absl::StrFormat (" Failed at U+%x" , ch));
317
335
EXPECT_TRUE (IsWhitespace (ch));
318
336
}
319
337
// U+3000 is whitespace
@@ -345,29 +363,33 @@ TEST(NormstrngsTest, SpanUTF8NotWhitespace) {
345
363
EXPECT_EQ (12 , SpanUTF8NotWhitespace (kMixedText ));
346
364
}
347
365
366
+ #if defined(HAS_UNILIB_H)
348
367
// Test that the method clones the util/utf8/public/unilib definition of
349
368
// interchange validity.
350
369
TEST (NormstrngsTest, IsInterchangeValid) {
351
- const int32 kMinUnicodeValue = 33 ;
352
- const int32 kMaxUnicodeValue = 0x10FFFF ;
353
- for (int32 ch = kMinUnicodeValue ; ch <= kMaxUnicodeValue ; ++ch) {
354
- SCOPED_TRACE (StringPrintf (" Failed at U+%x" , ch));
370
+ const int32_t kMinUnicodeValue = 33 ;
371
+ const int32_t kMaxUnicodeValue = 0x10FFFF ;
372
+ for (int32_t ch = kMinUnicodeValue ; ch <= kMaxUnicodeValue ; ++ch) {
373
+ SCOPED_TRACE (absl::StrFormat (" Failed at U+%x" , ch));
355
374
EXPECT_EQ (UniLib::IsInterchangeValid (ch), IsInterchangeValid (ch));
356
375
}
357
376
}
377
+ #endif
358
378
379
+ #if defined(HAS_UNILIB_H)
359
380
// Test that the method clones the util/utf8/public/unilib definition of
360
381
// 7-bit ASCII interchange validity.
361
382
TEST (NormstrngsTest, IsInterchangeValid7BitAscii) {
362
- const int32 kMinUnicodeValue = 33 ;
363
- const int32 kMaxUnicodeValue = 0x10FFFF ;
364
- for (int32 ch = kMinUnicodeValue ; ch <= kMaxUnicodeValue ; ++ch) {
365
- SCOPED_TRACE (StringPrintf (" Failed at U+%x" , ch));
366
- string str = EncodeAsUTF8 (ch);
383
+ const int32_t kMinUnicodeValue = 33 ;
384
+ const int32_t kMaxUnicodeValue = 0x10FFFF ;
385
+ for (int32_t ch = kMinUnicodeValue ; ch <= kMaxUnicodeValue ; ++ch) {
386
+ SCOPED_TRACE (absl::StrFormat (" Failed at U+%x" , ch));
387
+ std:: string str = EncodeAsUTF8 (ch);
367
388
EXPECT_EQ (UniLib::IsInterchangeValid7BitAscii (str),
368
389
IsInterchangeValid7BitAscii (ch));
369
390
}
370
391
}
392
+ #endif
371
393
372
394
// Test that the method clones the util/utf8/public/unilib definition of
373
395
// fullwidth-halfwidth .
@@ -379,16 +401,18 @@ TEST(NormstrngsTest, FullwidthToHalfwidth) {
379
401
// U+FFE6 -> U+20A9 (won sign)
380
402
EXPECT_EQ (0x20A9 , FullwidthToHalfwidth (0xFFE6 ));
381
403
382
- const int32 kMinUnicodeValue = 33 ;
383
- const int32 kMaxUnicodeValue = 0x10FFFF ;
384
- for (int32 ch = kMinUnicodeValue ; ch <= kMaxUnicodeValue ; ++ch) {
404
+ #if defined(HAS_UNILIB_H)
405
+ const int32_t kMinUnicodeValue = 33 ;
406
+ const int32_t kMaxUnicodeValue = 0x10FFFF ;
407
+ for (int32_t ch = kMinUnicodeValue ; ch <= kMaxUnicodeValue ; ++ch) {
385
408
if (!IsValidCodepoint (ch)) continue ;
386
- SCOPED_TRACE (StringPrintf (" Failed at U+%x" , ch));
387
- string str = EncodeAsUTF8 (ch);
388
- const string expected_half_str =
409
+ SCOPED_TRACE (absl::StrFormat (" Failed at U+%x" , ch));
410
+ std:: string str = EncodeAsUTF8 (ch);
411
+ const std:: string expected_half_str =
389
412
UniLib::FullwidthToHalfwidth (str.c_str (), str.length (), true );
390
413
EXPECT_EQ (expected_half_str, EncodeAsUTF8 (FullwidthToHalfwidth (ch)));
391
414
}
415
+ #endif
392
416
}
393
417
394
418
} // namespace
0 commit comments