1
- #include " tesseract/training/normstrngs.h"
1
+ // (C) Copyright 2017, Google Inc.
2
+ // Licensed under the Apache License, Version 2.0 (the "License");
3
+ // you may not use this file except in compliance with the License.
4
+ // You may obtain a copy of the License at
5
+ // http://www.apache.org/licenses/LICENSE-2.0
6
+ // Unless required by applicable law or agreed to in writing, software
7
+ // distributed under the License is distributed on an "AS IS" BASIS,
8
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ // See the License for the specific language governing permissions and
10
+ // limitations under the License.
2
11
3
- #include " tesseract/unittest/normstrngs_test.h"
12
+ #include " include_gunit.h"
13
+ #include " normstrngs.h"
14
+ #include " normstrngs_test.h"
4
15
5
16
namespace tesseract {
6
17
namespace {
7
18
8
19
TEST (ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) {
9
- string str = " \u0c15\u0c3f\u0c15\u0c0e " ; // KA - dep I - KA - ind E.
10
- std::vector<string> glyphs;
20
+ std:: string str = " \u0c15\u0c3f\u0c15\u0c0e " ; // KA - dep I - KA - ind E.
21
+ std::vector<std:: string> glyphs;
11
22
EXPECT_TRUE (NormalizeCleanAndSegmentUTF8 (
12
23
UnicodeNormMode::kNFC , OCRNorm::kNone , GraphemeNormMode::kCombined , true ,
13
24
str.c_str (), &glyphs))
14
25
<< PrintString32WithUnicodes (str);
15
26
// It made 3 graphemes.
16
27
EXPECT_EQ (glyphs.size (), 3 );
17
- EXPECT_EQ (glyphs[0 ], string (" \u0c15\u0c3f " ));
18
- EXPECT_EQ (glyphs[1 ], string (" \u0c15 " ));
19
- EXPECT_EQ (glyphs[2 ], string (" \u0c0e " ));
28
+ EXPECT_EQ (glyphs[0 ], std:: string (" \u0c15\u0c3f " ));
29
+ EXPECT_EQ (glyphs[1 ], std:: string (" \u0c15 " ));
30
+ EXPECT_EQ (glyphs[2 ], std:: string (" \u0c0e " ));
20
31
}
21
32
22
33
TEST (ValidateGraphemeTest, SingleConsonantOK) {
23
- string str = " \u0cb9 " ; // HA
24
- std::vector<string> glyphs;
34
+ std:: string str = " \u0cb9 " ; // HA
35
+ std::vector<std:: string> glyphs;
25
36
EXPECT_TRUE (NormalizeCleanAndSegmentUTF8 (
26
37
UnicodeNormMode::kNFC , OCRNorm::kNone , GraphemeNormMode::kCombined , true ,
27
38
str.c_str (), &glyphs))
@@ -31,8 +42,8 @@ TEST(ValidateGraphemeTest, SingleConsonantOK) {
31
42
}
32
43
33
44
TEST (ValidateGraphemeTest, SimpleCV) {
34
- string str = " \u0cb9\u0cbf " ; // HA I
35
- std::vector<string> glyphs;
45
+ std:: string str = " \u0cb9\u0cbf " ; // HA I
46
+ std::vector<std:: string> glyphs;
36
47
EXPECT_TRUE (NormalizeCleanAndSegmentUTF8 (
37
48
UnicodeNormMode::kNFC , OCRNorm::kNone , GraphemeNormMode::kCombined , true ,
38
49
str.c_str (), &glyphs))
@@ -42,8 +53,8 @@ TEST(ValidateGraphemeTest, SimpleCV) {
42
53
}
43
54
44
55
TEST (ValidateGraphemeTest, SubscriptConjunct) {
45
- string str = " \u0cb9\u0ccd\u0c95\u0cbf " ; // HA Virama KA I
46
- std::vector<string> glyphs;
56
+ std:: string str = " \u0cb9\u0ccd\u0c95\u0cbf " ; // HA Virama KA I
57
+ std::vector<std:: string> glyphs;
47
58
EXPECT_TRUE (NormalizeCleanAndSegmentUTF8 (
48
59
UnicodeNormMode::kNFC , OCRNorm::kNone , GraphemeNormMode::kCombined , true ,
49
60
str.c_str (), &glyphs))
@@ -55,12 +66,12 @@ TEST(ValidateGraphemeTest, SubscriptConjunct) {
55
66
true , str.c_str (), &glyphs))
56
67
<< PrintString32WithUnicodes (str);
57
68
EXPECT_EQ (glyphs.size (), 3 );
58
- EXPECT_EQ (glyphs[1 ], string (" \u0ccd\u0c95 " ));
69
+ EXPECT_EQ (glyphs[1 ], std:: string (" \u0ccd\u0c95 " ));
59
70
}
60
71
61
72
TEST (ValidateGraphemeTest, HalfFormJoiner) {
62
- string str = " \u0d15\u0d4d\u200d\u0d24 " ; // KA Virama ZWJ Ta
63
- std::vector<string> glyphs;
73
+ std:: string str = " \u0d15\u0d4d\u200d\u0d24 " ; // KA Virama ZWJ Ta
74
+ std::vector<std:: string> glyphs;
64
75
EXPECT_TRUE (NormalizeCleanAndSegmentUTF8 (
65
76
UnicodeNormMode::kNFC , OCRNorm::kNone , GraphemeNormMode::kCombined , true ,
66
77
str.c_str (), &glyphs))
@@ -72,12 +83,12 @@ TEST(ValidateGraphemeTest, HalfFormJoiner) {
72
83
true , str.c_str (), &glyphs))
73
84
<< PrintString32WithUnicodes (str);
74
85
EXPECT_EQ (glyphs.size (), 2 ) << PrintStringVectorWithUnicodes (glyphs);
75
- EXPECT_EQ (glyphs[0 ], string (" \u0d15\u0d4d\u200d " ));
86
+ EXPECT_EQ (glyphs[0 ], std:: string (" \u0d15\u0d4d\u200d " ));
76
87
}
77
88
78
89
TEST (ValidateGraphemeTest, TraditionalConjunctJoiner) {
79
- string str = " \u0d15\u200d\u0d4d\u0d24 " ; // KA ZWJ Virama Ta
80
- std::vector<string> glyphs;
90
+ std:: string str = " \u0d15\u200d\u0d4d\u0d24 " ; // KA ZWJ Virama Ta
91
+ std::vector<std:: string> glyphs;
81
92
EXPECT_TRUE (NormalizeCleanAndSegmentUTF8 (
82
93
UnicodeNormMode::kNFC , OCRNorm::kNone , GraphemeNormMode::kCombined , true ,
83
94
str.c_str (), &glyphs))
@@ -89,12 +100,12 @@ TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) {
89
100
true , str.c_str (), &glyphs))
90
101
<< PrintString32WithUnicodes (str);
91
102
EXPECT_EQ (glyphs.size (), 3 );
92
- EXPECT_EQ (glyphs[1 ], string (" \u200d\u0d4d " ));
103
+ EXPECT_EQ (glyphs[1 ], std:: string (" \u200d\u0d4d " ));
93
104
}
94
105
95
106
TEST (ValidateGraphemeTest, OpenConjunctNonJoiner) {
96
- string str = " \u0d15\u200c\u0d4d\u0d24 " ; // KA ZWNJ Virama Ta
97
- std::vector<string> glyphs;
107
+ std:: string str = " \u0d15\u200c\u0d4d\u0d24 " ; // KA ZWNJ Virama Ta
108
+ std::vector<std:: string> glyphs;
98
109
EXPECT_TRUE (NormalizeCleanAndSegmentUTF8 (
99
110
UnicodeNormMode::kNFC , OCRNorm::kNone , GraphemeNormMode::kCombined , true ,
100
111
str.c_str (), &glyphs))
@@ -106,7 +117,7 @@ TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {
106
117
true , str.c_str (), &glyphs))
107
118
<< PrintString32WithUnicodes (str);
108
119
EXPECT_EQ (glyphs.size (), 3 );
109
- EXPECT_EQ (glyphs[1 ], string (" \u200c\u0d4d " ));
120
+ EXPECT_EQ (glyphs[1 ], std:: string (" \u200c\u0d4d " ));
110
121
// Malayalam only, so not allowed in Telugu.
111
122
str = " \u0c15\u200c\u0c4d\u0c24 " ; // KA ZWNJ Virama Ta
112
123
EXPECT_FALSE (NormalizeCleanAndSegmentUTF8 (
@@ -116,26 +127,26 @@ TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {
116
127
}
117
128
118
129
TEST (ValidateGraphemeTest, ExplicitViramaNonJoiner) {
119
- string str = " \u0d15\u0d4d\u200c\u0d24 " ; // KA Virama ZWNJ Ta
120
- std::vector<string> glyphs;
130
+ std:: string str = " \u0d15\u0d4d\u200c\u0d24 " ; // KA Virama ZWNJ Ta
131
+ std::vector<std:: string> glyphs;
121
132
EXPECT_TRUE (NormalizeCleanAndSegmentUTF8 (
122
133
UnicodeNormMode::kNFC , OCRNorm::kNone , GraphemeNormMode::kCombined , true ,
123
134
str.c_str (), &glyphs))
124
135
<< PrintString32WithUnicodes (str);
125
136
EXPECT_EQ (glyphs.size (), 2 );
126
- EXPECT_EQ (glyphs[1 ], string (" \u0d24 " ));
137
+ EXPECT_EQ (glyphs[1 ], std:: string (" \u0d24 " ));
127
138
EXPECT_TRUE (NormalizeCleanAndSegmentUTF8 (
128
139
UnicodeNormMode::kNFC , OCRNorm::kNone , GraphemeNormMode::kGlyphSplit ,
129
140
true , str.c_str (), &glyphs))
130
141
<< PrintString32WithUnicodes (str);
131
142
EXPECT_EQ (glyphs.size (), 3 );
132
- EXPECT_EQ (glyphs[1 ], string (" \u0d4d\u200c " ));
143
+ EXPECT_EQ (glyphs[1 ], std:: string (" \u0d4d\u200c " ));
133
144
}
134
145
135
146
TEST (ValidateGraphemeTest, ThaiGraphemes) {
136
147
// This is a single grapheme unless in glyph split mode
137
- string str = " \u0e14\u0e38\u0e4a " ;
138
- std::vector<string> glyphs;
148
+ std:: string str = " \u0e14\u0e38\u0e4a " ;
149
+ std::vector<std:: string> glyphs;
139
150
EXPECT_TRUE (NormalizeCleanAndSegmentUTF8 (
140
151
UnicodeNormMode::kNFC , OCRNorm::kNone , GraphemeNormMode::kCombined , true ,
141
152
str.c_str (), &glyphs))
@@ -147,23 +158,23 @@ TEST(ValidateGraphemeTest, ThaiGraphemes) {
147
158
true , str.c_str (), &glyphs))
148
159
<< PrintString32WithUnicodes (str);
149
160
EXPECT_EQ (glyphs.size (), 3 );
150
- EXPECT_EQ (glyphs[0 ], string (" \u0e14 " ));
161
+ EXPECT_EQ (glyphs[0 ], std:: string (" \u0e14 " ));
151
162
}
152
163
153
164
TEST (ValidateGraphemeTest, NoLonelyJoinersQuote) {
154
- string str = " '\u0d24\u0d23\u0d32\u0d4d '\u200d " ;
155
- std::vector<string> glyphs;
165
+ std:: string str = " '\u0d24\u0d23\u0d32\u0d4d '\u200d " ;
166
+ std::vector<std:: string> glyphs;
156
167
// Returns true, but the joiner is gone.
157
168
EXPECT_TRUE (NormalizeCleanAndSegmentUTF8 (
158
169
UnicodeNormMode::kNFC , OCRNorm::kNone , GraphemeNormMode::kCombined , true ,
159
170
str.c_str (), &glyphs))
160
171
<< PrintString32WithUnicodes (str);
161
172
EXPECT_EQ (glyphs.size (), 5 );
162
- EXPECT_EQ (glyphs[0 ], string (" '" ));
163
- EXPECT_EQ (glyphs[1 ], string (" \u0d24 " ));
164
- EXPECT_EQ (glyphs[2 ], string (" \u0d23 " ));
165
- EXPECT_EQ (glyphs[3 ], string (" \u0d32\u0d4d\u200c " ));
166
- EXPECT_EQ (glyphs[4 ], string (" '" ));
173
+ EXPECT_EQ (glyphs[0 ], std:: string (" '" ));
174
+ EXPECT_EQ (glyphs[1 ], std:: string (" \u0d24 " ));
175
+ EXPECT_EQ (glyphs[2 ], std:: string (" \u0d23 " ));
176
+ EXPECT_EQ (glyphs[3 ], std:: string (" \u0d32\u0d4d\u200c " ));
177
+ EXPECT_EQ (glyphs[4 ], std:: string (" '" ));
167
178
}
168
179
169
180
} // namespace
0 commit comments