Skip to content

Commit a044f64

Browse files
committed
fix Myanmar validation rules as per Unicode charts
1 parent 7ca27bb commit a044f64

File tree

1 file changed

+17
-8
lines changed

1 file changed

+17
-8
lines changed

src/training/validate_myanmar.cpp

+17-8
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,8 @@ bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
111111
}
112112
}
113113
// The following characters are allowed, all optional, and in sequence.
114-
const std::vector<char32> kSigns({0x1036, 0x1037});
114+
// Anusvara (U+1036), Dot below (U+1037), Visarga (U+1038)
115+
const std::vector<char32> kSigns({0x1036, 0x1037, 0x1038});
115116
for (char32 ch : kSigns) {
116117
if (codes_[codes_used_].second == ch) {
117118
if (UseMultiCode(1)) return true;
@@ -131,29 +132,37 @@ bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
131132
// Returns true if the unicode is a Myanmar "letter" including consonants
132133
// and independent vowels. Although table 16-3 distinguishes between some
133134
// base consonants and vowels, the extensions make no such distinction, so we
134-
// put them all into a single bucket.
135+
// put them all into a single bucket.
136+
// The MYANMAR LETTER ranges are based on the following Unicode charts:
137+
// https://unicode.org/charts/PDF/U1000.pdf - Myanmar
138+
// http://unicode.org/charts/PDF/UAA60.pdf - Myanmar Extended-A
139+
// http://unicode.org/charts/PDF/UA9E0.pdf - Myanmar Extended-B
135140
/* static */
136141
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
137142
return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f ||
138143
(0x1050 <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) ||
139144
ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
140-
(0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1080) ||
141-
ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9ef) ||
142-
(0xa9fa <= ch && ch <= 0xa9ff) || (0xaa60 <= ch && ch <= 0xaa73) ||
145+
(0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1081) ||
146+
ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9e4) ||
147+
(0xa9e7 <= ch && ch <= 0xa9ef) || (0xa9fa <= ch && ch <= 0xa9fe) ||
148+
(0xaa60 <= ch && ch <= 0xaa6f) || (0xaa71 <= ch && ch <= 0xaa73) ||
143149
ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;
144150
}
145151

146152
// Returns true if ch is a Myanmar digit or other symbol that does not take
147-
// part in being a syllable.
153+
// part in being a syllable, e.g. punctuation marks.
154+
// MYANMAR DIGIT, MYANMAR SYMBOL, MYANMAR LOGOGRAM
155+
// REDUPLICATION MARKS
148156
/* static */
149157
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
150158
IcuErrorCode err;
151159
UScriptCode script_code = uscript_getScript(ch, err);
152160
if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
153161
ch != Validator::kZeroWidthNonJoiner)
154162
return true;
155-
return (0x1040 <= ch && ch <= 0x1049) || (0x1090 <= ch && ch <= 0x1099) ||
156-
(0x109c <= ch && ch <= 0x109d) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
163+
return (0x1040 <= ch && ch <= 0x104f) || (0x1090 <= ch && ch <= 0x1099) ||
164+
(0x109e <= ch && ch <= 0x109f) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
165+
ch == 0xa9e6 || ch == 0xaa70 ||
157166
(0xaa74 <= ch && ch <= 0xaa79);
158167
}
159168

0 commit comments

Comments
 (0)