Skip to content

Commit 5ee1dea

Browse files
committed
Correct handling of Tamil numbers and symbols (U+0BF0–U+0BFA)
1 parent d7ddc4c commit 5ee1dea

File tree

1 file changed

+7
-1
lines changed

1 file changed

+7
-1
lines changed

src/training/validate_indic.cpp

+7-1
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,12 @@ Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const {
7474
if (off == 0x62 || off == 0x63) return CharClass::kMatra;
7575
// Danda and digits up to 6f are OK as other.
7676
// 70-7f are script-specific.
77+
// 0BF0-0BF2 are Tamil numbers 10, 100 and 1000; treat as other.
78+
if (script_ == ViramaScript::kTamil && (0x70 <= off && off <= 0x72))
79+
return CharClass::kOther;
80+
// 0BF3-0BFA are other Tamil symbols.
81+
if (script_ == ViramaScript::kTamil && (0x73 <= off && off <= 0x7A))
82+
return CharClass::kOther;
7783
if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71))
7884
return CharClass::kConsonant;
7985
if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73))
@@ -139,7 +145,7 @@ bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
139145
// for consistency, we will always add ZWNJ if not present.
140146
output_.push_back(kZeroWidthNonJoiner);
141147
} else {
142-
CodeOnlyToOutput();
148+
CodeOnlyToOutput();
143149
}
144150
// Explicit virama [H z]
145151
MultiCodePart(2);

0 commit comments

Comments
 (0)