Skip to content

Commit 2ff0de9

Browse files
committed
ICU-22940 MF2 ICU4C: Update for bidi support
Per unicode-org/message-format-wg#884
1 parent ecb67cf commit 2ff0de9

File tree

6 files changed

+250
-69
lines changed

6 files changed

+250
-69
lines changed

icu4c/source/i18n/messageformat2_parser.cpp

Lines changed: 79 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,13 @@ static bool isContentChar(UChar32 c) {
125125
|| inRange(c, 0xE000, 0x10FFFF);
126126
}
127127

128-
// See `s` in the MessageFormat 2 grammar
128+
// See `bidi` in the MF2 grammar
129+
static bool isBidi(UChar32 c) {
130+
return (c == 0x061C || c == 0x200E || c == 0x200F ||
131+
inRange(c, 0x2066, 0x2069));
132+
}
133+
134+
// See `ws` in the MessageFormat 2 grammar
129135
inline bool isWhitespace(UChar32 c) {
130136
switch (c) {
131137
case SPACE:
@@ -153,8 +159,8 @@ static bool isDigit(UChar32 c) { return inRange(c, 0x0030, 0x0039); }
153159

154160
static bool isNameStart(UChar32 c) {
155161
return isAlpha(c) || c == UNDERSCORE || inRange(c, 0x00C0, 0x00D6) || inRange(c, 0x00D8, 0x00F6) ||
156-
inRange(c, 0x00F8, 0x02FF) || inRange(c, 0x0370, 0x037D) || inRange(c, 0x037F, 0x1FFF) ||
157-
inRange(c, 0x200C, 0x200D) || inRange(c, 0x2070, 0x218F) || inRange(c, 0x2C00, 0x2FEF) ||
162+
inRange(c, 0x00F8, 0x02FF) || inRange(c, 0x0370, 0x037D) || inRange(c, 0x037F, 0x061B) ||
163+
inRange(c, 0x061D, 0x200D) || inRange(c, 0x2070, 0x218F) || inRange(c, 0x2C00, 0x2FEF) ||
158164
inRange(c, 0x3001, 0xD7FF) || inRange(c, 0xF900, 0xFDCF) || inRange(c, 0xFDF0, 0xFFFD) ||
159165
inRange(c, 0x10000, 0xEFFFF);
160166
}
@@ -347,7 +353,7 @@ option, or the optional space before an attribute.
347353
No pre, no post.
348354
A message may end with whitespace, so `index` may equal `len()` on exit.
349355
*/
350-
void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) {
356+
void Parser::parseRequiredWS(UErrorCode& errorCode) {
351357
bool sawWhitespace = false;
352358

353359
// The loop exits either when we consume all the input,
@@ -358,7 +364,7 @@ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode)
358364
// If whitespace isn't required -- or if we saw it already --
359365
// then the caller is responsible for checking this case and
360366
// setting an error if necessary.
361-
if (!required || sawWhitespace) {
367+
if (sawWhitespace) {
362368
// Not an error.
363369
return;
364370
}
@@ -380,24 +386,51 @@ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode)
380386
}
381387
}
382388

383-
if (!sawWhitespace && required) {
389+
if (!sawWhitespace) {
384390
ERROR(errorCode);
385391
}
386392
}
387393

394+
void Parser::parseOptionalBidi() {
395+
while (true) {
396+
if (!inBounds()) {
397+
return;
398+
}
399+
if (isBidi(peek())) {
400+
next();
401+
} else {
402+
break;
403+
}
404+
}
405+
}
406+
388407
/*
389-
No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
408+
No pre, no post, because a message may end with whitespace
409+
Matches `s` in the MF2 grammar
390410
*/
391411
void Parser::parseRequiredWhitespace(UErrorCode& errorCode) {
392-
parseWhitespaceMaybeRequired(true, errorCode);
412+
parseOptionalBidi();
413+
parseRequiredWS(errorCode);
414+
parseOptionalWhitespace();
393415
normalizedInput += SPACE;
394416
}
395417

396418
/*
397419
No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
398420
*/
399-
void Parser::parseOptionalWhitespace(UErrorCode& errorCode) {
400-
parseWhitespaceMaybeRequired(false, errorCode);
421+
void Parser::parseOptionalWhitespace() {
422+
while (true) {
423+
if (!inBounds()) {
424+
return;
425+
}
426+
auto cp = peek();
427+
if (isWhitespace(cp) || isBidi(cp)) {
428+
maybeAdvanceLine();
429+
next();
430+
} else {
431+
break;
432+
}
433+
}
401434
}
402435

403436
// Consumes a single character, signaling an error if `peek()` != `c`
@@ -442,11 +475,11 @@ void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode)
442475
*/
443476
void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCode& errorCode) {
444477
// No need for error check or bounds check before parseOptionalWhitespace
445-
parseOptionalWhitespace(errorCode);
478+
parseOptionalWhitespace();
446479
// Establish precondition
447480
CHECK_BOUNDS(errorCode);
448481
parseToken(token, errorCode);
449-
parseOptionalWhitespace(errorCode);
482+
parseOptionalWhitespace();
450483
// Guarantee postcondition
451484
CHECK_BOUNDS(errorCode);
452485
}
@@ -458,12 +491,12 @@ void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCo
458491
then consumes optional whitespace again
459492
*/
460493
void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) {
461-
// No need for error check or bounds check before parseOptionalWhitespace(errorCode)
462-
parseOptionalWhitespace(errorCode);
494+
// No need for error check or bounds check before parseOptionalWhitespace()
495+
parseOptionalWhitespace();
463496
// Establish precondition
464497
CHECK_BOUNDS(errorCode);
465498
parseToken(c, errorCode);
466-
parseOptionalWhitespace(errorCode);
499+
parseOptionalWhitespace();
467500
// Guarantee postcondition
468501
CHECK_BOUNDS(errorCode);
469502
}
@@ -482,11 +515,17 @@ UnicodeString Parser::parseName(UErrorCode& errorCode) {
482515

483516
U_ASSERT(inBounds());
484517

485-
if (!isNameStart(peek())) {
518+
if (!(isNameStart(peek()) || isBidi(peek()))) {
486519
ERROR(errorCode);
487520
return name;
488521
}
489522

523+
// name = [bidi] name-start *name-char [bidi]
524+
525+
// [bidi]
526+
parseOptionalBidi();
527+
528+
// name-start *name-char
490529
while (isNameChar(peek())) {
491530
UChar32 c = peek();
492531
name += c;
@@ -497,6 +536,10 @@ UnicodeString Parser::parseName(UErrorCode& errorCode) {
497536
break;
498537
}
499538
}
539+
540+
// [bidi]
541+
parseOptionalBidi();
542+
500543
return name;
501544
}
502545

@@ -845,7 +888,7 @@ void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode)
845888
// about whether whitespace precedes another
846889
// attribute, or the '=' sign
847890
int32_t savedIndex = index;
848-
parseOptionalWhitespace(errorCode);
891+
parseOptionalWhitespace();
849892

850893
Operand rand;
851894
if (peek() == EQUALS) {
@@ -1131,7 +1174,7 @@ the comment in `parseOptions()` for details.
11311174
// (the character is either the required space before an annotation, or optional
11321175
// trailing space after the literal or variable). It's still ambiguous which
11331176
// one does apply.
1134-
parseOptionalWhitespace(status);
1177+
parseOptionalWhitespace();
11351178
// Restore precondition
11361179
CHECK_BOUNDS(status);
11371180

@@ -1202,7 +1245,7 @@ Expression Parser::parseExpression(UErrorCode& status) {
12021245
// Parse opening brace
12031246
parseToken(LEFT_CURLY_BRACE, status);
12041247
// Optional whitespace after opening brace
1205-
parseOptionalWhitespace(status);
1248+
parseOptionalWhitespace();
12061249

12071250
Expression::Builder exprBuilder(status);
12081251
// Restore precondition
@@ -1245,7 +1288,7 @@ Expression Parser::parseExpression(UErrorCode& status) {
12451288

12461289
// Parse optional space
12471290
// (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}")
1248-
parseOptionalWhitespace(status);
1291+
parseOptionalWhitespace();
12491292

12501293
// Either an operand or operator (or both) must have been set already,
12511294
// so there can't be an error
@@ -1321,7 +1364,7 @@ void Parser::parseInputDeclaration(UErrorCode& status) {
13211364
CHECK_BOUNDS(status);
13221365

13231366
parseToken(ID_INPUT, status);
1324-
parseOptionalWhitespace(status);
1367+
parseOptionalWhitespace();
13251368

13261369
// Restore precondition before calling parseExpression()
13271370
CHECK_BOUNDS(status);
@@ -1382,7 +1425,7 @@ void Parser::parseDeclarations(UErrorCode& status) {
13821425
// Avoid looping infinitely
13831426
CHECK_ERROR(status);
13841427

1385-
parseOptionalWhitespace(status);
1428+
parseOptionalWhitespace();
13861429
// Restore precondition
13871430
CHECK_BOUNDS(status);
13881431
}
@@ -1492,8 +1535,8 @@ This is addressed using "backtracking" (similarly to `parseOptions()`).
14921535

14931536
// We've seen at least one whitespace-key pair, so now we can parse
14941537
// *(s key) [s]
1495-
while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek())) { // Try to recover from errors
1496-
bool wasWhitespace = isWhitespace(peek());
1538+
while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek()) || isBidi(peek())) {
1539+
bool wasWhitespace = isWhitespace(peek()) || isBidi(peek());
14971540
parseRequiredWhitespace(status);
14981541
if (!wasWhitespace) {
14991542
// Avoid infinite loop when parsing something like:
@@ -1551,7 +1594,7 @@ Markup Parser::parseMarkup(UErrorCode& status) {
15511594
// Consume the '{'
15521595
next();
15531596
normalizedInput += LEFT_CURLY_BRACE;
1554-
parseOptionalWhitespace(status);
1597+
parseOptionalWhitespace();
15551598
bool closing = false;
15561599
switch (peek()) {
15571600
case NUMBER_SIGN: {
@@ -1578,19 +1621,19 @@ Markup Parser::parseMarkup(UErrorCode& status) {
15781621

15791622
// Parse the options, which must begin with a ' '
15801623
// if present
1581-
if (inBounds() && isWhitespace(peek())) {
1624+
if (inBounds() && (isWhitespace(peek()) || isBidi(peek()))) {
15821625
OptionAdder<Markup::Builder> optionAdder(builder);
15831626
parseOptions(optionAdder, status);
15841627
}
15851628

15861629
// Parse the attributes, which also must begin
15871630
// with a ' '
1588-
if (inBounds() && isWhitespace(peek())) {
1631+
if (inBounds() && (isWhitespace(peek()) || isBidi(peek()))) {
15891632
AttributeAdder<Markup::Builder> attrAdder(builder);
15901633
parseAttributes(attrAdder, status);
15911634
}
15921635

1593-
parseOptionalWhitespace(status);
1636+
parseOptionalWhitespace();
15941637

15951638
bool standalone = false;
15961639
// Check if this is a standalone or not
@@ -1638,7 +1681,7 @@ std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) {
16381681
isMarkup = true;
16391682
break;
16401683
}
1641-
if (!isWhitespace(c)){
1684+
if (!(isWhitespace(c) || isBidi(c))) {
16421685
break;
16431686
}
16441687
tempIndex++;
@@ -1738,8 +1781,7 @@ void Parser::parseSelectors(UErrorCode& status) {
17381781
// "Backtracking" is required here. It's not clear if whitespace is
17391782
// (`[s]` selector) or (`[s]` variant)
17401783
while (isWhitespace(peek()) || peek() == DOLLAR) {
1741-
int32_t whitespaceStart = index;
1742-
parseRequiredWhitespace(status);
1784+
parseOptionalWhitespace();
17431785
// Restore precondition
17441786
CHECK_BOUNDS(status);
17451787
if (peek() != DOLLAR) {
@@ -1771,24 +1813,9 @@ void Parser::parseSelectors(UErrorCode& status) {
17711813
} \
17721814

17731815
// Parse variants
1774-
// matcher = match-statement s variant *(o variant)
1775-
1776-
// Parse first variant
1777-
parseRequiredWhitespace(status);
1778-
if (!inBounds()) {
1779-
ERROR(status);
1780-
return;
1781-
}
1782-
parseVariant(status);
1783-
if (!inBounds()) {
1784-
// Not an error; there might be only one variant
1785-
return;
1786-
}
1787-
1788-
while (isWhitespace(peek()) || isKeyStart(peek())) {
1789-
parseOptionalWhitespace(status);
1790-
// Restore the precondition.
1791-
// Trailing whitespace is allowed.
1816+
while (isWhitespace(peek()) || isBidi(peek()) || isKeyStart(peek())) {
1817+
// Trailing whitespace is allowed
1818+
parseOptionalWhitespace();
17921819
if (!inBounds()) {
17931820
return;
17941821
}
@@ -1874,7 +1901,7 @@ void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
18741901
bool complex = false;
18751902
// First, "look ahead" to determine if this is a simple or complex
18761903
// message. To do that, check the first non-whitespace character.
1877-
while (inBounds(index) && isWhitespace(peek())) {
1904+
while (inBounds(index) && (isWhitespace(peek()) || isBidi(peek()))) {
18781905
next();
18791906
}
18801907

@@ -1894,10 +1921,10 @@ void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
18941921
// Message can be empty, so we need to only look ahead
18951922
// if we know it's non-empty
18961923
if (complex) {
1897-
parseOptionalWhitespace(status);
1924+
parseOptionalWhitespace();
18981925
parseDeclarations(status);
18991926
parseBody(status);
1900-
parseOptionalWhitespace(status);
1927+
parseOptionalWhitespace();
19011928
} else {
19021929
// Simple message
19031930
// For normalization, quote the pattern

icu4c/source/i18n/messageformat2_parser.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,10 @@ namespace message2 {
103103
void parseSelectors(UErrorCode&);
104104
void parseVariant(UErrorCode&);
105105

106-
void parseWhitespaceMaybeRequired(bool, UErrorCode&);
106+
void parseRequiredWS(UErrorCode&);
107107
void parseRequiredWhitespace(UErrorCode&);
108-
void parseOptionalWhitespace(UErrorCode&);
108+
void parseOptionalBidi();
109+
void parseOptionalWhitespace();
109110
void parseToken(UChar32, UErrorCode&);
110111
void parseTokenWithWhitespace(UChar32, UErrorCode&);
111112
void parseToken(const std::u16string_view&, UErrorCode&);

icu4c/source/test/intltest/messageformat2test.cpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,6 @@
1111

1212
using namespace icu::message2;
1313

14-
/*
15-
TODO: Tests need to be unified in a single format that
16-
both ICU4C and ICU4J can use, rather than being embedded in code.
17-
18-
Tests are included in their current state to give a sense of
19-
how much test coverage has been achieved. Most of the testing is
20-
of the parser/serializer; the formatter needs to be tested more
21-
thoroughly.
22-
*/
23-
2414
void
2515
TestMessageFormat2::runIndexedTest(int32_t index, UBool exec,
2616
const char* &name, char* /*par*/) {

icu4c/source/test/intltest/messageformat2test_read_json.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,9 @@ void TestMessageFormat2::jsonTestsFromFiles(IcuTestErrorCode& errorCode) {
316316
// TODO: move this into the spec tests when
317317
// https://github.com/unicode-org/message-format-wg/pull/846 lands
318318
runTestsFromJsonFile(*this, "u-options.json", errorCode);
319+
// TODO: https://github.com/unicode-org/message-format-wg/pull/902 will
320+
// move the bidi tests into the spec
321+
runTestsFromJsonFile(*this, "bidi.json", errorCode);
319322
runTestsFromJsonFile(*this, "more-functions.json", errorCode);
320323
runTestsFromJsonFile(*this, "valid-tests.json", errorCode);
321324
runTestsFromJsonFile(*this, "resolution-errors.json", errorCode);

icu4c/source/test/intltest/messageformat2test_utils.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ class TestUtils {
253253
// For now, don't round-trip messages with these errors,
254254
// since duplicate options are dropped
255255
&& (testCase.expectSuccess() ||
256-
(testCase.expectedErrorCode() != U_MF_DUPLICATE_OPTION_NAME_ERROR))) {
256+
testCase.expectedErrorCode() != U_MF_DUPLICATE_OPTION_NAME_ERROR)) {
257257
failRoundTrip(tmsg, testCase, in, out);
258258
}
259259

@@ -295,10 +295,10 @@ class TestUtils {
295295
}
296296
// Re-run the formatter
297297
result = mf.formatToString(MessageArguments(testCase.getArguments(), errorCode), errorCode);
298-
if (!testCase.outputMatches(result)) {
299-
failWrongOutput(tmsg, testCase, result);
300-
return;
301-
}
298+
}
299+
if (!testCase.outputMatches(result)) {
300+
failWrongOutput(tmsg, testCase, result);
301+
return;
302302
}
303303
errorCode.reset();
304304
}

0 commit comments

Comments
 (0)