diff --git a/spec/message.abnf b/spec/message.abnf index 04ad7c8fed..c01b0276a5 100644 --- a/spec/message.abnf +++ b/spec/message.abnf @@ -5,41 +5,44 @@ simple-start = simple-start-char / escaped-char / placeholder pattern = *(text-char / escaped-char / placeholder) placeholder = expression / markup -complex-message = *(declaration [s]) complex-body +complex-message = *(declaration owsp) complex-body declaration = input-declaration / local-declaration / reserved-statement complex-body = quoted-pattern / matcher -input-declaration = input [s] variable-expression -local-declaration = local s variable [s] "=" [s] expression +input-declaration = input owsp variable-expression +local-declaration = local wsp variable owsp "=" owsp expression quoted-pattern = "{{" pattern "}}" + / %x2066 "{{" pattern "}}" %x2069 -matcher = match-statement 1*([s] variant) -match-statement = match 1*([s] selector) +matcher = match-statement 1*(owsp variant) +match-statement = match 1*(owsp selector) selector = expression -variant = key *(s key) [s] quoted-pattern +variant = key *(wsp key) owsp quoted-pattern key = literal / "*" ; Expressions -expression = literal-expression - / variable-expression - / annotation-expression -literal-expression = "{" [s] literal [s annotation] *(s attribute) [s] "}" -variable-expression = "{" [s] variable [s annotation] *(s attribute) [s] "}" -annotation-expression = "{" [s] annotation *(s attribute) [s] "}" +expression = "{" (literal-expression / variable-expression / annotation-expression) "}" + / "{" %x2066 (literal-expression / variable-expression / annotation-expression) %x2069 "}" +literal-expression = owsp literal [wsp annotation] *(wsp attribute) owsp +variable-expression = owsp variable [wsp annotation] *(wsp attribute) owsp +annotation-expression = owsp annotation *(wsp attribute) owsp annotation = function / private-use-annotation / reserved-annotation -markup = "{" [s] "#" identifier *(s option) *(s attribute) [s] ["/"] "}" ; open and standalone - 
/ "{" [s] "/" identifier *(s option) *(s attribute) [s] "}" ; close +markup = "{" markup-body "}" + / "{" %x2066 markup-body %x2069 "}" + +markup-body = owsp "#" identifier *(wsp option) *(wsp attribute) owsp ["/"] ; open and standalone + / owsp "/" identifier *(wsp option) *(wsp attribute) owsp ; close ; Expression and literal parts -function = ":" identifier *(s option) -option = identifier [s] "=" [s] (literal / variable) +function = ":" identifier *(wsp option) +option = identifier owsp "=" owsp (literal / variable) ; Attributes are reserved for future standardization -attribute = "@" identifier [[s] "=" [s] (literal / variable)] +attribute = "@" identifier [owsp "=" owsp (literal / variable)] variable = "$" name literal = quoted / unquoted @@ -54,7 +57,7 @@ local = %s".local" match = %s".match" ; Reserve additional .keywords for use by future versions of this specification. -reserved-statement = reserved-keyword [s reserved-body] 1*([s] expression) +reserved-statement = reserved-keyword [s reserved-body] 1*(owsp expression) ; Note that the following production is a simplification, ; as this rule MUST NOT be considered to match existing keywords ; (`.input`, `.local`, and `.match`). @@ -67,8 +70,8 @@ reserved-annotation-start = "!" / "%" / "*" / "+" / "<" / ">" / "?" / "~" ; Reserve sigils for private-use by implementations. 
private-use-annotation = private-start [[s] reserved-body] private-start = "^" / "&" -reserved-body = reserved-body-part *([s] reserved-body-part) -reserved-body-part = reserved-char / escaped-char / quoted +reserved-body = *(owsp 1*(reserved-char / escaped-char / quoted)) + ; Names and identifiers ; identifier matches https://www.w3.org/TR/REC-xml-names/#NT-QName @@ -104,5 +107,15 @@ content-char = %x01-08 ; omit NULL (%x00), HTAB (%x09) and LF (%x0A) escaped-char = backslash ( backslash / "{" / "|" / "}" ) backslash = %x5C ; U+005C REVERSE SOLIDUS "\" -; Whitespace -s = 1*( SP / HTAB / CR / LF / %x3000 ) +; bidi controls +; LRM / RLM / LRI / RLI / FSI / PDI +bidi = %x200E / %x200F / %x2066-2069 + +; optional whitespace +owsp = *( s / bidi ) + +; required whitespace +wsp = [ bidi ] 1*s [ bidi ] + +; whitespace characters +s = ( SP / HTAB / CR / LF / %x3000 ) diff --git a/spec/syntax.md b/spec/syntax.md index 3376f65e25..c874be78b2 100644 --- a/spec/syntax.md +++ b/spec/syntax.md @@ -134,15 +134,23 @@ A **_local variable_** is a _variable_ created as the result of a _lo > > An exception to this is: whitespace inside a _pattern_ is **always** significant. > [!NOTE] -> The syntax assumes that each _message_ will be displayed with a left-to-right display order -> and be processed in the logical character order. -> The syntax also permits the use of right-to-left characters in _identifiers_, +> The MessageFormat 2 syntax assumes that each _message_ will be displayed +> with a left-to-right display order +> and be processed in the logical character order +> while permitting the use of right-to-left characters in _identifiers_, > _literals_, and other values. -> This can result in confusion when viewing the _message_. +> This can result in confusion when viewing the message +> or in users incorrectly inserting controls that negatively affect the output +> of the message. 
+> +> To assist with this, the syntax permits the use of various controls and +> strongly-directional markers in both optional and required _whitespace_ +> in a _message_, as well as encouraging the use of isolating controls +> with _expressions_ and _quoted patterns_. +> See: [whitespace](#whitespace) (below) for more information. > -> Additional restrictions or requirements, -> such as permitting the use of certain bidirectional control characters in the syntax, -> might be added during the Tech Preview to better manage bidirectional text. +> Additional restrictions or requirements might be added during the +> Tech Preview to better manage bidirectional text. > Feedback on the creation and management of _messages_ > containing bidirectional tokens is strongly desired. @@ -955,24 +963,91 @@ and inside _patterns_ only escape `{` and `}`. ### Whitespace -**_Whitespace_** is defined as one or more of -U+0009 CHARACTER TABULATION (tab), -U+000A LINE FEED (new line), -U+000D CARRIAGE RETURN, -U+3000 IDEOGRAPHIC SPACE, -or U+0020 SPACE. +The syntax limits whitespace characters outside of a _pattern_ to the following: +`U+0009 CHARACTER TABULATION` (tab), +`U+000A LINE FEED` (new line), +`U+000D CARRIAGE RETURN`, +`U+3000 IDEOGRAPHIC SPACE`, +or `U+0020 SPACE`. Inside _patterns_ and _quoted literals_, whitespace is part of the content and is recorded and stored verbatim. Whitespace is not significant outside translatable text, except where required by the syntax. +There are two whitespace productions in the syntax. +**_Optional whitespace_** is whitespace that is not required by the syntax, +but which users might want to include to increase the readability of a _message_. +**_Required whitespace_** is whitespace that is required by the syntax. + +_Messages_ that contain right-to-left (aka RTL) characters SHOULD use one of the +following mechanisms to make messages display intelligibly in plain-text editors: + +1. 
Use paired isolating bidi controls `U+2066 LEFT-TO-RIGHT ISOLATE` + and `U+2069 POP DIRECTIONAL ISOLATE` as permitted by the ABNF around + parts of any _message_ containing RTL characters: + - _inside_ of _placeholder_ markers `{` and `}` + - _outside_ _quoted-pattern_ markers `{{` and `}}` + - _identifiers_ + - _literals_ (This is especially important for individual _keys_ in a _variant_) + - _option_ values +2. Use the 'local-effect' bidi controls `U+200E LEFT-TO-RIGHT MARK` or + `U+200F RIGHT-TO-LEFT MARK` as permitted by the ABNF around + parts of any _message_ containing RTL characters: + - _identifiers_ + - _literals_ (taking care not to include the mark inside any quotes) + - _option_ values + +> [!IMPORTANT] +> Always take care **not** to add a bidi control where it is semantically significant: +> - put them outside of _literal_ quotes, such as `|...|` +> - put them outside of quoted _patterns_, such as `{{...}}` +> Controls placed inside _literal_ quotes or quoted _patterns_ are part of the literal +> or pattern. +> Controls in a _pattern_ will appear in the output of the message. +> Controls inside _literal_ quotes are part of the _literal_ and +> will be considered in operations such as matching a _key_ to a _selector_. + +> [!NOTE] +> Users cannot be expected to create or manage bidirectional controls or +> marks in _messages_, since the characters are invisible and can be difficult +> to manage. +> Tools (such as resource editors or translation editors) +> and other implementations of MessageFormat 2 serialization are strongly +> encouraged to provide paired isolates around any right-to-left +> syntax as described above so that _messages_ display appropriately as plain text. + +These definitions of _whitespace_ implement +[UAX#31 Requirement R3a-2](https://www.unicode.org/reports/tr31/#R3a-2). 
+It is a profile of R3a-1 in that specification because: +the following pattern whitespace characters are not allowed: +`U+000B VERTICAL TABULATION`, +`U+000C FORM FEED`, +`U+0085 NEXT LINE`, +`U+2028 LINE SEPARATOR` and +`U+2029 PARAGRAPH SEPARATOR`; +the character `U+3000 IDEOGRAPHIC SPACE` +_is_ interpreted as whitespace, + and the directional isolates U+2066..U+2069 + are treated as ignorable format controls. + > [!NOTE] > The character U+3000 IDEOGRAPHIC SPACE is included in whitespace for > compatibility with certain East Asian keyboards and input methods, > in which users might accidentally create these characters in a _message_. ```abnf -s = 1*( SP / HTAB / CR / LF / %x3000 ) +; bidi controls +; LRM / RLM / LRI / RLI / FSI / PDI +bidi = %x200E / %x200F / %x2066-2069 + +; optional whitespace +owsp = *( s / bidi ) + +; required whitespace +wsp = [ bidi ] 1*s [ bidi ] + +; whitespace characters +s = ( SP / HTAB / CR / LF / %x3000 ) ``` ## Complete ABNF