Skip to content

Commit 8461de1

Browse files
authored
Add UTF16 handling to ixdtf (unicode-org#6577)
This PR adds UTF-16 handling to `ixdtf`. For reference, an issue was recently filed on `temporal_rs` (boa-dev/temporal#275), where it was noted that `ixdtf` needs to support UTF-16. The major changes occur in `utils/ixdtf/src/core.rs`. Note also that, since this is already a breaking change that adds the import `ixdtf::Slice`, the `records` module was moved out of the `parsers` module and up to the crate root to clean up imports.
1 parent c2ea845 commit 8461de1

File tree

17 files changed

+552
-345
lines changed

17 files changed

+552
-345
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ databake = { version = "0.2.0", path = "utils/databake", default-features = fals
192192
databake-derive = { version = "0.2.0", path = "utils/databake/derive", default-features = false }
193193
deduplicating_array = { version = "0.1.6", path = "utils/deduplicating_array", default-features = false }
194194
fixed_decimal = { version = "0.7.0", path = "utils/fixed_decimal", default-features = false }
195-
ixdtf = { version = "0.5.0", path = "utils/ixdtf", default-features = false }
195+
ixdtf = { version = "0.6.0-dev", path = "utils/ixdtf", default-features = false }
196196
litemap = { version = "0.8.0", path = "utils/litemap", default-features = false }
197197
potential_utf = { version = "0.1.1", path = "utils/potential_utf", default-features = false }
198198
tzif = { version = "0.4.0", path = "utils/tzif", default-features = false }

components/calendar/src/ixdtf.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@ use core::str::FromStr;
66

77
use crate::{AsCalendar, Calendar, Date, Iso, RangeError};
88
use icu_locale_core::preferences::extensions::unicode::keywords::CalendarAlgorithm;
9-
use ixdtf::parsers::records::IxdtfParseRecord;
9+
use ixdtf::encoding::Utf8;
1010
use ixdtf::parsers::IxdtfParser;
11+
use ixdtf::records::IxdtfParseRecord;
1112
use ixdtf::ParseError as Rfc9557Error;
1213

1314
/// An error returned from parsing an RFC 9557 string to an `icu_calendar` type.
@@ -93,7 +94,7 @@ impl<A: AsCalendar> Date<A> {
9394

9495
#[doc(hidden)]
9596
pub fn try_from_ixdtf_record(
96-
ixdtf_record: &IxdtfParseRecord,
97+
ixdtf_record: &IxdtfParseRecord<'_, Utf8>,
9798
calendar: A,
9899
) -> Result<Self, ParseError> {
99100
let date_record = ixdtf_record.date.ok_or(ParseError::MissingFields)?;

components/time/src/ixdtf.rs

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,11 @@ use core::str::FromStr;
1313
use icu_calendar::{AnyCalendarKind, AsCalendar, Date, DateError, Iso, RangeError};
1414
use icu_locale_core::subtags::subtag;
1515
use ixdtf::{
16-
parsers::{
17-
records::{
18-
DateRecord, IxdtfParseRecord, TimeRecord, TimeZoneAnnotation, TimeZoneRecord,
19-
UtcOffsetRecord, UtcOffsetRecordOrZ,
20-
},
21-
IxdtfParser,
16+
encoding::Utf8,
17+
parsers::IxdtfParser,
18+
records::{
19+
DateRecord, IxdtfParseRecord, TimeRecord, TimeZoneAnnotation, TimeZoneRecord,
20+
UtcOffsetRecord, UtcOffsetRecordOrZ,
2221
},
2322
ParseError as Rfc9557ParseError,
2423
};
@@ -136,7 +135,9 @@ struct Intermediate<'a> {
136135
}
137136

138137
impl<'a> Intermediate<'a> {
139-
fn try_from_ixdtf_record(ixdtf_record: &'a IxdtfParseRecord) -> Result<Self, ParseError> {
138+
fn try_from_ixdtf_record(
139+
ixdtf_record: &'a IxdtfParseRecord<'_, Utf8>,
140+
) -> Result<Self, ParseError> {
140141
let (offset, is_z, iana_identifier) = match ixdtf_record {
141142
// empty
142143
IxdtfParseRecord {
@@ -740,7 +741,9 @@ impl Time {
740741
Self::try_from_ixdtf_record(&ixdtf_record)
741742
}
742743

743-
fn try_from_ixdtf_record(ixdtf_record: &IxdtfParseRecord) -> Result<Self, ParseError> {
744+
fn try_from_ixdtf_record(
745+
ixdtf_record: &IxdtfParseRecord<'_, Utf8>,
746+
) -> Result<Self, ParseError> {
744747
let time_record = ixdtf_record.time.ok_or(ParseError::MissingFields)?;
745748
Self::try_from_time_record(&time_record)
746749
}

utils/ixdtf/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
[package]
66
name = "ixdtf"
77
description = "Parser for Internet eXtended DateTime Format"
8-
version = "0.5.0"
8+
version = "0.6.0-dev"
99

1010
authors.workspace = true
1111
categories.workspace = true

utils/ixdtf/README.md

Lines changed: 7 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

utils/ixdtf/src/core.rs

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
// This file is part of ICU4X. For terms of use, please see the file
2+
// called LICENSE at the top level of the ICU4X source tree
3+
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4+
5+
//! Core functionality for `ixdtf`'s parsers
6+
7+
use crate::{ParseError, ParserResult};
8+
9+
/// Private home of the `Sealed` supertrait.
///
/// Because this module is not exported, only code in this file can name
/// `private::Sealed`, which restricts who can implement traits bounded by it.
mod private {
    /// Supertrait required by `EncodingType`; implemented here for the
    /// marker types this file defines.
    pub trait Sealed {}
}
12+
13+
/// A trait for defining various supported encodings
14+
/// and implementing functionality that is encoding
15+
/// sensitive / specific.
16+
pub trait EncodingType: private::Sealed {
17+
type CodeUnit: PartialEq + core::fmt::Debug + Clone;
18+
19+
/// Get a slice from the underlying source using for start..end
20+
fn slice(source: &[Self::CodeUnit], start: usize, end: usize) -> Option<&[Self::CodeUnit]>;
21+
22+
/// Retrieve the provided code unit index and returns the value as an ASCII byte
23+
/// or None if the value is not ASCII representable.
24+
fn get_ascii(source: &[Self::CodeUnit], index: usize) -> ParserResult<Option<u8>>;
25+
26+
/// Checks for the known calendar annotation key `u-ca`.
27+
fn check_calendar_key(key: &[Self::CodeUnit]) -> bool;
28+
}
29+
30+
/// A marker type that signals a parser should parse the source as UTF-16 bytes.
31+
#[derive(Debug, PartialEq, Clone)]
32+
#[allow(clippy::exhaustive_structs)] // ZST Marker trait, no fields should be added
33+
pub struct Utf16;
34+
35+
impl private::Sealed for Utf16 {}
36+
37+
impl EncodingType for Utf16 {
38+
type CodeUnit = u16;
39+
fn slice(source: &[Self::CodeUnit], start: usize, end: usize) -> Option<&[Self::CodeUnit]> {
40+
source.get(start..end)
41+
}
42+
43+
fn get_ascii(source: &[Self::CodeUnit], index: usize) -> ParserResult<Option<u8>> {
44+
source.get(index).copied().map(to_ascii_byte).transpose()
45+
}
46+
47+
fn check_calendar_key(key: &[Self::CodeUnit]) -> bool {
48+
key == [0x75, 0x2d, 0x63, 0x61]
49+
}
50+
}
51+
52+
#[inline]
53+
fn to_ascii_byte(b: u16) -> ParserResult<u8> {
54+
if !(0x01..0x7F).contains(&b) {
55+
return Err(ParseError::NonAsciiCodePoint);
56+
}
57+
Ok(b as u8)
58+
}
59+
60+
/// A marker type that signals a parser should parse the source as UTF-8 bytes.
61+
#[derive(Debug, PartialEq, Clone)]
62+
#[allow(clippy::exhaustive_structs)] // ZST Marker trait, no fields should be added.
63+
pub struct Utf8;
64+
65+
impl private::Sealed for Utf8 {}
66+
67+
impl EncodingType for Utf8 {
68+
type CodeUnit = u8;
69+
70+
fn slice<'a>(source: &[Self::CodeUnit], start: usize, end: usize) -> Option<&[Self::CodeUnit]> {
71+
source.get(start..end)
72+
}
73+
74+
fn get_ascii(source: &[Self::CodeUnit], index: usize) -> ParserResult<Option<u8>> {
75+
Ok(source.get(index).copied())
76+
}
77+
78+
fn check_calendar_key(key: &[Self::CodeUnit]) -> bool {
79+
key == "u-ca".as_bytes()
80+
}
81+
}
82+
83+
// ==== Mini cursor implementation for Iso8601 targets ====
84+
85+
/// `Cursor` is a small cursor implementation for parsing Iso8601 grammar.
86+
#[derive(Debug)]
87+
pub(crate) struct Cursor<'a, T: EncodingType> {
88+
pos: usize,
89+
source: &'a [T::CodeUnit],
90+
}
91+
92+
impl<'a, T: EncodingType> Cursor<'a, T> {
93+
/// Create a new cursor from a source UTF8 string.
94+
#[must_use]
95+
pub fn new(source: &'a [T::CodeUnit]) -> Self {
96+
Self { pos: 0, source }
97+
}
98+
99+
/// Returns a string value from a slice of the cursor.
100+
pub(crate) fn slice(&self, start: usize, end: usize) -> Option<&'a [T::CodeUnit]> {
101+
T::slice(self.source, start, end)
102+
}
103+
104+
/// Get current position
105+
pub(crate) const fn pos(&self) -> usize {
106+
self.pos
107+
}
108+
109+
/// Peek the value at next position (current + 1).
110+
pub(crate) fn peek(&self) -> ParserResult<Option<u8>> {
111+
self.peek_n(1)
112+
}
113+
114+
/// Returns current position in source as `char`.
115+
pub(crate) fn current(&self) -> ParserResult<Option<u8>> {
116+
self.peek_n(0)
117+
}
118+
119+
/// Peeks the value at `n` as a `char`.
120+
pub(crate) fn peek_n(&self, n: usize) -> ParserResult<Option<u8>> {
121+
T::get_ascii(self.source, self.pos + n)
122+
}
123+
124+
/// Runs the provided check on the current position.
125+
pub(crate) fn check<F>(&self, f: F) -> ParserResult<Option<bool>>
126+
where
127+
F: FnOnce(u8) -> bool,
128+
{
129+
Ok(self.current()?.map(f))
130+
}
131+
132+
/// Runs the provided check on current position returns the default value if None.
133+
pub(crate) fn check_or<F>(&self, default: bool, f: F) -> ParserResult<bool>
134+
where
135+
F: FnOnce(u8) -> bool,
136+
{
137+
Ok(self.current()?.map_or(default, f))
138+
}
139+
140+
/// Returns `Cursor`'s current char and advances to the next position.
141+
pub(crate) fn next(&mut self) -> ParserResult<Option<u8>> {
142+
let result = self.current();
143+
self.advance_n(1);
144+
result
145+
}
146+
147+
/// Returns the next value as a digit
148+
///
149+
/// # Errors
150+
/// - Returns an AbruptEnd error if cursor ends.
151+
pub(crate) fn next_digit(&mut self) -> ParserResult<Option<u8>> {
152+
let ascii_char = self.next_or(ParseError::AbruptEnd { location: "digit" })?;
153+
if ascii_char.is_ascii_digit() {
154+
Ok(Some(ascii_char - 48))
155+
} else {
156+
Ok(None)
157+
}
158+
}
159+
160+
/// A utility next method that returns an `AbruptEnd` error if invalid.
161+
pub(crate) fn next_or(&mut self, err: ParseError) -> ParserResult<u8> {
162+
self.next()?.ok_or(err)
163+
}
164+
165+
/// Advances the cursor's position by n code points.
166+
pub(crate) fn advance_n(&mut self, n: usize) {
167+
self.pos += n;
168+
}
169+
170+
// Advances the cursor by 1 code point.
171+
pub(crate) fn advance(&mut self) {
172+
self.advance_n(1)
173+
}
174+
175+
/// Utility function to advance when a condition is true
176+
pub(crate) fn advance_if(&mut self, condition: bool) {
177+
if condition {
178+
self.advance();
179+
}
180+
}
181+
182+
/// Closes the current cursor by checking if all contents have been consumed. If not, returns an error for invalid syntax.
183+
pub(crate) fn close(&mut self) -> ParserResult<()> {
184+
if self.pos < self.source.len() {
185+
return Err(ParseError::InvalidEnd);
186+
}
187+
Ok(())
188+
}
189+
}

utils/ixdtf/src/error.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ use displaydoc::Display;
1212
pub enum ParseError {
1313
#[displaydoc("Implementation error: this error must not throw.")]
1414
ImplAssert,
15+
#[displaydoc("Code point was not ASCII")]
16+
NonAsciiCodePoint,
1517
#[displaydoc("Invalid float while parsing fraction part.")]
1618
ParseFloat,
1719
#[displaydoc("Parsing ended abruptly while parsing {location}")]

utils/ixdtf/src/lib.rs

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
//! ## Example Usage
2020
//!
2121
//! ```
22-
//! use ixdtf::parsers::{
22+
//! use ixdtf::{
2323
//! records::{Sign, TimeZoneRecord},
24-
//! IxdtfParser,
24+
//! parsers::IxdtfParser,
2525
//! };
2626
//!
2727
//! let ixdtf_str = "2024-03-02T08:48:00-05:00[America/New_York]";
@@ -73,9 +73,9 @@
7373
//! RFC 9557 updates the interpretation of `Z` to align with `-00:00`.
7474
//!
7575
//! ```rust
76-
//! use ixdtf::parsers::{
76+
//! use ixdtf::{
77+
//! parsers::IxdtfParser,
7778
//! records::{Sign, TimeZoneRecord},
78-
//! IxdtfParser,
7979
//! };
8080
//!
8181
//! let ixdtf_str = "2024-03-02T08:48:00Z[America/New_York]";
@@ -134,9 +134,9 @@
134134
//! zone annotation if it is provided.
135135
//!
136136
//! ```rust
137-
//! use ixdtf::parsers::{
137+
//! use ixdtf::{
138+
//! parsers::IxdtfParser,
138139
//! records::{Sign, TimeZoneRecord},
139-
//! IxdtfParser,
140140
//! };
141141
//!
142142
//! let zulu_offset = "2024-03-02T08:48:00Z[!America/New_York]";
@@ -289,7 +289,7 @@
289289
//! between the offset and annotation.
290290
//!
291291
//! ```rust
292-
//! use ixdtf::parsers::{IxdtfParser, records::TimeZoneRecord};
292+
//! use ixdtf::{parsers::IxdtfParser, records::TimeZoneRecord};
293293
//!
294294
//! let example_two = "2024-03-02T08:48:00+01:00[!America/New_York]";
295295
//!
@@ -388,12 +388,19 @@
388388
)
389389
)]
390390

391+
pub(crate) mod core;
391392
mod error;
392393
pub mod parsers;
394+
pub mod records;
393395

394396
extern crate alloc;
395397

396398
pub use error::ParseError;
397399

400+
/// This module contains the supported encoding for `ixdtf` parsing.
401+
pub mod encoding {
402+
pub use crate::core::{Utf16, Utf8};
403+
}
404+
398405
/// The `ixdtf` crate's Result type.
399406
pub type ParserResult<T> = Result<T, ParseError>;

0 commit comments

Comments
 (0)