Skip to content

Commit e4db884

Browse files
authored
Segmenter cleanups, part 2 (reduce trait complexity) (#6417)
First set of followups after #6409. This: - Removes lifetimes and `?Sized` from the BreakType traits - Coalesces the BreakType markers into a single set of RuleBreakTypeFoo markers - Makes LineBreakType depend on RuleBreakType - Adds a new WordBreakType that depends on RuleBreakType. Previously, the WordBreakType distinction was awkwardly handled by `unreachable!()`s and trait impls, which meant it was not straightforward to make WordBreak use RuleBreakType instead of WordBreakType. Moving that required moving it to a function, but it seemed to work well. This does not intend to finalize the module structure; it's still doing cleanups, and I didn't wish to have these be blocked on module structure decisions. A cleanup I may make is to use a secondary type parameter on RuleBreakIterator instead of a function pointer. It ought not to matter, though. <!-- Thank you for your pull request to ICU4X! Reminder: try to use [Conventional Comments](https://conventionalcomments.org/) to make comments clearer. Please see https://github.com/unicode-org/icu4x/blob/main/CONTRIBUTING.md for general information on contributing to ICU4X. -->
1 parent 899bac4 commit e4db884

File tree

11 files changed

+168
-282
lines changed

11 files changed

+168
-282
lines changed

components/segmenter/src/complex/dictionary.rs

+10-10
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ use core::str::CharIndices;
99
use icu_collections::char16trie::{Char16Trie, TrieResult};
1010

1111
/// A trait for dictionary based iterator
12-
trait DictionaryType<'l, 's> {
12+
trait DictionaryType {
1313
/// The iterator over characters.
14-
type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone;
14+
type IterAttr<'s>: Iterator<Item = (usize, Self::CharType)> + Clone;
1515

1616
/// The character type.
1717
type CharType: Copy + Into<u32>;
@@ -23,11 +23,11 @@ trait DictionaryType<'l, 's> {
2323
struct DictionaryBreakIterator<
2424
'l,
2525
's,
26-
Y: DictionaryType<'l, 's> + ?Sized,
26+
Y: DictionaryType + ?Sized,
2727
X: Iterator<Item = usize> + ?Sized,
2828
> {
2929
trie: Char16Trie<'l>,
30-
iter: Y::IterAttr,
30+
iter: Y::IterAttr<'s>,
3131
len: usize,
3232
grapheme_iter: X,
3333
// TODO transform value for byte trie
@@ -41,8 +41,8 @@ struct DictionaryBreakIterator<
4141
/// - `'s` = lifetime of the string being segmented
4242
///
4343
/// [`Iterator`]: core::iter::Iterator
44-
impl<'l, 's, Y: DictionaryType<'l, 's> + ?Sized, X: Iterator<Item = usize> + ?Sized> Iterator
45-
for DictionaryBreakIterator<'l, 's, Y, X>
44+
impl<Y: DictionaryType + ?Sized, X: Iterator<Item = usize> + ?Sized> Iterator
45+
for DictionaryBreakIterator<'_, '_, Y, X>
4646
{
4747
type Item = usize;
4848

@@ -106,8 +106,8 @@ impl<'l, 's, Y: DictionaryType<'l, 's> + ?Sized, X: Iterator<Item = usize> + ?Si
106106
}
107107
}
108108

109-
impl<'s> DictionaryType<'_, 's> for u32 {
110-
type IterAttr = Utf16Indices<'s>;
109+
impl DictionaryType for u32 {
110+
type IterAttr<'s> = Utf16Indices<'s>;
111111
type CharType = u32;
112112

113113
fn to_char(c: u32) -> char {
@@ -123,8 +123,8 @@ impl<'s> DictionaryType<'_, 's> for u32 {
123123
}
124124
}
125125

126-
impl<'s> DictionaryType<'_, 's> for char {
127-
type IterAttr = CharIndices<'s>;
126+
impl DictionaryType for char {
127+
type IterAttr<'s> = CharIndices<'s>;
128128
type CharType = char;
129129

130130
fn to_char(c: char) -> char {

components/segmenter/src/grapheme.rs

+9-5
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use utf8_iter::Utf8CharIndices;
2424
///
2525
/// For examples of use, see [`GraphemeClusterSegmenter`].
2626
#[derive(Debug)]
27-
pub struct GraphemeClusterBreakIterator<'data, 's, Y: RuleBreakType<'s> + ?Sized>(
27+
pub struct GraphemeClusterBreakIterator<'data, 's, Y: RuleBreakType>(
2828
RuleBreakIterator<'data, 's, Y>,
2929
);
3030

@@ -34,25 +34,25 @@ derive_usize_iterator_with_type!(GraphemeClusterBreakIterator, 'data);
3434
///
3535
/// For examples of use, see [`GraphemeClusterSegmenter`].
3636
pub type GraphemeClusterBreakIteratorUtf8<'data, 's> =
37-
GraphemeClusterBreakIterator<'data, 's, RuleBreakTypeUtf8>;
37+
GraphemeClusterBreakIterator<'data, 's, Utf8>;
3838

3939
/// Grapheme cluster break iterator for a potentially invalid UTF-8 string.
4040
///
4141
/// For examples of use, see [`GraphemeClusterSegmenter`].
4242
pub type GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8<'data, 's> =
43-
GraphemeClusterBreakIterator<'data, 's, RuleBreakTypePotentiallyIllFormedUtf8>;
43+
GraphemeClusterBreakIterator<'data, 's, PotentiallyIllFormedUtf8>;
4444

4545
/// Grapheme cluster break iterator for a Latin-1 (8-bit) string.
4646
///
4747
/// For examples of use, see [`GraphemeClusterSegmenter`].
4848
pub type GraphemeClusterBreakIteratorLatin1<'data, 's> =
49-
GraphemeClusterBreakIterator<'data, 's, RuleBreakTypeLatin1>;
49+
GraphemeClusterBreakIterator<'data, 's, Latin1>;
5050

5151
/// Grapheme cluster break iterator for a UTF-16 string.
5252
///
5353
/// For examples of use, see [`GraphemeClusterSegmenter`].
5454
pub type GraphemeClusterBreakIteratorUtf16<'data, 's> =
55-
GraphemeClusterBreakIterator<'data, 's, RuleBreakTypeUtf16>;
55+
GraphemeClusterBreakIterator<'data, 's, Utf16>;
5656

5757
/// Segments a string into grapheme clusters.
5858
///
@@ -195,6 +195,7 @@ impl<'data> GraphemeClusterSegmenterBorrowed<'data> {
195195
complex: None,
196196
boundary_property: 0,
197197
locale_override: None,
198+
handle_complex_language: empty_handle_complex_language,
198199
})
199200
}
200201
/// Creates a grapheme cluster break iterator for a potentially ill-formed UTF8 string
@@ -215,6 +216,7 @@ impl<'data> GraphemeClusterSegmenterBorrowed<'data> {
215216
complex: None,
216217
boundary_property: 0,
217218
locale_override: None,
219+
handle_complex_language: empty_handle_complex_language,
218220
})
219221
}
220222
/// Creates a grapheme cluster break iterator for a Latin-1 (8-bit) string.
@@ -233,6 +235,7 @@ impl<'data> GraphemeClusterSegmenterBorrowed<'data> {
233235
complex: None,
234236
boundary_property: 0,
235237
locale_override: None,
238+
handle_complex_language: empty_handle_complex_language,
236239
})
237240
}
238241

@@ -252,6 +255,7 @@ impl<'data> GraphemeClusterSegmenterBorrowed<'data> {
252255
complex: None,
253256
boundary_property: 0,
254257
locale_override: None,
258+
handle_complex_language: empty_handle_complex_language,
255259
})
256260
}
257261
}

components/segmenter/src/iterator_helpers.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
77
macro_rules! derive_usize_iterator_with_type {
88
($ty:tt, $($lt:lifetime),* ) => {
9-
impl<$($lt,)* 's, Y: RuleBreakType<'s> + ?Sized> Iterator for $ty<$($lt,)* 's, Y> {
9+
impl<$($lt,)* 's, Y: RuleBreakType> Iterator for $ty<$($lt,)* 's, Y> {
1010
type Item = usize;
1111
#[inline]
1212
fn next(&mut self) -> Option<Self::Item> {

components/segmenter/src/lib.rs

+3-4
Original file line numberDiff line numberDiff line change
@@ -162,10 +162,9 @@ pub mod options {
162162

163163
/// Largely-internal scaffolding types (You should very rarely need to reference these directly)
164164
pub mod scaffold {
165-
pub use crate::rule_segmenter::{
166-
RuleBreakType, RuleBreakTypeLatin1, RuleBreakTypePotentiallyIllFormedUtf8,
167-
RuleBreakTypeUtf16, RuleBreakTypeUtf8,
168-
};
165+
pub use crate::line::LineBreakType;
166+
pub use crate::rule_segmenter::{Latin1, PotentiallyIllFormedUtf8, RuleBreakType, Utf16, Utf8};
167+
pub use crate::word::WordBreakType;
169168
}
170169

171170
pub(crate) mod private {

0 commit comments

Comments
 (0)