Skip to content

Commit 8bec1f9

Browse files
authored
Don't reexport iterator types at the top level of segmenter (#6409)
We should keep the top level clean: only the segmenters and their borrowed variants. This also ends up exposing more of the marker types that were public but hidden behind private modules (the "line break type" markers, etc.).
1 parent 240e7c2 commit 8bec1f9

File tree

59 files changed

+315
-255
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+315
-255
lines changed

components/segmenter/src/complex/mod.rs

-12
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,13 @@ use lstm::*;
2020
#[allow(clippy::large_enum_variant)]
2121
enum DictOrLstm {
2222
Dict(DataPayload<UCharDictionaryBreakDataV1>),
23-
#[cfg(not(feature = "lstm"))]
24-
Lstm(core::convert::Infallible),
2523
#[cfg(feature = "lstm")]
2624
Lstm(DataPayload<SegmenterLstmAutoV1>),
2725
}
2826

2927
#[derive(Debug, Clone, Copy)]
3028
enum DictOrLstmBorrowed<'data> {
3129
Dict(&'data UCharDictionaryBreakData<'data>),
32-
#[cfg(not(feature = "lstm"))]
33-
Lstm(core::convert::Infallible),
3430
#[cfg(feature = "lstm")]
3531
Lstm(&'data LstmData<'data>),
3632
}
@@ -40,8 +36,6 @@ fn borrow_dictor(dict_or: &DictOrLstm) -> DictOrLstmBorrowed<'_> {
4036
DictOrLstm::Dict(dict) => DictOrLstmBorrowed::Dict(dict.get()),
4137
#[cfg(feature = "lstm")]
4238
DictOrLstm::Lstm(lstm) => DictOrLstmBorrowed::Lstm(lstm.get()),
43-
#[cfg(not(feature = "lstm"))]
44-
DictOrLstm::Lstm(infallible) => DictOrLstmBorrowed::Lstm(*infallible),
4539
}
4640
}
4741

@@ -50,8 +44,6 @@ fn fromstatic_dictor(dict_or: DictOrLstmBorrowed<'static>) -> DictOrLstm {
5044
DictOrLstmBorrowed::Dict(dict) => DictOrLstm::Dict(DataPayload::from_static_ref(dict)),
5145
#[cfg(feature = "lstm")]
5246
DictOrLstmBorrowed::Lstm(lstm) => DictOrLstm::Lstm(DataPayload::from_static_ref(lstm)),
53-
#[cfg(not(feature = "lstm"))]
54-
DictOrLstmBorrowed::Lstm(infallible) => DictOrLstm::Lstm(infallible),
5547
}
5648
}
5749

@@ -131,8 +123,6 @@ impl<'data> ComplexPayloadsBorrowed<'data> {
131123
let seg = LstmSegmenter::new(lstm, self.grapheme);
132124
result.extend(seg.segment_str(slice).map(|n| offset + n));
133125
}
134-
#[cfg(not(feature = "lstm"))]
135-
Some(DictOrLstmBorrowed::Lstm(_infallible)) => {} // should be refutable
136126
None => {
137127
result.push(offset + slice.len());
138128
}
@@ -156,8 +146,6 @@ impl<'data> ComplexPayloadsBorrowed<'data> {
156146
let seg = LstmSegmenter::new(lstm, self.grapheme);
157147
result.extend(seg.segment_utf16(slice).map(|n| offset + n));
158148
}
159-
#[cfg(not(feature = "lstm"))]
160-
Some(DictOrLstmBorrowed::Lstm(_infallible)) => {} // should be refutable
161149
None => {
162150
result.push(offset + slice.len());
163151
}

components/segmenter/src/lib.rs

+15-26
Original file line numberDiff line numberDiff line change
@@ -127,24 +127,24 @@ mod indices;
127127
mod iterator_helpers;
128128
mod rule_segmenter;
129129

130-
mod grapheme;
131-
mod line;
132-
mod sentence;
133-
mod word;
130+
/// [`GraphemeClusterSegmenter`] and its related iterators, borrowed types, and options.
131+
pub mod grapheme;
132+
/// [`LineSegmenter`] and its related iterators, borrowed types, and options.
133+
pub mod line;
134+
/// [`SentenceSegmenter`] and its related iterators, borrowed types, and options.
135+
pub mod sentence;
136+
/// [`WordSegmenter`] and its related iterators, borrowed types, and options.
137+
pub mod word;
134138

135139
pub mod provider;
136140

137141
// Main Segmenter and BreakIterator public types
138-
pub use crate::grapheme::GraphemeClusterBreakIterator;
139142
pub use crate::grapheme::GraphemeClusterSegmenter;
140143
pub use crate::grapheme::GraphemeClusterSegmenterBorrowed;
141-
pub use crate::line::LineBreakIterator;
142144
pub use crate::line::LineSegmenter;
143145
pub use crate::line::LineSegmenterBorrowed;
144-
pub use crate::sentence::SentenceBreakIterator;
145146
pub use crate::sentence::SentenceSegmenter;
146147
pub use crate::sentence::SentenceSegmenterBorrowed;
147-
pub use crate::word::WordBreakIterator;
148148
pub use crate::word::WordSegmenter;
149149
pub use crate::word::WordSegmenterBorrowed;
150150

@@ -160,24 +160,13 @@ pub mod options {
160160
pub use crate::word::WordType;
161161
}
162162

163-
// Typedefs
164-
pub use crate::grapheme::GraphemeClusterBreakIteratorLatin1;
165-
pub use crate::grapheme::GraphemeClusterBreakIteratorPotentiallyIllFormedUtf8;
166-
pub use crate::grapheme::GraphemeClusterBreakIteratorUtf16;
167-
pub use crate::grapheme::GraphemeClusterBreakIteratorUtf8;
168-
pub use crate::line::LineBreakIteratorLatin1;
169-
pub use crate::line::LineBreakIteratorPotentiallyIllFormedUtf8;
170-
pub use crate::line::LineBreakIteratorUtf16;
171-
pub use crate::line::LineBreakIteratorUtf8;
172-
pub use crate::sentence::SentenceBreakIteratorLatin1;
173-
pub use crate::sentence::SentenceBreakIteratorPotentiallyIllFormedUtf8;
174-
pub use crate::sentence::SentenceBreakIteratorUtf16;
175-
pub use crate::sentence::SentenceBreakIteratorUtf8;
176-
pub use crate::word::WordBreakIteratorLatin1;
177-
pub use crate::word::WordBreakIteratorPotentiallyIllFormedUtf8;
178-
pub use crate::word::WordBreakIteratorUtf16;
179-
pub use crate::word::WordBreakIteratorUtf8;
180-
pub use crate::word::WordBreakIteratorWithWordType;
163+
/// Largely internal scaffolding types (you should very rarely need to reference these directly).
164+
pub mod scaffold {
165+
pub use crate::rule_segmenter::{
166+
RuleBreakType, RuleBreakTypeLatin1, RuleBreakTypePotentiallyIllFormedUtf8,
167+
RuleBreakTypeUtf16, RuleBreakTypeUtf8,
168+
};
169+
}
181170

182171
pub(crate) mod private {
183172
/// Trait marking other traits that are considered unstable and should not generally be

components/segmenter/src/line.rs

+4
Original file line numberDiff line numberDiff line change
@@ -1102,6 +1102,7 @@ impl<'s, Y: LineBreakType<'s>> LineBreakIterator<'_, 's, Y> {
11021102

11031103
#[derive(Debug)]
11041104
#[non_exhaustive]
1105+
/// [`LineBreakType`] for UTF-8 strings
11051106
pub struct LineBreakTypeUtf8;
11061107

11071108
impl crate::private::Sealed for LineBreakTypeUtf8 {}
@@ -1137,6 +1138,7 @@ impl<'s> LineBreakType<'s> for LineBreakTypeUtf8 {
11371138

11381139
#[derive(Debug)]
11391140
#[non_exhaustive]
1141+
/// [`LineBreakType`] for potentially ill-formed UTF-8 strings
11401142
pub struct LineBreakTypePotentiallyIllFormedUtf8;
11411143

11421144
impl crate::private::Sealed for LineBreakTypePotentiallyIllFormedUtf8 {}
@@ -1225,6 +1227,7 @@ where
12251227

12261228
#[derive(Debug)]
12271229
#[non_exhaustive]
1230+
/// [`LineBreakType`] for Latin-1 strings
12281231
pub struct LineBreakTypeLatin1;
12291232
impl crate::private::Sealed for LineBreakTypeLatin1 {}
12301233

@@ -1257,6 +1260,7 @@ impl<'s> LineBreakType<'s> for LineBreakTypeLatin1 {
12571260

12581261
#[derive(Debug)]
12591262
#[non_exhaustive]
1263+
/// [`LineBreakType`] for UTF-16 strings
12601264
pub struct LineBreakTypeUtf16;
12611265
impl crate::private::Sealed for LineBreakTypeUtf16 {}
12621266

components/segmenter/src/provider/mod.rs

+2
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ icu_provider::data_marker!(
121121
is_singleton = true
122122
);
123123

124+
pub use crate::word::inner::WordTypeULE;
125+
124126
#[cfg(feature = "datagen")]
125127
/// The latest minimum set of markers required by this component.
126128
pub const MARKERS: &[DataMarkerInfo] = &[

components/segmenter/src/rule_segmenter.rs

+4
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,7 @@ impl<'s, Y: RuleBreakType<'s> + ?Sized> RuleBreakIterator<'_, 's, Y> {
269269

270270
#[derive(Debug)]
271271
#[non_exhaustive]
272+
/// [`RuleBreakType`] for UTF-8 strings
272273
pub struct RuleBreakTypeUtf8;
273274

274275
impl crate::private::Sealed for RuleBreakTypeUtf8 {}
@@ -291,6 +292,7 @@ impl<'s> RuleBreakType<'s> for RuleBreakTypeUtf8 {
291292

292293
#[derive(Debug)]
293294
#[non_exhaustive]
295+
/// [`RuleBreakType`] for potentially ill-formed UTF-8 strings
294296
pub struct RuleBreakTypePotentiallyIllFormedUtf8;
295297

296298
impl crate::private::Sealed for RuleBreakTypePotentiallyIllFormedUtf8 {}
@@ -313,6 +315,7 @@ impl<'s> RuleBreakType<'s> for RuleBreakTypePotentiallyIllFormedUtf8 {
313315

314316
#[derive(Debug)]
315317
#[non_exhaustive]
318+
/// [`RuleBreakType`] for Latin-1 strings
316319
pub struct RuleBreakTypeLatin1;
317320

318321
impl crate::private::Sealed for RuleBreakTypeLatin1 {}
@@ -335,6 +338,7 @@ impl<'s> RuleBreakType<'s> for RuleBreakTypeLatin1 {
335338

336339
#[derive(Debug)]
337340
#[non_exhaustive]
341+
/// [`RuleBreakType`] for UTF-16 strings
338342
pub struct RuleBreakTypeUtf16;
339343

340344
impl crate::private::Sealed for RuleBreakTypeUtf16 {}

components/segmenter/src/word.rs

+22-12
Original file line numberDiff line numberDiff line change
@@ -51,20 +51,27 @@ pub struct WordBreakIterator<'data, 's, Y: RuleBreakType<'s> + ?Sized>(
5151

5252
derive_usize_iterator_with_type!(WordBreakIterator, 'data);
5353

54-
/// The word type tag that is returned by [`WordBreakIterator::word_type()`].
55-
#[non_exhaustive]
56-
#[derive(Copy, Clone, PartialEq, Debug)]
57-
#[repr(u8)]
58-
#[zerovec::make_ule(WordTypeULE)]
59-
pub enum WordType {
60-
/// No category tag.
61-
None = 0,
62-
/// Number category tag.
63-
Number = 1,
64-
/// Letter category tag, including CJK.
65-
Letter = 2,
54+
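/// Crate-private module that keeps the generated `WordTypeULE` out of the public `word` module; only `WordType` is re-exported from it.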
55+
pub(crate) mod inner {
56+
/// The word type tag that is returned by [`WordBreakIterator::word_type()`].
57+
///
58+
/// [`WordBreakIterator::word_type()`]: super::WordBreakIterator::word_type
59+
#[non_exhaustive]
60+
#[derive(Copy, Clone, PartialEq, Debug)]
61+
#[repr(u8)]
62+
#[zerovec::make_ule(WordTypeULE)]
63+
pub enum WordType {
64+
/// No category tag.
65+
None = 0,
66+
/// Number category tag.
67+
Number = 1,
68+
/// Letter category tag, including CJK.
69+
Letter = 2,
70+
}
6671
}
6772

73+
pub use inner::WordType;
74+
6875
impl WordType {
6976
/// Whether the segment is word-like; word-like segments include numbers, as
7077
/// well as segments made up of letters (including CJKV ideographs).
@@ -580,6 +587,7 @@ impl WordSegmenterBorrowed<'static> {
580587

581588
#[derive(Debug)]
582589
#[non_exhaustive]
590+
/// [`RuleBreakType`] for word-breaking UTF-8 strings
583591
pub struct WordBreakTypeUtf8;
584592
impl crate::private::Sealed for WordBreakTypeUtf8 {}
585593

@@ -601,6 +609,7 @@ impl<'s> RuleBreakType<'s> for WordBreakTypeUtf8 {
601609

602610
#[derive(Debug)]
603611
#[non_exhaustive]
612+
/// [`RuleBreakType`] for word-breaking potentially ill-formed UTF-8 strings
604613
pub struct WordBreakTypePotentiallyIllFormedUtf8;
605614
impl crate::private::Sealed for WordBreakTypePotentiallyIllFormedUtf8 {}
606615

@@ -677,6 +686,7 @@ where
677686

678687
#[derive(Debug)]
679688
#[non_exhaustive]
689+
/// [`RuleBreakType`] for word-breaking UTF-16 strings
680690
pub struct WordBreakTypeUtf16;
681691

682692
impl crate::private::Sealed for WordBreakTypeUtf16 {}

ffi/capi/bindings/cpp/icu4x/GraphemeClusterBreakIteratorLatin1.d.hpp

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ffi/capi/bindings/cpp/icu4x/GraphemeClusterBreakIteratorUtf16.d.hpp

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ffi/capi/bindings/cpp/icu4x/GraphemeClusterBreakIteratorUtf8.d.hpp

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ffi/capi/bindings/cpp/icu4x/LineBreakIteratorLatin1.d.hpp

+3-3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ffi/capi/bindings/cpp/icu4x/LineBreakIteratorUtf16.d.hpp

+3-3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ffi/capi/bindings/cpp/icu4x/LineBreakIteratorUtf8.d.hpp

+3-3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ffi/capi/bindings/cpp/icu4x/SentenceBreakIteratorLatin1.d.hpp

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)