Skip to content

Commit 21aca87

Browse files
committed
Bring back the traces
1 parent ad6727a commit 21aca87

File tree

5 files changed

+23
-2
lines changed

5 files changed

+23
-2
lines changed

components/segmenter/src/lib.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@
100100
//! See [`SentenceSegmenter`] for more examples.
101101
102102
// https://github.com/unicode-org/icu4x/blob/main/docs/process/boilerplate.md#library-annotations
103-
#![cfg_attr(not(any(test, feature = "std")), no_std)]
103+
//#![cfg_attr(not(any(test, feature = "std")), no_std)]
104104
#![cfg_attr(
105105
not(test),
106106
deny(

components/segmenter/src/rule_segmenter.rs

+11
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'
105105
let left_prop = self.get_break_property(left_codepoint);
106106
self.advance_iter();
107107

108+
let right_codepoint = self.get_current_codepoint().map_or("????".to_string(), |c| format!("U+{:02X}", c.into()));
108109
let Some(right_prop) = self.get_current_break_property() else {
109110
self.boundary_property = left_prop;
110111
return Some(self.len);
@@ -126,14 +127,21 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'
126127

127128
// If break_state is equal to or greater than 0, it is an alias of a property.
128129
let mut break_state = self.get_break_state_from_table(left_prop, right_prop);
130+
let STATE_NAMES = ["Unknown", "CR", "LF", "Extend", "Sep", "Format", "Sp", "Lower", "Upper", "OLetter", "Numeric", "ATerm", "SContinue", "STerm", "Close", "ATerm_Close", "ATerm_Close_Sp", "STerm_Close", "STerm_Close_Sp", "Upper_ATerm", "Lower_ATerm", "ATerm_Close_Sp_SB8", "ATerm_Close_Sp_ParaSep", "ATerm_Close_Sp_CR", "STerm_Close_Sp_ParaSep", "STerm_Close_Sp_CR", "sot", "eot"];
131+
println!("left={:02X} right={:02X} {} state={:02X}", left_prop, right_prop, right_codepoint, break_state);
132+
println!("left={} right={} {}", STATE_NAMES[left_prop as usize], STATE_NAMES[right_prop as usize], right_codepoint);
129133

130134
if break_state >= 0 {
131135
// This isn't a simple rule set. We need a marker to restore the iterator to its previous position.
132136
let mut previous_iter = self.iter.clone();
133137
let mut previous_pos_data = self.current_pos_data;
134138
let mut previous_left_prop = left_prop;
135139

140+
if (break_state & INTERMEDIATE_MATCH_RULE) != 0 {
141+
println!("going through intermediate match rule");
142+
}
136143
break_state &= !INTERMEDIATE_MATCH_RULE;
144+
println!("Inner loop");
137145
loop {
138146
self.advance_iter();
139147

@@ -155,6 +163,8 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'
155163

156164
let previous_break_state = break_state;
157165
break_state = self.get_break_state_from_table(break_state as u8, prop);
166+
println!("> left={:02X} right={:02X} state={:02X}", previous_break_state, prop, break_state);
167+
println!("> left={} right={}", STATE_NAMES[(previous_break_state & !INTERMEDIATE_MATCH_RULE) as usize], STATE_NAMES[prop as usize]);
158168
if break_state < 0 {
159169
break;
160170
}
@@ -167,6 +177,7 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'
167177
previous_left_prop = break_state as u8;
168178
}
169179
if (break_state & INTERMEDIATE_MATCH_RULE) != 0 {
180+
println!("going through intermediate match rule");
170181
break_state -= INTERMEDIATE_MATCH_RULE;
171182
previous_iter = self.iter.clone();
172183
previous_pos_data = self.current_pos_data;

components/segmenter/src/sentence.rs

+6
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@ pub struct SentenceBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>(
2828
RuleBreakIterator<'l, 's, Y>,
2929
);
3030

31+
impl<'l, 's, Y: RuleBreakType<'l, 's>> SentenceBreakIterator<'l, 's, Y> {
32+
pub fn state(&self) -> u8 {
33+
self.0.boundary_property
34+
}
35+
}
36+
3137
derive_usize_iterator_with_type!(SentenceBreakIterator);
3238

3339
/// Sentence break iterator for an `str` (a UTF-8 string).

components/segmenter/tests/spec_test.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ fn sentence_break_test(filename: &str) {
223223
let mut iter = segmenter.segment_str(&s);
224224
// TODO(egg): It would be really nice to have Name here.
225225
println!(" | A | E | Code pt. | Sentence_Break | State | Literal");
226+
let STATE_NAMES = ["Unknown", "CR", "LF", "Extend", "Sep", "Format", "Sp", "Lower", "Upper", "OLetter", "Numeric", "ATerm", "SContinue", "STerm", "Close", "ATerm_Close", "ATerm_Close_Sp", "STerm_Close", "STerm_Close_Sp", "Upper_ATerm", "Lower_ATerm", "ATerm_Close_Sp_SB8", "ATerm_Close_Sp_ParaSep", "ATerm_Close_Sp_CR", "STerm_Close_Sp_ParaSep", "STerm_Close_Sp_CR", "sot", "eot"];
226227
for (i, c) in s.char_indices() {
227228
let expected_break = test.break_result_utf8.contains(&i);
228229
let actual_break = result.contains(&i);
@@ -244,7 +245,7 @@ fn sentence_break_test(filename: &str) {
244245
.unwrap_or(&format!("{:?}", sb.get(c))),
245246
// Placeholder for logging the state if exposed.
246247
// Not "?????" to hide from clippy.
247-
"?".repeat(5),
248+
if actual_break { format!("{:02X} {}", iter.state(), STATE_NAMES[iter.state() as usize]) } else {"?".repeat(5)},
248249
c
249250
)
250251
}

provider/datagen/src/transform/segmenter/mod.rs

+3
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ use icu_provider::datagen::IterableDataProvider;
1717
use icu_provider::prelude::*;
1818
use icu_segmenter::provider::*;
1919
use icu_segmenter::symbols::*;
20+
use itertools::Itertools;
2021
use std::fmt::Debug;
2122
use zerovec::ZeroVec;
2223

@@ -469,6 +470,8 @@ impl crate::DatagenProvider {
469470
// sot and eot
470471
properties_names.push("sot".to_string());
471472
properties_names.push("eot".to_string());
473+
println!("{:?}", properties_names);
474+
println!("{}", properties_names.iter().enumerate().map(|(i, name)| format!("{:02X}={}", i, name)).join("\n"));
472475

473476
let rule_size = properties_names.len() * properties_names.len();
474477
let mut break_state_table = vec![UNKNOWN_RULE; rule_size];

0 commit comments

Comments
 (0)