Skip to content

Commit

Permalink
Bring back the traces
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Oct 24, 2023
1 parent ad6727a commit 21aca87
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 2 deletions.
2 changes: 1 addition & 1 deletion components/segmenter/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@
//! See [`SentenceSegmenter`] for more examples.
// https://github.com/unicode-org/icu4x/blob/main/docs/process/boilerplate.md#library-annotations
#![cfg_attr(not(any(test, feature = "std")), no_std)]
//#![cfg_attr(not(any(test, feature = "std")), no_std)]
#![cfg_attr(
not(test),
deny(
Expand Down
11 changes: 11 additions & 0 deletions components/segmenter/src/rule_segmenter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'
let left_prop = self.get_break_property(left_codepoint);
self.advance_iter();

let right_codepoint = self.get_current_codepoint().map_or("????".to_string(), |c| format!("U+{:02X}", c.into()));
let Some(right_prop) = self.get_current_break_property() else {
self.boundary_property = left_prop;
return Some(self.len);
Expand All @@ -126,14 +127,21 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'

// If break_state is greater than or equal to 0, it is an alias of a property.
let mut break_state = self.get_break_state_from_table(left_prop, right_prop);
let STATE_NAMES = ["Unknown", "CR", "LF", "Extend", "Sep", "Format", "Sp", "Lower", "Upper", "OLetter", "Numeric", "ATerm", "SContinue", "STerm", "Close", "ATerm_Close", "ATerm_Close_Sp", "STerm_Close", "STerm_Close_Sp", "Upper_ATerm", "Lower_ATerm", "ATerm_Close_Sp_SB8", "ATerm_Close_Sp_ParaSep", "ATerm_Close_Sp_CR", "STerm_Close_Sp_ParaSep", "STerm_Close_Sp_CR", "sot", "eot"];
println!("left={:02X} right={:02X} {} state={:02X}", left_prop, right_prop, right_codepoint, break_state);
println!("left={} right={} {}", STATE_NAMES[left_prop as usize], STATE_NAMES[right_prop as usize], right_codepoint);

if break_state >= 0 {
// This isn't a simple rule set. We need a marker to restore the iterator to its previous position.
let mut previous_iter = self.iter.clone();
let mut previous_pos_data = self.current_pos_data;
let mut previous_left_prop = left_prop;

if (break_state & INTERMEDIATE_MATCH_RULE) != 0 {
println!("going through intermediate match rule");
}
break_state &= !INTERMEDIATE_MATCH_RULE;
println!("Inner loop");
loop {
self.advance_iter();

Expand All @@ -155,6 +163,8 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'

let previous_break_state = break_state;
break_state = self.get_break_state_from_table(break_state as u8, prop);
println!("> left={:02X} right={:02X} state={:02X}", previous_break_state, prop, break_state);
println!("> left={} right={}", STATE_NAMES[(previous_break_state & !INTERMEDIATE_MATCH_RULE) as usize], STATE_NAMES[prop as usize]);
if break_state < 0 {
break;
}
Expand All @@ -167,6 +177,7 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'
previous_left_prop = break_state as u8;
}
if (break_state & INTERMEDIATE_MATCH_RULE) != 0 {
println!("going through intermediate match rule");
break_state -= INTERMEDIATE_MATCH_RULE;
previous_iter = self.iter.clone();
previous_pos_data = self.current_pos_data;
Expand Down
6 changes: 6 additions & 0 deletions components/segmenter/src/sentence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ pub struct SentenceBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>(
RuleBreakIterator<'l, 's, Y>,
);

// The `?Sized` relaxation mirrors the bound on the `SentenceBreakIterator`
// struct declaration, so this accessor exists for every `Y` the struct accepts.
impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> SentenceBreakIterator<'l, 's, Y> {
    /// Returns the `boundary_property` value currently stored on the wrapped
    /// `RuleBreakIterator`.
    ///
    /// This is a read-only accessor; it does not advance the iterator.
    /// NOTE(review): presumably meant for tracing/diagnostics only (the value
    /// is a raw table index, not a stable public enum) — confirm before
    /// relying on it in non-debug code.
    pub fn state(&self) -> u8 {
        self.0.boundary_property
    }
}

derive_usize_iterator_with_type!(SentenceBreakIterator);

/// Sentence break iterator for an `str` (a UTF-8 string).
Expand Down
3 changes: 2 additions & 1 deletion components/segmenter/tests/spec_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ fn sentence_break_test(filename: &str) {
let mut iter = segmenter.segment_str(&s);
// TODO(egg): It would be really nice to have Name here.
println!(" | A | E | Code pt. | Sentence_Break | State | Literal");
let STATE_NAMES = ["Unknown", "CR", "LF", "Extend", "Sep", "Format", "Sp", "Lower", "Upper", "OLetter", "Numeric", "ATerm", "SContinue", "STerm", "Close", "ATerm_Close", "ATerm_Close_Sp", "STerm_Close", "STerm_Close_Sp", "Upper_ATerm", "Lower_ATerm", "ATerm_Close_Sp_SB8", "ATerm_Close_Sp_ParaSep", "ATerm_Close_Sp_CR", "STerm_Close_Sp_ParaSep", "STerm_Close_Sp_CR", "sot", "eot"];
for (i, c) in s.char_indices() {
let expected_break = test.break_result_utf8.contains(&i);
let actual_break = result.contains(&i);
Expand All @@ -244,7 +245,7 @@ fn sentence_break_test(filename: &str) {
.unwrap_or(&format!("{:?}", sb.get(c))),
// Placeholder for logging the state if exposed.
// Not "?????" to hide from clippy.
"?".repeat(5),
if actual_break { format!("{:02X} {}", iter.state(), STATE_NAMES[iter.state() as usize]) } else {"?".repeat(5)},
c
)
}
Expand Down
3 changes: 3 additions & 0 deletions provider/datagen/src/transform/segmenter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use icu_provider::datagen::IterableDataProvider;
use icu_provider::prelude::*;
use icu_segmenter::provider::*;
use icu_segmenter::symbols::*;
use itertools::Itertools;
use std::fmt::Debug;
use zerovec::ZeroVec;

Expand Down Expand Up @@ -469,6 +470,8 @@ impl crate::DatagenProvider {
// sot and eot
properties_names.push("sot".to_string());
properties_names.push("eot".to_string());
println!("{:?}", properties_names);
println!("{}", properties_names.iter().enumerate().map(|(i, name)| format!("{:02X}={}", i, name)).join("\n"));

let rule_size = properties_names.len() * properties_names.len();
let mut break_state_table = vec![UNKNOWN_RULE; rule_size];
Expand Down

0 comments on commit 21aca87

Please sign in to comment.