Skip to content

Commit

Permalink
Progress! on to the next failure.
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Oct 24, 2023
1 parent b518661 commit 9403d59
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 5 deletions.

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion components/segmenter/src/rule_segmenter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'

// If break_state is equal to or greater than 0, it is an alias of a property.
let mut break_state = self.get_break_state_from_table(left_prop, right_prop);
let STATE_NAMES = ["Unknown", "CR", "LF", "Extend", "Sep", "Format", "Sp", "Lower", "Upper", "OLetter", "Numeric", "ATerm", "SContinue", "STerm", "Close", "ATerm_Close", "ATerm_Close_Sp", "STerm_Close", "STerm_Close_Sp", "Upper_ATerm", "Lower_ATerm", "ATerm_Close_Sp_ParaSep", "ATerm_Close_Sp_CR", "STerm_Close_Sp_ParaSep", "STerm_Close_Sp_CR", "ATerm_Close_Sp_SB8", "sot", "eot"];
let STATE_NAMES = ["Unknown", "CR", "LF", "Extend", "Sep", "Format", "Sp", "Lower", "Upper", "OLetter", "Numeric", "ATerm", "SContinue", "STerm", "Close", "ATerm_Close", "ATerm_Close_Sp", "STerm_Close", "STerm_Close_Sp", "Upper_ATerm", "Lower_ATerm", "ATerm_Close_Sp_SB8", "ATerm_Close_Sp_ParaSep", "ATerm_Close_Sp_CR", "STerm_Close_Sp_ParaSep", "STerm_Close_Sp_CR", "sot", "eot"];
println!("left={:02X} right={:02X} {} state={:02X}", left_prop, right_prop, right_codepoint, break_state);
println!("left={} right={} {}", STATE_NAMES[left_prop as usize], STATE_NAMES[right_prop as usize], right_codepoint);

Expand Down
5 changes: 3 additions & 2 deletions components/segmenter/tests/spec_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ fn run_grapheme_break_test() {
fn sentence_break_test(filename: &str) {
let test_iter = TestContentIterator::new(filename);
let segmenter = SentenceSegmenter::new();
for test in test_iter {
for (i, test) in test_iter.enumerate() {
let s: String = test.utf8_vec.into_iter().collect();
let iter = segmenter.segment_str(&s);
let result: Vec<usize> = iter.collect();
Expand All @@ -225,7 +225,7 @@ fn sentence_break_test(filename: &str) {
let mut iter = segmenter.segment_str(&s);
// TODO(egg): It would be really nice to have Name here.
println!(" | A | E | Code pt. | Sentence_Break | State | Literal");
let STATE_NAMES = ["Unknown", "CR", "LF", "Extend", "Sep", "Format", "Sp", "Lower", "Upper", "OLetter", "Numeric", "ATerm", "SContinue", "STerm", "Close", "ATerm_Close", "ATerm_Close_Sp", "STerm_Close", "STerm_Close_Sp", "Upper_ATerm", "Lower_ATerm", "ATerm_Close_Sp_ParaSep", "ATerm_Close_Sp_CR", "STerm_Close_Sp_ParaSep", "STerm_Close_Sp_CR", "ATerm_Close_Sp_SB8", "sot", "eot"];
let STATE_NAMES = ["Unknown", "CR", "LF", "Extend", "Sep", "Format", "Sp", "Lower", "Upper", "OLetter", "Numeric", "ATerm", "SContinue", "STerm", "Close", "ATerm_Close", "ATerm_Close_Sp", "STerm_Close", "STerm_Close_Sp", "Upper_ATerm", "Lower_ATerm", "ATerm_Close_Sp_SB8", "ATerm_Close_Sp_ParaSep", "ATerm_Close_Sp_CR", "STerm_Close_Sp_ParaSep", "STerm_Close_Sp_CR", "sot", "eot"];
for (i, c) in s.char_indices() {
let expected_break = test.break_result_utf8.contains(&i);
let actual_break = result.contains(&i);
Expand All @@ -247,6 +247,7 @@ fn sentence_break_test(filename: &str) {
c
)
}
println!("Test case #{}", i);
assert!(false)
}

Expand Down
9 changes: 8 additions & 1 deletion provider/datagen/src/transform/segmenter/rules/sentence.toml
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,13 @@ left = "ATerm"
right = "Unknown"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "Upper_ATerm"
right = "Unknown"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
Expand Down Expand Up @@ -619,7 +626,7 @@ left = [
"STerm_Close_Sp_ParaSep",
"STerm_Close_Sp_CR"
]
right = [ "ATerm", "Lower", "OLetter", "Upper", "Numeric", "STerm", "CR" ]
right = [ "ATerm", "Lower", "OLetter", "Upper", "Numeric", "STerm", "CR", "LF" ]
break_state = true

[[rules]]
Expand Down

0 comments on commit 9403d59

Please sign in to comment.