Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Unicode 15.0 sentence segmentation #4213

Merged
merged 32 commits into from
Oct 26, 2023
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
e6ac4ee
Broken monkeys and better errors
eggrobin Sep 13, 2023
a28c814
log the guts
eggrobin Sep 14, 2023
1c86263
part of a fix
eggrobin Sep 15, 2023
1df249f
more logging for good measure
eggrobin Sep 19, 2023
99ef1ad
Many fixes to the tables
eggrobin Sep 25, 2023
ac8bb20
A code change probably needed
eggrobin Sep 25, 2023
9055bdc
traces etc.
eggrobin Sep 25, 2023
60cc05a
revert tools/make/data.toml
eggrobin Oct 24, 2023
e00088b
Merge remote-tracking branch 'la-vache/main' into import-monkeys
eggrobin Oct 24, 2023
b518661
More tests, more breakages.
eggrobin Oct 24, 2023
9403d59
Progress! on to the next failure.
eggrobin Oct 24, 2023
926d11d
Il faut imaginer Sisyphe heureux.
eggrobin Oct 24, 2023
66dd136
I sure hope this is wrong, but I am no monkey.
eggrobin Oct 24, 2023
b385f18
Somehow it is not failing anymore
eggrobin Oct 24, 2023
24cf9af
An eggsplanation.
eggrobin Oct 24, 2023
e795a2a
Looks like this is all we need
eggrobin Oct 24, 2023
c97b87f
Remove traces
eggrobin Oct 24, 2023
9187a76
Keep it at 100 test cases
eggrobin Oct 24, 2023
d65fecf
Just note it
eggrobin Oct 24, 2023
fcfc430
cargo make testdata
eggrobin Oct 24, 2023
1453cca
cargo fmt
eggrobin Oct 24, 2023
cf2198a
appease clippy
eggrobin Oct 24, 2023
4482f6d
Try to appease clippy while giving myself a place to restore the traces
eggrobin Oct 24, 2023
02dda40
Bit twiddling
eggrobin Oct 24, 2023
bb69e2b
properties data
eggrobin Oct 24, 2023
925dc3b
properties data
eggrobin Oct 24, 2023
4aeef2b
?????
eggrobin Oct 24, 2023
bd34306
Merge branch 'import-monkeys' of https://github.com/eggrobin/icu4x in…
eggrobin Oct 24, 2023
ad6727a
No need for the conditional unless logging
eggrobin Oct 24, 2023
21aca87
Bring back the traces
eggrobin Oct 24, 2023
785bf07
Revert "Bring back the traces"
eggrobin Oct 24, 2023
824ae44
Merge remote-tracking branch 'la-vache/main' into import-monkeys
eggrobin Oct 25, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion components/segmenter/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ serde = { version = "1.0", default-features = false, features = ["derive"] }
serde_json = "1.0"
icu = { path = "../../components/icu", default-features = false }
itertools = "0.10"
icu_properties = { workspace = true }

[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies]
criterion = "0.4"
Expand All @@ -51,7 +52,7 @@ serde = ["dep:serde", "zerovec/serde", "icu_collections/serde", "icu_provider/se
datagen = ["serde", "dep:databake", "zerovec/databake", "icu_collections/databake"]
lstm = ["dep:core_maths"]
auto = ["lstm"] # Enabled try_new_auto_unstable constructors
compiled_data = ["dep:icu_segmenter_data"]
compiled_data = ["dep:icu_segmenter_data", "icu_properties/compiled_data"]
bench = []

[lib]
Expand Down

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions components/segmenter/src/rule_segmenter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'
let mut previous_pos_data = self.current_pos_data;
let mut previous_left_prop = left_prop;

if (break_state & INTERMEDIATE_MATCH_RULE) != 0 {
break_state &= !INTERMEDIATE_MATCH_RULE;
}
loop {
self.advance_iter();

Expand Down
35 changes: 33 additions & 2 deletions components/segmenter/tests/spec_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,11 +213,42 @@ fn run_grapheme_break_test() {
fn sentence_break_test(filename: &str) {
let test_iter = TestContentIterator::new(filename);
let segmenter = SentenceSegmenter::new();
for test in test_iter {
for (i, test) in test_iter.enumerate() {
let s: String = test.utf8_vec.into_iter().collect();
let iter = segmenter.segment_str(&s);
let result: Vec<usize> = iter.collect();
assert_eq!(result, test.break_result_utf8, "{}", test.original_line);
if result != test.break_result_utf8 {
let sb = icu::properties::maps::sentence_break();
let sb_name = icu::properties::SentenceBreak::enum_to_long_name_mapper();
let mut iter = segmenter.segment_str(&s);
// TODO(egg): It would be really nice to have Name here.
println!(" | A | E | Code pt. | Sentence_Break | State | Literal");
for (i, c) in s.char_indices() {
let expected_break = test.break_result_utf8.contains(&i);
let actual_break = result.contains(&i);
if actual_break {
iter.next();
}
println!(
"{}| {} | {} | {:>8} | {:>14} | {} | {}",
if actual_break != expected_break {
"😭"
} else {
" "
},
if actual_break { "÷" } else { "×" },
if expected_break { "÷" } else { "×" },
format!("{:04X}", c as u32),
sb_name
.get(sb.get(c))
.unwrap_or(&format!("{:?}", sb.get(c))),
"?".repeat(5),
c
)
}
println!("Test case #{}", i);
panic!()
}

let iter = segmenter.segment_utf16(&test.utf16_vec);
let result: Vec<usize> = iter.collect();
Expand Down
108 changes: 108 additions & 0 deletions components/segmenter/tests/testdata/SentenceBreakExtraTest.txt

Large diffs are not rendered by default.

79 changes: 76 additions & 3 deletions provider/datagen/src/transform/segmenter/rules/sentence.toml
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,18 @@ name = "Lower_ATerm"
left = "Lower_ATerm"
right = "Format"

[[tables]]
# SB5
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp_SB8"
right = "Extend"

[[tables]]
# SB5
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp_SB8"
right = "Format"

[[tables]]
# SB7
name = "Upper_ATerm"
Expand Down Expand Up @@ -385,6 +397,41 @@ name = "STerm_Close_Sp_ParaSep"
left = "STerm_Close_Sp"
right = "LF"

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm"
right = "Unknown"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "Upper_ATerm"
right = "Unknown"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "Lower_ATerm"
right = "Unknown"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close"
right = "Numeric"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close"
right = "Unknown"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
Expand All @@ -399,6 +446,13 @@ left = "ATerm_Close_Sp"
right = "Numeric"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp"
right = "Unknown"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
Expand All @@ -417,6 +471,24 @@ name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp_SB8"
right = "Numeric"

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp_SB8"
right = "Sp"

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp_SB8"
right = "Unknown"

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp_SB8"
right = "SContinue"

[[rules]]
# SB1
left = [ "sot" ]
Expand Down Expand Up @@ -561,19 +633,20 @@ left = [
"STerm_Close_Sp_ParaSep",
"STerm_Close_Sp_CR"
]
right = [ "ATerm", "Lower", "OLetter", "Upper", "Numeric", "STerm" ]
# NOTE(egg): left=Meow_CR, right=LF would be wrong, but it is prevented by another rule above.
right = [ "ATerm", "Lower", "OLetter", "Upper", "Numeric", "STerm", "CR", "LF", "Sep" ]
break_state = true

[[rules]]
# SB11
left = [ "ATerm_Close_Sp", "STerm_Close_Sp" ]
right = [ "Numeric", "Upper", "Close", "Unknown", "OLetter" ]
right = [ "Numeric", "Upper", "Lower", "Close", "Unknown", "OLetter" ]
break_state = true

[[rules]]
# SB11
left = [ "ATerm_Close", "STerm", "STerm_Close" ]
right = [ "Numeric", "Upper" ]
right = [ "Numeric", "Upper", "Lower", "OLetter", "Unknown" ]
break_state = true

[[rules]]
Expand Down
Loading