Skip to content

Commit

Permalink
Use key attributes for segmenter (#4985)
Browse files (browse the repository at this point in the history)
  • Loading branch information
robertbastian authored Jun 3, 2024
1 parent b448129 commit 05b220f
Show file tree
Hide file tree
Showing 15 changed files with 189 additions and 177 deletions.
2 changes: 1 addition & 1 deletion components/segmenter/src/complex/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ mod tests {
fn cj_dictionary_test() {
let dict_payload: DataPayload<DictionaryForWordOnlyAutoV1Marker> = crate::provider::Baked
.load(DataRequest {
locale: &icu_locale_core::langid!("ja").into(),
key_attributes: &"cjdict".parse().unwrap(),
..Default::default()
})
.unwrap()
Expand Down
3 changes: 1 addition & 2 deletions components/segmenter/src/complex/lstm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,6 @@ fn compute_hc<'a>(
#[cfg(test)]
mod tests {
use super::*;
use icu_locale_core::langid;
use icu_provider::prelude::*;
use serde::Deserialize;

Expand Down Expand Up @@ -349,7 +348,7 @@ mod tests {
fn segment_file_by_lstm() {
let lstm: DataPayload<LstmForWordLineAutoV1Marker> = crate::provider::Baked
.load(DataRequest {
locale: &langid!("th").into(),
key_attributes: &"Thai_codepoints_exclusive_model4_heavy".parse().unwrap(),
..Default::default()
})
.unwrap()
Expand Down
213 changes: 127 additions & 86 deletions components/segmenter/src/complex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

use crate::provider::*;
use alloc::vec::Vec;
use icu_locale_core::{langid, LanguageIdentifier};
use icu_provider::prelude::*;

mod dictionary;
Expand Down Expand Up @@ -78,22 +77,34 @@ impl ComplexPayloads {
grapheme: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
),
my: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, langid!("my"))
.unwrap()
.map(DataPayload::cast)
.map(Err),
km: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, langid!("km"))
.unwrap()
.map(DataPayload::cast)
.map(Err),
lo: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, langid!("lo"))
.unwrap()
.map(DataPayload::cast)
.map(Err),
th: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, langid!("th"))
.unwrap()
.map(DataPayload::cast)
.map(Err),
my: try_load::<LstmForWordLineAutoV1Marker, _>(
&crate::provider::Baked,
"Burmese_codepoints_exclusive_model4_heavy",
)
.unwrap()
.map(DataPayload::cast)
.map(Err),
km: try_load::<LstmForWordLineAutoV1Marker, _>(
&crate::provider::Baked,
"Khmer_codepoints_exclusive_model4_heavy",
)
.unwrap()
.map(DataPayload::cast)
.map(Err),
lo: try_load::<LstmForWordLineAutoV1Marker, _>(
&crate::provider::Baked,
"Lao_codepoints_exclusive_model4_heavy",
)
.unwrap()
.map(DataPayload::cast)
.map(Err),
th: try_load::<LstmForWordLineAutoV1Marker, _>(
&crate::provider::Baked,
"Thai_codepoints_exclusive_model4_heavy",
)
.unwrap()
.map(DataPayload::cast)
.map(Err),
ja: None,
}
}
Expand All @@ -107,18 +118,30 @@ impl ComplexPayloads {
{
Ok(Self {
grapheme: provider.load(Default::default())?.take_payload()?,
my: try_load::<LstmForWordLineAutoV1Marker, D>(provider, langid!("my"))?
.map(DataPayload::cast)
.map(Err),
km: try_load::<LstmForWordLineAutoV1Marker, D>(provider, langid!("km"))?
.map(DataPayload::cast)
.map(Err),
lo: try_load::<LstmForWordLineAutoV1Marker, D>(provider, langid!("lo"))?
.map(DataPayload::cast)
.map(Err),
th: try_load::<LstmForWordLineAutoV1Marker, D>(provider, langid!("th"))?
.map(DataPayload::cast)
.map(Err),
my: try_load::<LstmForWordLineAutoV1Marker, D>(
provider,
"Burmese_codepoints_exclusive_model4_heavy",
)?
.map(DataPayload::cast)
.map(Err),
km: try_load::<LstmForWordLineAutoV1Marker, D>(
provider,
"Khmer_codepoints_exclusive_model4_heavy",
)?
.map(DataPayload::cast)
.map(Err),
lo: try_load::<LstmForWordLineAutoV1Marker, D>(
provider,
"Lao_codepoints_exclusive_model4_heavy",
)?
.map(DataPayload::cast)
.map(Err),
th: try_load::<LstmForWordLineAutoV1Marker, D>(
provider,
"Thai_codepoints_exclusive_model4_heavy",
)?
.map(DataPayload::cast)
.map(Err),
ja: None,
})
}
Expand All @@ -133,38 +156,35 @@ impl ComplexPayloads {
),
my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
&crate::provider::Baked,
langid!("my"),
"burmesedict",
)
.unwrap()
.map(DataPayload::cast)
.map(Ok),
km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
&crate::provider::Baked,
langid!("km"),
"khmerdict",
)
.unwrap()
.map(DataPayload::cast)
.map(Ok),
lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
&crate::provider::Baked,
langid!("lo"),
"laodict",
)
.unwrap()
.map(DataPayload::cast)
.map(Ok),
th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
&crate::provider::Baked,
langid!("th"),
"thaidict",
)
.unwrap()
.map(DataPayload::cast)
.map(Ok),
ja: try_load::<DictionaryForWordOnlyAutoV1Marker, _>(
&crate::provider::Baked,
langid!("ja"),
)
.unwrap()
.map(DataPayload::cast),
ja: try_load::<DictionaryForWordOnlyAutoV1Marker, _>(&crate::provider::Baked, "cjdict")
.unwrap()
.map(DataPayload::cast),
}
}

Expand All @@ -177,19 +197,19 @@ impl ComplexPayloads {
{
Ok(Self {
grapheme: provider.load(Default::default())?.take_payload()?,
my: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, langid!("my"))?
my: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, "burmesedict")?
.map(DataPayload::cast)
.map(Ok),
km: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, langid!("km"))?
km: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, "khmerdict")?
.map(DataPayload::cast)
.map(Ok),
lo: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, langid!("lo"))?
lo: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, "laodict")?
.map(DataPayload::cast)
.map(Ok),
th: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, langid!("th"))?
th: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, "thaidict")?
.map(DataPayload::cast)
.map(Ok),
ja: try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, langid!("ja"))?
ja: try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, "cjdict")?
.map(DataPayload::cast),
})
}
Expand All @@ -203,28 +223,37 @@ impl ComplexPayloads {
grapheme: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
),
my: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, langid!("my"))
.unwrap()
.map(DataPayload::cast)
.map(Err),
km: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, langid!("km"))
.unwrap()
.map(DataPayload::cast)
.map(Err),
lo: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, langid!("lo"))
.unwrap()
.map(DataPayload::cast)
.map(Err),
th: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, langid!("th"))
.unwrap()
.map(DataPayload::cast)
.map(Err),
ja: try_load::<DictionaryForWordOnlyAutoV1Marker, _>(
my: try_load::<LstmForWordLineAutoV1Marker, _>(
&crate::provider::Baked,
"Burmese_codepoints_exclusive_model4_heavy",
)
.unwrap()
.map(DataPayload::cast)
.map(Err),
km: try_load::<LstmForWordLineAutoV1Marker, _>(
&crate::provider::Baked,
"Khmer_codepoints_exclusive_model4_heavy",
)
.unwrap()
.map(DataPayload::cast)
.map(Err),
lo: try_load::<LstmForWordLineAutoV1Marker, _>(
&crate::provider::Baked,
langid!("ja"),
"Lao_codepoints_exclusive_model4_heavy",
)
.unwrap()
.map(DataPayload::cast),
.map(DataPayload::cast)
.map(Err),
th: try_load::<LstmForWordLineAutoV1Marker, _>(
&crate::provider::Baked,
"Thai_codepoints_exclusive_model4_heavy",
)
.unwrap()
.map(DataPayload::cast)
.map(Err),
ja: try_load::<DictionaryForWordOnlyAutoV1Marker, _>(&crate::provider::Baked, "cjdict")
.unwrap()
.map(DataPayload::cast),
}
}

Expand All @@ -238,19 +267,31 @@ impl ComplexPayloads {
{
Ok(Self {
grapheme: provider.load(Default::default())?.take_payload()?,
my: try_load::<LstmForWordLineAutoV1Marker, D>(provider, langid!("my"))?
.map(DataPayload::cast)
.map(Err),
km: try_load::<LstmForWordLineAutoV1Marker, D>(provider, langid!("km"))?
.map(DataPayload::cast)
.map(Err),
lo: try_load::<LstmForWordLineAutoV1Marker, D>(provider, langid!("lo"))?
.map(DataPayload::cast)
.map(Err),
th: try_load::<LstmForWordLineAutoV1Marker, D>(provider, langid!("th"))?
.map(DataPayload::cast)
.map(Err),
ja: try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, langid!("ja"))?
my: try_load::<LstmForWordLineAutoV1Marker, D>(
provider,
"Burmese_codepoints_exclusive_model4_heavy",
)?
.map(DataPayload::cast)
.map(Err),
km: try_load::<LstmForWordLineAutoV1Marker, D>(
provider,
"Khmer_codepoints_exclusive_model4_heavy",
)?
.map(DataPayload::cast)
.map(Err),
lo: try_load::<LstmForWordLineAutoV1Marker, D>(
provider,
"Lao_codepoints_exclusive_model4_heavy",
)?
.map(DataPayload::cast)
.map(Err),
th: try_load::<LstmForWordLineAutoV1Marker, D>(
provider,
"Thai_codepoints_exclusive_model4_heavy",
)?
.map(DataPayload::cast)
.map(Err),
ja: try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, "cjdict")?
.map(DataPayload::cast),
})
}
Expand All @@ -265,28 +306,28 @@ impl ComplexPayloads {
),
my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
&crate::provider::Baked,
langid!("my"),
"burmesedict",
)
.unwrap()
.map(DataPayload::cast)
.map(Ok),
km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
&crate::provider::Baked,
langid!("km"),
"khmerdict",
)
.unwrap()
.map(DataPayload::cast)
.map(Ok),
lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
&crate::provider::Baked,
langid!("lo"),
"laodict",
)
.unwrap()
.map(DataPayload::cast)
.map(Ok),
th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
&crate::provider::Baked,
langid!("th"),
"thaidict",
)
.unwrap()
.map(DataPayload::cast)
Expand All @@ -303,16 +344,16 @@ impl ComplexPayloads {
{
Ok(Self {
grapheme: provider.load(Default::default())?.take_payload()?,
my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, langid!("my"))?
my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, "burmesedict")?
.map(DataPayload::cast)
.map(Ok),
km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, langid!("km"))?
km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, "khmerdict")?
.map(DataPayload::cast)
.map(Ok),
lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, langid!("lo"))?
lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, "laodict")?
.map(DataPayload::cast)
.map(Ok),
th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, langid!("th"))?
th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, "thaidict")?
.map(DataPayload::cast)
.map(Ok),
ja: None,
Expand All @@ -322,10 +363,10 @@ impl ComplexPayloads {

fn try_load<M: KeyedDataMarker, P: DataProvider<M> + ?Sized>(
provider: &P,
locale: LanguageIdentifier,
model: &'static str,
) -> Result<Option<DataPayload<M>>, DataError> {
match provider.load(DataRequest {
locale: &locale.into(),
key_attributes: &model.parse().unwrap(),
metadata: {
let mut m = DataRequestMetadata::default();
m.silent = true;
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Loading

0 comments on commit 05b220f

Please sign in to comment.