Skip to content

Commit

Permalink
Use char instead of U24 in normalizer data (#2481)
Browse files Browse the repository at this point in the history
* Use char instead of U24 in normalizer data

char now has the same 3-byte ULE representation as U24, so the postcard and
the baked form do not change. (The JSON form changes, though.)
  • Loading branch information
hsivonen authored Aug 31, 2022
1 parent 780a04b commit 90db7c6
Show file tree
Hide file tree
Showing 10 changed files with 494 additions and 2,421 deletions.
55 changes: 21 additions & 34 deletions components/collator/src/elements.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,10 @@ use icu_collections::char16trie::TrieResult;
use icu_collections::codepointtrie::CodePointTrie;
use icu_normalizer::provider::DecompositionDataV1;
use icu_normalizer::provider::DecompositionTablesV1;
use icu_normalizer::u24::EMPTY_U24;
use icu_normalizer::u24::U24;
use icu_properties::CanonicalCombiningClass;
use smallvec::SmallVec;
use zerovec::ule::AsULE;
use zerovec::ule::CharULE;
use zerovec::ule::RawBytesULE;
use zerovec::ZeroSlice;

Expand Down Expand Up @@ -151,12 +150,14 @@ pub(crate) const FFFD_CE32: CollationElement32 = CollationElement32(FFFD_CE32_VA

pub(crate) const EMPTY_U16: &ZeroSlice<u16> =
ZeroSlice::<u16>::from_ule_slice(&<u16 as AsULE>::ULE::from_array([]));
const SINGLE_U16: &ZeroSlice<u16> =
const SINGLE_REPLACEMENT_CHARACTER_U16: &ZeroSlice<u16> =
ZeroSlice::<u16>::from_ule_slice(&<u16 as AsULE>::ULE::from_array([0xFFFD]));

const SINGLE_U24_ARR: [u8; 3] = [0xFD, 0xFF, 00];
const SINGLE_U24_SLICE: &[U24] = &[U24(SINGLE_U24_ARR)];
const SINGLE_U24: &ZeroSlice<U24> = unsafe { core::mem::transmute(SINGLE_U24_SLICE) };
/// An empty `ZeroSlice<char>`.
///
/// Used in this file as the "tail" half of the GIGO fallback pair
/// `(REPLACEMENT_CHARACTER, EMPTY_CHAR)` when a `scalars32` subslice
/// lookup fails.
pub(crate) const EMPTY_CHAR: &ZeroSlice<char> = ZeroSlice::new_empty();

/// A one-element `ZeroSlice<char>` built from the bytes `[0xFD, 0xFF, 0x00]`,
/// i.e. 0x00FFFD in little-endian byte order (U+FFFD REPLACEMENT CHARACTER).
///
/// Passed as the fallback to `unwrap_or_gigo` when a `scalars32` subslice
/// lookup fails, so that the GIGO path yields a single character rather than
/// nothing (matching the `u16` code path).
//
// SAFETY: relies on `CharULE` being layout-compatible with `[u8; 3]` and on
// these three bytes being a valid `CharULE` value (a Unicode scalar value
// stored little-endian).
// NOTE(review): confirm against the `CharULE` definition in `zerovec`; the
// type's layout is not visible from this file.
const SINGLE_REPLACEMENT_CHARACTER_CHAR: &ZeroSlice<char> = ZeroSlice::from_ule_slice(&[unsafe {
core::mem::transmute::<[u8; 3], CharULE>([0xFDu8, 0xFFu8, 0u8])
}]);

/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions
/// are enabled and return `default` if debug assertions are not enabled.
Expand Down Expand Up @@ -186,12 +187,6 @@ fn char_from_u16(u: u16) -> char {
char_from_u32(u32::from(u))
}

/// Convert a `U24` _obtained from data provider data_ to `char`.
#[inline(always)]
fn char_from_u24(u: U24) -> char {
char_from_u32(u.into())
}

#[inline(always)]
fn in_inclusive_range(c: char, start: char, end: char) -> bool {
u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start))
Expand Down Expand Up @@ -805,7 +800,7 @@ where
/// NFD complex decompositions on the BMP
scalars16: &'data ZeroSlice<u16>,
/// NFD complex decompositions on supplementary planes
scalars32: &'data ZeroSlice<U24>,
scalars32: &'data ZeroSlice<char>,
/// If numeric mode is enabled, the 8 high bits of the numeric primary.
/// `None` if disabled.
numeric_primary: Option<u8>,
Expand Down Expand Up @@ -1101,7 +1096,7 @@ where
let len = usize::from(trail_or_complex >> 13) + 2;
for u in unwrap_or_gigo(
self.scalars16.get_subslice(offset..offset + len),
SINGLE_U16, // single instead of empty for consistency with the other code path
SINGLE_REPLACEMENT_CHARACTER_U16, // single instead of empty for consistency with the other code path
)
.iter()
{
Expand All @@ -1113,13 +1108,12 @@ where
} else {
let len = usize::from(trail_or_complex >> 13) + 1;
let offset32 = offset - self.scalars16.len();
for u in unwrap_or_gigo(
for ch in unwrap_or_gigo(
self.scalars32.get_subslice(offset32..offset32 + len),
SINGLE_U24, // single instead of empty for consistency with the other code path
SINGLE_REPLACEMENT_CHARACTER_CHAR, // single instead of empty for consistency with the other code path
)
.iter()
{
let ch = char_from_u24(u);
let trie_value = self.trie.get(u32::from(ch));
self.upcoming
.push(CharacterAndClassAndTrieValue::new_with_non_special_decomposition_trie_val(ch, trie_value));
Expand Down Expand Up @@ -1499,28 +1493,22 @@ where
.scalars32
.get_subslice(offset32..offset32 + len)
.and_then(|slice| slice.split_first())
.map_or_else(
|| {
// GIGO case
debug_assert!(false);
(REPLACEMENT_CHARACTER, EMPTY_U24)
},
|(first, tail)| (char_from_u24(first), tail),
);
.unwrap_or_else(|| {
// GIGO case
debug_assert!(false);
(REPLACEMENT_CHARACTER, EMPTY_CHAR)
});

c = starter;
if trail_or_complex & 0x1000 != 0 {
for u in tail.iter() {
let char_from_u = char_from_u24(u);
let trie_value = self.trie.get(u32::from(char_from_u));
for ch in tail.iter() {
let trie_value = self.trie.get(u32::from(ch));
let ccc = ccc_from_trie_value(trie_value);
combining_characters
.push(CharacterAndClass::new(char_from_u, ccc));
combining_characters.push(CharacterAndClass::new(ch, ccc));
}
} else {
let mut it = tail.iter();
while let Some(u) = it.next() {
let ch = char_from_u24(u);
while let Some(ch) = it.next() {
let ccc = ccc_from_trie_value(self.trie.get(u32::from(ch)));
if ccc != CanonicalCombiningClass::NotReordered {
// As of Unicode 14, this branch is never taken.
Expand All @@ -1535,8 +1523,7 @@ where
// sort the right characters.
self.maybe_gather_combining();

while let Some(u) = it.next_back() {
let tail_char = char_from_u24(u);
while let Some(tail_char) = it.next_back() {
let trie_value = self.trie.get(u32::from(tail_char));
self.prepend_and_sort_non_starter_prefix_of_suffix(CharacterAndClassAndTrieValue::new_with_non_special_decomposition_trie_val(tail_char, trie_value));
}
Expand Down
39 changes: 14 additions & 25 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ extern crate alloc;
pub mod error;
pub mod properties;
pub mod provider;
pub mod u24;

use crate::error::NormalizerError;
use crate::provider::CanonicalDecompositionDataV1Marker;
Expand All @@ -94,8 +93,6 @@ use provider::CompatibilityDecompositionTablesV1Marker;
use provider::DecompositionSupplementV1;
use provider::DecompositionTablesV1;
use smallvec::SmallVec;
use u24::EMPTY_U24;
use u24::U24;
use utf16_iter::Utf16CharsEx;
use utf8_iter::Utf8CharsEx;
use write16::Write16;
Expand Down Expand Up @@ -242,15 +239,11 @@ fn char_from_u16(u: u16) -> char {
char_from_u32(u32::from(u))
}

/// Convert a `U24` _obtained from data provider data_ to `char`.
#[inline(always)]
fn char_from_u24(u: U24) -> char {
char_from_u32(u.into())
}

const EMPTY_U16: &ZeroSlice<u16> =
ZeroSlice::<u16>::from_ule_slice(&<u16 as AsULE>::ULE::from_array([]));

/// An empty `ZeroSlice<char>`.
///
/// Used as the value of `supplementary_scalars24` when no supplementary
/// tables are provided, and as the "tail" half of the GIGO fallback pair
/// `(REPLACEMENT_CHARACTER, EMPTY_CHAR)` when a subslice lookup fails.
const EMPTY_CHAR: &ZeroSlice<char> = ZeroSlice::new_empty();

#[inline(always)]
fn in_inclusive_range(c: char, start: char, end: char) -> bool {
u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start))
Expand Down Expand Up @@ -517,9 +510,9 @@ where
trie: &'data CodePointTrie<'data, u32>,
supplementary_trie: Option<&'data CodePointTrie<'data, u32>>,
scalars16: &'data ZeroSlice<u16>,
scalars24: &'data ZeroSlice<U24>,
scalars24: &'data ZeroSlice<char>,
supplementary_scalars16: &'data ZeroSlice<u16>,
supplementary_scalars24: &'data ZeroSlice<U24>,
supplementary_scalars24: &'data ZeroSlice<char>,
half_width_voicing_marks_become_non_starters: bool,
/// The lowest character for which either of the following does
/// not hold:
Expand Down Expand Up @@ -589,7 +582,7 @@ where
supplementary_scalars24: if let Some(supplementary) = supplementary_tables {
&supplementary.scalars24
} else {
EMPTY_U24
EMPTY_CHAR
},
half_width_voicing_marks_become_non_starters,
decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
Expand Down Expand Up @@ -647,32 +640,28 @@ where
&mut self,
low: u16,
offset: usize,
slice32: &ZeroSlice<U24>,
slice32: &ZeroSlice<char>,
) -> (char, usize) {
let len = usize::from(low >> 13) + 1;
let (starter, tail) = slice32
.get_subslice(offset..offset + len)
.and_then(|slice| slice.split_first())
.map_or_else(
|| {
// GIGO case
debug_assert!(false);
(REPLACEMENT_CHARACTER, EMPTY_U24)
},
|(first, trail)| (char_from_u24(first), trail),
);
.unwrap_or_else(|| {
// GIGO case
debug_assert!(false);
(REPLACEMENT_CHARACTER, EMPTY_CHAR)
});
if low & 0x1000 != 0 {
// All the rest are combining
for u in tail.iter() {
for ch in tail.iter() {
self.buffer
.push(CharacterAndClass::new_with_placeholder(char_from_u24(u)));
.push(CharacterAndClass::new_with_placeholder(ch));
}
(starter, 0)
} else {
let mut i = 0;
let mut combining_start = 0;
for u in tail.iter() {
let ch = char_from_u24(u);
for ch in tail.iter() {
let trie_value = self.trie.get(u32::from(ch));
self.buffer.push(CharacterAndClass::new_with_trie_value(
CharacterAndTrieValue::new(ch, trie_value),
Expand Down
9 changes: 3 additions & 6 deletions components/normalizer/src/properties.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
//! glyph-availability-guided custom normalizer.
use crate::char_from_u16;
use crate::char_from_u24;
use crate::error::NormalizerError;
use crate::in_inclusive_range;
use crate::provider::CanonicalCompositionsV1Marker;
Expand Down Expand Up @@ -269,13 +268,11 @@ impl CanonicalDecomposition {
break;
}
let offset24 = offset - tables.scalars16.len();
if let Some(first) = tables.scalars24.get(offset24) {
let first_c = char_from_u24(first);
if let Some(first_c) = tables.scalars24.get(offset24) {
if len == 1 {
return Decomposed::Singleton(first_c);
}
if let Some(second) = tables.scalars24.get(offset24 + 1) {
let second_c = char_from_u24(second);
if let Some(second_c) = tables.scalars24.get(offset24 + 1) {
return Decomposed::Expansion(first_c, second_c);
}
}
Expand Down Expand Up @@ -305,7 +302,7 @@ impl CanonicalDecomposition {
let offset = usize::from(trail_or_complex - 1);
if let Some(first) = non_recursive.scalars24.get(offset) {
if let Some(second) = non_recursive.scalars24.get(offset + 1) {
return Decomposed::Expansion(char_from_u24(first), char_from_u24(second));
return Decomposed::Expansion(first, second);
}
}
// GIGO case
Expand Down
6 changes: 2 additions & 4 deletions components/normalizer/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ use zerovec::ZeroVec;
#[cfg(feature = "serde")]
use serde;

use crate::u24::U24;

/// Main data for NFD
#[icu_provider::data_struct(CanonicalDecompositionDataV1Marker = "normalizer/nfd@1")]
#[derive(Debug, PartialEq, Clone)]
Expand Down Expand Up @@ -87,7 +85,7 @@ pub struct DecompositionTablesV1<'data> {
/// Decompositions with at least one character outside
/// the BMP
#[cfg_attr(feature = "serde", serde(borrow))]
pub scalars24: ZeroVec<'data, U24>,
pub scalars24: ZeroVec<'data, char>,
}

/// Non-Hangul canonical compositions
Expand Down Expand Up @@ -116,5 +114,5 @@ pub struct NonRecursiveDecompositionSupplementV1<'data> {
/// Decompositions with at least one character outside
/// the BMP
#[cfg_attr(feature = "serde", serde(borrow))]
pub scalars24: ZeroVec<'data, U24>,
pub scalars24: ZeroVec<'data, char>,
}
101 changes: 0 additions & 101 deletions components/normalizer/src/u24.rs

This file was deleted.

Loading

0 comments on commit 90db7c6

Please sign in to comment.