Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add initial ZeroHashMap #2579

Merged
merged 39 commits into from
Feb 13, 2023
Merged
Changes from 1 commit
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
d455634
add core functionality of ZeroHashMap based on CHD algorithm
pdogr Sep 19, 2022
9c4bba2
remove keys from HashIndex,
pdogr Sep 19, 2022
4423552
rename variables, use vec! macro
pdogr Sep 19, 2022
3aa891e
remove #[macro_use] and directly import macro
pdogr Sep 19, 2022
9d01fd5
add benchmarks for zerohashmap
pdogr Sep 20, 2022
2ee9633
make functions inline
pdogr Sep 20, 2022
92700b4
add hashmap feature for ZeroHashMapStatic
pdogr Sep 21, 2022
5bd7ddb
remove unnecessary Iterator impl
pdogr Sep 21, 2022
027a3ce
Merge branch 'unicode-org:main' into hm
pdogr Sep 21, 2022
3a17393
Apply reverse permutation in HashIndex building using zvl_permute
pdogr Sep 21, 2022
9284bde
Derive serde for HashIndex, ZeroHashMapStatic
pdogr Sep 21, 2022
6ebde0e
modify generation algorithm using hashing only once
pdogr Sep 23, 2022
1d5f211
replace ahash with wyhash
pdogr Sep 23, 2022
305038f
Merge branch 'unicode-org:main' into hm
pdogr Sep 23, 2022
69a3355
Merge branch 'unicode-org:main' into hm
pdogr Dec 19, 2022
d9c52f5
Merge branch 'unicode-org:main' into hm
pdogr Dec 24, 2022
6781a16
Move zerohashmap into a separate module
pdogr Dec 24, 2022
12045d5
Merge branch 'main' into hm
pdogr Jan 3, 2023
40e89a6
move everything to mod.rs
pdogr Jan 8, 2023
553705b
Use t1ha hash function
pdogr Jan 8, 2023
7bf72b0
s/ZeroHashMapStatic/ZeroHashMap
pdogr Jan 8, 2023
061a38e
Add docs, fix zhm lookup bench, add zhm deserialize bench
pdogr Jan 9, 2023
b0d7b8b
add zhm deserialize benches
pdogr Jan 9, 2023
3d10558
remove hashindex and refactor code
pdogr Jan 12, 2023
35b8393
move common functions to algorithms module
pdogr Jan 12, 2023
52d9e79
impl FromIterator for zhm
pdogr Jan 12, 2023
1cebd79
Update utils/zerovec/src/hashmap/mod.rs
pdogr Jan 12, 2023
3e1a1d8
remove borrow, pub changes
pdogr Jan 12, 2023
51e577e
add Hash to make_ule
pdogr Jan 13, 2023
900c668
add contains_key, iter_keys, iter_values, iter
pdogr Jan 13, 2023
575a040
minor benchmark refactor
pdogr Jan 13, 2023
60f4d7f
pass m as usize
pdogr Jan 19, 2023
ffe15f6
fix maths
pdogr Jan 19, 2023
dbe8882
remove inlining
pdogr Jan 19, 2023
8e6dae9
custom serde for zerohashmap to bake in length validations
pdogr Feb 9, 2023
96fd1c2
revert back to u32 arithmetic
pdogr Feb 9, 2023
a2abdde
Merge branch 'main' into hm
pdogr Feb 9, 2023
715f667
fix clippy errors
pdogr Feb 9, 2023
7811713
add derive Hash to make_ule
pdogr Feb 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 39 additions & 39 deletions utils/zerovec/src/map/hashmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use super::*;
use crate::flexzerovec::{FlexZeroVec, FlexZeroVecOwned};
use crate::ule::AsULE;
use crate::ZeroVec;
use ahash::AHasher;
use alloc::borrow::Borrow;
use alloc::vec::{from_elem, Vec};
Expand All @@ -16,40 +16,40 @@ fn create_hasher_with_seed(seed: u128) -> AHasher {
}

#[inline]
fn compute_hash<K: Hash>(seed: u32, k: K, m: usize) -> u32 {
fn compute_hash<K: Hash>(seed: u32, k: K, m: usize) -> usize {
let mut hasher = create_hasher_with_seed(seed.into());
k.hash(&mut hasher);
(hasher.finish() % m as u64) as u32
(hasher.finish() as usize % m) as usize
pdogr marked this conversation as resolved.
Show resolved Hide resolved
}

#[derive(Debug, Clone)]
#[derive(Debug)]
pub struct HashIndex<'a> {
pdogr marked this conversation as resolved.
Show resolved Hide resolved
displacements: ZeroVec<'a, u32>,
reverse_mapping: ZeroVec<'a, u32>,
displacements: FlexZeroVec<'a>,
reverse_mapping: FlexZeroVec<'a>,
}

impl<'a> HashIndex<'a> {
#[inline]
pub fn build_from_exact_iter<K, I, A>(iter: I) -> Self
pub fn build_from_exact_iter<K, I, A>(keys: I) -> Self
where
A: Borrow<K>,
K: 'a + ?Sized + Hash,
I: ExactSizeIterator<Item = A>,
{
HashIndex::build_from_exact_iter_with_hash_fn(iter, |seed, k, len| {
HashIndex::build_from_exact_iter_with_hash_fn(keys, |seed, k, len| {
pdogr marked this conversation as resolved.
Show resolved Hide resolved
compute_hash(seed, k, len)
})
}

#[inline]
pub fn build_from_exact_iter_with_hash_fn<K, I, A, H>(iter: I, h: H) -> Self
pub fn build_from_exact_iter_with_hash_fn<K, I, A, H>(keys: I, h: H) -> Self
where
A: Borrow<K>,
K: 'a + ?Sized,
I: ExactSizeIterator<Item = A>,
H: Fn(u32, &K, usize) -> u32,
H: Fn(u32, &K, usize) -> usize,
{
let iter_len = iter.len();
let iter_len = keys.len();
pdogr marked this conversation as resolved.
Show resolved Hide resolved

// A vector to track the size of buckets for sorting.
let mut bucket_sizes = from_elem(0, iter_len);
pdogr marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -58,7 +58,7 @@ impl<'a> HashIndex<'a> {
let mut bucket_flatten = Vec::with_capacity(iter_len);

// Compute initial displacement and bucket sizes
for (i, k) in iter.enumerate() {
for (i, k) in keys.enumerate() {
// Compute first level hash of the key bytes.
// First level uses a seed value of 0.
let l1 = h(0x00, k.borrow(), iter_len);
Expand Down Expand Up @@ -129,7 +129,7 @@ impl<'a> HashIndex<'a> {

// Successfully found a seed, store it as index l1.
if let Some(v) = displacements.get_mut(l1) {
*v = seed;
*v = seed as usize;
}

for (i, displacement_idx) in current_displacements.iter().enumerate() {
Expand All @@ -142,7 +142,7 @@ impl<'a> HashIndex<'a> {
*v = true;
}
if let Some(v) = reverse_mapping.get_mut(*displacement_idx) {
*v = *original_idx as u32;
*v = *original_idx;
}
}
break;
Expand All @@ -151,8 +151,12 @@ impl<'a> HashIndex<'a> {
}

Self {
displacements: ZeroVec::alloc_from_slice(&displacements),
reverse_mapping: ZeroVec::alloc_from_slice(&reverse_mapping),
displacements: FlexZeroVec::Owned(FlexZeroVecOwned::from_iter(
displacements.into_iter(),
)),
reverse_mapping: FlexZeroVec::Owned(FlexZeroVecOwned::from_iter(
reverse_mapping.into_iter(),
)),
}
}

Expand All @@ -164,47 +168,46 @@ impl<'a> HashIndex<'a> {
let l1 = compute_hash(0, k, self.displacements.len());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will panic if the map is empty, because compute_hash will then evaluate `% 0`. Document a non-zero `m` as a precondition of compute_hash and guard against the empty case here.


#[allow(clippy::unwrap_used)] // l1 is in 0..self.displacements.len()
let seed = self.displacements.get(l1 as usize).unwrap();
let seed = self.displacements.get(l1).unwrap();
if seed == 0 {
None
} else {
let hash = compute_hash(seed, k, self.displacements.len());
self.reverse_mapping.get(hash as usize).map(|i| i as usize)
let hash = compute_hash(seed as u32, k, self.displacements.len());
self.reverse_mapping.get(hash).map(|i| i as usize)
pdogr marked this conversation as resolved.
Show resolved Hide resolved
}
}
}

pub struct ZeroHashMapStatic<'a, K, V>
pub struct ZeroHashMapStatic<'a, V>
where
K: ZeroMapKV<'a> + ?Sized,
V: ZeroMapKV<'a> + ?Sized,
{
index: HashIndex<'a>,
pdogr marked this conversation as resolved.
Show resolved Hide resolved
keys: K::Container,
values: V::Container,
}

impl<'a, K, V> ZeroHashMapStatic<'a, K, V>
impl<'a, V> ZeroHashMapStatic<'a, V>
where
K: ZeroMapKV<'a> + ?Sized,
V: ZeroMapKV<'a> + ?Sized,
{
pub fn len(&self) -> usize {
self.keys.zvl_len()
self.values.zvl_len()
}

pub fn is_empty(&self) -> bool {
self.len() == 0
}
}

impl<'a, K, V> ZeroHashMapStatic<'a, K, V>
impl<'a, V> ZeroHashMapStatic<'a, V>
where
K: ZeroMapKV<'a, Container = ZeroVec<'a, K>> + Hash + AsULE + ?Sized + 'static,
V: ZeroMapKV<'a> + ?Sized,
{
#[inline]
pub fn get<'b>(&'b self, key: &'b K) -> Option<&'b V::GetType> {
pub fn get<'b, K>(&'b self, key: &'b K) -> Option<&'b V::GetType>
where
K: Hash + AsULE + ?Sized,
{
self.index.index(key).map(|i| {
#[allow(clippy::unwrap_used)] // i is in 0..values.len() and there is a value at i
self.values.zvl_get(i as usize).unwrap()
Expand All @@ -218,30 +221,27 @@ where
/// use zerovec::ZeroHashMapStatic;
///
/// let kv: Vec<(i32, &str)> = vec![(1,"a"), (2, "b"),(3, "c"),(4 , "d")];
/// let hashmap: ZeroHashMapStatic<i32, str> = ZeroHashMapStatic::from_exact_iter(kv.into_iter());
/// let hashmap: ZeroHashMapStatic<str> = ZeroHashMapStatic::from_exact_iter(kv.into_iter());
/// assert_eq!(hashmap.get(&1), Some("a"));
/// assert_eq!(hashmap.get(&2), Some("b"));
/// assert_eq!(hashmap.get(&3), Some("c"));
/// assert_eq!(hashmap.get(&4), Some("d"));
/// ```
pub fn from_exact_iter<A, B, I>(iter: I) -> Self
pub fn from_exact_iter<A, B, I, K>(iter: I) -> Self
where
A: Borrow<K>,
B: Borrow<V>,
K: Hash + AsULE + ?Sized + 'a,
I: ExactSizeIterator<Item = (A, B)>,
pdogr marked this conversation as resolved.
Show resolved Hide resolved
{
let mut keys = K::Container::zvl_with_capacity(iter.len());
let mut keys = Vec::with_capacity(iter.len());
let mut values = V::Container::zvl_with_capacity(iter.len());
for (k, v) in iter {
keys.zvl_push(k.borrow());
keys.push(k);
values.zvl_push(v.borrow());
}
let index = HashIndex::build_from_exact_iter::<K, _, _>(keys.iter());
Self {
index,
keys,
values,
}
let index = HashIndex::build_from_exact_iter::<K, _, _>(keys.into_iter());
Self { index, values }
}
}

Expand All @@ -258,7 +258,7 @@ mod tests {
let rng = Lcg64Xsh32::seed_from_u64(seed);
let kv: Vec<(u64, u64)> = rng.sample_iter(&Standard).take(N).collect();
let kv_copy = kv.clone();
let hashmap: ZeroHashMapStatic<u64, u64> =
let hashmap: ZeroHashMapStatic<u64> =
ZeroHashMapStatic::from_exact_iter(kv_copy.into_iter());
for (k, v) in kv {
assert_eq!(
Expand Down