Skip to content

Commit

Permalink
refactor builder
Browse files Browse the repository at this point in the history
Signed-off-by: Little-Wallace <[email protected]>
  • Loading branch information
Little-Wallace committed Jan 19, 2023
1 parent 93dc5f8 commit 523b5a4
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 91 deletions.
19 changes: 6 additions & 13 deletions src/storage/src/hummock/iterator/backward_user.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1017,7 +1017,7 @@ mod tests {
let begin_key: usize = rng.gen_range(1..=end_key);
let begin_key_bytes = key_from_num(begin_key);
chaos_test_case(
clone_sst(&sst),
sst.clone(),
Unbounded,
Unbounded,
&truth,
Expand All @@ -1038,7 +1038,7 @@ mod tests {
let begin_key: usize = rng.gen_range(1..=end_key);
let begin_key_bytes = key_from_num(begin_key);
chaos_test_case(
clone_sst(&sst),
sst.clone(),
Unbounded,
Included(end_key_bytes.clone()),
&truth,
Expand All @@ -1059,7 +1059,7 @@ mod tests {
let begin_key: usize = rng.gen_range(1..=end_key);
let begin_key_bytes = key_from_num(begin_key);
chaos_test_case(
clone_sst(&sst),
sst.clone(),
Included(begin_key_bytes.clone()),
Unbounded,
&truth,
Expand All @@ -1080,7 +1080,7 @@ mod tests {
let begin_key: usize = rng.gen_range(1..=end_key);
let begin_key_bytes = key_from_num(begin_key);
chaos_test_case(
clone_sst(&sst),
sst.clone(),
Excluded(begin_key_bytes.clone()),
Unbounded,
&truth,
Expand All @@ -1101,7 +1101,7 @@ mod tests {
let begin_key: usize = rng.gen_range(1..=end_key);
let begin_key_bytes = key_from_num(begin_key);
chaos_test_case(
clone_sst(&sst),
sst.clone(),
Included(begin_key_bytes.clone()),
Included(end_key_bytes.clone()),
&truth,
Expand All @@ -1122,7 +1122,7 @@ mod tests {
let begin_key: usize = rng.gen_range(1..=end_key);
let begin_key_bytes = key_from_num(begin_key);
chaos_test_case(
clone_sst(&sst),
sst.clone(),
Excluded(begin_key_bytes),
Included(end_key_bytes),
&truth,
Expand All @@ -1132,13 +1132,6 @@ mod tests {
}
}

/// Builds a new `Sstable` from the `id` of `sst` and a clone of its `meta`.
fn clone_sst(sst: &Sstable) -> Sstable {
    let id = sst.id;
    let meta = sst.meta.clone();
    Sstable { id, meta }
}

#[tokio::test]
async fn test_min_epoch() {
let sstable_store = mock_sstable_store();
Expand Down
162 changes: 108 additions & 54 deletions src/storage/src/hummock/sstable/bloom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
use std::f64;

use bytes::BufMut;
use xxhash_rust::xxh32;

pub trait BitSlice {
fn get_bit(&self, idx: usize) -> bool;
Expand All @@ -27,6 +28,13 @@ pub trait BitSliceMut {
fn set_bit(&mut self, idx: usize, val: bool);
}

/// Collects keys one at a time and then produces the encoded filter data for them.
// NOTE(review): the trait name looks like a typo of `FilterBuilder`; renaming it would touch
// every implementor and caller, so it is left as-is here.
pub trait FilterBuildeer {
/// Adds a key that the finished filter must be able to match.
fn add_key(&mut self, key: &[u8]);
/// Builds the filter data from the keys added so far and returns it as bytes.
fn finish(&mut self) -> Vec<u8>;
}

impl<T: AsRef<[u8]>> BitSlice for T {
fn get_bit(&self, idx: usize) -> bool {
let pos = idx / 8;
Expand All @@ -52,42 +60,105 @@ impl<T: AsMut<[u8]>> BitSliceMut for T {
}

/// Bloom implements Bloom filter functionalities over a bit-slice of data.
pub struct Bloom<'a> {
#[derive(Clone)]
pub struct BloomFilterReader {
/// data of filter in bits
filter: &'a [u8],
data: Vec<u8>,
/// number of hash functions
k: u8,
}

impl<'a> Bloom<'a> {
impl BloomFilterReader {
/// Creates a Bloom filter from a byte slice
pub fn new(buf: &'a [u8]) -> Self {
let filter = &buf[..buf.len() - 1];
pub fn new(mut buf: Vec<u8>) -> Self {
if buf.len() <= 1 {
return Self { data: vec![], k: 0 };
}
let k = buf[buf.len() - 1];
Self { filter, k }
buf.resize(buf.len() - 1, 0);
Self { data: buf, k }
}

/// Returns `true` when this reader holds no filter bytes (`data` is empty).
pub fn is_empty(&self) -> bool {
self.data.is_empty()
}

/// Tests whether a user key with hash `h` could be present in the table.
///
/// Note:
/// - `false` means the table surely does not have a user key with this hash;
/// - `true` is inconclusive: the table may or may not actually have such a key.
pub fn may_match(&self, mut h: u32) -> bool {
    // `k > 30` is set aside as a "potential new encoding for short Bloom filters"; in that
    // case the filter never rules a key out.
    if self.k > 30 {
        return true;
    }
    let nbits = self.data.bit_len();
    // Per-probe increment: `h` rotated right by 17 bits.
    let delta = h.rotate_right(17);
    for _ in 0..self.k {
        let bit_pos = (h % (nbits as u32)) as usize;
        if !self.data.get_bit(bit_pos) {
            return false;
        }
        h = h.wrapping_add(delta);
    }
    true
}
}

/// Collects key hashes and builds Bloom filter data from them.
pub struct BloomFilterBuilder {
/// 32-bit hashes of the keys added so far.
key_hash_entries: Vec<u32>,
/// Target false positive rate used to choose the number of bits per key.
bloom_false_positive: f64,
}

impl BloomFilterBuilder {
/// Creates a builder that targets the `bloom_false_positive` rate and reserves room for
/// `capacity` key hashes up front.
///
/// A `capacity` of 0 needs no special case: `Vec::with_capacity(0)` does not allocate.
pub fn new(bloom_false_positive: f64, capacity: usize) -> Self {
    Self {
        key_hash_entries: Vec::with_capacity(capacity),
        bloom_false_positive,
    }
}

/// Estimates the length as 4 bytes (one `u32`) per key hash collected so far.
pub fn approximate_len(&self) -> usize {
    let hash_count = self.key_hash_entries.len();
    hash_count * std::mem::size_of::<u32>()
}

/// Gets Bloom filter bits per key from entries count and FPR
pub fn bloom_bits_per_key(entries: usize, false_positive_rate: f64) -> usize {
fn bloom_bits_per_key(entries: usize, false_positive_rate: f64) -> usize {
let size = -1.0 * (entries as f64) * false_positive_rate.ln() / f64::consts::LN_2.powi(2);
let locs = (size / (entries as f64)).ceil();
locs as usize
}
}

/// Builds Bloom filter from key hashes
pub fn build_from_key_hashes(keys: &[u32], bits_per_key: usize) -> Vec<u8> {
impl FilterBuildeer for BloomFilterBuilder {
/// Hashes `key` with xxh32 (seed 0) and records the hash for use by `finish`.
fn add_key(&mut self, key: &[u8]) {
    let hash = xxh32::xxh32(key, 0);
    self.key_hash_entries.push(hash);
}

fn finish(&mut self) -> Vec<u8> {
let bits_per_key =
Self::bloom_bits_per_key(self.key_hash_entries.len(), self.bloom_false_positive);
// 0.69 is approximately ln(2)
let k = ((bits_per_key as f64) * 0.69) as u32;
// limit k in [1, 30]
let k = k.clamp(1, 30);
// For small len(keys), we set a minimum Bloom filter length to avoid high FPR
let nbits = (keys.len() * bits_per_key).max(64);
let nbits = (self.key_hash_entries.len() * bits_per_key).max(64);
let nbytes = (nbits + 7) / 8;
// nbits is always a multiple of 8
let nbits = nbytes * 8;
let mut filter = Vec::with_capacity(nbytes + 1);
filter.resize(nbytes, 0);
for h in keys {
for h in &self.key_hash_entries {
let mut h = *h;
let delta = (h >> 17) | (h << 15);
for _ in 0..k {
Expand All @@ -97,33 +168,9 @@ impl<'a> Bloom<'a> {
}
}
filter.put_u8(k as u8);
self.key_hash_entries.clear();
filter
}

/// Reports whether the table definitely lacks any user key with hash `h`.
///
/// Note:
/// - `true` means the table surely does not have a user key with this hash;
/// - `false` is inconclusive: the table may or may not actually have such a key.
pub fn surely_not_have_hash(&self, mut h: u32) -> bool {
    if self.k > 30 {
        // potential new encoding for short Bloom filters
        return false;
    }
    let nbits = self.filter.bit_len();
    // Per-probe increment: `h` rotated left by 15 bits (same as right by 17).
    let delta = h.rotate_left(15);
    for _ in 0..self.k {
        let bit_pos = (h % (nbits as u32)) as usize;
        if !self.filter.get_bit(bit_pos) {
            return true;
        }
        h = h.wrapping_add(delta);
    }
    false
}
}

#[cfg(test)]
Expand All @@ -132,14 +179,14 @@ mod tests {
use xxhash_rust::xxh32;

use super::*;
use crate::hummock::SstableBuilderOptions;

#[test]
fn test_small_bloom_filter() {
let hash: Vec<u32> = vec![b"hello".to_vec(), b"world".to_vec()]
.into_iter()
.map(|x| xxh32::xxh32(&x, 0))
.collect();
let buf = Bloom::build_from_key_hashes(&hash, 10);
let mut builder = BloomFilterBuilder::new(0.01, 0);
builder.add_key(b"hello");
builder.add_key(b"world");
let buf = builder.finish();

let check_hash: Vec<u32> = vec![
b"hello".to_vec(),
Expand All @@ -151,37 +198,44 @@ mod tests {
.map(|x| xxh32::xxh32(&x, 0))
.collect();

let f = Bloom::new(&buf);
let f = BloomFilterReader::new(buf);
assert_eq!(f.k, 6);

assert!(!f.surely_not_have_hash(check_hash[0]));
assert!(!f.surely_not_have_hash(check_hash[1]));
assert!(f.surely_not_have_hash(check_hash[2]));
assert!(f.surely_not_have_hash(check_hash[3]));
assert!(f.may_match(check_hash[0]));
assert!(f.may_match(check_hash[1]));
assert!(!f.may_match(check_hash[2]));
assert!(!f.may_match(check_hash[3]));
let t = BloomFilterBuilder::bloom_bits_per_key(
10000,
SstableBuilderOptions::default().bloom_false_positive,
);
println!("expected bits: {}", t);
let t = BloomFilterBuilder::bloom_bits_per_key(
1000000,
SstableBuilderOptions::default().bloom_false_positive,
);
println!("expected bits: {}", t);
}

fn false_positive_rate_case(
preset_key_count: usize,
test_key_count: usize,
expected_false_positive_rate: f64,
) {
let mut key_list = vec![];

let mut builder = BloomFilterBuilder::new(expected_false_positive_rate, preset_key_count);
for i in 0..preset_key_count {
let k = Bytes::from(format!("{:032}", i));
let h = xxh32::xxh32(&k, 0);
key_list.push(h);
builder.add_key(&k);
}

let bits_per_key = Bloom::bloom_bits_per_key(key_list.len(), expected_false_positive_rate);
let vec = Bloom::build_from_key_hashes(&key_list, bits_per_key);
let filter = Bloom::new(&vec);
let data = builder.finish();
let filter = BloomFilterReader::new(data);

let mut true_count = 0;
for i in preset_key_count..preset_key_count + test_key_count {
let k = Bytes::from(format!("{:032}", i));
let h = xxh32::xxh32(&k, 0);
if filter.surely_not_have_hash(h) {
if !filter.may_match(h) {
true_count += 1;
}
}
Expand Down
Loading

0 comments on commit 523b5a4

Please sign in to comment.