From 17b76cc23fd08a1a037e88f97d80c9b1b325a04f Mon Sep 17 00:00:00 2001 From: congyi <15605187270@163.com> Date: Wed, 4 Jan 2023 16:57:37 +0800 Subject: [PATCH 1/4] sstable: per table bloom filter --- src/storage/src/hummock/mod.rs | 6 ++- src/storage/src/hummock/sstable/builder.rs | 50 ++++++++++++++------ src/storage/src/hummock/sstable/mod.rs | 43 +++++++++++++---- src/storage/src/hummock/sstable/writer.rs | 4 +- src/storage/src/hummock/state_store_v1.rs | 29 ++++++++---- src/storage/src/hummock/store/version.rs | 55 ++++++++++++++++++---- 6 files changed, 142 insertions(+), 45 deletions(-) diff --git a/src/storage/src/hummock/mod.rs b/src/storage/src/hummock/mod.rs index a40f9c90df79d..cf3ae8791255b 100644 --- a/src/storage/src/hummock/mod.rs +++ b/src/storage/src/hummock/mod.rs @@ -333,6 +333,7 @@ pub async fn get_from_sstable_info( ) -> HummockResult>> { let sstable = sstable_store_ref.sstable(sstable_info, local_stats).await?; + let table_id = full_key.user_key.table_id.table_id(); let ukey = &full_key.user_key; let delete_epoch = if read_options.ignore_range_tombstone { None @@ -343,7 +344,7 @@ pub async fn get_from_sstable_info( // Bloom filter key is the distribution key, which is no need to be the prefix of pk, and do not // contain `TablePrefix` and `VnodePrefix`. if read_options.check_bloom_filter - && !hit_sstable_bloom_filter(sstable.value(), dist_key_hash, local_stats) + && !hit_sstable_bloom_filter(sstable.value(), dist_key_hash, local_stats, table_id) { if delete_epoch.is_some() { return Ok(Some(HummockValue::Delete)); @@ -391,9 +392,10 @@ pub fn hit_sstable_bloom_filter( sstable_info_ref: &Sstable, prefix_hash: u32, local_stats: &mut StoreLocalStatistic, + table_id: u32, ) -> bool { local_stats.bloom_filter_check_counts += 1; - let surely_not_have = sstable_info_ref.surely_not_have_hashvalue(prefix_hash); + let surely_not_have = sstable_info_ref.surely_not_have_hashvalue(prefix_hash, table_id); if surely_not_have { local_stats.bloom_filter_true_negative_count += 1; diff --git a/src/storage/src/hummock/sstable/builder.rs b/src/storage/src/hummock/sstable/builder.rs index df528f27dd9bb..f9c747c154e84 100644 --- a/src/storage/src/hummock/sstable/builder.rs +++ b/src/storage/src/hummock/sstable/builder.rs @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::BTreeSet; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use bytes::BytesMut; +use itertools::Itertools; use risingwave_common::catalog::TableId; use risingwave_common::config::StorageConfig; use risingwave_hummock_sdk::filter_key_extractor::{ @@ -31,7 +32,7 @@ use super::bloom::Bloom; use super::utils::CompressionAlgorithm; use super::{ BlockBuilder, BlockBuilderOptions, BlockMeta, SstableMeta, SstableWriter, DEFAULT_BLOCK_SIZE, - DEFAULT_ENTRY_SIZE, DEFAULT_RESTART_INTERVAL, VERSION, + DEFAULT_RESTART_INTERVAL, VERSION, }; use crate::hummock::value::HummockValue; use crate::hummock::{DeleteRangeTombstone, HummockResult}; @@ -100,7 +101,7 @@ pub struct SstableBuilder { /// `table_id` of added keys. table_ids: BTreeSet, /// Hashes of user keys. - user_key_hashes: Vec, + user_key_hashes: HashMap>, last_full_key: Vec, last_extract_key: Vec, /// Buffer for encoded key and value to avoid allocation. @@ -148,7 +149,7 @@ impl SstableBuilder { }), block_metas: Vec::with_capacity(options.capacity / options.block_capacity + 1), table_ids: BTreeSet::new(), - user_key_hashes: Vec::with_capacity(options.capacity / DEFAULT_ENTRY_SIZE + 1), + user_key_hashes: HashMap::new(), last_table_id: None, raw_key: BytesMut::new(), raw_value: BytesMut::new(), @@ -210,9 +211,21 @@ impl SstableBuilder { // add bloom_filter check // 1. not empty_key // 2. extract_key key is not duplicate + if !extract_key.is_empty() && extract_key != self.last_extract_key.as_slice() { // avoid duplicate add to bloom filter - self.user_key_hashes.push(xxh32::xxh32(extract_key, 0)); + let key_hash = xxh32::xxh32(extract_key, 0); + if self.user_key_hashes.contains_key(&table_id) { + let mut current_key_hashes = + self.user_key_hashes.get(&table_id).unwrap().clone(); + current_key_hashes.push(key_hash); + self.user_key_hashes.remove(&table_id); + self.user_key_hashes + .insert(table_id, current_key_hashes.to_vec()); + } else { + self.user_key_hashes.insert(table_id, vec![key_hash]); + } + self.last_extract_key.clear(); self.last_extract_key.extend_from_slice(extract_key); } @@ -292,18 +305,27 @@ impl SstableBuilder { } self.total_key_count += self.range_tombstones.len() as u64; self.stale_key_count += self.range_tombstones.len() as u64; - - let mut meta = SstableMeta { - block_metas: self.block_metas, - bloom_filter: if self.options.bloom_false_positive > 0.0 { + let table_ids = self.table_ids.iter().collect_vec(); + let mut bloom_filter = BTreeMap::new(); + for table_id in table_ids { + if let Some(per_table_user_key_hashes) = self.user_key_hashes.get(table_id) && self.options.bloom_false_positive > 0.0 { let bits_per_key = Bloom::bloom_bits_per_key( - self.user_key_hashes.len(), + per_table_user_key_hashes.len(), self.options.bloom_false_positive, ); - Bloom::build_from_key_hashes(&self.user_key_hashes, bits_per_key) - } else { - vec![] - }, + + bloom_filter.insert( + *table_id, + Bloom::build_from_key_hashes( + self.user_key_hashes.get(table_id).unwrap(), + bits_per_key, + ), + ); + } + } + let mut meta = SstableMeta { + block_metas: self.block_metas, + bloom_filter, estimated_size: 0, key_count: self.total_key_count as u32, smallest_key, diff --git a/src/storage/src/hummock/sstable/mod.rs b/src/storage/src/hummock/sstable/mod.rs index c51b720c73a43..53a1d0cd3c86d 100644 --- a/src/storage/src/hummock/sstable/mod.rs +++ b/src/storage/src/hummock/sstable/mod.rs @@ -17,6 +17,7 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. mod block; +use std::collections::BTreeMap; use std::fmt::{Debug, Formatter}; pub use block::*; @@ -147,7 +148,7 @@ impl Sstable { }; if enable_bloom_filter() && self.has_bloom_filter() { let hash = xxh32::xxh32(dist_key, 0); - self.surely_not_have_hashvalue(hash) + self.surely_not_have_hashvalue(hash, 0_u32) } else { false } @@ -159,9 +160,15 @@ impl Sstable { } #[inline(always)] - pub fn surely_not_have_hashvalue(&self, hash: u32) -> bool { - let bloom = Bloom::new(&self.meta.bloom_filter); - bloom.surely_not_have_hash(hash) + pub fn surely_not_have_hashvalue(&self, hash: u32, table_id: u32) -> bool { + let bloom_filter_key = self.meta.bloom_filter.get(&table_id); + match bloom_filter_key { + Some(bloom_filter_key) => { + let bloom = Bloom::new(bloom_filter_key); + bloom.surely_not_have_hash(hash) + } + None => false, + } } pub fn block_count(&self) -> usize { @@ -235,7 +242,7 @@ impl BlockMeta { #[derive(Clone, PartialEq, Eq, Debug)] pub struct SstableMeta { pub block_metas: Vec, - pub bloom_filter: Vec, + pub bloom_filter: BTreeMap>, pub estimated_size: u32, pub key_count: u32, pub smallest_key: Vec, @@ -271,7 +278,11 @@ impl SstableMeta { for block_meta in &self.block_metas { block_meta.encode(buf); } - put_length_prefixed_slice(buf, &self.bloom_filter); + buf.put_u32_le(self.bloom_filter.len() as u32); + for (table_id, bloom_filter_key) in &self.bloom_filter { + buf.put_u32_le(*table_id); + put_length_prefixed_slice(buf, bloom_filter_key); + } buf.put_u32_le(self.estimated_size); buf.put_u32_le(self.key_count); put_length_prefixed_slice(buf, &self.smallest_key); @@ -312,7 +323,14 @@ impl SstableMeta { for _ in 0..block_meta_count { block_metas.push(BlockMeta::decode(buf)); } - let bloom_filter = get_length_prefixed_slice(buf); + let bloom_filter_count = buf.get_u32_le() as usize; + let mut bloom_filter = BTreeMap::new(); + for _ in 0..bloom_filter_count { + let table_id = buf.get_u32_le(); + let bloom_filter_key = get_length_prefixed_slice(buf); + bloom_filter.insert(table_id, bloom_filter_key); + } + let estimated_size = buf.get_u32_le(); let key_count = buf.get_u32_le(); let smallest_key = get_length_prefixed_slice(buf); @@ -353,7 +371,8 @@ impl SstableMeta { .map(| tombstone| 16 + tombstone.start_user_key.encoded_len() + tombstone.end_user_key.encoded_len()) .sum::() + 4 // bloom filter len - + self.bloom_filter.len() + + 8 * self.bloom_filter.len() + + self.bloom_filter.values().map(| bloom_filter_key|bloom_filter_key.len()).sum::() + 4 // estimated size + 4 // key count + 4 // key len @@ -378,6 +397,11 @@ mod tests { #[test] pub fn test_sstable_meta_enc_dec() { + let mut bloom_filter = BTreeMap::new(); + bloom_filter.insert(0_u32, b"0123456789".to_vec()); + bloom_filter.insert(1_u32, b"987654321".to_vec()); + bloom_filter.insert(2_u32, b"abcde".to_vec()); + bloom_filter.insert(3_u32, b"xyz".to_vec()); let meta = SstableMeta { block_metas: vec![ BlockMeta { @@ -393,7 +417,8 @@ mod tests { uncompressed_size: 0, }, ], - bloom_filter: b"0123456789".to_vec(), + + bloom_filter, estimated_size: 123, key_count: 123, smallest_key: b"0-smallest-key".to_vec(), diff --git a/src/storage/src/hummock/sstable/writer.rs b/src/storage/src/hummock/sstable/writer.rs index c2ac0bf02d4fb..0865fd928699e 100644 --- a/src/storage/src/hummock/sstable/writer.rs +++ b/src/storage/src/hummock/sstable/writer.rs @@ -73,6 +73,8 @@ impl SstableWriter for InMemWriter { #[cfg(test)] mod tests { + use std::collections::BTreeMap; + use bytes::Bytes; use itertools::Itertools; use rand::{Rng, SeedableRng}; @@ -100,7 +102,7 @@ mod tests { } let meta = SstableMeta { block_metas, - bloom_filter: Vec::new(), + bloom_filter: BTreeMap::new(), estimated_size: 0, key_count: 0, smallest_key: Vec::new(), diff --git a/src/storage/src/hummock/state_store_v1.rs b/src/storage/src/hummock/state_store_v1.rs index 582753e4603a7..e31144ba163ba 100644 --- a/src/storage/src/hummock/state_store_v1.rs +++ b/src/storage/src/hummock/state_store_v1.rs @@ -336,10 +336,16 @@ impl HummockStorageV1 { .sstable(sstable_info, &mut local_stats) .in_span(Span::enter_with_local_parent("get_sstable")) .await?; - - if hit_sstable_bloom_filter(sstable.value(), *prefix_hash, &mut local_stats) - { - sstables.push((*sstable_info).clone()); + for table_id in &sstable_info.table_ids { + if hit_sstable_bloom_filter( + sstable.value(), + *prefix_hash, + &mut local_stats, + *table_id, + ) { + sstables.push((*sstable_info).clone()); + break; + } } } else { sstables.push((*sstable_info).clone()); @@ -362,12 +368,15 @@ impl HummockStorageV1 { .in_span(Span::enter_with_local_parent("get_sstable")) .await?; if let Some(prefix_hash) = bloom_filter_prefix_hash.as_ref() { - if !hit_sstable_bloom_filter( - sstable.value(), - *prefix_hash, - &mut local_stats, - ) { - continue; + for table_id in &table_info.table_ids { + if !hit_sstable_bloom_filter( + sstable.value(), + *prefix_hash, + &mut local_stats, + *table_id, + ) { + continue; + } } } diff --git a/src/storage/src/hummock/store/version.rs b/src/storage/src/hummock/store/version.rs index 453f1e16a3aa7..534d9d09d26f6 100644 --- a/src/storage/src/hummock/store/version.rs +++ b/src/storage/src/hummock/store/version.rs @@ -553,11 +553,23 @@ impl HummockVersionReader { .sstable(sstable_info, &mut local_stats) .in_span(Span::enter_with_local_parent("get_sstable")) .await?; - if let Some(prefix_hash) = bloom_filter_prefix_hash.as_ref() { - if !hit_sstable_bloom_filter(table_holder.value(), *prefix_hash, &mut local_stats) { - continue; + let mut hit_bloom_filter = false; + for table_id in &sstable_info.table_ids { + if let Some(prefix_hash) = bloom_filter_prefix_hash.as_ref() { + if !hit_sstable_bloom_filter( + table_holder.value(), + *prefix_hash, + &mut local_stats, + *table_id, + ) { + hit_bloom_filter = true; + break; + } } } + if hit_bloom_filter { + continue; + } if !table_holder.value().meta.range_tombstone_list.is_empty() && !read_options.ignore_range_tombstone @@ -624,11 +636,23 @@ impl HummockVersionReader { .in_span(Span::enter_with_local_parent("get_sstable")) .await?; - if let Some(key_hash) = bloom_filter_prefix_hash.as_ref() { - if !hit_sstable_bloom_filter(sstable.value(), *key_hash, &mut local_stats) { - continue; + let mut hit_bloom_filter = false; + for table_id in &sstable_info.table_ids { + if let Some(prefix_hash) = bloom_filter_prefix_hash.as_ref() { + if !hit_sstable_bloom_filter( + sstable.value(), + *prefix_hash, + &mut local_stats, + *table_id, + ) { + hit_bloom_filter = true; + break; + } } } + if hit_bloom_filter { + continue; + } if !sstable.value().meta.range_tombstone_list.is_empty() && !read_options.ignore_range_tombstone { @@ -654,15 +678,28 @@ impl HummockVersionReader { } // Overlapping let mut iters = Vec::new(); + let mut hit_bloom_filter = false; for table_info in table_infos.into_iter().rev() { let sstable = self .sstable_store .sstable(table_info, &mut local_stats) .in_span(Span::enter_with_local_parent("get_sstable")) .await?; - if let Some(dist_hash) = bloom_filter_prefix_hash.as_ref() { - if !hit_sstable_bloom_filter(sstable.value(), *dist_hash, &mut local_stats) - { + + for table_id in &table_info.table_ids { + if let Some(prefix_hash) = bloom_filter_prefix_hash.as_ref() { + if !hit_sstable_bloom_filter( + sstable.value(), + *prefix_hash, + &mut local_stats, + *table_id, + ) { + hit_bloom_filter = true; + break; + } + } + + if hit_bloom_filter { continue; } } From 753311004f7201af870996823d71e4d4021b5145 Mon Sep 17 00:00:00 2001 From: congyi <15605187270@163.com> Date: Wed, 4 Jan 2023 17:17:47 +0800 Subject: [PATCH 2/4] update SstableMeta version --- src/storage/src/hummock/sstable/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/storage/src/hummock/sstable/mod.rs b/src/storage/src/hummock/sstable/mod.rs index 53a1d0cd3c86d..3e1ea88c66275 100644 --- a/src/storage/src/hummock/sstable/mod.rs +++ b/src/storage/src/hummock/sstable/mod.rs @@ -59,7 +59,7 @@ use super::{HummockError, HummockResult}; const DEFAULT_META_BUFFER_CAPACITY: usize = 4096; const MAGIC: u32 = 0x5785ab73; -const VERSION: u32 = 1; +const VERSION: u32 = 2; #[derive(Clone, PartialEq, Eq, Debug)] // delete keys located in [start_user_key, end_user_key) From ad1fc492d0e3b0a1f3fc5188ab43dfa5fd955734 Mon Sep 17 00:00:00 2001 From: congyi <15605187270@163.com> Date: Thu, 5 Jan 2023 11:52:27 +0800 Subject: [PATCH 3/4] fix comments --- src/storage/src/hummock/sstable/builder.rs | 6 +-- src/storage/src/hummock/state_store_v1.rs | 18 ++++---- src/storage/src/hummock/store/version.rs | 50 ++++++++-------------- 3 files changed, 30 insertions(+), 44 deletions(-) diff --git a/src/storage/src/hummock/sstable/builder.rs b/src/storage/src/hummock/sstable/builder.rs index f9c747c154e84..cac05a889f8ef 100644 --- a/src/storage/src/hummock/sstable/builder.rs +++ b/src/storage/src/hummock/sstable/builder.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::collections::{BTreeMap, BTreeSet}; use std::sync::Arc; use bytes::BytesMut; @@ -101,7 +101,7 @@ pub struct SstableBuilder { /// `table_id` of added keys. table_ids: BTreeSet, /// Hashes of user keys. - user_key_hashes: HashMap>, + user_key_hashes: BTreeMap>, last_full_key: Vec, last_extract_key: Vec, /// Buffer for encoded key and value to avoid allocation. @@ -149,7 +149,7 @@ impl SstableBuilder { }), block_metas: Vec::with_capacity(options.capacity / options.block_capacity + 1), table_ids: BTreeSet::new(), - user_key_hashes: HashMap::new(), + user_key_hashes: BTreeMap::new(), last_table_id: None, raw_key: BytesMut::new(), raw_value: BytesMut::new(), diff --git a/src/storage/src/hummock/state_store_v1.rs b/src/storage/src/hummock/state_store_v1.rs index e31144ba163ba..362dc603e282b 100644 --- a/src/storage/src/hummock/state_store_v1.rs +++ b/src/storage/src/hummock/state_store_v1.rs @@ -336,16 +336,14 @@ impl HummockStorageV1 { .sstable(sstable_info, &mut local_stats) .in_span(Span::enter_with_local_parent("get_sstable")) .await?; - for table_id in &sstable_info.table_ids { - if hit_sstable_bloom_filter( - sstable.value(), - *prefix_hash, - &mut local_stats, - *table_id, - ) { - sstables.push((*sstable_info).clone()); - break; - } + + if hit_sstable_bloom_filter( + sstable.value(), + *prefix_hash, + &mut local_stats, + table_id.table_id(), + ) { + sstables.push((*sstable_info).clone()); } } else { sstables.push((*sstable_info).clone()); diff --git a/src/storage/src/hummock/store/version.rs b/src/storage/src/hummock/store/version.rs index 534d9d09d26f6..30eccb44f0892 100644 --- a/src/storage/src/hummock/store/version.rs +++ b/src/storage/src/hummock/store/version.rs @@ -546,30 +546,24 @@ impl HummockVersionReader { .prefix_hint .as_ref() .map(|hint| Sstable::hash_for_bloom_filter(hint)); - + let table_id = read_options.table_id.table_id(); for sstable_info in &uncommitted_ssts { let table_holder = self .sstable_store .sstable(sstable_info, &mut local_stats) .in_span(Span::enter_with_local_parent("get_sstable")) .await?; - let mut hit_bloom_filter = false; - for table_id in &sstable_info.table_ids { - if let Some(prefix_hash) = bloom_filter_prefix_hash.as_ref() { - if !hit_sstable_bloom_filter( - table_holder.value(), - *prefix_hash, - &mut local_stats, - *table_id, - ) { - hit_bloom_filter = true; - break; - } + + if let Some(prefix_hash) = bloom_filter_prefix_hash.as_ref() { + if !hit_sstable_bloom_filter( + table_holder.value(), + *prefix_hash, + &mut local_stats, + table_id, + ) { + continue; } } - if hit_bloom_filter { - continue; - } if !table_holder.value().meta.range_tombstone_list.is_empty() && !read_options.ignore_range_tombstone @@ -636,23 +630,17 @@ impl HummockVersionReader { .in_span(Span::enter_with_local_parent("get_sstable")) .await?; - let mut hit_bloom_filter = false; - for table_id in &sstable_info.table_ids { - if let Some(prefix_hash) = bloom_filter_prefix_hash.as_ref() { - if !hit_sstable_bloom_filter( - sstable.value(), - *prefix_hash, - &mut local_stats, - *table_id, - ) { - hit_bloom_filter = true; - break; - } + if let Some(prefix_hash) = bloom_filter_prefix_hash.as_ref() { + if !hit_sstable_bloom_filter( + sstable.value(), + *prefix_hash, + &mut local_stats, + table_id, + ) { + continue; } } - if hit_bloom_filter { - continue; - } + if !sstable.value().meta.range_tombstone_list.is_empty() && !read_options.ignore_range_tombstone { From dcdd685d5fd802dff19de9796c1a4d1d62455f78 Mon Sep 17 00:00:00 2001 From: congyi <15605187270@163.com> Date: Thu, 5 Jan 2023 15:34:48 +0800 Subject: [PATCH 4/4] minor fix --- src/storage/src/hummock/sstable/builder.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/storage/src/hummock/sstable/builder.rs b/src/storage/src/hummock/sstable/builder.rs index 4d74e6405ad23..4bf73ccea8409 100644 --- a/src/storage/src/hummock/sstable/builder.rs +++ b/src/storage/src/hummock/sstable/builder.rs @@ -227,16 +227,6 @@ impl SstableBuilder { current_key_hashes.push(key_hash); } }; - if self.user_key_hashes.contains_key(&table_id) { - let mut current_key_hashes = - self.user_key_hashes.get(&table_id).unwrap().clone(); - current_key_hashes.push(key_hash); - self.user_key_hashes.remove(&table_id); - self.user_key_hashes - .insert(table_id, current_key_hashes.to_vec()); - } else { - self.user_key_hashes.insert(table_id, vec![key_hash]); - } self.last_extract_key.clear(); self.last_extract_key.extend_from_slice(extract_key);