diff --git a/src/common/Cargo.toml b/src/common/Cargo.toml index df19a127b985d..5791cf7b17796 100644 --- a/src/common/Cargo.toml +++ b/src/common/Cargo.toml @@ -111,3 +111,7 @@ harness = false [[bench]] name = "bitmap" harness = false + +[[bench]] +name = "bench_hash_key_encoding" +harness = false diff --git a/src/common/benches/bench_hash_key_encoding.rs b/src/common/benches/bench_hash_key_encoding.rs new file mode 100644 index 0000000000000..8dccf32165a16 --- /dev/null +++ b/src/common/benches/bench_hash_key_encoding.rs @@ -0,0 +1,224 @@ +// Copyright 2023 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use criterion::{criterion_group, criterion_main, Criterion}; +use itertools::Itertools; +use risingwave_common::array::column::Column; +use risingwave_common::array::serial_array::SerialArray; +use risingwave_common::array::{ + ArrayBuilderImpl, BoolArray, DataChunk, DecimalArray, F32Array, F64Array, I16Array, I32Array, + I64Array, IntervalArray, NaiveDateArray, NaiveDateTimeArray, NaiveTimeArray, Utf8Array, +}; +use risingwave_common::hash::{calc_hash_key_kind, HashKey, HashKeyDispatcher}; +use risingwave_common::test_utils::rand_array::seed_rand_array_ref; +use risingwave_common::types::DataType; + +static SEED: u64 = 998244353u64; +static CHUNK_SIZES: &[usize] = &[128, 1024]; +static NULL_RATIOS: &[f64] = &[0.0, 0.01, 0.1]; + +trait Case: Send + 'static { + fn bench(&self, c: &mut Criterion); +} +type BoxedCase = Box; + +struct HashKeyBenchCaseBuilder { + pub data_types: Vec, + pub describe: String, +} +impl HashKeyBenchCaseBuilder { + pub fn gen_cases(self) -> Vec { + self.dispatch() + } +} +impl HashKeyDispatcher for HashKeyBenchCaseBuilder { + type Output = Vec; + + fn dispatch_impl(self) -> Self::Output { + let mut ret: Vec = vec![]; + for null_ratio in NULL_RATIOS { + for chunk_size in CHUNK_SIZES { + let id = format!( + "{}, key type: {:?}, chunk size {}, null ratio {}", + self.describe, + calc_hash_key_kind(self.data_types()), + chunk_size, + null_ratio + ); + let input_chunk = gen_chunk(self.data_types(), *chunk_size, SEED, *null_ratio); + ret.push(Box::new(HashKeyBenchCase::::new( + id, + input_chunk, + self.data_types.clone(), + ))); + } + } + ret + } + + fn data_types(&self) -> &[DataType] { + &self.data_types + } +} + +struct HashKeyBenchCase { + id: String, + input_chunk: DataChunk, + keys: Vec, + data_types: Vec, + col_idxes: Vec, +} + +impl HashKeyBenchCase { + pub fn new(id: String, input_chunk: DataChunk, data_types: Vec) -> Self { + // Review the `bench_vec_deser` and `bench_deser` methods before benchmarking with a partial + // `col_idxes` + 
let col_idxes = (0..input_chunk.columns().len()).collect_vec(); + let keys = HashKey::build(&col_idxes, &input_chunk).unwrap(); + Self { + id, + input_chunk, + keys, + data_types, + col_idxes, + } + } + + pub fn bench_vec_ser(&self, c: &mut Criterion) { + let vectorize_serialize_id = "vec ser ".to_string() + &self.id; + c.bench_function(&vectorize_serialize_id, |b| { + b.iter(|| K::build(&self.col_idxes, &self.input_chunk).unwrap()) + }); + } + + pub fn bench_vec_deser(&self, c: &mut Criterion) { + let vectorize_deserialize_id = "vec deser ".to_string() + &self.id; + c.bench_function(&vectorize_deserialize_id, |b| { + let mut array_builders = self + .input_chunk + .columns() + .iter() + .map(|c| c.array_ref().create_builder(self.input_chunk.capacity())) + .collect::>(); + b.iter(|| { + for key in &self.keys { + key.deserialize_to_builders(&mut array_builders[..], &self.data_types) + .unwrap(); + } + }) + }); + } + + pub fn bench_deser(&self, c: &mut Criterion) { + let vectorize_deserialize_id = "row deser ".to_string() + &self.id; + c.bench_function(&vectorize_deserialize_id, |b| { + b.iter(|| { + for key in &self.keys { + key.deserialize(&self.data_types).unwrap(); + } + }) + }); + } +} +impl Case for HashKeyBenchCase { + fn bench(&self, c: &mut Criterion) { + self.bench_vec_ser(c); + self.bench_vec_deser(c); + self.bench_deser(c); + } +} + +fn gen_chunk(data_types: &[DataType], size: usize, seed: u64, null_ratio: f64) -> DataChunk { + let mut columns = vec![]; + + for d in data_types { + columns.push(Column::new(match d { + DataType::Boolean => seed_rand_array_ref::(size, seed, null_ratio), + DataType::Int16 => seed_rand_array_ref::(size, seed, null_ratio), + DataType::Int32 => seed_rand_array_ref::(size, seed, null_ratio), + DataType::Int64 => seed_rand_array_ref::(size, seed, null_ratio), + DataType::Float32 => seed_rand_array_ref::(size, seed, null_ratio), + DataType::Float64 => seed_rand_array_ref::(size, seed, null_ratio), + DataType::Decimal => 
seed_rand_array_ref::(size, seed, null_ratio), + DataType::Date => seed_rand_array_ref::(size, seed, null_ratio), + DataType::Varchar => seed_rand_array_ref::(size, seed, null_ratio), + DataType::Time => seed_rand_array_ref::(size, seed, null_ratio), + DataType::Serial => seed_rand_array_ref::(size, seed, null_ratio), + DataType::Timestamp => { + seed_rand_array_ref::(size, seed, null_ratio) + } + DataType::Timestamptz => seed_rand_array_ref::(size, seed, null_ratio), + DataType::Interval => seed_rand_array_ref::(size, seed, null_ratio), + DataType::Struct(_) | DataType::Bytea | DataType::Jsonb => { + todo!() + } + DataType::List { datatype: _ } => { + todo!() + } + })); + } + risingwave_common::util::schema_check::schema_check(data_types, &columns).unwrap(); + DataChunk::new(columns, size) +} + +fn case_builders() -> Vec { + vec![ + HashKeyBenchCaseBuilder { + data_types: vec![DataType::Serial], + describe: "single Serial".to_string(), + }, + HashKeyBenchCaseBuilder { + data_types: vec![DataType::Int32], + describe: "single int32".to_string(), + }, + HashKeyBenchCaseBuilder { + data_types: vec![DataType::Int64], + describe: "single int64".to_string(), + }, + HashKeyBenchCaseBuilder { + data_types: vec![DataType::Varchar], + describe: "single varchar".to_string(), + }, + HashKeyBenchCaseBuilder { + data_types: vec![DataType::Int32, DataType::Int32, DataType::Int32], + describe: "composite fixed size".to_string(), + }, + HashKeyBenchCaseBuilder { + data_types: vec![DataType::Int32, DataType::Int64, DataType::Int32], + describe: "composite fixed size2".to_string(), + }, + HashKeyBenchCaseBuilder { + data_types: vec![DataType::Int32, DataType::Varchar], + describe: "composite fixed and not fixed size".to_string(), + }, + HashKeyBenchCaseBuilder { + data_types: vec![DataType::Int64, DataType::Varchar], + describe: "composite fixed and not fixed size".to_string(), + }, + ] +} + +fn bench_hash_key_encoding(c: &mut Criterion) { + for case_builder in case_builders() { + 
let cases = case_builder.gen_cases(); + for case in cases { + case.bench(c); + } + } +} + +// `cargo bench -- "vec ser[\s\S]*KeySerialized[\s\S]*null ratio 0$"` benches all the +// `KeySerialized` hash key vectorized serialization cases where the data's null ratio is 0 +criterion_group!(benches, bench_hash_key_encoding); +criterion_main!(benches); diff --git a/src/common/src/hash/dispatcher.rs b/src/common/src/hash/dispatcher.rs index d104fa1b3ce1e..646d230bf1dd0 100644 --- a/src/common/src/hash/dispatcher.rs +++ b/src/common/src/hash/dispatcher.rs @@ -19,7 +19,7 @@ use crate::types::DataType; /// An enum to help to dynamically dispatch [`HashKey`] template. #[derive(Copy, Clone, Debug, Eq, PartialEq)] -enum HashKeyKind { +pub enum HashKeyKind { Key8, Key16, Key32, @@ -120,7 +120,7 @@ const MAX_FIXED_SIZE_KEY_ELEMENTS: usize = 8; /// 4. Any column's serialized format can't be used for equality check. /// /// Otherwise we choose smallest [`crate::hash::FixedSizeKey`] whose size can hold all data types. 
-fn calc_hash_key_kind(data_types: &[DataType]) -> HashKeyKind { +pub fn calc_hash_key_kind(data_types: &[DataType]) -> HashKeyKind { if data_types.len() > MAX_FIXED_SIZE_KEY_ELEMENTS { return HashKeyKind::KeySerialized; } diff --git a/src/common/src/hash/key.rs b/src/common/src/hash/key.rs index d7f5d479dfc21..42b08cc040408 100644 --- a/src/common/src/hash/key.rs +++ b/src/common/src/hash/key.rs @@ -758,19 +758,28 @@ mod tests { let capacity = 128; let seed = 10244021u64; let columns = vec![ - Column::new(seed_rand_array_ref::(capacity, seed)), - Column::new(seed_rand_array_ref::(capacity, seed + 1)), - Column::new(seed_rand_array_ref::(capacity, seed + 2)), - Column::new(seed_rand_array_ref::(capacity, seed + 3)), - Column::new(seed_rand_array_ref::(capacity, seed + 4)), - Column::new(seed_rand_array_ref::(capacity, seed + 5)), - Column::new(seed_rand_array_ref::(capacity, seed + 6)), - Column::new(seed_rand_array_ref::(capacity, seed + 7)), - Column::new(seed_rand_array_ref::(capacity, seed + 8)), - Column::new(seed_rand_array_ref::(capacity, seed + 9)), + Column::new(seed_rand_array_ref::(capacity, seed, 0.5)), + Column::new(seed_rand_array_ref::(capacity, seed + 1, 0.5)), + Column::new(seed_rand_array_ref::(capacity, seed + 2, 0.5)), + Column::new(seed_rand_array_ref::(capacity, seed + 3, 0.5)), + Column::new(seed_rand_array_ref::(capacity, seed + 4, 0.5)), + Column::new(seed_rand_array_ref::(capacity, seed + 5, 0.5)), + Column::new(seed_rand_array_ref::(capacity, seed + 6, 0.5)), + Column::new(seed_rand_array_ref::(capacity, seed + 7, 0.5)), + Column::new(seed_rand_array_ref::( + capacity, + seed + 8, + 0.5, + )), + Column::new(seed_rand_array_ref::( + capacity, + seed + 9, + 0.5, + )), Column::new(seed_rand_array_ref::( capacity, seed + 10, + 0.5, )), ]; let types = vec![ diff --git a/src/common/src/hash/mod.rs b/src/common/src/hash/mod.rs index 58dbada538a7d..477ef4352f6f4 100644 --- a/src/common/src/hash/mod.rs +++ b/src/common/src/hash/mod.rs @@ -19,5 
+19,5 @@ mod key; pub use consistent_hash::bitmap::*; pub use consistent_hash::mapping::*; pub use consistent_hash::vnode::*; -pub use dispatcher::HashKeyDispatcher; +pub use dispatcher::{calc_hash_key_kind, HashKeyDispatcher}; pub use key::*; diff --git a/src/common/src/lib.rs b/src/common/src/lib.rs index 6e68b6240cc3b..762cae7564719 100644 --- a/src/common/src/lib.rs +++ b/src/common/src/lib.rs @@ -56,7 +56,6 @@ pub mod session_config; pub mod system_param; pub mod telemetry; -#[cfg(test)] pub mod test_utils; pub mod types; diff --git a/src/common/src/test_utils/rand_array.rs b/src/common/src/test_utils/rand_array.rs index 4341067cce8ad..d70538ce386e3 100644 --- a/src/common/src/test_utils/rand_array.rs +++ b/src/common/src/test_utils/rand_array.rs @@ -143,7 +143,7 @@ impl RandValue for ListValue { } } -pub fn rand_array(rand: &mut R, size: usize) -> A +pub fn rand_array(rand: &mut R, size: usize, null_ratio: f64) -> A where A: Array, R: Rng, @@ -151,7 +151,7 @@ where { let mut builder = A::Builder::new(size); for _ in 0..size { - let is_null = rand.gen::(); + let is_null = rand.gen_bool(null_ratio); if is_null { builder.append_null(); } else { @@ -163,21 +163,21 @@ where builder.finish() } -pub fn seed_rand_array(size: usize, seed: u64) -> A +pub fn seed_rand_array(size: usize, seed: u64, null_ratio: f64) -> A where A: Array, A::OwnedItem: RandValue, { let mut rand = SmallRng::seed_from_u64(seed); - rand_array(&mut rand, size) + rand_array(&mut rand, size, null_ratio) } -pub fn seed_rand_array_ref(size: usize, seed: u64) -> ArrayRef +pub fn seed_rand_array_ref(size: usize, seed: u64, null_ratio: f64) -> ArrayRef where A: Array, A::OwnedItem: RandValue, { - let array: A = seed_rand_array(size, seed); + let array: A = seed_rand_array(size, seed, null_ratio); Arc::new(array.into()) } @@ -195,7 +195,7 @@ mod tests { ($( { $variant_name:ident, $suffix_name:ident, $array:ty, $builder:ty } ),*) => { $( { - let array = seed_rand_array::<$array>(10, 1024); + let array = 
seed_rand_array::<$array>(10, 1024, 0.5); assert_eq!(10, array.len()); } )*