Skip to content

Commit

Permalink
perf(hashkey): add benchmark for hash key ser/deser (#8733)
Browse files Browse the repository at this point in the history
  • Loading branch information
st1page authored Mar 23, 2023
1 parent 96aa23d commit 1df800a
Show file tree
Hide file tree
Showing 7 changed files with 257 additions and 21 deletions.
4 changes: 4 additions & 0 deletions src/common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,7 @@ harness = false
[[bench]]
name = "bitmap"
harness = false

[[bench]]
name = "bench_hash_key_encoding"
harness = false
224 changes: 224 additions & 0 deletions src/common/benches/bench_hash_key_encoding.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
// Copyright 2023 RisingWave Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use criterion::{criterion_group, criterion_main, Criterion};
use itertools::Itertools;
use risingwave_common::array::column::Column;
use risingwave_common::array::serial_array::SerialArray;
use risingwave_common::array::{
ArrayBuilderImpl, BoolArray, DataChunk, DecimalArray, F32Array, F64Array, I16Array, I32Array,
I64Array, IntervalArray, NaiveDateArray, NaiveDateTimeArray, NaiveTimeArray, Utf8Array,
};
use risingwave_common::hash::{calc_hash_key_kind, HashKey, HashKeyDispatcher};
use risingwave_common::test_utils::rand_array::seed_rand_array_ref;
use risingwave_common::types::DataType;

/// Seed handed to the random array generator for every generated chunk.
const SEED: u64 = 998_244_353;
/// Row counts of the chunks to benchmark.
const CHUNK_SIZES: &[usize] = &[128, 1024];
/// Null ratios passed to the random array generator for the benchmarked chunks.
const NULL_RATIOS: &[f64] = &[0.0, 0.01, 0.1];

/// A benchmark case that can register its benchmarks on a [`Criterion`] instance.
///
/// The trait is object safe so that cases built for different hash key types can be stored
/// together as [`BoxedCase`] values.
trait Case: Send + 'static {
/// Registers every benchmark of this case on `c`.
fn bench(&self, c: &mut Criterion);
}
/// Type-erased, heap-allocated [`Case`].
type BoxedCase = Box<dyn Case>;

struct HashKeyBenchCaseBuilder {
pub data_types: Vec<DataType>,
pub describe: String,
}
impl HashKeyBenchCaseBuilder {
pub fn gen_cases(self) -> Vec<BoxedCase> {
self.dispatch()
}
}
impl HashKeyDispatcher for HashKeyBenchCaseBuilder {
    type Output = Vec<BoxedCase>;

    /// Builds one case per `(null ratio, chunk size)` pair for the hash key type `K`.
    ///
    /// Pairs are produced with the null ratio as the outer dimension and the chunk size as the
    /// inner one. Each case gets a randomly generated input chunk and an id that records the
    /// description, the hash key kind, the chunk size and the null ratio.
    fn dispatch_impl<K: HashKey>(self) -> Self::Output {
        NULL_RATIOS
            .iter()
            .cartesian_product(CHUNK_SIZES)
            .map(|(&null_ratio, &chunk_size)| -> BoxedCase {
                let id = format!(
                    "{}, key type: {:?}, chunk size {}, null ratio {}",
                    self.describe,
                    calc_hash_key_kind(self.data_types()),
                    chunk_size,
                    null_ratio
                );
                let input_chunk = gen_chunk(self.data_types(), chunk_size, SEED, null_ratio);
                Box::new(HashKeyBenchCase::<K>::new(
                    id,
                    input_chunk,
                    self.data_types.clone(),
                ))
            })
            .collect()
    }

    /// Returns the data types of the key columns.
    fn data_types(&self) -> &[DataType] {
        self.data_types.as_slice()
    }
}

/// One benchmark case for the hash key type `K`: an input chunk together with the keys built
/// from it.
struct HashKeyBenchCase<K: HashKey> {
// Suffix shared by the ids of all benchmarks registered by this case.
id: String,
// Chunk the hash keys are built from.
input_chunk: DataChunk,
// Keys built from `input_chunk` when the case is created; input of the deserialize benchmarks.
keys: Vec<K>,
// Data types handed to the deserialize calls.
data_types: Vec<DataType>,
// Indexes of the key columns inside `input_chunk`; `new` fills in every column index.
col_idxes: Vec<usize>,
}

impl<K: HashKey> HashKeyBenchCase<K> {
pub fn new(id: String, input_chunk: DataChunk, data_types: Vec<DataType>) -> Self {
// please check the `bench_vec_dser` and `bench_deser` method when want to bench not full
// `col_idxes`
let col_idxes = (0..input_chunk.columns().len()).collect_vec();
let keys = HashKey::build(&col_idxes, &input_chunk).unwrap();
Self {
id,
input_chunk,
keys,
data_types,
col_idxes,
}
}

pub fn bench_vec_ser(&self, c: &mut Criterion) {
let vectorize_serialize_id = "vec ser ".to_string() + &self.id;
c.bench_function(&vectorize_serialize_id, |b| {
b.iter(|| K::build(&self.col_idxes, &self.input_chunk).unwrap())
});
}

pub fn bench_vec_deser(&self, c: &mut Criterion) {
let vectorize_deserialize_id = "vec deser ".to_string() + &self.id;
c.bench_function(&vectorize_deserialize_id, |b| {
let mut array_builders = self
.input_chunk
.columns()
.iter()
.map(|c| c.array_ref().create_builder(self.input_chunk.capacity()))
.collect::<Vec<ArrayBuilderImpl>>();
b.iter(|| {
for key in &self.keys {
key.deserialize_to_builders(&mut array_builders[..], &self.data_types)
.unwrap();
}
})
});
}

pub fn bench_deser(&self, c: &mut Criterion) {
let vectorize_deserialize_id = "row deser ".to_string() + &self.id;
c.bench_function(&vectorize_deserialize_id, |b| {
b.iter(|| {
for key in &self.keys {
key.deserialize(&self.data_types).unwrap();
}
})
});
}
}
impl<K: HashKey> Case for HashKeyBenchCase<K> {
    /// Registers the three benchmarks of this case in a fixed order: vectorized serialize,
    /// vectorized deserialize, then row deserialize.
    fn bench(&self, c: &mut Criterion) {
        Self::bench_vec_ser(self, c);
        Self::bench_vec_deser(self, c);
        Self::bench_deser(self, c);
    }
}

fn gen_chunk(data_types: &[DataType], size: usize, seed: u64, null_ratio: f64) -> DataChunk {
let mut columns = vec![];

for d in data_types {
columns.push(Column::new(match d {
DataType::Boolean => seed_rand_array_ref::<BoolArray>(size, seed, null_ratio),
DataType::Int16 => seed_rand_array_ref::<I16Array>(size, seed, null_ratio),
DataType::Int32 => seed_rand_array_ref::<I32Array>(size, seed, null_ratio),
DataType::Int64 => seed_rand_array_ref::<I64Array>(size, seed, null_ratio),
DataType::Float32 => seed_rand_array_ref::<F32Array>(size, seed, null_ratio),
DataType::Float64 => seed_rand_array_ref::<F64Array>(size, seed, null_ratio),
DataType::Decimal => seed_rand_array_ref::<DecimalArray>(size, seed, null_ratio),
DataType::Date => seed_rand_array_ref::<NaiveDateArray>(size, seed, null_ratio),
DataType::Varchar => seed_rand_array_ref::<Utf8Array>(size, seed, null_ratio),
DataType::Time => seed_rand_array_ref::<NaiveTimeArray>(size, seed, null_ratio),
DataType::Serial => seed_rand_array_ref::<SerialArray>(size, seed, null_ratio),
DataType::Timestamp => {
seed_rand_array_ref::<NaiveDateTimeArray>(size, seed, null_ratio)
}
DataType::Timestamptz => seed_rand_array_ref::<I64Array>(size, seed, null_ratio),
DataType::Interval => seed_rand_array_ref::<IntervalArray>(size, seed, null_ratio),
DataType::Struct(_) | DataType::Bytea | DataType::Jsonb => {
todo!()
}
DataType::List { datatype: _ } => {
todo!()
}
}));
}
risingwave_common::util::schema_check::schema_check(data_types, &columns).unwrap();
DataChunk::new(columns, size)
}

/// Returns the builders for every benchmarked key layout.
///
/// Each description must be unique: it is the leading part of the benchmark id, and two
/// builders that share a description and a hash key kind would produce colliding ids.
fn case_builders() -> Vec<HashKeyBenchCaseBuilder> {
    vec![
        HashKeyBenchCaseBuilder {
            data_types: vec![DataType::Serial],
            describe: "single Serial".to_string(),
        },
        HashKeyBenchCaseBuilder {
            data_types: vec![DataType::Int32],
            describe: "single int32".to_string(),
        },
        HashKeyBenchCaseBuilder {
            data_types: vec![DataType::Int64],
            describe: "single int64".to_string(),
        },
        HashKeyBenchCaseBuilder {
            data_types: vec![DataType::Varchar],
            describe: "single varchar".to_string(),
        },
        HashKeyBenchCaseBuilder {
            data_types: vec![DataType::Int32, DataType::Int32, DataType::Int32],
            describe: "composite fixed size".to_string(),
        },
        HashKeyBenchCaseBuilder {
            data_types: vec![DataType::Int32, DataType::Int64, DataType::Int32],
            describe: "composite fixed size2".to_string(),
        },
        HashKeyBenchCaseBuilder {
            data_types: vec![DataType::Int32, DataType::Varchar],
            describe: "composite fixed and not fixed size".to_string(),
        },
        HashKeyBenchCaseBuilder {
            data_types: vec![DataType::Int64, DataType::Varchar],
            // Numbered like "composite fixed size2" so its ids differ from the case above.
            describe: "composite fixed and not fixed size2".to_string(),
        },
    ]
}

fn bench_hash_key_encoding(c: &mut Criterion) {
for case_builder in case_builders() {
let cases = case_builder.gen_cases();
for case in cases {
case.bench(c);
}
}
}

// Example: `cargo bench -- "vec ser[\s\S]*KeySerialized[\s\S]*null ratio 0$"` runs all the
// vectorized serialize cases of the `KeySerialized` hash key whose data has a null ratio of 0
// (the trailing `$` excludes the ids that end in `null ratio 0.01` and `null ratio 0.1`).
criterion_group!(benches, bench_hash_key_encoding);
criterion_main!(benches);
4 changes: 2 additions & 2 deletions src/common/src/hash/dispatcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use crate::types::DataType;

/// An enum to help to dynamically dispatch [`HashKey`] template.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum HashKeyKind {
pub enum HashKeyKind {
Key8,
Key16,
Key32,
Expand Down Expand Up @@ -120,7 +120,7 @@ const MAX_FIXED_SIZE_KEY_ELEMENTS: usize = 8;
/// 4. Any column's serialized format can't be used for equality check.
///
/// Otherwise we choose smallest [`crate::hash::FixedSizeKey`] whose size can hold all data types.
fn calc_hash_key_kind(data_types: &[DataType]) -> HashKeyKind {
pub fn calc_hash_key_kind(data_types: &[DataType]) -> HashKeyKind {
if data_types.len() > MAX_FIXED_SIZE_KEY_ELEMENTS {
return HashKeyKind::KeySerialized;
}
Expand Down
29 changes: 19 additions & 10 deletions src/common/src/hash/key.rs
Original file line number Diff line number Diff line change
Expand Up @@ -758,19 +758,28 @@ mod tests {
let capacity = 128;
let seed = 10244021u64;
let columns = vec![
Column::new(seed_rand_array_ref::<BoolArray>(capacity, seed)),
Column::new(seed_rand_array_ref::<I16Array>(capacity, seed + 1)),
Column::new(seed_rand_array_ref::<I32Array>(capacity, seed + 2)),
Column::new(seed_rand_array_ref::<I64Array>(capacity, seed + 3)),
Column::new(seed_rand_array_ref::<F32Array>(capacity, seed + 4)),
Column::new(seed_rand_array_ref::<F64Array>(capacity, seed + 5)),
Column::new(seed_rand_array_ref::<DecimalArray>(capacity, seed + 6)),
Column::new(seed_rand_array_ref::<Utf8Array>(capacity, seed + 7)),
Column::new(seed_rand_array_ref::<NaiveDateArray>(capacity, seed + 8)),
Column::new(seed_rand_array_ref::<NaiveTimeArray>(capacity, seed + 9)),
Column::new(seed_rand_array_ref::<BoolArray>(capacity, seed, 0.5)),
Column::new(seed_rand_array_ref::<I16Array>(capacity, seed + 1, 0.5)),
Column::new(seed_rand_array_ref::<I32Array>(capacity, seed + 2, 0.5)),
Column::new(seed_rand_array_ref::<I64Array>(capacity, seed + 3, 0.5)),
Column::new(seed_rand_array_ref::<F32Array>(capacity, seed + 4, 0.5)),
Column::new(seed_rand_array_ref::<F64Array>(capacity, seed + 5, 0.5)),
Column::new(seed_rand_array_ref::<DecimalArray>(capacity, seed + 6, 0.5)),
Column::new(seed_rand_array_ref::<Utf8Array>(capacity, seed + 7, 0.5)),
Column::new(seed_rand_array_ref::<NaiveDateArray>(
capacity,
seed + 8,
0.5,
)),
Column::new(seed_rand_array_ref::<NaiveTimeArray>(
capacity,
seed + 9,
0.5,
)),
Column::new(seed_rand_array_ref::<NaiveDateTimeArray>(
capacity,
seed + 10,
0.5,
)),
];
let types = vec![
Expand Down
2 changes: 1 addition & 1 deletion src/common/src/hash/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ mod key;
pub use consistent_hash::bitmap::*;
pub use consistent_hash::mapping::*;
pub use consistent_hash::vnode::*;
pub use dispatcher::HashKeyDispatcher;
pub use dispatcher::{calc_hash_key_kind, HashKeyDispatcher};
pub use key::*;
1 change: 0 additions & 1 deletion src/common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ pub mod session_config;
pub mod system_param;
pub mod telemetry;

#[cfg(test)]
pub mod test_utils;
pub mod types;

Expand Down
14 changes: 7 additions & 7 deletions src/common/src/test_utils/rand_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,15 +143,15 @@ impl RandValue for ListValue {
}
}

pub fn rand_array<A, R>(rand: &mut R, size: usize) -> A
pub fn rand_array<A, R>(rand: &mut R, size: usize, null_ratio: f64) -> A
where
A: Array,
R: Rng,
A::OwnedItem: RandValue,
{
let mut builder = A::Builder::new(size);
for _ in 0..size {
let is_null = rand.gen::<bool>();
let is_null = rand.gen_bool(null_ratio);
if is_null {
builder.append_null();
} else {
Expand All @@ -163,21 +163,21 @@ where
builder.finish()
}

pub fn seed_rand_array<A>(size: usize, seed: u64) -> A
pub fn seed_rand_array<A>(size: usize, seed: u64, null_ratio: f64) -> A
where
A: Array,
A::OwnedItem: RandValue,
{
let mut rand = SmallRng::seed_from_u64(seed);
rand_array(&mut rand, size)
rand_array(&mut rand, size, null_ratio)
}

pub fn seed_rand_array_ref<A>(size: usize, seed: u64) -> ArrayRef
pub fn seed_rand_array_ref<A>(size: usize, seed: u64, null_ratio: f64) -> ArrayRef
where
A: Array,
A::OwnedItem: RandValue,
{
let array: A = seed_rand_array(size, seed);
let array: A = seed_rand_array(size, seed, null_ratio);
Arc::new(array.into())
}

Expand All @@ -195,7 +195,7 @@ mod tests {
($( { $variant_name:ident, $suffix_name:ident, $array:ty, $builder:ty } ),*) => {
$(
{
let array = seed_rand_array::<$array>(10, 1024);
let array = seed_rand_array::<$array>(10, 1024, 0.5);
assert_eq!(10, array.len());
}
)*
Expand Down

0 comments on commit 1df800a

Please sign in to comment.