Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(hashkey): add benchmark for hash key ser/deser #8733

Merged
merged 8 commits into from
Mar 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,7 @@ harness = false
[[bench]]
name = "bitmap"
harness = false

[[bench]]
name = "bench_hash_key_encoding"
harness = false
224 changes: 224 additions & 0 deletions src/common/benches/bench_hash_key_encoding.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
// Copyright 2023 RisingWave Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use criterion::{criterion_group, criterion_main, Criterion};
use itertools::Itertools;
use risingwave_common::array::column::Column;
use risingwave_common::array::serial_array::SerialArray;
use risingwave_common::array::{
ArrayBuilderImpl, BoolArray, DataChunk, DecimalArray, F32Array, F64Array, I16Array, I32Array,
I64Array, IntervalArray, NaiveDateArray, NaiveDateTimeArray, NaiveTimeArray, Utf8Array,
};
use risingwave_common::hash::{calc_hash_key_kind, HashKey, HashKeyDispatcher};
use risingwave_common::test_utils::rand_array::seed_rand_array_ref;
use risingwave_common::types::DataType;

/// Seed handed to `gen_chunk` so that the generated input data is the same on every run.
static SEED: u64 = 998244353u64;
/// Row counts of the generated input chunks; every case is benchmarked once per chunk size.
static CHUNK_SIZES: &[usize] = &[128, 1024];
/// Null ratios handed to the random array generator; every case is benchmarked once per ratio.
static NULL_RATIOS: &[f64] = &[0.0, 0.01, 0.1];

/// A type-erased benchmark case.
///
/// Cases are generic over the hash key type, so they are stored behind this trait to be kept
/// together in one collection.
trait Case: Send + 'static {
    /// Runs every benchmark of this case against the given Criterion instance.
    fn bench(&self, c: &mut Criterion);
}

/// A boxed, dynamically dispatched [`Case`].
type BoxedCase = Box<dyn Case>;

struct HashKeyBenchCaseBuilder {
pub data_types: Vec<DataType>,
pub describe: String,
}
impl HashKeyBenchCaseBuilder {
pub fn gen_cases(self) -> Vec<BoxedCase> {
self.dispatch()
}
}
impl HashKeyDispatcher for HashKeyBenchCaseBuilder {
    type Output = Vec<BoxedCase>;

    /// Builds one benchmark case of key type `K` for every combination of `NULL_RATIOS` and
    /// `CHUNK_SIZES` (null ratio varies slowest).
    fn dispatch_impl<K: HashKey>(self) -> Self::Output {
        NULL_RATIOS
            .iter()
            .cartesian_product(CHUNK_SIZES.iter())
            .map(|(&null_ratio, &chunk_size)| {
                // The id is what `cargo bench -- <filter>` matches against.
                let key_kind = calc_hash_key_kind(self.data_types());
                let id = format!(
                    "{}, key type: {:?}, chunk size {}, null ratio {}",
                    self.describe, key_kind, chunk_size, null_ratio
                );
                let input_chunk = gen_chunk(self.data_types(), chunk_size, SEED, null_ratio);
                let case = HashKeyBenchCase::<K>::new(id, input_chunk, self.data_types.clone());
                Box::new(case) as BoxedCase
            })
            .collect()
    }

    /// Key column types of this scenario.
    fn data_types(&self) -> &[DataType] {
        &self.data_types
    }
}

struct HashKeyBenchCase<K: HashKey> {
id: String,
input_chunk: DataChunk,
keys: Vec<K>,
data_types: Vec<DataType>,
col_idxes: Vec<usize>,
}

impl<K: HashKey> HashKeyBenchCase<K> {
pub fn new(id: String, input_chunk: DataChunk, data_types: Vec<DataType>) -> Self {
// please check the `bench_vec_dser` and `bench_deser` method when want to bench not full
// `col_idxes`
let col_idxes = (0..input_chunk.columns().len()).collect_vec();
let keys = HashKey::build(&col_idxes, &input_chunk).unwrap();
Self {
id,
input_chunk,
keys,
data_types,
col_idxes,
}
}

pub fn bench_vec_ser(&self, c: &mut Criterion) {
let vectorize_serialize_id = "vec ser ".to_string() + &self.id;
c.bench_function(&vectorize_serialize_id, |b| {
b.iter(|| K::build(&self.col_idxes, &self.input_chunk).unwrap())
});
}

pub fn bench_vec_deser(&self, c: &mut Criterion) {
let vectorize_deserialize_id = "vec deser ".to_string() + &self.id;
c.bench_function(&vectorize_deserialize_id, |b| {
let mut array_builders = self
.input_chunk
.columns()
.iter()
.map(|c| c.array_ref().create_builder(self.input_chunk.capacity()))
.collect::<Vec<ArrayBuilderImpl>>();
b.iter(|| {
for key in &self.keys {
key.deserialize_to_builders(&mut array_builders[..], &self.data_types)
.unwrap();
}
})
});
}

pub fn bench_deser(&self, c: &mut Criterion) {
let vectorize_deserialize_id = "row deser ".to_string() + &self.id;
c.bench_function(&vectorize_deserialize_id, |b| {
b.iter(|| {
for key in &self.keys {
key.deserialize(&self.data_types).unwrap();
}
})
});
}
}
impl<K: HashKey> Case for HashKeyBenchCase<K> {
    /// Runs the benchmarks of this case in a fixed order: vectorized serialization, vectorized
    /// deserialization, then row deserialization.
    fn bench(&self, c: &mut Criterion) {
        let benches: [fn(&Self, &mut Criterion); 3] = [
            Self::bench_vec_ser,
            Self::bench_vec_deser,
            Self::bench_deser,
        ];
        for run in benches {
            run(self, c);
        }
    }
}

fn gen_chunk(data_types: &[DataType], size: usize, seed: u64, null_ratio: f64) -> DataChunk {
let mut columns = vec![];

for d in data_types {
columns.push(Column::new(match d {
DataType::Boolean => seed_rand_array_ref::<BoolArray>(size, seed, null_ratio),
DataType::Int16 => seed_rand_array_ref::<I16Array>(size, seed, null_ratio),
DataType::Int32 => seed_rand_array_ref::<I32Array>(size, seed, null_ratio),
DataType::Int64 => seed_rand_array_ref::<I64Array>(size, seed, null_ratio),
DataType::Float32 => seed_rand_array_ref::<F32Array>(size, seed, null_ratio),
DataType::Float64 => seed_rand_array_ref::<F64Array>(size, seed, null_ratio),
DataType::Decimal => seed_rand_array_ref::<DecimalArray>(size, seed, null_ratio),
DataType::Date => seed_rand_array_ref::<NaiveDateArray>(size, seed, null_ratio),
DataType::Varchar => seed_rand_array_ref::<Utf8Array>(size, seed, null_ratio),
DataType::Time => seed_rand_array_ref::<NaiveTimeArray>(size, seed, null_ratio),
DataType::Serial => seed_rand_array_ref::<SerialArray>(size, seed, null_ratio),
DataType::Timestamp => {
seed_rand_array_ref::<NaiveDateTimeArray>(size, seed, null_ratio)
}
DataType::Timestamptz => seed_rand_array_ref::<I64Array>(size, seed, null_ratio),
DataType::Interval => seed_rand_array_ref::<IntervalArray>(size, seed, null_ratio),
DataType::Struct(_) | DataType::Bytea | DataType::Jsonb => {
todo!()
}
DataType::List { datatype: _ } => {
todo!()
}
}));
}
risingwave_common::util::schema_check::schema_check(data_types, &columns).unwrap();
DataChunk::new(columns, size)
}

/// Lists the benchmark scenarios: a description plus the key column types of each.
///
/// The description is part of the benchmark id, so every description must be unique; otherwise
/// two scenarios with the same key kind end up with identical ids.
fn case_builders() -> Vec<HashKeyBenchCaseBuilder> {
    [
        ("single Serial", vec![DataType::Serial]),
        ("single int32", vec![DataType::Int32]),
        ("single int64", vec![DataType::Int64]),
        ("single varchar", vec![DataType::Varchar]),
        (
            "composite fixed size",
            vec![DataType::Int32, DataType::Int32, DataType::Int32],
        ),
        (
            "composite fixed size2",
            vec![DataType::Int32, DataType::Int64, DataType::Int32],
        ),
        (
            "composite fixed and not fixed size",
            vec![DataType::Int32, DataType::Varchar],
        ),
        // Numbered like "composite fixed size2" to keep it distinct from the scenario above.
        (
            "composite fixed and not fixed size2",
            vec![DataType::Int64, DataType::Varchar],
        ),
    ]
    .into_iter()
    .map(|(describe, data_types)| HashKeyBenchCaseBuilder {
        data_types,
        describe: describe.to_string(),
    })
    .collect()
}

fn bench_hash_key_encoding(c: &mut Criterion) {
for case_builder in case_builders() {
let cases = case_builder.gen_cases();
for case in cases {
case.bench(c);
}
}
}

// Example: `cargo bench -- "vec ser[\s\S]*KeySerialized[\s\S]*null ratio 0$"` runs all the
// vectorized-serialize benchmarks of `KeySerialized` hash keys whose data has a null ratio of 0.
criterion_group!(benches, bench_hash_key_encoding);
criterion_main!(benches);
4 changes: 2 additions & 2 deletions src/common/src/hash/dispatcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use crate::types::DataType;

/// An enum to help to dynamically dispatch [`HashKey`] template.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum HashKeyKind {
pub enum HashKeyKind {
Key8,
Key16,
Key32,
Expand Down Expand Up @@ -120,7 +120,7 @@ const MAX_FIXED_SIZE_KEY_ELEMENTS: usize = 8;
/// 4. Any column's serialized format can't be used for equality check.
///
/// Otherwise we choose smallest [`crate::hash::FixedSizeKey`] whose size can hold all data types.
fn calc_hash_key_kind(data_types: &[DataType]) -> HashKeyKind {
pub fn calc_hash_key_kind(data_types: &[DataType]) -> HashKeyKind {
if data_types.len() > MAX_FIXED_SIZE_KEY_ELEMENTS {
return HashKeyKind::KeySerialized;
}
Expand Down
29 changes: 19 additions & 10 deletions src/common/src/hash/key.rs
Original file line number Diff line number Diff line change
Expand Up @@ -758,19 +758,28 @@ mod tests {
let capacity = 128;
let seed = 10244021u64;
let columns = vec![
Column::new(seed_rand_array_ref::<BoolArray>(capacity, seed)),
Column::new(seed_rand_array_ref::<I16Array>(capacity, seed + 1)),
Column::new(seed_rand_array_ref::<I32Array>(capacity, seed + 2)),
Column::new(seed_rand_array_ref::<I64Array>(capacity, seed + 3)),
Column::new(seed_rand_array_ref::<F32Array>(capacity, seed + 4)),
Column::new(seed_rand_array_ref::<F64Array>(capacity, seed + 5)),
Column::new(seed_rand_array_ref::<DecimalArray>(capacity, seed + 6)),
Column::new(seed_rand_array_ref::<Utf8Array>(capacity, seed + 7)),
Column::new(seed_rand_array_ref::<NaiveDateArray>(capacity, seed + 8)),
Column::new(seed_rand_array_ref::<NaiveTimeArray>(capacity, seed + 9)),
Column::new(seed_rand_array_ref::<BoolArray>(capacity, seed, 0.5)),
Column::new(seed_rand_array_ref::<I16Array>(capacity, seed + 1, 0.5)),
Column::new(seed_rand_array_ref::<I32Array>(capacity, seed + 2, 0.5)),
Column::new(seed_rand_array_ref::<I64Array>(capacity, seed + 3, 0.5)),
Column::new(seed_rand_array_ref::<F32Array>(capacity, seed + 4, 0.5)),
Column::new(seed_rand_array_ref::<F64Array>(capacity, seed + 5, 0.5)),
Column::new(seed_rand_array_ref::<DecimalArray>(capacity, seed + 6, 0.5)),
Column::new(seed_rand_array_ref::<Utf8Array>(capacity, seed + 7, 0.5)),
Column::new(seed_rand_array_ref::<NaiveDateArray>(
capacity,
seed + 8,
0.5,
)),
Column::new(seed_rand_array_ref::<NaiveTimeArray>(
capacity,
seed + 9,
0.5,
)),
Column::new(seed_rand_array_ref::<NaiveDateTimeArray>(
capacity,
seed + 10,
0.5,
)),
];
let types = vec![
Expand Down
2 changes: 1 addition & 1 deletion src/common/src/hash/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ mod key;
pub use consistent_hash::bitmap::*;
pub use consistent_hash::mapping::*;
pub use consistent_hash::vnode::*;
pub use dispatcher::HashKeyDispatcher;
pub use dispatcher::{calc_hash_key_kind, HashKeyDispatcher};
pub use key::*;
1 change: 0 additions & 1 deletion src/common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ pub mod session_config;
pub mod system_param;
pub mod telemetry;

#[cfg(test)]
pub mod test_utils;
pub mod types;

Expand Down
14 changes: 7 additions & 7 deletions src/common/src/test_utils/rand_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,15 +143,15 @@ impl RandValue for ListValue {
}
}

pub fn rand_array<A, R>(rand: &mut R, size: usize) -> A
pub fn rand_array<A, R>(rand: &mut R, size: usize, null_ratio: f64) -> A
where
A: Array,
R: Rng,
A::OwnedItem: RandValue,
{
let mut builder = A::Builder::new(size);
for _ in 0..size {
let is_null = rand.gen::<bool>();
let is_null = rand.gen_bool(null_ratio);
if is_null {
builder.append_null();
} else {
Expand All @@ -163,21 +163,21 @@ where
builder.finish()
}

pub fn seed_rand_array<A>(size: usize, seed: u64) -> A
pub fn seed_rand_array<A>(size: usize, seed: u64, null_ratio: f64) -> A
where
A: Array,
A::OwnedItem: RandValue,
{
let mut rand = SmallRng::seed_from_u64(seed);
rand_array(&mut rand, size)
rand_array(&mut rand, size, null_ratio)
}

pub fn seed_rand_array_ref<A>(size: usize, seed: u64) -> ArrayRef
pub fn seed_rand_array_ref<A>(size: usize, seed: u64, null_ratio: f64) -> ArrayRef
where
A: Array,
A::OwnedItem: RandValue,
{
let array: A = seed_rand_array(size, seed);
let array: A = seed_rand_array(size, seed, null_ratio);
Arc::new(array.into())
}

Expand All @@ -195,7 +195,7 @@ mod tests {
($( { $variant_name:ident, $suffix_name:ident, $array:ty, $builder:ty } ),*) => {
$(
{
let array = seed_rand_array::<$array>(10, 1024);
let array = seed_rand_array::<$array>(10, 1024, 0.5);
assert_eq!(10, array.len());
}
)*
Expand Down