Skip to content

Commit 1df800a

Browse files
authored
perf(hashkey): add benchmark for hash key ser/deser (risingwavelabs#8733)
1 parent 96aa23d commit 1df800a

File tree

7 files changed

+257
-21
lines changed

7 files changed

+257
-21
lines changed

src/common/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,7 @@ harness = false
111111
[[bench]]
112112
name = "bitmap"
113113
harness = false
114+
115+
[[bench]]
116+
name = "bench_hash_key_encoding"
117+
harness = false
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
// Copyright 2023 RisingWave Labs
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
use criterion::{criterion_group, criterion_main, Criterion};
16+
use itertools::Itertools;
17+
use risingwave_common::array::column::Column;
18+
use risingwave_common::array::serial_array::SerialArray;
19+
use risingwave_common::array::{
20+
ArrayBuilderImpl, BoolArray, DataChunk, DecimalArray, F32Array, F64Array, I16Array, I32Array,
21+
I64Array, IntervalArray, NaiveDateArray, NaiveDateTimeArray, NaiveTimeArray, Utf8Array,
22+
};
23+
use risingwave_common::hash::{calc_hash_key_kind, HashKey, HashKeyDispatcher};
24+
use risingwave_common::test_utils::rand_array::seed_rand_array_ref;
25+
use risingwave_common::types::DataType;
26+
27+
static SEED: u64 = 998244353u64;
28+
static CHUNK_SIZES: &[usize] = &[128, 1024];
29+
static NULL_RATIOS: &[f64] = &[0.0, 0.01, 0.1];
30+
31+
trait Case: Send + 'static {
32+
fn bench(&self, c: &mut Criterion);
33+
}
34+
type BoxedCase = Box<dyn Case>;
35+
36+
struct HashKeyBenchCaseBuilder {
37+
pub data_types: Vec<DataType>,
38+
pub describe: String,
39+
}
40+
impl HashKeyBenchCaseBuilder {
41+
pub fn gen_cases(self) -> Vec<BoxedCase> {
42+
self.dispatch()
43+
}
44+
}
45+
impl HashKeyDispatcher for HashKeyBenchCaseBuilder {
46+
type Output = Vec<BoxedCase>;
47+
48+
fn dispatch_impl<K: HashKey>(self) -> Self::Output {
49+
let mut ret: Vec<BoxedCase> = vec![];
50+
for null_ratio in NULL_RATIOS {
51+
for chunk_size in CHUNK_SIZES {
52+
let id = format!(
53+
"{}, key type: {:?}, chunk size {}, null ratio {}",
54+
self.describe,
55+
calc_hash_key_kind(self.data_types()),
56+
chunk_size,
57+
null_ratio
58+
);
59+
let input_chunk = gen_chunk(self.data_types(), *chunk_size, SEED, *null_ratio);
60+
ret.push(Box::new(HashKeyBenchCase::<K>::new(
61+
id,
62+
input_chunk,
63+
self.data_types.clone(),
64+
)));
65+
}
66+
}
67+
ret
68+
}
69+
70+
fn data_types(&self) -> &[DataType] {
71+
&self.data_types
72+
}
73+
}
74+
75+
struct HashKeyBenchCase<K: HashKey> {
76+
id: String,
77+
input_chunk: DataChunk,
78+
keys: Vec<K>,
79+
data_types: Vec<DataType>,
80+
col_idxes: Vec<usize>,
81+
}
82+
83+
impl<K: HashKey> HashKeyBenchCase<K> {
84+
pub fn new(id: String, input_chunk: DataChunk, data_types: Vec<DataType>) -> Self {
85+
// Please check the `bench_vec_deser` and `bench_deser` methods if you want to bench with a
86+
// partial `col_idxes`.
87+
let col_idxes = (0..input_chunk.columns().len()).collect_vec();
88+
let keys = HashKey::build(&col_idxes, &input_chunk).unwrap();
89+
Self {
90+
id,
91+
input_chunk,
92+
keys,
93+
data_types,
94+
col_idxes,
95+
}
96+
}
97+
98+
pub fn bench_vec_ser(&self, c: &mut Criterion) {
99+
let vectorize_serialize_id = "vec ser ".to_string() + &self.id;
100+
c.bench_function(&vectorize_serialize_id, |b| {
101+
b.iter(|| K::build(&self.col_idxes, &self.input_chunk).unwrap())
102+
});
103+
}
104+
105+
pub fn bench_vec_deser(&self, c: &mut Criterion) {
106+
let vectorize_deserialize_id = "vec deser ".to_string() + &self.id;
107+
c.bench_function(&vectorize_deserialize_id, |b| {
108+
let mut array_builders = self
109+
.input_chunk
110+
.columns()
111+
.iter()
112+
.map(|c| c.array_ref().create_builder(self.input_chunk.capacity()))
113+
.collect::<Vec<ArrayBuilderImpl>>();
114+
b.iter(|| {
115+
for key in &self.keys {
116+
key.deserialize_to_builders(&mut array_builders[..], &self.data_types)
117+
.unwrap();
118+
}
119+
})
120+
});
121+
}
122+
123+
pub fn bench_deser(&self, c: &mut Criterion) {
124+
let vectorize_deserialize_id = "row deser ".to_string() + &self.id;
125+
c.bench_function(&vectorize_deserialize_id, |b| {
126+
b.iter(|| {
127+
for key in &self.keys {
128+
key.deserialize(&self.data_types).unwrap();
129+
}
130+
})
131+
});
132+
}
133+
}
134+
impl<K: HashKey> Case for HashKeyBenchCase<K> {
135+
fn bench(&self, c: &mut Criterion) {
136+
self.bench_vec_ser(c);
137+
self.bench_vec_deser(c);
138+
self.bench_deser(c);
139+
}
140+
}
141+
142+
fn gen_chunk(data_types: &[DataType], size: usize, seed: u64, null_ratio: f64) -> DataChunk {
143+
let mut columns = vec![];
144+
145+
for d in data_types {
146+
columns.push(Column::new(match d {
147+
DataType::Boolean => seed_rand_array_ref::<BoolArray>(size, seed, null_ratio),
148+
DataType::Int16 => seed_rand_array_ref::<I16Array>(size, seed, null_ratio),
149+
DataType::Int32 => seed_rand_array_ref::<I32Array>(size, seed, null_ratio),
150+
DataType::Int64 => seed_rand_array_ref::<I64Array>(size, seed, null_ratio),
151+
DataType::Float32 => seed_rand_array_ref::<F32Array>(size, seed, null_ratio),
152+
DataType::Float64 => seed_rand_array_ref::<F64Array>(size, seed, null_ratio),
153+
DataType::Decimal => seed_rand_array_ref::<DecimalArray>(size, seed, null_ratio),
154+
DataType::Date => seed_rand_array_ref::<NaiveDateArray>(size, seed, null_ratio),
155+
DataType::Varchar => seed_rand_array_ref::<Utf8Array>(size, seed, null_ratio),
156+
DataType::Time => seed_rand_array_ref::<NaiveTimeArray>(size, seed, null_ratio),
157+
DataType::Serial => seed_rand_array_ref::<SerialArray>(size, seed, null_ratio),
158+
DataType::Timestamp => {
159+
seed_rand_array_ref::<NaiveDateTimeArray>(size, seed, null_ratio)
160+
}
161+
DataType::Timestamptz => seed_rand_array_ref::<I64Array>(size, seed, null_ratio),
162+
DataType::Interval => seed_rand_array_ref::<IntervalArray>(size, seed, null_ratio),
163+
DataType::Struct(_) | DataType::Bytea | DataType::Jsonb => {
164+
todo!()
165+
}
166+
DataType::List { datatype: _ } => {
167+
todo!()
168+
}
169+
}));
170+
}
171+
risingwave_common::util::schema_check::schema_check(data_types, &columns).unwrap();
172+
DataChunk::new(columns, size)
173+
}
174+
175+
fn case_builders() -> Vec<HashKeyBenchCaseBuilder> {
176+
vec![
177+
HashKeyBenchCaseBuilder {
178+
data_types: vec![DataType::Serial],
179+
describe: "single Serial".to_string(),
180+
},
181+
HashKeyBenchCaseBuilder {
182+
data_types: vec![DataType::Int32],
183+
describe: "single int32".to_string(),
184+
},
185+
HashKeyBenchCaseBuilder {
186+
data_types: vec![DataType::Int64],
187+
describe: "single int64".to_string(),
188+
},
189+
HashKeyBenchCaseBuilder {
190+
data_types: vec![DataType::Varchar],
191+
describe: "single varchar".to_string(),
192+
},
193+
HashKeyBenchCaseBuilder {
194+
data_types: vec![DataType::Int32, DataType::Int32, DataType::Int32],
195+
describe: "composite fixed size".to_string(),
196+
},
197+
HashKeyBenchCaseBuilder {
198+
data_types: vec![DataType::Int32, DataType::Int64, DataType::Int32],
199+
describe: "composite fixed size2".to_string(),
200+
},
201+
HashKeyBenchCaseBuilder {
202+
data_types: vec![DataType::Int32, DataType::Varchar],
203+
describe: "composite fixed and not fixed size".to_string(),
204+
},
205+
HashKeyBenchCaseBuilder {
206+
data_types: vec![DataType::Int64, DataType::Varchar],
207+
describe: "composite fixed and not fixed size".to_string(),
208+
},
209+
]
210+
}
211+
212+
fn bench_hash_key_encoding(c: &mut Criterion) {
213+
for case_builder in case_builders() {
214+
let cases = case_builder.gen_cases();
215+
for case in cases {
216+
case.bench(c);
217+
}
218+
}
219+
}
220+
221+
// `cargo bench -- "vec ser[\s\S]*KeySerialized[\s\S]*null ratio 0$"` benches all the
222+
// `KeySerialized` hash key vectorized serialization cases whose data has a null ratio of 0.
223+
criterion_group!(benches, bench_hash_key_encoding);
224+
criterion_main!(benches);

src/common/src/hash/dispatcher.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use crate::types::DataType;
1919

2020
/// An enum to help to dynamically dispatch [`HashKey`] template.
2121
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
22-
enum HashKeyKind {
22+
pub enum HashKeyKind {
2323
Key8,
2424
Key16,
2525
Key32,
@@ -120,7 +120,7 @@ const MAX_FIXED_SIZE_KEY_ELEMENTS: usize = 8;
120120
/// 4. Any column's serialized format can't be used for equality check.
121121
///
122122
/// Otherwise we choose smallest [`crate::hash::FixedSizeKey`] whose size can hold all data types.
123-
fn calc_hash_key_kind(data_types: &[DataType]) -> HashKeyKind {
123+
pub fn calc_hash_key_kind(data_types: &[DataType]) -> HashKeyKind {
124124
if data_types.len() > MAX_FIXED_SIZE_KEY_ELEMENTS {
125125
return HashKeyKind::KeySerialized;
126126
}

src/common/src/hash/key.rs

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -758,19 +758,28 @@ mod tests {
758758
let capacity = 128;
759759
let seed = 10244021u64;
760760
let columns = vec![
761-
Column::new(seed_rand_array_ref::<BoolArray>(capacity, seed)),
762-
Column::new(seed_rand_array_ref::<I16Array>(capacity, seed + 1)),
763-
Column::new(seed_rand_array_ref::<I32Array>(capacity, seed + 2)),
764-
Column::new(seed_rand_array_ref::<I64Array>(capacity, seed + 3)),
765-
Column::new(seed_rand_array_ref::<F32Array>(capacity, seed + 4)),
766-
Column::new(seed_rand_array_ref::<F64Array>(capacity, seed + 5)),
767-
Column::new(seed_rand_array_ref::<DecimalArray>(capacity, seed + 6)),
768-
Column::new(seed_rand_array_ref::<Utf8Array>(capacity, seed + 7)),
769-
Column::new(seed_rand_array_ref::<NaiveDateArray>(capacity, seed + 8)),
770-
Column::new(seed_rand_array_ref::<NaiveTimeArray>(capacity, seed + 9)),
761+
Column::new(seed_rand_array_ref::<BoolArray>(capacity, seed, 0.5)),
762+
Column::new(seed_rand_array_ref::<I16Array>(capacity, seed + 1, 0.5)),
763+
Column::new(seed_rand_array_ref::<I32Array>(capacity, seed + 2, 0.5)),
764+
Column::new(seed_rand_array_ref::<I64Array>(capacity, seed + 3, 0.5)),
765+
Column::new(seed_rand_array_ref::<F32Array>(capacity, seed + 4, 0.5)),
766+
Column::new(seed_rand_array_ref::<F64Array>(capacity, seed + 5, 0.5)),
767+
Column::new(seed_rand_array_ref::<DecimalArray>(capacity, seed + 6, 0.5)),
768+
Column::new(seed_rand_array_ref::<Utf8Array>(capacity, seed + 7, 0.5)),
769+
Column::new(seed_rand_array_ref::<NaiveDateArray>(
770+
capacity,
771+
seed + 8,
772+
0.5,
773+
)),
774+
Column::new(seed_rand_array_ref::<NaiveTimeArray>(
775+
capacity,
776+
seed + 9,
777+
0.5,
778+
)),
771779
Column::new(seed_rand_array_ref::<NaiveDateTimeArray>(
772780
capacity,
773781
seed + 10,
782+
0.5,
774783
)),
775784
];
776785
let types = vec![

src/common/src/hash/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,5 @@ mod key;
1919
pub use consistent_hash::bitmap::*;
2020
pub use consistent_hash::mapping::*;
2121
pub use consistent_hash::vnode::*;
22-
pub use dispatcher::HashKeyDispatcher;
22+
pub use dispatcher::{calc_hash_key_kind, HashKeyDispatcher};
2323
pub use key::*;

src/common/src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ pub mod session_config;
5656
pub mod system_param;
5757
pub mod telemetry;
5858

59-
#[cfg(test)]
6059
pub mod test_utils;
6160
pub mod types;
6261

src/common/src/test_utils/rand_array.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -143,15 +143,15 @@ impl RandValue for ListValue {
143143
}
144144
}
145145

146-
pub fn rand_array<A, R>(rand: &mut R, size: usize) -> A
146+
pub fn rand_array<A, R>(rand: &mut R, size: usize, null_ratio: f64) -> A
147147
where
148148
A: Array,
149149
R: Rng,
150150
A::OwnedItem: RandValue,
151151
{
152152
let mut builder = A::Builder::new(size);
153153
for _ in 0..size {
154-
let is_null = rand.gen::<bool>();
154+
let is_null = rand.gen_bool(null_ratio);
155155
if is_null {
156156
builder.append_null();
157157
} else {
@@ -163,21 +163,21 @@ where
163163
builder.finish()
164164
}
165165

166-
pub fn seed_rand_array<A>(size: usize, seed: u64) -> A
166+
pub fn seed_rand_array<A>(size: usize, seed: u64, null_ratio: f64) -> A
167167
where
168168
A: Array,
169169
A::OwnedItem: RandValue,
170170
{
171171
let mut rand = SmallRng::seed_from_u64(seed);
172-
rand_array(&mut rand, size)
172+
rand_array(&mut rand, size, null_ratio)
173173
}
174174

175-
pub fn seed_rand_array_ref<A>(size: usize, seed: u64) -> ArrayRef
175+
pub fn seed_rand_array_ref<A>(size: usize, seed: u64, null_ratio: f64) -> ArrayRef
176176
where
177177
A: Array,
178178
A::OwnedItem: RandValue,
179179
{
180-
let array: A = seed_rand_array(size, seed);
180+
let array: A = seed_rand_array(size, seed, null_ratio);
181181
Arc::new(array.into())
182182
}
183183

@@ -195,7 +195,7 @@ mod tests {
195195
($( { $variant_name:ident, $suffix_name:ident, $array:ty, $builder:ty } ),*) => {
196196
$(
197197
{
198-
let array = seed_rand_array::<$array>(10, 1024);
198+
let array = seed_rand_array::<$array>(10, 1024, 0.5);
199199
assert_eq!(10, array.len());
200200
}
201201
)*

0 commit comments

Comments
 (0)