Skip to content

Commit 57ff8e9

Browse files
authored
chore(core): introduce the benchmark framework for Wren core Rust (#805)
* implement the benchmark framework * add tpch query * add readme and compare script * update gitignore * taplo fmt
1 parent 5c4ad06 commit 57ff8e9

36 files changed

+1587
-1
lines changed

wren-modeling-rs/.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
Cargo.lock
22
target/
33
sqllogictest/test_files/scratch/
4+
benchmarks/results/

wren-modeling-rs/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[workspace]
2-
members = ["core", "sqllogictest", "wren-example"]
2+
members = ["benchmarks", "core", "sqllogictest", "wren-example"]
33
resolver = "2"
44

55
[workspace.package]
+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
[package]
2+
name = "wren-benchmarks"
3+
authors.workspace = true
4+
edition.workspace = true
5+
homepage.workspace = true
6+
license.workspace = true
7+
readme.workspace = true
8+
repository.workspace = true
9+
rust-version.workspace = true
10+
version.workspace = true
11+
12+
[lib]
13+
name = "wren_benchmarks"
14+
path = "src/lib.rs"
15+
16+
[dependencies]
17+
datafusion = { workspace = true }
18+
env_logger = { workspace = true }
19+
log = "0.4.21"
20+
num_cpus = "1.16.0"
21+
serde = { workspace = true }
22+
serde_json = { workspace = true }
23+
structopt = { version = "0.3.26", default-features = false }
24+
tokio = { workspace = true }
25+
wren-core = { workspace = true }

wren-modeling-rs/benchmarks/README.md

+101
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
# Wren core benchmarks
2+
3+
This crate contains the benchmarks for the Wren core library based on some open source benchmarks, to help
4+
with performance improvements of Wren core.
5+
6+
# Supported Benchmarks
7+
8+
## TPCH
9+
10+
Run the tpch benchmark.
11+
12+
This benchmark is derived from the [TPC-H][1] version
13+
[2.17.1]. The data and answers are generated using `tpch-gen` from
14+
[2].
15+
16+
[1]: http://www.tpc.org/tpch/
17+
[2]: https://github.com/databricks/tpch-dbgen.git
18+
[2.17.1]: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf
19+
20+
21+
# Running the benchmarks
22+
23+
## `bench.sh`
24+
25+
The easiest way to run benchmarks is the [bench.sh](bench.sh)
26+
script. Usage instructions can be found with:
27+
28+
```shell
29+
# show usage
30+
./bench.sh
31+
```
32+
33+
## Comparing performance of main and a branch
34+
35+
```shell
36+
git checkout main
37+
38+
# Gather baseline data for tpch benchmark
39+
./benchmarks/bench.sh run tpch
40+
41+
# Switch to the branch to compare (here named mybranch) and gather data
42+
git checkout mybranch
43+
./benchmarks/bench.sh run tpch
44+
45+
# Compare results in the two branches:
46+
./benchmarks/bench.sh compare main mybranch
47+
```
48+
49+
This produces results like:
50+
51+
```shell
52+
Comparing main and mybranch
53+
--------------------
54+
Benchmark tpch.json
55+
--------------------
56+
┏━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┓
57+
┃ Query ┃ main ┃mybranch ┃ Change ┃
58+
┡━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━┩
59+
│ QQuery 1 │ 4.25ms │ 4.26ms │ no change │
60+
│ QQuery 2 │ 11.25ms │ 11.68ms │ no change │
61+
│ QQuery 3 │ 5.03ms │ 4.97ms │ no change │
62+
│ QQuery 4 │ 3.43ms │ 3.46ms │ no change │
63+
│ QQuery 5 │ 7.39ms │ 7.28ms │ no change │
64+
│ QQuery 6 │ 2.26ms │ 2.26ms │ no change │
65+
│ QQuery 7 │ 8.53ms │ 8.51ms │ no change │
66+
│ QQuery 8 │ 9.90ms │ 9.99ms │ no change │
67+
│ QQuery 9 │ 8.56ms │ 8.27ms │ no change │
68+
│ QQuery 10 │ 7.37ms │ 7.63ms │ no change │
69+
│ QQuery 11 │ 7.06ms │ 7.00ms │ no change │
70+
│ QQuery 12 │ 4.35ms │ 4.19ms │ no change │
71+
│ QQuery 13 │ 2.93ms │ 2.88ms │ no change │
72+
│ QQuery 14 │ 3.34ms │ 3.33ms │ no change │
73+
│ QQuery 15 │ 6.51ms │ 6.49ms │ no change │
74+
│ QQuery 16 │ 4.59ms │ 4.64ms │ no change │
75+
│ QQuery 17 │ 4.00ms │ 4.05ms │ no change │
76+
│ QQuery 18 │ 5.46ms │ 5.47ms │ no change │
77+
│ QQuery 19 │ 5.84ms │ 5.72ms │ no change │
78+
│ QQuery 20 │ 7.22ms │ 7.33ms │ no change │
79+
│ QQuery 21 │ 9.35ms │ 9.19ms │ no change │
80+
│ QQuery 22 │ 4.54ms │ 4.33ms │ no change │
81+
└──────────────┴─────────┴─────────┴───────────┘
82+
┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
83+
┃ Benchmark Summary ┃ ┃
84+
┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
85+
│ Total Time (main) │ 133.16ms │
86+
│ Total Time (mybranch) │ 132.92ms │
87+
│ Average Time (main) │ 6.05ms │
88+
│ Average Time (mybranch)│ 6.04ms │
89+
│ Queries Faster │ 0 │
90+
│ Queries Slower │ 0 │
91+
│ Queries with No Change │ 22 │
92+
└────────────────────────┴──────────┘
93+
```
94+
95+
### Running Benchmarks Manually
96+
97+
The `tpch` benchmark can be run with a command like this
98+
99+
```bash
100+
cargo run --release --bin tpch -- benchmark --query 1 -i 10 -o result.json
101+
```

wren-modeling-rs/benchmarks/bench.sh

+214
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
#!/usr/bin/env bash
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing,
13+
# software distributed under the License is distributed on an
14+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
# KIND, either express or implied. See the License for the
16+
# specific language governing permissions and limitations
17+
# under the License.
18+
19+
# This script is meant for developers of Wren core -- it is runnable
20+
# from the standard Wren core development environment and uses cargo,
21+
# etc. and orchestrates gathering data and running the benchmark binary in
22+
# different configurations.
23+
24+
25+
# Abort the script as soon as any command fails
set -e

# Absolute path of the directory that contains this script.
# https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)

# Defaults. COMMAND and BENCHMARK are always reset here; the remaining
# three keep any non-empty value already present in the environment.
COMMAND=
BENCHMARK=all
: "${WREN_DIR:=$SCRIPT_DIR/..}"
: "${CARGO_COMMAND:=cargo run --release}"
: "${VIRTUAL_ENV:=$SCRIPT_DIR/venv}"
38+
39+
# Prints the help text to stdout and terminates the script with status 1.
#
# Globals:
#   WREN_DIR (read) -- its current value is shown as the default directory.
# Outputs:
#   The help text, surrounded by one blank line above and below.
#
# The environment variable that selects the Python venv is VIRTUAL_ENV; that
# is the name the rest of this script reads, so it is the name shown here.
usage() {
  cat <<EOF

Orchestrates running benchmarks against Wren core checkouts

Usage:
$0 run [benchmark]
$0 compare <branch1> <branch2>
$0 venv

**********
Examples:
**********
# Run the 'tpch' benchmark on the Wren core checkout in /source/wren-modeling-rs
WREN_DIR=/source/wren-modeling-rs ./bench.sh run tpch

**********
* Commands
**********
run:          Runs the named benchmark
compare:      Compares results from benchmark runs
venv:         Creates new venv (unless already exists) and installs compare's requirements into it

**********
* Benchmarks
**********
all(default): Data/Run/Compare for all benchmarks
tpch:         TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table, hash join

**********
* Supported Configuration (Environment Variables)
**********
CARGO_COMMAND  command that runs the benchmark binary
WREN_DIR       directory to use (default $WREN_DIR)
RESULTS_NAME   folder where the benchmark files are stored
VIRTUAL_ENV    Python venv to use for compare and venv commands (default ./venv, override by <your-venv>/bin/activate)

EOF
  exit 1
}
77+
78+
# Separate options from positional arguments.
# https://stackoverflow.com/questions/192249/how-do-i-parse-command-line-arguments-in-bash
POSITIONAL_ARGS=()

while (( $# > 0 )); do
  case "$1" in
    -h | --help)
      shift # consume the flag
      usage
      ;;
    -*)
      echo "Unknown option $1"
      exit 1
      ;;
    *)
      # anything else is kept, in order, as a positional argument
      POSITIONAL_ARGS+=("$1")
      shift
      ;;
  esac
done

# Re-install the collected positionals as $1, $2, ... and name them
set -- "${POSITIONAL_ARGS[@]}"
COMMAND=${1:-"${COMMAND}"}
ARG2=$2
ARG3=$3
107+
108+
# Entry point: performs whatever the global COMMAND asks for.
#
# Globals read:    COMMAND, ARG2, ARG3, WREN_DIR, SCRIPT_DIR, CARGO_COMMAND
# Globals written: BENCHMARK, BRANCH_NAME, RESULTS_NAME, RESULTS_DIR
#                  (only for the "run" command)
main() {
  case "$COMMAND" in
    "")
      usage
      ;;
    run)
      # The optional second argument selects the benchmark
      BENCHMARK=${ARG2:-"${BENCHMARK}"}
      # Results are filed under the current git branch; slashes in the
      # branch name become underscores so it is a single path component.
      BRANCH_NAME=$(cd "${WREN_DIR}" && git rev-parse --abbrev-ref HEAD)
      BRANCH_NAME=${BRANCH_NAME//\//_}
      RESULTS_NAME=${RESULTS_NAME:-"${BRANCH_NAME}"}
      RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$RESULTS_NAME"}

      printf '%s\n' \
        "***************************" \
        "DataFusion Benchmark Script" \
        "COMMAND: ${COMMAND}" \
        "BENCHMARK: ${BENCHMARK}" \
        "WREN_DIR: ${WREN_DIR}" \
        "BRANCH_NAME: ${BRANCH_NAME}" \
        "RESULTS_DIR: ${RESULTS_DIR}" \
        "CARGO_COMMAND: ${CARGO_COMMAND}" \
        "***************************"

      # Run from inside the benchmarks crate directory
      pushd "${WREN_DIR}/benchmarks" > /dev/null
      mkdir -p "${RESULTS_DIR}"
      case "$BENCHMARK" in
        all | tpch)
          run_tpch "1"
          ;;
        *)
          echo "Error: unknown benchmark '$BENCHMARK' for run"
          usage
          ;;
      esac
      popd > /dev/null
      echo "Done"
      ;;
    compare)
      compare_benchmarks "$ARG2" "$ARG3"
      ;;
    venv)
      setup_venv
      ;;
    *)
      echo "Error: unknown command: $COMMAND"
      usage
      ;;
  esac
}
163+
164+
165+
# Runs the tpch benchmark binary and stores its output as tpch.json in the
# results directory.
#
# Globals read:    RESULTS_DIR, CARGO_COMMAND
# Globals written: RESULTS_FILE
# Arguments:       accepted but not used by the body
run_tpch() {
  RESULTS_FILE="${RESULTS_DIR}/tpch.json"
  printf '%s\n' "RESULTS_FILE: ${RESULTS_FILE}"
  printf '%s\n' "Running tpch benchmark..."
  # CARGO_COMMAND holds a command plus its arguments in one string, so it
  # must be word-split here.
  # shellcheck disable=SC2086
  ${CARGO_COMMAND} --bin tpch -- benchmark -i 10 -o "${RESULTS_FILE}"
}
172+
173+
174+
175+
176+
# Compares the stored benchmark results of two branches.
#
# For every <name>.json stored for branch1, compare.py is run against the
# file of the same name stored for branch2; files with no counterpart in
# branch2 are reported and skipped.
#
# Globals read: SCRIPT_DIR (location of results/ and compare.py),
#               VIRTUAL_ENV (its bin/ directory is put first on PATH)
# Arguments:    $1 - results name of the first branch
#               $2 - results name of the second branch
# Exits with status 1 when an argument is missing or when branch1 has no
# stored results at all.
compare_benchmarks() {
  local base_results_dir="${SCRIPT_DIR}/results"
  local branch1="$1"
  local branch2="$2"
  local results_file1 results_file2 bench
  local found=0

  if [[ -z "$branch1" ]]; then
    echo "<branch1> not specified. Available branches:"
    ls -1 "${base_results_dir}"
    exit 1
  fi

  if [[ -z "$branch2" ]]; then
    echo "<branch2> not specified"
    ls -1 "${base_results_dir}"
    exit 1
  fi

  echo "Comparing ${branch1} and ${branch2}"
  for results_file1 in "${base_results_dir}/${branch1}"/*.json; do
    # A glob that matches nothing is left as the literal pattern; skip it
    # rather than reporting a nonexistent "*.json" file.
    [[ -f "$results_file1" ]] || continue
    found=1

    bench=$(basename "${results_file1}")
    results_file2="${base_results_dir}/${branch2}/${bench}"
    if [[ -f "${results_file2}" ]]; then
      echo "--------------------"
      echo "Benchmark ${bench}"
      echo "--------------------"
      PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py "${results_file1}" "${results_file2}"
    else
      echo "Note: Skipping ${results_file1} as ${results_file2} does not exist"
    fi
  done

  # Nothing to compare means a wrong branch name or a run that never
  # happened; fail loudly instead of exiting successfully.
  if (( found == 0 )); then
    echo "Error: no benchmark results (*.json) found in ${base_results_dir}/${branch1}" >&2
    exit 1
  fi
}
207+
208+
# Creates the Python virtual environment at $VIRTUAL_ENV and installs the
# packages listed in requirements.txt into it.
#
# Globals read: VIRTUAL_ENV, SCRIPT_DIR
#
# requirements.txt is addressed through SCRIPT_DIR (the same way compare.py
# is) so the command works from any working directory.
# NOTE(review): assumes requirements.txt sits next to this script -- confirm.
setup_venv() {
  python3 -m venv "$VIRTUAL_ENV"
  # Put the venv's bin/ first on PATH so its python3 (and pip) is the one used
  PATH=$VIRTUAL_ENV/bin:$PATH python3 -m pip install -r "${SCRIPT_DIR}/requirements.txt"
}
212+
213+
# And start the process up
214+
main

0 commit comments

Comments
 (0)