This is an automated email from the ASF dual-hosted git repository.
comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 0f7b1c58f8 minor: Allow to run TPCH bench for a specific query (#15467)
0f7b1c58f8 is described below
commit 0f7b1c58f833227263030f6069ce722abc8e8675
Author: Oleks V <[email protected]>
AuthorDate: Fri Mar 28 07:29:42 2025 -0700
minor: Allow to run TPCH bench for a specific query (#15467)
* minor: Allow to run TPCH bench for a specific query
---
benchmarks/README.md | 18 +++++++++++++++++-
benchmarks/bench.sh | 9 ++++++---
2 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 39b4584bd2..8acaa298bd 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -46,6 +46,7 @@ script. Usage instructions can be found with:
```shell
# show usage
+cd ./benchmarks/
./bench.sh
```
@@ -64,9 +65,24 @@ Create / download a specific dataset (TPCH)
```shell
./bench.sh data tpch
```
-
Data is placed in the `data` subdirectory.
+## Running benchmarks
+
+Run the benchmark for the TPC-H dataset
+```shell
+./bench.sh run tpch
+```
+or for the TPC-H dataset at scale factor 10
+```shell
+./bench.sh run tpch10
+```
+
+To run a specific query, for example Q21
+```shell
+./bench.sh run tpch10 21
+```
+
## Select join algorithm
The benchmark runs with `prefer_hash_join == true` by default, which enforces
the HASH join algorithm.
To run TPCH benchmarks with a join algorithm other than HASH:
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index 43d3d78c7e..5be825eb0d 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -43,7 +43,7 @@ usage() {
Orchestrates running benchmarks against DataFusion checkouts
Usage:
-$0 data [benchmark]
+$0 data [benchmark] [query]
$0 run [benchmark]
$0 compare <branch1> <branch2>
$0 venv
@@ -410,7 +410,9 @@ run_tpch() {
RESULTS_FILE="${RESULTS_DIR}/tpch_sf${SCALE_FACTOR}.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch benchmark..."
- $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path
"${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o
"${RESULTS_FILE}"
+ # Optional query filter to run specific query
+ QUERY=$([ -n "$ARG3" ] && echo "--query $ARG3" || echo "")
+ $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path
"${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o
"${RESULTS_FILE}" $QUERY
}
# Runs the tpch in memory
@@ -425,8 +427,9 @@ run_tpch_mem() {
RESULTS_FILE="${RESULTS_DIR}/tpch_mem_sf${SCALE_FACTOR}.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch_mem benchmark..."
+ QUERY=$([ -n "$ARG3" ] && echo "--query $ARG3" || echo "")
# -m means in memory
- $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path
"${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o
"${RESULTS_FILE}"
+ $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path
"${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o
"${RESULTS_FILE}" $QUERY
}
# Runs the cancellation benchmark
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]