This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 8b5c9b465d feat: generate reversed-name data for sort pushdown
benchmark (#21266)
8b5c9b465d is described below
commit 8b5c9b465dace58b0ea8d7b8b8f55f7730eeb610
Author: Qi Zhu <[email protected]>
AuthorDate: Thu Apr 2 20:57:25 2026 +0800
feat: generate reversed-name data for sort pushdown benchmark (#21266)
## Which issue does this PR close?
Related to https://github.com/apache/datafusion/issues/17348
Precursor to https://github.com/apache/datafusion/pull/21182
## Rationale for this change
The sort pushdown benchmark (#21213) uses TPC-H data where file names
happen to match sort key order, so the optimization in #21182 shows no
difference vs. main
([comment](https://github.com/apache/datafusion/pull/21182#issuecomment-4158740710)).
This PR generates custom benchmark data with **reversed file names** so
the sort pushdown optimizer must reorder files by statistics to achieve
sort elimination.
## What changes are included in this PR?
Updated `data_sort_pushdown` in `bench.sh` to use `tpchgen --parts=3`
and rename files:
```
tpchgen produces 3 sorted, non-overlapping parquet files:
lineitem.1.parquet: l_orderkey 1 ~ 2M (lowest keys)
lineitem.2.parquet: l_orderkey 2M ~ 4M
lineitem.3.parquet: l_orderkey 4M ~ 6M (highest keys)
Renamed so alphabetical order is reversed vs key order:
a_part3.parquet → highest keys, sorts first alphabetically
b_part2.parquet
c_part1.parquet → lowest keys, sorts last alphabetically
```
No datafusion-cli needed — just `tpchgen-cli` + `mv`.
## Benchmark Results
With #21182 optimization (release build, 6M rows, single partition):
**On main (no optimization)**: files read in alphabetical order
`[a_part3, b_part2, c_part1]` → wrong order → SortExec stays
**With optimization**: files reordered by statistics `[c_part1, b_part2,
a_part3]` → non-overlapping → SortExec eliminated
| Query | Description | Main (ms) | PR #21182 (ms) | Time reduction |
|-------|-------------|-----------|----------------|---------|
| Q1 | `ORDER BY ASC` (full scan) | 259 | 122 | **53%** |
| Q2 | `ORDER BY ASC LIMIT 100` | 80 | 9 | **89%** |
| Q3 | `SELECT * ORDER BY ASC` | 700 | 353 | **50%** |
| Q4 | `SELECT * ORDER BY ASC LIMIT 100` | 342 | 24 | **93%** |
LIMIT queries show the biggest improvement because sort elimination +
limit pushdown means only the first ~100 rows are read before stopping.
## Test plan
- [x] `cargo clippy -p datafusion-benchmarks` — 0 warnings
- [x] Local benchmark verified with reversed-name data
🤖 Generated with [Claude Code](https://claude.com/claude-code)
---
benchmarks/bench.sh | 49 +++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 43 insertions(+), 6 deletions(-)
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index b8c9ff5c8e..badf9ce435 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -314,8 +314,7 @@ main() {
data_tpch "1" "parquet"
;;
sort_pushdown|sort_pushdown_sorted)
- # same data as for tpch
- data_tpch "1" "parquet"
+ data_sort_pushdown
;;
sort_tpch)
# same data as for tpch
@@ -1085,19 +1084,57 @@ run_external_aggr() {
}
# Runs the sort pushdown benchmark (without WITH ORDER)
+# Generates sort pushdown benchmark data: TPC-H lineitem with 3 parts,
+# renamed so alphabetical order does NOT match sort key order.
+# This forces the sort pushdown optimizer to reorder files by statistics.
+#
+# tpchgen produces 3 sorted, non-overlapping parquet files:
+# lineitem.1.parquet: l_orderkey 1 ~ 2M (lowest keys)
+# lineitem.2.parquet: l_orderkey 2M ~ 4M
+# lineitem.3.parquet: l_orderkey 4M ~ 6M (highest keys)
+#
+# We rename them so alphabetical order is reversed:
+# a_part3.parquet (highest keys, sorts first alphabetically)
+# b_part2.parquet
+# c_part1.parquet (lowest keys, sorts last alphabetically)
+data_sort_pushdown() {
+ SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown/lineitem"
+ if [ -d "${SORT_PUSHDOWN_DIR}" ] && [ "$(ls -A ${SORT_PUSHDOWN_DIR}/*.parquet 2>/dev/null)" ]; then
+ echo "Sort pushdown data already exists at ${SORT_PUSHDOWN_DIR}"
+ return
+ fi
+
+ echo "Generating sort pushdown benchmark data (3 parts with reversed naming)..."
+
+ TEMP_DIR="${DATA_DIR}/sort_pushdown_temp"
+ mkdir -p "${TEMP_DIR}" "${SORT_PUSHDOWN_DIR}"
+
+ tpchgen-cli --scale-factor 1 --format parquet --parquet-compression='ZSTD(1)' --parts=3 --output-dir "${TEMP_DIR}"
+
+ # Rename: reverse alphabetical order vs key order
+ mv "${TEMP_DIR}/lineitem/lineitem.3.parquet" "${SORT_PUSHDOWN_DIR}/a_part3.parquet"
+ mv "${TEMP_DIR}/lineitem/lineitem.2.parquet" "${SORT_PUSHDOWN_DIR}/b_part2.parquet"
+ mv "${TEMP_DIR}/lineitem/lineitem.1.parquet" "${SORT_PUSHDOWN_DIR}/c_part1.parquet"
+
+ rm -rf "${TEMP_DIR}"
+
+ echo "Sort pushdown data generated at ${SORT_PUSHDOWN_DIR}"
+ ls -la "${SORT_PUSHDOWN_DIR}"
+}
+
run_sort_pushdown() {
- TPCH_DIR="${DATA_DIR}/tpch_sf1"
+ SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json"
echo "Running sort pushdown benchmark (no WITH ORDER)..."
- debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
+ debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}
# Runs the sort pushdown benchmark with WITH ORDER (enables sort elimination)
run_sort_pushdown_sorted() {
- TPCH_DIR="${DATA_DIR}/tpch_sf1"
+ SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json"
echo "Running sort pushdown benchmark (with WITH ORDER)..."
- debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
+ debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
}
# Runs the sort integration benchmark
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]