This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 8b5c9b465d feat: generate reversed-name data for sort pushdown benchmark (#21266)
8b5c9b465d is described below

commit 8b5c9b465dace58b0ea8d7b8b8f55f7730eeb610
Author: Qi Zhu <[email protected]>
AuthorDate: Thu Apr 2 20:57:25 2026 +0800

    feat: generate reversed-name data for sort pushdown benchmark (#21266)
    
    ## Which issue does this PR close?
    
    Related to https://github.com/apache/datafusion/issues/17348
    Precursor to https://github.com/apache/datafusion/pull/21182
    
    ## Rationale for this change
    
    The sort pushdown benchmark (#21213) uses TPC-H data where file names
    happen to match sort key order, so the optimization in #21182 shows no
    difference vs. main
    ([comment](https://github.com/apache/datafusion/pull/21182#issuecomment-4158740710)).
    
    This PR generates custom benchmark data with **reversed file names** so
    the sort pushdown optimizer must reorder files by statistics to achieve
    sort elimination.
    
    ## What changes are included in this PR?
    
    Updated `data_sort_pushdown` in `bench.sh` to use `tpchgen --parts=3`
    and rename files:
    
    ```
    tpchgen produces 3 sorted, non-overlapping parquet files:
      lineitem.1.parquet: l_orderkey 1 ~ 2M        (lowest keys)
      lineitem.2.parquet: l_orderkey 2M ~ 4M
      lineitem.3.parquet: l_orderkey 4M ~ 6M       (highest keys)
    
    Renamed so alphabetical order is reversed vs key order:
      a_part3.parquet → highest keys, sorts first alphabetically
      b_part2.parquet
      c_part1.parquet → lowest keys, sorts last alphabetically
    ```
    
    No datafusion-cli needed — just `tpchgen-cli` + `mv`.
    
    ## Benchmark Results
    
    With #21182 optimization (release build, 6M rows, single partition):
    
    **On main (no optimization)**: files read in alphabetical order
    `[a_part3, b_part2, c_part1]` → wrong order → SortExec stays
    
    **With optimization**: files reordered by statistics `[c_part1, b_part2,
    a_part3]` → non-overlapping → SortExec eliminated
    
    | Query | Description | Main (ms) | PR #21182 (ms) | Speedup |
    |-------|-------------|-----------|----------------|---------|
    | Q1 | `ORDER BY ASC` (full scan) | 259 | 122 | **53%** |
    | Q2 | `ORDER BY ASC LIMIT 100` | 80 | 9 | **89%** |
    | Q3 | `SELECT * ORDER BY ASC` | 700 | 353 | **50%** |
    | Q4 | `SELECT * LIMIT 100` | 342 | 24 | **93%** |
    
    LIMIT queries show the biggest improvement because sort elimination +
    limit pushdown means only the first ~100 rows are read before stopping.
    
    ## Test plan
    
    - [x] `cargo clippy -p datafusion-benchmarks` — 0 warnings
    - [x] Local benchmark verified with reversed-name data
    
    🤖 Generated with [Claude Code](https://claude.com/claude-code)
---
 benchmarks/bench.sh | 49 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 6 deletions(-)

diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index b8c9ff5c8e..badf9ce435 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -314,8 +314,7 @@ main() {
                     data_tpch "1" "parquet"
                     ;;
                 sort_pushdown|sort_pushdown_sorted)
-                    # same data as for tpch
-                    data_tpch "1" "parquet"
+                    data_sort_pushdown
                     ;;
                 sort_tpch)
                     # same data as for tpch
@@ -1085,19 +1084,57 @@ run_external_aggr() {
 }
 
 # Runs the sort pushdown benchmark (without WITH ORDER)
+# Generates sort pushdown benchmark data: TPC-H lineitem with 3 parts,
+# renamed so alphabetical order does NOT match sort key order.
+# This forces the sort pushdown optimizer to reorder files by statistics.
+#
+# tpchgen produces 3 sorted, non-overlapping parquet files:
+#   lineitem.1.parquet: l_orderkey 1 ~ 2M        (lowest keys)
+#   lineitem.2.parquet: l_orderkey 2M ~ 4M
+#   lineitem.3.parquet: l_orderkey 4M ~ 6M       (highest keys)
+#
+# We rename them so alphabetical order is reversed:
+#   a_part3.parquet (highest keys, sorts first alphabetically)
+#   b_part2.parquet
+#   c_part1.parquet (lowest keys, sorts last alphabetically)
+data_sort_pushdown() {
+    SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown/lineitem"
+    if [ -d "${SORT_PUSHDOWN_DIR}" ] && [ "$(ls -A ${SORT_PUSHDOWN_DIR}/*.parquet 2>/dev/null)" ]; then
+        echo "Sort pushdown data already exists at ${SORT_PUSHDOWN_DIR}"
+        return
+    fi
+
+    echo "Generating sort pushdown benchmark data (3 parts with reversed naming)..."
+
+    TEMP_DIR="${DATA_DIR}/sort_pushdown_temp"
+    mkdir -p "${TEMP_DIR}" "${SORT_PUSHDOWN_DIR}"
+
+    tpchgen-cli --scale-factor 1 --format parquet --parquet-compression='ZSTD(1)' --parts=3 --output-dir "${TEMP_DIR}"
+
+    # Rename: reverse alphabetical order vs key order
+    mv "${TEMP_DIR}/lineitem/lineitem.3.parquet" "${SORT_PUSHDOWN_DIR}/a_part3.parquet"
+    mv "${TEMP_DIR}/lineitem/lineitem.2.parquet" "${SORT_PUSHDOWN_DIR}/b_part2.parquet"
+    mv "${TEMP_DIR}/lineitem/lineitem.1.parquet" "${SORT_PUSHDOWN_DIR}/c_part1.parquet"
+
+    rm -rf "${TEMP_DIR}"
+
+    echo "Sort pushdown data generated at ${SORT_PUSHDOWN_DIR}"
+    ls -la "${SORT_PUSHDOWN_DIR}"
+}
+
 run_sort_pushdown() {
-    TPCH_DIR="${DATA_DIR}/tpch_sf1"
+    SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
     RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json"
     echo "Running sort pushdown benchmark (no WITH ORDER)..."
-    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the sort pushdown benchmark with WITH ORDER (enables sort elimination)
 run_sort_pushdown_sorted() {
-    TPCH_DIR="${DATA_DIR}/tpch_sf1"
+    SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
     RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json"
     echo "Running sort pushdown benchmark (with WITH ORDER)..."
-    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the sort integration benchmark


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to