zhuqi-lucas commented on code in PR #21213: URL: https://github.com/apache/datafusion/pull/21213#discussion_r3007355239
########## benchmarks/src/sort_pushdown.rs: ########## @@ -0,0 +1,306 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for sort pushdown optimization. +//! +//! Tests performance of sort elimination when files are non-overlapping and +//! internally sorted (declared via `--sorted` / `WITH ORDER`). +//! +//! # Usage +//! +//! ```text +//! # Prepare sorted TPCH lineitem data (SF=1) +//! ./bench.sh data sort_pushdown +//! +//! # Baseline (no WITH ORDER, full SortExec) +//! ./bench.sh run sort_pushdown +//! +//! # With sort elimination (WITH ORDER, SortExec removed) +//! ./bench.sh run sort_pushdown_sorted +//! ``` +//! +//! # Reference Results +//! +//! Measured on 300k rows, 8 non-overlapping sorted parquet files, single partition, +//! debug build (results vary by hardware; relative speedup is the key metric): +//! +//! ```text +//! Query | Description | baseline (ms) | sort eliminated (ms) | speedup +//! ------|----------------------|---------------|---------------------|-------- +//! Q1 | ASC full scan | 159 | 91 | 43% +//! Q2 | ASC LIMIT 100 | 36 | 12 | 67% +//! Q3 | ASC full (wide, *) | 487 | 333 | 31% +//! Q4 | ASC LIMIT 100 (wide) | 119 | 30 | 74% +//! ``` +//! +//! Key observations: +//! 
- **LIMIT queries benefit most** (67-74%): sort elimination + limit pushdown +//! means only the first few rows are read before stopping. +//! - **Full scans** (31-43%): saving comes from eliminating the O(n log n) sort +//! step entirely. +//! - **Wide projections** amplify the benefit: larger rows make sorting more +//! expensive, so eliminating it saves more. + +use clap::Args; +use futures::StreamExt; +use std::path::PathBuf; +use std::sync::Arc; + +use datafusion::datasource::TableProvider; +use datafusion::datasource::file_format::parquet::ParquetFormat; +use datafusion::datasource::listing::{ + ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, +}; +use datafusion::error::Result; +use datafusion::execution::SessionStateBuilder; +use datafusion::physical_plan::display::DisplayableExecutionPlan; +use datafusion::physical_plan::{displayable, execute_stream}; +use datafusion::prelude::*; +use datafusion_common::DEFAULT_PARQUET_EXTENSION; +use datafusion_common::instant::Instant; + +use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats}; + +#[derive(Debug, Args)] +pub struct RunOpt { + /// Common options + #[command(flatten)] + common: CommonOpt, + + /// Sort pushdown query number. If not specified, runs all queries + #[arg(short, long)] + pub query: Option<usize>, + + /// Path to data files (lineitem). Only parquet format is supported. + /// Data should be pre-sorted by l_orderkey ASC for meaningful results. + #[arg(required = true, short = 'p', long = "path")] + path: PathBuf, + + /// Path to JSON benchmark result to be compared using `compare.py` + #[arg(short = 'o', long = "output")] + output_path: Option<PathBuf>, + + /// Mark the first column (l_orderkey) as sorted via WITH ORDER. + /// When set, enables sort elimination for matching queries. 
+ #[arg(short = 't', long = "sorted")] + sorted: bool, +} + +pub const SORT_PUSHDOWN_QUERY_START_ID: usize = 1; +pub const SORT_PUSHDOWN_QUERY_END_ID: usize = 4; + +impl RunOpt { + const TABLES: [&'static str; 1] = ["lineitem"]; + + /// Queries benchmarking sort elimination when files are non-overlapping + /// and internally sorted (WITH ORDER declared via `--sorted`). + /// + /// With `--sorted`: ParquetSource returns Exact, files are verified + /// non-overlapping by statistics → SortExec eliminated, no SPM needed + /// for single partition. + /// + /// Without `--sorted`: baseline with full SortExec. + const QUERIES: [&'static str; 4] = [ Review Comment: Good suggestion! Moved queries to separate SQL files under `benchmarks/queries/sort_pushdown/` (q1.sql - q4.sql). Also added `--queries-path` arg so users can point to custom query files. The queries can now be run directly with datafusion-cli too. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
