zhuqi-lucas commented on code in PR #19064:
URL: https://github.com/apache/datafusion/pull/19064#discussion_r2587364095


##########
datafusion/physical-optimizer/src/pushdown_sort.rs:
##########
@@ -0,0 +1,329 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Sort Pushdown Optimization (Phase 1)
+//!
+//! Phase 1 focuses on rearranging files and row groups based on statistics
+//! to provide approximate ordering, enabling early termination for TopK queries.
+//!
+//! This optimization:
+//! 1. Detects SortExec nodes that require a specific ordering
+//! 2. Recursively traverses through transparent nodes to find data sources
+//! 3. Pushes the sort requirement down when possible
+//! 4. Returns **Inexact** results (keeps Sort but enables early termination)
+//! 5. Phase 2 todo will detect perfect ordering and remove Sort completely
+
+use crate::{OptimizerContext, PhysicalOptimizerRule};
+use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
+use datafusion_common::Result;
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+use datafusion_physical_plan::coalesce_batches::CoalesceBatchesExec;
+use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
+use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec};
+use datafusion_physical_plan::repartition::RepartitionExec;
+use datafusion_physical_plan::sorts::sort::SortExec;
+use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
+use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};
+use std::sync::Arc;
+
+/// A PhysicalOptimizerRule that attempts to push down sort requirements to data sources.
+///
+/// # Phase 1 Behavior (Current)
+///
+/// This optimization rearranges files and row groups to match query ordering:
+/// - Files are reordered based on their min/max statistics
+/// - Row groups are read in reverse order when appropriate
+/// - Returns **Inexact** ordering (keeps Sort but enables early termination)
+///
+/// Benefits:
+/// - TopK queries (ORDER BY ... LIMIT): 50-80% faster due to early termination
+/// - Range queries: 30-50% improvement from better data locality
+/// - Memory: No additional overhead (only changes read order)
+///
+/// # Phase 2 (Future)
+///
+/// Will detect when files are perfectly sorted and:
+/// - Return **Exact** ordering guarantees
+/// - Completely eliminate the Sort operator
+/// - Provide even better performance
+///
+/// # Implementation
+///
+/// 1. Detects SortExec nodes
+/// 2. Recursively pushes through transparent nodes (CoalesceBatches, 
Repartition, etc.)
+/// 3. Asks data sources to optimize via `try_pushdown_sort()`
+/// 4. Keeps Sort operator (Phase 1 returns Inexact)
#[derive(Debug, Clone, Default)]
pub struct PushdownSort;

impl PushdownSort {
    /// Create a new instance of the rule.
    pub fn new() -> Self {
        Self
    }
}
+
+impl PhysicalOptimizerRule for PushdownSort {
+    fn optimize_plan(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        context: &OptimizerContext,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // Check if sort pushdown optimization is enabled
+        let enable_sort_pushdown = context
+            .session_config()
+            .options()
+            .execution
+            .parquet
+            .enable_sort_pushdown;
+
+        // Return early if not enabled
+        if !enable_sort_pushdown {
+            return Ok(plan);
+        }
+
+        // Search for any SortExec nodes and try to optimize them
+        plan.transform_down(&|plan: Arc<dyn ExecutionPlan>| {
+            // First check if this is a GlobalLimitExec -> SortExec pattern
+            // This is important for TopK queries
+            if let Some(limit_exec) = 
plan.as_any().downcast_ref::<GlobalLimitExec>() {
+                if let Some(sort_exec) =
+                    limit_exec.input().as_any().downcast_ref::<SortExec>()
+                {
+                    return optimize_limit_sort(limit_exec, sort_exec);
+                }
+            }
+
+            // Otherwise, check if this is just a SortExec
+            let sort_exec = match plan.as_any().downcast_ref::<SortExec>() {
+                Some(sort_exec) => sort_exec,
+                None => return Ok(Transformed::no(plan)),
+            };
+
+            optimize_sort(sort_exec)
+        })
+        .data()
+    }
+
+    fn name(&self) -> &str {
+        "PushdownSort"
+    }
+
+    fn schema_check(&self) -> bool {
+        true
+    }
+}
+
+/// Optimize a SortExec by potentially pushing the sort down to the data source
+fn optimize_sort(sort_exec: &SortExec) -> Result<Transformed<Arc<dyn 
ExecutionPlan>>> {
+    let sort_input = Arc::clone(sort_exec.input());
+    let required_ordering = sort_exec.expr();
+
+    // First, check if the sort is already satisfied by input ordering
+    if let Some(_input_ordering) = sort_input.output_ordering() {
+        let input_eq_properties = sort_input.equivalence_properties();
+
+        if input_eq_properties.ordering_satisfy(required_ordering.clone())? {
+            return remove_unnecessary_sort(sort_exec, sort_input);
+        }
+    }
+
+    // Try to push the sort requirement down to the data source (with 
recursive traversal)
+    if let Some(optimized_input) = try_pushdown_sort(&sort_input, 
required_ordering)? {
+        // Phase 1: Always keep the Sort operator
+        // Even though we optimized the input (reordered files/row groups),
+        // we cannot guarantee perfect ordering due to potential overlaps
+        //
+        // However, this still provides huge benefits:
+        // - TopK queries can terminate early
+        // - Less data needs to be sorted
+        // - Better cache locality
+        return Ok(Transformed::yes(Arc::new(
+            SortExec::new(required_ordering.clone(), optimized_input)
+                .with_fetch(sort_exec.fetch())
+                .with_preserve_partitioning(sort_exec.preserve_partitioning()),
+        )));
+    }
+
+    Ok(Transformed::no(Arc::new(sort_exec.clone())))
+}
+
+/// Handle the GlobalLimitExec -> SortExec pattern
+/// This is critical for TopK query optimization
+fn optimize_limit_sort(
+    limit_exec: &GlobalLimitExec,
+    sort_exec: &SortExec,
+) -> Result<Transformed<Arc<dyn ExecutionPlan>>> {
+    let sort_input = Arc::clone(sort_exec.input());
+    let required_ordering = sort_exec.expr();
+
+    // Check if input is already sorted
+    if let Some(_input_ordering) = sort_input.output_ordering() {
+        let input_eq_properties = sort_input.equivalence_properties();
+        if input_eq_properties.ordering_satisfy(required_ordering.clone())? {
+            // Input is already sorted correctly, remove sort and keep limit
+            return Ok(Transformed::yes(Arc::new(GlobalLimitExec::new(
+                sort_input,
+                limit_exec.skip(),
+                limit_exec.fetch(),
+            ))));
+        }
+    }
+
+    // Try to push down the sort requirement
+    if let Some(optimized_input) = try_pushdown_sort(&sort_input, 
required_ordering)? {
+        // Phase 1: Keep the Sort operator
+        // But add the fetch limit to enable early termination
+        // This is where TopK optimization happens!
+        let total_fetch = limit_exec.skip() + limit_exec.fetch().unwrap_or(0);

Review Comment:
   The limit handling is not needed; I removed this logic because another optimizer rule already covers the cases in this PR.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to