adriangb commented on code in PR #20117: URL: https://github.com/apache/datafusion/pull/20117#discussion_r2760985953
########## datafusion/optimizer/src/extract_leaf_expressions.rs: ########## @@ -0,0 +1,1916 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`ExtractLeafExpressions`] extracts `MoveTowardsLeafNodes` sub-expressions into projections. +//! +//! This optimizer rule normalizes the plan so that all `MoveTowardsLeafNodes` computations +//! (like field accessors) live in Projection nodes immediately above scan nodes, making them +//! eligible for pushdown by the `OptimizeProjections` rule. +//! +//! ## Algorithm +//! +//! This rule uses **TopDown** traversal with projection merging: +//! +//! 1. When encountering a projection with `MoveTowardsLeafNodes` expressions, look at its input +//! 2. If input is a Projection, **merge** the expressions through it using column replacement +//! 3. Continue until we hit a barrier node (TableScan, Join, Aggregate) +//! 4. Idempotency is natural: merged expressions no longer have column refs matching projection outputs +//! +//! ### Special Cases +//! +//! - If ALL expressions in a projection are `MoveTowardsLeafNodes`, push the entire projection down +//! - If NO expressions are `MoveTowardsLeafNodes`, return `Transformed::no` +//! +//! ### Node Classification +//! +//! **Barrier Nodes** (stop pushing, create projection above): +//! - `TableScan` - the leaf, ideal extraction point +//! - `Join` - requires routing to left/right sides +//! - `Aggregate` - changes schema semantics +//! - `SubqueryAlias` - scope boundary +//! - `Union`, `Intersect`, `Except` - schema boundaries +//! +//! **Schema-Preserving Nodes** (push through unchanged): +//! - `Filter` - passes all input columns through +//! - `Sort` - passes all input columns through +//! - `Limit` - passes all input columns through +//! +//! **Projection Nodes** (merge through): +//! - Replace column refs with underlying expressions from the child projection + +use indexmap::{IndexMap, IndexSet}; +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion_common::alias::AliasGenerator; +use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; +use datafusion_common::{Column, DFSchema, Result}; +use datafusion_expr::expr_rewriter::NamePreserver; +use datafusion_expr::logical_plan::LogicalPlan; +use datafusion_expr::{Expr, ExpressionPlacement, Filter, Limit, Projection, Sort}; + +use crate::optimizer::ApplyOrder; +use crate::push_down_filter::replace_cols_by_name; +use crate::utils::{EXTRACTED_EXPR_PREFIX, has_all_column_refs}; +use crate::{OptimizerConfig, OptimizerRule}; + +/// Extracts `MoveTowardsLeafNodes` sub-expressions from all nodes into projections. +/// +/// This normalizes the plan so that all `MoveTowardsLeafNodes` computations (like field +/// accessors) live in Projection nodes, making them eligible for pushdown. +/// +/// # Example +/// +/// Given a filter with a struct field access: +/// +/// ```text +/// Filter: user['status'] = 'active' +/// TableScan: t [user] +/// ``` +/// +/// This rule extracts the field access into a projection: +/// +/// ```text +/// Filter: __datafusion_extracted_1 = 'active' +/// Projection: user['status'] AS __datafusion_extracted_1, user +/// TableScan: t [user] +/// ``` +/// +/// The `OptimizeProjections` rule can then push this projection down to the scan. +/// +/// **Important:** The `PushDownFilter` rule is aware of projections created by this rule +/// and will not push filters through them. See `is_extracted_expr_projection` in utils.rs. +#[derive(Default, Debug)] +pub struct ExtractLeafExpressions {} + +impl ExtractLeafExpressions { + /// Create a new [`ExtractLeafExpressions`] + pub fn new() -> Self { + Self {} + } +} + +impl OptimizerRule for ExtractLeafExpressions { + fn name(&self) -> &str { + "extract_leaf_expressions" + } + + fn apply_order(&self) -> Option<ApplyOrder> { + Some(ApplyOrder::TopDown) + } + + fn rewrite( + &self, + plan: LogicalPlan, + config: &dyn OptimizerConfig, + ) -> Result<Transformed<LogicalPlan>> { + let alias_generator = config.alias_generator(); + extract_from_plan(plan, alias_generator) + } +} + +/// Extracts `MoveTowardsLeafNodes` sub-expressions from a plan node. +/// +/// With TopDown traversal, we process parent nodes first, allowing us to +/// merge expressions through child projections. +fn extract_from_plan( + plan: LogicalPlan, + alias_generator: &Arc<AliasGenerator>, +) -> Result<Transformed<LogicalPlan>> { + match &plan { + // Schema-preserving nodes - extract and push down + LogicalPlan::Filter(_) | LogicalPlan::Sort(_) | LogicalPlan::Limit(_) => { + extract_from_schema_preserving(plan, alias_generator) Review Comment: I’ll give it a try -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
