comphead commented on code in PR #16015: URL: https://github.com/apache/datafusion/pull/16015#discussion_r2084982971
########## datafusion/optimizer/src/decorrelate.rs: ########## @@ -313,6 +317,11 @@ impl TreeNodeRewriter for PullUpCorrelatedExpr { missing_exprs.push(un_matched_row); } } + if aggregate.group_expr.is_empty() { + // TODO: how do we handle the case where we have pulled multiple aggregations? For example, Review Comment: prob we can create a ticket for that? ########## datafusion/optimizer/src/decorrelate_lateral_join.rs: ########## @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`DecorrelateLateralJoin`] decorrelates logical plans produced by lateral joins. 
+ +use std::collections::BTreeSet; + +use crate::decorrelate::PullUpCorrelatedExpr; +use crate::optimizer::ApplyOrder; +use crate::{OptimizerConfig, OptimizerRule}; +use datafusion_expr::{lit, Join}; + +use datafusion_common::tree_node::{ + Transformed, TransformedResult, TreeNode, TreeNodeRecursion, +}; +use datafusion_common::Result; +use datafusion_expr::logical_plan::JoinType; +use datafusion_expr::utils::conjunction; +use datafusion_expr::{LogicalPlan, LogicalPlanBuilder}; + +/// Optimizer rule for rewriting lateral joins to joins +#[derive(Default, Debug)] +pub struct DecorrelateLateralJoin {} + +impl DecorrelateLateralJoin { + #[allow(missing_docs)] + pub fn new() -> Self { + Self::default() + } +} + +impl OptimizerRule for DecorrelateLateralJoin { + fn supports_rewrite(&self) -> bool { + true + } + + fn rewrite( + &self, + plan: LogicalPlan, + _config: &dyn OptimizerConfig, + ) -> Result<Transformed<LogicalPlan>> { + // Find cross joins with outer column references on the right side (i.e., the apply operator). + let LogicalPlan::Join(join) = plan else { + return Ok(Transformed::no(plan)); + }; + + rewrite_internal(join) + } + + fn name(&self) -> &str { + "decorrelate_lateral_join" + } + + fn apply_order(&self) -> Option<ApplyOrder> { + Some(ApplyOrder::TopDown) + } +} + +// Build the decorrelated join based on the original lateral join query. For now, we only support cross/inner +// lateral joins. +fn rewrite_internal(join: Join) -> Result<Transformed<LogicalPlan>> { + if join.join_type != JoinType::Inner { + return Ok(Transformed::no(LogicalPlan::Join(join))); + } + + match join.right.apply_with_subqueries(|p| { + if p.contains_outer_reference() { Review Comment: perhaps we can add a comment why queries with outer reference won't be optimized? 
🤔 ########## datafusion/sqllogictest/test_files/join.slt.part: ########## @@ -1404,3 +1404,94 @@ set datafusion.execution.target_partitions = 4; statement ok set datafusion.optimizer.repartition_joins = false; + +statement ok +CREATE TABLE t1(v0 BIGINT, v1 BIGINT); + +statement ok +CREATE TABLE t0(v0 BIGINT, v1 BIGINT); + +statement ok +INSERT INTO t0(v0, v1) VALUES (1, 1), (1, 2), (3, 3), (4, 4); + +statement ok +INSERT INTO t1(v0, v1) VALUES (1, 1), (3, 2), (3, 5); + +query TT +explain SELECT * +FROM t0, +LATERAL (SELECT sum(v1) FROM t1 WHERE t0.v0 = t1.v0); +---- +logical_plan +01)Projection: t0.v0, t0.v1, sum(t1.v1) +02)--Left Join: t0.v0 = t1.v0 +03)----TableScan: t0 projection=[v0, v1] +04)----Projection: sum(t1.v1), t1.v0 +05)------Aggregate: groupBy=[[t1.v0]], aggr=[[sum(t1.v1)]] +06)--------TableScan: t1 projection=[v0, v1] +physical_plan +01)ProjectionExec: expr=[v0@1 as v0, v1@2 as v1, sum(t1.v1)@0 as sum(t1.v1)] +02)--CoalesceBatchesExec: target_batch_size=8192 +03)----HashJoinExec: mode=CollectLeft, join_type=Right, on=[(v0@1, v0@0)], projection=[sum(t1.v1)@0, v0@2, v1@3] +04)------CoalescePartitionsExec +05)--------ProjectionExec: expr=[sum(t1.v1)@1 as sum(t1.v1), v0@0 as v0] +06)----------AggregateExec: mode=FinalPartitioned, gby=[v0@0 as v0], aggr=[sum(t1.v1)] +07)------------CoalesceBatchesExec: target_batch_size=8192 +08)--------------RepartitionExec: partitioning=Hash([v0@0], 4), input_partitions=4 +09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +10)------------------AggregateExec: mode=Partial, gby=[v0@0 as v0], aggr=[sum(t1.v1)] +11)--------------------DataSourceExec: partitions=1, partition_sizes=[1] +12)------DataSourceExec: partitions=1, partition_sizes=[1] + +query III +SELECT * +FROM t0, +LATERAL (SELECT sum(v1) FROM t1 WHERE t0.v0 = t1.v0); +---- +1 1 1 +1 2 1 +3 3 7 +4 4 NULL + +query TT +explain SELECT * FROM t0, LATERAL (SELECT * FROM t1 WHERE t0.v0 = t1.v0); +---- +logical_plan +01)Inner Join: 
t0.v0 = t1.v0 +02)--TableScan: t0 projection=[v0, v1] +03)--TableScan: t1 projection=[v0, v1] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(v0@0, v0@0)] +03)----DataSourceExec: partitions=1, partition_sizes=[1] +04)----DataSourceExec: partitions=1, partition_sizes=[1] + +query IIII +SELECT * FROM t0, LATERAL (SELECT * FROM t1 WHERE t0.v0 = t1.v0); +---- +1 1 1 1 +1 2 1 1 +3 3 3 2 +3 3 3 5 + +query III +SELECT * FROM t0, LATERAL (SELECT 1); +---- +1 1 1 +1 2 1 +3 3 1 +4 4 1 + +query IIII +SELECT * FROM t0, LATERAL (SELECT * FROM t1 WHERE t1.v0 = 1); Review Comment: should we also check the join syntax? ``` SELECT * FROM t0 JOIN LATERAL (SELECT * FROM t1 WHERE t1.v0 = 1) on true; ``` ? ########## datafusion/optimizer/src/decorrelate_lateral_join.rs: ########## @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`DecorrelateLateralJoin`] decorrelates logical plans produced by lateral joins. 
+ +use std::collections::BTreeSet; + +use crate::decorrelate::PullUpCorrelatedExpr; +use crate::optimizer::ApplyOrder; +use crate::{OptimizerConfig, OptimizerRule}; +use datafusion_expr::{lit, Join}; + +use datafusion_common::tree_node::{ + Transformed, TransformedResult, TreeNode, TreeNodeRecursion, +}; +use datafusion_common::Result; +use datafusion_expr::logical_plan::JoinType; +use datafusion_expr::utils::conjunction; +use datafusion_expr::{LogicalPlan, LogicalPlanBuilder}; + +/// Optimizer rule for rewriting lateral joins to joins +#[derive(Default, Debug)] +pub struct DecorrelateLateralJoin {} + +impl DecorrelateLateralJoin { + #[allow(missing_docs)] + pub fn new() -> Self { + Self::default() + } +} + +impl OptimizerRule for DecorrelateLateralJoin { + fn supports_rewrite(&self) -> bool { + true + } + + fn rewrite( + &self, + plan: LogicalPlan, + _config: &dyn OptimizerConfig, + ) -> Result<Transformed<LogicalPlan>> { + // Find cross joins with outer column references on the right side (i.e., the apply operator). + let LogicalPlan::Join(join) = plan else { + return Ok(Transformed::no(plan)); + }; + + rewrite_internal(join) + } + + fn name(&self) -> &str { + "decorrelate_lateral_join" + } + + fn apply_order(&self) -> Option<ApplyOrder> { + Some(ApplyOrder::TopDown) + } +} + +// Build the decorrelated join based on the original lateral join query. For now, we only support cross/inner +// lateral joins. +fn rewrite_internal(join: Join) -> Result<Transformed<LogicalPlan>> { + if join.join_type != JoinType::Inner { + return Ok(Transformed::no(LogicalPlan::Join(join))); + } + + match join.right.apply_with_subqueries(|p| { + if p.contains_outer_reference() { + Ok(TreeNodeRecursion::Stop) + } else { + Ok(TreeNodeRecursion::Continue) + } + })? { + TreeNodeRecursion::Stop => {} + TreeNodeRecursion::Continue => { + // The left side contains outer references, we need to decorrelate it. 
+ return Ok(Transformed::new( + LogicalPlan::Join(join), + false, + TreeNodeRecursion::Jump, + )); + } + TreeNodeRecursion::Jump => { + unreachable!("") + } + } + + let LogicalPlan::Subquery(subquery) = join.right.as_ref() else { + return Ok(Transformed::no(LogicalPlan::Join(join))); + }; + + if join.join_type != JoinType::Inner { + return Ok(Transformed::no(LogicalPlan::Join(join))); + } + let subquery_plan = subquery.subquery.as_ref(); + let mut pull_up = PullUpCorrelatedExpr::new().with_need_handle_count_bug(true); Review Comment: I'm now wondering whether we should mark `.with_need_handle_count_bug` as `unsafe` (or something similar) so that users are immediately aware of the risks? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org