alamb commented on code in PR #10430: URL: https://github.com/apache/datafusion/pull/10430#discussion_r1594709240
########## datafusion/optimizer/src/eliminate_cross_join.rs: ########## @@ -250,90 +265,67 @@ fn find_inner_join( })) } -fn intersect( Review Comment: This was moved into JoinExprSet ########## datafusion/optimizer/src/join_key_set.rs: ########## @@ -0,0 +1,240 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [JoinKeySet] for tracking the set of join keys in a plan. + +use datafusion_expr::Expr; +use indexmap::{Equivalent, IndexSet}; + +/// Tracks a set of equality Join keys +/// +/// A join key is an expression that is used to join two tables via an equality +/// predicate such as `a.x = b.y` +/// +/// This struct models `a.x + 5 = b.y AND a.z = b.z` as two join keys +/// 1. `(a.x + 5, b.y)` +/// 2. `(a.z, b.z)` +/// +/// # Important properties: +/// +/// 1. Retains insert order +/// 2. Can quickly look up if a pair of expressions are in the set. +#[derive(Debug)] Review Comment: I extracted all the relevant behavior for managing join keys into this file and documented it ########## datafusion/optimizer/src/eliminate_cross_join.rs: ########## @@ -250,90 +265,67 @@ fn find_inner_join( })) } -fn intersect( - accum: &mut Vec<(Expr, Expr)>, - vec1: &[(Expr, Expr)], - vec2: &[(Expr, Expr)], -) { - if !(vec1.is_empty() || vec2.is_empty()) { - for x1 in vec1.iter() { - for x2 in vec2.iter() { - if x1.0 == x2.0 && x1.1 == x2.1 || x1.1 == x2.0 && x1.0 == x2.1 { - accum.push((x1.0.clone(), x1.1.clone())); - } - } - } - } -} - /// Extract join keys from a WHERE clause -fn extract_possible_join_keys(expr: &Expr, accum: &mut Vec<(Expr, Expr)>) -> Result<()> { +fn extract_possible_join_keys(expr: &Expr, join_keys: &mut JoinKeySet) { if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = expr { match op { Operator::Eq => { - // Ensure that we don't add the same Join keys multiple times - if !(accum.contains(&(*left.clone(), *right.clone())) - || accum.contains(&(*right.clone(), *left.clone()))) - { - accum.push((*left.clone(), *right.clone())); - } + // insert handles ensuring we don't add the same Join keys multiple times + join_keys.insert(left, right); } Operator::And => { - extract_possible_join_keys(left, accum)?; - extract_possible_join_keys(right, accum)? + extract_possible_join_keys(left, join_keys); + extract_possible_join_keys(right, join_keys) } // Fix for issue#78 join predicates from inside of OR expr also pulled up properly. Operator::Or => { - let mut left_join_keys = vec![]; - let mut right_join_keys = vec![]; + let mut left_join_keys = JoinKeySet::new(); + let mut right_join_keys = JoinKeySet::new(); - extract_possible_join_keys(left, &mut left_join_keys)?; - extract_possible_join_keys(right, &mut right_join_keys)?; + extract_possible_join_keys(left, &mut left_join_keys); + extract_possible_join_keys(right, &mut right_join_keys); - intersect(accum, &left_join_keys, &right_join_keys) + join_keys.insert_intersection(left_join_keys, right_join_keys) } _ => (), }; } - Ok(()) } /// Remove join expressions from a filter expression -/// Returns Some() when there are few remaining predicates in filter_expr -/// Returns None otherwise -fn remove_join_expressions( - expr: &Expr, - join_keys: &HashSet<(Expr, Expr)>, -) -> Result<Option<Expr>> { +/// +/// # Returns +/// * `Some()` when there are few remaining predicates in filter_expr +/// * `None` otherwise +fn remove_join_expressions(expr: Expr, join_keys: &JoinKeySet) -> Option<Expr> { match expr { - Expr::BinaryExpr(BinaryExpr { left, op, right }) => { - match op { - Operator::Eq => { - if join_keys.contains(&(*left.clone(), *right.clone())) - || join_keys.contains(&(*right.clone(), *left.clone())) Review Comment: there were several `clone`s (deep copies) here too to simply check if the the join keys contained the expressions ########## datafusion/optimizer/src/eliminate_cross_join.rs: ########## @@ -250,90 +265,67 @@ fn find_inner_join( })) } -fn intersect( - accum: &mut Vec<(Expr, Expr)>, - vec1: &[(Expr, Expr)], - vec2: &[(Expr, Expr)], -) { - if !(vec1.is_empty() || vec2.is_empty()) { - for x1 in vec1.iter() { - for x2 in vec2.iter() { - if x1.0 == x2.0 && x1.1 == x2.1 || x1.1 == x2.0 && x1.0 == x2.1 { - accum.push((x1.0.clone(), x1.1.clone())); - } - } - } - } -} - /// Extract join keys from a WHERE clause -fn extract_possible_join_keys(expr: &Expr, accum: &mut Vec<(Expr, Expr)>) -> Result<()> { +fn extract_possible_join_keys(expr: &Expr, join_keys: &mut JoinKeySet) { if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = expr { match op { Operator::Eq => { - // Ensure that we don't add the same Join keys multiple times - if !(accum.contains(&(*left.clone(), *right.clone())) - || accum.contains(&(*right.clone(), *left.clone()))) - { - accum.push((*left.clone(), *right.clone())); - } + // insert handles ensuring we don't add the same Join keys multiple times + join_keys.insert(left, right); Review Comment: JoinExprSet does not require cloning to check if the exprs should be inserted -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org