alamb commented on a change in pull request #1618:
URL: https://github.com/apache/arrow-datafusion/pull/1618#discussion_r789789566
##########
File path: datafusion/tests/sql/joins.rs
##########
@@ -210,6 +210,26 @@ async fn left_join_unbalanced() -> Result<()> {
Ok(())
}
+#[tokio::test]
+async fn left_join_filter_pushdown() -> Result<()> {
+ // Since t2 is the non-preserved side of the join, we cannot push down a
NULL filter.
+ let mut ctx = create_join_context_with_nulls()?;
+ let sql = "SELECT t1_id, t2_id, t2_name FROM t1 LEFT JOIN t2 ON t1_id =
t2_id WHERE t2_name IS NULL ORDER BY t1_id";
Review comment:
👍 this does match postgres:
```sql
alamb=# create table t1(t1_id, t1_name) as select * from (values (11, 'a'),
(22, 'b'), (33, 'c'), (44, 'd'), (77, 'e')) as sq;
SELECT 5
alamb=# create table t2(t2_id, t2_name) as select * from (values (11, 'z'),
(22, null), (44, 'x'), (55, 'w')) as sq;
SELECT 4
alamb=# SELECT t1_id, t2_id, t2_name FROM t1 LEFT JOIN t2 ON t1_id = t2_id
WHERE t2_name IS NULL ORDER BY t1_id;
t1_id | t2_id | t2_name
-------+-------+---------
22 | 22 |
33 | |
77 | |
(3 rows)
```
I think the key would be to test adding a filter on `t2_id` however (as that
is the predicate type that is created)
##########
File path: datafusion/src/optimizer/filter_push_down.rs
##########
@@ -1139,6 +1136,172 @@ mod tests {
Ok(())
}
+ /// post-join predicates on the right side of a left join are duplicated
and pushed down
+ /// to the left side.
+ #[test]
+ fn filter_using_left_join() -> Result<()> {
+ let table_scan = test_table_scan()?;
+ let left = LogicalPlanBuilder::from(table_scan).build()?;
+ let right_table_scan = test_table_scan_with_name("test2")?;
+ let right = LogicalPlanBuilder::from(right_table_scan)
+ .project(vec![col("a")])?
+ .build()?;
+ let plan = LogicalPlanBuilder::from(left)
+ .join_using(
+ &right,
+ JoinType::Left,
+ vec![Column::from_name("a".to_string())],
+ )?
+ .filter(col("test2.a").lt_eq(lit(1i64)))?
+ .build()?;
+
+ // not part of the test, just good to know:
+ assert_eq!(
+ format!("{:?}", plan),
+ "\
+ Filter: #test2.a <= Int64(1)\
+ \n Join: Using #test.a = #test2.a\
+ \n TableScan: test projection=None\
+ \n Projection: #test2.a\
+ \n TableScan: test2 projection=None"
+ );
+
+ // filter duplicated and pushed down to the left
+ let expected = "\
+ Filter: #test2.a <= Int64(1)\
+ \n Join: Using #test.a = #test2.a\
+ \n Filter: #test.a <= Int64(1)\
Review comment:
I don't think this is correct -- the predicate `test2.a <= 1` can be
pushed down to the *non* preserved (aka `test2`) side of the join. It can't be
pushed down the `test1` side. The rationale is that each row of `test` (NOT
`test2`) must be preserved through the `LEFT JOIN` as if there are no matching
rows in `test2` then the row in `test` is returned padded with nulls.
Here is an example of the discrepancy with postgres:
```sql
create table t1 as select * from (values (1), (null), (2)) as sq;
create table t2 as select * from (values (2), (null), (3)) as sq;
```
Postgres:
```sql
alamb=# select * from t1 LEFT JOIN t2 ON t1.column1=t2.column1 WHERE
t2.column1 IS NULL;
column1 | column1
---------+---------
1 |
|
(2 rows)
```
This branch incorrectly filters out the value from `column1`
```sql
❯ select * from t1 LEFT JOIN t2 ON t1.column1=t2.column1 WHERE t2.column1 IS
NULL;
+---------+---------+
| column1 | column1 |
+---------+---------+
| | |
+---------+---------+
```
##########
File path: datafusion/src/optimizer/filter_push_down.rs
##########
@@ -253,33 +176,107 @@ fn split_members<'a>(predicate: &'a Expr, predicates:
&mut Vec<&'a Expr>) {
}
}
+// For a given JOIN logical plan, determine whether each side of the join is
preserved.
+// We say a join side is preserved if the join returns all or a subset of the
rows from
+// the relevant side - i.e. the side of the join cannot provide nulls. Returns
a tuple
+// of booleans - (left_preserved, right_preserved).
+fn lr_is_preserved(plan: &LogicalPlan) -> (bool, bool) {
+ match plan {
+ LogicalPlan::Join(Join { join_type, .. }) => match join_type {
+ JoinType::Inner => (true, true),
+ JoinType::Left => (true, false),
+ JoinType::Right => (false, true),
+ JoinType::Full => (false, false),
+ // No columns from the right side of the join can be referenced in
output
+ // predicates for semi/anti joins, so whether we specify t/f
doesn't matter.
+ JoinType::Semi | JoinType::Anti => (true, false),
+ },
+ LogicalPlan::CrossJoin(_) => (true, true),
+ _ => unreachable!("lr_is_preserved only valid for JOIN nodes"),
+ }
+}
+
+// Determine which predicates in state can be pushed down to a given side of a
join.
+// To determine this, we need to know the schema of the relevant join side and
whether
+// or not the side's rows are preserved when joining. If the side is not
preserved, we
+// do not push down anything. Otherwise we can push down predicates where all
of the
+// relevant columns are contained on the relevant join side's schema.
+fn get_pushable_join_predicates<'a>(
+ state: &'a State,
+ schema: &DFSchema,
+ preserved: bool,
+) -> Predicates<'a> {
+ if !preserved {
Review comment:
I think this check is wrong -- you can't push predicates to the
*preserved* side
```suggestion
if preserved {
```
##########
File path: datafusion/src/optimizer/filter_push_down.rs
##########
@@ -253,33 +176,107 @@ fn split_members<'a>(predicate: &'a Expr, predicates:
&mut Vec<&'a Expr>) {
}
}
+// For a given JOIN logical plan, determine whether each side of the join is
preserved.
+// We say a join side is preserved if the join returns all or a subset of the
rows from
+// the relevant side - i.e. the side of the join cannot provide nulls. Returns
a tuple
+// of booleans - (left_preserved, right_preserved).
+fn lr_is_preserved(plan: &LogicalPlan) -> (bool, bool) {
+ match plan {
+ LogicalPlan::Join(Join { join_type, .. }) => match join_type {
+ JoinType::Inner => (true, true),
+ JoinType::Left => (true, false),
+ JoinType::Right => (false, true),
+ JoinType::Full => (false, false),
Review comment:
Both sides are preserved in a ` FULL` join (as in this should probably
be (`true`, `true`)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]