alamb commented on a change in pull request #1339: URL: https://github.com/apache/arrow-datafusion/pull/1339#discussion_r753458634
########## File path: datafusion/tests/sql.rs ########## @@ -5999,3 +5999,86 @@ async fn test_expect_distinct() -> Result<()> { assert_batches_eq!(expected, &actual); Ok(()) } + +#[tokio::test] +async fn test_predicate_push_down_with_unsafe_null() -> Result<()> { + let mut ctx = ExecutionContext::new(); + let create_table1_sql = + "create table table1 as SELECT * FROM (VALUES (1), (2), (null)) as t"; + ctx.sql(create_table1_sql).await.unwrap(); + let create_table2_sql = + "create table table2 as SELECT * FROM (VALUES (1), (3), (null)) as t"; + ctx.sql(create_table2_sql).await.unwrap(); + // left join with is_not_null filter + let sql = "SELECT * FROM table1 LEFT JOIN table2 ON table1.column1 = table2.column1 WHERE table2.column1 IS NOT NULL"; + let actual = execute_to_batches(&mut ctx, sql).await; + let expected = vec![ + "+---------+---------+", + "| column1 | column1 |", + "+---------+---------+", + "| 1 | 1 |", + "+---------+---------+", + ]; + assert_batches_eq!(expected, &actual); + // left join with is_null filter + let sql = "SELECT * FROM table1 LEFT JOIN table2 ON table1.column1 = table2.column1 WHERE table2.column1 IS NULL ORDER BY table1.column1"; + let actual = execute_to_batches(&mut ctx, sql).await; + let expected = vec![ + "+---------+---------+", + "| column1 | column1 |", + "+---------+---------+", + "| | |", + "| 2 | |", + "+---------+---------+", Review comment: This appears to be different than postgres: ``` alamb=# select * from table1; column1 --------- 1 2 (3 rows) alamb=# select * from table2; column1 --------- 1 3 (3 rows) alamb=# SELECT * FROM table1 LEFT JOIN table2 ON table1.column1 = table2.column1 WHERE table2.column1 IS NULL ORDER BY table1.column1; column1 | column1 ---------+--------- 2 | | (2 rows) ``` ########## File path: datafusion/src/optimizer/filter_push_down.rs ########## @@ -408,10 +412,20 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result<LogicalPlan> { // // Join clauses with `Using` constraints also take advantage of this logic to make sure // predicates reference the shared join columns are pushed to both sides. + let mut null_unsafe_predicates = vec![]; let join_side_filters = state .filters .iter() .filter_map(|(predicate, columns)| { + if join_type == &JoinType::Left + || join_type == &JoinType::Right + || join_type == &JoinType::Full + { + if let Expr::IsNull(..) | Expr::IsNotNull(..) = predicate { Review comment: I haven't had a chance to study this PR yeet in depth, but I wanted to point out a few things: I don't think we can push *ANY* filers from the `ON` clause down the non preserved side(s) of an outer join -- in other words, this is not a problem with `IS NULL` and `IS NOT NULL`. For example ```sql alamb=# select * from A; a --- 1 2 (2 rows) alamb=# select * from B; b --- 1 2 (2 rows) ``` Note if you pushed the predicate `a>5` down, there would be no rows, but the output is: ```sql alamb=# select * from A LEFT JOIN B ON (a=b and a>5); a | b ---+--- 1 | 2 | ``` Note the semantics of predicates in the `WHERE` clause are different (they are always pushed down) ``` alamb=# select * from A LEFT JOIN B ON (a=b) WHERE a>5; a | b ---+--- (0 rows) ``` However, note you *CAN* push `b>5` down and get the correct output (because b is not preserved): ```sql alamb=# select * from A LEFT JOIN B ON (a=b and b>5); a | b ---+--- 1 | 2 | (2 rows) ``` The criteria is more like "refers to a column on the non preserved side" -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org