alamb commented on a change in pull request #1339:
URL: https://github.com/apache/arrow-datafusion/pull/1339#discussion_r753458634



##########
File path: datafusion/tests/sql.rs
##########
@@ -5999,3 +5999,86 @@ async fn test_expect_distinct() -> Result<()> {
     assert_batches_eq!(expected, &actual);
     Ok(())
 }
+
+#[tokio::test]
+async fn test_predicate_push_down_with_unsafe_null() -> Result<()> {
+    let mut ctx = ExecutionContext::new();
+    let create_table1_sql =
+        "create table table1 as SELECT * FROM (VALUES (1), (2), (null)) as t";
+    ctx.sql(create_table1_sql).await.unwrap();
+    let create_table2_sql =
+        "create table table2 as SELECT * FROM (VALUES (1), (3), (null)) as t";
+    ctx.sql(create_table2_sql).await.unwrap();
+    // left join with is_not_null filter
+    let sql = "SELECT * FROM table1 LEFT JOIN table2 ON table1.column1 = 
table2.column1 WHERE table2.column1 IS NOT NULL";
+    let actual = execute_to_batches(&mut ctx, sql).await;
+    let expected = vec![
+        "+---------+---------+",
+        "| column1 | column1 |",
+        "+---------+---------+",
+        "| 1       | 1       |",
+        "+---------+---------+",
+    ];
+    assert_batches_eq!(expected, &actual);
+    // left join with is_null filter
+    let sql = "SELECT * FROM table1 LEFT JOIN table2 ON table1.column1 = 
table2.column1 WHERE table2.column1 IS NULL ORDER BY table1.column1";
+    let actual = execute_to_batches(&mut ctx, sql).await;
+    let expected = vec![
+        "+---------+---------+",
+        "| column1 | column1 |",
+        "+---------+---------+",
+        "|         |         |",
+        "| 2       |         |",
+        "+---------+---------+",

Review comment:
       This appears to be different than postgres:
   
   ```
   alamb=# select * from table1;
    column1 
   ---------
          1
          2
           
   (3 rows)
   
   alamb=# select * from table2;
    column1 
   ---------
          1
          3
           
   (3 rows)
   
   alamb=# SELECT * FROM table1 LEFT JOIN table2 ON table1.column1 = 
table2.column1 WHERE table2.column1 IS NULL ORDER BY table1.column1;
    column1 | column1 
   ---------+---------
          2 |        
            |        
   (2 rows)
   ```

##########
File path: datafusion/src/optimizer/filter_push_down.rs
##########
@@ -408,10 +412,20 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> 
Result<LogicalPlan> {
             //
             // Join clauses with `Using` constraints also take advantage of 
this logic to make sure
             // predicates reference the shared join columns are pushed to both 
sides.
+            let mut null_unsafe_predicates = vec![];
             let join_side_filters = state
                 .filters
                 .iter()
                 .filter_map(|(predicate, columns)| {
+                    if join_type == &JoinType::Left
+                        || join_type == &JoinType::Right
+                        || join_type == &JoinType::Full
+                    {
+                        if let Expr::IsNull(..) | Expr::IsNotNull(..) = 
predicate {

Review comment:
       I haven't had a chance to study this PR yeet in depth, but I wanted to 
point out a few things:
   
   I don't think we can push *ANY* filers from the `ON` clause down the non 
preserved side(s) of an outer join -- in other words, this is not a problem 
with `IS NULL` and `IS NOT NULL`.
   
   For example
   
   ```sql
   alamb=# select * from A;
    a 
   ---
    1
    2
   (2 rows)
   
   alamb=# select * from B;
    b 
   ---
    1
    2
   (2 rows)
   ```
   
   Note if you pushed the predicate `a>5` down, there would be no rows, but the 
output is:
   ```sql
   alamb=# select * from A LEFT JOIN B ON (a=b and a>5);
    a | b 
   ---+---
    1 |  
    2 |  
   ```
   
   Note the semantics of predicates in the `WHERE` clause are different (they 
are always pushed down)
   
   ```
   alamb=# select * from A LEFT JOIN B ON (a=b) WHERE  a>5;
    a | b 
   ---+---
   (0 rows)
   ```
   
   
   However, note you *CAN* push `b>5` down and get the correct output (because 
b is not preserved):
   
   ```sql
   alamb=# select * from A LEFT JOIN B ON (a=b and b>5);
    a | b 
   ---+---
    1 |  
    2 |  
   (2 rows)
   ```
   
   The criteria is more like "refers to a column on the non preserved side"




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to