alamb commented on code in PR #4923:
URL: https://github.com/apache/arrow-datafusion/pull/4923#discussion_r1072857252


##########
datafusion/optimizer/src/push_down_filter.rs:
##########
@@ -108,13 +108,9 @@ fn on_lr_is_preserved(plan: &LogicalPlan) -> Result<(bool, 
bool)> {
             JoinType::Left => Ok((false, true)),
             JoinType::Right => Ok((true, false)),
             JoinType::Full => Ok((false, false)),
-            JoinType::LeftSemi
-            | JoinType::LeftAnti
-            | JoinType::RightSemi
-            | JoinType::RightAnti => {
-                // filter_push_down does not yet support SEMI/ANTI joins with 
join conditions
-                Ok((false, false))
-            }
+            JoinType::LeftSemi | JoinType::RightSemi => Ok((true, true)),

Review Comment:
   I thought a Semi join had the same semantics as Left join -- namely that the 
left/right side were preserved but the  others weren't
   
   So I would expect  this to be something like
   
   
   ```suggestion
               JoinType::LeftSemi => Ok((false, true)),
               JoinType::RightSemi => Ok((true, false)),
   ```
   
   Maybe I am mis understanding the semi join



##########
datafusion/core/tests/sqllogictests/test_files/subquery.slt:
##########
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one

Review Comment:
   ❤️ 



##########
datafusion/optimizer/src/push_down_filter.rs:
##########
@@ -2328,4 +2324,183 @@ mod tests {
             .expect("failed to optimize plan");
         assert_optimized_plan_eq(&optimized_plan, expected)
     }
+
+    #[test]
+    fn left_semi_join_with_filters() -> Result<()> {
+        let left = test_table_scan_with_name("test1")?;
+        let right_table_scan = test_table_scan_with_name("test2")?;
+        let right = LogicalPlanBuilder::from(right_table_scan)
+            .project(vec![col("a"), col("b")])?
+            .build()?;
+        let plan = LogicalPlanBuilder::from(left)
+            .join(
+                right,
+                JoinType::LeftSemi,
+                (
+                    vec![Column::from_qualified_name("test1.a")],
+                    vec![Column::from_qualified_name("test2.a")],
+                ),
+                Some(
+                    col("test1.b")
+                        .gt(lit(1u32))
+                        .and(col("test2.b").gt(lit(2u32))),
+                ),
+            )?
+            .build()?;
+
+        // not part of the test, just good to know:
+        assert_eq!(
+            format!("{plan:?}"),
+            "LeftSemi Join: test1.a = test2.a Filter: test1.b > UInt32(1) AND 
test2.b > UInt32(2)\
+            \n  TableScan: test1\
+            \n  Projection: test2.a, test2.b\
+            \n    TableScan: test2",
+        );
+
+        // Both side will be pushed down.
+        let expected = "\
+        LeftSemi Join: test1.a = test2.a\
+        \n  Filter: test1.b > UInt32(1)\
+        \n    TableScan: test1\
+        \n  Projection: test2.a, test2.b\
+        \n    Filter: test2.b > UInt32(2)\

Review Comment:
   as above I don't think this is correct -- but I think we would need to 
review the implementation in spark



##########
datafusion/optimizer/src/push_down_filter.rs:
##########
@@ -108,13 +108,9 @@ fn on_lr_is_preserved(plan: &LogicalPlan) -> Result<(bool, 
bool)> {
             JoinType::Left => Ok((false, true)),
             JoinType::Right => Ok((true, false)),
             JoinType::Full => Ok((false, false)),
-            JoinType::LeftSemi
-            | JoinType::LeftAnti
-            | JoinType::RightSemi
-            | JoinType::RightAnti => {
-                // filter_push_down does not yet support SEMI/ANTI joins with 
join conditions
-                Ok((false, false))
-            }
+            JoinType::LeftSemi | JoinType::RightSemi => Ok((true, true)),

Review Comment:
   
   
   I played around with it in postgres:
   
   ```sql
   
   postgres=# create table bar as values (2, 'a'), (3, 'b');
   SELECT 2
   postgres=# create table foo as values (1, 'X'), (2, 'Y'), (3, 'Z');
   SELECT 3
   
   
   postgres=# explain select * from foo SEMI JOIN bar USING (column1) where 
bar.column2 != 'b' ;
                                  QUERY PLAN                                
   -------------------------------------------------------------------------
    Merge Join  (cost=179.17..305.88 rows=8026 width=68)
      Merge Cond: (bar.column1 = semi.column1)
      ->  Sort  (cost=90.99..94.15 rows=1264 width=36)
            Sort Key: bar.column1
            ->  Seq Scan on bar  (cost=0.00..25.88 rows=1264 width=36)
                  Filter: (column2 <> 'b'::text)
      ->  Sort  (cost=88.17..91.35 rows=1270 width=36)
            Sort Key: semi.column1
            ->  Seq Scan on foo semi  (cost=0.00..22.70 rows=1270 width=36)
   (9 rows)
   
   postgres=# select * from foo SEMI JOIN bar USING (column1);
    column1 | column2 | column2 
   ---------+---------+---------
          2 | Y       | a
          3 | Z       | b
   (2 rows)
   
   postgres=# select * from foo SEMI JOIN bar USING (column1) where bar.column2 
!= 'b' ;
    column1 | column2 | column2 
   ---------+---------+---------
          2 | Y       | a
   (1 row)
   ```
   
   And this this looks correct to me



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to