alamb commented on code in PR #5345:
URL: https://github.com/apache/arrow-datafusion/pull/5345#discussion_r1128498691
##########
datafusion/optimizer/src/decorrelate_where_exists.rs:
##########
@@ -670,4 +673,76 @@ mod tests {
assert_plan_eq(&plan, expected)
}
+
+ #[test]
+ fn exists_distinct_subquery() -> Result<()> {
+ let table_scan = test_table_scan()?;
+ let subquery_scan = test_table_scan_with_name("sq")?;
+ let subquery = LogicalPlanBuilder::from(subquery_scan)
+ .filter((lit(1u32) + col("sq.a")).gt(col("test.a") * lit(2u32)))?
+ .project(vec![col("sq.c")])?
+ .distinct()?
+ .build()?;
+ let plan = LogicalPlanBuilder::from(table_scan)
+ .filter(exists(Arc::new(subquery)))?
+ .project(vec![col("test.b")])?
+ .build()?;
+
+ let expected = "Projection: test.b [b:UInt32]\
+ \n LeftSemi Join: Filter: UInt32(1) + sq.a > test.a * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\
+ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\
+ \n Distinct: [a:UInt32]\
+ \n Projection: sq.a [a:UInt32]\
+ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]";
+
+ assert_plan_eq(&plan, expected)
+ }
+
+ #[test]
+ fn exists_distinct_expr_subquery() -> Result<()> {
+ let table_scan = test_table_scan()?;
+ let subquery_scan = test_table_scan_with_name("sq")?;
+ let subquery = LogicalPlanBuilder::from(subquery_scan)
+ .filter((lit(1u32) + col("sq.a")).gt(col("test.a") * lit(2u32)))?
+ .project(vec![col("sq.b") + col("sq.c")])?
+ .distinct()?
+ .build()?;
+ let plan = LogicalPlanBuilder::from(table_scan)
+ .filter(exists(Arc::new(subquery)))?
+ .project(vec![col("test.b")])?
+ .build()?;
+
+ let expected = "Projection: test.b [b:UInt32]\
+ \n LeftSemi Join: Filter: UInt32(1) + sq.a > test.a * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\
+ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\
+ \n Distinct: [a:UInt32]\
+ \n Projection: sq.a [a:UInt32]\
+ \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]";
+
+ assert_plan_eq(&plan, expected)
+ }
Review Comment:
> For postgres, it will not add back the distinct to the optimized result.
I think it likely depends on how the join operators are implemented.
```
# current logical plan:
Projection: t1.t1_id
LeftSemi Join: Filter: t2.t2_id > t1.t1_id
TableScan: t1 projection=[t1_id]
Projection: t2.t2_id # DISTINCT is not added back
TableScan: t2 projection=[t2_id]
```
If the `LeftSemiJoin` is going to read its build side into (effectively) a
HashSet, then it doesn't really matter whether the input is deduplicated
before it reaches the join.
However, since there is no equality predicate (the predicate is `>`), I am
not sure what our LeftSemiJoin will do (maybe it will buffer the entire build
side?). In that case it might help performance to put a DISTINCT on the output
of the Projection to reduce the cardinality that the join buffers.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]