cloud-fan commented on code in PR #46143:
URL: https://github.com/apache/spark/pull/46143#discussion_r2525726304
##########
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala:
##########
@@ -152,20 +156,122 @@ class FilterPushdownSuite extends PlanTest {
test("can't push without rewrite") {
val originalQuery =
testRelation
- .select($"a" + $"b" as "e")
+ .select($"a" + $"b" as "e", $"a" - $"b" as "f")
.where($"e" === 1)
.analyze
val optimized = Optimize.execute(originalQuery.analyze)
val correctAnswer =
testRelation
.where($"a" + $"b" === 1)
- .select($"a" + $"b" as "e")
+ .select($"a" + $"b" as "e", $"a" - $"b" as "f")
+ .analyze
+
+ comparePlans(optimized, correctAnswer)
+ }
+
+ test("SPARK-47672: Avoid double evaluation with projections but push
components that can be") {
+ val originalQuery = testStringRelation
+ .select($"a", $"e".rlike("magic") as "f", $"e".rlike("notmagic") as "j",
$"b")
+ .where($"a" > 5 && $"f")
+ .analyze
+
+ val optimized = Optimize.execute(originalQuery)
+
+ val correctAnswer = testStringRelation
+ .where($"a" > 5)
+ .select($"a", $"b", $"e", $"e".rlike("magic") as "f")
+ .where($"f")
Review Comment:
I get that the purpose here is to reduce the input to other expensive
functions in the same Project. But splitting a Project has overhead as well
(more operators, more overhead), and in most cases the benefit of filter
pushdown is to reduce IO for shuffle/scan. Shall we defer this optimization?
Then the implementation can be much simpler:
https://github.com/apache/spark/pull/46143/files#r2525714854
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]