ulysses-you commented on code in PR #36117:
URL: https://github.com/apache/spark/pull/36117#discussion_r846970117
##########
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateDistinctSuite.scala:
##########
@@ -71,4 +72,26 @@ class EliminateDistinctSuite extends PlanTest {
comparePlans(Optimize.execute(query), answer)
}
}
+
+ test("SPARK-38832: Remove unnecessary distinct in aggregate expression by
distinctKeys") {
+ val q1 = testRelation2.groupBy($"a")($"a")
+ .rebalance().groupBy()(countDistinct($"a") as "x", sumDistinct($"a") as
"y").analyze
+ val r1 = testRelation2.groupBy($"a")($"a")
+ .rebalance().groupBy()(count($"a") as "x", sum($"a") as "y").analyze
+ comparePlans(Optimize.execute(q1), r1)
+
+ // not a subset of distinct attr
+ val q2 = testRelation2.groupBy($"a", $"b")($"a", $"b")
+ .rebalance().groupBy()(countDistinct($"a") as "x", sumDistinct($"a") as
"y").analyze
+ comparePlans(Optimize.execute(q2), q2)
+
+ // avoid remove double data type attr
Review Comment:
The physical Aggregate will wrap `NormalizeNaNAndZero` for float/double to
handle NaN and -0.0, so its result value might be different from the original
expression?
##########
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateDistinctSuite.scala:
##########
@@ -71,4 +72,26 @@ class EliminateDistinctSuite extends PlanTest {
comparePlans(Optimize.execute(query), answer)
}
}
+
+ test("SPARK-38832: Remove unnecessary distinct in aggregate expression by
distinctKeys") {
+ val q1 = testRelation2.groupBy($"a")($"a")
+ .rebalance().groupBy()(countDistinct($"a") as "x", sumDistinct($"a") as
"y").analyze
+ val r1 = testRelation2.groupBy($"a")($"a")
+ .rebalance().groupBy()(count($"a") as "x", sum($"a") as "y").analyze
+ comparePlans(Optimize.execute(q1), r1)
+
+ // not a subset of distinct attr
+ val q2 = testRelation2.groupBy($"a", $"b")($"a", $"b")
+ .rebalance().groupBy()(countDistinct($"a") as "x", sumDistinct($"a") as
"y").analyze
+ comparePlans(Optimize.execute(q2), q2)
+
+ // avoid remove double data type attr
Review Comment:
The physical Aggregate will wrap `NormalizeNaNAndZero` for float/double to
handle NaN and -0.0, so its result value might be different from the original
expression?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]