maropu commented on a change in pull request #30999:
URL: https://github.com/apache/spark/pull/30999#discussion_r554411937
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
##########
@@ -349,11 +349,20 @@ abstract class Optimizer(catalogManager: CatalogManager)
*/
object EliminateDistinct extends Rule[LogicalPlan] {
override def apply(plan: LogicalPlan): LogicalPlan = plan
transformExpressions {
- case ae: AggregateExpression if ae.isDistinct =>
- ae.aggregateFunction match {
- case _: Max | _: Min => ae.copy(isDistinct = false)
- case _ => ae
- }
+ case ae: AggregateExpression if ae.isDistinct &&
isDuplicateAgnostic(ae.aggregateFunction) =>
+ ae.copy(isDistinct = false)
+ }
+
+ private def isDuplicateAgnostic(af: AggregateFunction): Boolean = af match {
+ case _: Max => true
+ case _: Min => true
+ case _: BitAndAgg => true
+ case _: BitOrAgg => true
+ case _: First => true
+ case _: Last => true
+ case _: HyperLogLogPlusPlus => true
Review comment:
Yeah, removing it for now looks fine to me.
##########
File path:
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateDistinctSuite.scala
##########
@@ -32,25 +34,25 @@ class EliminateDistinctSuite extends PlanTest {
val testRelation = LocalRelation('a.int)
- test("Eliminate Distinct in Max") {
- val query = testRelation
- .select(maxDistinct('a).as('result))
- .analyze
- val answer = testRelation
- .select(max('a).as('result))
- .analyze
- assert(query != answer)
- comparePlans(Optimize.execute(query), answer)
- }
-
- test("Eliminate Distinct in Min") {
- val query = testRelation
- .select(minDistinct('a).as('result))
- .analyze
- val answer = testRelation
- .select(min('a).as('result))
- .analyze
- assert(query != answer)
- comparePlans(Optimize.execute(query), answer)
+ Seq(
+ ("max", Max(_)),
+ ("min", Min(_)),
Review comment:
Do we need the first `name` param? How about using `prettyName` instead?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]