This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-2.4 by this push: new 40668c5 [SPARK-27351][SQL] Wrong outputRows estimation after AggregateEstimation wit… 40668c5 is described below commit 40668c53ed799881db1f316ceaf2f978b294d8ed Author: pengbo <bo.peng1...@gmail.com> AuthorDate: Mon Apr 15 15:37:07 2019 -0700 [SPARK-27351][SQL] Wrong outputRows estimation after AggregateEstimation wit… ## What changes were proposed in this pull request? The upper bound of group-by columns row number is to multiply distinct counts of group-by columns. However, column with only null value will cause the output row number to be 0 which is incorrect. Ex: col1 (distinct: 2, rowCount 2) col2 (distinct: 0, rowCount 2) => group by col1, col2 Actual: output rows: 0 Expected: output rows: 2 ## How was this patch tested? According unit test has been added, plus manual test has been done in our tpcds benchmark environement. Closes #24286 from pengbo/master. Lead-authored-by: pengbo <bo.peng1...@gmail.com> Co-authored-by: mingbo_pb <mingbo...@alibaba-inc.com> Signed-off-by: Dongjoon Hyun <dh...@apple.com> (cherry picked from commit c58a4fed8d79aff9fbac9f9a33141b2edbfb0cea) Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../plans/logical/statsEstimation/AggregateEstimation.scala | 12 ++++++++++-- .../catalyst/statsEstimation/AggregateEstimationSuite.scala | 12 +++++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala index 111c594..7ef22fa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala @@ -39,8 +39,16 @@ object AggregateEstimation { // Multiply distinct counts of group-by columns. This is an upper bound, which assumes // the data contains all combinations of distinct values of group-by columns. var outputRows: BigInt = agg.groupingExpressions.foldLeft(BigInt(1))( - (res, expr) => res * - childStats.attributeStats(expr.asInstanceOf[Attribute]).distinctCount.get) + (res, expr) => { + val columnStat = childStats.attributeStats(expr.asInstanceOf[Attribute]) + val distinctCount = columnStat.distinctCount.get + val distinctValue: BigInt = if (distinctCount == 0 && columnStat.nullCount.get > 0) { + 1 + } else { + distinctCount + } + res * distinctValue + }) outputRows = if (agg.groupingExpressions.isEmpty) { // If there's no group-by columns, the output is a single row containing values of aggregate diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala index 8213d56..6bdf8cd 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala @@ -38,7 +38,9 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest { attr("key22") -> ColumnStat(distinctCount = Some(2), min = Some(10), max = Some(20), nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)), attr("key31") -> ColumnStat(distinctCount = Some(0), min = None, max = None, - nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)) + nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)), + attr("key32") -> ColumnStat(distinctCount = Some(0), min = None, max = None, + nullCount = Some(4), avgLen = Some(4), maxLen = Some(4)) )) private val nameToAttr: Map[String, Attribute] = columnInfo.map(kv => kv._1.name -> kv._1) @@ -92,6 +94,14 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest { expectedOutputRowCount = 0) } + test("group-by column with only null value") { + checkAggStats( + tableColumns = Seq("key22", "key32"), + tableRowCount = 6, + groupByColumns = Seq("key22", "key32"), + expectedOutputRowCount = nameToColInfo("key22")._2.distinctCount.get) + } + test("non-cbo estimation") { val attributes = Seq("key12").map(nameToAttr) val child = StatsTestPlan( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org