Repository: spark Updated Branches: refs/heads/master 755f2f518 -> 287781742
[SPARK-22917][SQL] Should not try to generate histogram for empty/null columns ## What changes were proposed in this pull request? For an empty/null column, the result of `ApproximatePercentile` is null. Then in `ApproxCountDistinctForIntervals`, a `MatchError` (for `endpoints`) will be thrown if we try to generate a histogram for that column. Besides, there is no need to generate a histogram for such a column. In this patch, we exclude such columns when generating histograms. ## How was this patch tested? Enhanced test cases for empty/null columns. Author: Zhenhua Wang <[email protected]> Closes #20102 from wzhfy/no_record_hgm_bug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/28778174 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/28778174 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/28778174 Branch: refs/heads/master Commit: 28778174208664327b75915e83ae5e611360eef3 Parents: 755f2f5 Author: Zhenhua Wang <[email protected]> Authored: Thu Dec 28 21:49:37 2017 +0800 Committer: Wenchen Fan <[email protected]> Committed: Thu Dec 28 21:49:37 2017 +0800 ---------------------------------------------------------------------- .../command/AnalyzeColumnCommand.scala | 7 ++++++- .../spark/sql/StatisticsCollectionSuite.scala | 21 ++++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/28778174/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index e3bb4d3..1122522 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -143,7 +143,12 @@ case class AnalyzeColumnCommand( val percentilesRow = new QueryExecution(sparkSession, Aggregate(Nil, namedExprs, relation)) .executedPlan.executeTake(1).head attrsToGenHistogram.zipWithIndex.foreach { case (attr, i) => - attributePercentiles += attr -> percentilesRow.getArray(i) + val percentiles = percentilesRow.getArray(i) + // When there is no non-null value, `percentiles` is null. In such case, there is no + // need to generate histogram. + if (percentiles != null) { + attributePercentiles += attr -> percentiles + } } } AttributeMap(attributePercentiles.toSeq) http://git-wip-us.apache.org/repos/asf/spark/blob/28778174/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index fba5d26..b11e798 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -85,13 +85,24 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared test("analyze empty table") { val table = "emptyTable" withTable(table) { - sql(s"CREATE TABLE $table (key STRING, value STRING) USING PARQUET") + val df = Seq.empty[Int].toDF("key") + df.write.format("json").saveAsTable(table) sql(s"ANALYZE TABLE $table COMPUTE STATISTICS noscan") val fetchedStats1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None) assert(fetchedStats1.get.sizeInBytes == 0) sql(s"ANALYZE TABLE $table COMPUTE STATISTICS") val fetchedStats2 = checkTableStats(table, hasSizeInBytes = true, 
expectedRowCounts = Some(0)) assert(fetchedStats2.get.sizeInBytes == 0) + + val expectedColStat = + "key" -> ColumnStat(0, None, None, 0, IntegerType.defaultSize, IntegerType.defaultSize) + + // There won't be histogram for empty column. + Seq("true", "false").foreach { histogramEnabled => + withSQLConf(SQLConf.HISTOGRAM_ENABLED.key -> histogramEnabled) { + checkColStats(df, mutable.LinkedHashMap(expectedColStat)) + } + } } } @@ -178,7 +189,13 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared val expectedColStats = dataTypes.map { case (tpe, idx) => (s"col$idx", ColumnStat(0, None, None, 1, tpe.defaultSize.toLong, tpe.defaultSize.toLong)) } - checkColStats(df, mutable.LinkedHashMap(expectedColStats: _*)) + + // There won't be histograms for null columns. + Seq("true", "false").foreach { histogramEnabled => + withSQLConf(SQLConf.HISTOGRAM_ENABLED.key -> histogramEnabled) { + checkColStats(df, mutable.LinkedHashMap(expectedColStats: _*)) + } + } } test("number format in statistics") { --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
