spark git commit: [SPARK-22917][SQL] Should not try to generate histogram for empty/null columns

wenchen Thu, 28 Dec 2017 05:50:41 -0800

Repository: spark
Updated Branches:
  refs/heads/master 755f2f518 -> 287781742



[SPARK-22917][SQL] Should not try to generate histogram for empty/null columns

## What changes were proposed in this pull request?

For an empty/null column, the result of `ApproximatePercentile` is null. Then in
`ApproxCountDistinctForIntervals`, a `MatchError` (for `endpoints`) will be
thrown if we try to generate a histogram for that column. Besides, there is no
need to generate a histogram for such a column. In this patch, we exclude such
columns when generating histograms.

## How was this patch tested?

Enhanced test cases for empty/null columns.

Author: Zhenhua Wang <[email protected]>

Closes #20102 from wzhfy/no_record_hgm_bug.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/28778174
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/28778174
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/28778174

Branch: refs/heads/master
Commit: 28778174208664327b75915e83ae5e611360eef3
Parents: 755f2f5
Author: Zhenhua Wang <[email protected]>
Authored: Thu Dec 28 21:49:37 2017 +0800
Committer: Wenchen Fan <[email protected]>
Committed: Thu Dec 28 21:49:37 2017 +0800

----------------------------------------------------------------------
 .../command/AnalyzeColumnCommand.scala          |  7 ++++++-
 .../spark/sql/StatisticsCollectionSuite.scala   | 21 ++++++++++++++++++--
 2 files changed, 25 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/28778174/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
index e3bb4d3..1122522 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
@@ -143,7 +143,12 @@ case class AnalyzeColumnCommand(
       val percentilesRow = new QueryExecution(sparkSession, Aggregate(Nil, 
namedExprs, relation))
         .executedPlan.executeTake(1).head
       attrsToGenHistogram.zipWithIndex.foreach { case (attr, i) =>
-        attributePercentiles += attr -> percentilesRow.getArray(i)
+        val percentiles = percentilesRow.getArray(i)
+        // When there is no non-null value, `percentiles` is null. In such 
case, there is no
+        // need to generate histogram.
+        if (percentiles != null) {
+          attributePercentiles += attr -> percentiles
+        }
       }
     }
     AttributeMap(attributePercentiles.toSeq)

http://git-wip-us.apache.org/repos/asf/spark/blob/28778174/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
index fba5d26..b11e798 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
@@ -85,13 +85,24 @@ class StatisticsCollectionSuite extends 
StatisticsCollectionTestBase with Shared
   test("analyze empty table") {
     val table = "emptyTable"
     withTable(table) {
-      sql(s"CREATE TABLE $table (key STRING, value STRING) USING PARQUET")
+      val df = Seq.empty[Int].toDF("key")
+      df.write.format("json").saveAsTable(table)
       sql(s"ANALYZE TABLE $table COMPUTE STATISTICS noscan")
       val fetchedStats1 = checkTableStats(table, hasSizeInBytes = true, 
expectedRowCounts = None)
       assert(fetchedStats1.get.sizeInBytes == 0)
       sql(s"ANALYZE TABLE $table COMPUTE STATISTICS")
       val fetchedStats2 = checkTableStats(table, hasSizeInBytes = true, 
expectedRowCounts = Some(0))
       assert(fetchedStats2.get.sizeInBytes == 0)
+
+      val expectedColStat =
+        "key" -> ColumnStat(0, None, None, 0, IntegerType.defaultSize, 
IntegerType.defaultSize)
+
+      // There won't be histogram for empty column.
+      Seq("true", "false").foreach { histogramEnabled =>
+        withSQLConf(SQLConf.HISTOGRAM_ENABLED.key -> histogramEnabled) {
+          checkColStats(df, mutable.LinkedHashMap(expectedColStat))
+        }
+      }
     }
   }
 
@@ -178,7 +189,13 @@ class StatisticsCollectionSuite extends 
StatisticsCollectionTestBase with Shared
     val expectedColStats = dataTypes.map { case (tpe, idx) =>
       (s"col$idx", ColumnStat(0, None, None, 1, tpe.defaultSize.toLong, 
tpe.defaultSize.toLong))
     }
-    checkColStats(df, mutable.LinkedHashMap(expectedColStats: _*))
+
+    // There won't be histograms for null columns.
+    Seq("true", "false").foreach { histogramEnabled =>
+      withSQLConf(SQLConf.HISTOGRAM_ENABLED.key -> histogramEnabled) {
+        checkColStats(df, mutable.LinkedHashMap(expectedColStats: _*))
+      }
+    }
   }
 
   test("number format in statistics") {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-22917][SQL] Should not try to generate histogram for empty/null columns

Reply via email to