[spark] branch master updated: Revert "[SPARK-39043][SQL] Spark SQL Hive client should not gather statistic by default"

wenchen Thu, 19 May 2022 05:06:55 -0700

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new f2d6b7bb5dd Revert "[SPARK-39043][SQL] Spark SQL Hive client should not gather statistic by default"
f2d6b7bb5dd is described below

commit f2d6b7bb5ddc121249fcd0cd6d01f1c47e6e4c08
Author: Wenchen Fan <[email protected]>
AuthorDate: Thu May 19 20:06:11 2022 +0800

    Revert "[SPARK-39043][SQL] Spark SQL Hive client should not gather statistic by default"
    
    This reverts commit fba30cd491b6163f4a469296bb5af293712ca8d4.
---
 docs/sql-migration-guide.md                                           | 4 ----
 .../main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala  | 3 ---
 .../src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala    | 3 ++-
 .../test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala | 1 -
 4 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index 555b3125a6b..59b8d47d306 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -22,10 +22,6 @@ license: |
 * Table of contents
 {:toc}
 
-## Upgrading from Spark SQL 3.3 to 3.4
-  
-  - Since Spark 3.4, Spark disables `hive.stats.autogather` by default, which means Hive tables won't automatically update statistics that can be consumed by Hive (not Spark). To restore the behavior before Spark 3.4, you can set `spark.hadoop.hive.stats.autogather` to `true`.
-
 ## Upgrading from Spark SQL 3.2 to 3.3
 
   - Since Spark 3.3, the `histogram_numeric` function in Spark SQL returns an 
output type of an array of structs (x, y), where the type of the 'x' field in 
the return value is propagated from the input values consumed in the aggregate 
function. In Spark 3.2 or earlier, 'x' always had double type. Optionally, use 
the configuration `spark.sql.legacy.histogramNumericPropagateInputType` since 
Spark 3.3 to revert back to the previous behavior. 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index 12cd2740d54..d70ac781c03 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -1270,9 +1270,6 @@ private[hive] object HiveClientImpl extends Logging {
     }
     // Disable CBO because we removed the Calcite dependency.
     hiveConf.setBoolean("hive.cbo.enable", false)
-    // Disable auto gather statistic by default.
-    hiveConf.setBoolean("hive.stats.autogather", 
confMap.contains("hive.stats.autogather") &&
-          confMap("hive.stats.autogather").equalsIgnoreCase("true"))
     // If this is true, SessionState.start will create a file to log hive job which will not be
     // deleted on exit and is useless for spark
     if (hiveConf.getBoolean("hive.session.history.enabled", false)) {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index fb79c025fc3..c689682a46b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -101,7 +101,8 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
             .asInstanceOf[HiveTableRelation]
 
           val properties = relation.tableMeta.ignoredProperties
-          assert(properties.get("totalSize").isEmpty)
+          // Since HIVE-6727, Hive fixes table-level stats for external tables are incorrect.
+          assert(properties("totalSize").toLong == 6)
           assert(properties.get("rawDataSize").isEmpty)
 
           val sizeInBytes = relation.stats.sizeInBytes
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala
index 5949ada158d..ad0f9a56a82 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala
@@ -62,7 +62,6 @@ class HiveClientSuite(version: String, allVersions: Seq[String])
     System.gc() // Hack to avoid SEGV on some JVM versions.
     val hadoopConf = new Configuration()
     hadoopConf.set("test", "success")
-    hadoopConf.set("hive.stats.autogather", "true")
     client = buildClient(hadoopConf)
     if (versionSpark != null) versionSpark.reset()
     versionSpark = TestHiveVersion(client)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[spark] branch master updated: Revert "[SPARK-39043][SQL] Spark SQL Hive client should not gather statistic by default"

Reply via email to