This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f2d6b7bb5dd Revert "[SPARK-39043][SQL] Spark SQL Hive client should
not gather statistic by default"
f2d6b7bb5dd is described below
commit f2d6b7bb5ddc121249fcd0cd6d01f1c47e6e4c08
Author: Wenchen Fan <[email protected]>
AuthorDate: Thu May 19 20:06:11 2022 +0800
Revert "[SPARK-39043][SQL] Spark SQL Hive client should not gather
statistic by default"
This reverts commit fba30cd491b6163f4a469296bb5af293712ca8d4.
---
docs/sql-migration-guide.md | 4 ----
.../main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala | 3 ---
.../src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala | 3 ++-
.../test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala | 1 -
4 files changed, 2 insertions(+), 9 deletions(-)
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index 555b3125a6b..59b8d47d306 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -22,10 +22,6 @@ license: |
* Table of contents
{:toc}
-## Upgrading from Spark SQL 3.3 to 3.4
-
- - Since Spark 3.4, Spark disables `hive.stats.autogather` by default, which means Hive tables won't automatically update statistics that can be consumed by Hive (not Spark). To restore the behavior before Spark 3.4, you can set `spark.hadoop.hive.stats.autogather` to `true`.
-
## Upgrading from Spark SQL 3.2 to 3.3
- Since Spark 3.3, the `histogram_numeric` function in Spark SQL returns an
output type of an array of structs (x, y), where the type of the 'x' field in
the return value is propagated from the input values consumed in the aggregate
function. In Spark 3.2 or earlier, 'x' always had double type. Optionally, use
the configuration `spark.sql.legacy.histogramNumericPropagateInputType` since
Spark 3.3 to revert back to the previous behavior.
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index 12cd2740d54..d70ac781c03 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -1270,9 +1270,6 @@ private[hive] object HiveClientImpl extends Logging {
}
// Disable CBO because we removed the Calcite dependency.
hiveConf.setBoolean("hive.cbo.enable", false)
- // Disable auto gather statistic by default.
- hiveConf.setBoolean("hive.stats.autogather", confMap.contains("hive.stats.autogather") &&
- confMap("hive.stats.autogather").equalsIgnoreCase("true"))
// If this is true, SessionState.start will create a file to log hive job which will not be
// deleted on exit and is useless for spark
if (hiveConf.getBoolean("hive.session.history.enabled", false)) {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index fb79c025fc3..c689682a46b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -101,7 +101,8 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
.asInstanceOf[HiveTableRelation]
val properties = relation.tableMeta.ignoredProperties
- assert(properties.get("totalSize").isEmpty)
+ // Since HIVE-6727, Hive fixes table-level stats for external tables are incorrect.
+ assert(properties("totalSize").toLong == 6)
assert(properties.get("rawDataSize").isEmpty)
val sizeInBytes = relation.stats.sizeInBytes
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala
index 5949ada158d..ad0f9a56a82 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala
@@ -62,7 +62,6 @@ class HiveClientSuite(version: String, allVersions: Seq[String])
System.gc() // Hack to avoid SEGV on some JVM versions.
val hadoopConf = new Configuration()
hadoopConf.set("test", "success")
- hadoopConf.set("hive.stats.autogather", "true")
client = buildClient(hadoopConf)
if (versionSpark != null) versionSpark.reset()
versionSpark = TestHiveVersion(client)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]