This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 239082d [SPARK-27403][SQL] Fix `updateTableStats` to update table stats always with new stats or None
239082d is described below
commit 239082d9667a4fa4198bd9524d63c739df147e0e
Author: s71955 <[email protected]>
AuthorDate: Thu Apr 11 08:53:00 2019 -0700
[SPARK-27403][SQL] Fix `updateTableStats` to update table stats always with new stats or None
## What changes were proposed in this pull request?
The system shall update the table stats automatically if the user sets
spark.sql.statistics.size.autoUpdate.enabled to true; currently this property
has no effect, whether it is enabled or disabled. This feature is similar to
Hive's auto-gather feature, where statistics are computed automatically by
default when the feature is enabled.
Reference:
https://cwiki.apache.org/confluence/display/Hive/StatsDev
As part of the fix, the autoSizeUpdateEnabled check is performed first, so that
the system automatically calculates the table size for the user and records it
in the metastore, as the user expects.
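For illustration, a minimal spark-shell sketch of the behavior this fix enables
(the table name `t` and the `DESCRIBE TABLE EXTENDED` check are illustrative
only, not part of this patch):
```scala
// Enable automatic size updates (off by default).
spark.conf.set("spark.sql.statistics.size.autoUpdate.enabled", "true")

spark.sql("CREATE TABLE t (i INT, j STRING) USING PARQUET")
// With the fix, this INSERT recomputes the table size and records it in the
// metastore even when no statistics existed for the table before.
spark.sql("INSERT INTO TABLE t SELECT 1, 'abc'")

// The recorded size shows up in the Statistics row of the DESCRIBE output.
spark.sql("DESCRIBE TABLE EXTENDED t")
  .filter("col_name = 'Statistics'")
  .show(truncate = false)
```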
## How was this patch tested?
A unit test was added, and the change was also verified manually with internal
tests on a real cluster.
Closes #24315 from sujith71955/master_autoupdate.
Authored-by: s71955 <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../spark/sql/execution/command/CommandUtils.scala | 18 ++++++++----------
.../apache/spark/sql/StatisticsCollectionSuite.scala | 20 ++++++++++++++++++++
2 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
index dea1e01..70e7cd9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
@@ -42,16 +42,14 @@ object CommandUtils extends Logging {
   /** Change statistics after changing data by commands. */
   def updateTableStats(sparkSession: SparkSession, table: CatalogTable): Unit = {
-    if (table.stats.nonEmpty) {
-      val catalog = sparkSession.sessionState.catalog
-      if (sparkSession.sessionState.conf.autoSizeUpdateEnabled) {
-        val newTable = catalog.getTableMetadata(table.identifier)
-        val newSize = CommandUtils.calculateTotalSize(sparkSession, newTable)
-        val newStats = CatalogStatistics(sizeInBytes = newSize)
-        catalog.alterTableStats(table.identifier, Some(newStats))
-      } else {
-        catalog.alterTableStats(table.identifier, None)
-      }
+    val catalog = sparkSession.sessionState.catalog
+    if (sparkSession.sessionState.conf.autoSizeUpdateEnabled) {
+      val newTable = catalog.getTableMetadata(table.identifier)
+      val newSize = CommandUtils.calculateTotalSize(sparkSession, newTable)
+      val newStats = CatalogStatistics(sizeInBytes = newSize)
+      catalog.alterTableStats(table.identifier, Some(newStats))
+    } else if (table.stats.nonEmpty) {
+      catalog.alterTableStats(table.identifier, None)
     }
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
index 90b3586..4e1e424 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
@@ -337,6 +337,26 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
}
}
+ test("auto gather stats after insert command") {
+ val table = "change_stats_insert_datasource_table"
+ Seq(false, true).foreach { autoUpdate =>
+ withSQLConf(SQLConf.AUTO_SIZE_UPDATE_ENABLED.key -> autoUpdate.toString)
{
+ withTable(table) {
+ sql(s"CREATE TABLE $table (i int, j string) USING PARQUET")
+ // insert into command
+ sql(s"INSERT INTO TABLE $table SELECT 1, 'abc'")
+ val stats = getCatalogTable(table).stats
+ if (autoUpdate) {
+ assert(stats.isDefined)
+ assert(stats.get.sizeInBytes >= 0)
+ } else {
+ assert(stats.isEmpty)
+ }
+ }
+ }
+ }
+ }
+
test("invalidation of tableRelationCache after inserts") {
val table = "invalidate_catalog_cache_table"
Seq(false, true).foreach { autoUpdate =>
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]