Github user wzhfy commented on a diff in the pull request:
https://github.com/apache/spark/pull/18334#discussion_r124469169
--- Diff:
sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala ---
@@ -448,6 +433,145 @@ class StatisticsSuite extends
StatisticsCollectionTestBase with TestHiveSingleto
"ALTER TABLE unset_prop_table UNSET TBLPROPERTIES ('prop1')")
}
+ /**
+ * To see if stats exist, we need to check spark's stats properties
instead of catalog
+ * statistics, because hive would change stats in metastore and thus
change catalog statistics.
+ */
+ private def getStatsProperties(tableName: String): Map[String, String] =
{
+ val hTable =
hiveClient.getTable(spark.sessionState.catalog.getCurrentDatabase, tableName)
+ hTable.properties.filterKeys(_.startsWith(STATISTICS_PREFIX))
+ }
+
+ test("change stats after insert command for hive table") {
+ val table = s"change_stats_insert_hive_table"
+ Seq(false, true).foreach { autoUpdate =>
+ withSQLConf(SQLConf.AUTO_UPDATE_SIZE.key -> autoUpdate.toString) {
+ withTable(table) {
+ sql(s"CREATE TABLE $table (i int, j string)")
+ // analyze to get initial stats
+ sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j")
+ val fetched1 = checkTableStats(table, hasSizeInBytes = true,
expectedRowCounts = Some(0))
+ assert(fetched1.get.sizeInBytes == 0)
+ assert(fetched1.get.colStats.size == 2)
+
+ // insert into command
+ sql(s"INSERT INTO TABLE $table SELECT 1, 'abc'")
+ if (autoUpdate) {
+ val fetched2 = checkTableStats(table, hasSizeInBytes = true,
expectedRowCounts = None)
+ assert(fetched2.get.sizeInBytes > 0)
+ assert(fetched2.get.colStats.isEmpty)
+ val statsProp = getStatsProperties(table)
+ assert(statsProp(STATISTICS_TOTAL_SIZE).toLong ==
fetched2.get.sizeInBytes)
+ } else {
+ assert(getStatsProperties(table).isEmpty)
+ }
+ }
+ }
+ }
+ }
+
+ test("change stats after load data command") {
+ val table = "change_stats_load_table"
+ Seq(false, true).foreach { autoUpdate =>
+ withSQLConf(SQLConf.AUTO_UPDATE_SIZE.key -> autoUpdate.toString) {
+ withTable(table) {
+ sql(s"CREATE TABLE $table (i INT, j STRING) STORED AS PARQUET")
+ // analyze to get initial stats
+ sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS i, j")
+ val fetched1 = checkTableStats(table, hasSizeInBytes = true,
expectedRowCounts = Some(0))
+ assert(fetched1.get.sizeInBytes == 0)
+ assert(fetched1.get.colStats.size == 2)
+
+ withTempDir { loadPath =>
+ // load data command
+ val file = new File(loadPath + "/data")
+ val writer = new PrintWriter(file)
+ writer.write("2,xyz")
+ writer.close()
+ sql(s"LOAD DATA INPATH '${loadPath.toURI.toString}' INTO TABLE
$table")
+ if (autoUpdate) {
+ val fetched2 = checkTableStats(table, hasSizeInBytes = true,
expectedRowCounts = None)
+ assert(fetched2.get.sizeInBytes > 0)
+ assert(fetched2.get.colStats.isEmpty)
+ val statsProp = getStatsProperties(table)
+ assert(statsProp(STATISTICS_TOTAL_SIZE).toLong ==
fetched2.get.sizeInBytes)
+ } else {
+ assert(getStatsProperties(table).isEmpty)
+ }
+ }
+ }
+ }
+ }
+ }
+
+ test("change stats after add/drop partition command") {
--- End diff --
We can't get table properties through hiveClient in the parent class.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes to enable it, or if the feature is enabled but not working,
please contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]