Github user xuanyuanking commented on a diff in the pull request:
https://github.com/apache/spark/pull/16135#discussion_r156254083
--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala ---
@@ -352,4 +353,34 @@ class PartitionedTablePerfStatsSuite
       }
     }
   }
+
+  test("SPARK-18700: table loaded only once even when resolved concurrently") {
+    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {
+      withTable("test") {
+        withTempDir { dir =>
+          HiveCatalogMetrics.reset()
+          setupPartitionedHiveTable("test", dir, 50)
+          // select the table from multiple threads
+          val executorPool = Executors.newFixedThreadPool(10)
+          (1 to 10).map(threadId => {
+            val runnable = new Runnable {
+              override def run(): Unit = {
+                spark.sql("select * from test where partCol1 = 999").count()
+              }
+            }
+            executorPool.execute(runnable)
+            None
+          })
+          executorPool.shutdown()
+          executorPool.awaitTermination(30, TimeUnit.SECONDS)
+          // Check the cache hit via METRIC_FILES_DISCOVERED and
+          // METRIC_PARALLEL_LISTING_JOB_COUNT. While the lock takes effect,
+          // only one thread can really do the build, so the listing job count
+          // is 2 (the other one comes from the cache.load func). Also,
+          // METRIC_FILES_DISCOVERED is $partition_num * 2.
--- End diff --
@gatorsmile Xiao fixed this in https://github.com/apache/spark/pull/16481
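
For context, the assertions themselves are elided from the quoted diff above. A minimal sketch of what the inline comment describes, assuming the HiveCatalogMetrics counters from org.apache.spark.metrics.source and the 50 partitions created by setupPartitionedHiveTable, could look like the following (hypothetical illustration, not the actual diff body):

    import org.apache.spark.metrics.source.HiveCatalogMetrics

    // Hypothetical assertions mirroring the comment in the diff above.
    // Only one thread actually builds the relation; the second listing job
    // comes from the cache.load function, so two listing jobs run in total.
    assert(HiveCatalogMetrics.METRIC_PARALLEL_LISTING_JOB_COUNT.getCount() == 2)
    // Per the comment, files discovered is $partition_num * 2, i.e. 50 * 2.
    assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 100)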
---