karuppayya commented on a change in pull request #28662:
URL: https://github.com/apache/spark/pull/28662#discussion_r432969864



##########
File path: 
sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
##########
@@ -111,19 +114,29 @@ class ResolveHiveSerdeTable(session: SparkSession) 
extends Rule[LogicalPlan] {
   }
 }
 
-class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] {
-  private def hiveTableWithStats(relation: HiveTableRelation): 
HiveTableRelation = {
-    val table = relation.tableMeta
+class DetermineTableStats(val session: SparkSession) extends Rule[LogicalPlan] 
{
+  private val relationToSizeMap: mutable.Map[TableIdentifier, Long] = 
mutable.Map.empty
+
+  private[hive] def hiveTableWithStats(relation: HiveTableRelation): 
HiveTableRelation = {
     val partitionCols = relation.partitionCols
     val conf = session.sessionState.conf
     // For partitioned tables, the partition directory may be outside of the 
table directory.
     // Which is expensive to get table size. Please see how we implemented it 
in the AnalyzeTable.
     val sizeInBytes = if (conf.fallBackToHdfsForStatsEnabled && 
partitionCols.isEmpty) {
       try {
-        val hadoopConf = session.sessionState.newHadoopConf()
-        val tablePath = new Path(table.location)
-        val fs: FileSystem = tablePath.getFileSystem(hadoopConf)
-        fs.getContentSummary(tablePath).getLength
+        val table = relation.tableMeta
+        val relationSizeMap = getRelationToSizeMap
+        if (relationSizeMap.contains(table.identifier)) {

Review comment:
       I have now handled this in a more generic way in 
https://github.com/apache/spark/pull/28662/commits/0a1fb93bf3ac40820e6090714967b360d9263db5.
   The way to refresh a table in a Spark session is to use the `REFRESH TABLE 
<tblname>` command or the corresponding DF APIs.
   With this change, the cache is invalidated whenever a table refresh is 
invoked.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to