karuppayya commented on a change in pull request #28662:
URL: https://github.com/apache/spark/pull/28662#discussion_r432969864
##########
File path:
sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
##########
@@ -111,19 +114,29 @@ class ResolveHiveSerdeTable(session: SparkSession)
extends Rule[LogicalPlan] {
}
}
-class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] {
- private def hiveTableWithStats(relation: HiveTableRelation):
HiveTableRelation = {
- val table = relation.tableMeta
+class DetermineTableStats(val session: SparkSession) extends Rule[LogicalPlan]
{
+ private val relationToSizeMap: mutable.Map[TableIdentifier, Long] =
mutable.Map.empty
+
+ private[hive] def hiveTableWithStats(relation: HiveTableRelation):
HiveTableRelation = {
val partitionCols = relation.partitionCols
val conf = session.sessionState.conf
// For partitioned tables, the partition directory may be outside of the
table directory.
// Which is expensive to get table size. Please see how we implemented it
in the AnalyzeTable.
val sizeInBytes = if (conf.fallBackToHdfsForStatsEnabled &&
partitionCols.isEmpty) {
try {
- val hadoopConf = session.sessionState.newHadoopConf()
- val tablePath = new Path(table.location)
- val fs: FileSystem = tablePath.getFileSystem(hadoopConf)
- fs.getContentSummary(tablePath).getLength
+ val table = relation.tableMeta
+ val relationSizeMap = getRelationToSizeMap
+ if (relationSizeMap.contains(table.identifier)) {
Review comment:
I have now handled this in a more generic way in
https://github.com/apache/spark/pull/28662/commits/0a1fb93bf3ac40820e6090714967b360d9263db5.
The way to refresh a table in a Spark session is to use the `REFRESH TABLE
<tblname>` command or the corresponding DataFrame APIs (please let me know if
this is not correct).
With this change, cache invalidation happens whenever a table refresh is
invoked.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]