GitHub user jinxing64 commented on a diff in the pull request:
https://github.com/apache/spark/pull/19560#discussion_r146449741
--- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala ---
@@ -120,22 +120,41 @@ class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] {
         if DDLUtils.isHiveTable(relation.tableMeta) && relation.tableMeta.stats.isEmpty =>
       val table = relation.tableMeta
       val sizeInBytes = if (session.sessionState.conf.fallBackToHdfsForStatsEnabled) {
-        try {
-          val hadoopConf = session.sessionState.newHadoopConf()
-          val tablePath = new Path(table.location)
-          val fs: FileSystem = tablePath.getFileSystem(hadoopConf)
-          fs.getContentSummary(tablePath).getLength
-        } catch {
-          case e: IOException =>
-            logWarning("Failed to get table size from hdfs.", e)
-            session.sessionState.conf.defaultSizeInBytes
-        }
+        getSizeFromHdfs(table.location)
       } else {
         session.sessionState.conf.defaultSizeInBytes
       }
       val withStats = table.copy(stats = Some(CatalogStatistics(sizeInBytes = BigInt(sizeInBytes))))
       relation.copy(tableMeta = withStats)
+
+    case relation: HiveTableRelation
+        if DDLUtils.isHiveTable(relation.tableMeta) && relation.tableMeta.stats.nonEmpty &&
+          session.sessionState.conf.verifyStatsFromFileSystemWhenBroadcastJoin &&
+          relation.tableMeta.stats.get.sizeInBytes < session.sessionState.conf.autoBroadcastJoinThreshold =>
+      val table = relation.tableMeta
+      val sizeInBytes = getSizeFromHdfs(table.location)
--- End diff ---
Yes, I think it's a good idea.
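
For context, a minimal sketch of what the extracted getSizeFromHdfs helper
could look like, assuming it simply factors out the try/catch logic deleted
in the hunk above. The helper's actual body and signature are not part of
this diff, so the name, parameter type, and placement here are assumptions:

    // Hypothetical placement: a private method on DetermineTableStats, so
    // `session` and `logWarning` (via Logging) are in scope. It relies on
    // the imports the old inline code already used:
    //   java.io.IOException, org.apache.hadoop.fs.{FileSystem, Path}
    private def getSizeFromHdfs(locationUri: java.net.URI): Long = {
      try {
        // Ask the table's filesystem for its total content size in bytes.
        val hadoopConf = session.sessionState.newHadoopConf()
        val tablePath = new Path(locationUri)
        val fs: FileSystem = tablePath.getFileSystem(hadoopConf)
        fs.getContentSummary(tablePath).getLength
      } catch {
        case e: IOException =>
          // On failure, fall back to the conservative default size so the
          // relation is not mistakenly considered small enough to broadcast.
          logWarning("Failed to get table size from hdfs.", e)
          session.sessionState.conf.defaultSizeInBytes
      }
    }

Sharing one helper keeps the new verify-before-broadcast case and the
existing stats-fallback case on identical sizing and error-handling logic.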
---