Github user wangyum commented on the issue:
https://github.com/apache/spark/pull/22743
This happens when a table's `LogicalRelation` has been cached: changing
`spark.sql.statistics.fallBackToHdfs` or `spark.sql.defaultSizeInBytes`
afterwards has no effect on the stats, because the stats already cached in
the `LogicalRelation` are always used. Here is an example:
```scala
import org.apache.spark.sql.catalyst.QualifiedTableName
import org.apache.spark.sql.catalyst.catalog.SessionCatalog
import org.apache.spark.sql.execution.datasources.LogicalRelation
// Reproduction: the stats read from the cached relation do not change after
// spark.sql.statistics.fallBackToHdfs is set, until the table is refreshed.
// NOTE(review): `cachedRelation` is declared three times below; the `resN`
// comments suggest this was run in a REPL (e.g. spark-shell), where
// redefinition is allowed.

// Step 1: create a Parquet table, insert one row, and refresh it.
spark.sql("CREATE TABLE t1 (c1 bigint) STORED AS PARQUET")
spark.sql("INSERT INTO TABLE t1 VALUES (1)")
spark.sql("REFRESH TABLE t1")
// Build the key (current database + table name) used for the cache lookups.
val catalog = spark.sessionState.catalog
val qualifiedTableName = QualifiedTableName(catalog.getCurrentDatabase,
"t1")
// Step 2: query the table (presumably this populates the catalog's relation
// cache), then read sizeInBytes from the cached relation's catalog stats.
spark.sql("SELECT * from t1").collect()
val cachedRelation = catalog.getCachedTable(qualifiedTableName)
cachedRelation.asInstanceOf[LogicalRelation].catalogTable.get.stats.get.sizeInBytes
// res4: BigInt = 9223372036854775807
// (9223372036854775807 is Long.MaxValue.)
// Step 3: enable the HDFS fallback, query again, and re-read the cached stats.
spark.sql("set spark.sql.statistics.fallBackToHdfs=true")
spark.sql("SELECT * from t1").collect()
val cachedRelation = catalog.getCachedTable(qualifiedTableName)
cachedRelation.asInstanceOf[LogicalRelation].catalogTable.get.stats.get.sizeInBytes
// res7: BigInt = 9223372036854775807
// Expected: the size is computed from the file system, but it is still
// 9223372036854775807.
// Step 4: refresh the table, query again, and re-read the cached stats.
spark.sql("REFRESH TABLE t1")
spark.sql("SELECT * from t1").collect()
val cachedRelation = catalog.getCachedTable(qualifiedTableName)
cachedRelation.asInstanceOf[LogicalRelation].catalogTable.get.stats.get.sizeInBytes
// res10: BigInt = 708
// After the table is refreshed, the reported size is correct.
```
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]