Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/19932#discussion_r156396216
--- Diff:
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
---
@@ -413,32 +413,7 @@ private[hive] class HiveClientImpl(
case (key, _) => excludedTableProperties.contains(key)
}
val comment = properties.get("comment")
-
- // Here we are reading statistics from Hive.
- // Note that these statistics could be overridden by Spark's
statistics if those are available.
- val totalSize =
properties.get(StatsSetupConst.TOTAL_SIZE).map(BigInt(_))
- val rawDataSize =
properties.get(StatsSetupConst.RAW_DATA_SIZE).map(BigInt(_))
- val rowCount =
properties.get(StatsSetupConst.ROW_COUNT).map(BigInt(_))
- // TODO: check if this estimate is valid for tables after partition
pruning.
- // NOTE: getting `totalSize` directly from params is kind of hacky,
but this should be
- // relatively cheap if parameters for the table are populated into
the metastore.
- // Currently, only totalSize, rawDataSize, and rowCount are used to
build the field `stats`
- // TODO: stats should also include the other two fields (`numFiles`
and `numPartitions`).
- // (see StatsSetupConst in Hive)
- val stats =
- // When table is external, `totalSize` is always zero, which will
influence join strategy.
- // So when `totalSize` is zero, use `rawDataSize` instead. When
`rawDataSize` is also zero,
- // return None.
- // In Hive, when statistics gathering is disabled, `rawDataSize`
and `numRows` are always
- // zero after INSERT command. So they are used here only if they
are larger than zero.
- if (totalSize.isDefined && totalSize.get > 0L) {
- Some(CatalogStatistics(sizeInBytes = totalSize.get, rowCount =
rowCount.filter(_ > 0)))
- } else if (rawDataSize.isDefined && rawDataSize.get > 0) {
- Some(CatalogStatistics(sizeInBytes = rawDataSize.get, rowCount =
rowCount.filter(_ > 0)))
- } else {
- // TODO: still fill the rowCount even if sizeInBytes is empty.
Might this break anything?
- None
- }
+ val hiveStats = readHiveStats(properties)
--- End diff --
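nit: we can inline it, i.e. use `readHiveStats(properties)` directly where `hiveStats` is used instead of binding a local val.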
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]