Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/19932#discussion_r156395848
--- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala ---
@@ -1021,8 +998,38 @@ private[hive] object HiveClientImpl {
compressed = apiPartition.getSd.isCompressed,
properties = Option(apiPartition.getSd.getSerdeInfo.getParameters)
.map(_.asScala.toMap).orNull),
- parameters =
-   if (hp.getParameters() != null) hp.getParameters().asScala.toMap else Map.empty)
+ parameters = properties,
+ stats = readHiveStats(properties))
+ }
+
+ /**
+  * Reads statistics from Hive.
+  * Note that these statistics could be overridden by Spark's statistics if available.
+  */
+ private def readHiveStats(properties: Map[String, String]): Option[CatalogStatistics] = {
+   val totalSize = properties.get(StatsSetupConst.TOTAL_SIZE).map(BigInt(_))
+   val rawDataSize = properties.get(StatsSetupConst.RAW_DATA_SIZE).map(BigInt(_))
+   val rowCount = properties.get(StatsSetupConst.ROW_COUNT).map(BigInt(_))
+   // TODO: check if this estimate is valid for tables after partition pruning.
--- End diff ---
do we still need this TODO?
---
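Editorial note: the quoted diff is cut off at the TODO, so for context here is a minimal, self-contained Scala sketch of the kind of fall-back logic such a helper typically applies (prefer totalSize, fall back to rawDataSize, otherwise report no statistics). CatalogStatistics and StatsKeys below are simplified stand-ins, not Spark's actual classes; the key strings are the values of Hive's StatsSetupConst constants.

// Editorial sketch only -- simplified stand-ins for Spark's CatalogStatistics
// and Hive's StatsSetupConst; not the actual implementation under review.
case class CatalogStatistics(sizeInBytes: BigInt, rowCount: Option[BigInt] = None)

object StatsKeys {
  val TOTAL_SIZE = "totalSize"      // StatsSetupConst.TOTAL_SIZE
  val RAW_DATA_SIZE = "rawDataSize" // StatsSetupConst.RAW_DATA_SIZE
  val ROW_COUNT = "numRows"         // StatsSetupConst.ROW_COUNT
}

object HiveStatsSketch {
  // Prefer totalSize; if it is missing or non-positive, fall back to
  // rawDataSize; if neither is usable, report no statistics at all.
  def readHiveStats(properties: Map[String, String]): Option[CatalogStatistics] = {
    val totalSize = properties.get(StatsKeys.TOTAL_SIZE).map(BigInt(_))
    val rawDataSize = properties.get(StatsKeys.RAW_DATA_SIZE).map(BigInt(_))
    val rowCount = properties.get(StatsKeys.ROW_COUNT).map(BigInt(_))
    totalSize.filter(_ > 0)
      .orElse(rawDataSize.filter(_ > 0))
      .map(size => CatalogStatistics(size, rowCount.filter(_ > 0)))
  }

  def main(args: Array[String]): Unit = {
    println(readHiveStats(Map("totalSize" -> "1024", "numRows" -> "10")))
    // Some(CatalogStatistics(1024,Some(10)))
    println(readHiveStats(Map("totalSize" -> "0", "rawDataSize" -> "2048")))
    // Some(CatalogStatistics(2048,None))
    println(readHiveStats(Map.empty)) // None
  }
}

One common motivation for such a fall-back is that Hive can report totalSize as zero for external tables, which would otherwise skew size-based planning decisions such as join-strategy selection.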
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]