fuwhu commented on a change in pull request #25919: [SPARK-15616][SQL] Hive
table supports partition pruning in JoinSelection
URL: https://github.com/apache/spark/pull/25919#discussion_r352461294
##########
File path:
sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
##########
@@ -231,6 +232,68 @@ case class RelationConversions(
}
}
+/**
+ * TODO: merge this with PruneFileSourcePartitions after we completely make
hive as a data source.
+ */
+case class PruneHiveTablePartitions(
+ session: SparkSession) extends Rule[LogicalPlan] with PredicateHelper {
+ override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
+ case op @ PhysicalOperation(projections, predicates, relation:
HiveTableRelation)
+ if predicates.nonEmpty && relation.isPartitioned &&
relation.prunedPartitions.isEmpty =>
+ val normalizedFilters = predicates.map { e =>
+ e transform {
+ case a: AttributeReference =>
+ a.withName(relation.output.find(_.semanticEquals(a)).get.name)
+ }
+ }
+ val partitionSet = AttributeSet(relation.partitionCols)
+ val pruningPredicates = normalizedFilters.filter { predicate =>
+ !predicate.references.isEmpty &&
predicate.references.subsetOf(partitionSet)
+ }
+ // SPARK-24085: scalar subquery should be skipped for partition pruning
+ val hasScalarSubquery =
pruningPredicates.exists(SubqueryExpression.hasSubquery)
+ val conf = session.sessionState.conf
+ if (conf.metastorePartitionPruning && pruningPredicates.nonEmpty &&
!hasScalarSubquery) {
+ val prunedPartitions =
session.sharedState.externalCatalog.listPartitionsByFilter(
+ relation.tableMeta.database,
+ relation.tableMeta.identifier.table,
+ pruningPredicates,
+ conf.sessionLocalTimeZone)
+ val sizeInBytes = try {
+ val sizeOfPartitions = prunedPartitions.map { part =>
+ val rawDataSize =
part.parameters.get(StatsSetupConst.RAW_DATA_SIZE).map(_.toLong)
+ val totalSize =
part.parameters.get(StatsSetupConst.TOTAL_SIZE).map(_.toLong)
+ if (rawDataSize.isDefined && rawDataSize.get > 0) {
+ rawDataSize.get
+ } else if (totalSize.isDefined && totalSize.get > 0L) {
+ totalSize.get
+ } else if (conf.fallBackToHdfsForStatsEnabled) {
Review comment:
Per the documentation for the conf "spark.sql.statistics.fallBackToHdfs", this
flag applies only to non-partitioned Hive tables:
"This flag is effective only for non-partitioned Hive tables."
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]