This is an automated email from the ASF dual-hosted git repository.
danny0405 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 32650ef6995 [MINOR] Remove repeated look up candidate (#11569)
32650ef6995 is described below
commit 32650ef69953d8187f689cb1ac8ceea11c7e7486
Author: KnightChess <[email protected]>
AuthorDate: Mon Jul 15 16:10:19 2024 +0800
[MINOR] Remove repeated look up candidate (#11569)
---
.../src/main/scala/org/apache/hudi/HoodieFileIndex.scala | 9 +++++----
.../sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala | 2 +-
2 files changed, 6 insertions(+), 5 deletions(-)
diff --git
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
index ccd9c597c51..8e4a3db8cba 100644
---
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
+++
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
@@ -220,17 +220,20 @@ case class HoodieFileIndex(spark: SparkSession,
*
* @param dataFilters data columns filters
* @param partitionFilters partition column filters
+ * @param isPartitionPruned whether this call comes from the
HoodiePruneFileSourcePartitions rule and should only prune partitions
* @return A sequence of pruned partitions and corresponding filtered file
slices
*/
- def filterFileSlices(dataFilters: Seq[Expression], partitionFilters:
Seq[Expression])
+ def filterFileSlices(dataFilters: Seq[Expression], partitionFilters:
Seq[Expression], isPartitionPruned: Boolean = false)
: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])] = {
val (isPruned, prunedPartitionsAndFileSlices) =
prunePartitionsAndGetFileSlices(dataFilters, partitionFilters)
+ hasPushedDownPartitionPredicates = true
// If there are no data filters, return all the file slices.
+ // If isPartitionPruned is true, this function is triggered by
HoodiePruneFileSourcePartitions; don't look up candidate files
// If there are no file slices, return empty list.
- if (prunedPartitionsAndFileSlices.isEmpty || dataFilters.isEmpty) {
+ if (prunedPartitionsAndFileSlices.isEmpty || dataFilters.isEmpty ||
isPartitionPruned) {
prunedPartitionsAndFileSlices
} else {
// Look up candidate files names in the col-stats or record level index,
if all of the following conditions are true
@@ -284,8 +287,6 @@ case class HoodieFileIndex(spark: SparkSession,
s"candidate file slices after data skipping: $candidateFileSliceSize;
" +
s"skipping percentage $skippingRatio")
- hasPushedDownPartitionPredicates = true
-
prunedPartitionsAndFilteredFileSlices
}
}
diff --git
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala
index 08b5685f46a..753af2baa28 100644
---
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala
+++
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala
@@ -52,7 +52,7 @@ case class HoodiePruneFileSourcePartitions(spark:
SparkSession) extends Rule[Log
// [[HudiFileIndex]] is a caching one, therefore we don't need to
reconstruct new relation,
// instead we simply just refresh the index and update the stats
- fileIndex.listFiles(partitionPruningFilters, dataFilters)
+ fileIndex.filterFileSlices(dataFilters, partitionPruningFilters,
isPartitionPruned = true)
if (partitionPruningFilters.nonEmpty) {
// Change table stats based on the sizeInBytes of pruned files