This is an automated email from the ASF dual-hosted git repository.
danny0405 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 32650ef6995 [MINOR] Remove repeated look up candidate (#11569)
32650ef6995 is described below
commit 32650ef69953d8187f689cb1ac8ceea11c7e7486
Author: KnightChess <[email protected]>
AuthorDate: Mon Jul 15 16:10:19 2024 +0800
[MINOR] Remove repeated look up candidate (#11569)
---
.../src/main/scala/org/apache/hudi/HoodieFileIndex.scala | 9 +++++----
.../sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala | 2 +-
2 files changed, 6 insertions(+), 5 deletions(-)
diff --git
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
index ccd9c597c51..8e4a3db8cba 100644
---
a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
+++
b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
@@ -220,17 +220,20 @@ case class HoodieFileIndex(spark: SparkSession,
*
* @param dataFilters data columns filters
* @param partitionFilters partition column filters
+ * @param isPartitionPruned whether this call comes from the
HoodiePruneFileSourcePartitions rule and should only prune partitions
* @return A sequence of pruned partitions and corresponding filtered file
slices
*/
- def filterFileSlices(dataFilters: Seq[Expression], partitionFilters:
Seq[Expression])
+ def filterFileSlices(dataFilters: Seq[Expression], partitionFilters:
Seq[Expression], isPartitionPruned: Boolean = false)
: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])] = {
val (isPruned, prunedPartitionsAndFileSlices) =
prunePartitionsAndGetFileSlices(dataFilters, partitionFilters)
+ hasPushedDownPartitionPredicates = true
// If there are no data filters, return all the file slices.
+ // If isPartitionPruned is true, this function is triggered by
HoodiePruneFileSourcePartitions; don't look up candidate files
// If there are no file slices, return empty list.
- if (prunedPartitionsAndFileSlices.isEmpty || dataFilters.isEmpty) {
+ if (prunedPartitionsAndFileSlices.isEmpty || dataFilters.isEmpty ||
isPartitionPruned) {
prunedPartitionsAndFileSlices
} else {
// Look up candidate files names in the col-stats or record level index,
if all of the following conditions are true
@@ -284,8 +287,6 @@ case class HoodieFileIndex(spark: SparkSession,
s"candidate file slices after data skipping: $candidateFileSliceSize;
" +
s"skipping percentage $skippingRatio")
- hasPushedDownPartitionPredicates = true
-
prunedPartitionsAndFilteredFileSlices
}
}
diff --git
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala
index 08b5685f46a..753af2baa28 100644
---
a/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala
+++
b/hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/analysis/HoodiePruneFileSourcePartitions.scala
@@ -52,7 +52,7 @@ case class HoodiePruneFileSourcePartitions(spark:
SparkSession) extends Rule[Log
// [[HudiFileIndex]] is a caching one, therefore we don't need to
reconstruct new relation,
// instead we simply just refresh the index and update the stats
- fileIndex.listFiles(partitionPruningFilters, dataFilters)
+ fileIndex.filterFileSlices(dataFilters, partitionPruningFilters,
isPartitionPruned = true)
if (partitionPruningFilters.nonEmpty) {
// Change table stats based on the sizeInBytes of pruned files