This is an automated email from the ASF dual-hosted git repository.

codope pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new b01e3f9053d [HUDI-6477] Lazy fetching partition path & file slice when refresh in HoodieFileIndex (#9122)
b01e3f9053d is described below

commit b01e3f9053d0add8f4dd97a6ded9f95454e08e20
Author: Zouxxyy <[email protected]>
AuthorDate: Wed Jul 5 20:22:54 2023 +0800

    [HUDI-6477] Lazy fetching partition path & file slice when refresh in HoodieFileIndex (#9122)
---
 .../org/apache/hudi/BaseHoodieTableFileIndex.java  | 25 +++++++++++-----------
 .../org/apache/hudi/TestHoodieFileIndex.scala      | 16 +++++++-------
 2 files changed, 20 insertions(+), 21 deletions(-)

diff --git 
a/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java 
b/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java
index 3eda336ca35..c5adafa38e2 100644
--- a/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java
+++ b/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java
@@ -84,6 +84,14 @@ public abstract class BaseHoodieTableFileIndex implements 
AutoCloseable {
 
   private final boolean shouldIncludePendingCommits;
   private final boolean shouldValidateInstant;
+
+  // The `shouldListLazily` variable controls how we initialize/refresh the TableFileIndex:
+  //  - non-lazy/eager listing (shouldListLazily=false):  all partitions and file slices will be loaded eagerly during initialization.
+  //  - lazy listing (shouldListLazily=true): partitions listing will be done lazily with the knowledge from query predicate on partition
+  //        columns. And file slices fetching only happens for partitions satisfying the given filter.
+  //
+  // In SparkSQL, `shouldListLazily` is controlled by option `REFRESH_PARTITION_AND_FILES_IN_INITIALIZATION`.
+  // In lazy listing case, if no predicate on partition is provided, all partitions will still be loaded.
   private final boolean shouldListLazily;
 
   private final Path basePath;
@@ -144,18 +152,7 @@ public abstract class BaseHoodieTableFileIndex implements 
AutoCloseable {
     this.engineContext = engineContext;
     this.fileStatusCache = fileStatusCache;
 
-    // The `shouldListLazily` variable controls how we initialize the TableFileIndex:
-    //  - non-lazy/eager listing (shouldListLazily=false):  all partitions and file slices will be loaded eagerly during initialization.
-    //  - lazy listing (shouldListLazily=true): partitions listing will be done lazily with the knowledge from query predicate on partition
-    //        columns. And file slices fetching only happens for partitions satisfying the given filter.
-    //
-    // In SparkSQL, `shouldListLazily` is controlled by option `REFRESH_PARTITION_AND_FILES_IN_INITIALIZATION`.
-    // In lazy listing case, if no predicate on partition is provided, all partitions will still be loaded.
-    if (shouldListLazily) {
-      this.tableMetadata = createMetadataTable(engineContext, metadataConfig, 
basePath);
-    } else {
-      doRefresh();
-    }
+    doRefresh();
   }
 
   protected abstract Object[] doParsePartitionColumnValues(String[] 
partitionColumns, String partitionPath);
@@ -378,7 +375,9 @@ public abstract class BaseHoodieTableFileIndex implements 
AutoCloseable {
 
     // Reset it to null to trigger re-loading of all partition path
     this.cachedAllPartitionPaths = null;
-    ensurePreloadedPartitions(getAllQueryPartitionPaths());
+    if (!shouldListLazily) {
+      ensurePreloadedPartitions(getAllQueryPartitionPaths());
+    }
 
     LOG.info(String.format("Refresh table %s, spent: %d ms", 
metaClient.getTableConfig().getTableName(), timer.endTimer()));
   }
diff --git 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala
 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala
index c1db55c5822..f8f082489ce 100644
--- 
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala
+++ 
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala
@@ -319,18 +319,18 @@ class TestHoodieFileIndex extends 
HoodieSparkClientTestBase with ScalaAssertionS
         .mode(SaveMode.Overwrite)
         .save(basePath)
 
+      fileIndex.refresh()
+
+      val partitionFilter2 = And(
+        EqualTo(attribute("dt"), literal("2021/03/01")),
+        EqualTo(attribute("hh"), literal("10"))
+      )
+
       // NOTE: That if file-index is in lazy-listing mode and we can't parse partition values, there's no way
       //       to recover from this since Spark by default have to inject partition values parsed from the partition paths.
       if (listingModeOverride == 
DataSourceReadOptions.FILE_INDEX_LISTING_MODE_LAZY) {
-        assertThrows(classOf[HoodieException]) { fileIndex.refresh() }
+        assertThrows(classOf[HoodieException]) { 
fileIndex.listFiles(Seq(partitionFilter2), Seq.empty) }
       } else {
-        fileIndex.refresh()
-
-        val partitionFilter2 = And(
-          EqualTo(attribute("dt"), literal("2021/03/01")),
-          EqualTo(attribute("hh"), literal("10"))
-        )
-
         val partitionAndFilesNoPruning = 
fileIndex.listFiles(Seq(partitionFilter2), Seq.empty)
 
         assertEquals(1, partitionAndFilesNoPruning.size)

Reply via email to