This is an automated email from the ASF dual-hosted git repository.
codope pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new b01e3f9053d [HUDI-6477] Lazy fetching partition path & file slice when
refresh in HoodieFileIndex (#9122)
b01e3f9053d is described below
commit b01e3f9053d0add8f4dd97a6ded9f95454e08e20
Author: Zouxxyy <[email protected]>
AuthorDate: Wed Jul 5 20:22:54 2023 +0800
[HUDI-6477] Lazy fetching partition path & file slice when refresh in
HoodieFileIndex (#9122)
---
.../org/apache/hudi/BaseHoodieTableFileIndex.java | 25 +++++++++++-----------
.../org/apache/hudi/TestHoodieFileIndex.scala | 16 +++++++-------
2 files changed, 20 insertions(+), 21 deletions(-)
diff --git a/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java b/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java
index 3eda336ca35..c5adafa38e2 100644
--- a/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java
+++ b/hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java
@@ -84,6 +84,14 @@ public abstract class BaseHoodieTableFileIndex implements
AutoCloseable {
private final boolean shouldIncludePendingCommits;
private final boolean shouldValidateInstant;
+
+ // The `shouldListLazily` variable controls how we initialize/refresh the TableFileIndex:
+ // - non-lazy/eager listing (shouldListLazily=false): all partitions and file slices will be loaded eagerly during initialization.
+ // - lazy listing (shouldListLazily=true): partitions listing will be done lazily with the knowledge from query predicate on partition
+ // columns. And file slices fetching only happens for partitions satisfying the given filter.
+ //
+ // In SparkSQL, `shouldListLazily` is controlled by option `REFRESH_PARTITION_AND_FILES_IN_INITIALIZATION`.
+ // In lazy listing case, if no predicate on partition is provided, all partitions will still be loaded.
private final boolean shouldListLazily;
private final Path basePath;
@@ -144,18 +152,7 @@ public abstract class BaseHoodieTableFileIndex implements
AutoCloseable {
this.engineContext = engineContext;
this.fileStatusCache = fileStatusCache;
- // The `shouldListLazily` variable controls how we initialize the TableFileIndex:
- // - non-lazy/eager listing (shouldListLazily=false): all partitions and file slices will be loaded eagerly during initialization.
- // - lazy listing (shouldListLazily=true): partitions listing will be done lazily with the knowledge from query predicate on partition
- // columns. And file slices fetching only happens for partitions satisfying the given filter.
- //
- // In SparkSQL, `shouldListLazily` is controlled by option `REFRESH_PARTITION_AND_FILES_IN_INITIALIZATION`.
- // In lazy listing case, if no predicate on partition is provided, all partitions will still be loaded.
- if (shouldListLazily) {
- this.tableMetadata = createMetadataTable(engineContext, metadataConfig,
basePath);
- } else {
- doRefresh();
- }
+ doRefresh();
}
protected abstract Object[] doParsePartitionColumnValues(String[]
partitionColumns, String partitionPath);
@@ -378,7 +375,9 @@ public abstract class BaseHoodieTableFileIndex implements
AutoCloseable {
// Reset it to null to trigger re-loading of all partition path
this.cachedAllPartitionPaths = null;
- ensurePreloadedPartitions(getAllQueryPartitionPaths());
+ if (!shouldListLazily) {
+ ensurePreloadedPartitions(getAllQueryPartitionPaths());
+ }
LOG.info(String.format("Refresh table %s, spent: %d ms",
metaClient.getTableConfig().getTableName(), timer.endTimer()));
}
diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala
index c1db55c5822..f8f082489ce 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieFileIndex.scala
@@ -319,18 +319,18 @@ class TestHoodieFileIndex extends
HoodieSparkClientTestBase with ScalaAssertionS
.mode(SaveMode.Overwrite)
.save(basePath)
+ fileIndex.refresh()
+
+ val partitionFilter2 = And(
+ EqualTo(attribute("dt"), literal("2021/03/01")),
+ EqualTo(attribute("hh"), literal("10"))
+ )
+
// NOTE: That if file-index is in lazy-listing mode and we can't parse partition values, there's no way
// to recover from this since Spark by default have to inject partition values parsed from the partition paths.
if (listingModeOverride ==
DataSourceReadOptions.FILE_INDEX_LISTING_MODE_LAZY) {
- assertThrows(classOf[HoodieException]) { fileIndex.refresh() }
+ assertThrows(classOf[HoodieException]) {
fileIndex.listFiles(Seq(partitionFilter2), Seq.empty) }
} else {
- fileIndex.refresh()
-
- val partitionFilter2 = And(
- EqualTo(attribute("dt"), literal("2021/03/01")),
- EqualTo(attribute("hh"), literal("10"))
- )
-
val partitionAndFilesNoPruning =
fileIndex.listFiles(Seq(partitionFilter2), Seq.empty)
assertEquals(1, partitionAndFilesNoPruning.size)