This is an automated email from the ASF dual-hosted git repository.
sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 4c9dcb32bd58 [MINOR] Preload file listing for partitions in BloomIndex to avoid repeated listings (#17462)
4c9dcb32bd58 is described below
commit 4c9dcb32bd581634ab9ff25c2e8a705f450cc1cc
Author: Prashant Wason <[email protected]>
AuthorDate: Sun Feb 8 14:49:15 2026 -0800
[MINOR] Preload file listing for partitions in BloomIndex to avoid repeated listings (#17462)
* [MINOR] Preload the file listing for each partition into the
FileSystemView so that executors do not have to perform the listing again.
When config.getBloomIndexPruneByRanges() is enabled, the latest parquet
file for each fileID is opened to read the footers for min/max record keys.
Finding the latest parquet file requires listing the partition, which would
otherwise be performed many times. Preloading caches the file listing of the
partition in the FileSystemView, which is then serialized and sent to the
executors. (A sketch of this pattern follows the sign-offs below.)
* Only preload file listings when embedded timeline server is disabled
Addresses review comment: confine the preloading optimization to run only
when TLS is disabled, since TLS already caches file listings.
Co-Authored-By: Claude (claude-opus-4-5) <[email protected]>
* Remove unnecessary sync() call and add test for partition preloading
- Removed sync() call from getBloomIndexFileInfoForPartitions() as the
FileSystemView is already fresh for new operations
- Added test testPreloadPartitionsWhenTimelineServerDisabled to verify
preloading behavior when embedded timeline server is disabled
Co-Authored-By: Claude Opus 4.5 <[email protected]>
---------
Co-authored-by: Claude (claude-opus-4-5) <[email protected]>
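For readers skimming the message without the diff, here is a minimal sketch of the
preloading pattern described above. It mirrors the lines added to HoodieBloomIndex
below but is not the committed code: the helper class and method are illustrative,
and HoodieTable is used as a raw type only to keep the sketch independent of its
type parameters. Only isEmbeddedTimelineServerEnabled(), getBaseFileOnlyView() and
getAllBaseFiles() are taken from the actual change.

    import java.util.List;

    import org.apache.hudi.config.HoodieWriteConfig;
    import org.apache.hudi.table.HoodieTable;

    final class BloomIndexPreloadSketch {

      // Warm the driver-side FileSystemView with the base-file listing of every
      // affected partition, so the view that is serialized to executors already
      // holds the listings and executor tasks do not list the partitions again.
      @SuppressWarnings("rawtypes")
      static void preloadPartitions(HoodieWriteConfig config,
                                    HoodieTable hoodieTable,
                                    List<String> affectedPartitionPathList) {
        // Only needed when the embedded timeline server is disabled; with the
        // timeline server enabled, file listings are already cached and served
        // from the driver, so there is nothing to preload.
        if (!config.isEmbeddedTimelineServerEnabled()) {
          affectedPartitionPathList.forEach(
              partition -> hoodieTable.getBaseFileOnlyView().getAllBaseFiles(partition));
        }
      }
    }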
---
.../apache/hudi/index/bloom/HoodieBloomIndex.java | 6 +++
.../hudi/index/bloom/TestHoodieBloomIndex.java | 60 ++++++++++++++++++++++
2 files changed, 66 insertions(+)
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java
index f0dbf3276ec3..ca72304b6215 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java
@@ -136,6 +136,12 @@ public class HoodieBloomIndex extends HoodieIndex<Object, Object> {
List<String> affectedPartitionPathList) {
List<Pair<String, BloomIndexFileInfo>> fileInfoList = new ArrayList<>();
+    // Preload the partitions so that each parallel op does not have to perform listing.
+    // This is only needed when the embedded timeline server is not enabled, as TLS caches file listings.
+    if (!config.isEmbeddedTimelineServerEnabled()) {
+      affectedPartitionPathList.forEach(partition -> hoodieTable.getBaseFileOnlyView().getAllBaseFiles(partition));
+    }
+
if (config.getBloomIndexPruneByRanges()) {
// load column ranges from metadata index if column stats index is enabled and column_stats metadata partition is available
if (config.getBloomIndexUseMetadata()
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java
index 995893e413cd..3489cfbbf6a2 100644
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java
@@ -659,6 +659,66 @@ public class TestHoodieBloomIndex extends TestHoodieMetadataBase {
}
}
+  @Test
+  public void testPreloadPartitionsWhenTimelineServerDisabled() throws Exception {
+    // This test verifies that when TLS is disabled, file listings are preloaded
+    // to avoid redundant listings by parallel operations
+    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
+        .withEmbeddedTimelineServerEnabled(false)
+        .withIndexConfig(HoodieIndexConfig.newBuilder()
+            .bloomIndexPruneByRanges(true)
+            .bloomIndexUseMetadata(false)
+            .build())
+        .build();
+
+    HoodieBloomIndex index = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance());
+    metadataWriter = SparkHoodieBackedTableMetadataWriter.create(storageConf, config, context);
+    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter, Option.of(context));
+
+    // Create partitions with files
+    final String partition1 = "2016/01/31";
+    final String partition2 = "2015/01/31";
+    testTable.withPartitionMetaFiles(partition1, partition2);
+
+    HoodieRecord record1 = createSimpleRecord("000", "2016-01-31T03:16:41.415Z", 12);
+    HoodieRecord record2 = createSimpleRecord("001", "2015-01-31T03:16:41.415Z", 15);
+
+    final Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
+    String commitTime = "20160131010101";
+    StoragePath baseFilePath = testTable.forCommit(commitTime)
+        .withInserts(partition1, "file1", Collections.singletonList(record1));
+    long baseFileLength = storage.getPathInfo(baseFilePath).getLength();
+    partitionToFilesNameLengthMap.put(partition1,
+        Collections.singletonList(Pair.of("file1", (int) baseFileLength)));
+    testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT,
+        Collections.singletonList(partition1), partitionToFilesNameLengthMap, false, false);
+
+    commitTime = "20150131010101";
+    partitionToFilesNameLengthMap.clear();
+    baseFilePath = testTable.forCommit(commitTime)
+        .withInserts(partition2, "file2", Collections.singletonList(record2));
+    baseFileLength = storage.getPathInfo(baseFilePath).getLength();
+    partitionToFilesNameLengthMap.put(partition2,
+        Collections.singletonList(Pair.of("file2", (int) baseFileLength)));
+    testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT,
+        Collections.singletonList(partition2), partitionToFilesNameLengthMap, false, false);
+
+    metaClient = HoodieTableMetaClient.reload(metaClient);
+    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
+
+    // Load column ranges - this should work correctly with preloading
+    List<String> partitions = Arrays.asList(partition1, partition2);
+    List<Pair<String, BloomIndexFileInfo>> filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable);
+
+    // Verify files were loaded for both partitions
+    assertEquals(2, filesList.size());
+    Set<String> loadedPartitions = filesList.stream()
+        .map(Pair::getLeft)
+        .collect(Collectors.toSet());
+    assertTrue(loadedPartitions.contains(partition1));
+    assertTrue(loadedPartitions.contains(partition2));
+  }
+
private static String genRandomUUID() {
return genPseudoRandomUUID(RANDOM).toString();
}