This is an automated email from the ASF dual-hosted git repository.
sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 4c9dcb32bd58 [MINOR] Preload file listing for partitions in BloomIndex to avoid repeated listings (#17462)
4c9dcb32bd58 is described below
commit 4c9dcb32bd581634ab9ff25c2e8a705f450cc1cc
Author: Prashant Wason <[email protected]>
AuthorDate: Sun Feb 8 14:49:15 2026 -0800
[MINOR] Preload file listing for partitions in BloomIndex to avoid repeated listings (#17462)
* [MINOR] Preload the file listing for each partition into the
FileSystemView so that executors do not have to perform the listing again.
When config.getBloomIndexPruneByRanges() is enabled, the latest parquet
file for each fileID is opened to read the footers for min/max record keys.
Finding the latest parquet file requires listing the partition, which would
otherwise be performed many times. Preloading caches the file listing of the
partition in the FileSystemView, which is then serialized and sent to the
executors. (A sketch of this pattern follows the sign-offs below.)
* Only preload file listings when embedded timeline server is disabled
Addresses review comment: confine the preloading optimization to run only
when TLS is disabled, since TLS already caches file listings.
Co-Authored-By: Claude (claude-opus-4-5) <[email protected]>
* Remove unnecessary sync() call and add test for partition preloading
- Removed sync() call from getBloomIndexFileInfoForPartitions() as the
FileSystemView is already fresh for new operations
- Added test testPreloadPartitionsWhenTimelineServerDisabled to verify
preloading behavior when embedded timeline server is disabled
Co-Authored-By: Claude Opus 4.5 <[email protected]>
---------
Co-authored-by: Claude (claude-opus-4-5) <[email protected]>
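For readers skimming the message without the diff, here is a minimal sketch of the
preloading pattern described above. It mirrors the lines added to HoodieBloomIndex
below but is not the committed code: the helper class and method are illustrative,
and HoodieTable is used as a raw type only to keep the sketch independent of its
type parameters. Only isEmbeddedTimelineServerEnabled(), getBaseFileOnlyView() and
getAllBaseFiles() are taken from the actual change.

    import java.util.List;

    import org.apache.hudi.config.HoodieWriteConfig;
    import org.apache.hudi.table.HoodieTable;

    final class BloomIndexPreloadSketch {

      // Warm the driver-side FileSystemView with the base-file listing of every
      // affected partition, so the view that is serialized to executors already
      // holds the listings and executor tasks do not list the partitions again.
      @SuppressWarnings("rawtypes")
      static void preloadPartitions(HoodieWriteConfig config,
                                    HoodieTable hoodieTable,
                                    List<String> affectedPartitionPathList) {
        // Only needed when the embedded timeline server is disabled; with the
        // timeline server enabled, file listings are already cached and served
        // from the driver, so there is nothing to preload.
        if (!config.isEmbeddedTimelineServerEnabled()) {
          affectedPartitionPathList.forEach(
              partition -> hoodieTable.getBaseFileOnlyView().getAllBaseFiles(partition));
        }
      }
    }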
---
.../apache/hudi/index/bloom/HoodieBloomIndex.java | 6 +++
.../hudi/index/bloom/TestHoodieBloomIndex.java | 60 ++++++++++++++++++++++
2 files changed, 66 insertions(+)
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java
index f0dbf3276ec3..ca72304b6215 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java
@@ -136,6 +136,12 @@ public class HoodieBloomIndex extends HoodieIndex<Object, Object> {
List<String> affectedPartitionPathList) {
List<Pair<String, BloomIndexFileInfo>> fileInfoList = new ArrayList<>();
+    // Preload the partitions so that each parallel op does not have to perform listing.
+    // This is only needed when the embedded timeline server is not enabled, as TLS caches file listings.
+    if (!config.isEmbeddedTimelineServerEnabled()) {
+      affectedPartitionPathList.forEach(partition -> hoodieTable.getBaseFileOnlyView().getAllBaseFiles(partition));
+    }
+
if (config.getBloomIndexPruneByRanges()) {
// load column ranges from metadata index if column stats index is enabled and column_stats metadata partition is available
if (config.getBloomIndexUseMetadata()
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java
index 995893e413cd..3489cfbbf6a2 100644
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java
@@ -659,6 +659,66 @@ public class TestHoodieBloomIndex extends TestHoodieMetadataBase {
}
}
+  @Test
+  public void testPreloadPartitionsWhenTimelineServerDisabled() throws Exception {
+    // This test verifies that when TLS is disabled, file listings are preloaded
+    // to avoid redundant listings by parallel operations
+    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
+        .withEmbeddedTimelineServerEnabled(false)
+        .withIndexConfig(HoodieIndexConfig.newBuilder()
+            .bloomIndexPruneByRanges(true)
+            .bloomIndexUseMetadata(false)
+            .build())
+        .build();
+
+    HoodieBloomIndex index = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance());
+    metadataWriter = SparkHoodieBackedTableMetadataWriter.create(storageConf, config, context);
+    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, metadataWriter, Option.of(context));
+
+    // Create partitions with files
+    final String partition1 = "2016/01/31";
+    final String partition2 = "2015/01/31";
+    testTable.withPartitionMetaFiles(partition1, partition2);
+
+    HoodieRecord record1 = createSimpleRecord("000", "2016-01-31T03:16:41.415Z", 12);
+    HoodieRecord record2 = createSimpleRecord("001", "2015-01-31T03:16:41.415Z", 15);
+
+    final Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
+    String commitTime = "20160131010101";
+    StoragePath baseFilePath = testTable.forCommit(commitTime)
+        .withInserts(partition1, "file1", Collections.singletonList(record1));
+    long baseFileLength = storage.getPathInfo(baseFilePath).getLength();
+    partitionToFilesNameLengthMap.put(partition1,
+        Collections.singletonList(Pair.of("file1", (int) baseFileLength)));
+    testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT,
+        Collections.singletonList(partition1), partitionToFilesNameLengthMap, false, false);
+
+    commitTime = "20150131010101";
+    partitionToFilesNameLengthMap.clear();
+    baseFilePath = testTable.forCommit(commitTime)
+        .withInserts(partition2, "file2", Collections.singletonList(record2));
+    baseFileLength = storage.getPathInfo(baseFilePath).getLength();
+    partitionToFilesNameLengthMap.put(partition2,
+        Collections.singletonList(Pair.of("file2", (int) baseFileLength)));
+    testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT,
+        Collections.singletonList(partition2), partitionToFilesNameLengthMap, false, false);
+
+    metaClient = HoodieTableMetaClient.reload(metaClient);
+    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
+
+    // Load column ranges - this should work correctly with preloading
+    List<String> partitions = Arrays.asList(partition1, partition2);
+    List<Pair<String, BloomIndexFileInfo>> filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable);
+
+    // Verify files were loaded for both partitions
+    assertEquals(2, filesList.size());
+    Set<String> loadedPartitions = filesList.stream()
+        .map(Pair::getLeft)
+        .collect(Collectors.toSet());
+    assertTrue(loadedPartitions.contains(partition1));
+    assertTrue(loadedPartitions.contains(partition2));
+  }
+
private static String genRandomUUID() {
return genPseudoRandomUUID(RANDOM).toString();
}