This is an automated email from the ASF dual-hosted git repository.

vhs pushed a commit to branch release-1.0.2
in repository https://gitbox.apache.org/repos/asf/hudi.git

commit c3e9cbc2281ff9ccaa9f64d056fbd3d8d35bc6b4
Author: Lin Liu <[email protected]>
AuthorDate: Mon Apr 21 02:27:40 2025 -0700

    [HUDI-8601] Support multi-level data block index in HFile (#13166)
    
    Co-authored-by: Y Ethan Guo <[email protected]>
    (cherry picked from commit ae50130b5ee5aa79caa9562ebecdcfe03a673593)
---
 .../java/org/apache/hudi/io/hfile/HFileBlock.java  |   4 +
 .../hudi/io/hfile/HFileIntermediateIndexBlock.java |  32 +++
 .../apache/hudi/io/hfile/HFileLeafIndexBlock.java  |  84 +++++++
 .../org/apache/hudi/io/hfile/HFileReaderImpl.java  |  80 +++++-
 .../apache/hudi/io/hfile/HFileRootIndexBlock.java  |  42 ++--
 .../org/apache/hudi/io/hfile/TestHFileReader.java  | 276 +++++++++++++++++++++
 ...2_4_13_1KB_GZ_10000_large_keys_deep_index.hfile | Bin 0 -> 246061 bytes
 ..._1_0_hbase_2_4_13_1KB_GZ_20000_large_keys.hfile | Bin 0 -> 458683 bytes
 ...2_4_13_1KB_GZ_50000_large_keys_deep_index.hfile | Bin 0 -> 1218730 bytes
 9 files changed, 500 insertions(+), 18 deletions(-)

diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java 
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
index 8ad2bf4b97c..1723b482a95 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
@@ -122,6 +122,10 @@ public abstract class HFileBlock {
     switch (blockType) {
       case ROOT_INDEX:
         return new HFileRootIndexBlock(context, byteBuff, startOffsetInBuff);
+      case LEAF_INDEX:
+        return new HFileLeafIndexBlock(context, byteBuff, startOffsetInBuff);
+      case INTERMEDIATE_INDEX:
+        return new HFileIntermediateIndexBlock(context, byteBuff, 
startOffsetInBuff);
       case FILE_INFO:
         return new HFileFileInfoBlock(context, byteBuff, startOffsetInBuff);
       case DATA:
diff --git 
a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileIntermediateIndexBlock.java
 
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileIntermediateIndexBlock.java
new file mode 100644
index 00000000000..89f86f9547c
--- /dev/null
+++ 
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileIntermediateIndexBlock.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.io.hfile;
+
+/**
+ * Represents an {@link HFileBlockType#INTERMEDIATE_INDEX} block of a multi-level
+ * block index. Entry parsing is inherited from {@link HFileLeafIndexBlock}.
+ */
+public class HFileIntermediateIndexBlock extends HFileLeafIndexBlock {
+  protected HFileIntermediateIndexBlock(HFileContext context,
+                                        byte[] byteBuff,
+                                        int startOffsetInBuff) {
+    super(context, HFileBlockType.INTERMEDIATE_INDEX, byteBuff, 
startOffsetInBuff);
+  }
+}
diff --git 
a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileLeafIndexBlock.java 
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileLeafIndexBlock.java
new file mode 100644
index 00000000000..e664145bbc1
--- /dev/null
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileLeafIndexBlock.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.io.hfile;
+
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.apache.hudi.io.util.IOUtils.copy;
+import static org.apache.hudi.io.util.IOUtils.readInt;
+import static org.apache.hudi.io.util.IOUtils.readLong;
+
+/**
+ * Represents a {@link HFileBlockType#LEAF_INDEX} block of a multi-level block index;
+ * also the base class of {@link HFileIntermediateIndexBlock}, which reuses its parsing.
+ */
+public class HFileLeafIndexBlock extends HFileBlock {
+  protected HFileLeafIndexBlock(HFileContext context,
+                                byte[] byteBuff,
+                                int startOffsetInBuff) {
+    super(context, HFileBlockType.LEAF_INDEX, byteBuff, startOffsetInBuff);
+  }
+
+  protected HFileLeafIndexBlock(HFileContext context,
+                                HFileBlockType blockType,
+                                byte[] byteBuff,
+                                int startOffsetInBuff) {
+    super(context, blockType, byteBuff, startOffsetInBuff);
+  }
+
+  /**
+   * Parses this index block and returns its entries in the order they are stored.
+   */
+  public List<BlockIndexEntry> readBlockIndex() {
+    // 0. Skip the block header to reach the index payload.
+    int buffOffset = startOffsetInBuff + HFILEBLOCK_HEADER_SIZE;
+
+    // 1. Read the number of entries, stored as the first int32 of the payload.
+    int numEntries = readInt(byteBuff, buffOffset);
+    buffOffset += DataSize.SIZEOF_INT32;
+    // 2. Parse the secondary index: numEntries + 1 int32 offsets relative to the first entry.
+    List<Integer> relativeOffsets = new ArrayList<>();
+    for (int i = 0; i <= numEntries; i++) {
+      relativeOffsets.add(readInt(byteBuff, buffOffset));
+      buffOffset += DataSize.SIZEOF_INT32;
+    }
+    // 3. Read each entry: an int64 offset, an int32 size, then the key bytes.
+    List<BlockIndexEntry> indexEntries = new ArrayList<>();
+    int secondIndexAfterOffset = buffOffset;
+    for (int i = 0; i < numEntries; i++) {
+      ValidationUtils.checkState(buffOffset - secondIndexAfterOffset == 
relativeOffsets.get(i));
+      long offset = readLong(byteBuff, buffOffset);
+      int size = readInt(byteBuff, buffOffset + 8);
+      // Unlike the root index, no key length is stored: the key runs up to the next entry.
+      int keyStartOffset = buffOffset + 12;
+      int nextEntryStartOffset = secondIndexAfterOffset + 
relativeOffsets.get(i + 1);
+      int keyLength = nextEntryStartOffset - keyStartOffset;
+      byte[] keyBytes = copy(byteBuff, buffOffset + 12, keyLength);
+      Key key = new Key(keyBytes);
+      indexEntries.add(new BlockIndexEntry(key, Option.empty(), offset, size));
+      buffOffset += (12 + keyLength);
+    }
+    return indexEntries;
+  }
+}
diff --git 
a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileReaderImpl.java 
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileReaderImpl.java
index b376ce766ae..d176af6d251 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileReaderImpl.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileReaderImpl.java
@@ -20,6 +20,7 @@
 package org.apache.hudi.io.hfile;
 
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ValidationUtils;
 import org.apache.hudi.io.SeekableDataInputStream;
 
 import org.apache.logging.log4j.util.Strings;
@@ -28,7 +29,10 @@ import java.io.ByteArrayInputStream;
 import java.io.DataInputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.LinkedList;
+import java.util.List;
 import java.util.Map;
+import java.util.Queue;
 import java.util.TreeMap;
 
 import static org.apache.hudi.io.hfile.HFileBlock.HFILEBLOCK_HEADER_SIZE;
@@ -73,9 +77,8 @@ public class HFileReaderImpl implements HFileReader {
     HFileBlockReader blockReader = new HFileBlockReader(
         context, stream, trailer.getLoadOnOpenDataOffset(),
         fileSize - HFileTrailer.getTrailerSize());
-    HFileRootIndexBlock dataIndexBlock =
-        (HFileRootIndexBlock) blockReader.nextBlock(HFileBlockType.ROOT_INDEX);
-    this.dataBlockIndexEntryMap = 
dataIndexBlock.readBlockIndex(trailer.getDataIndexCount(), false);
+    this.dataBlockIndexEntryMap = readDataBlockIndex(
+        blockReader, trailer.getDataIndexCount(), 
trailer.getNumDataIndexLevels());
     HFileRootIndexBlock metaIndexBlock =
         (HFileRootIndexBlock) blockReader.nextBlock(HFileBlockType.ROOT_INDEX);
     this.metaBlockIndexEntryMap = 
metaIndexBlock.readBlockIndex(trailer.getMetaIndexCount(), true);
@@ -313,4 +316,75 @@ public class HFileReaderImpl implements HFileReader {
     }
     return false;
   }
+
+  /**
+   * Reads the single-level or multi-level data block index and loads every data
+   * block index entry into memory, walking the index level by level (BFS).
+   *
+   * @param rootBlockReader {@link HFileBlockReader} used to read the root data index block;
+   *                        the caller keeps using it to read the meta index block afterward
+   * @param numEntries      the number of entries in the root index block
+   * @param levels          the number of data block index levels; must be greater than 0
+   * @return a {@link TreeMap} keyed by first key, holding the data block index entries
+   * @throws IOException propagated from the underlying {@link HFileBlockReader} calls
+   */
+  private TreeMap<Key, BlockIndexEntry> readDataBlockIndex(HFileBlockReader 
rootBlockReader,
+                                                           int numEntries,
+                                                           int levels) throws 
IOException {
+    ValidationUtils.checkArgument(levels > 0,
+        "levels of data block index must be greater than 0");
+    // (1) Parse the root data index block
+    HFileRootIndexBlock rootDataIndexBlock =
+        (HFileRootIndexBlock) 
rootBlockReader.nextBlock(HFileBlockType.ROOT_INDEX);
+    if (levels == 1) {
+      // Single-level data block index: the root entries are the final index
+      return rootDataIndexBlock.readBlockIndex(numEntries, false);
+    }
+
+    // Multi-level data block index
+    // This list holds the next batch of index entries to expand, in stored order
+    List<BlockIndexEntry> indexEntryList =
+        rootDataIndexBlock.readBlockIndexEntry(numEntries, false);
+    levels--;
+
+    // Work queue for the level-by-level (BFS) expansion of index entries
+    Queue<BlockIndexEntry> queue = new LinkedList<>();
+    while (levels >= 1) {
+      // (2) Move the current level's intermediate / leaf index entries to the queue
+      queue.addAll(indexEntryList);
+      indexEntryList.clear();
+
+      // (3) Read each referenced index block and collect the entries of the next level
+      while (!queue.isEmpty()) {
+        BlockIndexEntry indexEntry = queue.poll();
+        HFileBlockReader blockReader = new HFileBlockReader(
+            context, stream, indexEntry.getOffset(), indexEntry.getOffset() + 
indexEntry.getSize());
+        HFileBlockType blockType = levels > 1
+            ? HFileBlockType.INTERMEDIATE_INDEX : HFileBlockType.LEAF_INDEX;
+        HFileBlock tempBlock = blockReader.nextBlock(blockType);
+        indexEntryList.addAll(((HFileLeafIndexBlock) 
tempBlock).readBlockIndex());
+      }
+
+      // (4) Descend one index level
+      levels--;
+    }
+
+    // (5) All entries are now data block index entries; link each to the next key and map them
+    TreeMap<Key, BlockIndexEntry> blockIndexEntryMap = new TreeMap<>();
+    for (int i = 0; i < indexEntryList.size(); i++) {
+      Key key = indexEntryList.get(i).getFirstKey();
+      blockIndexEntryMap.put(
+          key,
+          new BlockIndexEntry(
+              key,
+              i < indexEntryList.size() - 1
+                  ? Option.of(indexEntryList.get(i + 1).getFirstKey())
+                  : Option.empty(),
+              indexEntryList.get(i).getOffset(),
+              indexEntryList.get(i).getSize()));
+    }
+
+    // (6) Return the combined index entry map
+    return blockIndexEntryMap;
+  }
 }
diff --git 
a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileRootIndexBlock.java 
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileRootIndexBlock.java
index 9612d75ff60..099e42ad44c 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileRootIndexBlock.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileRootIndexBlock.java
@@ -45,15 +45,35 @@ public class HFileRootIndexBlock extends HFileBlock {
    * Reads the index block and returns the block index entry to an in-memory 
{@link TreeMap}
    * for searches.
    *
-   * @param numEntries the number of entries in the block.
-   * @return a {@link TreeMap} of block index entries.
+   * @param numEntries     the number of entries in the block
+   * @param contentKeyOnly whether the key part contains content only
+   * @return a {@link TreeMap} of block index entries
    */
   public TreeMap<Key, BlockIndexEntry> readBlockIndex(int numEntries, boolean 
contentKeyOnly) {
     TreeMap<Key, BlockIndexEntry> blockIndexEntryMap = new TreeMap<>();
+    List<BlockIndexEntry> indexEntryList = readBlockIndexEntry(numEntries, 
contentKeyOnly);
+    for (int i = 0; i < numEntries; i++) {
+      Key key = indexEntryList.get(i).getFirstKey();
+      blockIndexEntryMap.put(key, new BlockIndexEntry(
+          key,
+          i < numEntries - 1 ? Option.of(indexEntryList.get(i + 
1).getFirstKey()) : Option.empty(),
+          indexEntryList.get(i).getOffset(),
+          indexEntryList.get(i).getSize()));
+    }
+    return blockIndexEntryMap;
+  }
+
+  /**
+   * Returns the block index entries contained in the root index block.
+   *
+   * @param numEntries     the number of entries in the block
+   * @param contentKeyOnly whether the key part contains content only
+   * @return a {@link List} of block index entries
+   */
+  public List<BlockIndexEntry> readBlockIndexEntry(int numEntries,
+                                                   boolean contentKeyOnly) {
+    List<BlockIndexEntry> indexEntryList = new ArrayList<>();
     int buffOffset = startOffsetInBuff + HFILEBLOCK_HEADER_SIZE;
-    List<Key> keyList = new ArrayList<>();
-    List<Long> offsetList = new ArrayList<>();
-    List<Integer> sizeList = new ArrayList();
     for (int i = 0; i < numEntries; i++) {
       long offset = readLong(byteBuff, buffOffset);
       int size = readInt(byteBuff, buffOffset + 8);
@@ -61,17 +81,9 @@ public class HFileRootIndexBlock extends HFileBlock {
       int keyLength = (int) readVarLong(byteBuff, buffOffset + 12, 
varLongSizeOnDist);
       byte[] keyBytes = copy(byteBuff, buffOffset + 12 + varLongSizeOnDist, 
keyLength);
       Key key = contentKeyOnly ? new UTF8StringKey(keyBytes) : new 
Key(keyBytes);
-      keyList.add(key);
-      offsetList.add(offset);
-      sizeList.add(size);
+      indexEntryList.add(new BlockIndexEntry(key, Option.empty(), offset, 
size));
       buffOffset += (12 + varLongSizeOnDist + keyLength);
     }
-    for (int i = 0; i < numEntries; i++) {
-      Key key = keyList.get(i);
-      blockIndexEntryMap.put(key, new BlockIndexEntry(
-          key, i < numEntries - 1 ? Option.of(keyList.get(i + 1)) : 
Option.empty(),
-          offsetList.get(i), sizeList.get(i)));
-    }
-    return blockIndexEntryMap;
+    return indexEntryList;
   }
 }
diff --git 
a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java 
b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java
index 28f02c10d71..469ba37c277 100644
--- a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java
+++ b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java
@@ -65,6 +65,9 @@ public class TestHFileReader {
   public static final Function<Integer, String> KEY_CREATOR = i -> 
String.format("hudi-key-%09d", i);
   public static final Function<Integer, String> KEY_CREATOR_WITH_SUFFIX = i -> 
String.format("hudi-key-%09d-abcdefghij", i);
   public static final Function<Integer, String> VALUE_CREATOR = i -> 
String.format("hudi-value-%09d", i);
+  private static final String LARGE_KEY_PREFIX = 
"hudi-key-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
+      + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-";
+  private static final Function<Integer, String> LARGE_KEY_CREATOR = i -> 
LARGE_KEY_PREFIX + String.format("%09d", i);
   private static final int SEEK_TO_THROW_EXCEPTION = -3;
 
   static Stream<Arguments> testArgsReadHFilePointAndPrefixLookup() {
@@ -386,6 +389,279 @@ public class TestHFileReader {
                 new KeyLookUpInfo("hudi-key-000020000", SEEK_TO_EOF, "", ""),
                 new KeyLookUpInfo("hudi-key-000020001", SEEK_TO_EOF, "", "")
             )
+        ),
+        // This HFile has large keys (key length > 100B), generated with 
LARGE_KEY_CREATOR
+        // using {@link 
TestHoodieHBaseHFileReaderWriter#generateHFileForTesting}
+        // The number of data block index levels is 2
+        Arguments.of(
+            "/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_20000_large_keys.hfile",
+            20000,
+            LARGE_KEY_CREATOR,
+            Arrays.asList(
+                // before first key
+                new KeyLookUpInfo("", SEEK_TO_BEFORE_FILE_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                new KeyLookUpInfo("as", SEEK_TO_BEFORE_FILE_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                // backward seekTo before first key is allowed and safe
+                new KeyLookUpInfo("aa", SEEK_TO_BEFORE_FILE_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "0000000", 
SEEK_TO_BEFORE_FILE_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                // first key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000000", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                // key in the block 0
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000005", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000000005", "hudi-value-000000005"),
+                // backward seek not supported in a block
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000004", 
SEEK_TO_THROW_EXCEPTION, "", ""),
+                // prefix lookup, the pointer should not move
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000010", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000099", "hudi-value-000000099"),
+                // non-exact lookup, the pointer should move
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000100a", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000100", "hudi-value-000000100"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000100b", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000100", "hudi-value-000000100"),
+                // prefix lookup with a jump, the pointer should not go beyond 
the lookup key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000040", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000399", "hudi-value-000000399"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000400a", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000400", "hudi-value-000000400"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000400b", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000400", "hudi-value-000000400"),
+                // last key of the block
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785a", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785b", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"),
+                // first key of the next block
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002786", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000002786", "hudi-value-000002786"),
+                // more lookups
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000005340", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000005340", "hudi-value-000005340"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000007340", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000007340", "hudi-value-000007340"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000009340", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000009340", "hudi-value-000009340"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000013899", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000013899", "hudi-value-000013899"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000013899", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000013899", "hudi-value-000013899"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000013900", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000013900", "hudi-value-000013900"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000013901", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000013901", "hudi-value-000013901"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000013902", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000013902", "hudi-value-000013902"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000015902", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000015902", "hudi-value-000015902"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000017902", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000017902", "hudi-value-000017902"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019500", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000019500", "hudi-value-000019500"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "0000196", 
SEEK_TO_BEFORE_BLOCK_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000019600", "hudi-value-000019600"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "00001960", 
SEEK_TO_BEFORE_BLOCK_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000019600", "hudi-value-000019600"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019600a", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000019600", "hudi-value-000019600"),
+                // second to last key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019998", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000019998", "hudi-value-000019998"),
+                // last key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019999", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000019999", "hudi-value-000019999"),
+                // after last key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019999a", 
SEEK_TO_EOF, "", ""),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019999b", 
SEEK_TO_EOF, "", "")
+            )
+        ),
+        // This HFile has large keys (key length > 100B), generated with 
LARGE_KEY_CREATOR
+        // using {@link 
TestHoodieHBaseHFileReaderWriter#generateHFileForTesting}
+        // and the HFile configs: hfile.index.block.max.size = 2048, 
hfile.index.block.min.entries = 4
+        // The number of data block index levels is 3
+        Arguments.of(
+            
"/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_10000_large_keys_deep_index.hfile",
+            10000,
+            LARGE_KEY_CREATOR,
+            Arrays.asList(
+                // before first key
+                new KeyLookUpInfo("", SEEK_TO_BEFORE_FILE_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                new KeyLookUpInfo("as", SEEK_TO_BEFORE_FILE_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                // backward seekTo before first key is allowed and safe
+                new KeyLookUpInfo("aa", SEEK_TO_BEFORE_FILE_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "0000000", 
SEEK_TO_BEFORE_FILE_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                // first key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000000", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                // key in the block 0
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000005", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000000005", "hudi-value-000000005"),
+                // backward seek not supported in a block
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000004", 
SEEK_TO_THROW_EXCEPTION, "", ""),
+                // prefix lookup, the pointer should not move
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000010", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000099", "hudi-value-000000099"),
+                // non-exact lookup, the pointer should move
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000100a", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000100", "hudi-value-000000100"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000100b", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000100", "hudi-value-000000100"),
+                // prefix lookup with a jump, the pointer should not go beyond 
the lookup key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000040", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000399", "hudi-value-000000399"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000400a", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000400", "hudi-value-000000400"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000400b", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000400", "hudi-value-000000400"),
+                // last key of the block
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785a", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785b", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"),
+                // first key of the next block
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002786", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000002786", "hudi-value-000002786"),
+                // more lookups
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000003340", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000003340", "hudi-value-000003340"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000004340", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000004340", "hudi-value-000004340"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000005340", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000005340", "hudi-value-000005340"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000006899", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000006899", "hudi-value-000006899"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000006899", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000006899", "hudi-value-000006899"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000006900", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000006900", "hudi-value-000006900"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000006901", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000006901", "hudi-value-000006901"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000006902", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000006902", "hudi-value-000006902"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000007902", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000007902", "hudi-value-000007902"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000008902", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000008902", "hudi-value-000008902"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "0000091", 
SEEK_TO_BEFORE_BLOCK_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000009100", "hudi-value-000009100"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000910", 
SEEK_TO_BEFORE_BLOCK_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000009100", "hudi-value-000009100"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000009100a", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000009100", "hudi-value-000009100"),
+                // second to last key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000009998", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000009998", "hudi-value-000009998"),
+                // last key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000009999", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000009999", "hudi-value-000009999"),
+                // after last key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000009999a", 
SEEK_TO_EOF, "", ""),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000009999b", 
SEEK_TO_EOF, "", "")
+            )
+        ),
+        // This HFile has large keys (key length > 100B), generated with 
LARGE_KEY_CREATOR
+        // using {@link 
TestHoodieHBaseHFileReaderWriter#generateHFileForTesting}
+        // and the HFile configs: hfile.index.block.max.size = 2048, 
hfile.index.block.min.entries = 4
+        // The number of data block index levels is 4
+        Arguments.of(
+            
"/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_50000_large_keys_deep_index.hfile",
+            50000,
+            LARGE_KEY_CREATOR,
+            Arrays.asList(
+                // before first key
+                new KeyLookUpInfo("", SEEK_TO_BEFORE_FILE_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                new KeyLookUpInfo("as", SEEK_TO_BEFORE_FILE_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                // backward seekTo before first key is allowed and safe
+                new KeyLookUpInfo("aa", SEEK_TO_BEFORE_FILE_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "0000000", 
SEEK_TO_BEFORE_FILE_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                // first key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000000", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"),
+                // key in the block 0
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000005", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000000005", "hudi-value-000000005"),
+                // backward seek not supported in a block
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000004", 
SEEK_TO_THROW_EXCEPTION, "", ""),
+                // prefix lookup, the pointer stops at the key before the lookup key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000010", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000099", "hudi-value-000000099"),
+                // non-exact lookup, the pointer should move
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000100a", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000100", "hudi-value-000000100"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000100b", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000100", "hudi-value-000000100"),
+                // prefix lookup with a jump, the pointer should not go beyond 
the lookup key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000040", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000399", "hudi-value-000000399"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000400a", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000400", "hudi-value-000000400"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000400b", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000000400", "hudi-value-000000400"),
+                // last key of the block
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785a", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785b", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"),
+                // first key of the next block
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002786", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000002786", "hudi-value-000002786"),
+                // more lookups
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000005340", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000005340", "hudi-value-000005340"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000010340", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000010340", "hudi-value-000010340"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019340", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000019340", "hudi-value-000019340"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000023899", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000023899", "hudi-value-000023899"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000023899", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000023899", "hudi-value-000023899"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000023900", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000023900", "hudi-value-000023900"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000023901", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000023901", "hudi-value-000023901"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000023902", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000023902", "hudi-value-000023902"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000030902", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000030902", "hudi-value-000030902"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000037902", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000037902", "hudi-value-000037902"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000039500", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000039500", "hudi-value-000039500"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "0000448", 
SEEK_TO_BEFORE_BLOCK_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000044800", "hudi-value-000044800"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "00004480", 
SEEK_TO_BEFORE_BLOCK_FIRST_KEY,
+                    LARGE_KEY_PREFIX + "000044800", "hudi-value-000044800"),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000044800a", 
SEEK_TO_IN_RANGE,
+                    LARGE_KEY_PREFIX + "000044800", "hudi-value-000044800"),
+                // second to last key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000049998", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000049998", "hudi-value-000049998"),
+                // last key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000049999", 
SEEK_TO_FOUND,
+                    LARGE_KEY_PREFIX + "000049999", "hudi-value-000049999"),
+                // after last key
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000049999a", 
SEEK_TO_EOF, "", ""),
+                new KeyLookUpInfo(LARGE_KEY_PREFIX + "000049999b", 
SEEK_TO_EOF, "", "")
+            )
         )
     );
   }
diff --git 
a/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_10000_large_keys_deep_index.hfile
 
b/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_10000_large_keys_deep_index.hfile
new file mode 100644
index 00000000000..03c7960ba09
Binary files /dev/null and 
b/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_10000_large_keys_deep_index.hfile
 differ
diff --git 
a/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_20000_large_keys.hfile
 
b/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_20000_large_keys.hfile
new file mode 100644
index 00000000000..bacd24549a3
Binary files /dev/null and 
b/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_20000_large_keys.hfile
 differ
diff --git 
a/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_50000_large_keys_deep_index.hfile
 
b/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_50000_large_keys_deep_index.hfile
new file mode 100644
index 00000000000..ba3e2d9c919
Binary files /dev/null and 
b/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_50000_large_keys_deep_index.hfile
 differ


Reply via email to