This is an automated email from the ASF dual-hosted git repository. vhs pushed a commit to branch release-1.0.2 in repository https://gitbox.apache.org/repos/asf/hudi.git
commit c3e9cbc2281ff9ccaa9f64d056fbd3d8d35bc6b4 Author: Lin Liu <[email protected]> AuthorDate: Mon Apr 21 02:27:40 2025 -0700 [HUDI-8601] Support multi-level data block index in HFile (#13166) Co-authored-by: Y Ethan Guo <[email protected]> (cherry picked from commit ae50130b5ee5aa79caa9562ebecdcfe03a673593) --- .../java/org/apache/hudi/io/hfile/HFileBlock.java | 4 + .../hudi/io/hfile/HFileIntermediateIndexBlock.java | 32 +++ .../apache/hudi/io/hfile/HFileLeafIndexBlock.java | 84 +++++++ .../org/apache/hudi/io/hfile/HFileReaderImpl.java | 80 +++++- .../apache/hudi/io/hfile/HFileRootIndexBlock.java | 42 ++-- .../org/apache/hudi/io/hfile/TestHFileReader.java | 276 +++++++++++++++++++++ ...2_4_13_1KB_GZ_10000_large_keys_deep_index.hfile | Bin 0 -> 246061 bytes ..._1_0_hbase_2_4_13_1KB_GZ_20000_large_keys.hfile | Bin 0 -> 458683 bytes ...2_4_13_1KB_GZ_50000_large_keys_deep_index.hfile | Bin 0 -> 1218730 bytes 9 files changed, 500 insertions(+), 18 deletions(-) diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java index 8ad2bf4b97c..1723b482a95 100644 --- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java +++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java @@ -122,6 +122,10 @@ public abstract class HFileBlock { switch (blockType) { case ROOT_INDEX: return new HFileRootIndexBlock(context, byteBuff, startOffsetInBuff); + case LEAF_INDEX: + return new HFileLeafIndexBlock(context, byteBuff, startOffsetInBuff); + case INTERMEDIATE_INDEX: + return new HFileIntermediateIndexBlock(context, byteBuff, startOffsetInBuff); case FILE_INFO: return new HFileFileInfoBlock(context, byteBuff, startOffsetInBuff); case DATA: diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileIntermediateIndexBlock.java b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileIntermediateIndexBlock.java new file mode 100644 index 00000000000..89f86f9547c --- /dev/null +++ 
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileIntermediateIndexBlock.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.io.hfile; + +/** + * Represents a {@link HFileBlockType#INTERMEDIATE_INDEX} block, as + * part of a multi-level block index. + */ +public class HFileIntermediateIndexBlock extends HFileLeafIndexBlock { + protected HFileIntermediateIndexBlock(HFileContext context, + byte[] byteBuff, + int startOffsetInBuff) { + super(context, HFileBlockType.INTERMEDIATE_INDEX, byteBuff, startOffsetInBuff); + } +} diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileLeafIndexBlock.java b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileLeafIndexBlock.java new file mode 100644 index 00000000000..e664145bbc1 --- /dev/null +++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileLeafIndexBlock.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.io.hfile; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; + +import java.util.ArrayList; +import java.util.List; + +import static org.apache.hudi.io.util.IOUtils.copy; +import static org.apache.hudi.io.util.IOUtils.readInt; +import static org.apache.hudi.io.util.IOUtils.readLong; + +/** + * Represents a {@link HFileBlockType#LEAF_INDEX} block, as + * part of a multi-level block index. + */ +public class HFileLeafIndexBlock extends HFileBlock { + protected HFileLeafIndexBlock(HFileContext context, + byte[] byteBuff, + int startOffsetInBuff) { + super(context, HFileBlockType.LEAF_INDEX, byteBuff, startOffsetInBuff); + } + + protected HFileLeafIndexBlock(HFileContext context, + HFileBlockType blockType, + byte[] byteBuff, + int startOffsetInBuff) { + super(context, blockType, byteBuff, startOffsetInBuff); + } + + /** + * Reads the index block and returns the block index entries. + */ + public List<BlockIndexEntry> readBlockIndex() { + // 0. Skip the block header. + int buffOffset = startOffsetInBuff + HFILEBLOCK_HEADER_SIZE; + + // 1. Get the number of entries. + int numEntries = readInt(byteBuff, buffOffset); + buffOffset += DataSize.SIZEOF_INT32; + // 2. Parse the secondary index. 
+ List<Integer> relativeOffsets = new ArrayList<>(); + for (int i = 0; i <= numEntries; i++) { + relativeOffsets.add(readInt(byteBuff, buffOffset)); + buffOffset += DataSize.SIZEOF_INT32; + } + // 3. Read index entries. + List<BlockIndexEntry> indexEntries = new ArrayList<>(); + int secondIndexAfterOffset = buffOffset; + for (int i = 0; i < numEntries; i++) { + ValidationUtils.checkState(buffOffset - secondIndexAfterOffset == relativeOffsets.get(i)); + long offset = readLong(byteBuff, buffOffset); + int size = readInt(byteBuff, buffOffset + 8); + // Key parsing requires different logic than that of root index. + int keyStartOffset = buffOffset + 12; + int nextEntryStartOffset = secondIndexAfterOffset + relativeOffsets.get(i + 1); + int keyLength = nextEntryStartOffset - keyStartOffset; + byte[] keyBytes = copy(byteBuff, buffOffset + 12, keyLength); + Key key = new Key(keyBytes); + indexEntries.add(new BlockIndexEntry(key, Option.empty(), offset, size)); + buffOffset += (12 + keyLength); + } + return indexEntries; + } +} diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileReaderImpl.java b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileReaderImpl.java index b376ce766ae..d176af6d251 100644 --- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileReaderImpl.java +++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileReaderImpl.java @@ -20,6 +20,7 @@ package org.apache.hudi.io.hfile; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.io.SeekableDataInputStream; import org.apache.logging.log4j.util.Strings; @@ -28,7 +29,10 @@ import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.IOException; import java.nio.ByteBuffer; +import java.util.LinkedList; +import java.util.List; import java.util.Map; +import java.util.Queue; import java.util.TreeMap; import static org.apache.hudi.io.hfile.HFileBlock.HFILEBLOCK_HEADER_SIZE; @@ -73,9 +77,8 @@ public class 
HFileReaderImpl implements HFileReader { HFileBlockReader blockReader = new HFileBlockReader( context, stream, trailer.getLoadOnOpenDataOffset(), fileSize - HFileTrailer.getTrailerSize()); - HFileRootIndexBlock dataIndexBlock = - (HFileRootIndexBlock) blockReader.nextBlock(HFileBlockType.ROOT_INDEX); - this.dataBlockIndexEntryMap = dataIndexBlock.readBlockIndex(trailer.getDataIndexCount(), false); + this.dataBlockIndexEntryMap = readDataBlockIndex( + blockReader, trailer.getDataIndexCount(), trailer.getNumDataIndexLevels()); HFileRootIndexBlock metaIndexBlock = (HFileRootIndexBlock) blockReader.nextBlock(HFileBlockType.ROOT_INDEX); this.metaBlockIndexEntryMap = metaIndexBlock.readBlockIndex(trailer.getMetaIndexCount(), true); @@ -313,4 +316,75 @@ public class HFileReaderImpl implements HFileReader { } return false; } + + /** + * Reads the single-level or multi-level data block index, and loads all data block + * index entries into memory in BFS fashion. + * + * @param rootBlockReader a {@link HFileBlockReader} used to read root data index block; + * this reader will be used to read subsequent meta index block + * afterward + * @param numEntries the number of entries in the root index block + * @param levels the number of levels of the data block index; must be greater than 0 + * @return a {@link TreeMap} from the first key of each data block to its index entry + */ + private TreeMap<Key, BlockIndexEntry> readDataBlockIndex(HFileBlockReader rootBlockReader, + int numEntries, + int levels) throws IOException { + ValidationUtils.checkArgument(levels > 0, + "levels of data block index must be greater than 0"); + // Parse root data index block + HFileRootIndexBlock rootDataIndexBlock = + (HFileRootIndexBlock) rootBlockReader.nextBlock(HFileBlockType.ROOT_INDEX); + if (levels == 1) { + // Single-level data block index + return rootDataIndexBlock.readBlockIndex(numEntries, false); + } + + // Multi-level data block index + // This list stores the next batch of index entries in order + List<BlockIndexEntry> indexEntryList = + rootDataIndexBlock.readBlockIndexEntry(numEntries, false); + 
levels--; + + // Supports BFS search for leaf index entries + Queue<BlockIndexEntry> queue = new LinkedList<>(); + while (levels >= 1) { + // (2) Put intermediate / leaf index entries to the queue + queue.addAll(indexEntryList); + indexEntryList.clear(); + + // (3) BFS + while (!queue.isEmpty()) { + BlockIndexEntry indexEntry = queue.poll(); + HFileBlockReader blockReader = new HFileBlockReader( + context, stream, indexEntry.getOffset(), indexEntry.getOffset() + indexEntry.getSize()); + HFileBlockType blockType = levels > 1 + ? HFileBlockType.INTERMEDIATE_INDEX : HFileBlockType.LEAF_INDEX; + HFileBlock tempBlock = blockReader.nextBlock(blockType); + indexEntryList.addAll(((HFileLeafIndexBlock) tempBlock).readBlockIndex()); + } + + // (4) Lower index level + levels--; + } + + // (5) Now all entries are data block index entries. Put them into the map + TreeMap<Key, BlockIndexEntry> blockIndexEntryMap = new TreeMap<>(); + for (int i = 0; i < indexEntryList.size(); i++) { + Key key = indexEntryList.get(i).getFirstKey(); + blockIndexEntryMap.put( + key, + new BlockIndexEntry( + key, + i < indexEntryList.size() - 1 + ? Option.of(indexEntryList.get(i + 1).getFirstKey()) + : Option.empty(), + indexEntryList.get(i).getOffset(), + indexEntryList.get(i).getSize())); + } + + // (6) Returns the combined index entry map + return blockIndexEntryMap; + } } diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileRootIndexBlock.java b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileRootIndexBlock.java index 9612d75ff60..099e42ad44c 100644 --- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileRootIndexBlock.java +++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileRootIndexBlock.java @@ -45,15 +45,35 @@ public class HFileRootIndexBlock extends HFileBlock { * Reads the index block and returns the block index entry to an in-memory {@link TreeMap} * for searches. * - * @param numEntries the number of entries in the block. 
- * @return a {@link TreeMap} of block index entries. + * @param numEntries the number of entries in the block + * @param contentKeyOnly whether the key part contains content only + * @return a {@link TreeMap} of block index entries */ public TreeMap<Key, BlockIndexEntry> readBlockIndex(int numEntries, boolean contentKeyOnly) { TreeMap<Key, BlockIndexEntry> blockIndexEntryMap = new TreeMap<>(); + List<BlockIndexEntry> indexEntryList = readBlockIndexEntry(numEntries, contentKeyOnly); + for (int i = 0; i < numEntries; i++) { + Key key = indexEntryList.get(i).getFirstKey(); + blockIndexEntryMap.put(key, new BlockIndexEntry( + key, + i < numEntries - 1 ? Option.of(indexEntryList.get(i + 1).getFirstKey()) : Option.empty(), + indexEntryList.get(i).getOffset(), + indexEntryList.get(i).getSize())); + } + return blockIndexEntryMap; + } + + /** + * Returns the block index entries contained in the root index block. + * + * @param numEntries the number of entries in the block + * @param contentKeyOnly whether the key part contains content only + * @return a {@link List} of block index entries + */ + public List<BlockIndexEntry> readBlockIndexEntry(int numEntries, + boolean contentKeyOnly) { + List<BlockIndexEntry> indexEntryList = new ArrayList<>(); int buffOffset = startOffsetInBuff + HFILEBLOCK_HEADER_SIZE; - List<Key> keyList = new ArrayList<>(); - List<Long> offsetList = new ArrayList<>(); - List<Integer> sizeList = new ArrayList(); for (int i = 0; i < numEntries; i++) { long offset = readLong(byteBuff, buffOffset); int size = readInt(byteBuff, buffOffset + 8); @@ -61,17 +81,9 @@ public class HFileRootIndexBlock extends HFileBlock { int keyLength = (int) readVarLong(byteBuff, buffOffset + 12, varLongSizeOnDist); byte[] keyBytes = copy(byteBuff, buffOffset + 12 + varLongSizeOnDist, keyLength); Key key = contentKeyOnly ? 
new UTF8StringKey(keyBytes) : new Key(keyBytes); - keyList.add(key); - offsetList.add(offset); - sizeList.add(size); + indexEntryList.add(new BlockIndexEntry(key, Option.empty(), offset, size)); buffOffset += (12 + varLongSizeOnDist + keyLength); } - for (int i = 0; i < numEntries; i++) { - Key key = keyList.get(i); - blockIndexEntryMap.put(key, new BlockIndexEntry( - key, i < numEntries - 1 ? Option.of(keyList.get(i + 1)) : Option.empty(), - offsetList.get(i), sizeList.get(i))); - } - return blockIndexEntryMap; + return indexEntryList; } } diff --git a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java index 28f02c10d71..469ba37c277 100644 --- a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java +++ b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java @@ -65,6 +65,9 @@ public class TestHFileReader { public static final Function<Integer, String> KEY_CREATOR = i -> String.format("hudi-key-%09d", i); public static final Function<Integer, String> KEY_CREATOR_WITH_SUFFIX = i -> String.format("hudi-key-%09d-abcdefghij", i); public static final Function<Integer, String> VALUE_CREATOR = i -> String.format("hudi-value-%09d", i); + private static final String LARGE_KEY_PREFIX = "hudi-key-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-"; + private static final Function<Integer, String> LARGE_KEY_CREATOR = i -> LARGE_KEY_PREFIX + String.format("%09d", i); private static final int SEEK_TO_THROW_EXCEPTION = -3; static Stream<Arguments> testArgsReadHFilePointAndPrefixLookup() { @@ -386,6 +389,279 @@ public class TestHFileReader { new KeyLookUpInfo("hudi-key-000020000", SEEK_TO_EOF, "", ""), new KeyLookUpInfo("hudi-key-000020001", SEEK_TO_EOF, "", "") ) + ), + // This HFile has large keys (key length > 100B), generated with LARGE_KEY_CREATOR + // using {@link 
TestHoodieHBaseHFileReaderWriter#generateHFileForTesting} + // The number of data block index levels is 2 + Arguments.of( + "/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_20000_large_keys.hfile", + 20000, + LARGE_KEY_CREATOR, + Arrays.asList( + // before first key + new KeyLookUpInfo("", SEEK_TO_BEFORE_FILE_FIRST_KEY, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + new KeyLookUpInfo("as", SEEK_TO_BEFORE_FILE_FIRST_KEY, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + // backward seekTo before first key is allowed and safe + new KeyLookUpInfo("aa", SEEK_TO_BEFORE_FILE_FIRST_KEY, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "0000000", SEEK_TO_BEFORE_FILE_FIRST_KEY, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + // first key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000000", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + // key in the block 0 + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000005", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000000005", "hudi-value-000000005"), + // backward seek not supported in a block + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000004", SEEK_TO_THROW_EXCEPTION, "", ""), + // prefix lookup, the pointer should not move + new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000010", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000099", "hudi-value-000000099"), + // non-exact lookup, the pointer should move + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000100a", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000100", "hudi-value-000000100"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000100b", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000100", "hudi-value-000000100"), + // prefix lookup with a jump, the pointer should not go beyond the lookup key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000040", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000399", "hudi-value-000000399"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000400a", SEEK_TO_IN_RANGE, + 
LARGE_KEY_PREFIX + "000000400", "hudi-value-000000400"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000400b", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000400", "hudi-value-000000400"), + // last key of the block + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785a", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785b", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"), + // first key of the next block + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002786", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000002786", "hudi-value-000002786"), + // more lookups + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000005340", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000005340", "hudi-value-000005340"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000007340", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000007340", "hudi-value-000007340"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000009340", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000009340", "hudi-value-000009340"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000013899", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000013899", "hudi-value-000013899"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000013899", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000013899", "hudi-value-000013899"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000013900", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000013900", "hudi-value-000013900"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000013901", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000013901", "hudi-value-000013901"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000013902", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000013902", "hudi-value-000013902"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000015902", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000015902", "hudi-value-000015902"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000017902", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + 
"000017902", "hudi-value-000017902"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019500", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000019500", "hudi-value-000019500"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "0000196", SEEK_TO_BEFORE_BLOCK_FIRST_KEY, + LARGE_KEY_PREFIX + "000019600", "hudi-value-000019600"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "00001960", SEEK_TO_BEFORE_BLOCK_FIRST_KEY, + LARGE_KEY_PREFIX + "000019600", "hudi-value-000019600"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019600a", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000019600", "hudi-value-000019600"), + // second to last key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019998", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000019998", "hudi-value-000019998"), + // last key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019999", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000019999", "hudi-value-000019999"), + // after last key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019999a", SEEK_TO_EOF, "", ""), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019999b", SEEK_TO_EOF, "", "") + ) + ), + // This HFile has large keys (key length > 100B), generated with LARGE_KEY_CREATOR + // using {@link TestHoodieHBaseHFileReaderWriter#generateHFileForTesting} + // and the HFile configs: hfile.index.block.max.size = 2048, hfile.index.block.min.entries = 4 + // The number of data block index levels is 3 + Arguments.of( + "/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_10000_large_keys_deep_index.hfile", + 10000, + LARGE_KEY_CREATOR, + Arrays.asList( + // before first key + new KeyLookUpInfo("", SEEK_TO_BEFORE_FILE_FIRST_KEY, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + new KeyLookUpInfo("as", SEEK_TO_BEFORE_FILE_FIRST_KEY, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + // backward seekTo before first key is allowed and safe + new KeyLookUpInfo("aa", SEEK_TO_BEFORE_FILE_FIRST_KEY, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "0000000", SEEK_TO_BEFORE_FILE_FIRST_KEY, + 
LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + // first key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000000", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + // key in the block 0 + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000005", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000000005", "hudi-value-000000005"), + // backward seek not supported in a block + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000004", SEEK_TO_THROW_EXCEPTION, "", ""), + // prefix lookup, the pointer should not move + new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000010", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000099", "hudi-value-000000099"), + // non-exact lookup, the pointer should move + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000100a", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000100", "hudi-value-000000100"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000100b", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000100", "hudi-value-000000100"), + // prefix lookup with a jump, the pointer should not go beyond the lookup key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000040", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000399", "hudi-value-000000399"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000400a", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000400", "hudi-value-000000400"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000400b", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000400", "hudi-value-000000400"), + // last key of the block + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785a", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785b", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"), + // first key of the next block + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002786", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000002786", 
"hudi-value-000002786"), + // more lookups + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000003340", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000003340", "hudi-value-000003340"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000004340", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000004340", "hudi-value-000004340"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000005340", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000005340", "hudi-value-000005340"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000006899", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000006899", "hudi-value-000006899"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000006899", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000006899", "hudi-value-000006899"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000006900", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000006900", "hudi-value-000006900"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000006901", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000006901", "hudi-value-000006901"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000006902", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000006902", "hudi-value-000006902"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000007902", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000007902", "hudi-value-000007902"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000008902", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000008902", "hudi-value-000008902"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "0000091", SEEK_TO_BEFORE_BLOCK_FIRST_KEY, + LARGE_KEY_PREFIX + "000009100", "hudi-value-000009100"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000910", SEEK_TO_BEFORE_BLOCK_FIRST_KEY, + LARGE_KEY_PREFIX + "000009100", "hudi-value-000009100"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000009100a", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000009100", "hudi-value-000009100"), + // second to last key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000009998", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000009998", "hudi-value-000009998"), + // last key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000009999", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000009999", 
"hudi-value-000009999"), + // after last key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000009999a", SEEK_TO_EOF, "", ""), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000009999b", SEEK_TO_EOF, "", "") + ) + ), + // This HFile has large keys (key length > 100B), generated with LARGE_KEY_CREATOR + // using {@link TestHoodieHBaseHFileReaderWriter#generateHFileForTesting} + // and the HFile configs: hfile.index.block.max.size = 2048, hfile.index.block.min.entries = 4 + // The number of data block index levels is 4 + Arguments.of( + "/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_50000_large_keys_deep_index.hfile", + 50000, + LARGE_KEY_CREATOR, + Arrays.asList( + // before first key + new KeyLookUpInfo("", SEEK_TO_BEFORE_FILE_FIRST_KEY, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + new KeyLookUpInfo("as", SEEK_TO_BEFORE_FILE_FIRST_KEY, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + // backward seekTo before first key is allowed and safe + new KeyLookUpInfo("aa", SEEK_TO_BEFORE_FILE_FIRST_KEY, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "0000000", SEEK_TO_BEFORE_FILE_FIRST_KEY, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + // first key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000000", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000000000", "hudi-value-000000000"), + // key in the block 0 + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000005", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000000005", "hudi-value-000000005"), + // backward seek not supported in a block + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000004", SEEK_TO_THROW_EXCEPTION, "", ""), + // prefix lookup, the pointer should not move + new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000010", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000099", "hudi-value-000000099"), + // non-exact lookup, the pointer should move + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000100a", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000100", "hudi-value-000000100"), + new 
KeyLookUpInfo(LARGE_KEY_PREFIX + "000000100b", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000100", "hudi-value-000000100"), + // prefix lookup with a jump, the pointer should not go beyond the lookup key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "00000040", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000399", "hudi-value-000000399"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000400a", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000400", "hudi-value-000000400"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000000400b", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000000400", "hudi-value-000000400"), + // last key of the block + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785a", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002785b", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000002785", "hudi-value-000002785"), + // first key of the next block + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000002786", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000002786", "hudi-value-000002786"), + // more lookups + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000005340", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000005340", "hudi-value-000005340"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000010340", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000010340", "hudi-value-000010340"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000019340", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000019340", "hudi-value-000019340"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000023899", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000023899", "hudi-value-000023899"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000023899", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000023899", "hudi-value-000023899"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000023900", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000023900", "hudi-value-000023900"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000023901", 
SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000023901", "hudi-value-000023901"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000023902", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000023902", "hudi-value-000023902"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000030902", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000030902", "hudi-value-000030902"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000037902", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000037902", "hudi-value-000037902"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000039500", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000039500", "hudi-value-000039500"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "0000448", SEEK_TO_BEFORE_BLOCK_FIRST_KEY, + LARGE_KEY_PREFIX + "000044800", "hudi-value-000044800"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "00004480", SEEK_TO_BEFORE_BLOCK_FIRST_KEY, + LARGE_KEY_PREFIX + "000044800", "hudi-value-000044800"), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000044800a", SEEK_TO_IN_RANGE, + LARGE_KEY_PREFIX + "000044800", "hudi-value-000044800"), + // second to last key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000049998", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000049998", "hudi-value-000049998"), + // last key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000049999", SEEK_TO_FOUND, + LARGE_KEY_PREFIX + "000049999", "hudi-value-000049999"), + // after last key + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000049999a", SEEK_TO_EOF, "", ""), + new KeyLookUpInfo(LARGE_KEY_PREFIX + "000049999b", SEEK_TO_EOF, "", "") + ) ) ); } diff --git a/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_10000_large_keys_deep_index.hfile b/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_10000_large_keys_deep_index.hfile new file mode 100644 index 00000000000..03c7960ba09 Binary files /dev/null and b/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_10000_large_keys_deep_index.hfile differ diff --git a/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_20000_large_keys.hfile 
b/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_20000_large_keys.hfile new file mode 100644 index 00000000000..bacd24549a3 Binary files /dev/null and b/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_20000_large_keys.hfile differ diff --git a/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_50000_large_keys_deep_index.hfile b/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_50000_large_keys_deep_index.hfile new file mode 100644 index 00000000000..ba3e2d9c919 Binary files /dev/null and b/hudi-io/src/test/resources/hfile/hudi_1_0_hbase_2_4_13_1KB_GZ_50000_large_keys_deep_index.hfile differ
