codope commented on code in PR #13166: URL: https://github.com/apache/hudi/pull/13166#discussion_r2052030372
########## hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileLeafIndexBlock.java: ########## @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.io.hfile; + +import org.apache.hudi.common.util.Option; + +import java.util.ArrayList; +import java.util.List; + +import static org.apache.hudi.io.util.IOUtils.copy; +import static org.apache.hudi.io.util.IOUtils.readInt; +import static org.apache.hudi.io.util.IOUtils.readLong; + +/** + * Represents a {@link HFileBlockType#LEAF_INDEX} block, as + * part of a multi-level block index. + */ +public class HFileLeafIndexBlock extends HFileBlock { + protected HFileLeafIndexBlock(HFileContext context, + byte[] byteBuff, + int startOffsetInBuff) { + super(context, HFileBlockType.LEAF_INDEX, byteBuff, startOffsetInBuff); + } + + protected HFileLeafIndexBlock(HFileContext context, + HFileBlockType blockType, + byte[] byteBuff, + int startOffsetInBuff) { + super(context, blockType, byteBuff, startOffsetInBuff); + } + + /** + * Reads the index block and returns the block index entries. + */ + public List<BlockIndexEntry> readBlockIndex(boolean contentKeyOnly) { + // 0. 
Print block magic + int buffOffset = startOffsetInBuff + HFILEBLOCK_HEADER_SIZE; + + // 1. Get the number of entries. + int numEntries = readInt(byteBuff, buffOffset); + buffOffset += DataSize.SIZEOF_INT32; + // 2. Parse the secondary index. + List<Integer> relativeOffsets = new ArrayList<>(); + for (int i = 0; i <= numEntries; i++) { + relativeOffsets.add(readInt(byteBuff, buffOffset)); + buffOffset += DataSize.SIZEOF_INT32; + } + // 3. Read index entries. + List<BlockIndexEntry> indexEntries = new ArrayList<>(); + int secondIndexAfterOffset = buffOffset; + for (int i = 0; i < numEntries; i++) { + assert (buffOffset - secondIndexAfterOffset == relativeOffsets.get(i)); Review Comment: replace with `ValidationUtils.checkState`? ########## hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileReaderImpl.java: ########## @@ -299,4 +301,74 @@ private boolean isAtFirstKey() { } return false; } + + /** + * Read single-level or multiple-level data block index, and load all data block + * information into memory in BFS fashion. + * + * @param rootBlockReader a {@link HFileBlockReader} used to read root data index block; + * this reader will be used to read subsequent meta index block + * afterward + * @param numEntries the number of entries in the root index block + * @param levels the level of the indexes + * @return + */ + private TreeMap<Key, BlockIndexEntry> readDataBlockIndex(HFileBlockReader rootBlockReader, + int numEntries, + int levels) throws IOException { + // Parse root data index block + HFileRootIndexBlock rootDataIndexBlock = + (HFileRootIndexBlock) rootBlockReader.nextBlock(HFileBlockType.ROOT_INDEX); + if (levels == 1) { Review Comment: Should we add an empty index guard i.e. return early if levels is 0 or numEntries is 0? ########## hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileLeafIndexBlock.java: ########## @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.io.hfile; + +import org.apache.hudi.common.util.Option; + +import java.util.ArrayList; +import java.util.List; + +import static org.apache.hudi.io.util.IOUtils.copy; +import static org.apache.hudi.io.util.IOUtils.readInt; +import static org.apache.hudi.io.util.IOUtils.readLong; + +/** + * Represents a {@link HFileBlockType#LEAF_INDEX} block, as + * part of a multi-level block index. + */ +public class HFileLeafIndexBlock extends HFileBlock { + protected HFileLeafIndexBlock(HFileContext context, + byte[] byteBuff, + int startOffsetInBuff) { + super(context, HFileBlockType.LEAF_INDEX, byteBuff, startOffsetInBuff); + } + + protected HFileLeafIndexBlock(HFileContext context, + HFileBlockType blockType, + byte[] byteBuff, + int startOffsetInBuff) { + super(context, blockType, byteBuff, startOffsetInBuff); + } + + /** + * Reads the index block and returns the block index entries. + */ + public List<BlockIndexEntry> readBlockIndex(boolean contentKeyOnly) { + // 0. Print block magic + int buffOffset = startOffsetInBuff + HFILEBLOCK_HEADER_SIZE; + + // 1. Get the number of entries. + int numEntries = readInt(byteBuff, buffOffset); + buffOffset += DataSize.SIZEOF_INT32; + // 2. Parse the secondary index. 
+ List<Integer> relativeOffsets = new ArrayList<>(); + for (int i = 0; i <= numEntries; i++) { + relativeOffsets.add(readInt(byteBuff, buffOffset)); + buffOffset += DataSize.SIZEOF_INT32; + } + // 3. Read index entries. + List<BlockIndexEntry> indexEntries = new ArrayList<>(); + int secondIndexAfterOffset = buffOffset; + for (int i = 0; i < numEntries; i++) { + assert (buffOffset - secondIndexAfterOffset == relativeOffsets.get(i)); + long offset = readLong(byteBuff, buffOffset); + int size = readInt(byteBuff, buffOffset + 8); + // Key parsing requires different logic than that of root index. + int keyStartOffset = buffOffset + 12; + int nextEntryStartOffset = secondIndexAfterOffset + relativeOffsets.get(i + 1); + int keyLength = nextEntryStartOffset - keyStartOffset; + byte[] keyBytes = copy(byteBuff, buffOffset + 12, keyLength); Review Comment: This copy is done for each entry, right? Maybe we could wrap the underlying byteBuff in a ByteBuffer and slice without copying? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
