Copilot commented on code in PR #7934:
URL: https://github.com/apache/hbase/pull/7934#discussion_r2963483472


##########
hbase-common/src/test/java/org/apache/hadoop/hbase/util/TestLittleEndianBytesBase.java:
##########
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.util;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.nio.ByteBuffer;
+import org.apache.hadoop.hbase.ByteBufferExtendedCell;
+import org.apache.hadoop.hbase.ByteBufferKeyValue;
+import org.apache.hadoop.hbase.Cell;
+import org.apache.hadoop.hbase.KeyValue;
+import org.junit.jupiter.api.Test;
+
+public abstract class TestLittleEndianBytesBase {
+
+  @Test
+  public void testToInt() {
+    byte[] b = generateByteArray(32);
+
+    for (int i = 0; i <= b.length - Integer.BYTES; i++) {
+      int expected = readIntLE(b, i);
+      assertEquals(expected, LittleEndianBytes.toInt(b, i));
+    }
+  }
+
+  @Test
+  public void testByteBufferToInt() {
+    byte[] b = generateByteArray(32);
+    ByteBuffer buf = ByteBuffer.wrap(b);
+
+    for (int i = 0; i <= b.length - Integer.BYTES; i++) {
+      int expected = readIntLE(b, i);
+      assertEquals(expected, LittleEndianBytes.toInt(buf, i));
+    }
+  }
+
+  @Test
+  public void testPutInt() {
+    byte[] b = new byte[16];
+
+    int offset = 5;
+    int value = 0x12345678;
+    LittleEndianBytes.putInt(b, offset, value);
+    int expected = readIntLE(b, offset);
+    assertEquals(value, expected);
+
+    offset += Integer.BYTES;
+    value = 0x9ABCDEF0;
+    LittleEndianBytes.putInt(b, offset, value);
+    expected = readIntLE(b, offset);
+    assertEquals(value, expected);
+  }
+
+  @Test
+  public void testGetRowAsIntFromByteBufferExtendedCell() {
+    Cell bbCell = createByteBufferExtendedCell();
+    byte[] row = bbCell.getRowArray();
+
+    for (int i = bbCell.getRowOffset(); i <= bbCell.getRowLength() - Integer.BYTES; i++) {
+      int expected = readIntLE(row, i);
+      assertEquals(expected, LittleEndianBytes.getRowAsInt(bbCell, i));
+    }
+  }
+
+  @Test
+  public void testGetRowAsIntFromCell() {
+    KeyValue cell = createCell();
+    byte[] row = cell.getRowArray();
+
+    for (int i = cell.getRowOffset(); i <= cell.getRowLength() - Integer.BYTES; i++) {
+      int expected = readIntLE(row, cell.getRowOffset() + i);
+      assertEquals(expected, LittleEndianBytes.getRowAsInt(cell, i));
+    }
+  }
+
+  @Test
+  public void testGetQualifierAsIntFromByteBufferExtendedCell() {
+    Cell bbCell = createByteBufferExtendedCell();
+    byte[] qual = bbCell.getQualifierArray();
+
+    for (int i = bbCell.getQualifierOffset(); i
+        <= bbCell.getQualifierLength() - Integer.BYTES; i++) {
+      int expected = readIntLE(qual, i);
+      assertEquals(expected, LittleEndianBytes.getQualifierAsInt(bbCell, i));
+    }
+  }
+
+  @Test
+  public void testGetQualifierAsIntFromCell() {
+    KeyValue cell = createCell();
+    byte[] qual = cell.getQualifierArray();
+
+    for (int i = cell.getQualifierOffset(); i <= cell.getQualifierLength() - Integer.BYTES; i++) {
+      int expected = readIntLE(qual, cell.getQualifierOffset() + i);
+      assertEquals(expected, LittleEndianBytes.getQualifierAsInt(cell, i));
+    }

Review Comment:
   testGetQualifierAsIntFromCell has the same offset issue as the row test: 
getQualifierAsInt expects an offset relative to the qualifier start, but the 
loop starts at cell.getQualifierOffset() (absolute). Iterate i from 0 to 
qualifierLength - Integer.BYTES and compute expected from 
cell.getQualifierOffset() + i.



##########
hbase-common/src/test/java/org/apache/hadoop/hbase/util/TestRowColBloomHashKey.java:
##########
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.util;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.PrivateCellUtil;
+import org.apache.hadoop.hbase.testclassification.MiscTests;
+import org.apache.hadoop.hbase.testclassification.SmallTests;
+import org.junit.ClassRule;
+import org.junit.experimental.categories.Category;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+@Category({ MiscTests.class, SmallTests.class })
+public class TestRowColBloomHashKey {
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+    HBaseClassTestRule.forClass(TestRowColBloomHashKey.class);

Review Comment:
   The JUnit annotations are mixed between JUnit4 (Category/ClassRule) and 
JUnit5 (BeforeEach/Test). Under the Jupiter engine, `@Category` and 
`@ClassRule` will be ignored, so this test may not be classified/executed as 
intended. Convert the class to a consistent testing style (preferably JUnit5): 
use @Tag(MiscTests.TAG)/@Tag(SmallTests.TAG) instead of @Category, and remove 
`@ClassRule` (or migrate to the JUnit5 extension equivalent if needed).
   ```suggestion
   import org.apache.hadoop.hbase.HConstants;
   import org.apache.hadoop.hbase.KeyValue;
   import org.apache.hadoop.hbase.PrivateCellUtil;
   import org.apache.hadoop.hbase.testclassification.MiscTests;
   import org.apache.hadoop.hbase.testclassification.SmallTests;
   import org.junit.jupiter.api.BeforeEach;
   import org.junit.jupiter.api.Tag;
   import org.junit.jupiter.api.Test;
   
   @Tag(MiscTests.TAG)
   @Tag(SmallTests.TAG)
   public class TestRowColBloomHashKey {
   ```



##########
hbase-common/src/test/java/org/apache/hadoop/hbase/util/TestLittleEndianBytesBase.java:
##########
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.util;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.nio.ByteBuffer;
+import org.apache.hadoop.hbase.ByteBufferExtendedCell;
+import org.apache.hadoop.hbase.ByteBufferKeyValue;
+import org.apache.hadoop.hbase.Cell;
+import org.apache.hadoop.hbase.KeyValue;
+import org.junit.jupiter.api.Test;
+
+public abstract class TestLittleEndianBytesBase {
+
+  @Test
+  public void testToInt() {
+    byte[] b = generateByteArray(32);
+
+    for (int i = 0; i <= b.length - Integer.BYTES; i++) {
+      int expected = readIntLE(b, i);
+      assertEquals(expected, LittleEndianBytes.toInt(b, i));
+    }
+  }
+
+  @Test
+  public void testByteBufferToInt() {
+    byte[] b = generateByteArray(32);
+    ByteBuffer buf = ByteBuffer.wrap(b);
+
+    for (int i = 0; i <= b.length - Integer.BYTES; i++) {
+      int expected = readIntLE(b, i);
+      assertEquals(expected, LittleEndianBytes.toInt(buf, i));
+    }
+  }
+
+  @Test
+  public void testPutInt() {
+    byte[] b = new byte[16];
+
+    int offset = 5;
+    int value = 0x12345678;
+    LittleEndianBytes.putInt(b, offset, value);
+    int expected = readIntLE(b, offset);
+    assertEquals(value, expected);
+
+    offset += Integer.BYTES;
+    value = 0x9ABCDEF0;
+    LittleEndianBytes.putInt(b, offset, value);
+    expected = readIntLE(b, offset);
+    assertEquals(value, expected);
+  }
+
+  @Test
+  public void testGetRowAsIntFromByteBufferExtendedCell() {
+    Cell bbCell = createByteBufferExtendedCell();
+    byte[] row = bbCell.getRowArray();
+
+    for (int i = bbCell.getRowOffset(); i <= bbCell.getRowLength() - Integer.BYTES; i++) {
+      int expected = readIntLE(row, i);
+      assertEquals(expected, LittleEndianBytes.getRowAsInt(bbCell, i));
+    }
+  }
+
+  @Test
+  public void testGetRowAsIntFromCell() {
+    KeyValue cell = createCell();
+    byte[] row = cell.getRowArray();
+
+    for (int i = cell.getRowOffset(); i <= cell.getRowLength() - Integer.BYTES; i++) {
+      int expected = readIntLE(row, cell.getRowOffset() + i);
+      assertEquals(expected, LittleEndianBytes.getRowAsInt(cell, i));
+    }

Review Comment:
   testGetRowAsIntFromCell iterates using cell.getRowOffset() as if it were a 
row-relative index, but LittleEndianBytes.getRowAsInt expects an offset 
relative to the start of the row (0..rowLength-4). With KeyValue, getRowOffset 
is an absolute offset into the backing array, so this loop computes 
expected/actual from different positions and can go out of bounds. Iterate i 
from 0 to rowLength - Integer.BYTES and compute expected using 
cell.getRowOffset() + i.



##########
hbase-common/src/main/java/org/apache/hadoop/hbase/util/LittleEndianBytes.java:
##########
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.util;
+
+import java.nio.ByteBuffer;
+import org.apache.hadoop.hbase.ByteBufferExtendedCell;
+import org.apache.hadoop.hbase.Cell;
+import org.apache.hadoop.hbase.unsafe.HBasePlatformDependent;
+import org.apache.yetus.audience.InterfaceAudience;
+
+/**
+ * Utility methods for reading and writing little-endian integers from byte[] and ByteBuffer. Used
+ * by hashing components to perform fast, low-level LE conversions with optional Unsafe
+ * acceleration.
+ */
+@InterfaceAudience.Private
+public final class LittleEndianBytes {
+  final static boolean UNSAFE_UNALIGNED = HBasePlatformDependent.unaligned();
+
+  static abstract class Converter {
+    abstract int toInt(byte[] bytes, int offset);
+
+    abstract int toInt(ByteBuffer buffer, int offset);
+
+    abstract int putInt(byte[] bytes, int offset, int val);
+  }
+
+  static class ConverterHolder {
+    static final String UNSAFE_CONVERTER_NAME =
+      ConverterHolder.class.getName() + "$UnsafeConverter";
+    static final Converter BEST_CONVERTER = getBestConverter();
+
+    static Converter getBestConverter() {
+      try {
+        Class<? extends Converter> theClass =
+          Class.forName(UNSAFE_CONVERTER_NAME).asSubclass(Converter.class);
+        return theClass.getConstructor().newInstance();
+      } catch (Throwable t) {
+        return PureJavaConverter.INSTANCE;
+      }
+    }
+
+    static final class PureJavaConverter extends Converter {
+      static final PureJavaConverter INSTANCE = new PureJavaConverter();
+
+      private PureJavaConverter() {
+      }
+
+      @Override
+      int toInt(byte[] bytes, int offset) {
+        int n = 0;
+        for (int i = offset + 3; i >= offset; i--) {
+          n <<= 8;
+          n ^= (bytes[i] & 0xFF);
+        }
+        return n;
+      }
+
+      @Override
+      int toInt(ByteBuffer buffer, int offset) {
+        return Integer.reverseBytes(buffer.getInt(offset));

Review Comment:
   PureJavaConverter.toInt(ByteBuffer, offset) currently depends on the 
ByteBuffer's byte order (getInt honors buffer.order()). LittleEndianBytes 
should decode little-endian deterministically regardless of ByteBuffer order; 
otherwise callers that use a LITTLE_ENDIAN-ordered buffer will get incorrect 
results. Decode using per-byte reads or use a duplicate with explicit 
LITTLE_ENDIAN order (without mutating the original buffer).
   ```suggestion
           int b0 = buffer.get(offset) & 0xFF;
           int b1 = buffer.get(offset + 1) & 0xFF;
           int b2 = buffer.get(offset + 2) & 0xFF;
           int b3 = buffer.get(offset + 3) & 0xFF;
           return (b0) | (b1 << 8) | (b2 << 16) | (b3 << 24);
   ```



##########
hbase-common/src/main/java/org/apache/hadoop/hbase/util/RowColBloomHashKey.java:
##########
@@ -23,66 +23,149 @@
 import org.apache.yetus.audience.InterfaceAudience;
 
 /**
- * An hash key for ROWCOL bloom. This assumes the cells to be serialized in the Keyvalue
+ * A hash key for ROWCOL bloom. This assumes the cells to be serialized in the Keyvalue
  * serialization format with Empty column family. Note that the byte representing the family length
  * is considered to be 0
  */
 @InterfaceAudience.Private
 public class RowColBloomHashKey extends CellHashKey {
-
   private final int rowLength;
   private final int qualLength;
+  private final int totalLength;
 
   public RowColBloomHashKey(Cell cell) {
     super(cell);
     rowLength = cell.getRowLength();
    // We don't consider the family length for ROWCOL bloom. So subtract the famLen from the
     // length calculation. Timestamp and type are of no relevance here
     qualLength = cell.getQualifierLength();
+    // ROWCOL Bloom byte layout:
+    // <2B RK length> <RK> <1B CF length> <CQ> <8B TS> <1B TYPE>
+    totalLength = KeyValue.ROW_LENGTH_SIZE + rowLength + KeyValue.FAMILY_LENGTH_SIZE + qualLength
+      + KeyValue.TIMESTAMP_TYPE_SIZE;

Review Comment:
   This comment says timestamp/type are 'of no relevance', but the computed 
ROWCOL bloom layout and totalLength explicitly include timestamp+type bytes 
(and get/getIntLE also materialize them). Update the comment to reflect that 
timestamp/type are included with fixed values (LATEST_TS/MAX_TYPE) for 
compatibility, and clarify that only the column family bytes are omitted 
(family length byte is always 0).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to