This is an automated email from the ASF dual-hosted git repository. ravindra pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new a4480f7 ARROW-6030: [Java] Efficiently compute hash code for ArrowBufPointer a4480f7 is described below commit a4480f7ebce639c950695f80bdbea1d94bd3f8f6 Author: liyafan82 <liya.fa...@gmail.com> AuthorDate: Mon Aug 5 12:00:46 2019 +0530 ARROW-6030: [Java] Efficiently compute hash code for ArrowBufPointer As ArrowBufHasher is introduced, we can compute the hash code of a continuous region within an ArrowBuf. We optimize the process to make it efficient to avoid recomputation. Closes #4939 from liyafan82/fly_0725_pthash1 and squashes the following commits: a73432545 <liyafan82> Efficiently compute hash code for ArrowBufPointer Authored-by: liyafan82 <liya.fa...@gmail.com> Signed-off-by: Pindikura Ravindra <ravin...@dremio.com> --- .../apache/arrow/memory/util/ArrowBufPointer.java | 64 +++++++++- .../arrow/memory/util/hash/DirectHasher.java | 5 + .../arrow/memory/util/TestArrowBufPointer.java | 131 ++++++++++++++++++++- 3 files changed, 197 insertions(+), 3 deletions(-) diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/ArrowBufPointer.java b/java/memory/src/main/java/org/apache/arrow/memory/util/ArrowBufPointer.java index cafe4b0..3d0bf7b 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/util/ArrowBufPointer.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/util/ArrowBufPointer.java @@ -17,6 +17,10 @@ package org.apache.arrow.memory.util; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.memory.util.hash.DirectHasher; +import org.apache.arrow.util.Preconditions; + import io.netty.buffer.ArrowBuf; /** @@ -25,17 +29,40 @@ import io.netty.buffer.ArrowBuf; */ public final class ArrowBufPointer { + /** + * The hash code when the arrow buffer is null. + */ + public static final int NULL_HASH_CODE = 0; + private ArrowBuf buf; private int offset; private int length; + private int hashCode = NULL_HASH_CODE; + + private final ArrowBufHasher hasher; + + /** + * A flag indicating if the underlying memory region has changed. + */ + private boolean hashCodeChanged = false; + /** * The default constructor. */ public ArrowBufPointer() { + this(DirectHasher.INSTANCE); + } + /** + * Constructs an arrow buffer pointer with the specified hasher. + * @param hasher the hasher to use. + */ + public ArrowBufPointer(ArrowBufHasher hasher) { + Preconditions.checkNotNull(hasher); + this.hasher = hasher; } /** @@ -45,6 +72,19 @@ public final class ArrowBufPointer { * @param length the length off set of the memory region pointed to. */ public ArrowBufPointer(ArrowBuf buf, int offset, int length) { + this(buf, offset, length, DirectHasher.INSTANCE); + } + + /** + * Constructs an Arrow buffer pointer. + * @param buf the underlying {@link ArrowBuf}, which can be null. + * @param offset the start off set of the memory region pointed to. + * @param length the length off set of the memory region pointed to. + * @param hasher the hasher used to calculate the hash code. + */ + public ArrowBufPointer(ArrowBuf buf, int offset, int length, ArrowBufHasher hasher) { + Preconditions.checkNotNull(hasher); + this.hasher = hasher; set(buf, offset, length); } @@ -58,6 +98,8 @@ public final class ArrowBufPointer { this.buf = buf; this.offset = offset; this.length = length; + + hashCodeChanged = true; } /** @@ -85,6 +127,13 @@ public final class ArrowBufPointer { return false; } + if (!hasher.equals(((ArrowBufPointer) o).hasher)) { + // note that the hasher is incorporated in equality determination + // this is to avoid problems in cases where two Arrow buffer pointers are not equal + // while having different hashers and equal hash codes. + return false; + } + ArrowBufPointer other = (ArrowBufPointer) o; if (buf == null || other.buf == null) { if (buf == null && other.buf == null) { @@ -100,7 +149,18 @@ public final class ArrowBufPointer { @Override public int hashCode() { - // implement after ARROW-5898 - throw new UnsupportedOperationException(); + if (!hashCodeChanged) { + return hashCode; + } + + // re-compute the hash code + if (buf == null) { + hashCode = NULL_HASH_CODE; + } else { + hashCode = hasher.hashCode(buf, offset, length); + } + + hashCodeChanged = false; + return hashCode; } } diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java index 18f39c3..aa889e6 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java @@ -79,4 +79,9 @@ public class DirectHasher extends ArrowBufHasher { return hash; } + + @Override + public boolean equals(Object obj) { + return obj != null && this.getClass() == obj.getClass(); + } } diff --git a/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufPointer.java b/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufPointer.java index f9d738e..36b78af 100644 --- a/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufPointer.java +++ b/java/memory/src/test/java/org/apache/arrow/memory/util/TestArrowBufPointer.java @@ -17,12 +17,15 @@ package org.apache.arrow.memory.util; +import static junit.framework.TestCase.assertEquals; import static junit.framework.TestCase.assertTrue; - +import static org.junit.jupiter.api.Assertions.assertFalse; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.memory.util.hash.DirectHasher; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -67,4 +70,130 @@ public class TestArrowBufPointer { } } } + + @Test + public void testArrowBufPointersHashCode() { + final int vectorLength = 100; + try (ArrowBuf buf1 = allocator.buffer(vectorLength * 4); + ArrowBuf buf2 = allocator.buffer(vectorLength * 4)) { + for (int i = 0; i < vectorLength; i++) { + buf1.setInt(i * 4, i); + buf2.setInt(i * 4, i); + } + + CounterHasher hasher1 = new CounterHasher(); + CounterHasher hasher2 = new CounterHasher(); + + ArrowBufPointer pointer1 = new ArrowBufPointer(hasher1); + assertEquals(ArrowBufPointer.NULL_HASH_CODE, pointer1.hashCode()); + + ArrowBufPointer pointer2 = new ArrowBufPointer(hasher2); + assertEquals(ArrowBufPointer.NULL_HASH_CODE, pointer2.hashCode()); + + for (int i = 0; i < vectorLength; i++) { + pointer1.set(buf1, i * 4, 4); + pointer2.set(buf2, i * 4, 4); + + assertEquals(pointer1.hashCode(), pointer2.hashCode()); + + // verify that the hash codes have been re-computed + assertEquals(hasher1.counter, i + 1); + assertEquals(hasher2.counter, i + 1); + } + } + } + + @Test + public void testNullPointersHashCode() { + ArrowBufPointer pointer = new ArrowBufPointer(); + assertEquals(ArrowBufPointer.NULL_HASH_CODE, pointer.hashCode()); + + pointer.set(null, 0, 0); + assertEquals(ArrowBufPointer.NULL_HASH_CODE, pointer.hashCode()); + } + + @Test + public void testReuseHashCode() { + try (ArrowBuf buf = allocator.buffer(10)) { + buf.setInt(0, 10); + buf.setInt(4, 20); + + CounterHasher hasher = new CounterHasher(); + ArrowBufPointer pointer = new ArrowBufPointer(hasher); + + pointer.set(buf, 0, 4); + pointer.hashCode(); + + // hash code computed + assertEquals(1, hasher.counter); + + // no hash code re-compute + pointer.hashCode(); + assertEquals(1, hasher.counter); + + // hash code re-computed + pointer.set(buf, 4, 4); + pointer.hashCode(); + assertEquals(2, hasher.counter); + } + } + + @Test + public void testHashersForEquality() { + try (ArrowBuf buf = allocator.buffer(10)) { + // pointer 1 uses the default hasher + ArrowBufPointer pointer1 = new ArrowBufPointer(buf, 0, 10); + + // pointer 2 uses the counter hasher + ArrowBufPointer pointer2 = new ArrowBufPointer(buf, 0, 10, new CounterHasher()); + + // the two pointers cannot be equal, since they have different hashers + assertFalse(pointer1.equals(pointer2)); + } + } + + /** + * Hasher with a counter that increments each time a hash code is calculated. + * This is to validate that the hash code in {@link ArrowBufPointer} is reused. + */ + class CounterHasher extends ArrowBufHasher { + + protected int counter = 0; + + @Override + protected int combineHashCode(int currentHashCode, int newHashCode) { + return 0; + } + + @Override + protected int getByteHashCode(byte byteValue) { + return 0; + } + + @Override + protected int getIntHashCode(int intValue) { + return 0; + } + + @Override + protected int getLongHashCode(long longValue) { + return 0; + } + + @Override + protected int finalizeHashCode(int hashCode) { + return 0; + } + + @Override + public int hashCode(ArrowBuf buf, int offset, int length) { + counter += 1; + return DirectHasher.INSTANCE.hashCode(buf, offset, length); + } + + @Override + public boolean equals(Object o) { + return o != null && this.getClass() == o.getClass(); + } + } }