Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/19222#discussion_r176985334
--- Diff:
common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java ---
@@ -49,49 +51,70 @@ public static int hashInt(int input, int seed) {
}
public int hashUnsafeWords(Object base, long offset, int lengthInBytes) {
- return hashUnsafeWords(base, offset, lengthInBytes, seed);
+ return hashUnsafeWordsBlock(MemoryBlock.allocateFromObject(base,
offset, lengthInBytes), seed);
}
- public static int hashUnsafeWords(Object base, long offset, int
lengthInBytes, int seed) {
+ public static int hashUnsafeWordsBlock(MemoryBlock base, int seed) {
// This is based on Guava's
`Murmur32_Hasher.processRemaining(ByteBuffer)` method.
+ int lengthInBytes = Ints.checkedCast(base.size());
assert (lengthInBytes % 8 == 0): "lengthInBytes must be a multiple of
8 (word-aligned)";
- int h1 = hashBytesByInt(base, offset, lengthInBytes, seed);
+ int h1 = hashBytesByIntBlock(base, seed);
return fmix(h1, lengthInBytes);
}
- public static int hashUnsafeBytes(Object base, long offset, int
lengthInBytes, int seed) {
+ public static int hashUnsafeWords(Object base, long offset, int
lengthInBytes, int seed) {
+ // This is based on Guava's
`Murmur32_Hasher.processRemaining(ByteBuffer)` method.
+ assert (lengthInBytes % 8 == 0): "lengthInBytes must be a multiple of
8 (word-aligned)";
+ return hashUnsafeWordsBlock(MemoryBlock.allocateFromObject(base,
offset, lengthInBytes), seed);
+ }
+
+ public static int hashUnsafeBytesBlock(MemoryBlock base, int seed) {
// This is not compatible with the original and other implementations.
// But it is kept for backward compatibility with components that
existed before 2.3.
+ long offset = base.getBaseOffset();
+ long lengthInBytes = base.size();
assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
- int lengthAligned = lengthInBytes - lengthInBytes % 4;
- int h1 = hashBytesByInt(base, offset, lengthAligned, seed);
- for (int i = lengthAligned; i < lengthInBytes; i++) {
- int halfWord = Platform.getByte(base, offset + i);
+ long lengthAligned = lengthInBytes - lengthInBytes % 4;
+ int h1 = hashBytesByIntBlock(base.subBlock(0, lengthAligned), seed);
+ for (long i = lengthAligned; i < lengthInBytes; i++) {
+ int halfWord = base.getByte(offset + i);
int k1 = mixK1(halfWord);
h1 = mixH1(h1, k1);
}
- return fmix(h1, lengthInBytes);
+ return fmix(h1, Ints.checkedCast(lengthInBytes));
+ }
+
+ public static int hashUnsafeBytes(Object base, long offset, int
lengthInBytes, int seed) {
+ return hashUnsafeBytesBlock(MemoryBlock.allocateFromObject(base,
offset, lengthInBytes), seed);
}
public static int hashUnsafeBytes2(Object base, long offset, int
lengthInBytes, int seed) {
+ return hashUnsafeBytes2Block(MemoryBlock.allocateFromObject(base,
offset, lengthInBytes), seed);
+ }
+
+ public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) {
// This is compatible with the original and other implementations.
// Use this method for new components added after Spark 2.3.
- assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
- int lengthAligned = lengthInBytes - lengthInBytes % 4;
- int h1 = hashBytesByInt(base, offset, lengthAligned, seed);
+ long offset = base.getBaseOffset();
+ long lengthInBytes = base.size();
--- End diff --
ditto
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]