spark git commit: [SPARK-23381][CORE] Murmur3 hash generates a different value from other implementations

2018-02-16 Thread lixiao
Repository: spark
Updated Branches:
  refs/heads/master 0a73aa31f -> d5ed2108d


[SPARK-23381][CORE] Murmur3 hash generates a different value from other 
implementations

## What changes were proposed in this pull request?
Murmur3 hash generates a different value from the original and other 
implementations (such as the Scala standard library and Guava) when the length 
of the byte array is not a multiple of 4.

## How was this patch tested?
Added a unit test.

**Note: When we merge this PR, please give all the credit to Shintaro 
Murakami.**

Author: Shintaro Murakami 

Author: gatorsmile 
Author: Shintaro Murakami 

Closes #20630 from gatorsmile/pr-20568.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d5ed2108
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d5ed2108
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d5ed2108

Branch: refs/heads/master
Commit: d5ed2108d32e1d95b26ee7fed39e8a733e935e2c
Parents: 0a73aa3
Author: Shintaro Murakami 
Authored: Fri Feb 16 17:17:55 2018 -0800
Committer: gatorsmile 
Committed: Fri Feb 16 17:17:55 2018 -0800

--
 .../spark/util/sketch/Murmur3_x86_32.java   | 16 ++
 .../spark/unsafe/hash/Murmur3_x86_32.java   | 16 ++
 .../spark/unsafe/hash/Murmur3_x86_32Suite.java  | 19 +++
 .../apache/spark/ml/feature/FeatureHasher.scala | 33 +++-
 .../apache/spark/mllib/feature/HashingTF.scala  |  2 +-
 .../spark/ml/feature/FeatureHasherSuite.scala   | 11 ++-
 python/pyspark/ml/feature.py|  4 +--
 7 files changed, 96 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d5ed2108/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java
--
diff --git 
a/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java 
b/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java
index a61ce4f..e83b331 100644
--- 
a/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java
+++ 
b/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java
@@ -60,6 +60,8 @@ final class Murmur3_x86_32 {
   }
 
   public static int hashUnsafeBytes(Object base, long offset, int 
lengthInBytes, int seed) {
+// This is not compatible with original and another implementations.
+// But remain it for backward compatibility for the components existing 
before 2.3.
 assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
 int lengthAligned = lengthInBytes - lengthInBytes % 4;
 int h1 = hashBytesByInt(base, offset, lengthAligned, seed);
@@ -71,6 +73,20 @@ final class Murmur3_x86_32 {
 return fmix(h1, lengthInBytes);
   }
 
+  public static int hashUnsafeBytes2(Object base, long offset, int 
lengthInBytes, int seed) {
+// This is compatible with original and another implementations.
+// Use this method for new components after Spark 2.3.
+assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
+int lengthAligned = lengthInBytes - lengthInBytes % 4;
+int h1 = hashBytesByInt(base, offset, lengthAligned, seed);
+int k1 = 0;
+for (int i = lengthAligned, shift = 0; i < lengthInBytes; i++, shift += 8) 
{
+  k1 ^= (Platform.getByte(base, offset + i) & 0xFF) << shift;
+}
+h1 ^= mixK1(k1);
+return fmix(h1, lengthInBytes);
+  }
+
   private static int hashBytesByInt(Object base, long offset, int 
lengthInBytes, int seed) {
 assert (lengthInBytes % 4 == 0);
 int h1 = seed;

http://git-wip-us.apache.org/repos/asf/spark/blob/d5ed2108/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
--
diff --git 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
index 5e7ee48..d239de6 100644
--- 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
+++ 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
@@ -60,6 +60,8 @@ public final class Murmur3_x86_32 {
   }
 
   public static int hashUnsafeBytes(Object base, long offset, int 
lengthInBytes, int seed) {
+// This is not compatible with original and another implementations.
+// But remain it for backward compatibility for the components existing 
before 2.3.
 assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
 int lengthAligned = lengthInBytes - lengthInBytes % 4;
 int h1 = hashBytesByInt(base, offset, lengthAligned, seed);
@@ -71,6 +73,20 @@ public final class 

spark git commit: [SPARK-23381][CORE] Murmur3 hash generates a different value from other implementations

2018-02-16 Thread lixiao
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 ccb0a59d7 -> 8360da071


[SPARK-23381][CORE] Murmur3 hash generates a different value from other 
implementations

## What changes were proposed in this pull request?
Murmur3 hash generates a different value from the original and other 
implementations (such as the Scala standard library and Guava) when the length 
of the byte array is not a multiple of 4.

## How was this patch tested?
Added a unit test.

**Note: When we merge this PR, please give all the credit to Shintaro 
Murakami.**

Author: Shintaro Murakami 

Author: gatorsmile 
Author: Shintaro Murakami 

Closes #20630 from gatorsmile/pr-20568.

(cherry picked from commit d5ed2108d32e1d95b26ee7fed39e8a733e935e2c)
Signed-off-by: gatorsmile 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8360da07
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8360da07
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8360da07

Branch: refs/heads/branch-2.3
Commit: 8360da07110d847a01b243e6d786922a5057ad9f
Parents: ccb0a59
Author: Shintaro Murakami 
Authored: Fri Feb 16 17:17:55 2018 -0800
Committer: gatorsmile 
Committed: Fri Feb 16 17:18:15 2018 -0800

--
 .../spark/util/sketch/Murmur3_x86_32.java   | 16 ++
 .../spark/unsafe/hash/Murmur3_x86_32.java   | 16 ++
 .../spark/unsafe/hash/Murmur3_x86_32Suite.java  | 19 +++
 .../apache/spark/ml/feature/FeatureHasher.scala | 33 +++-
 .../apache/spark/mllib/feature/HashingTF.scala  |  2 +-
 .../spark/ml/feature/FeatureHasherSuite.scala   | 11 ++-
 python/pyspark/ml/feature.py|  4 +--
 7 files changed, 96 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8360da07/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java
--
diff --git 
a/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java 
b/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java
index a61ce4f..e83b331 100644
--- 
a/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java
+++ 
b/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java
@@ -60,6 +60,8 @@ final class Murmur3_x86_32 {
   }
 
   public static int hashUnsafeBytes(Object base, long offset, int 
lengthInBytes, int seed) {
+// This is not compatible with original and another implementations.
+// But remain it for backward compatibility for the components existing 
before 2.3.
 assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
 int lengthAligned = lengthInBytes - lengthInBytes % 4;
 int h1 = hashBytesByInt(base, offset, lengthAligned, seed);
@@ -71,6 +73,20 @@ final class Murmur3_x86_32 {
 return fmix(h1, lengthInBytes);
   }
 
+  public static int hashUnsafeBytes2(Object base, long offset, int 
lengthInBytes, int seed) {
+// This is compatible with original and another implementations.
+// Use this method for new components after Spark 2.3.
+assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
+int lengthAligned = lengthInBytes - lengthInBytes % 4;
+int h1 = hashBytesByInt(base, offset, lengthAligned, seed);
+int k1 = 0;
+for (int i = lengthAligned, shift = 0; i < lengthInBytes; i++, shift += 8) 
{
+  k1 ^= (Platform.getByte(base, offset + i) & 0xFF) << shift;
+}
+h1 ^= mixK1(k1);
+return fmix(h1, lengthInBytes);
+  }
+
   private static int hashBytesByInt(Object base, long offset, int 
lengthInBytes, int seed) {
 assert (lengthInBytes % 4 == 0);
 int h1 = seed;

http://git-wip-us.apache.org/repos/asf/spark/blob/8360da07/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
--
diff --git 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
index 5e7ee48..d239de6 100644
--- 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
+++ 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
@@ -60,6 +60,8 @@ public final class Murmur3_x86_32 {
   }
 
   public static int hashUnsafeBytes(Object base, long offset, int 
lengthInBytes, int seed) {
+// This is not compatible with original and another implementations.
+// But remain it for backward compatibility for the components existing 
before 2.3.
 assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
 int lengthAligned =