This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 270746e845317ed2d0a53acec94a1877798e9d88
Author: Bankim Bhavsar <[email protected]>
AuthorDate: Mon Mar 30 11:24:37 2020 -0700

    [util] Add special handling for nullptr in fast hash
    
    Importing special handling for nullptr from Impala that helps distinguish
    between nullptr with possibly non-zero length input and empty object.
    
https://github.com/apache/impala/blob/master/be/src/runtime/raw-value.inline.h#L40
    
https://github.com/apache/impala/blob/master/be/src/runtime/raw-value-ir.cc#L179
    
    Fast hash already handles zero-length input and hence no special handling
    for zero length input.
    
    Fast hash is currently only used for BlockBloomFilter and hence no backward
    compatibility concerns. Don't plan to use Murmur2 hash with BlockBloomFilter
    and hence not importing the same logic for Murmur2 or other hash functions.
    Importing the same logic for other hash functions will require taking 
backward
    compatibility into account.
    
    Additionally this change updates Java implementation of Fast hash
    to drop the explicit "len" parameter as arrays in Java include length.
    
    Change-Id: Idf1ccff3dde7ccf54c4c2c6c2910915c69153316
    Reviewed-on: http://gerrit.cloudera.org:8080/15600
    Reviewed-by: Wenzhe Zhou <[email protected]>
    Reviewed-by: Adar Dembo <[email protected]>
    Tested-by: Adar Dembo <[email protected]>
---
 .../main/java/org/apache/kudu/util/HashUtil.java   | 33 ++++++++++++++++++----
 .../java/org/apache/kudu/util/TestFashHash.java    | 24 ++++++++++++----
 src/kudu/util/hash_util-test.cc                    | 12 ++++++++
 src/kudu/util/hash_util.h                          | 24 ++++++++++++++--
 4 files changed, 79 insertions(+), 14 deletions(-)

diff --git a/java/kudu-client/src/main/java/org/apache/kudu/util/HashUtil.java 
b/java/kudu-client/src/main/java/org/apache/kudu/util/HashUtil.java
index d8dc526..9d27d4d 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/util/HashUtil.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/util/HashUtil.java
@@ -21,18 +21,42 @@ package org.apache.kudu.util;
  * Hash utility functions.
  */
 public class HashUtil {
+  // Constant imported from Apache Impala used to compute hash values for 
special cases.
+  // It's an arbitrary constant obtained by taking lower bytes of generated 
UUID. Helps
+  // distinguish NULL values from empty objects.
+  // Impala uses the direct BlockBloomFilter C++ API and inserts hash value 
directly using
+  // its own implementation of the Fast hash. Hence the value must match with 
Impala.
+  // Though Impala will use C++ API, keeping the implementation of the Fast 
hash algorithm
+  // consistent across C++ and Java.
+  private static final int HASH_VAL_NULL = 0x58081667;
+  private static final byte[] HASH_VAL_NULL_BYTE_BUF = new byte[4];
+
+  static {
+    HASH_VAL_NULL_BYTE_BUF[0] = (byte) (HASH_VAL_NULL >>> 0);
+    HASH_VAL_NULL_BYTE_BUF[1] = (byte) (HASH_VAL_NULL >>> 8);
+    HASH_VAL_NULL_BYTE_BUF[2] = (byte) (HASH_VAL_NULL >>> 16);
+    HASH_VAL_NULL_BYTE_BUF[3] = (byte) (HASH_VAL_NULL >>> 24);
+  }
+
   /**
    * Compute 64-bit FastHash of the supplied data backed by byte array.
    *
    * FastHash is simple, robust, and efficient general-purpose hash function 
from Google.
    * Implementation is adapted from 
https://code.google.com/archive/p/fast-hash/
    *
+   * Adds special handling for null input.
+   *
    * @param buf the data to hash
-   * @param len length of the supplied data
    * @param seed seed to compute the hash
    * @return computed 64-bit hash value
    */
-  public static long fastHash64(final byte[] buf, int len, long seed) {
+  public static long fastHash64(byte[] buf, long seed) {
+    // Special handling for null input with possible non-zero length as could 
be the
+    // case with nullable column values.
+    if (buf == null) {
+      buf = HASH_VAL_NULL_BYTE_BUF;
+    }
+    final int len = buf.length;
     final long m = 0x880355f21e6d1965L;
     long h = seed ^ (len * m);
     long v;
@@ -89,15 +113,14 @@ public class HashUtil {
    * Implementation is adapted from 
https://code.google.com/archive/p/fast-hash/
    *
    * @param buf the data to compute the hash
-   * @param len length of the supplied data
    * @param seed seed to compute the hash
    * @return computed 32-bit hash value
    */
-  public static int fastHash32(final byte[] buf, int len, int seed) {
+  public static int fastHash32(byte[] buf, int seed) {
     // the following trick converts the 64-bit hashcode to Fermat
     // residue, which shall retain information from both the higher
     // and lower parts of hashcode.
-    long h = fastHash64(buf, len, seed);
+    long h = fastHash64(buf, seed);
     return (int)(h - (h >>> 32));
   }
 
diff --git 
a/java/kudu-client/src/test/java/org/apache/kudu/util/TestFashHash.java 
b/java/kudu-client/src/test/java/org/apache/kudu/util/TestFashHash.java
index fe08ba3..731a0f2 100644
--- a/java/kudu-client/src/test/java/org/apache/kudu/util/TestFashHash.java
+++ b/java/kudu-client/src/test/java/org/apache/kudu/util/TestFashHash.java
@@ -39,27 +39,39 @@ public class TestFashHash {
   public void testFastHash64() {
     long hash;
 
-    hash = HashUtil.fastHash64("ab".getBytes(UTF_8), 2, 0);
+    hash = HashUtil.fastHash64("ab".getBytes(UTF_8), 0);
     assertEquals(Long.parseUnsignedLong("17293172613997361769"), hash);
 
-    hash = HashUtil.fastHash64("abcdefg".getBytes(UTF_8), 7, 0);
+    hash = HashUtil.fastHash64("abcdefg".getBytes(UTF_8), 0);
     assertEquals(Long.parseUnsignedLong("10206404559164245992"), hash);
 
-    hash = HashUtil.fastHash64("quick brown fox".getBytes(UTF_8), 15, 42);
+    hash = HashUtil.fastHash64("quick brown fox".getBytes(UTF_8), 42);
     assertEquals(Long.parseUnsignedLong("3757424404558187042"), hash);
+
+    hash = HashUtil.fastHash64(null, 0);
+    assertEquals(Long.parseUnsignedLong("12680076593665652444"), hash);
+
+    hash = HashUtil.fastHash64("".getBytes(UTF_8), 0);
+    assertEquals(0, hash);
   }
 
   @Test
   public void testFastHash32() {
     int hash;
 
-    hash = HashUtil.fastHash32("ab".getBytes(UTF_8), 2, 0);
+    hash = HashUtil.fastHash32("ab".getBytes(UTF_8), 0);
     assertEquals(Integer.parseUnsignedInt("2564147595"), hash);
 
-    hash = HashUtil.fastHash32("abcdefg".getBytes(UTF_8), 7, 0);
+    hash = HashUtil.fastHash32("abcdefg".getBytes(UTF_8), 0);
     assertEquals(Integer.parseUnsignedInt("1497700618"), hash);
 
-    hash = HashUtil.fastHash32("quick brown fox".getBytes(UTF_8), 15, 42);
+    hash = HashUtil.fastHash32("quick brown fox".getBytes(UTF_8), 42);
     assertEquals(Integer.parseUnsignedInt("1676541068"), hash);
+
+    hash = HashUtil.fastHash32(null, 0);
+    assertEquals(Integer.parseUnsignedInt("842467426"), hash);
+
+    hash = HashUtil.fastHash32("".getBytes(UTF_8), 0);
+    assertEquals(0, hash);
   }
 }
diff --git a/src/kudu/util/hash_util-test.cc b/src/kudu/util/hash_util-test.cc
index 9fadfcf..1e9298e 100644
--- a/src/kudu/util/hash_util-test.cc
+++ b/src/kudu/util/hash_util-test.cc
@@ -57,6 +57,12 @@ TEST(HashUtilTest, TestFastHash64) {
 
   hash = HashUtil::FastHash64("quick brown fox", 15, 42);
   ASSERT_EQ(3757424404558187042UL, hash);
+
+  hash = HashUtil::FastHash64(nullptr, 0, 0);
+  ASSERT_EQ(12680076593665652444UL, hash);
+
+  hash = HashUtil::FastHash64("", 0, 0);
+  ASSERT_EQ(0, hash);
 }
 
 TEST(HashUtilTest, TestFastHash32) {
@@ -70,6 +76,12 @@ TEST(HashUtilTest, TestFastHash32) {
 
   hash = HashUtil::FastHash32("quick brown fox", 15, 42);
   ASSERT_EQ(1676541068U, hash);
+
+  hash = HashUtil::FastHash32(nullptr, 0, 0);
+  ASSERT_EQ(842467426U, hash);
+
+  hash = HashUtil::FastHash32("", 0, 0);
+  ASSERT_EQ(0, hash);
 }
 
 TEST(HashUtilTest, TestComputeHash32Available) {
diff --git a/src/kudu/util/hash_util.h b/src/kudu/util/hash_util.h
index 71055e1..1664abd 100644
--- a/src/kudu/util/hash_util.h
+++ b/src/kudu/util/hash_util.h
@@ -29,11 +29,22 @@
 
 namespace kudu {
 
-/// Utility class to compute hash values.
+// Constant imported from Apache Impala used to compute hash values for 
special cases. It's an
+// arbitrary constant obtained by taking lower bytes of generated UUID. Helps 
distinguish NULL
+// values from empty objects.
+// Impala uses the direct BlockBloomFilter API and inserts hash value directly 
using its own
+// implementation of the Fast hash. Hence the value must match with Impala.
+//
+// Note: Since address of this static constexpr variable is used, declaring 
this as
+//       a member variable of HashUtil requires an explicit definition in .cc 
file
+//       and this class is completely defined in the header file to allow 
inlining.
+static constexpr uint32_t kHashValNull = 0x58081667;
+
+// Utility class to compute hash values.
 class HashUtil {
  public:
 
-  /// Murmur2 hash implementation returning 64-bit hashes.
+  // Murmur2 hash implementation returning 64-bit hashes.
   ATTRIBUTE_NO_SANITIZE_INTEGER
   static uint64_t MurmurHash2_64(const void* input, int len, uint64_t seed) {
     static constexpr uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995UL;
@@ -75,9 +86,17 @@ class HashUtil {
   // FastHash is simple, robust, and efficient general-purpose hash function 
from Google.
   // Implementation is adapted from 
https://code.google.com/archive/p/fast-hash/
   //
+  // Adds special handling for nullptr input.
+  //
   // Compute 64-bit FastHash.
   ATTRIBUTE_NO_SANITIZE_INTEGER
   static uint64_t FastHash64(const void* buf, size_t len, uint64_t seed) {
+    // Special handling for nullptr input with possible non-zero length as 
could be the
+    // case with nullable column values.
+    if (buf == nullptr) {
+      buf = &kHashValNull;
+      len = sizeof(kHashValNull);
+    }
     static constexpr uint64_t kMultiplier = 0x880355f21e6d1965UL;
     const uint64_t* pos = static_cast<const uint64_t*>(buf);
     const uint64_t* end = pos + (len / 8);
@@ -131,7 +150,6 @@ class HashUtil {
   // Compute 32-bit hash of the supplied data using the specified hash 
algorithm.
   // Must be kept in sync with IsComputeHash32Available() function.
   static uint32_t ComputeHash32(const Slice& data, HashAlgorithm 
hash_algorithm, uint32_t seed) {
-    // TODO(bankim): Consider adding special handling for zero-length/NULL 
objects.
     switch (hash_algorithm) {
       case FAST_HASH:
         return FastHash32(data.data(), data.size(), seed);

Reply via email to