This is an automated email from the ASF dual-hosted git repository.

bankim pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 0720f6d  [util] Add special handling for empty strings in FastHash
0720f6d is described below

commit 0720f6da1f7cb49b56bfc587c45e7221663ad4c6
Author: Bankim Bhavsar <[email protected]>
AuthorDate: Thu Apr 29 14:28:53 2021 -0700

    [util] Add special handling for empty strings in FastHash
    
    Impala uses special handling for empty strings while hashing
    
https://github.com/apache/impala/blob/master/be/src/runtime/raw-value.inline.h#L352
    
    This leads to discrepancy between hash values for empty
    strings between Impala and Kudu.
    This change basically matches the behavior with Impala.
    
    Changes to the implementation of hashing function for empty
    strings generates a different value and hence incorrect
    results across different Kudu client and server versions.
    To resolve this issue, a new feature flag BLOOM_FILTER_PREDICATE_V2 has
    been added and the older one BLOOM_FILTER_PREDICATE has been marked as
    deprecated.
    
    Since FastHash is not used for any persistent data structures
    so there are no backward compatibility issues that needs special
    handling during upgrade.
    
    Discovered bugs in the Java implementation when a negative
    byte is promoted to a long and left shifted. Discovery
    was because of the magic value for empty strings/buffer.
    Thankfully the Java implementation of the FastHash
    is not used currently.
    
    Also fixed a bug in predicate test where flush would be missed
    due to increase in number of values which affected TSAN.
    
    Change-Id: Idb2d401ab692decd5bc03b48a96d710546c0039e
    Reviewed-on: http://gerrit.cloudera.org:8080/17376
    Tested-by: Kudu Jenkins
    Reviewed-by: Grant Henke <[email protected]>
    Reviewed-by: Wenzhe Zhou <[email protected]>
    Reviewed-by: Andrew Wong <[email protected]>
---
 .../main/java/org/apache/kudu/util/HashUtil.java   | 40 ++++++++++++++--------
 .../java/org/apache/kudu/util/TestFashHash.java    | 26 ++++++++++++--
 src/kudu/client/CMakeLists.txt                     |  2 +-
 src/kudu/client/predicate-test.cc                  |  6 ++--
 src/kudu/client/scanner-internal.cc                |  4 +--
 src/kudu/tserver/tablet_service.cc                 |  4 +--
 src/kudu/tserver/tserver.proto                     |  4 +++
 src/kudu/util/hash_util-test.cc                    | 27 +++++++++++++--
 src/kudu/util/hash_util.h                          | 18 ++++++----
 9 files changed, 99 insertions(+), 32 deletions(-)

diff --git a/java/kudu-client/src/main/java/org/apache/kudu/util/HashUtil.java 
b/java/kudu-client/src/main/java/org/apache/kudu/util/HashUtil.java
index c2bfa20..10ecdc4 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/util/HashUtil.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/util/HashUtil.java
@@ -21,9 +21,9 @@ package org.apache.kudu.util;
  * Hash utility functions.
  */
 public class HashUtil {
-  // Constant imported from Apache Impala used to compute hash values for 
special cases.
-  // It's an arbitrary constant obtained by taking lower bytes of generated 
UUID. Helps
-  // distinguish NULL values from empty objects.
+  // Constants imported from Apache Impala used to compute hash values for 
special cases.
+  // They are arbitrary constant obtained by taking lower bytes of generated 
UUID. Helps
+  // distinguish NULL values and zero-length objects like empty strings.
   // Impala uses the direct BlockBloomFilter C++ API and inserts hash value 
directly using
   // its own implementation of the Fast hash. Hence the value must match with 
Impala.
   // Though Impala will use C++ API, keeping the implementation of the Fast 
hash algorithm
@@ -31,11 +31,19 @@ public class HashUtil {
   private static final int HASH_VAL_NULL = 0x58081667;
   private static final byte[] HASH_VAL_NULL_BYTE_BUF = new byte[4];
 
+  private static final int HASH_VAL_EMPTY = 0x7dca7eee;
+  private static final byte[] HASH_VAL_EMPTY_BYTE_BUF = new byte[4];
+
   static {
     HASH_VAL_NULL_BYTE_BUF[0] = (byte) (HASH_VAL_NULL >>> 0);
     HASH_VAL_NULL_BYTE_BUF[1] = (byte) (HASH_VAL_NULL >>> 8);
     HASH_VAL_NULL_BYTE_BUF[2] = (byte) (HASH_VAL_NULL >>> 16);
     HASH_VAL_NULL_BYTE_BUF[3] = (byte) (HASH_VAL_NULL >>> 24);
+
+    HASH_VAL_EMPTY_BYTE_BUF[0] = (byte) (HASH_VAL_EMPTY >>> 0);
+    HASH_VAL_EMPTY_BYTE_BUF[1] = (byte) (HASH_VAL_EMPTY >>> 8);
+    HASH_VAL_EMPTY_BYTE_BUF[2] = (byte) (HASH_VAL_EMPTY >>> 16);
+    HASH_VAL_EMPTY_BYTE_BUF[3] = (byte) (HASH_VAL_EMPTY >>> 24);
   }
 
   /** Non-constructable utility class. */
@@ -59,6 +67,8 @@ public class HashUtil {
     // case with nullable column values.
     if (buf == null) {
       buf = HASH_VAL_NULL_BYTE_BUF;
+    } else if (buf.length == 0) {
+      buf = HASH_VAL_EMPTY_BYTE_BUF;
     }
     final int len = buf.length;
     final long m = 0x880355f21e6d1965L;
@@ -68,11 +78,11 @@ public class HashUtil {
     int len8 = len / 8;
     for (int i = 0; i < len8; ++i) {
       int pos = i * 8;
-      v = buf[pos] +
-          ((long)buf[pos + 1] << 8) +  ((long)buf[pos + 2] << 16) +
-          ((long)buf[pos + 3] << 24) + ((long)buf[pos + 4] << 32) +
-          ((long)buf[pos + 5] << 40) + ((long)buf[pos + 6] << 48) +
-          ((long)buf[pos + 7] << 56);
+      v = (buf[pos] & 0xFF) |
+          ((long)(buf[pos + 1] & 0xFF) << 8) |  ((long)(buf[pos + 2] & 0xFF) 
<< 16) |
+          ((long)(buf[pos + 3] & 0xFF) << 24) | ((long)(buf[pos + 4] & 0xFF) 
<< 32) |
+          ((long)(buf[pos + 5] & 0xFF) << 40) | ((long)(buf[pos + 6] & 0xFF) 
<< 48) |
+          ((long)(buf[pos + 7] & 0xFF) << 56);
       h ^= fastHashMix(v);
       h *= m;
     }
@@ -82,25 +92,25 @@ public class HashUtil {
     //CHECKSTYLE:OFF
     switch (len & 7) {
       case 7:
-        v ^= (long)buf[pos2 + 6] << 48;
+        v ^= (long)(buf[pos2 + 6] & 0xFF) << 48;
       // fall through
       case 6:
-        v ^= (long)buf[pos2 + 5] << 40;
+        v ^= (long)(buf[pos2 + 5] & 0xFF) << 40;
       // fall through
       case 5:
-        v ^= (long)buf[pos2 + 4] << 32;
+        v ^= (long)(buf[pos2 + 4] & 0xFF) << 32;
       // fall through
       case 4:
-        v ^= (long)buf[pos2 + 3] << 24;
+        v ^= (long)(buf[pos2 + 3] & 0xFF) << 24;
       // fall through
       case 3:
-        v ^= (long)buf[pos2 + 2] << 16;
+        v ^= (long)(buf[pos2 + 2] & 0xFF) << 16;
       // fall through
       case 2:
-        v ^= (long)buf[pos2 + 1] << 8;
+        v ^= (long)(buf[pos2 + 1] & 0xFF) << 8;
       // fall through
       case 1:
-        v ^= buf[pos2];
+        v ^= (buf[pos2] & 0xFF);
         h ^= fastHashMix(v);
         h *= m;
     }
diff --git 
a/java/kudu-client/src/test/java/org/apache/kudu/util/TestFashHash.java 
b/java/kudu-client/src/test/java/org/apache/kudu/util/TestFashHash.java
index 731a0f2..3581b23 100644
--- a/java/kudu-client/src/test/java/org/apache/kudu/util/TestFashHash.java
+++ b/java/kudu-client/src/test/java/org/apache/kudu/util/TestFashHash.java
@@ -52,7 +52,19 @@ public class TestFashHash {
     assertEquals(Long.parseUnsignedLong("12680076593665652444"), hash);
 
     hash = HashUtil.fastHash64("".getBytes(UTF_8), 0);
-    assertEquals(0, hash);
+    assertEquals(Long.parseUnsignedLong("4144680785095980158"), hash);
+
+    hash = HashUtil.fastHash64("".getBytes(UTF_8), 1234);
+    assertEquals(Long.parseUnsignedLong("3296774803014270295"), hash);
+
+    // 15 byte buffer with negative and positive numbers help test the 8 byte 
loop iteration
+    // and the remainder of 7 bytes in the fast hash implementation.
+    byte[] buffer = new byte[15];
+    for (int i = 0; i < buffer.length; i++) {
+      buffer[i] = (byte)(i - 8);
+    }
+    hash = HashUtil.fastHash64(buffer, 0);
+    assertEquals(Long.parseUnsignedLong("7308577902719593318"), hash);
   }
 
   @Test
@@ -72,6 +84,16 @@ public class TestFashHash {
     assertEquals(Integer.parseUnsignedInt("842467426"), hash);
 
     hash = HashUtil.fastHash32("".getBytes(UTF_8), 0);
-    assertEquals(0, hash);
+    assertEquals(Integer.parseUnsignedInt("3045300040"), hash);
+
+    hash = HashUtil.fastHash32("".getBytes(UTF_8), 1234);
+    assertEquals(Integer.parseUnsignedInt("811548192"), hash);
+
+    byte[] buffer = new byte[15];
+    for (int i = 0; i < buffer.length; i++) {
+      buffer[i] = (byte)(i - 8);
+    }
+    hash = HashUtil.fastHash32(buffer, 0);
+    assertEquals(Integer.parseUnsignedInt("3815875205"), hash);
   }
 }
diff --git a/src/kudu/client/CMakeLists.txt b/src/kudu/client/CMakeLists.txt
index 5c07020..0681d75 100644
--- a/src/kudu/client/CMakeLists.txt
+++ b/src/kudu/client/CMakeLists.txt
@@ -275,5 +275,5 @@ SET_KUDU_TEST_LINK_LIBS(
 ADD_KUDU_TEST(client-test NUM_SHARDS 8 PROCESSORS 2
                           DATA_FILES ../scripts/first_argument.sh)
 ADD_KUDU_TEST(client-unittest)
-ADD_KUDU_TEST(predicate-test)
+ADD_KUDU_TEST(predicate-test NUM_SHARDS 4)
 ADD_KUDU_TEST(scan_token-test PROCESSORS 2)
diff --git a/src/kudu/client/predicate-test.cc 
b/src/kudu/client/predicate-test.cc
index becf186..b300d3a 100644
--- a/src/kudu/client/predicate-test.cc
+++ b/src/kudu/client/predicate-test.cc
@@ -1543,12 +1543,13 @@ TEST_P(ParameterizedBloomFilterPredicateTest, 
TestDisabledBloomFilterWithRepeate
   // Use a set of predetermined strings and populate the table.
   // Create a BF predicate with same set of strings.
   deque<string> values = {"Alice", "Bob", "Charlie", "Doug", "Elizabeth", 
"Frank", "George",
-                          "Harry"};
+                          "Harry", ""};
 
   // Populate table with a small set of strings that are repeated.
   static constexpr int kNumRows = 10000;
   auto upsert_rows = [&]() {
     int i = 0;
+    int last_flush = i;
     while (i < kNumRows) {
       for (int j = 0; j < values.size() && i < kNumRows; j++, i++) {
         const string &value = values[j];
@@ -1558,8 +1559,9 @@ TEST_P(ParameterizedBloomFilterPredicateTest, 
TestDisabledBloomFilterWithRepeate
         ASSERT_OK(session->Apply(upsert.release()));
       }
       // TSAN builds timeout and fail on flushing with large number of rows.
-      if ((i % (kNumRows / 10))  == 0) {
+      if (i - last_flush >= 1000) {
         ASSERT_OK(session->Flush());
+        last_flush = i;
       }
     }
     ASSERT_OK(session->Flush());
diff --git a/src/kudu/client/scanner-internal.cc 
b/src/kudu/client/scanner-internal.cc
index a56a8f3..717fb32 100644
--- a/src/kudu/client/scanner-internal.cc
+++ b/src/kudu/client/scanner-internal.cc
@@ -355,7 +355,7 @@ ScanRpcStatus KuduScanner::Data::SendScanRpc(const 
MonoTime& overall_deadline,
   // Capture previously sent Bloom filter predicate feature flag so that we 
don't have to make
   // expensive call to determine the flag on scan continuations.
   bool prev_bloom_filter_feature = 
ContainsKey(controller_.required_server_features(),
-                                               
TabletServerFeatures::BLOOM_FILTER_PREDICATE);
+                                               
TabletServerFeatures::BLOOM_FILTER_PREDICATE_V2);
 
   controller_.Reset();
   controller_.set_deadline(rpc_deadline);
@@ -364,7 +364,7 @@ ScanRpcStatus KuduScanner::Data::SendScanRpc(const 
MonoTime& overall_deadline,
     if (prev_bloom_filter_feature ||
         (next_req_.has_new_scan_request() &&
          configuration().spec().ContainsBloomFilterPredicate())) {
-      
controller_.RequireServerFeature(TabletServerFeatures::BLOOM_FILTER_PREDICATE);
+      
controller_.RequireServerFeature(TabletServerFeatures::BLOOM_FILTER_PREDICATE_V2);
     }
   }
   if (configuration().row_format_flags() & 
KuduScanner::PAD_UNIXTIME_MICROS_TO_16_BYTES) {
diff --git a/src/kudu/tserver/tablet_service.cc 
b/src/kudu/tserver/tablet_service.cc
index 8adc52b..eec5b3c 100644
--- a/src/kudu/tserver/tablet_service.cc
+++ b/src/kudu/tserver/tablet_service.cc
@@ -1395,7 +1395,7 @@ bool TabletServiceAdminImpl::SupportsFeature(uint32_t 
feature) const {
     case TabletServerFeatures::COLUMN_PREDICATES:
     case TabletServerFeatures::PAD_UNIXTIME_MICROS_TO_16_BYTES:
     case TabletServerFeatures::QUIESCING:
-    case TabletServerFeatures::BLOOM_FILTER_PREDICATE:
+    case TabletServerFeatures::BLOOM_FILTER_PREDICATE_V2:
     // TODO(awong): once transactions are useable, add a feature flag.
       return true;
     default:
@@ -2525,7 +2525,7 @@ bool TabletServiceImpl::SupportsFeature(uint32_t feature) 
const {
     case TabletServerFeatures::COLUMN_PREDICATES:
     case TabletServerFeatures::PAD_UNIXTIME_MICROS_TO_16_BYTES:
     case TabletServerFeatures::QUIESCING:
-    case TabletServerFeatures::BLOOM_FILTER_PREDICATE:
+    case TabletServerFeatures::BLOOM_FILTER_PREDICATE_V2:
     case TabletServerFeatures::COLUMNAR_LAYOUT_FEATURE:
       return true;
     default:
diff --git a/src/kudu/tserver/tserver.proto b/src/kudu/tserver/tserver.proto
index a603975..a5d0a9e 100644
--- a/src/kudu/tserver/tserver.proto
+++ b/src/kudu/tserver/tserver.proto
@@ -494,7 +494,11 @@ enum TabletServerFeatures {
   // Whether the server supports padding UNIXTIME_MICROS slots to 16 bytes.
   PAD_UNIXTIME_MICROS_TO_16_BYTES = 2;
   QUIESCING = 3;
+  // Deprecated. See note below for BLOOM_FILTER_PREDICATE_V2.
   BLOOM_FILTER_PREDICATE = 4;
   // Whether the server supports the COLUMNAR_LAYOUT format flag.
   COLUMNAR_LAYOUT_FEATURE = 5;
+  // Update to implementation of Fast hash for Bloom filter predicate leads
+  // to incorrect results if incompatible client and server versions are used.
+  BLOOM_FILTER_PREDICATE_V2 = 6;
 }
diff --git a/src/kudu/util/hash_util-test.cc b/src/kudu/util/hash_util-test.cc
index 1e9298e..d42d379 100644
--- a/src/kudu/util/hash_util-test.cc
+++ b/src/kudu/util/hash_util-test.cc
@@ -17,6 +17,7 @@
 
 #include "kudu/util/hash_util.h"
 
+#include <array>
 #include <cstdint>
 
 #include <gtest/gtest.h>
@@ -62,7 +63,19 @@ TEST(HashUtilTest, TestFastHash64) {
   ASSERT_EQ(12680076593665652444UL, hash);
 
   hash = HashUtil::FastHash64("", 0, 0);
-  ASSERT_EQ(0, hash);
+  ASSERT_EQ(4144680785095980158UL, hash);
+
+  hash = HashUtil::FastHash64("", 0, 1234);
+  ASSERT_EQ(3296774803014270295UL, hash);
+
+  // 15 byte buffer with negative and positive numbers help test the 8 byte 
loop iteration
+  // and the remainder of 7 bytes in the fast hash implementation.
+  std::array<char, 15> buffer;
+  for (int i = 0; i < buffer.size(); i++) {
+    buffer[i] = static_cast<char>(i - 8);
+  }
+  hash = HashUtil::FastHash64(buffer.data(), buffer.size(), 0);
+  ASSERT_EQ(7308577902719593318, hash);
 }
 
 TEST(HashUtilTest, TestFastHash32) {
@@ -81,7 +94,17 @@ TEST(HashUtilTest, TestFastHash32) {
   ASSERT_EQ(842467426U, hash);
 
   hash = HashUtil::FastHash32("", 0, 0);
-  ASSERT_EQ(0, hash);
+  ASSERT_EQ(3045300040U, hash);
+
+  hash = HashUtil::FastHash32("", 0, 1234);
+  ASSERT_EQ(811548192U, hash);
+
+  std::array<char, 15> buffer;
+  for (int i = 0; i < buffer.size(); i++) {
+    buffer[i] = static_cast<char>(i - 8);
+  }
+  hash = HashUtil::FastHash32(buffer.data(), buffer.size(), 0);
+  ASSERT_EQ(3815875205, hash);
 }
 
 TEST(HashUtilTest, TestComputeHash32Available) {
diff --git a/src/kudu/util/hash_util.h b/src/kudu/util/hash_util.h
index 1664abd..bb4a8e8 100644
--- a/src/kudu/util/hash_util.h
+++ b/src/kudu/util/hash_util.h
@@ -29,9 +29,9 @@
 
 namespace kudu {
 
-// Constant imported from Apache Impala used to compute hash values for 
special cases. It's an
-// arbitrary constant obtained by taking lower bytes of generated UUID. Helps 
distinguish NULL
-// values from empty objects.
+// Constants imported from Apache Impala used to compute hash values for 
special cases.
+// They are arbitrary constants obtained by taking lower bytes of generated 
UUID.
+// Helps distinguish NULL values and zero-length objects like empty strings.
 // Impala uses the direct BlockBloomFilter API and inserts hash value directly 
using its own
 // implementation of the Fast hash. Hence the value must match with Impala.
 //
@@ -39,6 +39,7 @@ namespace kudu {
 //       a member variable of HashUtil requires an explicit definition in .cc 
file
 //       and this class is completely defined in the header file to allow 
inlining.
 static constexpr uint32_t kHashValNull = 0x58081667;
+static constexpr uint32_t kHashValEmpty = 0x7dca7eee;
 
 // Utility class to compute hash values.
 class HashUtil {
@@ -86,17 +87,22 @@ class HashUtil {
   // FastHash is simple, robust, and efficient general-purpose hash function 
from Google.
   // Implementation is adapted from 
https://code.google.com/archive/p/fast-hash/
   //
-  // Adds special handling for nullptr input.
+  // Adds special handling for nullptr and zero-length input.
   //
   // Compute 64-bit FastHash.
   ATTRIBUTE_NO_SANITIZE_INTEGER
   static uint64_t FastHash64(const void* buf, size_t len, uint64_t seed) {
-    // Special handling for nullptr input with possible non-zero length as 
could be the
-    // case with nullable column values.
     if (buf == nullptr) {
+      // Special handling for nullptr input with possible non-zero length as 
could be the
+      // case with nullable column values.
       buf = &kHashValNull;
       len = sizeof(kHashValNull);
+    } else if (len == 0) {
+      // Special handling for zero-length input which is the case for empty 
strings.
+      buf = &kHashValEmpty;
+      len = sizeof(kHashValEmpty);
     }
+
     static constexpr uint64_t kMultiplier = 0x880355f21e6d1965UL;
     const uint64_t* pos = static_cast<const uint64_t*>(buf);
     const uint64_t* end = pos + (len / 8);

Reply via email to