This is an automated email from the ASF dual-hosted git repository.
bankim pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 0720f6d [util] Add special handling for empty strings in FastHash
0720f6d is described below
commit 0720f6da1f7cb49b56bfc587c45e7221663ad4c6
Author: Bankim Bhavsar <[email protected]>
AuthorDate: Thu Apr 29 14:28:53 2021 -0700
[util] Add special handling for empty strings in FastHash
Impala uses special handling for empty strings while hashing
https://github.com/apache/impala/blob/master/be/src/runtime/raw-value.inline.h#L352
This leads to discrepancy between hash values for empty
strings between Impala and Kudu.
This change basically matches the behavior with Impala.
Changes to the implementation of hashing function for empty
strings generates a different value and hence incorrect
results across different Kudu client and server versions.
To resolve this issue, a new feature flag BLOOM_FILTER_PREDICATE_V2 has
been added and the older one BLOOM_FILTER_PREDICATE has been marked as
deprecated.
Since FastHash is not used for any persistent data structures
so there are no backward compatibility issues that needs special
handling during upgrade.
Discovered bugs in the Java implementation when a negative
byte is promoted to a long and left shifted. Discovery
was because of the magic value for empty strings/buffer.
Thankfully the Java implementation of the FastHash
is not used currently.
Also fixed a bug in predicate test where flush would be missed
due to increase in number of values which affected TSAN.
Change-Id: Idb2d401ab692decd5bc03b48a96d710546c0039e
Reviewed-on: http://gerrit.cloudera.org:8080/17376
Tested-by: Kudu Jenkins
Reviewed-by: Grant Henke <[email protected]>
Reviewed-by: Wenzhe Zhou <[email protected]>
Reviewed-by: Andrew Wong <[email protected]>
---
.../main/java/org/apache/kudu/util/HashUtil.java | 40 ++++++++++++++--------
.../java/org/apache/kudu/util/TestFashHash.java | 26 ++++++++++++--
src/kudu/client/CMakeLists.txt | 2 +-
src/kudu/client/predicate-test.cc | 6 ++--
src/kudu/client/scanner-internal.cc | 4 +--
src/kudu/tserver/tablet_service.cc | 4 +--
src/kudu/tserver/tserver.proto | 4 +++
src/kudu/util/hash_util-test.cc | 27 +++++++++++++--
src/kudu/util/hash_util.h | 18 ++++++----
9 files changed, 99 insertions(+), 32 deletions(-)
diff --git a/java/kudu-client/src/main/java/org/apache/kudu/util/HashUtil.java
b/java/kudu-client/src/main/java/org/apache/kudu/util/HashUtil.java
index c2bfa20..10ecdc4 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/util/HashUtil.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/util/HashUtil.java
@@ -21,9 +21,9 @@ package org.apache.kudu.util;
* Hash utility functions.
*/
public class HashUtil {
- // Constant imported from Apache Impala used to compute hash values for
special cases.
- // It's an arbitrary constant obtained by taking lower bytes of generated
UUID. Helps
- // distinguish NULL values from empty objects.
+ // Constants imported from Apache Impala used to compute hash values for
special cases.
+ // They are arbitrary constant obtained by taking lower bytes of generated
UUID. Helps
+ // distinguish NULL values and zero-length objects like empty strings.
// Impala uses the direct BlockBloomFilter C++ API and inserts hash value
directly using
// its own implementation of the Fast hash. Hence the value must match with
Impala.
// Though Impala will use C++ API, keeping the implementation of the Fast
hash algorithm
@@ -31,11 +31,19 @@ public class HashUtil {
private static final int HASH_VAL_NULL = 0x58081667;
private static final byte[] HASH_VAL_NULL_BYTE_BUF = new byte[4];
+ private static final int HASH_VAL_EMPTY = 0x7dca7eee;
+ private static final byte[] HASH_VAL_EMPTY_BYTE_BUF = new byte[4];
+
static {
HASH_VAL_NULL_BYTE_BUF[0] = (byte) (HASH_VAL_NULL >>> 0);
HASH_VAL_NULL_BYTE_BUF[1] = (byte) (HASH_VAL_NULL >>> 8);
HASH_VAL_NULL_BYTE_BUF[2] = (byte) (HASH_VAL_NULL >>> 16);
HASH_VAL_NULL_BYTE_BUF[3] = (byte) (HASH_VAL_NULL >>> 24);
+
+ HASH_VAL_EMPTY_BYTE_BUF[0] = (byte) (HASH_VAL_EMPTY >>> 0);
+ HASH_VAL_EMPTY_BYTE_BUF[1] = (byte) (HASH_VAL_EMPTY >>> 8);
+ HASH_VAL_EMPTY_BYTE_BUF[2] = (byte) (HASH_VAL_EMPTY >>> 16);
+ HASH_VAL_EMPTY_BYTE_BUF[3] = (byte) (HASH_VAL_EMPTY >>> 24);
}
/** Non-constructable utility class. */
@@ -59,6 +67,8 @@ public class HashUtil {
// case with nullable column values.
if (buf == null) {
buf = HASH_VAL_NULL_BYTE_BUF;
+ } else if (buf.length == 0) {
+ buf = HASH_VAL_EMPTY_BYTE_BUF;
}
final int len = buf.length;
final long m = 0x880355f21e6d1965L;
@@ -68,11 +78,11 @@ public class HashUtil {
int len8 = len / 8;
for (int i = 0; i < len8; ++i) {
int pos = i * 8;
- v = buf[pos] +
- ((long)buf[pos + 1] << 8) + ((long)buf[pos + 2] << 16) +
- ((long)buf[pos + 3] << 24) + ((long)buf[pos + 4] << 32) +
- ((long)buf[pos + 5] << 40) + ((long)buf[pos + 6] << 48) +
- ((long)buf[pos + 7] << 56);
+ v = (buf[pos] & 0xFF) |
+ ((long)(buf[pos + 1] & 0xFF) << 8) | ((long)(buf[pos + 2] & 0xFF)
<< 16) |
+ ((long)(buf[pos + 3] & 0xFF) << 24) | ((long)(buf[pos + 4] & 0xFF)
<< 32) |
+ ((long)(buf[pos + 5] & 0xFF) << 40) | ((long)(buf[pos + 6] & 0xFF)
<< 48) |
+ ((long)(buf[pos + 7] & 0xFF) << 56);
h ^= fastHashMix(v);
h *= m;
}
@@ -82,25 +92,25 @@ public class HashUtil {
//CHECKSTYLE:OFF
switch (len & 7) {
case 7:
- v ^= (long)buf[pos2 + 6] << 48;
+ v ^= (long)(buf[pos2 + 6] & 0xFF) << 48;
// fall through
case 6:
- v ^= (long)buf[pos2 + 5] << 40;
+ v ^= (long)(buf[pos2 + 5] & 0xFF) << 40;
// fall through
case 5:
- v ^= (long)buf[pos2 + 4] << 32;
+ v ^= (long)(buf[pos2 + 4] & 0xFF) << 32;
// fall through
case 4:
- v ^= (long)buf[pos2 + 3] << 24;
+ v ^= (long)(buf[pos2 + 3] & 0xFF) << 24;
// fall through
case 3:
- v ^= (long)buf[pos2 + 2] << 16;
+ v ^= (long)(buf[pos2 + 2] & 0xFF) << 16;
// fall through
case 2:
- v ^= (long)buf[pos2 + 1] << 8;
+ v ^= (long)(buf[pos2 + 1] & 0xFF) << 8;
// fall through
case 1:
- v ^= buf[pos2];
+ v ^= (buf[pos2] & 0xFF);
h ^= fastHashMix(v);
h *= m;
}
diff --git
a/java/kudu-client/src/test/java/org/apache/kudu/util/TestFashHash.java
b/java/kudu-client/src/test/java/org/apache/kudu/util/TestFashHash.java
index 731a0f2..3581b23 100644
--- a/java/kudu-client/src/test/java/org/apache/kudu/util/TestFashHash.java
+++ b/java/kudu-client/src/test/java/org/apache/kudu/util/TestFashHash.java
@@ -52,7 +52,19 @@ public class TestFashHash {
assertEquals(Long.parseUnsignedLong("12680076593665652444"), hash);
hash = HashUtil.fastHash64("".getBytes(UTF_8), 0);
- assertEquals(0, hash);
+ assertEquals(Long.parseUnsignedLong("4144680785095980158"), hash);
+
+ hash = HashUtil.fastHash64("".getBytes(UTF_8), 1234);
+ assertEquals(Long.parseUnsignedLong("3296774803014270295"), hash);
+
+ // 15 byte buffer with negative and positive numbers help test the 8 byte
loop iteration
+ // and the remainder of 7 bytes in the fast hash implementation.
+ byte[] buffer = new byte[15];
+ for (int i = 0; i < buffer.length; i++) {
+ buffer[i] = (byte)(i - 8);
+ }
+ hash = HashUtil.fastHash64(buffer, 0);
+ assertEquals(Long.parseUnsignedLong("7308577902719593318"), hash);
}
@Test
@@ -72,6 +84,16 @@ public class TestFashHash {
assertEquals(Integer.parseUnsignedInt("842467426"), hash);
hash = HashUtil.fastHash32("".getBytes(UTF_8), 0);
- assertEquals(0, hash);
+ assertEquals(Integer.parseUnsignedInt("3045300040"), hash);
+
+ hash = HashUtil.fastHash32("".getBytes(UTF_8), 1234);
+ assertEquals(Integer.parseUnsignedInt("811548192"), hash);
+
+ byte[] buffer = new byte[15];
+ for (int i = 0; i < buffer.length; i++) {
+ buffer[i] = (byte)(i - 8);
+ }
+ hash = HashUtil.fastHash32(buffer, 0);
+ assertEquals(Integer.parseUnsignedInt("3815875205"), hash);
}
}
diff --git a/src/kudu/client/CMakeLists.txt b/src/kudu/client/CMakeLists.txt
index 5c07020..0681d75 100644
--- a/src/kudu/client/CMakeLists.txt
+++ b/src/kudu/client/CMakeLists.txt
@@ -275,5 +275,5 @@ SET_KUDU_TEST_LINK_LIBS(
ADD_KUDU_TEST(client-test NUM_SHARDS 8 PROCESSORS 2
DATA_FILES ../scripts/first_argument.sh)
ADD_KUDU_TEST(client-unittest)
-ADD_KUDU_TEST(predicate-test)
+ADD_KUDU_TEST(predicate-test NUM_SHARDS 4)
ADD_KUDU_TEST(scan_token-test PROCESSORS 2)
diff --git a/src/kudu/client/predicate-test.cc
b/src/kudu/client/predicate-test.cc
index becf186..b300d3a 100644
--- a/src/kudu/client/predicate-test.cc
+++ b/src/kudu/client/predicate-test.cc
@@ -1543,12 +1543,13 @@ TEST_P(ParameterizedBloomFilterPredicateTest,
TestDisabledBloomFilterWithRepeate
// Use a set of predetermined strings and populate the table.
// Create a BF predicate with same set of strings.
deque<string> values = {"Alice", "Bob", "Charlie", "Doug", "Elizabeth",
"Frank", "George",
- "Harry"};
+ "Harry", ""};
// Populate table with a small set of strings that are repeated.
static constexpr int kNumRows = 10000;
auto upsert_rows = [&]() {
int i = 0;
+ int last_flush = i;
while (i < kNumRows) {
for (int j = 0; j < values.size() && i < kNumRows; j++, i++) {
const string &value = values[j];
@@ -1558,8 +1559,9 @@ TEST_P(ParameterizedBloomFilterPredicateTest,
TestDisabledBloomFilterWithRepeate
ASSERT_OK(session->Apply(upsert.release()));
}
// TSAN builds timeout and fail on flushing with large number of rows.
- if ((i % (kNumRows / 10)) == 0) {
+ if (i - last_flush >= 1000) {
ASSERT_OK(session->Flush());
+ last_flush = i;
}
}
ASSERT_OK(session->Flush());
diff --git a/src/kudu/client/scanner-internal.cc
b/src/kudu/client/scanner-internal.cc
index a56a8f3..717fb32 100644
--- a/src/kudu/client/scanner-internal.cc
+++ b/src/kudu/client/scanner-internal.cc
@@ -355,7 +355,7 @@ ScanRpcStatus KuduScanner::Data::SendScanRpc(const
MonoTime& overall_deadline,
// Capture previously sent Bloom filter predicate feature flag so that we
don't have to make
// expensive call to determine the flag on scan continuations.
bool prev_bloom_filter_feature =
ContainsKey(controller_.required_server_features(),
-
TabletServerFeatures::BLOOM_FILTER_PREDICATE);
+
TabletServerFeatures::BLOOM_FILTER_PREDICATE_V2);
controller_.Reset();
controller_.set_deadline(rpc_deadline);
@@ -364,7 +364,7 @@ ScanRpcStatus KuduScanner::Data::SendScanRpc(const
MonoTime& overall_deadline,
if (prev_bloom_filter_feature ||
(next_req_.has_new_scan_request() &&
configuration().spec().ContainsBloomFilterPredicate())) {
-
controller_.RequireServerFeature(TabletServerFeatures::BLOOM_FILTER_PREDICATE);
+
controller_.RequireServerFeature(TabletServerFeatures::BLOOM_FILTER_PREDICATE_V2);
}
}
if (configuration().row_format_flags() &
KuduScanner::PAD_UNIXTIME_MICROS_TO_16_BYTES) {
diff --git a/src/kudu/tserver/tablet_service.cc
b/src/kudu/tserver/tablet_service.cc
index 8adc52b..eec5b3c 100644
--- a/src/kudu/tserver/tablet_service.cc
+++ b/src/kudu/tserver/tablet_service.cc
@@ -1395,7 +1395,7 @@ bool TabletServiceAdminImpl::SupportsFeature(uint32_t
feature) const {
case TabletServerFeatures::COLUMN_PREDICATES:
case TabletServerFeatures::PAD_UNIXTIME_MICROS_TO_16_BYTES:
case TabletServerFeatures::QUIESCING:
- case TabletServerFeatures::BLOOM_FILTER_PREDICATE:
+ case TabletServerFeatures::BLOOM_FILTER_PREDICATE_V2:
// TODO(awong): once transactions are useable, add a feature flag.
return true;
default:
@@ -2525,7 +2525,7 @@ bool TabletServiceImpl::SupportsFeature(uint32_t feature)
const {
case TabletServerFeatures::COLUMN_PREDICATES:
case TabletServerFeatures::PAD_UNIXTIME_MICROS_TO_16_BYTES:
case TabletServerFeatures::QUIESCING:
- case TabletServerFeatures::BLOOM_FILTER_PREDICATE:
+ case TabletServerFeatures::BLOOM_FILTER_PREDICATE_V2:
case TabletServerFeatures::COLUMNAR_LAYOUT_FEATURE:
return true;
default:
diff --git a/src/kudu/tserver/tserver.proto b/src/kudu/tserver/tserver.proto
index a603975..a5d0a9e 100644
--- a/src/kudu/tserver/tserver.proto
+++ b/src/kudu/tserver/tserver.proto
@@ -494,7 +494,11 @@ enum TabletServerFeatures {
// Whether the server supports padding UNIXTIME_MICROS slots to 16 bytes.
PAD_UNIXTIME_MICROS_TO_16_BYTES = 2;
QUIESCING = 3;
+ // Deprecated. See note below for BLOOM_FILTER_PREDICATE_V2.
BLOOM_FILTER_PREDICATE = 4;
// Whether the server supports the COLUMNAR_LAYOUT format flag.
COLUMNAR_LAYOUT_FEATURE = 5;
+ // Update to implementation of Fast hash for Bloom filter predicate leads
+ // to incorrect results if incompatible client and server versions are used.
+ BLOOM_FILTER_PREDICATE_V2 = 6;
}
diff --git a/src/kudu/util/hash_util-test.cc b/src/kudu/util/hash_util-test.cc
index 1e9298e..d42d379 100644
--- a/src/kudu/util/hash_util-test.cc
+++ b/src/kudu/util/hash_util-test.cc
@@ -17,6 +17,7 @@
#include "kudu/util/hash_util.h"
+#include <array>
#include <cstdint>
#include <gtest/gtest.h>
@@ -62,7 +63,19 @@ TEST(HashUtilTest, TestFastHash64) {
ASSERT_EQ(12680076593665652444UL, hash);
hash = HashUtil::FastHash64("", 0, 0);
- ASSERT_EQ(0, hash);
+ ASSERT_EQ(4144680785095980158UL, hash);
+
+ hash = HashUtil::FastHash64("", 0, 1234);
+ ASSERT_EQ(3296774803014270295UL, hash);
+
+ // 15 byte buffer with negative and positive numbers help test the 8 byte
loop iteration
+ // and the remainder of 7 bytes in the fast hash implementation.
+ std::array<char, 15> buffer;
+ for (int i = 0; i < buffer.size(); i++) {
+ buffer[i] = static_cast<char>(i - 8);
+ }
+ hash = HashUtil::FastHash64(buffer.data(), buffer.size(), 0);
+ ASSERT_EQ(7308577902719593318, hash);
}
TEST(HashUtilTest, TestFastHash32) {
@@ -81,7 +94,17 @@ TEST(HashUtilTest, TestFastHash32) {
ASSERT_EQ(842467426U, hash);
hash = HashUtil::FastHash32("", 0, 0);
- ASSERT_EQ(0, hash);
+ ASSERT_EQ(3045300040U, hash);
+
+ hash = HashUtil::FastHash32("", 0, 1234);
+ ASSERT_EQ(811548192U, hash);
+
+ std::array<char, 15> buffer;
+ for (int i = 0; i < buffer.size(); i++) {
+ buffer[i] = static_cast<char>(i - 8);
+ }
+ hash = HashUtil::FastHash32(buffer.data(), buffer.size(), 0);
+ ASSERT_EQ(3815875205, hash);
}
TEST(HashUtilTest, TestComputeHash32Available) {
diff --git a/src/kudu/util/hash_util.h b/src/kudu/util/hash_util.h
index 1664abd..bb4a8e8 100644
--- a/src/kudu/util/hash_util.h
+++ b/src/kudu/util/hash_util.h
@@ -29,9 +29,9 @@
namespace kudu {
-// Constant imported from Apache Impala used to compute hash values for
special cases. It's an
-// arbitrary constant obtained by taking lower bytes of generated UUID. Helps
distinguish NULL
-// values from empty objects.
+// Constants imported from Apache Impala used to compute hash values for
special cases.
+// They are arbitrary constants obtained by taking lower bytes of generated
UUID.
+// Helps distinguish NULL values and zero-length objects like empty strings.
// Impala uses the direct BlockBloomFilter API and inserts hash value directly
using its own
// implementation of the Fast hash. Hence the value must match with Impala.
//
@@ -39,6 +39,7 @@ namespace kudu {
// a member variable of HashUtil requires an explicit definition in .cc
file
// and this class is completely defined in the header file to allow
inlining.
static constexpr uint32_t kHashValNull = 0x58081667;
+static constexpr uint32_t kHashValEmpty = 0x7dca7eee;
// Utility class to compute hash values.
class HashUtil {
@@ -86,17 +87,22 @@ class HashUtil {
// FastHash is simple, robust, and efficient general-purpose hash function
from Google.
// Implementation is adapted from
https://code.google.com/archive/p/fast-hash/
//
- // Adds special handling for nullptr input.
+ // Adds special handling for nullptr and zero-length input.
//
// Compute 64-bit FastHash.
ATTRIBUTE_NO_SANITIZE_INTEGER
static uint64_t FastHash64(const void* buf, size_t len, uint64_t seed) {
- // Special handling for nullptr input with possible non-zero length as
could be the
- // case with nullable column values.
if (buf == nullptr) {
+ // Special handling for nullptr input with possible non-zero length as
could be the
+ // case with nullable column values.
buf = &kHashValNull;
len = sizeof(kHashValNull);
+ } else if (len == 0) {
+ // Special handling for zero-length input which is the case for empty
strings.
+ buf = &kHashValEmpty;
+ len = sizeof(kHashValEmpty);
}
+
static constexpr uint64_t kMultiplier = 0x880355f21e6d1965UL;
const uint64_t* pos = static_cast<const uint64_t*>(buf);
const uint64_t* end = pos + (len / 8);