Repository: incubator-impala Updated Branches: refs/heads/master 1ca3adf46 -> a772f8456
IMPALA-2281: Replace FNV with FastHash in exchange nodes FNV is not a good enough hash function. This patch introduces FastHash into the codebase and uses it in exchange nodes. Testing: Two test cases involving arbitrary ordering are changed. Single node performance benchmark shows no performance difference. Change-Id: I778317d982dcdb94173a369a65b39f32b4f7ded2 Reviewed-on: http://gerrit.cloudera.org:8080/8417 Reviewed-by: Jim Apple <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/73d1bc3a Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/73d1bc3a Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/73d1bc3a Branch: refs/heads/master Commit: 73d1bc3ad448181626c559c36f9d86b23e036fef Parents: 1ca3adf Author: Tianyi Wang <[email protected]> Authored: Thu Oct 26 15:26:59 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Wed Nov 8 07:39:02 2017 +0000 ---------------------------------------------------------------------- LICENSE.txt | 26 ++++++ be/src/benchmarks/hash-benchmark.cc | 98 +++++++++++++++----- be/src/codegen/gen_ir_descriptions.py | 1 - be/src/codegen/llvm-codegen.cc | 4 - be/src/runtime/data-stream-sender.cc | 17 ++-- be/src/runtime/data-stream-sender.h | 3 + be/src/runtime/data-stream-test.cc | 4 +- be/src/runtime/raw-value-test.cc | 44 ++++----- be/src/runtime/raw-value.cc | 42 ++++----- be/src/runtime/raw-value.h | 10 +- be/src/util/hash-util-ir.cc | 5 - be/src/util/hash-util.h | 42 +++++++++ .../queries/QueryTest/nested-types-runtime.test | 12 +-- 13 files changed, 207 insertions(+), 101 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/LICENSE.txt ---------------------------------------------------------------------- diff --git a/LICENSE.txt b/LICENSE.txt index 01359d4..419a726 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -920,3 +920,29 @@ be/src/kudu/util/x509_check_host.*: OpenSSL software license: [including the GNU Public Licence.] -------------------------------------------------------------------------------- + +be/src/util/hash-util.h (some portions): MIT license + +Some portions of this module are derived from code from FastHash +(https://code.google.com/archive/p/fast-hash/). + + Copyright (C) 2012 Zilong Tan ([email protected]) + + Permission is hereby granted, free of charge, to any person obtaining a copy of this + software and associated documentation files (the "Software"), to deal in the Software + without restriction, including without limitation the rights to use, copy, modify, + merge, publish, distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to the following + conditions: + + The above copyright notice and this permission notice shall be included in all copies or + substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT + OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/benchmarks/hash-benchmark.cc ---------------------------------------------------------------------- diff --git a/be/src/benchmarks/hash-benchmark.cc b/be/src/benchmarks/hash-benchmark.cc index b183915..d0ecc82 100644 --- a/be/src/benchmarks/hash-benchmark.cc +++ b/be/src/benchmarks/hash-benchmark.cc @@ -16,8 +16,6 @@ // under the License. #include <iostream> -#include <stdlib.h> -#include <stdio.h> #include <vector> #include <boost/functional/hash.hpp> @@ -26,12 +24,9 @@ #include "common/init.h" #include "experiments/data-provider.h" #include "runtime/mem-tracker.h" -#include "runtime/string-value.h" #include "runtime/test-env.h" #include "service/fe-support.h" #include "util/benchmark.h" -#include "util/cpu-info.h" -#include "util/hash-util.h" #include "common/names.h" @@ -50,6 +45,7 @@ using namespace llvm; // The different hash functions benchmarked: // 1. FNV Hash: Fowler-Noll-Vo hash function // 2. FNV Hash with empty string handling: FNV with special-case for empty string +// 3. FastHash64: 64-bit FastHash function // 3. Murmur2_64 Hash: Murmur2 hash function // 4. Boost Hash: boost hash function // 5. Crc: hash using sse4 crc hash instruction @@ -62,24 +58,30 @@ using namespace llvm; // = lim n->inf n(1 - 1/n) ^n // = n / e // = 367 - -// Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz -// Int Hash: Function Rate (iters/ms) Comparison -// ---------------------------------------------------------------------- -// Fnv 114.6 1X -// Murmur2_64 129.9 1.134X -// Boost 294.1 2.567X -// Crc 536.2 4.68X -// Codegen 1413 12.33X -// -// Mixed Hash: Function Rate (iters/ms) Comparison -// ---------------------------------------------------------------------- -// Fnv 90.88 1X -// FnvEmpty 91.58 1.008X -// Murmur2_64 124.9 1.374X -// Boost 133.5 1.469X -// Crc 435.8 4.795X -// Codegen 379.3 4.174X +/* +Machine Info: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz +Int Hash: +Function 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile + (relative) (relative) (relative) +----------- ---------------------------------------------------------- + Fnv 88.5 109 111 1X 1X 1X + FastHash64 96.9 110 112 1.1X 1.01X 1.01X + Murmur2_64 90.7 124 126 1.03X 1.14X 1.14X + Boost 203 277 282 2.3X 2.55X 2.55X + Crc 415 536 540 4.68X 4.93X 4.89X + Codegen 1.5e+03 1.85e+03 1.88e+03 16.9X 17X 17X +Mixed Hash: +Function 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile + (relative) (relative) (relative) +---------------------------------------------------------------------- + Fnv 75.3 78 78.9 1X 1X 1X + FastHash64 91.2 95.4 96.3 1.21X 1.22X 1.22X + FnvEmpty 82.7 86.3 86.9 1.1X 1.11X 1.1X + Murmur2_64 109 113 114 1.45X 1.45X 1.45X + Boost 109 112 113 1.45X 1.43X 1.44X + Crc 467 481 489 6.21X 6.17X 6.21X + Codegen 292 312 318 3.88X 4X 4.03X +*/ typedef uint32_t (*CodegenHashFn)(int rows, char* data, int32_t* results); @@ -125,6 +127,23 @@ void TestCrcIntHash(int batch, void* d) { } } +void TestFastHashIntHash(int batch, void* d) { + TestData* data = reinterpret_cast<TestData*>(d); + int rows = data->num_rows; + int cols = data->num_cols; + for (int i = 0; i < batch; ++i) { + int32_t* values = reinterpret_cast<int32_t*>(data->data); + for (int j = 0; j < rows; ++j) { + uint64_t hash = HashUtil::FNV_SEED; + for (int k = 0; k < cols; ++k) { + hash = HashUtil::FastHash64(&values[k], sizeof(uint32_t), hash); + } + data->results[j] = hash; + values += cols; + } + } +} + void TestBoostIntHash(int batch, void* d) { TestData* data = reinterpret_cast<TestData*>(d); int rows = data->num_rows; @@ -235,6 +254,32 @@ void TestCrcMixedHash(int batch, void* d) { } } +void TestFastHashMixedHash(int batch, void* d) { + TestData* data = reinterpret_cast<TestData*>(d); + int rows = data->num_rows; + for (int i = 0; i < batch; ++i) { + char* values = reinterpret_cast<char*>(data->data); + for (int j = 0; j < rows; ++j) { + uint64_t hash = HashUtil::FNV_SEED; + + hash = HashUtil::FastHash64(values, sizeof(int8_t), hash); + values += sizeof(int8_t); + + hash = HashUtil::FastHash64(values, sizeof(int32_t), hash); + values += sizeof(int32_t); + + hash = HashUtil::FastHash64(values, sizeof(int64_t), hash); + values += sizeof(int64_t); + + StringValue* str = reinterpret_cast<StringValue*>(values); + hash = HashUtil::FastHash64(str->ptr, str->len, hash); + values += sizeof(StringValue); + + data->results[j] = hash; + } + } +} + void TestCodegenMixedHash(int batch, void* d) { TestData* data = reinterpret_cast<TestData*>(d); CodegenHashFn fn = reinterpret_cast<CodegenHashFn>(data->jitted_fn); @@ -416,9 +461,8 @@ Function* CodegenCrcHash(LlvmCodeGen* codegen, bool mixed) { } int main(int argc, char **argv) { - CpuInfo::Init(); - cout << Benchmark::GetMachineInfo() << endl; impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST); + cout << Benchmark::GetMachineInfo() << endl; impala::InitFeSupport(); ABORT_IF_ERROR(LlvmCodeGen::InitializeLlvm()); @@ -506,6 +550,7 @@ int main(int argc, char **argv) { Benchmark int_suite("Int Hash"); int_suite.AddBenchmark("Fnv", TestFnvIntHash, &int_data); + int_suite.AddBenchmark("FastHash64", TestFastHashIntHash, &int_data); int_suite.AddBenchmark("Murmur2_64", TestMurmur2_64IntHash, &int_data); int_suite.AddBenchmark("Boost", TestBoostIntHash, &int_data); int_suite.AddBenchmark("Crc", TestCrcIntHash, &int_data); @@ -514,6 +559,7 @@ int main(int argc, char **argv) { Benchmark mixed_suite("Mixed Hash"); mixed_suite.AddBenchmark("Fnv", TestFnvMixedHash, &mixed_data); + mixed_suite.AddBenchmark("FastHash64", TestFastHashMixedHash, &mixed_data); mixed_suite.AddBenchmark("FnvEmpty", TestFnvEmptyMixedHash, &mixed_data); mixed_suite.AddBenchmark("Murmur2_64", TestMurmur2_64MixedHash, &mixed_data); mixed_suite.AddBenchmark("Boost", TestBoostMixedHash, &mixed_data); @@ -521,5 +567,7 @@ int main(int argc, char **argv) { mixed_suite.AddBenchmark("Codegen", TestCodegenMixedHash, &mixed_data); cout << mixed_suite.Measure(); + codegen->Close(); + mem_pool.FreeAll(); return 0; } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/codegen/gen_ir_descriptions.py ---------------------------------------------------------------------- diff --git a/be/src/codegen/gen_ir_descriptions.py b/be/src/codegen/gen_ir_descriptions.py index 1044a40..28e0a71 100755 --- a/be/src/codegen/gen_ir_descriptions.py +++ b/be/src/codegen/gen_ir_descriptions.py @@ -96,7 +96,6 @@ ir_functions = [ ["SCALAR_EXPR_GET_DECIMAL_VAL", "_ZN6impala10ScalarExpr13GetDecimalValEPS0_PNS_19ScalarExprEvaluatorEPKNS_8TupleRowE"], ["HASH_CRC", "IrCrcHash"], - ["HASH_FNV", "IrFnvHash"], ["HASH_MURMUR", "IrMurmurHash"], ["PHJ_PROCESS_BUILD_BATCH", "_ZN6impala10PhjBuilder17ProcessBuildBatchEPNS_8RowBatchEPNS_12HashTableCtxEbb"], http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/codegen/llvm-codegen.cc ---------------------------------------------------------------------- diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc index 777ca0a..77a45c6 100644 --- a/be/src/codegen/llvm-codegen.cc +++ b/be/src/codegen/llvm-codegen.cc @@ -1584,10 +1584,6 @@ static Function* GetLenOptimizedHashFn( return codegen->FinalizeFunction(fn); } -Function* LlvmCodeGen::GetFnvHashFunction(int len) { - return GetLenOptimizedHashFn(this, IRFunction::HASH_FNV, len); -} - Function* LlvmCodeGen::GetMurmurHashFunction(int len) { return GetLenOptimizedHashFn(this, IRFunction::HASH_MURMUR, len); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/runtime/data-stream-sender.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/data-stream-sender.cc b/be/src/runtime/data-stream-sender.cc index ed156dc..cf749cd 100644 --- a/be/src/runtime/data-stream-sender.cc +++ b/be/src/runtime/data-stream-sender.cc @@ -464,17 +464,16 @@ Status DataStreamSender::Send(RuntimeState* state, RowBatch* batch) { int num_channels = channels_.size(); for (int i = 0; i < batch->num_rows(); ++i) { TupleRow* row = batch->GetRow(i); - uint32_t hash_val = HashUtil::FNV_SEED; - for (int i = 0; i < partition_exprs_.size(); ++i) { - ScalarExprEvaluator* eval = partition_expr_evals_[i]; + uint64_t hash_val = EXCHANGE_HASH_SEED; + for (int j = 0; j < partition_exprs_.size(); ++j) { + ScalarExprEvaluator* eval = partition_expr_evals_[j]; void* partition_val = eval->GetValue(row); - // We can't use the crc hash function here because it does not result - // in uncorrelated hashes with different seeds. Instead we must use - // fnv hash. + // We can't use the crc hash function here because it does not result in + // uncorrelated hashes with different seeds. Instead we use FastHash. // TODO: fix crc hash/GetHashValue() - DCHECK(&partition_expr_evals_[i]->root() == partition_exprs_[i]); - hash_val = RawValue::GetHashValueFnv( - partition_val, partition_exprs_[i]->type(), hash_val); + DCHECK(&(eval->root()) == partition_exprs_[j]); + hash_val = RawValue::GetHashValueFastHash( + partition_val, partition_exprs_[j]->type(), hash_val); } RETURN_IF_ERROR(channels_[hash_val % num_channels]->AddRow(row)); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/runtime/data-stream-sender.h ---------------------------------------------------------------------- diff --git a/be/src/runtime/data-stream-sender.h b/be/src/runtime/data-stream-sender.h index 95a83e5..8ed4bd0 100644 --- a/be/src/runtime/data-stream-sender.h +++ b/be/src/runtime/data-stream-sender.h @@ -153,6 +153,9 @@ class DataStreamSender : public DataSink { /// Used for Kudu partitioning to round-robin rows that don't correspond to a partition /// or when errors are encountered. int next_unknown_partition_; + + /// An arbitrary hash seed used for exchanges. + static constexpr uint64_t EXCHANGE_HASH_SEED = 0x66bd68df22c3ef37; }; } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/runtime/data-stream-test.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/data-stream-test.cc b/be/src/runtime/data-stream-test.cc index 79a97b6..3f4844c 100644 --- a/be/src/runtime/data-stream-test.cc +++ b/be/src/runtime/data-stream-test.cc @@ -425,8 +425,8 @@ class DataStreamTest : public testing::Test { } else if (stream_type == TPartitionType::HASH_PARTITIONED) { // hash-partitioned streams send values to the right partition int64_t value = *j; - uint32_t hash_val = - RawValue::GetHashValueFnv(&value, TYPE_BIGINT, HashUtil::FNV_SEED); + uint64_t hash_val = RawValue::GetHashValueFastHash(&value, TYPE_BIGINT, + DataStreamSender::EXCHANGE_HASH_SEED); EXPECT_EQ(hash_val % receiver_info_.size(), info.receiver_num); } } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/runtime/raw-value-test.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/raw-value-test.cc b/be/src/runtime/raw-value-test.cc index 9c71822..bced866 100644 --- a/be/src/runtime/raw-value-test.cc +++ b/be/src/runtime/raw-value-test.cc @@ -79,35 +79,37 @@ TEST_F(RawValueTest, TypeChar) { // IMPALA-2270: "", false, and NULL should hash to distinct values. TEST_F(RawValueTest, HashEmptyAndNull) { uint32_t seed = 12345; - uint32_t null_hash = RawValue::GetHashValue(NULL, TYPE_STRING, seed); - uint32_t null_hash_fnv = RawValue::GetHashValueFnv(NULL, TYPE_STRING, seed); - StringValue empty(NULL, 0); - uint32_t empty_hash = RawValue::GetHashValue(&empty, TYPE_STRING, seed); - uint32_t empty_hash_fnv = RawValue::GetHashValueFnv(&empty, TYPE_STRING, seed); + uint32_t hash_null = RawValue::GetHashValue(nullptr, TYPE_STRING, seed); + uint64_t fast_hash_null = RawValue::GetHashValueFastHash(nullptr, TYPE_STRING, seed); + StringValue empty(nullptr, 0); + uint32_t hash_empty = RawValue::GetHashValue(&empty, TYPE_STRING, seed); + uint64_t fast_hash_empty = + RawValue::GetHashValueFastHash(&empty, TYPE_STRING, seed); bool false_val = false; - uint32_t false_hash = RawValue::GetHashValue(&false_val, TYPE_BOOLEAN, seed); - uint32_t false_hash_fnv = RawValue::GetHashValue(&false_val, TYPE_BOOLEAN, seed); - EXPECT_NE(seed, null_hash); - EXPECT_NE(seed, empty_hash); - EXPECT_NE(seed, false_hash); - EXPECT_NE(seed, null_hash_fnv); - EXPECT_NE(seed, empty_hash_fnv); - EXPECT_NE(seed, false_hash_fnv); - EXPECT_NE(null_hash, empty_hash); - EXPECT_NE(null_hash_fnv, empty_hash_fnv); - EXPECT_NE(null_hash, false_hash); - EXPECT_NE(false_hash, null_hash_fnv); + uint32_t hash_false = RawValue::GetHashValue(&false_val, TYPE_BOOLEAN, seed); + uint64_t fast_hash_false = + RawValue::GetHashValueFastHash(&false_val, TYPE_BOOLEAN, seed); + EXPECT_NE(seed, hash_null); + EXPECT_NE(seed, hash_empty); + EXPECT_NE(seed, hash_false); + EXPECT_NE(seed, fast_hash_null); + EXPECT_NE(seed, fast_hash_empty); + EXPECT_NE(seed, fast_hash_false); + EXPECT_NE(hash_null, hash_empty); + EXPECT_NE(fast_hash_null, fast_hash_empty); + EXPECT_NE(hash_null, hash_false); + EXPECT_NE(hash_false, fast_hash_null); } -/// IMPALA-2270: Test that FNV hash of (int, "") is not skewed. +/// IMPALA-2270: Test that FastHash of (int, "") is not skewed. TEST(HashUtil, IntNullSkew) { int num_values = 100000; int num_buckets = 16; vector<int> buckets(num_buckets, 0); for (int32_t i = 0; i < num_values; ++i) { - uint32_t hash = RawValue::GetHashValueFnv(&i, TYPE_INT, 9999); - StringValue empty(NULL, 0); - hash = RawValue::GetHashValueFnv(&empty, TYPE_STRING, hash); + uint64_t hash = RawValue::GetHashValueFastHash(&i, TYPE_INT, 9999); + StringValue empty(nullptr, 0); + hash = RawValue::GetHashValueFastHash(&empty, TYPE_STRING, hash); ++buckets[hash % num_buckets]; } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/runtime/raw-value.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/raw-value.cc b/be/src/runtime/raw-value.cc index d7f4fd5..1db9741 100644 --- a/be/src/runtime/raw-value.cc +++ b/be/src/runtime/raw-value.cc @@ -60,7 +60,7 @@ void RawValue::PrintValueAsBytes(const void* value, const ColumnType& type, case TYPE_STRING: case TYPE_VARCHAR: string_val = reinterpret_cast<const StringValue*>(value); - stream->write(static_cast<char*>(string_val->ptr), string_val->len); + stream->write(string_val->ptr, string_val->len); break; case TYPE_TIMESTAMP: stream->write(chars, TimestampValue::Size()); @@ -98,7 +98,7 @@ void RawValue::PrintValue(const void* value, const ColumnType& type, int scale, case TYPE_STRING: case TYPE_VARCHAR: string_val = reinterpret_cast<const StringValue*>(value); - tmp.assign(static_cast<char*>(string_val->ptr), string_val->len); + tmp.assign(string_val->ptr, string_val->len); str->swap(tmp); return; case TYPE_CHAR: @@ -193,31 +193,29 @@ void RawValue::Write(const void* value, Tuple* tuple, const SlotDescriptor* slot } } -uint32_t RawValue::GetHashValueFnv(const void* v, const ColumnType& type, uint32_t seed) { - // Use HashCombine with arbitrary constant to ensure we don't return seed. - if (v == NULL) return HashUtil::HashCombine32(HASH_VAL_NULL, seed); - +uint64_t RawValue::GetHashValueFastHash(const void* v, const ColumnType& type, + uint64_t seed) { + // Hash with an arbitrary constant to ensure we don't return seed. + if (v == nullptr) { + return HashUtil::FastHash64(&HASH_VAL_NULL, sizeof(HASH_VAL_NULL), seed); + } switch (type.type) { case TYPE_STRING: case TYPE_VARCHAR: { const StringValue* string_value = reinterpret_cast<const StringValue*>(v); - if (string_value->len == 0) { - return HashUtil::HashCombine32(HASH_VAL_EMPTY, seed); - } - return HashUtil::FnvHash64to32(string_value->ptr, string_value->len, seed); + return HashUtil::FastHash64(string_value->ptr, + static_cast<size_t>(string_value->len), seed); } - case TYPE_BOOLEAN: - return HashUtil::HashCombine32(*reinterpret_cast<const bool*>(v), seed); - case TYPE_TINYINT: return HashUtil::FnvHash64to32(v, 1, seed); - case TYPE_SMALLINT: return HashUtil::FnvHash64to32(v, 2, seed); - case TYPE_INT: return HashUtil::FnvHash64to32(v, 4, seed); - case TYPE_BIGINT: return HashUtil::FnvHash64to32(v, 8, seed); - case TYPE_FLOAT: return HashUtil::FnvHash64to32(v, 4, seed); - case TYPE_DOUBLE: return HashUtil::FnvHash64to32(v, 8, seed); - case TYPE_TIMESTAMP: return HashUtil::FnvHash64to32(v, 12, seed); - case TYPE_CHAR: - return HashUtil::FnvHash64to32(v, type.len, seed); - case TYPE_DECIMAL: return HashUtil::FnvHash64to32(v, type.GetByteSize(), seed); + case TYPE_BOOLEAN: return HashUtil::FastHash64(v, 1, seed); + case TYPE_TINYINT: return HashUtil::FastHash64(v, 1, seed); + case TYPE_SMALLINT: return HashUtil::FastHash64(v, 2, seed); + case TYPE_INT: return HashUtil::FastHash64(v, 4, seed); + case TYPE_BIGINT: return HashUtil::FastHash64(v, 8, seed); + case TYPE_FLOAT: return HashUtil::FastHash64(v, 4, seed); + case TYPE_DOUBLE: return HashUtil::FastHash64(v, 8, seed); + case TYPE_TIMESTAMP: return HashUtil::FastHash64(v, 12, seed); + case TYPE_CHAR: return HashUtil::FastHash64(v, type.len, seed); + case TYPE_DECIMAL: return HashUtil::FastHash64(v, type.GetByteSize(), seed); default: DCHECK(false); return 0; } } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/runtime/raw-value.h ---------------------------------------------------------------------- diff --git a/be/src/runtime/raw-value.h b/be/src/runtime/raw-value.h index 89ae323..6ee9191 100644 --- a/be/src/runtime/raw-value.h +++ b/be/src/runtime/raw-value.h @@ -71,12 +71,10 @@ class RawValue { static inline uint32_t GetHashValueNonNull(const T* v, const ColumnType& type, uint32_t seed = 0); - /// Get a 32-bit hash value using the FNV hash function. - /// Using different seeds with FNV results in different hash functions. - /// GetHashValue() does not have this property and cannot be safely used as the first - /// step in data repartitioning. However, GetHashValue() can be significantly faster. - /// TODO: fix GetHashValue - static uint32_t GetHashValueFnv(const void* v, const ColumnType& type, uint32_t seed); + /// Get a 64-bit hash value using the FastHash function. + /// https://code.google.com/archive/p/fast-hash/ + static uint64_t GetHashValueFastHash(const void* v, const ColumnType& type, + uint64_t seed); /// Compares both values. /// Return value is < 0 if v1 < v2, 0 if v1 == v2, > 0 if v1 > v2. http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/util/hash-util-ir.cc ---------------------------------------------------------------------- diff --git a/be/src/util/hash-util-ir.cc b/be/src/util/hash-util-ir.cc index cf812eb..da3271b 100644 --- a/be/src/util/hash-util-ir.cc +++ b/be/src/util/hash-util-ir.cc @@ -23,11 +23,6 @@ using namespace impala; extern "C" -uint32_t IrFnvHash(const void* data, int32_t bytes, uint32_t hash) { - return HashUtil::FnvHash64to32(data, bytes, hash); -} - -extern "C" uint32_t IrMurmurHash(const void* data, int32_t bytes, uint32_t hash) { return HashUtil::MurmurHash2_64(data, bytes, hash); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/util/hash-util.h ---------------------------------------------------------------------- diff --git a/be/src/util/hash-util.h b/be/src/util/hash-util.h index d0c791a..054cb8a 100644 --- a/be/src/util/hash-util.h +++ b/be/src/util/hash-util.h @@ -231,6 +231,48 @@ class HashUtil { // Rehash32to32(hash2) is minimal. return (static_cast<uint64_t>(hash) * m + a) >> 32; } + + // The FastHash64 implementation is adapted from + // https://code.google.com/archive/p/fast-hash/ + // available under MIT license. + static inline uint64_t FastHashMix(uint64_t h) { + h ^= h >> 23; + h *= 0x2127599bf4325c37ULL; + h ^= h >> 47; + return h; + } + + static inline uint64_t FastHash64(const void* buf, int64_t len, uint64_t seed) + { + const uint64_t m = 0x880355f21e6d1965ULL; + const uint64_t* pos = reinterpret_cast<const uint64_t*>(buf); + const uint64_t* end = pos + (len / 8); + uint64_t h = seed ^ (len * m); + uint64_t v; + + while (pos != end) { + v = *pos++; + h ^= FastHashMix(v); + h *= m; + } + + const uint8_t* pos2 = reinterpret_cast<const uint8_t*>(pos); + v = 0; + + switch (len & 7) { + case 7: v ^= static_cast<uint64_t>(pos2[6]) << 48; + case 6: v ^= static_cast<uint64_t>(pos2[5]) << 40; + case 5: v ^= static_cast<uint64_t>(pos2[4]) << 32; + case 4: v ^= static_cast<uint64_t>(pos2[3]) << 24; + case 3: v ^= static_cast<uint64_t>(pos2[2]) << 16; + case 2: v ^= static_cast<uint64_t>(pos2[1]) << 8; + case 1: v ^= static_cast<uint64_t>(pos2[0]); + h ^= FastHashMix(v); + h *= m; + } + + return FastHashMix(h); + } }; } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test b/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test index 35e27c5..c72558a 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test +++ b/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test @@ -139,16 +139,16 @@ bigint,string,bigint ---- QUERY # Test several analytic functions with incompatible partition by and order by clauses # on top of a subplan that flattens a map. -select id, key, value, max(key) over(partition by id), row_number() over (order by value) -from complextypestbl t, t.int_map +select id, key, value, max(key) over(partition by id), row_number() +over (order by value, key) from complextypestbl t, t.int_map ---- RESULTS: VERIFY_IS_EQUAL_SORTED +8,'k1',-1,'k1',1 1,'k1',1,'k2',2 -1,'k2',100,'k2',4 2,'k1',2,'k2',3 -2,'k2',NULL,'k2',5 -7,'k1',NULL,'k3',6 +1,'k2',100,'k2',4 +7,'k1',NULL,'k3',5 +2,'k2',NULL,'k2',6 7,'k3',NULL,'k3',7 -8,'k1',-1,'k1',1 ---- TYPES bigint,string,int,string,bigint ====
