Repository: incubator-impala
Updated Branches:
  refs/heads/master 1ca3adf46 -> a772f8456


IMPALA-2281: Replace FNV with FastHash in exchange nodes

FNV is not a good enough hash function. This patch introduces FastHash
into the codebase and uses it in exchange nodes.

Testing: Two test cases involving arbitrary ordering are changed.
Single node performance benchmark shows no performance difference.

Change-Id: I778317d982dcdb94173a369a65b39f32b4f7ded2
Reviewed-on: http://gerrit.cloudera.org:8080/8417
Reviewed-by: Jim Apple <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/73d1bc3a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/73d1bc3a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/73d1bc3a

Branch: refs/heads/master
Commit: 73d1bc3ad448181626c559c36f9d86b23e036fef
Parents: 1ca3adf
Author: Tianyi Wang <[email protected]>
Authored: Thu Oct 26 15:26:59 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Wed Nov 8 07:39:02 2017 +0000

----------------------------------------------------------------------
 LICENSE.txt                                     | 26 ++++++
 be/src/benchmarks/hash-benchmark.cc             | 98 +++++++++++++++-----
 be/src/codegen/gen_ir_descriptions.py           |  1 -
 be/src/codegen/llvm-codegen.cc                  |  4 -
 be/src/runtime/data-stream-sender.cc            | 17 ++--
 be/src/runtime/data-stream-sender.h             |  3 +
 be/src/runtime/data-stream-test.cc              |  4 +-
 be/src/runtime/raw-value-test.cc                | 44 ++++-----
 be/src/runtime/raw-value.cc                     | 42 ++++-----
 be/src/runtime/raw-value.h                      | 10 +-
 be/src/util/hash-util-ir.cc                     |  5 -
 be/src/util/hash-util.h                         | 42 +++++++++
 .../queries/QueryTest/nested-types-runtime.test | 12 +--
 13 files changed, 207 insertions(+), 101 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/LICENSE.txt
----------------------------------------------------------------------
diff --git a/LICENSE.txt b/LICENSE.txt
index 01359d4..419a726 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -920,3 +920,29 @@ be/src/kudu/util/x509_check_host.*: OpenSSL software 
license:
   [including the GNU Public Licence.]
 
 
--------------------------------------------------------------------------------
+
+be/src/util/hash-util.h (some portions): MIT license
+
+Some portions of this module are derived from code from FastHash
+(https://code.google.com/archive/p/fast-hash/).
+
+  Copyright (C) 2012 Zilong Tan ([email protected])
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy 
of this
+  software and associated documentation files (the "Software"), to deal in the 
Software
+  without restriction, including without limitation the rights to use, copy, 
modify,
+  merge, publish, distribute, sublicense, and/or sell copies of the Software, 
and to
+  permit persons to whom the Software is furnished to do so, subject to the 
following
+  conditions:
+
+  The above copyright notice and this permission notice shall be included in 
all copies or
+  substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
IMPLIED,
+  INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR 
A PARTICULAR
+  PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 
HOLDERS BE
+  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 
CONTRACT, TORT
+  OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
USE OR
+  OTHER DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/benchmarks/hash-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/hash-benchmark.cc 
b/be/src/benchmarks/hash-benchmark.cc
index b183915..d0ecc82 100644
--- a/be/src/benchmarks/hash-benchmark.cc
+++ b/be/src/benchmarks/hash-benchmark.cc
@@ -16,8 +16,6 @@
 // under the License.
 
 #include <iostream>
-#include <stdlib.h>
-#include <stdio.h>
 #include <vector>
 
 #include <boost/functional/hash.hpp>
@@ -26,12 +24,9 @@
 #include "common/init.h"
 #include "experiments/data-provider.h"
 #include "runtime/mem-tracker.h"
-#include "runtime/string-value.h"
 #include "runtime/test-env.h"
 #include "service/fe-support.h"
 #include "util/benchmark.h"
-#include "util/cpu-info.h"
-#include "util/hash-util.h"
 
 #include "common/names.h"
 
@@ -50,6 +45,7 @@ using namespace llvm;
 // The different hash functions benchmarked:
 //   1. FNV Hash: Fowler-Noll-Vo hash function
 //   2. FNV Hash with empty string handling: FNV with special-case for empty 
string
+//   3. FastHash64: 64-bit FastHash function
 //   3. Murmur2_64 Hash: Murmur2 hash function
 //   4. Boost Hash: boost hash function
 //   5. Crc: hash using sse4 crc hash instruction
@@ -62,24 +58,30 @@ using namespace llvm;
 //                      = lim n->inf n(1 - 1/n) ^n
 //                      = n / e
 //                      = 367
-
-// Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
-// Int Hash:             Function     Rate (iters/ms)          Comparison
-// ----------------------------------------------------------------------
-//                            Fnv               114.6                  1X
-//                     Murmur2_64               129.9              1.134X
-//                          Boost               294.1              2.567X
-//                            Crc               536.2               4.68X
-//                        Codegen                1413              12.33X
-//
-// Mixed Hash:           Function     Rate (iters/ms)          Comparison
-// ----------------------------------------------------------------------
-//                            Fnv               90.88                  1X
-//                       FnvEmpty               91.58              1.008X
-//                     Murmur2_64               124.9              1.374X
-//                          Boost               133.5              1.469X
-//                            Crc               435.8              4.795X
-//                        Codegen               379.3              4.174X
+/*
+Machine Info:  Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz
+Int Hash:
+Function     10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+                                      (relative) (relative) (relative)
+----------- ----------------------------------------------------------
+        Fnv    88.5      109      111         1X         1X         1X
+ FastHash64    96.9      110      112       1.1X      1.01X      1.01X
+ Murmur2_64    90.7      124      126      1.03X      1.14X      1.14X
+      Boost     203      277      282       2.3X      2.55X      2.55X
+        Crc     415      536      540      4.68X      4.93X      4.89X
+    Codegen 1.5e+03 1.85e+03 1.88e+03      16.9X        17X        17X
+Mixed Hash:
+Function  10%ile   50%ile   90%ile        10%ile     50%ile     90%ile
+                                      (relative) (relative) (relative)
+----------------------------------------------------------------------
+        Fnv    75.3       78     78.9         1X         1X         1X
+ FastHash64    91.2     95.4     96.3      1.21X      1.22X      1.22X
+   FnvEmpty    82.7     86.3     86.9       1.1X      1.11X       1.1X
+ Murmur2_64     109      113      114      1.45X      1.45X      1.45X
+      Boost     109      112      113      1.45X      1.43X      1.44X
+        Crc     467      481      489      6.21X      6.17X      6.21X
+    Codegen     292      312      318      3.88X         4X      4.03X
+*/
 
 typedef uint32_t (*CodegenHashFn)(int rows, char* data, int32_t* results);
 
@@ -125,6 +127,23 @@ void TestCrcIntHash(int batch, void* d) {
   }
 }
 
+void TestFastHashIntHash(int batch, void* d) {
+  TestData* data = reinterpret_cast<TestData*>(d);
+  int rows = data->num_rows;
+  int cols = data->num_cols;
+  for (int i = 0; i < batch; ++i) {
+    int32_t* values = reinterpret_cast<int32_t*>(data->data);
+    for (int j = 0; j < rows; ++j) {
+      uint64_t hash = HashUtil::FNV_SEED;
+      for (int k = 0; k < cols; ++k) {
+        hash = HashUtil::FastHash64(&values[k], sizeof(uint32_t), hash);
+      }
+      data->results[j] = hash;
+      values += cols;
+    }
+  }
+}
+
 void TestBoostIntHash(int batch, void* d) {
   TestData* data = reinterpret_cast<TestData*>(d);
   int rows = data->num_rows;
@@ -235,6 +254,32 @@ void TestCrcMixedHash(int batch, void* d) {
   }
 }
 
+void TestFastHashMixedHash(int batch, void* d) {
+  TestData* data = reinterpret_cast<TestData*>(d);
+  int rows = data->num_rows;
+  for (int i = 0; i < batch; ++i) {
+    char* values = reinterpret_cast<char*>(data->data);
+    for (int j = 0; j < rows; ++j) {
+      uint64_t hash = HashUtil::FNV_SEED;
+
+      hash = HashUtil::FastHash64(values, sizeof(int8_t), hash);
+      values += sizeof(int8_t);
+
+      hash = HashUtil::FastHash64(values, sizeof(int32_t), hash);
+      values += sizeof(int32_t);
+
+      hash = HashUtil::FastHash64(values, sizeof(int64_t), hash);
+      values += sizeof(int64_t);
+
+      StringValue* str = reinterpret_cast<StringValue*>(values);
+      hash = HashUtil::FastHash64(str->ptr, str->len, hash);
+      values += sizeof(StringValue);
+
+      data->results[j] = hash;
+    }
+  }
+}
+
 void TestCodegenMixedHash(int batch, void* d) {
   TestData* data = reinterpret_cast<TestData*>(d);
   CodegenHashFn fn = reinterpret_cast<CodegenHashFn>(data->jitted_fn);
@@ -416,9 +461,8 @@ Function* CodegenCrcHash(LlvmCodeGen* codegen, bool mixed) {
 }
 
 int main(int argc, char **argv) {
-  CpuInfo::Init();
-  cout << Benchmark::GetMachineInfo() << endl;
   impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST);
+  cout << Benchmark::GetMachineInfo() << endl;
   impala::InitFeSupport();
   ABORT_IF_ERROR(LlvmCodeGen::InitializeLlvm());
 
@@ -506,6 +550,7 @@ int main(int argc, char **argv) {
 
   Benchmark int_suite("Int Hash");
   int_suite.AddBenchmark("Fnv", TestFnvIntHash, &int_data);
+  int_suite.AddBenchmark("FastHash64", TestFastHashIntHash, &int_data);
   int_suite.AddBenchmark("Murmur2_64", TestMurmur2_64IntHash, &int_data);
   int_suite.AddBenchmark("Boost", TestBoostIntHash, &int_data);
   int_suite.AddBenchmark("Crc", TestCrcIntHash, &int_data);
@@ -514,6 +559,7 @@ int main(int argc, char **argv) {
 
   Benchmark mixed_suite("Mixed Hash");
   mixed_suite.AddBenchmark("Fnv", TestFnvMixedHash, &mixed_data);
+  mixed_suite.AddBenchmark("FastHash64", TestFastHashMixedHash, &mixed_data);
   mixed_suite.AddBenchmark("FnvEmpty", TestFnvEmptyMixedHash, &mixed_data);
   mixed_suite.AddBenchmark("Murmur2_64", TestMurmur2_64MixedHash, &mixed_data);
   mixed_suite.AddBenchmark("Boost", TestBoostMixedHash, &mixed_data);
@@ -521,5 +567,7 @@ int main(int argc, char **argv) {
   mixed_suite.AddBenchmark("Codegen", TestCodegenMixedHash, &mixed_data);
   cout << mixed_suite.Measure();
 
+  codegen->Close();
+  mem_pool.FreeAll();
   return 0;
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/codegen/gen_ir_descriptions.py
----------------------------------------------------------------------
diff --git a/be/src/codegen/gen_ir_descriptions.py 
b/be/src/codegen/gen_ir_descriptions.py
index 1044a40..28e0a71 100755
--- a/be/src/codegen/gen_ir_descriptions.py
+++ b/be/src/codegen/gen_ir_descriptions.py
@@ -96,7 +96,6 @@ ir_functions = [
   ["SCALAR_EXPR_GET_DECIMAL_VAL",
    
"_ZN6impala10ScalarExpr13GetDecimalValEPS0_PNS_19ScalarExprEvaluatorEPKNS_8TupleRowE"],
   ["HASH_CRC", "IrCrcHash"],
-  ["HASH_FNV", "IrFnvHash"],
   ["HASH_MURMUR", "IrMurmurHash"],
   ["PHJ_PROCESS_BUILD_BATCH",
    
"_ZN6impala10PhjBuilder17ProcessBuildBatchEPNS_8RowBatchEPNS_12HashTableCtxEbb"],

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/codegen/llvm-codegen.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc
index 777ca0a..77a45c6 100644
--- a/be/src/codegen/llvm-codegen.cc
+++ b/be/src/codegen/llvm-codegen.cc
@@ -1584,10 +1584,6 @@ static Function* GetLenOptimizedHashFn(
   return codegen->FinalizeFunction(fn);
 }
 
-Function* LlvmCodeGen::GetFnvHashFunction(int len) {
-  return GetLenOptimizedHashFn(this, IRFunction::HASH_FNV, len);
-}
-
 Function* LlvmCodeGen::GetMurmurHashFunction(int len) {
   return GetLenOptimizedHashFn(this, IRFunction::HASH_MURMUR, len);
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/runtime/data-stream-sender.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/data-stream-sender.cc 
b/be/src/runtime/data-stream-sender.cc
index ed156dc..cf749cd 100644
--- a/be/src/runtime/data-stream-sender.cc
+++ b/be/src/runtime/data-stream-sender.cc
@@ -464,17 +464,16 @@ Status DataStreamSender::Send(RuntimeState* state, 
RowBatch* batch) {
     int num_channels = channels_.size();
     for (int i = 0; i < batch->num_rows(); ++i) {
       TupleRow* row = batch->GetRow(i);
-      uint32_t hash_val = HashUtil::FNV_SEED;
-      for (int i = 0; i < partition_exprs_.size(); ++i) {
-        ScalarExprEvaluator* eval = partition_expr_evals_[i];
+      uint64_t hash_val = EXCHANGE_HASH_SEED;
+      for (int j = 0; j < partition_exprs_.size(); ++j) {
+        ScalarExprEvaluator* eval = partition_expr_evals_[j];
         void* partition_val = eval->GetValue(row);
-        // We can't use the crc hash function here because it does not result
-        // in uncorrelated hashes with different seeds.  Instead we must use
-        // fnv hash.
+        // We can't use the crc hash function here because it does not result 
in
+        // uncorrelated hashes with different seeds. Instead we use FastHash.
         // TODO: fix crc hash/GetHashValue()
-        DCHECK(&partition_expr_evals_[i]->root() == partition_exprs_[i]);
-        hash_val = RawValue::GetHashValueFnv(
-            partition_val, partition_exprs_[i]->type(), hash_val);
+        DCHECK(&(eval->root()) == partition_exprs_[j]);
+        hash_val = RawValue::GetHashValueFastHash(
+            partition_val, partition_exprs_[j]->type(), hash_val);
       }
       RETURN_IF_ERROR(channels_[hash_val % num_channels]->AddRow(row));
     }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/runtime/data-stream-sender.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/data-stream-sender.h 
b/be/src/runtime/data-stream-sender.h
index 95a83e5..8ed4bd0 100644
--- a/be/src/runtime/data-stream-sender.h
+++ b/be/src/runtime/data-stream-sender.h
@@ -153,6 +153,9 @@ class DataStreamSender : public DataSink {
   /// Used for Kudu partitioning to round-robin rows that don't correspond to 
a partition
   /// or when errors are encountered.
   int next_unknown_partition_;
+
+  /// An arbitrary hash seed used for exchanges.
+  static constexpr uint64_t EXCHANGE_HASH_SEED = 0x66bd68df22c3ef37;
 };
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/runtime/data-stream-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/data-stream-test.cc 
b/be/src/runtime/data-stream-test.cc
index 79a97b6..3f4844c 100644
--- a/be/src/runtime/data-stream-test.cc
+++ b/be/src/runtime/data-stream-test.cc
@@ -425,8 +425,8 @@ class DataStreamTest : public testing::Test {
         } else if (stream_type == TPartitionType::HASH_PARTITIONED) {
           // hash-partitioned streams send values to the right partition
           int64_t value = *j;
-          uint32_t hash_val =
-              RawValue::GetHashValueFnv(&value, TYPE_BIGINT, 
HashUtil::FNV_SEED);
+          uint64_t hash_val = RawValue::GetHashValueFastHash(&value, 
TYPE_BIGINT,
+              DataStreamSender::EXCHANGE_HASH_SEED);
           EXPECT_EQ(hash_val % receiver_info_.size(), info.receiver_num);
         }
       }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/runtime/raw-value-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/raw-value-test.cc b/be/src/runtime/raw-value-test.cc
index 9c71822..bced866 100644
--- a/be/src/runtime/raw-value-test.cc
+++ b/be/src/runtime/raw-value-test.cc
@@ -79,35 +79,37 @@ TEST_F(RawValueTest, TypeChar) {
 // IMPALA-2270: "", false, and NULL should hash to distinct values.
 TEST_F(RawValueTest, HashEmptyAndNull) {
   uint32_t seed = 12345;
-  uint32_t null_hash = RawValue::GetHashValue(NULL, TYPE_STRING, seed);
-  uint32_t null_hash_fnv = RawValue::GetHashValueFnv(NULL, TYPE_STRING, seed);
-  StringValue empty(NULL, 0);
-  uint32_t empty_hash = RawValue::GetHashValue(&empty, TYPE_STRING, seed);
-  uint32_t empty_hash_fnv = RawValue::GetHashValueFnv(&empty, TYPE_STRING, 
seed);
+  uint32_t hash_null = RawValue::GetHashValue(nullptr, TYPE_STRING, seed);
+  uint64_t fast_hash_null = RawValue::GetHashValueFastHash(nullptr, 
TYPE_STRING, seed);
+  StringValue empty(nullptr, 0);
+  uint32_t hash_empty = RawValue::GetHashValue(&empty, TYPE_STRING, seed);
+  uint64_t fast_hash_empty =
+      RawValue::GetHashValueFastHash(&empty, TYPE_STRING, seed);
   bool false_val = false;
-  uint32_t false_hash = RawValue::GetHashValue(&false_val, TYPE_BOOLEAN, seed);
-  uint32_t false_hash_fnv = RawValue::GetHashValue(&false_val, TYPE_BOOLEAN, 
seed);
-  EXPECT_NE(seed, null_hash);
-  EXPECT_NE(seed, empty_hash);
-  EXPECT_NE(seed, false_hash);
-  EXPECT_NE(seed, null_hash_fnv);
-  EXPECT_NE(seed, empty_hash_fnv);
-  EXPECT_NE(seed, false_hash_fnv);
-  EXPECT_NE(null_hash, empty_hash);
-  EXPECT_NE(null_hash_fnv, empty_hash_fnv);
-  EXPECT_NE(null_hash, false_hash);
-  EXPECT_NE(false_hash, null_hash_fnv);
+  uint32_t hash_false = RawValue::GetHashValue(&false_val, TYPE_BOOLEAN, seed);
+  uint64_t fast_hash_false =
+      RawValue::GetHashValueFastHash(&false_val, TYPE_BOOLEAN, seed);
+  EXPECT_NE(seed, hash_null);
+  EXPECT_NE(seed, hash_empty);
+  EXPECT_NE(seed, hash_false);
+  EXPECT_NE(seed, fast_hash_null);
+  EXPECT_NE(seed, fast_hash_empty);
+  EXPECT_NE(seed, fast_hash_false);
+  EXPECT_NE(hash_null, hash_empty);
+  EXPECT_NE(fast_hash_null, fast_hash_empty);
+  EXPECT_NE(hash_null, hash_false);
+  EXPECT_NE(hash_false, fast_hash_null);
 }
 
-/// IMPALA-2270: Test that FNV hash of (int, "") is not skewed.
+/// IMPALA-2270: Test that FastHash of (int, "") is not skewed.
 TEST(HashUtil, IntNullSkew) {
   int num_values = 100000;
   int num_buckets = 16;
   vector<int> buckets(num_buckets, 0);
   for (int32_t i = 0; i < num_values; ++i) {
-    uint32_t hash = RawValue::GetHashValueFnv(&i, TYPE_INT, 9999);
-    StringValue empty(NULL, 0);
-    hash = RawValue::GetHashValueFnv(&empty, TYPE_STRING, hash);
+    uint64_t hash = RawValue::GetHashValueFastHash(&i, TYPE_INT, 9999);
+    StringValue empty(nullptr, 0);
+    hash = RawValue::GetHashValueFastHash(&empty, TYPE_STRING, hash);
     ++buckets[hash % num_buckets];
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/runtime/raw-value.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/raw-value.cc b/be/src/runtime/raw-value.cc
index d7f4fd5..1db9741 100644
--- a/be/src/runtime/raw-value.cc
+++ b/be/src/runtime/raw-value.cc
@@ -60,7 +60,7 @@ void RawValue::PrintValueAsBytes(const void* value, const 
ColumnType& type,
     case TYPE_STRING:
     case TYPE_VARCHAR:
       string_val = reinterpret_cast<const StringValue*>(value);
-      stream->write(static_cast<char*>(string_val->ptr), string_val->len);
+      stream->write(string_val->ptr, string_val->len);
       break;
     case TYPE_TIMESTAMP:
       stream->write(chars, TimestampValue::Size());
@@ -98,7 +98,7 @@ void RawValue::PrintValue(const void* value, const 
ColumnType& type, int scale,
     case TYPE_STRING:
     case TYPE_VARCHAR:
       string_val = reinterpret_cast<const StringValue*>(value);
-      tmp.assign(static_cast<char*>(string_val->ptr), string_val->len);
+      tmp.assign(string_val->ptr, string_val->len);
       str->swap(tmp);
       return;
     case TYPE_CHAR:
@@ -193,31 +193,29 @@ void RawValue::Write(const void* value, Tuple* tuple, 
const SlotDescriptor* slot
   }
 }
 
-uint32_t RawValue::GetHashValueFnv(const void* v, const ColumnType& type, 
uint32_t seed) {
-  // Use HashCombine with arbitrary constant to ensure we don't return seed.
-  if (v == NULL) return HashUtil::HashCombine32(HASH_VAL_NULL, seed);
-
+uint64_t RawValue::GetHashValueFastHash(const void* v, const ColumnType& type,
+    uint64_t seed) {
+  // Hash with an arbitrary constant to ensure we don't return seed.
+  if (v == nullptr) {
+    return HashUtil::FastHash64(&HASH_VAL_NULL, sizeof(HASH_VAL_NULL), seed);
+  }
   switch (type.type) {
     case TYPE_STRING:
     case TYPE_VARCHAR: {
       const StringValue* string_value = reinterpret_cast<const 
StringValue*>(v);
-      if (string_value->len == 0) {
-        return HashUtil::HashCombine32(HASH_VAL_EMPTY, seed);
-      }
-      return HashUtil::FnvHash64to32(string_value->ptr, string_value->len, 
seed);
+      return HashUtil::FastHash64(string_value->ptr,
+          static_cast<size_t>(string_value->len), seed);
     }
-    case TYPE_BOOLEAN:
-      return HashUtil::HashCombine32(*reinterpret_cast<const bool*>(v), seed);
-    case TYPE_TINYINT: return HashUtil::FnvHash64to32(v, 1, seed);
-    case TYPE_SMALLINT: return HashUtil::FnvHash64to32(v, 2, seed);
-    case TYPE_INT: return HashUtil::FnvHash64to32(v, 4, seed);
-    case TYPE_BIGINT: return HashUtil::FnvHash64to32(v, 8, seed);
-    case TYPE_FLOAT: return HashUtil::FnvHash64to32(v, 4, seed);
-    case TYPE_DOUBLE: return HashUtil::FnvHash64to32(v, 8, seed);
-    case TYPE_TIMESTAMP: return HashUtil::FnvHash64to32(v, 12, seed);
-    case TYPE_CHAR:
-      return HashUtil::FnvHash64to32(v, type.len, seed);
-    case TYPE_DECIMAL: return HashUtil::FnvHash64to32(v, type.GetByteSize(), 
seed);
+    case TYPE_BOOLEAN: return HashUtil::FastHash64(v, 1, seed);
+    case TYPE_TINYINT: return HashUtil::FastHash64(v, 1, seed);
+    case TYPE_SMALLINT: return HashUtil::FastHash64(v, 2, seed);
+    case TYPE_INT: return HashUtil::FastHash64(v, 4, seed);
+    case TYPE_BIGINT: return HashUtil::FastHash64(v, 8, seed);
+    case TYPE_FLOAT: return HashUtil::FastHash64(v, 4, seed);
+    case TYPE_DOUBLE: return HashUtil::FastHash64(v, 8, seed);
+    case TYPE_TIMESTAMP: return HashUtil::FastHash64(v, 12, seed);
+    case TYPE_CHAR: return HashUtil::FastHash64(v, type.len, seed);
+    case TYPE_DECIMAL: return HashUtil::FastHash64(v, type.GetByteSize(), 
seed);
     default: DCHECK(false); return 0;
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/runtime/raw-value.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/raw-value.h b/be/src/runtime/raw-value.h
index 89ae323..6ee9191 100644
--- a/be/src/runtime/raw-value.h
+++ b/be/src/runtime/raw-value.h
@@ -71,12 +71,10 @@ class RawValue {
   static inline uint32_t GetHashValueNonNull(const T* v, const ColumnType& 
type,
       uint32_t seed = 0);
 
-  /// Get a 32-bit hash value using the FNV hash function.
-  /// Using different seeds with FNV results in different hash functions.
-  /// GetHashValue() does not have this property and cannot be safely used as 
the first
-  /// step in data repartitioning. However, GetHashValue() can be 
significantly faster.
-  /// TODO: fix GetHashValue
-  static uint32_t GetHashValueFnv(const void* v, const ColumnType& type, 
uint32_t seed);
+  /// Get a 64-bit hash value using the FastHash function.
+  /// https://code.google.com/archive/p/fast-hash/
+  static uint64_t GetHashValueFastHash(const void* v, const ColumnType& type,
+      uint64_t seed);
 
   /// Compares both values.
   /// Return value is < 0  if v1 < v2, 0 if v1 == v2, > 0 if v1 > v2.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/util/hash-util-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/util/hash-util-ir.cc b/be/src/util/hash-util-ir.cc
index cf812eb..da3271b 100644
--- a/be/src/util/hash-util-ir.cc
+++ b/be/src/util/hash-util-ir.cc
@@ -23,11 +23,6 @@
 using namespace impala;
 
 extern "C"
-uint32_t IrFnvHash(const void* data, int32_t bytes, uint32_t hash) {
-  return HashUtil::FnvHash64to32(data, bytes, hash);
-}
-
-extern "C"
 uint32_t IrMurmurHash(const void* data, int32_t bytes, uint32_t hash) {
   return HashUtil::MurmurHash2_64(data, bytes, hash);
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/be/src/util/hash-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/hash-util.h b/be/src/util/hash-util.h
index d0c791a..054cb8a 100644
--- a/be/src/util/hash-util.h
+++ b/be/src/util/hash-util.h
@@ -231,6 +231,48 @@ class HashUtil {
     // Rehash32to32(hash2) is minimal.
     return (static_cast<uint64_t>(hash) * m + a) >> 32;
   }
+
+  // The FastHash64 implementation is adapted from
+  // https://code.google.com/archive/p/fast-hash/
+  // available under MIT license.
+  static inline uint64_t FastHashMix(uint64_t h) {
+    h ^= h >> 23;
+    h *= 0x2127599bf4325c37ULL;
+    h ^= h >> 47;
+    return h;
+  }
+
+  static inline uint64_t FastHash64(const void* buf, int64_t len, uint64_t 
seed)
+  {
+    const uint64_t m = 0x880355f21e6d1965ULL;
+    const uint64_t* pos = reinterpret_cast<const uint64_t*>(buf);
+    const uint64_t* end = pos + (len / 8);
+    uint64_t h = seed ^ (len * m);
+    uint64_t v;
+
+    while (pos != end) {
+      v  = *pos++;
+      h ^= FastHashMix(v);
+      h *= m;
+    }
+
+    const uint8_t* pos2 = reinterpret_cast<const uint8_t*>(pos);
+    v = 0;
+
+    switch (len & 7) {
+      case 7: v ^= static_cast<uint64_t>(pos2[6]) << 48;
+      case 6: v ^= static_cast<uint64_t>(pos2[5]) << 40;
+      case 5: v ^= static_cast<uint64_t>(pos2[4]) << 32;
+      case 4: v ^= static_cast<uint64_t>(pos2[3]) << 24;
+      case 3: v ^= static_cast<uint64_t>(pos2[2]) << 16;
+      case 2: v ^= static_cast<uint64_t>(pos2[1]) << 8;
+      case 1: v ^= static_cast<uint64_t>(pos2[0]);
+        h ^= FastHashMix(v);
+        h *= m;
+    }
+
+    return FastHashMix(h);
+  }
 };
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/73d1bc3a/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test
 
b/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test
index 35e27c5..c72558a 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/nested-types-runtime.test
@@ -139,16 +139,16 @@ bigint,string,bigint
 ---- QUERY
 # Test several analytic functions with incompatible partition by and order by 
clauses
 # on top of a subplan that flattens a map.
-select id, key, value, max(key) over(partition by id), row_number() over 
(order by value)
-from complextypestbl t, t.int_map
+select id, key, value, max(key) over(partition by id), row_number()
+over (order by value, key) from complextypestbl t, t.int_map
 ---- RESULTS: VERIFY_IS_EQUAL_SORTED
+8,'k1',-1,'k1',1
 1,'k1',1,'k2',2
-1,'k2',100,'k2',4
 2,'k1',2,'k2',3
-2,'k2',NULL,'k2',5
-7,'k1',NULL,'k3',6
+1,'k2',100,'k2',4
+7,'k1',NULL,'k3',5
+2,'k2',NULL,'k2',6
 7,'k3',NULL,'k3',7
-8,'k1',-1,'k1',1
 ---- TYPES
 bigint,string,int,string,bigint
 ====

Reply via email to