Add FNV, Zobrist, and SIMD hash functions to the int hash benchmark.

Additionally, change the shift parameter of RotateRight to a
compile-time constant and add "inline" to the hash functions,
increasing performance dramatically. The compiler can't inline the
SIMD versions because they use Intel intrinsics -- added a TODO to
add these intrinsics to sse-util.h.
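For context on the RotateRight change, a minimal before/after sketch
(illustrative only; the real version is in the diff below). With the shift
count as a template parameter, the range checks move from runtime DCHECKs to
compile-time static_asserts, and GCC and Clang can fold the two shifts into a
single 'ror' instruction:

    #include <cstdint>
    #include <limits>

    // Before: the shift count is a runtime argument.
    uint32_t RotateRightRuntime(uint32_t x, int shift) {
      return (x << (std::numeric_limits<uint32_t>::digits - shift)) | (x >> shift);
    }

    // After: the shift count is a compile-time constant, so a call like
    // RotateRight<12>(x) compiles down to one rotate instruction.
    template <int SHIFT>
    inline uint32_t RotateRight(uint32_t x) {
      static_assert(SHIFT > 0 && SHIFT < 32, "shift must be in (0, 32)");
      return (x << (32 - SHIFT)) | (x >> SHIFT);
    }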
Change-Id: I11d48f8816d5b129858a1f773015e51049dd1d61
Reviewed-on: http://gerrit.cloudera.org:8080/4313
Reviewed-by: Tim Armstrong <[email protected]>
Tested-by: Internal Jenkins

Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/39e01abc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/39e01abc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/39e01abc

Branch: refs/heads/master
Commit: 39e01abcf38bafaa03810fdbfd743a1558db92d8
Parents: b35689d
Author: Jim Apple <[email protected]>
Authored: Mon Sep 5 14:15:06 2016 -0700
Committer: Internal Jenkins <[email protected]>
Committed: Wed Sep 7 01:35:30 2016 +0000

----------------------------------------------------------------------
 be/src/benchmarks/int-hash-benchmark.cc | 302 ++++++++++++++++++++++-----
 1 file changed, 245 insertions(+), 57 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/39e01abc/be/src/benchmarks/int-hash-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/int-hash-benchmark.cc b/be/src/benchmarks/int-hash-benchmark.cc
index a0787f1..9bed3c4 100644
--- a/be/src/benchmarks/int-hash-benchmark.cc
+++ b/be/src/benchmarks/int-hash-benchmark.cc
@@ -19,10 +19,14 @@
 #include <iostream>
 #include <limits>
+#include <memory>
 #include <vector>
 
+#include <immintrin.h>
+
 #include "util/benchmark.h"
 #include "util/cpu-info.h"
+#include "util/hash-util.h"
 #include "util/sse-util.h"
 
 using namespace std;
@@ -31,22 +35,49 @@ using namespace impala;
 
 // Test hash functions that take integers as arguments and produce integers as the
 // result.
 //
 // Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
-// 32 -> 32:            Function     Rate (iters/ms)          Comparison
-// ----------------------------------------------------------------------
-//                      Jenkins2               14.06                  1X
-//                      Jenkins1               16.97              1.207X
-//                       MultRot               19.14              1.361X
-//              MultiplyAddShift               21.21              1.509X
-//                 MultiplyShift                25.3              1.799X
-//                           CRC               25.53              1.816X
+// 32 -> 32:                       Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                                (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//                                      FNV      25.7       26     26.4         1X         1X         1X
+//                                  Zobrist      30.2     30.7     30.9      1.18X      1.18X      1.17X
+//                                  MultRot      42.6     43.4     43.4      1.66X      1.67X      1.64X
+//                         MultiplyAddShift      42.4     43.2     43.2      1.65X      1.66X      1.64X
+//                                 Jenkins1      51.8       54       54      2.02X      2.08X      2.05X
+//                                 Jenkins2      66.2     67.4     67.5      2.58X      2.59X      2.56X
+//                                      CRC      98.6      100      101      3.84X      3.85X      3.84X
+//                            MultiplyShift       150      152      153      5.84X      5.86X      5.79X
+//
+// 32 x 4 -> 32 x 4:               Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                                (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//                   (Multiple<Zobrist, 4>)      30.8     31.2     31.4         1X         1X         1X
+//          (Multiple<MultiplyAddShift, 4>)      44.2       45       45      1.43X      1.44X      1.43X
+//                       (Multiple<CRC, 4>)       118      120      121      3.84X      3.86X      3.85X
+//             (Multiple<MultiplyShift, 4>)       156      159      159      5.07X       5.1X      5.08X
+//                      MultiplyAddShift128      75.7     77.2     77.2      2.46X      2.48X      2.46X
+//                         MultiplyShift128       128      131      133      4.16X      4.21X      4.23X
+//
+// 32 x 8 -> 32 x 8:               Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                                (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//                   (Multiple<Zobrist, 8>)        31     31.5     31.8         1X         1X         1X
+//          (Multiple<MultiplyAddShift, 8>)      44.3     44.5     45.2      1.43X      1.41X      1.42X
+//                       (Multiple<CRC, 8>)       121      123      123       3.9X       3.9X      3.88X
+//             (Multiple<MultiplyShift, 8>)       158      159      160      5.11X      5.05X      5.04X
+//                         Zobrist256simple      16.5     16.5     16.6     0.533X     0.524X     0.522X
+//                         Zobrist256gather      18.8     19.2     19.4     0.608X     0.608X      0.61X
+//                      MultiplyAddShift256       151      154      154      4.88X      4.88X      4.84X
+//                         MultiplyShift256       209      212      212      6.73X      6.71X      6.67X
 
 // Rotate 32 bits right by 'shift'. This is _rotr in the intel intrinsics, but that isn't
 // usable on Clang yet. Fortunately, both GCC and Clang can optimize this to use the 'ror'
 // instruction.
-uint32_t RotateRight(uint32_t x, int shift) {
-  DCHECK_GT(shift, 0);
-  DCHECK_LT(shift, std::numeric_limits<decltype(x)>::digits);
-  return (x << (std::numeric_limits<decltype(x)>::digits - shift)) | (x >> shift);
+template<int SHIFT>
+inline uint32_t RotateRight(uint32_t x) {
+  static_assert(SHIFT > 0, "Only positive shifts are defined behavior and useful");
+  static_assert(
+      SHIFT < std::numeric_limits<decltype(x)>::digits, "This much shift is just 0");
+  return (x << (std::numeric_limits<decltype(x)>::digits - SHIFT)) | (x >> SHIFT);
 }
 
 // Make a random uint32_t, avoiding the absent high bit and the low-entropy low bits
@@ -60,61 +91,178 @@ uint32_t MakeRandU32() {
 
 // Almost universal hashing, M. Dietzfelbinger, T. Hagerup, J. Katajainen, and M.
 // Penttonen, "A reliable randomized algorithm for the closest-pair problem".
-uint32_t MultiplyShift(uint32_t x) {
+inline void MultiplyShift(uint32_t* x) {
   static const uint32_t m = 0x61eaf8e9u;
-  return x*m;
+  *x = *x * m;
+}
+
+// Like MultiplyShift, but using SSE's 128-bit SIMD registers to do 4 at once.
+//
+// TODO: Add the Intel intrinsics used in this function and the other functions in this
+// file to sse-util.h so that these functions can be inlined.
+inline void MultiplyShift128(__m128i* x) __attribute__((__target__("sse4.1")));
+inline void MultiplyShift128(__m128i* x) {
+  const __m128i m = _mm_set1_epi32(0x61eaf8e9);
+  _mm_storeu_si128(x, _mm_mullo_epi32(_mm_loadu_si128(x), m));
+}
+
+// Like MultiplyShift, but using AVX2's 256-bit SIMD registers to do 8 at once.
+//
+// Not inline, because it degrades the performance for unknown reasons.
+void MultiplyShift256(__m256i* x) __attribute__((__target__("avx2")));
+void MultiplyShift256(__m256i* x) {
+  const __m256i m = _mm256_set1_epi32(0x61eaf8e9);
+  _mm256_storeu_si256(x, _mm256_mullo_epi32(_mm256_loadu_si256(x), m));
 }
 
 // 2-independent hashing. M. Dietzfelbinger, "Universal hashing and k-wise independent
 // random variables via integer arithmetic without primes"
-uint32_t MultiplyAddShift(uint32_t x) {
+inline void MultiplyAddShift(uint32_t* x) {
   static const uint64_t m = 0xa1f1bd3e020b4be0ull, a = 0x86b0426193d86e66ull;
-  return (static_cast<uint64_t>(x) * m + a) >> 32;
+  *x = (static_cast<uint64_t>(*x) * m + a) >> 32;
+}
+
+// Like MultiplyAddShift, but using SSE's 128-bit SIMD registers to do 4 at once.
+inline void MultiplyAddShift128(__m128i* x) __attribute__((__target__("sse4.1")));
+inline void MultiplyAddShift128(__m128i* x) {
+  const auto m = _mm_set1_epi64x(0xa1f1bd3e020b4be0ull),
+      mhi = _mm_set1_epi32(0xa1f1bd3e),
+      a = _mm_set1_epi64x(0x86b0426193d86e66ull);
+  auto input = _mm_loadu_si128(x);
+  auto prod32easy = _mm_mullo_epi32(input, mhi);
+  auto input_odds = _mm_srli_epi64(input, 32);
+  auto prod64_evens = _mm_mul_epu32(input, m),
+      prod64_odds = _mm_mul_epu32(input_odds, m);
+  prod64_evens = _mm_add_epi64(a, prod64_evens);
+  prod64_odds = _mm_add_epi64(a, prod64_odds);
+  auto prod32hard = _mm_unpackhi_epi32(prod64_evens, prod64_odds);
+  _mm_storeu_si128(x, _mm_add_epi32(prod32easy, prod32hard));
+}
+
+// Like MultiplyAddShift, but using AVX2's 256-bit SIMD registers to do 8 at once.
+inline void MultiplyAddShift256(__m256i* x) __attribute__((__target__("avx2")));
+inline void MultiplyAddShift256(__m256i* x) {
+  const __m256i m = _mm256_set1_epi64x(0xa1f1bd3e020b4be0ull),
+      mhi = _mm256_set1_epi32(0xa1f1bd3e),
+      a = _mm256_set1_epi64x(0x86b0426193d86e66ull);
+  __m256i input = _mm256_loadu_si256(x);
+  __m256i prod32easy = _mm256_mullo_epi32(input, mhi);
+  __m256i input_odds = _mm256_srli_epi64(input, 32);
+  __m256i prod64_evens = _mm256_mul_epu32(input, m),
+      prod64_odds = _mm256_mul_epu32(input_odds, m);
+  prod64_evens = _mm256_add_epi64(a, prod64_evens);
+  prod64_odds = _mm256_add_epi64(a, prod64_odds);
+  __m256i prod32hard = _mm256_unpackhi_epi32(prod64_evens, prod64_odds);
+  _mm256_storeu_si256(x, _mm256_add_epi32(prod32easy, prod32hard));
 }
 
 // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-int32_t Jenkins1(int32_t x) {
-  x = ~x + (x << 15); // x = (x << 15) - x - 1;
-  x = x ^ RotateRight(x, 12);
-  x = x + (x << 2);
-  x = x ^ RotateRight(x, 4);
-  x = x * 2057; // x = (x + (x << 3)) + (x << 11);
-  x = x ^ RotateRight(x, 16);
-  return x;
+inline void Jenkins1(int32_t* x) {
+  *x = ~*x + (*x << 15); // x = (x << 15) - x - 1;
+  *x = *x ^ RotateRight<12>(*x);
+  *x = *x + (*x << 2);
+  *x = *x ^ RotateRight<4>(*x);
+  *x = *x * 2057; // x = (x + (x << 3)) + (x << 11);
+  *x = *x ^ RotateRight<16>(*x);
 }
 
 // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-uint32_t Jenkins2(uint32_t a) {
-  a = (a + 0x7ed55d16) + (a << 12);
-  a = (a ^ 0xc761c23c) ^ (a >> 19);
-  a = (a + 0x165667b1) + (a << 5);
-  a = (a + 0xd3a2646c) ^ (a << 9);
-  a = (a + 0xfd7046c5) + (a << 3);
-  a = (a ^ 0xb55a4f09) ^ (a >> 16);
-  return a;
+inline void Jenkins2(uint32_t* a) {
+  *a = (*a + 0x7ed55d16) + (*a << 12);
+  *a = (*a ^ 0xc761c23c) ^ (*a >> 19);
+  *a = (*a + 0x165667b1) + (*a << 5);
+  *a = (*a + 0xd3a2646c) ^ (*a << 9);
+  *a = (*a + 0xfd7046c5) + (*a << 3);
+  *a = (*a ^ 0xb55a4f09) ^ (*a >> 16);
 }
 
 // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-int32_t MultRot(int32_t key) {
+inline void MultRot(int32_t* key) {
   static const int32_t c2 = 0x27d4eb2d; // a prime or an odd constant
-  key = (key ^ 61) ^ RotateRight(key, 16);
-  key = key + (key << 3);
-  key = key ^ RotateRight(key, 4);
-  key = key * c2;
-  key = key ^ RotateRight(key, 15);
-  return key;
+  *key = (*key ^ 61) ^ RotateRight<16>(*key);
+  *key = *key + (*key << 3);
+  *key = *key ^ RotateRight<4>(*key);
+  *key = *key * c2;
+  *key = *key ^ RotateRight<15>(*key);
+}
+
+inline void CRC(uint32_t* x) {
+  *x = SSE4_crc32_u32(*x, 0xab8ce2abu);
+}
+
+inline void FNV(uint32_t* key) {
+  *key = HashUtil::FnvHash64to32(key, sizeof(*key), HashUtil::FNV_SEED);
+}
+
+// Zobrist hashing, also known as tabulation hashing or simple tabulation hashing, is an
+// old technique that has been recently analyzed and found to be very good for a number of
+// applications. See "The Power of Simple Tabulation Hashing", by Mihai Patrascu and
+// Mikkel Thorup.
+
+uint32_t ZOBRIST_DATA[4][256];
+
+inline void Zobrist(uint32_t* key) {
+  const uint8_t* key_chars = reinterpret_cast<const uint8_t*>(key);
+  *key = ZOBRIST_DATA[0][key_chars[0]] ^ ZOBRIST_DATA[1][key_chars[1]]
+      ^ ZOBRIST_DATA[2][key_chars[2]] ^ ZOBRIST_DATA[3][key_chars[3]];
+}
+
+// Like Zobrist, but uses AVX2's "gather" primitives to hash 8 values at once.
+inline void Zobrist256gather(__m256i* key) __attribute__((__target__("avx2")));
+inline void Zobrist256gather(__m256i* key) {
+  const auto k = _mm256_loadu_si256(key);
+  const auto low_mask = _mm256_set1_epi32(0xff);
+  auto k0 = _mm256_and_si256(low_mask, k),
+      k1 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 8)),
+      k2 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 16)),
+      k3 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 24));
+  k0 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[0]), k0, 1);
+  k1 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[1]), k1, 1);
+  k2 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[2]), k2, 1);
+  k3 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[3]), k3, 1);
+  auto k01 = _mm256_xor_si256(k0, k1), k23 = _mm256_xor_si256(k2, k3);
+  _mm256_storeu_si256(key, _mm256_xor_si256(k01, k23));
 }
 
-uint32_t CRC(uint32_t x) {
-  return SSE4_crc32_u32(x,0xab8ce2abu);
+// Like Zobrist256gather, but only uses AVX2's SIMD xor, not its gather.
+inline void Zobrist256simple(uint32_t (*key)[8]) __attribute__((__target__("avx2")));
+inline void Zobrist256simple(uint32_t (*key)[8]) {
+  uint32_t row[4][8];
+  const uint8_t (*key_chars)[8][4] = reinterpret_cast<const uint8_t (*)[8][4]>(key);
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      row[i][j] = ZOBRIST_DATA[i][(*key_chars)[j][i]];
+    }
+  }
+  auto result0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[0])),
+      result1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[1])),
+      result2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[2])),
+      result3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[3]));
+  auto k01 = _mm256_xor_si256(result0, result1),
+      k23 = _mm256_xor_si256(result2, result3);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(*key), _mm256_xor_si256(k01, k23));
+}
+
+// Perform one hash function the given number of times. This can sometimes
+// auto-vectorize.
+//
+// TODO: We could also test the costs of running on non-contiguous uint32_t's. For
+// instance, ExprValuesCache.expr_values_array_ might have multiple values to hash per
+// input row.
+template <void (*F)(uint32_t*), size_t N>
+inline void Multiple(uint32_t (*x)[N]) {
+  for (int i = 0; i < N; ++i) {
+    F((*x) + i);
+  }
 }
 
-template<typename T, T (*HASH)(T)>
+// The size of the test data we run each hash function on:
+static const size_t DATA_SIZE = 1 << 15;
+
+template<typename T, void (*HASH)(T*)>
 void Run(int batch_size, void* data) {
-  vector<T>* d = reinterpret_cast<vector<T>*>(data);
+  T* d = reinterpret_cast<T*>(data);
   for (int i = 0; i < batch_size; ++i) {
-    for (int j = 0; j < d->size(); ++j) {
-      (*d)[j] = HASH((*d)[j]);
+    for (int j = 0; j < ((sizeof(uint32_t)) * DATA_SIZE) / sizeof(T); ++j) {
+      HASH(&d[j]);
     }
   }
 }
@@ -124,21 +272,61 @@ int main() {
 
   cout << endl << Benchmark::GetMachineInfo() << endl;
 
-  vector<uint32_t> ud(1 << 15);
-  for (size_t i = 0; i < ud.size(); ++i) {
+  unique_ptr<uint32_t[]> ud(new uint32_t[DATA_SIZE]);
+  for (size_t i = 0; i < (DATA_SIZE); ++i) {
     ud[i] = MakeRandU32();
   }
 
-  Benchmark suite("32 -> 32");
+  for (size_t i = 0; i < 4; ++i) {
+    for (size_t j = 0; j < 256; ++j) {
+      ZOBRIST_DATA[i][j] = MakeRandU32();
+    }
+  }
+
+  Benchmark suite32("32 -> 32");
 
-#define BENCH(T,x) suite.AddBenchmark(#x, Run<T, x>, &ud)
-  BENCH(uint32_t, Jenkins2);
-  BENCH(int32_t, Jenkins1);
-  BENCH(int32_t, MultRot);
-  BENCH(uint32_t, MultiplyAddShift);
-  BENCH(uint32_t, MultiplyShift);
-  BENCH(uint32_t, CRC);
-#undef BENCH
+#define BENCH(T,x) AddBenchmark(#x, Run<T, x>, ud.get())
+
+  suite32.BENCH(uint32_t, FNV);
+  suite32.BENCH(uint32_t, Zobrist);
+  suite32.BENCH(int32_t, MultRot);
+  suite32.BENCH(uint32_t, MultiplyAddShift);
+  suite32.BENCH(int32_t, Jenkins1);
+  suite32.BENCH(uint32_t, Jenkins2);
+  if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) suite32.BENCH(uint32_t, CRC);
+  suite32.BENCH(uint32_t, MultiplyShift);
 
-  cout << suite.Measure() << endl;
+  cout << suite32.Measure() << endl;
+
+  Benchmark suite32x4("32 x 4 -> 32 x 4");
+
+  suite32x4.BENCH(uint32_t[4], (Multiple<Zobrist, 4>));
+  suite32x4.BENCH(uint32_t[4], (Multiple<MultiplyAddShift, 4>));
+  if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
+    suite32x4.BENCH(uint32_t[4], (Multiple<CRC, 4>));
+  }
+  suite32x4.BENCH(uint32_t[4], (Multiple<MultiplyShift, 4>));
+  if (CpuInfo::IsSupported(CpuInfo::SSE4_1)) {
+    suite32x4.BENCH(__m128i, MultiplyAddShift128);
+    suite32x4.BENCH(__m128i, MultiplyShift128);
+  }
+
+  cout << suite32x4.Measure() << endl;
+
+  Benchmark suite32x8("32 x 8 -> 32 x 8");
+
+  suite32x8.BENCH(uint32_t[8], (Multiple<Zobrist, 8>));
+  suite32x8.BENCH(uint32_t[8], (Multiple<MultiplyAddShift, 8>));
+  suite32x8.BENCH(uint32_t[8], (Multiple<CRC, 8>));
+  suite32x8.BENCH(uint32_t[8], (Multiple<MultiplyShift, 8>));
+  if (CpuInfo::IsSupported(CpuInfo::AVX2)) {
+    suite32x8.BENCH(uint32_t[8], Zobrist256simple);
+    suite32x8.BENCH(__m256i, Zobrist256gather);
+    suite32x8.BENCH(__m256i, MultiplyAddShift256);
+    suite32x8.BENCH(__m256i, MultiplyShift256);
+  }
+
+  cout << suite32x8.Measure() << endl;
+
+#undef BENCH
 }
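A note for readers of the SIMD MultiplyAddShift variants in the diff above:
SSE4.1 and AVX2 have no 32x64-bit multiply, so each lane's
(uint64_t(x) * m + a) >> 32 is split into a cheap high-half product and a full
low-half product. A scalar sketch of the per-lane decomposition (illustrative
only, not part of the patch):

    #include <cstdint>

    // Write m = (mhi << 32) | mlo. Then, modulo 2^32,
    //   ((uint64_t)x * m + a) >> 32  ==  x * mhi + (((uint64_t)x * mlo + a) >> 32).
    // prod32easy is the x * mhi term (_mm_mullo_epi32); prod32hard is the high
    // half of x * mlo + a (_mm_mul_epu32, _mm_add_epi64, then the unpack).
    uint32_t MultiplyAddShiftScalar(uint32_t x) {
      const uint64_t m = 0xa1f1bd3e020b4be0ull, a = 0x86b0426193d86e66ull;
      const uint32_t mhi = m >> 32, mlo = static_cast<uint32_t>(m);
      const uint32_t easy = x * mhi;                                     // prod32easy
      const uint32_t hard = (static_cast<uint64_t>(x) * mlo + a) >> 32;  // prod32hard
      return easy + hard;  // == (uint32_t)((uint64_t(x) * m + a) >> 32)
    }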

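One subtlety in the new Run template: DATA_SIZE counts uint32_t elements, so
the inner loop bound (sizeof(uint32_t) * DATA_SIZE) / sizeof(T) converts the
buffer's byte size into a count of T-sized chunks, and every suite therefore
hashes the same 128 KiB per batch iteration, whatever the lane width. A worked
check of that arithmetic (illustrative only):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    static const size_t DATA_SIZE = 1 << 15;  // 32768 uint32_t's = 131072 bytes

    int main() {
      // T = uint32_t:    131072 / 4  = 32768 single hashes per iteration
      // T = uint32_t[4]: 131072 / 16 =  8192 chunks of 4 hashes each
      // T = uint32_t[8]: 131072 / 32 =  4096 chunks of 8 hashes each
      printf("%zu %zu %zu\n", sizeof(uint32_t) * DATA_SIZE / sizeof(uint32_t),
          sizeof(uint32_t) * DATA_SIZE / sizeof(uint32_t[4]),
          sizeof(uint32_t) * DATA_SIZE / sizeof(uint32_t[8]));
      // Prints: 32768 8192 4096
      return 0;
    }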