Add FNV, Zobrist, and SIMD hash functions to the int hash benchmark.

Additionally, change the shift parameter of RotateRight to a
compile-time constant and add "inline" to the hash functions,
increasing performance dramatically. The compiler can't inline the
SIMD versions because they use Intel intrinsics -- added a TODO to
add these intrinsics to sse-util.h.
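For context on the RotateRight change, a minimal before/after sketch
(illustrative only; the real version is in the diff below). With the shift
count as a template parameter, the range checks move from runtime DCHECKs to
compile-time static_asserts, and GCC and Clang can fold the two shifts into a
single 'ror' instruction:

    #include <cstdint>
    #include <limits>

    // Before: the shift count is a runtime argument.
    uint32_t RotateRightRuntime(uint32_t x, int shift) {
      return (x << (std::numeric_limits<uint32_t>::digits - shift)) | (x >> shift);
    }

    // After: the shift count is a compile-time constant, so a call like
    // RotateRight<12>(x) compiles down to one rotate instruction.
    template <int SHIFT>
    inline uint32_t RotateRight(uint32_t x) {
      static_assert(SHIFT > 0 && SHIFT < 32, "shift must be in (0, 32)");
      return (x << (32 - SHIFT)) | (x >> SHIFT);
    }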
Change-Id: I11d48f8816d5b129858a1f773015e51049dd1d61
Reviewed-on: http://gerrit.cloudera.org:8080/4313
Reviewed-by: Tim Armstrong <[email protected]>
Tested-by: Internal Jenkins

Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/39e01abc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/39e01abc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/39e01abc

Branch: refs/heads/master
Commit: 39e01abcf38bafaa03810fdbfd743a1558db92d8
Parents: b35689d
Author: Jim Apple <[email protected]>
Authored: Mon Sep 5 14:15:06 2016 -0700
Committer: Internal Jenkins <[email protected]>
Committed: Wed Sep 7 01:35:30 2016 +0000

----------------------------------------------------------------------
 be/src/benchmarks/int-hash-benchmark.cc | 302 ++++++++++++++++++++++-----
 1 file changed, 245 insertions(+), 57 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/39e01abc/be/src/benchmarks/int-hash-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/int-hash-benchmark.cc b/be/src/benchmarks/int-hash-benchmark.cc
index a0787f1..9bed3c4 100644
--- a/be/src/benchmarks/int-hash-benchmark.cc
+++ b/be/src/benchmarks/int-hash-benchmark.cc
@@ -19,10 +19,14 @@
 #include <iostream>
 #include <limits>
+#include <memory>
 #include <vector>
 
+#include <immintrin.h>
+
 #include "util/benchmark.h"
 #include "util/cpu-info.h"
+#include "util/hash-util.h"
 #include "util/sse-util.h"
 
 using namespace std;
@@ -31,22 +35,49 @@ using namespace impala;
 
 // Test hash functions that take integers as arguments and produce integers as the
 // result.
 //
 // Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
-// 32 -> 32:            Function     Rate (iters/ms)          Comparison
-// ----------------------------------------------------------------------
-//                      Jenkins2               14.06                  1X
-//                      Jenkins1               16.97              1.207X
-//                       MultRot               19.14              1.361X
-//              MultiplyAddShift               21.21              1.509X
-//                 MultiplyShift                25.3              1.799X
-//                           CRC               25.53              1.816X
+// 32 -> 32:                       Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                                (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//                                      FNV      25.7       26     26.4         1X         1X         1X
+//                                  Zobrist      30.2     30.7     30.9      1.18X      1.18X      1.17X
+//                                  MultRot      42.6     43.4     43.4      1.66X      1.67X      1.64X
+//                         MultiplyAddShift      42.4     43.2     43.2      1.65X      1.66X      1.64X
+//                                 Jenkins1      51.8       54       54      2.02X      2.08X      2.05X
+//                                 Jenkins2      66.2     67.4     67.5      2.58X      2.59X      2.56X
+//                                      CRC      98.6      100      101      3.84X      3.85X      3.84X
+//                            MultiplyShift       150      152      153      5.84X      5.86X      5.79X
+//
+// 32 x 4 -> 32 x 4:               Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                                (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//                   (Multiple<Zobrist, 4>)      30.8     31.2     31.4         1X         1X         1X
+//          (Multiple<MultiplyAddShift, 4>)      44.2       45       45      1.43X      1.44X      1.43X
+//                       (Multiple<CRC, 4>)       118      120      121      3.84X      3.86X      3.85X
+//             (Multiple<MultiplyShift, 4>)       156      159      159      5.07X       5.1X      5.08X
+//                      MultiplyAddShift128      75.7     77.2     77.2      2.46X      2.48X      2.46X
+//                         MultiplyShift128       128      131      133      4.16X      4.21X      4.23X
+//
+// 32 x 8 -> 32 x 8:               Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                                (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//                   (Multiple<Zobrist, 8>)        31     31.5     31.8         1X         1X         1X
+//          (Multiple<MultiplyAddShift, 8>)      44.3     44.5     45.2      1.43X      1.41X      1.42X
+//                       (Multiple<CRC, 8>)       121      123      123       3.9X       3.9X      3.88X
+//             (Multiple<MultiplyShift, 8>)       158      159      160      5.11X      5.05X      5.04X
+//                         Zobrist256simple      16.5     16.5     16.6     0.533X     0.524X     0.522X
+//                         Zobrist256gather      18.8     19.2     19.4     0.608X     0.608X      0.61X
+//                      MultiplyAddShift256       151      154      154      4.88X      4.88X      4.84X
+//                         MultiplyShift256       209      212      212      6.73X      6.71X      6.67X
 
 // Rotate 32 bits right by 'shift'. This is _rotr in the intel intrinsics, but that isn't
 // usable on Clang yet. Fortunately, both GCC and Clang can optimize this to use the 'ror'
 // instruction.
-uint32_t RotateRight(uint32_t x, int shift) {
-  DCHECK_GT(shift, 0);
-  DCHECK_LT(shift, std::numeric_limits<decltype(x)>::digits);
-  return (x << (std::numeric_limits<decltype(x)>::digits - shift)) | (x >> shift);
+template<int SHIFT>
+inline uint32_t RotateRight(uint32_t x) {
+  static_assert(SHIFT > 0, "Only positive shifts are defined behavior and useful");
+  static_assert(
+      SHIFT < std::numeric_limits<decltype(x)>::digits, "This much shift is just 0");
+  return (x << (std::numeric_limits<decltype(x)>::digits - SHIFT)) | (x >> SHIFT);
 }
 
 // Make a random uint32_t, avoiding the absent high bit and the low-entropy low bits
@@ -60,61 +91,178 @@ uint32_t MakeRandU32() {
 
 // Almost universal hashing, M. Dietzfelbinger, T. Hagerup, J. Katajainen, and M.
 // Penttonen, "A reliable randomized algorithm for the closest-pair problem".
-uint32_t MultiplyShift(uint32_t x) {
+inline void MultiplyShift(uint32_t* x) {
   static const uint32_t m = 0x61eaf8e9u;
-  return x*m;
+  *x = *x * m;
+}
+
+// Like MultiplyShift, but using SSE's 128-bit SIMD registers to do 4 at once.
+//
+// TODO: Add the Intel intrinsics used in this function and the other functions in this
+// file to sse-util.h so that these functions can be inlined.
+inline void MultiplyShift128(__m128i* x) __attribute__((__target__("sse4.1")));
+inline void MultiplyShift128(__m128i* x) {
+  const __m128i m = _mm_set1_epi32(0x61eaf8e9);
+  _mm_storeu_si128(x, _mm_mullo_epi32(_mm_loadu_si128(x), m));
+}
+
+// Like MultiplyShift, but using AVX2's 256-bit SIMD registers to do 8 at once.
+//
+// Not inline, because it degrades the performance for unknown reasons.
+void MultiplyShift256(__m256i* x) __attribute__((__target__("avx2")));
+void MultiplyShift256(__m256i* x) {
+  const __m256i m = _mm256_set1_epi32(0x61eaf8e9);
+  _mm256_storeu_si256(x, _mm256_mullo_epi32(_mm256_loadu_si256(x), m));
 }
 
 // 2-independent hashing. M. Dietzfelbinger, "Universal hashing and k-wise independent
 // random variables via integer arithmetic without primes"
-uint32_t MultiplyAddShift(uint32_t x) {
+inline void MultiplyAddShift(uint32_t* x) {
   static const uint64_t m = 0xa1f1bd3e020b4be0ull, a = 0x86b0426193d86e66ull;
-  return (static_cast<uint64_t>(x) * m + a) >> 32;
+  *x = (static_cast<uint64_t>(*x) * m + a) >> 32;
+}
+
+// Like MultiplyAddShift, but using SSE's 128-bit SIMD registers to do 4 at once.
+inline void MultiplyAddShift128(__m128i* x) __attribute__((__target__("sse4.1")));
+inline void MultiplyAddShift128(__m128i* x) {
+  const auto m = _mm_set1_epi64x(0xa1f1bd3e020b4be0ull),
+      mhi = _mm_set1_epi32(0xa1f1bd3e),
+      a = _mm_set1_epi64x(0x86b0426193d86e66ull);
+  auto input = _mm_loadu_si128(x);
+  auto prod32easy = _mm_mullo_epi32(input, mhi);
+  auto input_odds = _mm_srli_epi64(input, 32);
+  auto prod64_evens = _mm_mul_epu32(input, m),
+      prod64_odds = _mm_mul_epu32(input_odds, m);
+  prod64_evens = _mm_add_epi64(a, prod64_evens);
+  prod64_odds = _mm_add_epi64(a, prod64_odds);
+  auto prod32hard = _mm_unpackhi_epi32(prod64_evens, prod64_odds);
+  _mm_storeu_si128(x, _mm_add_epi32(prod32easy, prod32hard));
+}
+
+// Like MultiplyAddShift, but using AVX2's 256-bit SIMD registers to do 8 at once.
+inline void MultiplyAddShift256(__m256i* x) __attribute__((__target__("avx2")));
+inline void MultiplyAddShift256(__m256i* x) {
+  const __m256i m = _mm256_set1_epi64x(0xa1f1bd3e020b4be0ull),
+      mhi = _mm256_set1_epi32(0xa1f1bd3e),
+      a = _mm256_set1_epi64x(0x86b0426193d86e66ull);
+  __m256i input = _mm256_loadu_si256(x);
+  __m256i prod32easy = _mm256_mullo_epi32(input, mhi);
+  __m256i input_odds = _mm256_srli_epi64(input, 32);
+  __m256i prod64_evens = _mm256_mul_epu32(input, m),
+      prod64_odds = _mm256_mul_epu32(input_odds, m);
+  prod64_evens = _mm256_add_epi64(a, prod64_evens);
+  prod64_odds = _mm256_add_epi64(a, prod64_odds);
+  __m256i prod32hard = _mm256_unpackhi_epi32(prod64_evens, prod64_odds);
+  _mm256_storeu_si256(x, _mm256_add_epi32(prod32easy, prod32hard));
 }
 
 // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-int32_t Jenkins1(int32_t x) {
-  x = ~x + (x << 15); // x = (x << 15) - x - 1;
-  x = x ^ RotateRight(x, 12);
-  x = x + (x << 2);
-  x = x ^ RotateRight(x, 4);
-  x = x * 2057; // x = (x + (x << 3)) + (x << 11);
-  x = x ^ RotateRight(x, 16);
-  return x;
+inline void Jenkins1(int32_t* x) {
+  *x = ~*x + (*x << 15); // x = (x << 15) - x - 1;
+  *x = *x ^ RotateRight<12>(*x);
+  *x = *x + (*x << 2);
+  *x = *x ^ RotateRight<4>(*x);
+  *x = *x * 2057; // x = (x + (x << 3)) + (x << 11);
+  *x = *x ^ RotateRight<16>(*x);
 }
 
 // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-uint32_t Jenkins2(uint32_t a) {
-  a = (a + 0x7ed55d16) + (a << 12);
-  a = (a ^ 0xc761c23c) ^ (a >> 19);
-  a = (a + 0x165667b1) + (a << 5);
-  a = (a + 0xd3a2646c) ^ (a << 9);
-  a = (a + 0xfd7046c5) + (a << 3);
-  a = (a ^ 0xb55a4f09) ^ (a >> 16);
-  return a;
+inline void Jenkins2(uint32_t* a) {
+  *a = (*a + 0x7ed55d16) + (*a << 12);
+  *a = (*a ^ 0xc761c23c) ^ (*a >> 19);
+  *a = (*a + 0x165667b1) + (*a << 5);
+  *a = (*a + 0xd3a2646c) ^ (*a << 9);
+  *a = (*a + 0xfd7046c5) + (*a << 3);
+  *a = (*a ^ 0xb55a4f09) ^ (*a >> 16);
 }
 
 // From http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm:
-int32_t MultRot(int32_t key) {
+inline void MultRot(int32_t* key) {
   static const int32_t c2 = 0x27d4eb2d; // a prime or an odd constant
-  key = (key ^ 61) ^ RotateRight(key, 16);
-  key = key + (key << 3);
-  key = key ^ RotateRight(key, 4);
-  key = key * c2;
-  key = key ^ RotateRight(key, 15);
-  return key;
+  *key = (*key ^ 61) ^ RotateRight<16>(*key);
+  *key = *key + (*key << 3);
+  *key = *key ^ RotateRight<4>(*key);
+  *key = *key * c2;
+  *key = *key ^ RotateRight<15>(*key);
+}
+
+inline void CRC(uint32_t* x) {
+  *x = SSE4_crc32_u32(*x, 0xab8ce2abu);
+}
+
+inline void FNV(uint32_t* key) {
+  *key = HashUtil::FnvHash64to32(key, sizeof(*key), HashUtil::FNV_SEED);
+}
+
+// Zobrist hashing, also known as tabulation hashing or simple tabulation hashing, is an
+// old technique that has been recently analyzed and found to be very good for a number of
+// applications. See "The Power of Simple Tabulation Hashing", by Mihai Patrascu and
+// Mikkel Thorup.
+
+uint32_t ZOBRIST_DATA[4][256];
+
+inline void Zobrist(uint32_t* key) {
+  const uint8_t* key_chars = reinterpret_cast<const uint8_t*>(key);
+  *key = ZOBRIST_DATA[0][key_chars[0]] ^ ZOBRIST_DATA[1][key_chars[1]]
+      ^ ZOBRIST_DATA[2][key_chars[2]] ^ ZOBRIST_DATA[3][key_chars[3]];
+}
+
+// Like Zobrist, but uses AVX2's "gather" primitives to hash 8 values at once.
+inline void Zobrist256gather(__m256i* key) __attribute__((__target__("avx2")));
+inline void Zobrist256gather(__m256i* key) {
+  const auto k = _mm256_loadu_si256(key);
+  const auto low_mask = _mm256_set1_epi32(0xff);
+  auto k0 = _mm256_and_si256(low_mask, k),
+      k1 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 8)),
+      k2 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 16)),
+      k3 = _mm256_and_si256(low_mask, _mm256_srli_epi32(k, 24));
+  k0 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[0]), k0, 1);
+  k1 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[1]), k1, 1);
+  k2 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[2]), k2, 1);
+  k3 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(ZOBRIST_DATA[3]), k3, 1);
+  auto k01 = _mm256_xor_si256(k0, k1), k23 = _mm256_xor_si256(k2, k3);
+  _mm256_storeu_si256(key, _mm256_xor_si256(k01, k23));
 }
 
-uint32_t CRC(uint32_t x) {
-  return SSE4_crc32_u32(x,0xab8ce2abu);
+// Like Zobrist256gather, but only uses AVX2's SIMD xor, not its gather.
+inline void Zobrist256simple(uint32_t (*key)[8]) __attribute__((__target__("avx2")));
+inline void Zobrist256simple(uint32_t (*key)[8]) {
+  uint32_t row[4][8];
+  const uint8_t (*key_chars)[8][4] = reinterpret_cast<const uint8_t (*)[8][4]>(key);
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      row[i][j] = ZOBRIST_DATA[i][(*key_chars)[j][i]];
+    }
+  }
+  auto result0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[0])),
+      result1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[1])),
+      result2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[2])),
+      result3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row[3]));
+  auto k01 = _mm256_xor_si256(result0, result1),
+      k23 = _mm256_xor_si256(result2, result3);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(*key), _mm256_xor_si256(k01, k23));
+}
+
+// Perform one hash function the given number of times. This can sometimes
+// auto-vectorize.
+//
+// TODO: We could also test the costs of running on non-contiguous uint32_t's. For
+// instance, ExprValuesCache.expr_values_array_ might have multiple values to hash per
+// input row.
+template <void (*F)(uint32_t*), size_t N>
+inline void Multiple(uint32_t (*x)[N]) {
+  for (int i = 0; i < N; ++i) {
+    F((*x) + i);
+  }
 }
 
-template<typename T, T (*HASH)(T)>
+// The size of the test data we run each hash function on:
+static const size_t DATA_SIZE = 1 << 15;
+
+template<typename T, void (*HASH)(T*)>
 void Run(int batch_size, void* data) {
-  vector<T>* d = reinterpret_cast<vector<T>*>(data);
+  T* d = reinterpret_cast<T*>(data);
   for (int i = 0; i < batch_size; ++i) {
-    for (int j = 0; j < d->size(); ++j) {
-      (*d)[j] = HASH((*d)[j]);
+    for (int j = 0; j < ((sizeof(uint32_t)) * DATA_SIZE) / sizeof(T); ++j) {
+      HASH(&d[j]);
     }
   }
 }
@@ -124,21 +272,61 @@ int main() {
 
   cout << endl << Benchmark::GetMachineInfo() << endl;
 
-  vector<uint32_t> ud(1 << 15);
-  for (size_t i = 0; i < ud.size(); ++i) {
+  unique_ptr<uint32_t[]> ud(new uint32_t[DATA_SIZE]);
+  for (size_t i = 0; i < (DATA_SIZE); ++i) {
     ud[i] = MakeRandU32();
   }
 
-  Benchmark suite("32 -> 32");
+  for (size_t i = 0; i < 4; ++i) {
+    for (size_t j = 0; j < 256; ++j) {
+      ZOBRIST_DATA[i][j] = MakeRandU32();
+    }
+  }
+
+  Benchmark suite32("32 -> 32");
 
-#define BENCH(T,x) suite.AddBenchmark(#x, Run<T, x>, &ud)
-  BENCH(uint32_t, Jenkins2);
-  BENCH(int32_t, Jenkins1);
-  BENCH(int32_t, MultRot);
-  BENCH(uint32_t, MultiplyAddShift);
-  BENCH(uint32_t, MultiplyShift);
-  BENCH(uint32_t, CRC);
-#undef BENCH
+#define BENCH(T,x) AddBenchmark(#x, Run<T, x>, ud.get())
+
+  suite32.BENCH(uint32_t, FNV);
+  suite32.BENCH(uint32_t, Zobrist);
+  suite32.BENCH(int32_t, MultRot);
+  suite32.BENCH(uint32_t, MultiplyAddShift);
+  suite32.BENCH(int32_t, Jenkins1);
+  suite32.BENCH(uint32_t, Jenkins2);
+  if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) suite32.BENCH(uint32_t, CRC);
+  suite32.BENCH(uint32_t, MultiplyShift);
 
-  cout << suite.Measure() << endl;
+  cout << suite32.Measure() << endl;
+
+  Benchmark suite32x4("32 x 4 -> 32 x 4");
+
+  suite32x4.BENCH(uint32_t[4], (Multiple<Zobrist, 4>));
+  suite32x4.BENCH(uint32_t[4], (Multiple<MultiplyAddShift, 4>));
+  if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
+    suite32x4.BENCH(uint32_t[4], (Multiple<CRC, 4>));
+  }
+  suite32x4.BENCH(uint32_t[4], (Multiple<MultiplyShift, 4>));
+  if (CpuInfo::IsSupported(CpuInfo::SSE4_1)) {
+    suite32x4.BENCH(__m128i, MultiplyAddShift128);
+    suite32x4.BENCH(__m128i, MultiplyShift128);
+  }
+
+  cout << suite32x4.Measure() << endl;
+
+  Benchmark suite32x8("32 x 8 -> 32 x 8");
+
+  suite32x8.BENCH(uint32_t[8], (Multiple<Zobrist, 8>));
+  suite32x8.BENCH(uint32_t[8], (Multiple<MultiplyAddShift, 8>));
+  suite32x8.BENCH(uint32_t[8], (Multiple<CRC, 8>));
+  suite32x8.BENCH(uint32_t[8], (Multiple<MultiplyShift, 8>));
+  if (CpuInfo::IsSupported(CpuInfo::AVX2)) {
+    suite32x8.BENCH(uint32_t[8], Zobrist256simple);
+    suite32x8.BENCH(__m256i, Zobrist256gather);
+    suite32x8.BENCH(__m256i, MultiplyAddShift256);
+    suite32x8.BENCH(__m256i, MultiplyShift256);
+  }
+
+  cout << suite32x8.Measure() << endl;
+
+#undef BENCH
 }
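A note for readers of the SIMD MultiplyAddShift variants in the diff above:
SSE4.1 and AVX2 have no 32x64-bit multiply, so each lane's
(uint64_t(x) * m + a) >> 32 is split into a cheap high-half product and a full
low-half product. A scalar sketch of the per-lane decomposition (illustrative
only, not part of the patch):

    #include <cstdint>

    // Write m = (mhi << 32) | mlo. Then, modulo 2^32,
    //   ((uint64_t)x * m + a) >> 32  ==  x * mhi + (((uint64_t)x * mlo + a) >> 32).
    // prod32easy is the x * mhi term (_mm_mullo_epi32); prod32hard is the high
    // half of x * mlo + a (_mm_mul_epu32, _mm_add_epi64, then the unpack).
    uint32_t MultiplyAddShiftScalar(uint32_t x) {
      const uint64_t m = 0xa1f1bd3e020b4be0ull, a = 0x86b0426193d86e66ull;
      const uint32_t mhi = m >> 32, mlo = static_cast<uint32_t>(m);
      const uint32_t easy = x * mhi;                                     // prod32easy
      const uint32_t hard = (static_cast<uint64_t>(x) * mlo + a) >> 32;  // prod32hard
      return easy + hard;  // == (uint32_t)((uint64_t(x) * m + a) >> 32)
    }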

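One subtlety in the new Run template: DATA_SIZE counts uint32_t elements, so
the inner loop bound (sizeof(uint32_t) * DATA_SIZE) / sizeof(T) converts the
buffer's byte size into a count of T-sized chunks, and every suite therefore
hashes the same 128 KiB per batch iteration, whatever the lane width. A worked
check of that arithmetic (illustrative only):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    static const size_t DATA_SIZE = 1 << 15;  // 32768 uint32_t's = 131072 bytes

    int main() {
      // T = uint32_t:    131072 / 4  = 32768 single hashes per iteration
      // T = uint32_t[4]: 131072 / 16 =  8192 chunks of 4 hashes each
      // T = uint32_t[8]: 131072 / 32 =  4096 chunks of 8 hashes each
      printf("%zu %zu %zu\n", sizeof(uint32_t) * DATA_SIZE / sizeof(uint32_t),
          sizeof(uint32_t) * DATA_SIZE / sizeof(uint32_t[4]),
          sizeof(uint32_t) * DATA_SIZE / sizeof(uint32_t[8]));
      // Prints: 32768 8192 4096
      return 0;
    }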