Use AVX2 operations to speedup Bloom filters by 10-100%. As a reminder, our Bloom filters use so-called "block" Bloom filters, in which each Bloom filter is actually a set of tiny Bloom fitlers, each the size of a cache line.
The big idea is to make the tiny Bloom filters that make up a large Bloom filter the size of AVX2 registers (256 bits) rather than cache lines (512 bits). This enables a number of SIMD optimizations, and the resulting AVX2 code does not need loops or conditionals. Impala supports machines that do not have AVX2 instructions, so the fast path is only conditionally enabled. Checking whether AVX2 instructions are available does not seem to hurt operation speed, perhaps because the branch becomes very easy to predict. Change-Id: I6fef4f6652876f8fd7e3f0e41431702380418c98 Reviewed-on: http://gerrit.cloudera.org:8080/3338 Reviewed-by: Jim Apple <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/38b18ea1 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/38b18ea1 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/38b18ea1 Branch: refs/heads/master Commit: 38b18ea1c599cf8bfdad5c072beefad869088b45 Parents: fc3ff1c Author: Jim Apple <[email protected]> Authored: Thu Jun 2 08:55:32 2016 -0700 Committer: Tim Armstrong <[email protected]> Committed: Tue Jun 21 17:38:12 2016 -0700 ---------------------------------------------------------------------- be/src/benchmarks/bloom-filter-benchmark.cc | 138 ++++++++++++----- be/src/util/bloom-filter-test.cc | 90 +++++++---- be/src/util/bloom-filter.cc | 20 +-- be/src/util/bloom-filter.h | 183 +++++++++++++++++------ be/src/util/cpu-info.cc | 1 + be/src/util/cpu-info.h | 30 ++++ 6 files changed, 339 insertions(+), 123 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/38b18ea1/be/src/benchmarks/bloom-filter-benchmark.cc ---------------------------------------------------------------------- diff --git a/be/src/benchmarks/bloom-filter-benchmark.cc b/be/src/benchmarks/bloom-filter-benchmark.cc index 7374ed4..35d48fc 100644 --- a/be/src/benchmarks/bloom-filter-benchmark.cc +++ b/be/src/benchmarks/bloom-filter-benchmark.cc @@ -38,51 +38,103 @@ using namespace impala; // As in bloom-filter.h, ndv refers to the number of unique items inserted into a filter // and fpp is the probability of false positives. // +// // Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz +// +// With AVX2: +// // initialize: Function Rate (iters/ms) Comparison // ---------------------------------------------------------------------- -// ndv 10k fpp 10.0% 6628 1X -// ndv 10k fpp 1.0% 3655 0.5515X -// ndv 10k fpp 0.1% 1293 0.195X -// ndv 1000k fpp 10.0% 28.92 0.004363X -// ndv 1000k fpp 1.0% 14.5 0.002188X -// ndv 1000k fpp 0.1% 14.51 0.00219X -// ndv 100000k fpp 10.0% 0.05863 8.846e-06X -// ndv 100000k fpp 1.0% 0.05776 8.714e-06X -// ndv 100000k fpp 0.1% 0.02849 4.298e-06X +// ndv 10k fpp 10.0% 6607 1X +// ndv 10k fpp 1.0% 3427 0.5187X +// ndv 10k fpp 0.1% 1203 0.182X +// ndv 1000k fpp 10.0% 5.273 0.0007982X +// ndv 1000k fpp 1.0% 3.297 0.000499X +// ndv 1000k fpp 0.1% 3.31 0.000501X +// ndv 100000k fpp 10.0% 0.08597 1.301e-05X +// ndv 100000k fpp 1.0% 0.0846 1.28e-05X +// ndv 100000k fpp 0.1% 0.04349 6.582e-06X // // insert: Function Rate (iters/ms) Comparison // ---------------------------------------------------------------------- -// ndv 10k fpp 10.0% 1.109e+05 1X -// ndv 10k fpp 1.0% 1.083e+05 0.9771X -// ndv 10k fpp 0.1% 1.088e+05 0.9808X -// ndv 1000k fpp 10.0% 8.975e+04 0.8094X -// ndv 1000k fpp 1.0% 8.8e+04 0.7937X -// ndv 1000k fpp 0.1% 9.353e+04 0.8435X -// ndv 100000k fpp 10.0% 2.322e+04 0.2094X -// ndv 100000k fpp 1.0% 2.314e+04 0.2087X -// ndv 100000k fpp 0.1% 1.953e+04 0.1762X +// ndv 10k fpp 10.0% 1.858e+05 1X +// ndv 10k fpp 1.0% 1.801e+05 0.9693X +// ndv 10k fpp 0.1% 1.869e+05 1.006X +// ndv 1000k fpp 10.0% 1.686e+05 0.9076X +// ndv 1000k fpp 1.0% 1.627e+05 0.8756X +// ndv 1000k fpp 0.1% 1.53e+05 0.8234X +// ndv 100000k fpp 10.0% 4.262e+04 0.2294X +// ndv 100000k fpp 1.0% 4.326e+04 0.2329X +// ndv 100000k fpp 0.1% 4.185e+04 0.2253X // // find: Function Rate (iters/ms) Comparison // ---------------------------------------------------------------------- -// present ndv 10k fpp 10.0% 2.044e+05 1X -// absent ndv 10k fpp 10.0% 1.019e+05 0.4984X -// present ndv 10k fpp 1.0% 2.039e+05 0.9976X -// absent ndv 10k fpp 1.0% 1.234e+05 0.6037X -// present ndv 10k fpp 0.1% 1.928e+05 0.9431X -// absent ndv 10k fpp 0.1% 1.998e+05 0.9774X -// present ndv 1000k fpp 10.0% 1.367e+05 0.6686X -// absent ndv 1000k fpp 10.0% 7.115e+04 0.348X -// present ndv 1000k fpp 1.0% 1.164e+05 0.5694X -// absent ndv 1000k fpp 1.0% 9.859e+04 0.4822X -// present ndv 1000k fpp 0.1% 1.153e+05 0.5638X -// absent ndv 1000k fpp 0.1% 9.787e+04 0.4787X -// present ndv 100000k fpp 10.0% 2.869e+04 0.1403X -// absent ndv 100000k fpp 10.0% 3.222e+04 0.1576X -// present ndv 100000k fpp 1.0% 2.868e+04 0.1403X -// absent ndv 100000k fpp 1.0% 3.212e+04 0.1571X -// present ndv 100000k fpp 0.1% 2.793e+04 0.1366X -// absent ndv 100000k fpp 0.1% 3.948e+04 0.1931X +// present ndv 10k fpp 10.0% 2.277e+05 1X +// absent ndv 10k fpp 10.0% 2.258e+05 0.9914X +// present ndv 10k fpp 1.0% 2.277e+05 1X +// absent ndv 10k fpp 1.0% 2.295e+05 1.008X +// present ndv 10k fpp 0.1% 2.258e+05 0.9916X +// absent ndv 10k fpp 0.1% 2.283e+05 1.003X +// present ndv 1000k fpp 10.0% 1.799e+05 0.7901X +// absent ndv 1000k fpp 10.0% 1.777e+05 0.7803X +// present ndv 1000k fpp 1.0% 1.52e+05 0.6674X +// absent ndv 1000k fpp 1.0% 1.625e+05 0.7134X +// present ndv 1000k fpp 0.1% 1.825e+05 0.8013X +// absent ndv 1000k fpp 0.1% 1.836e+05 0.806X +// present ndv 100000k fpp 10.0% 4.125e+04 0.1811X +// absent ndv 100000k fpp 10.0% 4.147e+04 0.1821X +// present ndv 100000k fpp 1.0% 4.203e+04 0.1845X +// absent ndv 100000k fpp 1.0% 4.189e+04 0.1839X +// present ndv 100000k fpp 0.1% 3.506e+04 0.1539X +// absent ndv 100000k fpp 0.1% 3.507e+04 0.154X +// +// +// Without AVX2: +// +// initialize: Function Rate (iters/ms) Comparison +// ---------------------------------------------------------------------- +// ndv 10k fpp 10.0% 6453 1X +// ndv 10k fpp 1.0% 3271 0.5068X +// ndv 10k fpp 0.1% 1280 0.1984X +// ndv 1000k fpp 10.0% 5.213 0.0008078X +// ndv 1000k fpp 1.0% 2.574 0.0003989X +// ndv 1000k fpp 0.1% 2.584 0.0004005X +// ndv 100000k fpp 10.0% 0.03276 5.076e-06X +// ndv 100000k fpp 1.0% 0.03224 4.996e-06X +// ndv 100000k fpp 0.1% 0.0161 2.494e-06X +// +// insert: Function Rate (iters/ms) Comparison +// ---------------------------------------------------------------------- +// ndv 10k fpp 10.0% 1.128e+05 1X +// ndv 10k fpp 1.0% 1.162e+05 1.03X +// ndv 10k fpp 0.1% 1.145e+05 1.015X +// ndv 1000k fpp 10.0% 1.086e+05 0.9626X +// ndv 1000k fpp 1.0% 8.377e+04 0.7427X +// ndv 1000k fpp 0.1% 8.902e+04 0.7892X +// ndv 100000k fpp 10.0% 2.548e+04 0.2259X +// ndv 100000k fpp 1.0% 2.37e+04 0.2101X +// ndv 100000k fpp 0.1% 2.256e+04 0.2X +// +// find: Function Rate (iters/ms) Comparison +// ---------------------------------------------------------------------- +// present ndv 10k fpp 10.0% 1.676e+05 1X +// absent ndv 10k fpp 10.0% 1.067e+05 0.6366X +// present ndv 10k fpp 1.0% 1.683e+05 1.004X +// absent ndv 10k fpp 1.0% 1.291e+05 0.7705X +// present ndv 10k fpp 0.1% 1.662e+05 0.9917X +// absent ndv 10k fpp 0.1% 2.238e+05 1.336X +// present ndv 1000k fpp 10.0% 1.231e+05 0.7344X +// absent ndv 1000k fpp 10.0% 6.903e+04 0.4119X +// present ndv 1000k fpp 1.0% 1.215e+05 0.725X +// absent ndv 1000k fpp 1.0% 1.124e+05 0.6707X +// present ndv 1000k fpp 0.1% 1.095e+05 0.6532X +// absent ndv 1000k fpp 0.1% 1.034e+05 0.6171X +// present ndv 100000k fpp 10.0% 2.733e+04 0.1631X +// absent ndv 100000k fpp 10.0% 3.447e+04 0.2057X +// present ndv 100000k fpp 1.0% 2.779e+04 0.1658X +// absent ndv 100000k fpp 1.0% 3.36e+04 0.2005X +// present ndv 100000k fpp 0.1% 2.725e+04 0.1626X +// absent ndv 100000k fpp 0.1% 4.342e+04 0.2591X // Make a random uint32_t, avoiding the absent high bit and the low-entropy low bits // produced by rand(). @@ -174,9 +226,7 @@ void Absent(int batch_size, void* data) { } // namespace find -int main(int argc, char **argv) { - CpuInfo::Init(); - cout << endl << Benchmark::GetMachineInfo() << endl; +void RunBenchmarks() { char name[120]; @@ -220,6 +270,14 @@ int main(int argc, char **argv) { } cout << suite.Measure() << endl; } +} - return 0; +int main(int argc, char **argv) { + CpuInfo::Init(); + cout << endl << Benchmark::GetMachineInfo() << endl << endl + << "With AVX2:" << endl << endl; + RunBenchmarks(); + cout << endl << "Without AVX2:" << endl << endl; + CpuInfo::TempDisable t(CpuInfo::AVX2); + RunBenchmarks(); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/38b18ea1/be/src/util/bloom-filter-test.cc ---------------------------------------------------------------------- diff --git a/be/src/util/bloom-filter-test.cc b/be/src/util/bloom-filter-test.cc index 70de938..2f2f8da 100644 --- a/be/src/util/bloom-filter-test.cc +++ b/be/src/util/bloom-filter-test.cc @@ -15,15 +15,20 @@ #include "util/bloom-filter.h" #include <algorithm> -#include <set> +#include <unordered_set> #include <vector> #include <gtest/gtest.h> +#include "util/cpu-info.h" + using namespace std; namespace { -// Make a random uint32_t, avoiding the absent high bit and the low-entropy low bits + +using namespace impala; + +// Make a random uint64_t, avoiding the absent high bit and the low-entropy low bits // produced by rand(). uint64_t MakeRand() { uint32_t result = (rand() >> 8) & 0xffff; @@ -32,7 +37,30 @@ uint64_t MakeRand() { return result; } +// BfInsert() and BfFind() are like BloomFilter::{Insert,Find}, except they randomly +// disable AVX2 instructions half of the time. These are used for testing that AVX2 +// machines and non-AVX2 machines produce compatible BloomFilters. + +void BfInsert(BloomFilter& bf, uint32_t h) { + if (MakeRand() & 0x1) { + bf.Insert(h); + } else { + CpuInfo::TempDisable t1(CpuInfo::AVX2); + bf.Insert(h); + } +} + +bool BfFind(BloomFilter& bf, uint32_t h) { + if (MakeRand() & 0x1) { + return bf.Find(h); + } else { + CpuInfo::TempDisable t1(CpuInfo::AVX2); + return bf.Find(h); + } +} + } // namespace + namespace impala { // We can construct (and destruct) Bloom filters with different spaces. @@ -48,7 +76,7 @@ TEST(BloomFilter, Insert) { for (int i = 13; i < 17; ++i) { BloomFilter bf(i); for (int k = 0; k < (1 << 15); ++k) { - bf.Insert(MakeRand()); + BfInsert(bf, MakeRand()); } } } @@ -60,8 +88,8 @@ TEST(BloomFilter, Find) { BloomFilter bf(i); for (int k = 0; k < (1 << 15); ++k) { const uint64_t to_insert = MakeRand(); - bf.Insert(to_insert); - EXPECT_TRUE(bf.Find(to_insert)); + BfInsert(bf, to_insert); + EXPECT_TRUE(BfFind(bf, to_insert)); } } } @@ -75,9 +103,9 @@ TEST(BloomFilter, CumulativeFind) { for (int k = 0; k < (1 << 10); ++k) { const uint32_t to_insert = MakeRand(); inserted.push_back(to_insert); - bf.Insert(to_insert); + BfInsert(bf, to_insert); for (int n = 0; n < inserted.size(); ++n) { - EXPECT_TRUE(bf.Find(inserted[n])); + EXPECT_TRUE(BfFind(bf, inserted[n])); } } } @@ -88,17 +116,19 @@ TEST(BloomFilter, CumulativeFind) { TEST(BloomFilter, FindInvalid) { srand(0); static const int find_limit = 1 << 20; - set<uint32_t> to_find; + unordered_set<uint32_t> to_find; while (to_find.size() < find_limit) { to_find.insert(MakeRand()); } static const int max_log_ndv = 19; - set<uint32_t> to_insert; + unordered_set<uint32_t> to_insert; while (to_insert.size() < (1ull << max_log_ndv)) { - to_insert.insert(MakeRand()); + const auto candidate = MakeRand(); + if (to_find.find(candidate) == to_find.end()) { + to_insert.insert(candidate); + } } vector<uint32_t> shuffled_insert(to_insert.begin(), to_insert.end()); - random_shuffle(shuffled_insert.begin(), shuffled_insert.end()); for (int log_ndv = 12; log_ndv < max_log_ndv; ++log_ndv) { for (int log_fpp = 4; log_fpp < 15; ++log_fpp) { double fpp = 1.0 / (1 << log_fpp); @@ -107,20 +137,22 @@ TEST(BloomFilter, FindInvalid) { BloomFilter bf(log_heap_space); // Fill up a BF with exactly as much ndv as we planned for it: for (size_t i = 0; i < ndv; ++i) { - bf.Insert(shuffled_insert[i]); + BfInsert(bf, shuffled_insert[i]); } int found = 0; // Now we sample from the set of possible hashes, looking for hits. for (const auto& i : to_find) { - found += bf.Find(i); + found += BfFind(bf, i); } - EXPECT_LE(found, 3 * find_limit * fpp) + EXPECT_LE(found, find_limit * fpp * 2) << "Too many false positives with -log2(fpp) = " << log_fpp; // Because the space is rounded up to a power of 2, we might actually get a lower // fpp than the one passed to MinLogSpace(). const double expected_fpp = BloomFilter::FalsePositiveProb(ndv, log_heap_space); - EXPECT_GE(found, 0.33 * find_limit * expected_fpp) + EXPECT_GE(found, find_limit * expected_fpp) << "Too few false positives with -log2(fpp) = " << log_fpp; + EXPECT_LE(found, find_limit * expected_fpp * 8) + << "Too many false positives with -log2(fpp) = " << log_fpp; } } } @@ -193,11 +225,11 @@ TEST(BloomFilter, MinSpaceForFpp) { TEST(BloomFilter, Thrift) { BloomFilter bf(BloomFilter::MinLogSpace(100, 0.01)); - for (int i = 0; i < 10; ++i) bf.Insert(i); + for (int i = 0; i < 10; ++i) BfInsert(bf, i); // Check no unexpected new false positives. - set<int> missing_ints; + unordered_set<int> missing_ints; for (int i = 11; i < 100; ++i) { - if (!bf.Find(i)) missing_ints.insert(i); + if (!BfFind(bf, i)) missing_ints.insert(i); } TBloomFilter to_thrift; @@ -205,8 +237,8 @@ TEST(BloomFilter, Thrift) { EXPECT_EQ(to_thrift.always_true, false); BloomFilter from_thrift(to_thrift); - for (int i = 0; i < 10; ++i) ASSERT_TRUE(from_thrift.Find(i)); - for (int missing: missing_ints) ASSERT_FALSE(from_thrift.Find(missing)); + for (int i = 0; i < 10; ++i) ASSERT_TRUE(BfFind(from_thrift, i)); + for (int missing: missing_ints) ASSERT_FALSE(BfFind(from_thrift, missing)); BloomFilter::ToThrift(NULL, &to_thrift); EXPECT_EQ(to_thrift.always_true, true); @@ -215,23 +247,25 @@ TEST(BloomFilter, Thrift) { TEST(BloomFilter, Or) { BloomFilter bf1(BloomFilter::MinLogSpace(100, 0.01)); BloomFilter bf2(BloomFilter::MinLogSpace(100, 0.01)); - for (int i = 60; i < 80; ++i) bf2.Insert(i); + for (int i = 60; i < 80; ++i) BfInsert(bf2, i); - for (int i = 0; i < 10; ++i) bf1.Insert(i); + for (int i = 0; i < 10; ++i) BfInsert(bf1, i); bf2.Or(bf1); - for (int i = 0; i < 10; ++i) ASSERT_TRUE(bf2.Find(i)); - for (int i = 60; i < 80; ++i) ASSERT_TRUE(bf2.Find(i)); + for (int i = 0; i < 10; ++i) ASSERT_TRUE(BfFind(bf2, i)); + for (int i = 60; i < 80; ++i) ASSERT_TRUE(BfFind(bf2, i)); - for (int i = 11; i < 50; ++i) bf1.Insert(i); + for (int i = 11; i < 50; ++i) BfInsert(bf1, i); bf2.Or(bf1); - for (int i = 11; i < 50; ++i) ASSERT_TRUE(bf2.Find(i)); - for (int i = 60; i < 80; ++i) ASSERT_TRUE(bf2.Find(i)); - ASSERT_FALSE(bf2.Find(81)); + for (int i = 11; i < 50; ++i) ASSERT_TRUE(BfFind(bf2, i)); + for (int i = 60; i < 80; ++i) ASSERT_TRUE(BfFind(bf2, i)); + ASSERT_FALSE(BfFind(bf2, 81)); } } // namespace impala int main(int argc, char** argv) { + using namespace impala; + CpuInfo::Init(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/38b18ea1/be/src/util/bloom-filter.cc ---------------------------------------------------------------------- diff --git a/be/src/util/bloom-filter.cc b/be/src/util/bloom-filter.cc index fc26e32..2d36be5 100644 --- a/be/src/util/bloom-filter.cc +++ b/be/src/util/bloom-filter.cc @@ -26,21 +26,22 @@ using namespace std; namespace impala { -BloomFilter* BloomFilter::ALWAYS_TRUE_FILTER = NULL; +BloomFilter* const BloomFilter::ALWAYS_TRUE_FILTER = NULL; + +constexpr uint32_t BloomFilter::REHASH[8] __attribute__((aligned(32))); BloomFilter::BloomFilter(const int log_heap_space) - : // Since log_heap_space is in bytes, we need to convert it to cache lines. There - // are 64 = 2^6 bytes in a cache line. - log_num_buckets_(std::max(1, log_heap_space - LOG_BUCKET_WORD_BITS)), - // Don't use log_num_buckets_ if it will lead to undefined behavior by a shift - // that is too large. - directory_mask_((1ull << std::min(63, log_num_buckets_)) - 1), - directory_(NULL) { + : // Since log_heap_space is in bytes, we need to convert it to the number of tiny Bloom + // filters we will use. + log_num_buckets_(std::max(1, log_heap_space - LOG_BUCKET_BYTE_SIZE)), + // Don't use log_num_buckets_ if it will lead to undefined behavior by a shift + // that is too large. + directory_mask_((1ull << std::min(63, log_num_buckets_)) - 1), + directory_(NULL) { // Since we use 32 bits in the arguments of Insert() and Find(), log_num_buckets_ // must be limited. DCHECK(log_num_buckets_ <= 32) << "Bloom filter too large. log_heap_space: " << log_heap_space; - // Each bucket has 64 = 2^6 bytes: const size_t alloc_size = directory_size(); const int malloc_failed = posix_memalign(reinterpret_cast<void**>(&directory_), 64, alloc_size); @@ -84,6 +85,7 @@ void BloomFilter::Or(const BloomFilter& other) { BucketWord* dir_ptr = reinterpret_cast<BucketWord*>(directory_); const BucketWord* other_dir_ptr = reinterpret_cast<const BucketWord*>(other.directory_); int directory_size_in_words = directory_size() / sizeof(BucketWord); + // TODO: use SIMD here: for (int i = 0; i < directory_size_in_words; ++i) dir_ptr[i] |= other_dir_ptr[i]; } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/38b18ea1/be/src/util/bloom-filter.h ---------------------------------------------------------------------- diff --git a/be/src/util/bloom-filter.h b/be/src/util/bloom-filter.h index 6fdd140..8a94ea8 100644 --- a/be/src/util/bloom-filter.h +++ b/be/src/util/bloom-filter.h @@ -20,6 +20,8 @@ #include <limits> +#include <immintrin.h> + #include "gutil/macros.h" #include "gen-cpp/ImpalaInternalService_types.h" @@ -40,21 +42,23 @@ namespace impala { /// 2. NDV: the number of unique items that have been inserted /// /// BloomFilter is implemented using block Bloom filters from Putze et al.'s "Cache-, -/// Hash- and Space-Efficient Bloom Filters". The basic idea is to hash the item to a -/// single cache line and then treat that cache line like a Bloom filter. This -/// implementation sets 8 bits in each cache-line-sized Bloom filter. This provides a -/// false positive rate near optimal for between 5 and 15 bits per distinct value, which -/// corresponds to false positive probabilities between 0.1% (for 15 bits) and 10% (for 5 -/// bits). +/// Hash- and Space-Efficient Bloom Filters". The basic idea is to hash the item to a tiny +/// Bloom filter the size of a single cache line or smaller. This implementation sets 8 +/// bits in each tiny Bloom filter. This provides a false positive rate near optimal for +/// between 5 and 15 bits per distinct value, which corresponds to false positive +/// probabilities between 0.1% (for 15 bits) and 10% (for 5 bits). +/// +/// Our tiny BloomFilters are 32 bytes to take advantage of 32-byte SIMD in newer Intel +/// machines. class BloomFilter { public: /// Consumes at most (1 << log_heap_space) bytes on the heap. - BloomFilter(const int log_heap_space); - BloomFilter(const TBloomFilter& thrift); + explicit BloomFilter(const int log_heap_space); + explicit BloomFilter(const TBloomFilter& thrift); ~BloomFilter(); /// Representation of a filter which allows all elements to pass. - static BloomFilter* ALWAYS_TRUE_FILTER; + static BloomFilter* const ALWAYS_TRUE_FILTER; /// Converts 'filter' to its corresponding Thrift representation. If the first argument /// is NULL, it is interpreted as a complete filter which contains all elements. @@ -97,32 +101,48 @@ class BloomFilter { } private: - /// log_directory_space_ is the log (base 2) of the number of buckets in the directory. + /// The BloomFilter is divided up into Buckets + static const uint64_t BUCKET_WORDS = 8; + typedef uint32_t BucketWord; + + // log2(number of bits in a BucketWord) + static const int LOG_BUCKET_WORD_BITS = 5; + static const BucketWord BUCKET_WORD_MASK = (1 << LOG_BUCKET_WORD_BITS) - 1; + + /// log2(number of bytes in a bucket) + static const int LOG_BUCKET_BYTE_SIZE = 5; + + static_assert((1 << LOG_BUCKET_WORD_BITS) == std::numeric_limits<BucketWord>::digits, + "BucketWord must have a bit-width that is be a power of 2, like 64 for uint64_t."); + + typedef BucketWord Bucket[BUCKET_WORDS]; + + /// log_num_buckets_ is the log (base 2) of the number of buckets in the directory. const int log_num_buckets_; /// directory_mask_ is (1 << log_num_buckets_) - 1. It is precomputed for /// efficiency reasons. const uint32_t directory_mask_; - /// The BloomFilter is divided up into Buckets, each of which is a cache line. - static const uint64_t BUCKET_WORDS = 8; - typedef uint64_t BucketWord; + Bucket* directory_; - // log2(number of bits in a BucketWord) - // TODO: Use Bits::Log2Ceiling64(numeric_limits<BucketWord>::digits) once we enable - // C++14 for codegen. - static const int LOG_BUCKET_WORD_BITS = 6; - static const BucketWord BUCKET_WORD_MASK = 63; // 2^LOG_BUCKET_WORD_BITS - 1 + /// Does the actual work of Insert(). bucket_idx is the index of the bucket to insert + /// into and 'hash' is the value passed to Insert(). + void BucketInsert(const uint32_t bucket_idx, const uint32_t hash); - /// log2(number of bytes in a bucket) - static const int LOG_BUCKET_BYTE_SIZE = 6; + /// A faster SIMD version of BucketInsert(). + void BucketInsertAVX2(const uint32_t bucket_idx, const uint32_t hash) + __attribute__((__target__("avx2"))); - // TODO: Re-enable static_asserts when C++14 is enabled. - //static_assert((1 << LOG_BUCKET_WORD_BITS) == std::numeric_limits<BucketWord>::digits, - // "BucketWord must have a bit-width that is be a power of 2, like 64 for uint64_t."); + /// BucketFind() and BucketFindAVX2() are just like BucketInsert() and + /// BucketInsertAVX2(), but for Find(). + bool BucketFind(const uint32_t bucket_idx, const uint32_t hash) const; + bool BucketFindAVX2(const uint32_t bucket_idx, const uint32_t hash) const + __attribute__((__target__("avx2"))); - typedef BucketWord Bucket[BUCKET_WORDS]; - Bucket* directory_; + /// A helper function for the AVX2 methods. Turns a 32-bit hash into a 256-bit Bucket + /// with 1 single 1-bit set in each 32-bit lane. + static __m256i MakeMask(const uint32_t hash) __attribute__((__target__("avx2"))); int64_t directory_size() const { return 1uLL << (log_num_buckets_ + LOG_BUCKET_BYTE_SIZE); @@ -131,44 +151,115 @@ class BloomFilter { /// Serializes this filter as Thrift. void ToThrift(TBloomFilter* thrift) const; + /// Some constants used in hashing. #defined for efficiency reasons. +#define IMPALA_BLOOM_HASH_CONSTANTS \ + 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU, 0x705495c7U, 0x2df1424bU, \ + 0x9efc4947U, 0x5c6bfb31U + + /// REHASH is used as 8 odd 32-bit unsigned ints. See Dietzfelbinger et al.'s "A + /// reliable randomized algorithm for the closest-pair problem". + static constexpr uint32_t REHASH[8] + __attribute__((aligned(32))) = {IMPALA_BLOOM_HASH_CONSTANTS}; + DISALLOW_COPY_AND_ASSIGN(BloomFilter); }; +// To set 8 bits in an 32-byte Bloom filter, we set one bit in each 32-bit uint32_t. This +// is a "split Bloom filter", and it has approximately the same false positive probability +// as standard a Bloom filter; See Mitzenmacher's "Bloom Filters and Such". It also has +// the advantage of requiring fewer random bits: log2(32) * 8 = 5 * 8 = 40 random bits for +// a split Bloom filter, but log2(256) * 8 = 64 random bits for a standard Bloom filter. + inline void BloomFilter::Insert(const uint32_t hash) { const uint32_t bucket_idx = HashUtil::Rehash32to32(hash) & directory_mask_; - uint64_t bits_to_set = HashUtil::Rehash32to64(hash); - // To set 8 bits in an 64-byte cache line, we set one bit in each 64-bit uint64_t in - // that cache line. This is a "split Bloom filter", and it has approximately the same - // false positive probability as standard a Bloom filter; See Mitzenmacher's "Bloom - // Filters and Such". It also has the advantage of requiring fewer random bits for the - // 2016-era Intel cache line sizes and machine word sizes: log2(64) * 8 = 6 * 8 = 48 - // random bits for a split Bloom filter, but log2(512) * 8 = 72 random bits for a - // standard Bloom filter. In fact, this leaves the most significant 16 bits of - // bits_to_set unused. - DCHECK_GE(std::numeric_limits<uint64_t>::digits, LOG_BUCKET_WORD_BITS * BUCKET_WORDS) - << "bits_to_set must have enough bits to index into all the bucket words"; - for (int i = 0; i < BUCKET_WORDS; ++i) { - // Use LOG_BUCKET_WORD_BITS bits of hash data to index into a BucketWord and set one - // of its bits. - directory_[bucket_idx][i] |= static_cast<BucketWord>(1) - << (bits_to_set & BUCKET_WORD_MASK); - bits_to_set >>= LOG_BUCKET_WORD_BITS; + if (CpuInfo::IsSupported(CpuInfo::AVX2)) { + BucketInsertAVX2(bucket_idx, hash); + } else { + BucketInsert(bucket_idx, hash); } } inline bool BloomFilter::Find(const uint32_t hash) const { const uint32_t bucket_idx = HashUtil::Rehash32to32(hash) & directory_mask_; - uint64_t bits_to_set = HashUtil::Rehash32to64(hash); + if (CpuInfo::IsSupported(CpuInfo::AVX2)) { + return BucketFindAVX2(bucket_idx, hash); + } else { + return BucketFind(bucket_idx, hash); + } +} + +// The SIMD reinterpret_casts technically violate C++'s strict aliasing rules. However, we +// compile with -fno-strict-aliasing. + +inline void BloomFilter::BucketInsert(const uint32_t bucket_idx, const uint32_t hash) { + // new_bucket will be all zeros except for eight 1-bits, one in each 32-bit word. It is + // 16-byte aligned so it can be read as a __m128i using aligned SIMD loads in the second + // part of this method. + uint32_t new_bucket[8] __attribute__((aligned(16))); + for (int i = 0; i < 8; ++i) { + // Rehash 'hash' and use the top LOG_BUCKET_WORD_BITS bits, following Dietzfelbinger. + new_bucket[i] = + (REHASH[i] * hash) >> ((1 << LOG_BUCKET_WORD_BITS) - LOG_BUCKET_WORD_BITS); + new_bucket[i] = 1U << new_bucket[i]; + } + for (int i = 0; i < 2; ++i) { + __m128i new_bucket_sse = + _mm_load_si128(reinterpret_cast<__m128i*>(new_bucket + 4 * i)); + __m128i* existing_bucket = reinterpret_cast<__m128i*>(&directory_[bucket_idx][4 * i]); + *existing_bucket = _mm_or_si128(*existing_bucket, new_bucket_sse); + } +} + +inline __m256i BloomFilter::MakeMask(const uint32_t hash) { + const __m256i ones = _mm256_set1_epi32(1); + const __m256i rehash = _mm256_setr_epi32(IMPALA_BLOOM_HASH_CONSTANTS); + // Load hash into a YMM register, repeated eight times + __m256i hash_data = _mm256_set1_epi32(hash); + // Multiply-shift hashing ala Dietzfelbinger et al.: multiply 'hash' by eight different + // odd constants, then keep the 5 most significant bits from each product. + hash_data = _mm256_mullo_epi32(rehash, hash_data); + hash_data = _mm256_srli_epi32(hash_data, 27); + // Use these 5 bits to shift a single bit to a location in each 32-bit lane + return _mm256_sllv_epi32(ones, hash_data); +} + +inline void BloomFilter::BucketInsertAVX2( + const uint32_t bucket_idx, const uint32_t hash) { + const __m256i mask = MakeMask(hash); + __m256i* const bucket = &reinterpret_cast<__m256i*>(directory_)[bucket_idx]; + _mm256_store_si256(bucket, _mm256_or_si256(*bucket, mask)); + // For SSE compatibility, unset the high bits of each YMM register so SSE instructions + // dont have to save them off before using XMM registers. + _mm256_zeroupper(); +} + +inline bool BloomFilter::BucketFindAVX2( + const uint32_t bucket_idx, const uint32_t hash) const { + const __m256i mask = MakeMask(hash); + const __m256i bucket = reinterpret_cast<__m256i*>(directory_)[bucket_idx]; + // We should return true if 'bucket' has a one wherever 'mask' does. _mm256_testc_si256 + // takes the negation of its first argument and ands that with its second argument. In + // our case, the result is zero everywhere iff there is a one in 'bucket' wherever + // 'mask' is one. testc returns 1 if the result is 0 everywhere and returns 0 otherwise. + const bool result = _mm256_testc_si256(bucket, mask); + _mm256_zeroupper(); + return result; +} + +inline bool BloomFilter::BucketFind( + const uint32_t bucket_idx, const uint32_t hash) const { for (int i = 0; i < BUCKET_WORDS; ++i) { - if (!(directory_[bucket_idx][i] & - (static_cast<BucketWord>(1) << (bits_to_set & BUCKET_WORD_MASK)))) { + BucketWord hval = + (REHASH[i] * hash) >> ((1 << LOG_BUCKET_WORD_BITS) - LOG_BUCKET_WORD_BITS); + hval = 1U << hval; + if (!(directory_[bucket_idx][i] & hval)) { return false; } - bits_to_set >>= LOG_BUCKET_WORD_BITS; } return true; } } // namespace impala +#undef IMPALA_BLOOM_HASH_CONSTANTS #endif // IMPALA_UTIL_BLOOM_H http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/38b18ea1/be/src/util/cpu-info.cc ---------------------------------------------------------------------- diff --git a/be/src/util/cpu-info.cc b/be/src/util/cpu-info.cc index 4ead44f..e0f28c4 100644 --- a/be/src/util/cpu-info.cc +++ b/be/src/util/cpu-info.cc @@ -60,6 +60,7 @@ static struct { { "sse4_1", CpuInfo::SSE4_1 }, { "sse4_2", CpuInfo::SSE4_2 }, { "popcnt", CpuInfo::POPCNT }, + { "avx2", CpuInfo::AVX2 }, }; static const long num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/38b18ea1/be/src/util/cpu-info.h ---------------------------------------------------------------------- diff --git a/be/src/util/cpu-info.h b/be/src/util/cpu-info.h index e47829c..86fc400 100644 --- a/be/src/util/cpu-info.h +++ b/be/src/util/cpu-info.h @@ -33,6 +33,7 @@ class CpuInfo { static const int64_t SSE4_1 = (1 << 2); static const int64_t SSE4_2 = (1 << 3); static const int64_t POPCNT = (1 << 4); + static const int64_t AVX2 = (1 << 5); /// Cache enums for L1 (data), L2 and L3 enum CacheLevel { @@ -85,6 +86,35 @@ class CpuInfo { static std::string DebugString(); + /// A utility class for temporarily disabling CPU features. Usage: + /// + /// { + /// CpuInfo::TempDisable disabler(CpuInfo::AVX2); + /// // On the previous line, the constructor disables AVX2 instructions. On the next + /// // line, CpuInfo::IsSupported(CpuInfo::AVX2) will return false. + /// SomeOperation(); + /// // On the next line, the block closes, 'disabler's destructor runs, and AVX2 + /// // instructions are re-enabled. + /// } + /// + /// TempDisable's destructor never re-enables features that were not enabled when then + /// constructor ran. + struct TempDisable { + TempDisable(int64_t feature) + : feature_(feature), reenable_(CpuInfo::IsSupported(feature)) { + CpuInfo::EnableFeature(feature_, false); + } + ~TempDisable() { + if (reenable_) { + CpuInfo::EnableFeature(feature_, true); + } + } + + private: + int64_t feature_; + bool reenable_; + }; + private: /// Populates the arguments with information about this machine's caches. /// The values returned are not reliable in some environments, e.g. RHEL5 on EC2, so
