[2/2] incubator-impala git commit: Use AVX2 operations to speedup Bloom filters by 10-100%.

tarmstrong Tue, 21 Jun 2016 17:38:38 -0700

Use AVX2 operations to speedup Bloom filters by 10-100%.

As a reminder, our Bloom filters use so-called "block" Bloom filters,
in which each Bloom filter is actually a set of tiny Bloom fitlers,
each the size of a cache line.


The big idea is to make the tiny Bloom filters that make up a large
Bloom filter the size of AVX2 registers (256 bits) rather than cache
lines (512 bits). This enables a number of SIMD optimizations, and the
resulting AVX2 code does not need loops or conditionals.

Impala supports machines that do not have AVX2 instructions, so the
fast path is only conditionally enabled. Checking whether AVX2
instructions are available does not seem to hurt operation speed,
perhaps because the branch becomes very easy to predict.

Change-Id: I6fef4f6652876f8fd7e3f0e41431702380418c98
Reviewed-on: http://gerrit.cloudera.org:8080/3338
Reviewed-by: Jim Apple <[email protected]>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/38b18ea1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/38b18ea1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/38b18ea1

Branch: refs/heads/master
Commit: 38b18ea1c599cf8bfdad5c072beefad869088b45
Parents: fc3ff1c
Author: Jim Apple <[email protected]>
Authored: Thu Jun 2 08:55:32 2016 -0700
Committer: Tim Armstrong <[email protected]>
Committed: Tue Jun 21 17:38:12 2016 -0700

----------------------------------------------------------------------
 be/src/benchmarks/bloom-filter-benchmark.cc | 138 ++++++++++++-----
 be/src/util/bloom-filter-test.cc            |  90 +++++++----
 be/src/util/bloom-filter.cc                 |  20 +--
 be/src/util/bloom-filter.h                  | 183 +++++++++++++++++------
 be/src/util/cpu-info.cc                     |   1 +
 be/src/util/cpu-info.h                      |  30 ++++
 6 files changed, 339 insertions(+), 123 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/38b18ea1/be/src/benchmarks/bloom-filter-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/bloom-filter-benchmark.cc 
b/be/src/benchmarks/bloom-filter-benchmark.cc
index 7374ed4..35d48fc 100644
--- a/be/src/benchmarks/bloom-filter-benchmark.cc
+++ b/be/src/benchmarks/bloom-filter-benchmark.cc
@@ -38,51 +38,103 @@ using namespace impala;
 // As in bloom-filter.h, ndv refers to the number of unique items inserted 
into a filter
 // and fpp is the probability of false positives.
 //
+//
 // Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
+//
+// With AVX2:
+//
 // initialize:           Function     Rate (iters/ms)          Comparison
 // ----------------------------------------------------------------------
-//       ndv      10k fpp   10.0%                6628                  1X
-//       ndv      10k fpp    1.0%                3655             0.5515X
-//       ndv      10k fpp    0.1%                1293              0.195X
-//       ndv    1000k fpp   10.0%               28.92           0.004363X
-//       ndv    1000k fpp    1.0%                14.5           0.002188X
-//       ndv    1000k fpp    0.1%               14.51            0.00219X
-//       ndv  100000k fpp   10.0%             0.05863          8.846e-06X
-//       ndv  100000k fpp    1.0%             0.05776          8.714e-06X
-//       ndv  100000k fpp    0.1%             0.02849          4.298e-06X
+//       ndv      10k fpp   10.0%                6607                  1X
+//       ndv      10k fpp    1.0%                3427             0.5187X
+//       ndv      10k fpp    0.1%                1203              0.182X
+//       ndv    1000k fpp   10.0%               5.273          0.0007982X
+//       ndv    1000k fpp    1.0%               3.297           0.000499X
+//       ndv    1000k fpp    0.1%                3.31           0.000501X
+//       ndv  100000k fpp   10.0%             0.08597          1.301e-05X
+//       ndv  100000k fpp    1.0%              0.0846           1.28e-05X
+//       ndv  100000k fpp    0.1%             0.04349          6.582e-06X
 //
 // insert:               Function     Rate (iters/ms)          Comparison
 // ----------------------------------------------------------------------
-//       ndv      10k fpp   10.0%           1.109e+05                  1X
-//       ndv      10k fpp    1.0%           1.083e+05             0.9771X
-//       ndv      10k fpp    0.1%           1.088e+05             0.9808X
-//       ndv    1000k fpp   10.0%           8.975e+04             0.8094X
-//       ndv    1000k fpp    1.0%             8.8e+04             0.7937X
-//       ndv    1000k fpp    0.1%           9.353e+04             0.8435X
-//       ndv  100000k fpp   10.0%           2.322e+04             0.2094X
-//       ndv  100000k fpp    1.0%           2.314e+04             0.2087X
-//       ndv  100000k fpp    0.1%           1.953e+04             0.1762X
+//       ndv      10k fpp   10.0%           1.858e+05                  1X
+//       ndv      10k fpp    1.0%           1.801e+05             0.9693X
+//       ndv      10k fpp    0.1%           1.869e+05              1.006X
+//       ndv    1000k fpp   10.0%           1.686e+05             0.9076X
+//       ndv    1000k fpp    1.0%           1.627e+05             0.8756X
+//       ndv    1000k fpp    0.1%            1.53e+05             0.8234X
+//       ndv  100000k fpp   10.0%           4.262e+04             0.2294X
+//       ndv  100000k fpp    1.0%           4.326e+04             0.2329X
+//       ndv  100000k fpp    0.1%           4.185e+04             0.2253X
 //
 // find:                 Function     Rate (iters/ms)          Comparison
 // ----------------------------------------------------------------------
-// present ndv      10k fpp   10.0%           2.044e+05                  1X
-// absent  ndv      10k fpp   10.0%           1.019e+05             0.4984X
-// present ndv      10k fpp    1.0%           2.039e+05             0.9976X
-// absent  ndv      10k fpp    1.0%           1.234e+05             0.6037X
-// present ndv      10k fpp    0.1%           1.928e+05             0.9431X
-// absent  ndv      10k fpp    0.1%           1.998e+05             0.9774X
-// present ndv    1000k fpp   10.0%           1.367e+05             0.6686X
-// absent  ndv    1000k fpp   10.0%           7.115e+04              0.348X
-// present ndv    1000k fpp    1.0%           1.164e+05             0.5694X
-// absent  ndv    1000k fpp    1.0%           9.859e+04             0.4822X
-// present ndv    1000k fpp    0.1%           1.153e+05             0.5638X
-// absent  ndv    1000k fpp    0.1%           9.787e+04             0.4787X
-// present ndv  100000k fpp   10.0%           2.869e+04             0.1403X
-// absent  ndv  100000k fpp   10.0%           3.222e+04             0.1576X
-// present ndv  100000k fpp    1.0%           2.868e+04             0.1403X
-// absent  ndv  100000k fpp    1.0%           3.212e+04             0.1571X
-// present ndv  100000k fpp    0.1%           2.793e+04             0.1366X
-// absent  ndv  100000k fpp    0.1%           3.948e+04             0.1931X
+// present ndv      10k fpp   10.0%           2.277e+05                  1X
+// absent  ndv      10k fpp   10.0%           2.258e+05             0.9914X
+// present ndv      10k fpp    1.0%           2.277e+05                  1X
+// absent  ndv      10k fpp    1.0%           2.295e+05              1.008X
+// present ndv      10k fpp    0.1%           2.258e+05             0.9916X
+// absent  ndv      10k fpp    0.1%           2.283e+05              1.003X
+// present ndv    1000k fpp   10.0%           1.799e+05             0.7901X
+// absent  ndv    1000k fpp   10.0%           1.777e+05             0.7803X
+// present ndv    1000k fpp    1.0%            1.52e+05             0.6674X
+// absent  ndv    1000k fpp    1.0%           1.625e+05             0.7134X
+// present ndv    1000k fpp    0.1%           1.825e+05             0.8013X
+// absent  ndv    1000k fpp    0.1%           1.836e+05              0.806X
+// present ndv  100000k fpp   10.0%           4.125e+04             0.1811X
+// absent  ndv  100000k fpp   10.0%           4.147e+04             0.1821X
+// present ndv  100000k fpp    1.0%           4.203e+04             0.1845X
+// absent  ndv  100000k fpp    1.0%           4.189e+04             0.1839X
+// present ndv  100000k fpp    0.1%           3.506e+04             0.1539X
+// absent  ndv  100000k fpp    0.1%           3.507e+04              0.154X
+//
+//
+// Without AVX2:
+//
+// initialize:           Function     Rate (iters/ms)          Comparison
+// ----------------------------------------------------------------------
+//       ndv      10k fpp   10.0%                6453                  1X
+//       ndv      10k fpp    1.0%                3271             0.5068X
+//       ndv      10k fpp    0.1%                1280             0.1984X
+//       ndv    1000k fpp   10.0%               5.213          0.0008078X
+//       ndv    1000k fpp    1.0%               2.574          0.0003989X
+//       ndv    1000k fpp    0.1%               2.584          0.0004005X
+//       ndv  100000k fpp   10.0%             0.03276          5.076e-06X
+//       ndv  100000k fpp    1.0%             0.03224          4.996e-06X
+//       ndv  100000k fpp    0.1%              0.0161          2.494e-06X
+//
+// insert:               Function     Rate (iters/ms)          Comparison
+// ----------------------------------------------------------------------
+//       ndv      10k fpp   10.0%           1.128e+05                  1X
+//       ndv      10k fpp    1.0%           1.162e+05               1.03X
+//       ndv      10k fpp    0.1%           1.145e+05              1.015X
+//       ndv    1000k fpp   10.0%           1.086e+05             0.9626X
+//       ndv    1000k fpp    1.0%           8.377e+04             0.7427X
+//       ndv    1000k fpp    0.1%           8.902e+04             0.7892X
+//       ndv  100000k fpp   10.0%           2.548e+04             0.2259X
+//       ndv  100000k fpp    1.0%            2.37e+04             0.2101X
+//       ndv  100000k fpp    0.1%           2.256e+04                0.2X
+//
+// find:                 Function     Rate (iters/ms)          Comparison
+// ----------------------------------------------------------------------
+// present ndv      10k fpp   10.0%           1.676e+05                  1X
+// absent  ndv      10k fpp   10.0%           1.067e+05             0.6366X
+// present ndv      10k fpp    1.0%           1.683e+05              1.004X
+// absent  ndv      10k fpp    1.0%           1.291e+05             0.7705X
+// present ndv      10k fpp    0.1%           1.662e+05             0.9917X
+// absent  ndv      10k fpp    0.1%           2.238e+05              1.336X
+// present ndv    1000k fpp   10.0%           1.231e+05             0.7344X
+// absent  ndv    1000k fpp   10.0%           6.903e+04             0.4119X
+// present ndv    1000k fpp    1.0%           1.215e+05              0.725X
+// absent  ndv    1000k fpp    1.0%           1.124e+05             0.6707X
+// present ndv    1000k fpp    0.1%           1.095e+05             0.6532X
+// absent  ndv    1000k fpp    0.1%           1.034e+05             0.6171X
+// present ndv  100000k fpp   10.0%           2.733e+04             0.1631X
+// absent  ndv  100000k fpp   10.0%           3.447e+04             0.2057X
+// present ndv  100000k fpp    1.0%           2.779e+04             0.1658X
+// absent  ndv  100000k fpp    1.0%            3.36e+04             0.2005X
+// present ndv  100000k fpp    0.1%           2.725e+04             0.1626X
+// absent  ndv  100000k fpp    0.1%           4.342e+04             0.2591X
 
 // Make a random uint32_t, avoiding the absent high bit and the low-entropy 
low bits
 // produced by rand().
@@ -174,9 +226,7 @@ void Absent(int batch_size, void* data) {
 
 }  // namespace find
 
-int main(int argc, char **argv) {
-  CpuInfo::Init();
-  cout << endl << Benchmark::GetMachineInfo() << endl;
+void RunBenchmarks() {
 
   char name[120];
 
@@ -220,6 +270,14 @@ int main(int argc, char **argv) {
     }
     cout << suite.Measure() << endl;
   }
+}
 
-  return 0;
+int main(int argc, char **argv) {
+  CpuInfo::Init();
+  cout << endl << Benchmark::GetMachineInfo() << endl << endl
+       << "With AVX2:" << endl << endl;
+  RunBenchmarks();
+  cout << endl << "Without AVX2:" << endl << endl;
+  CpuInfo::TempDisable t(CpuInfo::AVX2);
+  RunBenchmarks();
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/38b18ea1/be/src/util/bloom-filter-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bloom-filter-test.cc b/be/src/util/bloom-filter-test.cc
index 70de938..2f2f8da 100644
--- a/be/src/util/bloom-filter-test.cc
+++ b/be/src/util/bloom-filter-test.cc
@@ -15,15 +15,20 @@
 #include "util/bloom-filter.h"
 
 #include <algorithm>
-#include <set>
+#include <unordered_set>
 #include <vector>
 
 #include <gtest/gtest.h>
 
+#include "util/cpu-info.h"
+
 using namespace std;
 
 namespace {
-// Make a random uint32_t, avoiding the absent high bit and the low-entropy 
low bits
+
+using namespace impala;
+
+// Make a random uint64_t, avoiding the absent high bit and the low-entropy 
low bits
 // produced by rand().
 uint64_t MakeRand() {
   uint32_t result = (rand() >> 8) & 0xffff;
@@ -32,7 +37,30 @@ uint64_t MakeRand() {
   return result;
 }
 
+// BfInsert() and BfFind() are like BloomFilter::{Insert,Find}, except they 
randomly
+// disable AVX2 instructions half of the time. These are used for testing that 
AVX2
+// machines and non-AVX2 machines produce compatible BloomFilters.
+
+void BfInsert(BloomFilter& bf, uint32_t h) {
+  if (MakeRand() & 0x1) {
+    bf.Insert(h);
+  } else {
+    CpuInfo::TempDisable t1(CpuInfo::AVX2);
+    bf.Insert(h);
+  }
+}
+
+bool BfFind(BloomFilter& bf, uint32_t h) {
+  if (MakeRand() & 0x1) {
+    return bf.Find(h);
+  } else {
+    CpuInfo::TempDisable t1(CpuInfo::AVX2);
+    return bf.Find(h);
+  }
+}
+
 }  // namespace
+
 namespace impala {
 
 // We can construct (and destruct) Bloom filters with different spaces.
@@ -48,7 +76,7 @@ TEST(BloomFilter, Insert) {
   for (int i = 13; i < 17; ++i) {
     BloomFilter bf(i);
     for (int k = 0; k < (1 << 15); ++k) {
-      bf.Insert(MakeRand());
+      BfInsert(bf, MakeRand());
     }
   }
 }
@@ -60,8 +88,8 @@ TEST(BloomFilter, Find) {
     BloomFilter bf(i);
     for (int k = 0; k < (1 << 15); ++k) {
       const uint64_t to_insert = MakeRand();
-      bf.Insert(to_insert);
-      EXPECT_TRUE(bf.Find(to_insert));
+      BfInsert(bf, to_insert);
+      EXPECT_TRUE(BfFind(bf, to_insert));
     }
   }
 }
@@ -75,9 +103,9 @@ TEST(BloomFilter, CumulativeFind) {
     for (int k = 0; k < (1 << 10); ++k) {
       const uint32_t to_insert = MakeRand();
       inserted.push_back(to_insert);
-      bf.Insert(to_insert);
+      BfInsert(bf, to_insert);
       for (int n = 0; n < inserted.size(); ++n) {
-        EXPECT_TRUE(bf.Find(inserted[n]));
+        EXPECT_TRUE(BfFind(bf, inserted[n]));
       }
     }
   }
@@ -88,17 +116,19 @@ TEST(BloomFilter, CumulativeFind) {
 TEST(BloomFilter, FindInvalid) {
   srand(0);
   static const int find_limit = 1 << 20;
-  set<uint32_t> to_find;
+  unordered_set<uint32_t> to_find;
   while (to_find.size() < find_limit) {
     to_find.insert(MakeRand());
   }
   static const int max_log_ndv = 19;
-  set<uint32_t> to_insert;
+  unordered_set<uint32_t> to_insert;
   while (to_insert.size() < (1ull << max_log_ndv)) {
-    to_insert.insert(MakeRand());
+    const auto candidate = MakeRand();
+    if (to_find.find(candidate) == to_find.end()) {
+      to_insert.insert(candidate);
+    }
   }
   vector<uint32_t> shuffled_insert(to_insert.begin(), to_insert.end());
-  random_shuffle(shuffled_insert.begin(), shuffled_insert.end());
   for (int log_ndv = 12; log_ndv < max_log_ndv; ++log_ndv) {
     for (int log_fpp = 4; log_fpp < 15; ++log_fpp) {
       double fpp = 1.0 / (1 << log_fpp);
@@ -107,20 +137,22 @@ TEST(BloomFilter, FindInvalid) {
       BloomFilter bf(log_heap_space);
       // Fill up a BF with exactly as much ndv as we planned for it:
       for (size_t i = 0; i < ndv; ++i) {
-        bf.Insert(shuffled_insert[i]);
+        BfInsert(bf, shuffled_insert[i]);
       }
       int found = 0;
       // Now we sample from the set of possible hashes, looking for hits.
       for (const auto& i : to_find) {
-        found += bf.Find(i);
+        found += BfFind(bf, i);
       }
-      EXPECT_LE(found, 3 * find_limit * fpp)
+      EXPECT_LE(found, find_limit * fpp * 2)
           << "Too many false positives with -log2(fpp) = " << log_fpp;
       // Because the space is rounded up to a power of 2, we might actually 
get a lower
       // fpp than the one passed to MinLogSpace().
       const double expected_fpp = BloomFilter::FalsePositiveProb(ndv, 
log_heap_space);
-      EXPECT_GE(found, 0.33 * find_limit * expected_fpp)
+      EXPECT_GE(found, find_limit * expected_fpp)
           << "Too few false positives with -log2(fpp) = " << log_fpp;
+      EXPECT_LE(found, find_limit * expected_fpp * 8)
+          << "Too many false positives with -log2(fpp) = " << log_fpp;
     }
   }
 }
@@ -193,11 +225,11 @@ TEST(BloomFilter, MinSpaceForFpp) {
 
 TEST(BloomFilter, Thrift) {
   BloomFilter bf(BloomFilter::MinLogSpace(100, 0.01));
-  for (int i = 0; i < 10; ++i) bf.Insert(i);
+  for (int i = 0; i < 10; ++i) BfInsert(bf, i);
   // Check no unexpected new false positives.
-  set<int> missing_ints;
+  unordered_set<int> missing_ints;
   for (int i = 11; i < 100; ++i) {
-    if (!bf.Find(i)) missing_ints.insert(i);
+    if (!BfFind(bf, i)) missing_ints.insert(i);
   }
 
   TBloomFilter to_thrift;
@@ -205,8 +237,8 @@ TEST(BloomFilter, Thrift) {
   EXPECT_EQ(to_thrift.always_true, false);
 
   BloomFilter from_thrift(to_thrift);
-  for (int i = 0; i < 10; ++i) ASSERT_TRUE(from_thrift.Find(i));
-  for (int missing: missing_ints) ASSERT_FALSE(from_thrift.Find(missing));
+  for (int i = 0; i < 10; ++i) ASSERT_TRUE(BfFind(from_thrift, i));
+  for (int missing: missing_ints) ASSERT_FALSE(BfFind(from_thrift, missing));
 
   BloomFilter::ToThrift(NULL, &to_thrift);
   EXPECT_EQ(to_thrift.always_true, true);
@@ -215,23 +247,25 @@ TEST(BloomFilter, Thrift) {
 TEST(BloomFilter, Or) {
   BloomFilter bf1(BloomFilter::MinLogSpace(100, 0.01));
   BloomFilter bf2(BloomFilter::MinLogSpace(100, 0.01));
-  for (int i = 60; i < 80; ++i) bf2.Insert(i);
+  for (int i = 60; i < 80; ++i) BfInsert(bf2, i);
 
-  for (int i = 0; i < 10; ++i) bf1.Insert(i);
+  for (int i = 0; i < 10; ++i) BfInsert(bf1, i);
   bf2.Or(bf1);
-  for (int i = 0; i < 10; ++i) ASSERT_TRUE(bf2.Find(i));
-  for (int i = 60; i < 80; ++i) ASSERT_TRUE(bf2.Find(i));
+  for (int i = 0; i < 10; ++i) ASSERT_TRUE(BfFind(bf2, i));
+  for (int i = 60; i < 80; ++i) ASSERT_TRUE(BfFind(bf2, i));
 
-  for (int i = 11; i < 50; ++i) bf1.Insert(i);
+  for (int i = 11; i < 50; ++i) BfInsert(bf1, i);
   bf2.Or(bf1);
-  for (int i = 11; i < 50; ++i) ASSERT_TRUE(bf2.Find(i));
-  for (int i = 60; i < 80; ++i) ASSERT_TRUE(bf2.Find(i));
-  ASSERT_FALSE(bf2.Find(81));
+  for (int i = 11; i < 50; ++i) ASSERT_TRUE(BfFind(bf2, i));
+  for (int i = 60; i < 80; ++i) ASSERT_TRUE(BfFind(bf2, i));
+  ASSERT_FALSE(BfFind(bf2, 81));
 }
 
 }  // namespace impala
 
 int main(int argc, char** argv) {
+  using namespace impala;
+  CpuInfo::Init();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/38b18ea1/be/src/util/bloom-filter.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bloom-filter.cc b/be/src/util/bloom-filter.cc
index fc26e32..2d36be5 100644
--- a/be/src/util/bloom-filter.cc
+++ b/be/src/util/bloom-filter.cc
@@ -26,21 +26,22 @@ using namespace std;
 
 namespace impala {
 
-BloomFilter* BloomFilter::ALWAYS_TRUE_FILTER = NULL;
+BloomFilter* const BloomFilter::ALWAYS_TRUE_FILTER = NULL;
+
+constexpr uint32_t BloomFilter::REHASH[8] __attribute__((aligned(32)));
 
 BloomFilter::BloomFilter(const int log_heap_space)
-    : // Since log_heap_space is in bytes, we need to convert it to cache 
lines. There
-      // are 64 = 2^6 bytes in a cache line.
-      log_num_buckets_(std::max(1, log_heap_space - LOG_BUCKET_WORD_BITS)),
-      // Don't use log_num_buckets_ if it will lead to undefined behavior by a 
shift
-      // that is too large.
-      directory_mask_((1ull << std::min(63, log_num_buckets_)) - 1),
-      directory_(NULL) {
+  : // Since log_heap_space is in bytes, we need to convert it to the number 
of tiny Bloom
+    // filters we will use.
+    log_num_buckets_(std::max(1, log_heap_space - LOG_BUCKET_BYTE_SIZE)),
+    // Don't use log_num_buckets_ if it will lead to undefined behavior by a 
shift
+    // that is too large.
+    directory_mask_((1ull << std::min(63, log_num_buckets_)) - 1),
+    directory_(NULL) {
   // Since we use 32 bits in the arguments of Insert() and Find(), 
log_num_buckets_
   // must be limited.
   DCHECK(log_num_buckets_ <= 32)
       << "Bloom filter too large. log_heap_space: " << log_heap_space;
-  // Each bucket has 64 = 2^6 bytes:
   const size_t alloc_size = directory_size();
   const int malloc_failed =
       posix_memalign(reinterpret_cast<void**>(&directory_), 64, alloc_size);
@@ -84,6 +85,7 @@ void BloomFilter::Or(const BloomFilter& other) {
   BucketWord* dir_ptr = reinterpret_cast<BucketWord*>(directory_);
   const BucketWord* other_dir_ptr = reinterpret_cast<const 
BucketWord*>(other.directory_);
   int directory_size_in_words = directory_size() / sizeof(BucketWord);
+  // TODO: use SIMD here:
   for (int i = 0; i < directory_size_in_words; ++i) dir_ptr[i] |= 
other_dir_ptr[i];
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/38b18ea1/be/src/util/bloom-filter.h
----------------------------------------------------------------------
diff --git a/be/src/util/bloom-filter.h b/be/src/util/bloom-filter.h
index 6fdd140..8a94ea8 100644
--- a/be/src/util/bloom-filter.h
+++ b/be/src/util/bloom-filter.h
@@ -20,6 +20,8 @@
 
 #include <limits>
 
+#include <immintrin.h>
+
 #include "gutil/macros.h"
 
 #include "gen-cpp/ImpalaInternalService_types.h"
@@ -40,21 +42,23 @@ namespace impala {
 /// 2. NDV: the number of unique items that have been inserted
 ///
 /// BloomFilter is implemented using block Bloom filters from Putze et al.'s 
"Cache-,
-/// Hash- and Space-Efficient Bloom Filters". The basic idea is to hash the 
item to a
-/// single cache line and then treat that cache line like a Bloom filter.  This
-/// implementation sets 8 bits in each cache-line-sized Bloom filter. This 
provides a
-/// false positive rate near optimal for between 5 and 15 bits per distinct 
value, which
-/// corresponds to false positive probabilities between 0.1% (for 15 bits) and 
10% (for 5
-/// bits).
+/// Hash- and Space-Efficient Bloom Filters". The basic idea is to hash the 
item to a tiny
+/// Bloom filter the size of a single cache line or smaller. This 
implementation sets 8
+/// bits in each tiny Bloom filter. This provides a false positive rate near 
optimal for
+/// between 5 and 15 bits per distinct value, which corresponds to false 
positive
+/// probabilities between 0.1% (for 15 bits) and 10% (for 5 bits).
+///
+/// Our tiny BloomFilters are 32 bytes to take advantage of 32-byte SIMD in 
newer Intel
+/// machines.
 class BloomFilter {
  public:
   /// Consumes at most (1 << log_heap_space) bytes on the heap.
-  BloomFilter(const int log_heap_space);
-  BloomFilter(const TBloomFilter& thrift);
+  explicit BloomFilter(const int log_heap_space);
+  explicit BloomFilter(const TBloomFilter& thrift);
   ~BloomFilter();
 
   /// Representation of a filter which allows all elements to pass.
-  static BloomFilter* ALWAYS_TRUE_FILTER;
+  static BloomFilter* const ALWAYS_TRUE_FILTER;
 
   /// Converts 'filter' to its corresponding Thrift representation. If the 
first argument
   /// is NULL, it is interpreted as a complete filter which contains all 
elements.
@@ -97,32 +101,48 @@ class BloomFilter {
   }
 
  private:
-  /// log_directory_space_ is the log (base 2) of the number of buckets in the 
directory.
+  /// The BloomFilter is divided up into Buckets
+  static const uint64_t BUCKET_WORDS = 8;
+  typedef uint32_t BucketWord;
+
+  // log2(number of bits in a BucketWord)
+  static const int LOG_BUCKET_WORD_BITS = 5;
+  static const BucketWord BUCKET_WORD_MASK = (1 << LOG_BUCKET_WORD_BITS) - 1;
+
+  /// log2(number of bytes in a bucket)
+  static const int LOG_BUCKET_BYTE_SIZE = 5;
+
+  static_assert((1 << LOG_BUCKET_WORD_BITS) == 
std::numeric_limits<BucketWord>::digits,
+      "BucketWord must have a bit-width that is be a power of 2, like 64 for 
uint64_t.");
+
+  typedef BucketWord Bucket[BUCKET_WORDS];
+
+  /// log_num_buckets_ is the log (base 2) of the number of buckets in the 
directory.
   const int log_num_buckets_;
 
   /// directory_mask_ is (1 << log_num_buckets_) - 1. It is precomputed for
   /// efficiency reasons.
   const uint32_t directory_mask_;
 
-  /// The BloomFilter is divided up into Buckets, each of which is a cache 
line.
-  static const uint64_t BUCKET_WORDS = 8;
-  typedef uint64_t BucketWord;
+  Bucket* directory_;
 
-  // log2(number of bits in a BucketWord)
-  // TODO: Use Bits::Log2Ceiling64(numeric_limits<BucketWord>::digits) once we 
enable
-  // C++14 for codegen.
-  static const int LOG_BUCKET_WORD_BITS = 6;
-  static const BucketWord BUCKET_WORD_MASK = 63; // 2^LOG_BUCKET_WORD_BITS - 1
+  /// Does the actual work of Insert(). bucket_idx is the index of the bucket 
to insert
+  /// into and 'hash' is the value passed to Insert().
+  void BucketInsert(const uint32_t bucket_idx, const uint32_t hash);
 
-  /// log2(number of bytes in a bucket)
-  static const int LOG_BUCKET_BYTE_SIZE = 6;
+  /// A faster SIMD version of BucketInsert().
+  void BucketInsertAVX2(const uint32_t bucket_idx, const uint32_t hash)
+      __attribute__((__target__("avx2")));
 
-  // TODO: Re-enable static_asserts when C++14 is enabled.
-  //static_assert((1 << LOG_BUCKET_WORD_BITS) == 
std::numeric_limits<BucketWord>::digits,
-  // "BucketWord must have a bit-width that is be a power of 2, like 64 for 
uint64_t.");
+  /// BucketFind() and BucketFindAVX2() are just like BucketInsert() and
+  /// BucketInsertAVX2(), but for Find().
+  bool BucketFind(const uint32_t bucket_idx, const uint32_t hash) const;
+  bool BucketFindAVX2(const uint32_t bucket_idx, const uint32_t hash) const
+      __attribute__((__target__("avx2")));
 
-  typedef BucketWord Bucket[BUCKET_WORDS];
-  Bucket* directory_;
+  /// A helper function for the AVX2 methods. Turns a 32-bit hash into a 
256-bit Bucket
+  /// with 1 single 1-bit set in each 32-bit lane.
+  static __m256i MakeMask(const uint32_t hash) 
__attribute__((__target__("avx2")));
 
   int64_t directory_size() const {
     return 1uLL << (log_num_buckets_ + LOG_BUCKET_BYTE_SIZE);
@@ -131,44 +151,115 @@ class BloomFilter {
   /// Serializes this filter as Thrift.
   void ToThrift(TBloomFilter* thrift) const;
 
+  /// Some constants used in hashing. #defined for efficiency reasons.
+#define IMPALA_BLOOM_HASH_CONSTANTS                                            
 \
+  0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU, 0x705495c7U, 
0x2df1424bU, \
+      0x9efc4947U, 0x5c6bfb31U
+
+  /// REHASH is used as 8 odd 32-bit unsigned ints.  See Dietzfelbinger et 
al.'s "A
+  /// reliable randomized algorithm for the closest-pair problem".
+  static constexpr uint32_t REHASH[8]
+      __attribute__((aligned(32))) = {IMPALA_BLOOM_HASH_CONSTANTS};
+
   DISALLOW_COPY_AND_ASSIGN(BloomFilter);
 };
 
+// To set 8 bits in an 32-byte Bloom filter, we set one bit in each 32-bit 
uint32_t. This
+// is a "split Bloom filter", and it has approximately the same false positive 
probability
+// as standard a Bloom filter; See Mitzenmacher's "Bloom Filters and Such". It 
also has
+// the advantage of requiring fewer random bits: log2(32) * 8 = 5 * 8 = 40 
random bits for
+// a split Bloom filter, but log2(256) * 8 = 64 random bits for a standard 
Bloom filter.
+
 inline void BloomFilter::Insert(const uint32_t hash) {
   const uint32_t bucket_idx = HashUtil::Rehash32to32(hash) & directory_mask_;
-  uint64_t bits_to_set = HashUtil::Rehash32to64(hash);
-  // To set 8 bits in an 64-byte cache line, we set one bit in each 64-bit 
uint64_t in
-  // that cache line. This is a "split Bloom filter", and it has approximately 
the same
-  // false positive probability as standard a Bloom filter; See Mitzenmacher's 
"Bloom
-  // Filters and Such". It also has the advantage of requiring fewer random 
bits for the
-  // 2016-era Intel cache line sizes and machine word sizes: log2(64) * 8 = 6 
* 8 = 48
-  // random bits for a split Bloom filter, but log2(512) * 8 = 72 random bits 
for a
-  // standard Bloom filter. In fact, this leaves the most significant 16 bits 
of
-  // bits_to_set unused.
-  DCHECK_GE(std::numeric_limits<uint64_t>::digits, LOG_BUCKET_WORD_BITS * 
BUCKET_WORDS)
-      << "bits_to_set must have enough bits to index into all the bucket 
words";
-  for (int i = 0; i < BUCKET_WORDS; ++i) {
-    // Use LOG_BUCKET_WORD_BITS bits of hash data to index into a BucketWord 
and set one
-    // of its bits.
-    directory_[bucket_idx][i] |= static_cast<BucketWord>(1)
-        << (bits_to_set & BUCKET_WORD_MASK);
-    bits_to_set >>= LOG_BUCKET_WORD_BITS;
+  if (CpuInfo::IsSupported(CpuInfo::AVX2)) {
+    BucketInsertAVX2(bucket_idx, hash);
+  } else {
+    BucketInsert(bucket_idx, hash);
   }
 }
 
 inline bool BloomFilter::Find(const uint32_t hash) const {
   const uint32_t bucket_idx = HashUtil::Rehash32to32(hash) & directory_mask_;
-  uint64_t bits_to_set = HashUtil::Rehash32to64(hash);
+  if (CpuInfo::IsSupported(CpuInfo::AVX2)) {
+    return BucketFindAVX2(bucket_idx, hash);
+  } else {
+    return BucketFind(bucket_idx, hash);
+  }
+}
+
+// The SIMD reinterpret_casts technically violate C++'s strict aliasing rules. 
However, we
+// compile with -fno-strict-aliasing.
+
+inline void BloomFilter::BucketInsert(const uint32_t bucket_idx, const 
uint32_t hash) {
+  // new_bucket will be all zeros except for eight 1-bits, one in each 32-bit 
word. It is
+  // 16-byte aligned so it can be read as a __m128i using aligned SIMD loads 
in the second
+  // part of this method.
+  uint32_t new_bucket[8] __attribute__((aligned(16)));
+  for (int i = 0; i < 8; ++i) {
+    // Rehash 'hash' and use the top LOG_BUCKET_WORD_BITS bits, following 
Dietzfelbinger.
+    new_bucket[i] =
+        (REHASH[i] * hash) >> ((1 << LOG_BUCKET_WORD_BITS) - 
LOG_BUCKET_WORD_BITS);
+    new_bucket[i] = 1U << new_bucket[i];
+  }
+  for (int i = 0; i < 2; ++i) {
+    __m128i new_bucket_sse =
+        _mm_load_si128(reinterpret_cast<__m128i*>(new_bucket + 4 * i));
+    __m128i* existing_bucket = 
reinterpret_cast<__m128i*>(&directory_[bucket_idx][4 * i]);
+    *existing_bucket = _mm_or_si128(*existing_bucket, new_bucket_sse);
+  }
+}
+
+inline __m256i BloomFilter::MakeMask(const uint32_t hash) {
+   const __m256i ones = _mm256_set1_epi32(1);
+   const __m256i rehash = _mm256_setr_epi32(IMPALA_BLOOM_HASH_CONSTANTS);
+  // Load hash into a YMM register, repeated eight times
+  __m256i hash_data = _mm256_set1_epi32(hash);
+  // Multiply-shift hashing ala Dietzfelbinger et al.: multiply 'hash' by 
eight different
+  // odd constants, then keep the 5 most significant bits from each product.
+  hash_data = _mm256_mullo_epi32(rehash, hash_data);
+  hash_data = _mm256_srli_epi32(hash_data, 27);
+  // Use these 5 bits to shift a single bit to a location in each 32-bit lane
+  return _mm256_sllv_epi32(ones, hash_data);
+}
+
+inline void BloomFilter::BucketInsertAVX2(
+    const uint32_t bucket_idx, const uint32_t hash) {
+  const __m256i mask = MakeMask(hash);
+  __m256i* const bucket = &reinterpret_cast<__m256i*>(directory_)[bucket_idx];
+  _mm256_store_si256(bucket, _mm256_or_si256(*bucket, mask));
+  // For SSE compatibility, unset the high bits of each YMM register so SSE 
instructions
+  // dont have to save them off before using XMM registers.
+  _mm256_zeroupper();
+}
+
+inline bool BloomFilter::BucketFindAVX2(
+    const uint32_t bucket_idx, const uint32_t hash) const {
+  const __m256i mask = MakeMask(hash);
+  const __m256i bucket = reinterpret_cast<__m256i*>(directory_)[bucket_idx];
+  // We should return true if 'bucket' has a one wherever 'mask' does. 
_mm256_testc_si256
+  // takes the negation of its first argument and ands that with its second 
argument. In
+  // our case, the result is zero everywhere iff there is a one in 'bucket' 
wherever
+  // 'mask' is one. testc returns 1 if the result is 0 everywhere and returns 
0 otherwise.
+  const bool result = _mm256_testc_si256(bucket, mask);
+  _mm256_zeroupper();
+  return result;
+}
+
+inline bool BloomFilter::BucketFind(
+    const uint32_t bucket_idx, const uint32_t hash) const {
   for (int i = 0; i < BUCKET_WORDS; ++i) {
-    if (!(directory_[bucket_idx][i] &
-            (static_cast<BucketWord>(1) << (bits_to_set & BUCKET_WORD_MASK)))) 
{
+    BucketWord hval =
+        (REHASH[i] * hash) >> ((1 << LOG_BUCKET_WORD_BITS) - 
LOG_BUCKET_WORD_BITS);
+    hval = 1U << hval;
+    if (!(directory_[bucket_idx][i] & hval)) {
       return false;
     }
-    bits_to_set >>= LOG_BUCKET_WORD_BITS;
   }
   return true;
 }
 
 }  // namespace impala
 
+#undef IMPALA_BLOOM_HASH_CONSTANTS
 #endif  // IMPALA_UTIL_BLOOM_H

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/38b18ea1/be/src/util/cpu-info.cc
----------------------------------------------------------------------
diff --git a/be/src/util/cpu-info.cc b/be/src/util/cpu-info.cc
index 4ead44f..e0f28c4 100644
--- a/be/src/util/cpu-info.cc
+++ b/be/src/util/cpu-info.cc
@@ -60,6 +60,7 @@ static struct {
   { "sse4_1", CpuInfo::SSE4_1 },
   { "sse4_2", CpuInfo::SSE4_2 },
   { "popcnt", CpuInfo::POPCNT },
+  { "avx2",   CpuInfo::AVX2 },
 };
 static const long num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/38b18ea1/be/src/util/cpu-info.h
----------------------------------------------------------------------
diff --git a/be/src/util/cpu-info.h b/be/src/util/cpu-info.h
index e47829c..86fc400 100644
--- a/be/src/util/cpu-info.h
+++ b/be/src/util/cpu-info.h
@@ -33,6 +33,7 @@ class CpuInfo {
   static const int64_t SSE4_1  = (1 << 2);
   static const int64_t SSE4_2  = (1 << 3);
   static const int64_t POPCNT  = (1 << 4);
+  static const int64_t AVX2    = (1 << 5);
 
   /// Cache enums for L1 (data), L2 and L3
   enum CacheLevel {
@@ -85,6 +86,35 @@ class CpuInfo {
 
   static std::string DebugString();
 
+  /// A utility class for temporarily disabling CPU features. Usage:
+  ///
+  /// {
+  ///   CpuInfo::TempDisable disabler(CpuInfo::AVX2);
+  ///   // On the previous line, the constructor disables AVX2 instructions. 
On the next
+  ///   // line, CpuInfo::IsSupported(CpuInfo::AVX2) will return false.
+  ///   SomeOperation();
+  ///   // On the next line, the block closes, 'disabler's destructor runs, 
and AVX2
+  ///   // instructions are re-enabled.
+  /// }
+  ///
+  /// TempDisable's destructor never re-enables features that were not enabled 
when then
+  /// constructor ran.
+  struct TempDisable {
+    TempDisable(int64_t feature)
+      : feature_(feature), reenable_(CpuInfo::IsSupported(feature)) {
+      CpuInfo::EnableFeature(feature_, false);
+    }
+    ~TempDisable() {
+      if (reenable_) {
+        CpuInfo::EnableFeature(feature_, true);
+      }
+    }
+
+   private:
+    int64_t feature_;
+    bool reenable_;
+  };
+
  private:
   /// Populates the arguments with information about this machine's caches.
   /// The values returned are not reliable in some environments, e.g. RHEL5 on 
EC2, so

[2/2] incubator-impala git commit: Use AVX2 operations to speedup Bloom filters by 10-100%.

Reply via email to