IMPALA-4300: Speed up BloomFilter::Or with SIMD The previous code was not written in a way that GCC could auto-vectorize it. Manually vectorizing speeds up BloomFilter::Or by up to 184x.
Change-Id: I840799d9cfb81285c796e2abfe2029bb869b0f67 Reviewed-on: http://gerrit.cloudera.org:8080/4813 Reviewed-by: Jim Apple <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/61fcb489 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/61fcb489 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/61fcb489 Branch: refs/heads/hadoop-next Commit: 61fcb489745f3f0b3f1abbf9fbf666a29a6363de Parents: 0fbb5b7 Author: Jim Apple <[email protected]> Authored: Fri Oct 21 07:46:42 2016 -0700 Committer: Internal Jenkins <[email protected]> Committed: Mon Oct 24 18:07:25 2016 +0000 ---------------------------------------------------------------------- be/src/benchmarks/bloom-filter-benchmark.cc | 198 +++++++++++++++-------- be/src/testutil/mem-util.h | 2 + be/src/util/bloom-filter.cc | 49 +++++- be/src/util/cpu-info.cc | 1 + be/src/util/cpu-info.h | 3 +- 5 files changed, 178 insertions(+), 75 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/benchmarks/bloom-filter-benchmark.cc ---------------------------------------------------------------------- diff --git a/be/src/benchmarks/bloom-filter-benchmark.cc b/be/src/benchmarks/bloom-filter-benchmark.cc index 1aa0619..d9019c8 100644 --- a/be/src/benchmarks/bloom-filter-benchmark.cc +++ b/be/src/benchmarks/bloom-filter-benchmark.cc @@ -30,13 +30,14 @@ using namespace std; using namespace impala; -// Tests Bloom filter performance on four tasks: +// Tests Bloom filter performance on: // // 1. Construct/destruct pairs // 2. Inserts // 3. Lookups when the item is present // 4. Lookups when the item is absent (this is theoretically faster than when the item is // present in some Bloom filter variants) +// 5. Unions // // As in bloom-filter.h, ndv refers to the number of unique items inserted into a filter // and fpp is the probability of false positives. @@ -46,89 +47,116 @@ using namespace impala; // initialize: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// ndv 10k fpp 10.0% 7.05e+03 7.27e+03 7.34e+03 1X 1X 1X -// ndv 10k fpp 1.0% 3.79e+03 3.93e+03 3.96e+03 0.538X 0.541X 0.54X -// ndv 10k fpp 0.1% 1.39e+03 1.42e+03 1.44e+03 0.198X 0.196X 0.196X -// ndv 1000k fpp 10.0% 4.62 4.78 4.81 0.000655X 0.000658X 0.000655X -// ndv 1000k fpp 1.0% 2.49 2.55 2.6 0.000354X 0.000351X 0.000354X -// ndv 1000k fpp 0.1% 2.45 2.55 2.6 0.000347X 0.000351X 0.000354X -// ndv 100000k fpp 10.0% 0.035 0.0358 0.037 4.96e-06X 4.93e-06X 5.04e-06X -// ndv 100000k fpp 1.0% 0.0347 0.0361 0.0372 4.93e-06X 4.96e-06X 5.06e-06X -// ndv 100000k fpp 0.1% 0.0176 0.0181 0.0186 2.5e-06X 2.49e-06X 2.53e-06X +// ndv 10k fpp 10.0% 5.89e+03 5.98e+03 6.03e+03 1X 1X 1X +// ndv 10k fpp 1.0% 3.22e+03 3.25e+03 3.27e+03 0.546X 0.543X 0.542X +// ndv 10k fpp 0.1% 1.13e+03 1.17e+03 1.18e+03 0.191X 0.195X 0.195X +// ndv 1000k fpp 10.0% 3.85 3.93 3.93 0.000654X 0.000657X 0.000652X +// ndv 1000k fpp 1.0% 2.04 2.12 2.12 0.000346X 0.000354X 0.000351X +// ndv 1000k fpp 0.1% 2.04 2.12 2.12 0.000346X 0.000354X 0.000351X +// ndv 100000k fpp 10.0% 0.0281 0.029 0.0294 4.77e-06X 4.85e-06X 4.87e-06X +// ndv 100000k fpp 1.0% 0.0284 0.029 0.0298 4.82e-06X 4.85e-06X 4.93e-06X +// ndv 100000k fpp 0.1% 0.0144 0.0147 0.0149 2.44e-06X 2.47e-06X 2.47e-06X // // With AVX2: // // insert: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// ndv 10k fpp 10.0% 2.03e+05 2.05e+05 2.08e+05 1X 1X 1X -// ndv 10k fpp 1.0% 2.03e+05 2.06e+05 2.08e+05 0.997X 1X 1X -// ndv 10k fpp 0.1% 2.03e+05 2.05e+05 2.07e+05 0.997X 0.998X 0.997X -// ndv 1000k fpp 10.0% 1.82e+05 1.87e+05 1.89e+05 0.896X 0.91X 0.907X -// ndv 1000k fpp 1.0% 1.49e+05 1.53e+05 1.56e+05 0.731X 0.747X 0.75X -// ndv 1000k fpp 0.1% 1.79e+05 1.82e+05 1.83e+05 0.881X 0.886X 0.882X -// ndv 100000k fpp 10.0% 4.08e+04 4.49e+04 5.44e+04 0.201X 0.219X 0.262X -// ndv 100000k fpp 1.0% 3.94e+04 4.4e+04 5.04e+04 0.194X 0.214X 0.242X -// ndv 100000k fpp 0.1% 4.08e+04 4.48e+04 5.68e+04 0.201X 0.218X 0.273X +// ndv 10k fpp 10.0% 1.17e+05 1.23e+05 1.25e+05 1X 1X 1X +// ndv 10k fpp 1.0% 1.17e+05 1.24e+05 1.25e+05 1X 1X 1X +// ndv 10k fpp 0.1% 1.2e+05 1.23e+05 1.24e+05 1.02X 0.996X 0.991X +// ndv 1000k fpp 10.0% 1.1e+05 1.18e+05 1.2e+05 0.944X 0.959X 0.96X +// ndv 1000k fpp 1.0% 1.11e+05 1.16e+05 1.17e+05 0.954X 0.938X 0.934X +// ndv 1000k fpp 0.1% 9.73e+04 1.16e+05 1.17e+05 0.834X 0.937X 0.936X +// ndv 100000k fpp 10.0% 2.96e+04 4.19e+04 5.44e+04 0.254X 0.34X 0.436X +// ndv 100000k fpp 1.0% 2.92e+04 3.81e+04 4.89e+04 0.25X 0.308X 0.391X +// ndv 100000k fpp 0.1% 2.44e+04 3.28e+04 4.31e+04 0.209X 0.266X 0.345X // // find: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// present ndv 10k fpp 10.0% 2.48e+05 2.51e+05 2.53e+05 1X 1X 1X -// absent ndv 10k fpp 10.0% 2.47e+05 2.52e+05 2.55e+05 0.995X 1X 1.01X -// present ndv 10k fpp 1.0% 2.49e+05 2.52e+05 2.55e+05 1X 1.01X 1.01X -// absent ndv 10k fpp 1.0% 2.47e+05 2.53e+05 2.56e+05 0.997X 1.01X 1.01X -// present ndv 10k fpp 0.1% 2.49e+05 2.53e+05 2.54e+05 1X 1.01X 1.01X -// absent ndv 10k fpp 0.1% 2.47e+05 2.53e+05 2.56e+05 0.997X 1.01X 1.01X -// present ndv 1000k fpp 10.0% 1.98e+05 2.04e+05 2.06e+05 0.8X 0.814X 0.812X -// absent ndv 1000k fpp 10.0% 2.01e+05 2.07e+05 2.1e+05 0.808X 0.826X 0.829X -// present ndv 1000k fpp 1.0% 1.83e+05 1.95e+05 2.02e+05 0.737X 0.78X 0.798X -// absent ndv 1000k fpp 1.0% 2.01e+05 2.04e+05 2.08e+05 0.808X 0.815X 0.82X -// present ndv 1000k fpp 0.1% 1.96e+05 2.01e+05 2.03e+05 0.788X 0.8X 0.801X -// absent ndv 1000k fpp 0.1% 2e+05 2.05e+05 2.07e+05 0.808X 0.817X 0.818X -// present ndv 100000k fpp 10.0% 4.6e+04 5.09e+04 6.08e+04 0.185X 0.203X 0.24X -// absent ndv 100000k fpp 10.0% 4.11e+04 4.36e+04 4.53e+04 0.166X 0.174X 0.179X -// present ndv 100000k fpp 1.0% 4.55e+04 4.96e+04 6.19e+04 0.184X 0.198X 0.245X -// absent ndv 100000k fpp 1.0% 3.83e+04 4.15e+04 4.69e+04 0.154X 0.166X 0.186X -// present ndv 100000k fpp 0.1% 4.73e+04 5.43e+04 6.58e+04 0.191X 0.217X 0.26X -// absent ndv 100000k fpp 0.1% 3.77e+04 4.07e+04 4.37e+04 0.152X 0.163X 0.173X +// present ndv 10k fpp 10.0% 1.16e+05 1.17e+05 1.18e+05 1X 1X 1X +// absent ndv 10k fpp 10.0% 1.16e+05 1.17e+05 1.18e+05 1X 1X 0.998X +// present ndv 10k fpp 1.0% 1.15e+05 1.17e+05 1.18e+05 0.988X 1X 0.999X +// absent ndv 10k fpp 1.0% 1.14e+05 1.17e+05 1.19e+05 0.978X 1X 1X +// present ndv 10k fpp 0.1% 1.09e+05 1.17e+05 1.18e+05 0.939X 1X 1X +// absent ndv 10k fpp 0.1% 1.13e+05 1.17e+05 1.18e+05 0.97X 1X 1X +// present ndv 1000k fpp 10.0% 1.09e+05 1.13e+05 1.15e+05 0.942X 0.968X 0.97X +// absent ndv 1000k fpp 10.0% 1.09e+05 1.15e+05 1.16e+05 0.937X 0.982X 0.982X +// present ndv 1000k fpp 1.0% 9.44e+04 1.12e+05 1.13e+05 0.814X 0.952X 0.951X +// absent ndv 1000k fpp 1.0% 1.02e+05 1.14e+05 1.15e+05 0.877X 0.973X 0.972X +// present ndv 1000k fpp 0.1% 1.01e+05 1.11e+05 1.12e+05 0.868X 0.951X 0.949X +// absent ndv 1000k fpp 0.1% 1.08e+05 1.14e+05 1.15e+05 0.927X 0.975X 0.975X +// present ndv 100000k fpp 10.0% 3.18e+04 3.94e+04 5.18e+04 0.274X 0.336X 0.437X +// absent ndv 100000k fpp 10.0% 2.74e+04 3.07e+04 3.49e+04 0.236X 0.262X 0.294X +// present ndv 100000k fpp 1.0% 3.07e+04 4.29e+04 5.51e+04 0.265X 0.366X 0.465X +// absent ndv 100000k fpp 1.0% 2.67e+04 2.9e+04 3.25e+04 0.23X 0.248X 0.274X +// present ndv 100000k fpp 0.1% 2.78e+04 3.88e+04 4.9e+04 0.24X 0.331X 0.413X +// absent ndv 100000k fpp 0.1% 2.44e+04 2.84e+04 3.02e+04 0.211X 0.242X 0.255X // -// Without AVX2: +// union: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile +// (relative) (relative) (relative) +// --------------------------------------------------------------------------------------------------------- +// ndv 10k fpp 10.0% 1.81e+04 1.84e+04 1.86e+04 1X 1X 1X +// ndv 10k fpp 1.0% 8.25e+03 8.39e+03 8.47e+03 0.455X 0.455X 0.455X +// ndv 10k fpp 0.1% 4.02e+03 4.31e+03 4.35e+03 0.222X 0.234X 0.234X +// ndv 1000k fpp 10.0% 105 111 112 0.00577X 0.00603X 0.00602X +// ndv 1000k fpp 1.0% 45.9 46.4 46.9 0.00253X 0.00252X 0.00252X +// ndv 1000k fpp 0.1% 46.2 46.6 46.9 0.00255X 0.00253X 0.00252X +// ndv 100000k fpp 10.0% 0.2 0.2 0.2 1.1e-05X 1.08e-05X 1.07e-05X +// ndv 100000k fpp 1.0% 0.2 0.2 0.2 1.1e-05X 1.08e-05X 1.07e-05X +// ndv 100000k fpp 0.1% 0.133 0.143 0.145 7.35e-06X 7.75e-06X 7.79e-06X +// +// +// Without AVX or AVX2: // // insert: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// ndv 10k fpp 10.0% 1.25e+05 1.27e+05 1.28e+05 1X 1X 1X -// ndv 10k fpp 1.0% 1.27e+05 1.29e+05 1.3e+05 1.01X 1.02X 1.02X -// ndv 10k fpp 0.1% 1.26e+05 1.28e+05 1.3e+05 1X 1.01X 1.01X -// ndv 1000k fpp 10.0% 1.23e+05 1.25e+05 1.26e+05 0.977X 0.981X 0.985X -// ndv 1000k fpp 1.0% 1.16e+05 1.22e+05 1.23e+05 0.925X 0.958X 0.958X -// ndv 1000k fpp 0.1% 1.16e+05 1.22e+05 1.23e+05 0.928X 0.958X 0.957X -// ndv 100000k fpp 10.0% 3.77e+04 4.06e+04 5.62e+04 0.301X 0.319X 0.438X -// ndv 100000k fpp 1.0% 3.71e+04 4.06e+04 5.45e+04 0.296X 0.32X 0.425X -// ndv 100000k fpp 0.1% 3.37e+04 3.68e+04 5.15e+04 0.269X 0.29X 0.401X +// ndv 10k fpp 10.0% 9.27e+04 9.33e+04 9.4e+04 1X 1X 1X +// ndv 10k fpp 1.0% 9.43e+04 9.49e+04 9.61e+04 1.02X 1.02X 1.02X +// ndv 10k fpp 0.1% 9.36e+04 9.5e+04 9.58e+04 1.01X 1.02X 1.02X +// ndv 1000k fpp 10.0% 8.4e+04 9.49e+04 9.61e+04 0.906X 1.02X 1.02X +// ndv 1000k fpp 1.0% 7.64e+04 9.34e+04 9.45e+04 0.824X 1X 1.01X +// ndv 1000k fpp 0.1% 8.24e+04 9.34e+04 9.44e+04 0.888X 1X 1X +// ndv 100000k fpp 10.0% 3.22e+04 4e+04 5.03e+04 0.347X 0.429X 0.535X +// ndv 100000k fpp 1.0% 2.77e+04 3.6e+04 4.8e+04 0.298X 0.386X 0.51X +// ndv 100000k fpp 0.1% 2.54e+04 2.93e+04 4.32e+04 0.274X 0.314X 0.46X // // find: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// present ndv 10k fpp 10.0% 1.6e+05 1.64e+05 1.66e+05 1X 1X 1X -// absent ndv 10k fpp 10.0% 1.11e+05 1.14e+05 1.15e+05 0.696X 0.697X 0.695X -// present ndv 10k fpp 1.0% 1.57e+05 1.63e+05 1.64e+05 0.982X 0.994X 0.989X -// absent ndv 10k fpp 1.0% 1.3e+05 1.33e+05 1.35e+05 0.814X 0.813X 0.812X -// present ndv 10k fpp 0.1% 1.55e+05 1.58e+05 1.61e+05 0.967X 0.968X 0.969X -// absent ndv 10k fpp 0.1% 2.26e+05 2.29e+05 2.31e+05 1.41X 1.4X 1.4X -// present ndv 1000k fpp 10.0% 1.21e+05 1.23e+05 1.25e+05 0.758X 0.753X 0.756X -// absent ndv 1000k fpp 10.0% 7.6e+04 7.72e+04 7.81e+04 0.475X 0.472X 0.471X -// present ndv 1000k fpp 1.0% 1.23e+05 1.27e+05 1.28e+05 0.771X 0.773X 0.77X -// absent ndv 1000k fpp 1.0% 1.19e+05 1.21e+05 1.22e+05 0.744X 0.739X 0.738X -// present ndv 1000k fpp 0.1% 1.17e+05 1.18e+05 1.2e+05 0.731X 0.724X 0.723X -// absent ndv 1000k fpp 0.1% 1.13e+05 1.16e+05 1.17e+05 0.707X 0.706X 0.705X -// present ndv 100000k fpp 10.0% 3.42e+04 3.63e+04 3.9e+04 0.214X 0.222X 0.235X -// absent ndv 100000k fpp 10.0% 3.6e+04 3.77e+04 3.82e+04 0.225X 0.23X 0.23X -// present ndv 100000k fpp 1.0% 3.18e+04 3.42e+04 3.57e+04 0.199X 0.209X 0.216X -// absent ndv 100000k fpp 1.0% 3.63e+04 3.73e+04 3.79e+04 0.227X 0.228X 0.229X -// present ndv 100000k fpp 0.1% 2.89e+04 3.2e+04 3.33e+04 0.18X 0.196X 0.201X -// absent ndv 100000k fpp 0.1% 4.56e+04 4.78e+04 4.86e+04 0.285X 0.292X 0.293X +// present ndv 10k fpp 10.0% 1.3e+05 1.31e+05 1.33e+05 1X 1X 1X +// absent ndv 10k fpp 10.0% 8.74e+04 8.83e+04 8.92e+04 0.674X 0.673X 0.671X +// present ndv 10k fpp 1.0% 1.25e+05 1.3e+05 1.31e+05 0.96X 0.991X 0.988X +// absent ndv 10k fpp 1.0% 1.04e+05 1.06e+05 1.07e+05 0.805X 0.809X 0.807X +// present ndv 10k fpp 0.1% 1.28e+05 1.3e+05 1.31e+05 0.986X 0.988X 0.984X +// absent ndv 10k fpp 0.1% 1.69e+05 1.72e+05 1.74e+05 1.3X 1.31X 1.31X +// present ndv 1000k fpp 10.0% 9.33e+04 9.6e+04 9.69e+04 0.719X 0.732X 0.729X +// absent ndv 1000k fpp 10.0% 5.99e+04 6.07e+04 6.12e+04 0.462X 0.462X 0.461X +// present ndv 1000k fpp 1.0% 9.48e+04 1.01e+05 1.02e+05 0.731X 0.768X 0.768X +// absent ndv 1000k fpp 1.0% 9.49e+04 9.67e+04 9.74e+04 0.731X 0.737X 0.734X +// present ndv 1000k fpp 0.1% 8.46e+04 9.3e+04 9.41e+04 0.652X 0.709X 0.709X +// absent ndv 1000k fpp 0.1% 9.05e+04 9.18e+04 9.28e+04 0.697X 0.7X 0.699X +// present ndv 100000k fpp 10.0% 2.6e+04 2.88e+04 3.11e+04 0.201X 0.22X 0.235X +// absent ndv 100000k fpp 10.0% 2.88e+04 2.99e+04 3.08e+04 0.222X 0.228X 0.232X +// present ndv 100000k fpp 1.0% 2.34e+04 2.76e+04 2.91e+04 0.18X 0.21X 0.219X +// absent ndv 100000k fpp 1.0% 2.86e+04 2.97e+04 3.03e+04 0.22X 0.227X 0.228X +// present ndv 100000k fpp 0.1% 2.34e+04 2.65e+04 2.81e+04 0.18X 0.202X 0.211X +// absent ndv 100000k fpp 0.1% 3.73e+04 3.85e+04 3.91e+04 0.287X 0.293X 0.295X +// +// union: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile +// (relative) (relative) (relative) +// --------------------------------------------------------------------------------------------------------- +// ndv 10k fpp 10.0% 3.06e+03 3.1e+03 3.12e+03 1X 1X 1X +// ndv 10k fpp 1.0% 1.51e+03 1.55e+03 1.57e+03 0.493X 0.502X 0.503X +// ndv 10k fpp 0.1% 748 775 782 0.244X 0.25X 0.251X +// ndv 1000k fpp 10.0% 19.6 20 20.2 0.0064X 0.00646X 0.00647X +// ndv 1000k fpp 1.0% 9.41 10 10.1 0.00307X 0.00324X 0.00323X +// ndv 1000k fpp 0.1% 9.9 10 10.1 0.00323X 0.00324X 0.00323X +// ndv 100000k fpp 10.0% 0.0671 0.0714 0.0725 2.19e-05X 2.3e-05X 2.32e-05X +// ndv 100000k fpp 1.0% 0.0676 0.0709 0.0719 2.21e-05X 2.29e-05X 2.31e-05X +// ndv 100000k fpp 0.1% 0.0338 0.035 0.0356 1.1e-05X 1.13e-05X 1.14e-05X // Make a random uint32_t, avoiding the absent high bit and the low-entropy low bits // produced by rand(). @@ -221,6 +249,27 @@ void Absent(int batch_size, void* data) { } // namespace find +// Benchmark or +namespace either { + +struct TestData { + explicit TestData(int log_heap_size) { + BloomFilter bf(log_heap_size); + BloomFilter::ToThrift(&bf, &tbf); + } + + TBloomFilter tbf; +}; + +void Benchmark(int batch_size, void* data) { + TestData* d = reinterpret_cast<TestData*>(data); + for (int i = 0; i < batch_size; ++i) { + BloomFilter::Or(d->tbf, &d->tbf); + } +} + +} // namespace either + void RunBenchmarks() { char name[120]; @@ -254,6 +303,20 @@ void RunBenchmarks() { } cout << suite.Measure() << endl; } + + { + Benchmark suite("union"); + vector<unique_ptr<either::TestData> > testdata; + for (int ndv = 10000; ndv <= 100 * 1000 * 1000; ndv *= 100) { + for (double fpp = 0.1; fpp >= 0.001; fpp /= 10) { + testdata.emplace_back( + new either::TestData(BloomFilter::MinLogSpace(ndv, fpp))); + snprintf(name, sizeof(name), "ndv %7dk fpp %6.1f%%", ndv/1000, fpp*100); + suite.AddBenchmark(name, either::Benchmark, testdata.back().get()); + } + } + cout << suite.Measure() << endl; + } } int main(int argc, char **argv) { @@ -277,7 +340,8 @@ int main(int argc, char **argv) { cout << "With AVX2:" << endl << endl; RunBenchmarks(); - cout << endl << "Without AVX2:" << endl << endl; - CpuInfo::TempDisable t(CpuInfo::AVX2); + cout << endl << "Without AVX or AVX2:" << endl << endl; + CpuInfo::TempDisable t1(CpuInfo::AVX); + CpuInfo::TempDisable t2(CpuInfo::AVX2); RunBenchmarks(); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/testutil/mem-util.h ---------------------------------------------------------------------- diff --git a/be/src/testutil/mem-util.h b/be/src/testutil/mem-util.h index 78b7b48..b4ce9ea 100644 --- a/be/src/testutil/mem-util.h +++ b/be/src/testutil/mem-util.h @@ -21,6 +21,8 @@ #include <cstdint> #include <cstdlib> +#include <glog/logging.h> + #include "gutil/macros.h" namespace impala { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/util/bloom-filter.cc ---------------------------------------------------------------------- diff --git a/be/src/util/bloom-filter.cc b/be/src/util/bloom-filter.cc index 6fd53f5..2aadc05 100644 --- a/be/src/util/bloom-filter.cc +++ b/be/src/util/bloom-filter.cc @@ -154,6 +154,21 @@ bool BloomFilter::BucketFind( return true; } +namespace { +// Computes out[i] |= in[i] for the arrays 'in' and 'out' of length 'n' using AVX +// instructions. 'n' must be a multiple of 32. +void __attribute__((target("avx"))) OrEqualArrayAvx(size_t n, const char* in, char* out) { + constexpr size_t REGISTER_SIZE = sizeof(__m256d); + DCHECK_EQ(n % REGISTER_SIZE, 0) << "Invalid Bloom Filter directory size"; + const double* simd_in = reinterpret_cast<const double*>(in); + double* simd_out = reinterpret_cast<double*>(out); + const size_t simd_size = n / REGISTER_SIZE; + for (size_t i = 0; i < simd_size; i += REGISTER_SIZE / sizeof(simd_in[0])) { + _mm256_storeu_pd(simd_out + i, + _mm256_or_pd(_mm256_loadu_pd(simd_out + i), _mm256_loadu_pd(simd_in + i))); + } +} +} //namespace void BloomFilter::Or(const TBloomFilter& in, TBloomFilter* out) { DCHECK(out != NULL); @@ -163,8 +178,29 @@ void BloomFilter::Or(const TBloomFilter& in, TBloomFilter* out) { out->directory.resize(0); return; } - - for (int i = 0; i < in.directory.size(); ++i) out->directory[i] |= in.directory[i]; + // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not + // written in a way that is very friendly to auto-vectorization. Instead, we manually + // vectorize, increasing the speed by up to 184x. + // + // TODO: Tune gcc flags to auto-vectorize the trivial loop instead of hand-vectorizing + // it. This might not be possible. + if (CpuInfo::IsSupported(CpuInfo::AVX)) { + OrEqualArrayAvx(in.directory.size(), &in.directory[0], &out->directory[0]); + } else { + const __m128i* simd_in = reinterpret_cast<const __m128i*>(&in.directory[0]); + __m128i* simd_out = reinterpret_cast<__m128i*>(&out->directory[0]); + const size_t simd_size = + (in.directory.size() * sizeof(in.directory[0])) / sizeof(simd_in[0]); + // in.directory has a size (in bytes) that is a multiple of 32. Since sizeof(__m128i) + // == 16, we can do two _mm_or_si128's in each iteration without checking array + // bounds. + for (size_t i = 0; i < simd_size; i += 2) { + _mm_storeu_si128(simd_out + i, + _mm_or_si128(_mm_loadu_si128(simd_out + i), _mm_loadu_si128(simd_in + i))); + _mm_storeu_si128(simd_out + i + 1, _mm_or_si128(_mm_loadu_si128(simd_out + i + 1), + _mm_loadu_si128(simd_in + i + 1))); + } + } } // The following three methods are derived from @@ -187,14 +223,13 @@ int BloomFilter::MinLogSpace(const size_t ndv, const double fpp) { const double m = -k * ndv / log(1 - pow(fpp, 1.0 / k)); // Handle case where ndv == 1 => ceil(log2(m/8)) < 0. - return max(0, static_cast<int>(ceil(log2(m/8)))); + return max(0, static_cast<int>(ceil(log2(m / 8)))); } double BloomFilter::FalsePositiveProb(const size_t ndv, const int log_heap_space) { - return pow( - 1 - exp((-1.0 * static_cast<double>(BUCKET_WORDS) * static_cast<double>(ndv)) / - static_cast<double>(1ull << (log_heap_space + 3))), + return pow(1 - exp((-1.0 * static_cast<double>(BUCKET_WORDS) * static_cast<double>(ndv)) + / static_cast<double>(1ull << (log_heap_space + 3))), BUCKET_WORDS); } -} // namespace impala +} // namespace impala http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/util/cpu-info.cc ---------------------------------------------------------------------- diff --git a/be/src/util/cpu-info.cc b/be/src/util/cpu-info.cc index 532c9dd..6329ca8 100644 --- a/be/src/util/cpu-info.cc +++ b/be/src/util/cpu-info.cc @@ -77,6 +77,7 @@ static struct { { "sse4_1", CpuInfo::SSE4_1 }, { "sse4_2", CpuInfo::SSE4_2 }, { "popcnt", CpuInfo::POPCNT }, + { "avx", CpuInfo::AVX }, { "avx2", CpuInfo::AVX2 }, }; static const long num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/61fcb489/be/src/util/cpu-info.h ---------------------------------------------------------------------- diff --git a/be/src/util/cpu-info.h b/be/src/util/cpu-info.h index cb577c2..868d2dd 100644 --- a/be/src/util/cpu-info.h +++ b/be/src/util/cpu-info.h @@ -36,7 +36,8 @@ class CpuInfo { static const int64_t SSE4_1 = (1 << 2); static const int64_t SSE4_2 = (1 << 3); static const int64_t POPCNT = (1 << 4); - static const int64_t AVX2 = (1 << 5); + static const int64_t AVX = (1 << 5); + static const int64_t AVX2 = (1 << 6); /// Cache enums for L1 (data), L2 and L3 enum CacheLevel {
