IMPALA-4381: Incorrect AVX version of BloomFilter::Or

The iteration on the loop incrementing the loop variable by the wrong
amount. This version is hopefully clearer. I have tested it on an
AVX_enabled machine and I have inspected the assembly.

Change-Id: I18480c598fbf6b842398581acde6fc719c40ce27
Reviewed-on: http://gerrit.cloudera.org:8080/4866
Reviewed-by: Tim Armstrong <[email protected]>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/81c5b815
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/81c5b815
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/81c5b815

Branch: refs/heads/master
Commit: 81c5b8150f3ee3ad8a1e1c1b0e768164225e2251
Parents: dba30cc
Author: Jim Apple <[email protected]>
Authored: Thu Oct 27 09:09:20 2016 -0700
Committer: Internal Jenkins <[email protected]>
Committed: Wed Nov 2 20:05:47 2016 +0000

----------------------------------------------------------------------
 be/src/benchmarks/bloom-filter-benchmark.cc | 167 ++++++++++++-----------
 be/src/util/bloom-filter-test.cc            |  52 +++----
 be/src/util/bloom-filter.cc                 |  40 +++---
 3 files changed, 136 insertions(+), 123 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/81c5b815/be/src/benchmarks/bloom-filter-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/bloom-filter-benchmark.cc 
b/be/src/benchmarks/bloom-filter-benchmark.cc
index d9019c8..792be19 100644
--- a/be/src/benchmarks/bloom-filter-benchmark.cc
+++ b/be/src/benchmarks/bloom-filter-benchmark.cc
@@ -47,65 +47,65 @@ using namespace impala;
 // initialize:                Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//            ndv      10k fpp   10.0%           5.89e+03 5.98e+03 6.03e+03    
     1X         1X         1X
-//            ndv      10k fpp    1.0%           3.22e+03 3.25e+03 3.27e+03    
 0.546X     0.543X     0.542X
-//            ndv      10k fpp    0.1%           1.13e+03 1.17e+03 1.18e+03    
 0.191X     0.195X     0.195X
-//            ndv    1000k fpp   10.0%               3.85     3.93     3.93  
0.000654X  0.000657X  0.000652X
-//            ndv    1000k fpp    1.0%               2.04     2.12     2.12  
0.000346X  0.000354X  0.000351X
-//            ndv    1000k fpp    0.1%               2.04     2.12     2.12  
0.000346X  0.000354X  0.000351X
-//            ndv  100000k fpp   10.0%             0.0281    0.029   0.0294  
4.77e-06X  4.85e-06X  4.87e-06X
-//            ndv  100000k fpp    1.0%             0.0284    0.029   0.0298  
4.82e-06X  4.85e-06X  4.93e-06X
-//            ndv  100000k fpp    0.1%             0.0144   0.0147   0.0149  
2.44e-06X  2.47e-06X  2.47e-06X
+//            ndv      10k fpp   10.0%           5.92e+03 5.98e+03 6.03e+03    
     1X         1X         1X
+//            ndv      10k fpp    1.0%           3.17e+03 3.24e+03 3.26e+03    
 0.535X     0.542X     0.541X
+//            ndv      10k fpp    0.1%           1.16e+03 1.17e+03 1.18e+03    
 0.195X     0.195X     0.195X
+//            ndv    1000k fpp   10.0%               3.85     3.93     3.93  
0.000651X  0.000657X  0.000652X
+//            ndv    1000k fpp    1.0%               2.08     2.12     2.12  
0.000351X  0.000354X  0.000351X
+//            ndv    1000k fpp    0.1%               2.08     2.12     2.12  
0.000351X  0.000354X  0.000351X
+//            ndv  100000k fpp   10.0%             0.0299   0.0304    0.031  
5.06e-06X  5.09e-06X  5.14e-06X
+//            ndv  100000k fpp    1.0%             0.0295   0.0306   0.0311  
4.98e-06X  5.12e-06X  5.15e-06X
+//            ndv  100000k fpp    0.1%             0.0151   0.0153   0.0154  
2.55e-06X  2.55e-06X  2.55e-06X
 //
 // With AVX2:
 //
 // insert:                    Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//            ndv      10k fpp   10.0%           1.17e+05 1.23e+05 1.25e+05    
     1X         1X         1X
-//            ndv      10k fpp    1.0%           1.17e+05 1.24e+05 1.25e+05    
     1X         1X         1X
-//            ndv      10k fpp    0.1%            1.2e+05 1.23e+05 1.24e+05    
  1.02X     0.996X     0.991X
-//            ndv    1000k fpp   10.0%            1.1e+05 1.18e+05  1.2e+05    
 0.944X     0.959X      0.96X
-//            ndv    1000k fpp    1.0%           1.11e+05 1.16e+05 1.17e+05    
 0.954X     0.938X     0.934X
-//            ndv    1000k fpp    0.1%           9.73e+04 1.16e+05 1.17e+05    
 0.834X     0.937X     0.936X
-//            ndv  100000k fpp   10.0%           2.96e+04 4.19e+04 5.44e+04    
 0.254X      0.34X     0.436X
-//            ndv  100000k fpp    1.0%           2.92e+04 3.81e+04 4.89e+04    
  0.25X     0.308X     0.391X
-//            ndv  100000k fpp    0.1%           2.44e+04 3.28e+04 4.31e+04    
 0.209X     0.266X     0.345X
+//            ndv      10k fpp   10.0%           1.22e+05 1.23e+05 1.24e+05    
     1X         1X         1X
+//            ndv      10k fpp    1.0%           1.22e+05 1.23e+05 1.24e+05    
 0.998X         1X         1X
+//            ndv      10k fpp    0.1%           1.22e+05 1.23e+05 1.24e+05    
     1X         1X         1X
+//            ndv    1000k fpp   10.0%           1.16e+05 1.18e+05  1.2e+05    
  0.95X     0.964X     0.965X
+//            ndv    1000k fpp    1.0%           1.14e+05 1.15e+05 1.16e+05    
 0.935X     0.941X     0.939X
+//            ndv    1000k fpp    0.1%           1.14e+05 1.16e+05 1.17e+05    
 0.939X     0.945X     0.943X
+//            ndv  100000k fpp   10.0%           3.35e+04 4.22e+04  5.3e+04    
 0.275X     0.344X     0.428X
+//            ndv  100000k fpp    1.0%           3.16e+04 4.77e+04 5.78e+04    
  0.26X     0.388X     0.466X
+//            ndv  100000k fpp    0.1%              3e+04  3.7e+04 4.66e+04    
 0.246X     0.301X     0.376X
 //
 // find:                      Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
 //    present ndv      10k fpp   10.0%           1.16e+05 1.17e+05 1.18e+05    
     1X         1X         1X
-//    absent  ndv      10k fpp   10.0%           1.16e+05 1.17e+05 1.18e+05    
     1X         1X     0.998X
-//    present ndv      10k fpp    1.0%           1.15e+05 1.17e+05 1.18e+05    
 0.988X         1X     0.999X
-//    absent  ndv      10k fpp    1.0%           1.14e+05 1.17e+05 1.19e+05    
 0.978X         1X         1X
-//    present ndv      10k fpp    0.1%           1.09e+05 1.17e+05 1.18e+05    
 0.939X         1X         1X
-//    absent  ndv      10k fpp    0.1%           1.13e+05 1.17e+05 1.18e+05    
  0.97X         1X         1X
-//    present ndv    1000k fpp   10.0%           1.09e+05 1.13e+05 1.15e+05    
 0.942X     0.968X      0.97X
-//    absent  ndv    1000k fpp   10.0%           1.09e+05 1.15e+05 1.16e+05    
 0.937X     0.982X     0.982X
-//    present ndv    1000k fpp    1.0%           9.44e+04 1.12e+05 1.13e+05    
 0.814X     0.952X     0.951X
-//    absent  ndv    1000k fpp    1.0%           1.02e+05 1.14e+05 1.15e+05    
 0.877X     0.973X     0.972X
-//    present ndv    1000k fpp    0.1%           1.01e+05 1.11e+05 1.12e+05    
 0.868X     0.951X     0.949X
-//    absent  ndv    1000k fpp    0.1%           1.08e+05 1.14e+05 1.15e+05    
 0.927X     0.975X     0.975X
-//    present ndv  100000k fpp   10.0%           3.18e+04 3.94e+04 5.18e+04    
 0.274X     0.336X     0.437X
-//    absent  ndv  100000k fpp   10.0%           2.74e+04 3.07e+04 3.49e+04    
 0.236X     0.262X     0.294X
-//    present ndv  100000k fpp    1.0%           3.07e+04 4.29e+04 5.51e+04    
 0.265X     0.366X     0.465X
-//    absent  ndv  100000k fpp    1.0%           2.67e+04  2.9e+04 3.25e+04    
  0.23X     0.248X     0.274X
-//    present ndv  100000k fpp    0.1%           2.78e+04 3.88e+04  4.9e+04    
  0.24X     0.331X     0.413X
-//    absent  ndv  100000k fpp    0.1%           2.44e+04 2.84e+04 3.02e+04    
 0.211X     0.242X     0.255X
+//    absent  ndv      10k fpp   10.0%           1.15e+05 1.17e+05 1.18e+05    
 0.996X     0.998X         1X
+//    present ndv      10k fpp    1.0%           1.16e+05 1.17e+05 1.18e+05    
 0.999X     0.996X         1X
+//    absent  ndv      10k fpp    1.0%           1.16e+05 1.17e+05 1.18e+05    
     1X     0.998X     0.999X
+//    present ndv      10k fpp    0.1%           1.16e+05 1.17e+05 1.18e+05    
 0.999X     0.997X     0.997X
+//    absent  ndv      10k fpp    0.1%           1.16e+05 1.17e+05 1.18e+05    
     1X     0.996X     0.998X
+//    present ndv    1000k fpp   10.0%           1.09e+05 1.12e+05 1.14e+05    
 0.936X     0.958X     0.964X
+//    absent  ndv    1000k fpp   10.0%           1.07e+05 1.14e+05 1.15e+05    
 0.921X     0.976X     0.976X
+//    present ndv    1000k fpp    1.0%           1.05e+05  1.1e+05 1.12e+05    
 0.906X     0.943X     0.946X
+//    absent  ndv    1000k fpp    1.0%           1.11e+05 1.13e+05 1.14e+05    
 0.961X     0.966X     0.969X
+//    present ndv    1000k fpp    0.1%           9.78e+04 1.11e+05 1.12e+05    
 0.844X     0.944X     0.946X
+//    absent  ndv    1000k fpp    0.1%           1.08e+05 1.13e+05 1.14e+05    
  0.93X     0.967X      0.97X
+//    present ndv  100000k fpp   10.0%           3.85e+04 4.53e+04 6.12e+04    
 0.332X     0.387X     0.518X
+//    absent  ndv  100000k fpp   10.0%           2.54e+04 3.01e+04 3.26e+04    
 0.219X     0.257X     0.276X
+//    present ndv  100000k fpp    1.0%            3.3e+04  4.5e+04 6.06e+04    
 0.284X     0.384X     0.514X
+//    absent  ndv  100000k fpp    1.0%           2.67e+04 3.01e+04  3.2e+04    
  0.23X     0.257X     0.271X
+//    present ndv  100000k fpp    0.1%           3.12e+04 4.25e+04 5.15e+04    
 0.269X     0.362X     0.436X
+//    absent  ndv  100000k fpp    0.1%           2.39e+04 2.69e+04 2.84e+04    
 0.206X     0.229X      0.24X
 //
 // union:                     Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//            ndv      10k fpp   10.0%           1.81e+04 1.84e+04 1.86e+04    
     1X         1X         1X
-//            ndv      10k fpp    1.0%           8.25e+03 8.39e+03 8.47e+03    
 0.455X     0.455X     0.455X
-//            ndv      10k fpp    0.1%           4.02e+03 4.31e+03 4.35e+03    
 0.222X     0.234X     0.234X
-//            ndv    1000k fpp   10.0%                105      111      112   
0.00577X   0.00603X   0.00602X
-//            ndv    1000k fpp    1.0%               45.9     46.4     46.9   
0.00253X   0.00252X   0.00252X
-//            ndv    1000k fpp    0.1%               46.2     46.6     46.9   
0.00255X   0.00253X   0.00252X
-//            ndv  100000k fpp   10.0%                0.2      0.2      0.2   
1.1e-05X  1.08e-05X  1.07e-05X
-//            ndv  100000k fpp    1.0%                0.2      0.2      0.2   
1.1e-05X  1.08e-05X  1.07e-05X
-//            ndv  100000k fpp    0.1%              0.133    0.143    0.145  
7.35e-06X  7.75e-06X  7.79e-06X
+//            ndv      10k fpp   10.0%           5.43e+03 5.63e+03 5.67e+03    
     1X         1X         1X
+//            ndv      10k fpp    1.0%           2.82e+03 2.84e+03 2.87e+03    
  0.52X     0.505X     0.507X
+//            ndv      10k fpp    0.1%                780      803      812    
 0.144X     0.143X     0.143X
+//            ndv    1000k fpp   10.0%               16.2     16.5     16.7   
0.00298X   0.00292X   0.00294X
+//            ndv    1000k fpp    1.0%               7.75     8.04     8.11   
0.00143X   0.00143X   0.00143X
+//            ndv    1000k fpp    0.1%               7.96     8.11     8.11   
0.00147X   0.00144X   0.00143X
+//            ndv  100000k fpp   10.0%              0.045   0.0472   0.0478  
8.29e-06X  8.38e-06X  8.44e-06X
+//            ndv  100000k fpp    1.0%              0.045   0.0474   0.0478  
8.29e-06X  8.42e-06X  8.44e-06X
+//            ndv  100000k fpp    0.1%              0.023   0.0235   0.0238  
4.23e-06X  4.17e-06X   4.2e-06X
 //
 //
 // Without AVX or AVX2:
@@ -113,50 +113,50 @@ using namespace impala;
 // insert:                    Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//            ndv      10k fpp   10.0%           9.27e+04 9.33e+04  9.4e+04    
     1X         1X         1X
-//            ndv      10k fpp    1.0%           9.43e+04 9.49e+04 9.61e+04    
  1.02X      1.02X      1.02X
-//            ndv      10k fpp    0.1%           9.36e+04  9.5e+04 9.58e+04    
  1.01X      1.02X      1.02X
-//            ndv    1000k fpp   10.0%            8.4e+04 9.49e+04 9.61e+04    
 0.906X      1.02X      1.02X
-//            ndv    1000k fpp    1.0%           7.64e+04 9.34e+04 9.45e+04    
 0.824X         1X      1.01X
-//            ndv    1000k fpp    0.1%           8.24e+04 9.34e+04 9.44e+04    
 0.888X         1X         1X
-//            ndv  100000k fpp   10.0%           3.22e+04    4e+04 5.03e+04    
 0.347X     0.429X     0.535X
-//            ndv  100000k fpp    1.0%           2.77e+04  3.6e+04  4.8e+04    
 0.298X     0.386X      0.51X
-//            ndv  100000k fpp    0.1%           2.54e+04 2.93e+04 4.32e+04    
 0.274X     0.314X      0.46X
+//            ndv      10k fpp   10.0%           9.47e+04 9.52e+04  9.6e+04    
     1X         1X         1X
+//            ndv      10k fpp    1.0%           9.45e+04 9.53e+04 9.59e+04    
 0.998X         1X     0.998X
+//            ndv      10k fpp    0.1%            9.2e+04 9.56e+04 9.64e+04    
 0.972X         1X         1X
+//            ndv    1000k fpp   10.0%            9.2e+04 9.46e+04 9.57e+04    
 0.972X     0.993X     0.997X
+//            ndv    1000k fpp    1.0%           8.49e+04 9.32e+04 9.45e+04    
 0.896X     0.979X     0.984X
+//            ndv    1000k fpp    0.1%           8.37e+04 9.35e+04 9.47e+04    
 0.884X     0.981X     0.986X
+//            ndv  100000k fpp   10.0%           4.03e+04  5.1e+04 5.83e+04    
 0.425X     0.536X     0.607X
+//            ndv  100000k fpp    1.0%            3.2e+04 3.95e+04 5.11e+04    
 0.337X     0.415X     0.532X
+//            ndv  100000k fpp    0.1%           3.82e+04 4.52e+04 5.19e+04    
 0.404X     0.474X      0.54X
 //
 // find:                      Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//    present ndv      10k fpp   10.0%            1.3e+05 1.31e+05 1.33e+05    
     1X         1X         1X
-//    absent  ndv      10k fpp   10.0%           8.74e+04 8.83e+04 8.92e+04    
 0.674X     0.673X     0.671X
-//    present ndv      10k fpp    1.0%           1.25e+05  1.3e+05 1.31e+05    
  0.96X     0.991X     0.988X
-//    absent  ndv      10k fpp    1.0%           1.04e+05 1.06e+05 1.07e+05    
 0.805X     0.809X     0.807X
-//    present ndv      10k fpp    0.1%           1.28e+05  1.3e+05 1.31e+05    
 0.986X     0.988X     0.984X
-//    absent  ndv      10k fpp    0.1%           1.69e+05 1.72e+05 1.74e+05    
   1.3X      1.31X      1.31X
-//    present ndv    1000k fpp   10.0%           9.33e+04  9.6e+04 9.69e+04    
 0.719X     0.732X     0.729X
-//    absent  ndv    1000k fpp   10.0%           5.99e+04 6.07e+04 6.12e+04    
 0.462X     0.462X     0.461X
-//    present ndv    1000k fpp    1.0%           9.48e+04 1.01e+05 1.02e+05    
 0.731X     0.768X     0.768X
-//    absent  ndv    1000k fpp    1.0%           9.49e+04 9.67e+04 9.74e+04    
 0.731X     0.737X     0.734X
-//    present ndv    1000k fpp    0.1%           8.46e+04  9.3e+04 9.41e+04    
 0.652X     0.709X     0.709X
-//    absent  ndv    1000k fpp    0.1%           9.05e+04 9.18e+04 9.28e+04    
 0.697X       0.7X     0.699X
-//    present ndv  100000k fpp   10.0%            2.6e+04 2.88e+04 3.11e+04    
 0.201X      0.22X     0.235X
-//    absent  ndv  100000k fpp   10.0%           2.88e+04 2.99e+04 3.08e+04    
 0.222X     0.228X     0.232X
-//    present ndv  100000k fpp    1.0%           2.34e+04 2.76e+04 2.91e+04    
  0.18X      0.21X     0.219X
-//    absent  ndv  100000k fpp    1.0%           2.86e+04 2.97e+04 3.03e+04    
  0.22X     0.227X     0.228X
-//    present ndv  100000k fpp    0.1%           2.34e+04 2.65e+04 2.81e+04    
  0.18X     0.202X     0.211X
-//    absent  ndv  100000k fpp    0.1%           3.73e+04 3.85e+04 3.91e+04    
 0.287X     0.293X     0.295X
+//    present ndv      10k fpp   10.0%           1.25e+05  1.3e+05 1.31e+05    
     1X         1X         1X
+//    absent  ndv      10k fpp   10.0%           7.91e+04 7.99e+04 8.06e+04    
 0.633X     0.614X     0.613X
+//    present ndv      10k fpp    1.0%           1.26e+05 1.32e+05 1.33e+05    
  1.01X      1.01X      1.01X
+//    absent  ndv      10k fpp    1.0%           9.99e+04 1.01e+05 1.02e+05    
 0.799X     0.779X     0.777X
+//    present ndv      10k fpp    0.1%           1.25e+05 1.29e+05 1.29e+05    
 0.999X     0.989X     0.985X
+//    absent  ndv      10k fpp    0.1%           1.52e+05 1.66e+05 1.68e+05    
  1.21X      1.28X      1.28X
+//    present ndv    1000k fpp   10.0%           9.23e+04 9.61e+04 9.71e+04    
 0.739X     0.739X     0.739X
+//    absent  ndv    1000k fpp   10.0%           5.77e+04 5.84e+04 5.88e+04    
 0.462X     0.449X     0.448X
+//    present ndv    1000k fpp    1.0%           7.25e+04 9.08e+04 9.33e+04    
 0.581X     0.698X      0.71X
+//    absent  ndv    1000k fpp    1.0%            7.6e+04 8.97e+04 9.08e+04    
 0.608X      0.69X     0.691X
+//    present ndv    1000k fpp    0.1%           8.65e+04 9.35e+04 9.43e+04    
 0.692X     0.719X     0.717X
+//    absent  ndv    1000k fpp    0.1%           8.33e+04 8.98e+04 9.07e+04    
 0.667X      0.69X      0.69X
+//    present ndv  100000k fpp   10.0%           2.74e+04 3.06e+04 3.37e+04    
 0.219X     0.236X     0.256X
+//    absent  ndv  100000k fpp   10.0%           2.88e+04 2.98e+04 3.03e+04    
 0.231X     0.229X     0.231X
+//    present ndv  100000k fpp    1.0%           2.29e+04 2.82e+04 2.95e+04    
 0.184X     0.217X     0.224X
+//    absent  ndv  100000k fpp    1.0%           2.84e+04 2.94e+04 3.01e+04    
 0.227X     0.226X     0.229X
+//    present ndv  100000k fpp    0.1%           2.34e+04 2.72e+04 3.09e+04    
 0.187X     0.209X     0.235X
+//    absent  ndv  100000k fpp    0.1%            3.3e+04 3.84e+04 3.96e+04    
 0.264X     0.295X     0.301X
 //
 // union:                     Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//            ndv      10k fpp   10.0%           3.06e+03  3.1e+03 3.12e+03    
     1X         1X         1X
-//            ndv      10k fpp    1.0%           1.51e+03 1.55e+03 1.57e+03    
 0.493X     0.502X     0.503X
-//            ndv      10k fpp    0.1%                748      775      782    
 0.244X      0.25X     0.251X
-//            ndv    1000k fpp   10.0%               19.6       20     20.2    
0.0064X   0.00646X   0.00647X
-//            ndv    1000k fpp    1.0%               9.41       10     10.1   
0.00307X   0.00324X   0.00323X
-//            ndv    1000k fpp    0.1%                9.9       10     10.1   
0.00323X   0.00324X   0.00323X
-//            ndv  100000k fpp   10.0%             0.0671   0.0714   0.0725  
2.19e-05X   2.3e-05X  2.32e-05X
-//            ndv  100000k fpp    1.0%             0.0676   0.0709   0.0719  
2.21e-05X  2.29e-05X  2.31e-05X
-//            ndv  100000k fpp    0.1%             0.0338    0.035   0.0356   
1.1e-05X  1.13e-05X  1.14e-05X
+//            ndv      10k fpp   10.0%            3.9e+03 3.96e+03 3.99e+03    
     1X         1X         1X
+//            ndv      10k fpp    1.0%            1.9e+03 1.95e+03 1.96e+03    
 0.487X     0.492X     0.491X
+//            ndv      10k fpp    0.1%                630      638      643    
 0.161X     0.161X     0.161X
+//            ndv    1000k fpp   10.0%               15.5     15.8     15.9   
0.00397X   0.00399X   0.00399X
+//            ndv    1000k fpp    1.0%               7.52     7.74     7.88   
0.00193X   0.00196X   0.00197X
+//            ndv    1000k fpp    0.1%               7.46     7.88     7.89   
0.00191X   0.00199X   0.00198X
+//            ndv  100000k fpp   10.0%             0.0452   0.0474   0.0478  
1.16e-05X   1.2e-05X   1.2e-05X
+//            ndv  100000k fpp    1.0%             0.0452   0.0474   0.0478  
1.16e-05X   1.2e-05X   1.2e-05X
+//            ndv  100000k fpp    0.1%             0.0231   0.0235   0.0239  
5.92e-06X  5.93e-06X  5.98e-06X
 
 // Make a random uint32_t, avoiding the absent high bit and the low-entropy 
low bits
 // produced by rand().
@@ -255,16 +255,17 @@ namespace either {
 struct TestData {
   explicit TestData(int log_heap_size) {
     BloomFilter bf(log_heap_size);
-    BloomFilter::ToThrift(&bf, &tbf);
+    BloomFilter::ToThrift(&bf, &tbf1);
+    BloomFilter::ToThrift(&bf, &tbf2);
   }
 
-  TBloomFilter tbf;
+  TBloomFilter tbf1, tbf2;
 };
 
 void Benchmark(int batch_size, void* data) {
   TestData* d = reinterpret_cast<TestData*>(data);
   for (int i = 0; i < batch_size; ++i) {
-    BloomFilter::Or(d->tbf, &d->tbf);
+    BloomFilter::Or(d->tbf1, &d->tbf2);
   }
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/81c5b815/be/src/util/bloom-filter-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bloom-filter-test.cc b/be/src/util/bloom-filter-test.cc
index d47a7e0..9e63227 100644
--- a/be/src/util/bloom-filter-test.cc
+++ b/be/src/util/bloom-filter-test.cc
@@ -60,6 +60,26 @@ bool BfFind(BloomFilter& bf, uint32_t h) {
   }
 }
 
+// Computes union of 'x' and 'y'. Computes twice with AVX enabled and disabled 
and
+// verifies both produce the same result. 'success' is set to true if both 
union
+// computations returned the same result and set to false otherwise.
+TBloomFilter BfUnion(const BloomFilter& x, const BloomFilter& y, bool* 
success) {
+  TBloomFilter thrift_x, thrift_y;
+  BloomFilter::ToThrift(&x, &thrift_x);
+  BloomFilter::ToThrift(&y, &thrift_y);
+  BloomFilter::Or(thrift_x, &thrift_y);
+  {
+    CpuInfo::TempDisable t1(CpuInfo::AVX);
+    CpuInfo::TempDisable t2(CpuInfo::AVX2);
+    TBloomFilter thrift_x2, thrift_y2;
+    BloomFilter::ToThrift(&x, &thrift_x2);
+    BloomFilter::ToThrift(&y, &thrift_y2);
+    BloomFilter::Or(thrift_x2, &thrift_y2);
+    *success = thrift_y.directory == thrift_y2.directory;
+  }
+  return thrift_y;
+}
+
 }  // namespace
 
 namespace impala {
@@ -252,34 +272,20 @@ TEST(BloomFilter, ThriftOr) {
   for (int i = 60; i < 80; ++i) BfInsert(bf2, i);
   for (int i = 0; i < 10; ++i) BfInsert(bf1, i);
 
-  TBloomFilter bf1_thrift;
-  TBloomFilter bf2_thrift;
-
-  // Create TBloomFilter with BloomFilter values.
-  BloomFilter::ToThrift(&bf1, &bf1_thrift);
-  BloomFilter::ToThrift(&bf2, &bf2_thrift);
-
-  // Or the TBloomFilters.
-  BloomFilter::Or(bf1_thrift, &bf2_thrift);
-
-  // Apply aggregated TBloomFilter to BloomFilter to verify values with 
BfFind().
-  BloomFilter bf3(bf2_thrift);
-  for (int i = 0; i < 10; ++i) ASSERT_TRUE(BfFind(bf3, i));
-  for (int i = 60; i < 80; ++i) ASSERT_TRUE(BfFind(bf3, i));
+  bool success;
+  BloomFilter bf3(BfUnion(bf1, bf2, &success));
+  ASSERT_TRUE(success) << "SIMD BloomFilter::Union error";
+  for (int i = 0; i < 10; ++i) ASSERT_TRUE(BfFind(bf3, i)) << i;
+  for (int i = 60; i < 80; ++i) ASSERT_TRUE(BfFind(bf3, i)) << i;
 
   // Insert another value to aggregated BloomFilter.
   for (int i = 11; i < 50; ++i) BfInsert(bf3, i);
 
-  // Convert to TBloomFilter again and do Or().
-  TBloomFilter bf3_thrift;
-  BloomFilter::ToThrift(&bf3, &bf3_thrift);
-
-  BloomFilter::Or(bf1_thrift, &bf3_thrift);
-
   // Apply TBloomFilter back to BloomFilter and verify if aggregation was 
correct.
-  BloomFilter bf4(bf3_thrift);
-  for (int i = 11; i < 50; ++i) ASSERT_TRUE(BfFind(bf4, i));
-  for (int i = 60; i < 80; ++i) ASSERT_TRUE(BfFind(bf4, i));
+  BloomFilter bf4(BfUnion(bf1, bf3, &success));
+  ASSERT_TRUE(success) << "SIMD BloomFilter::Union error";
+  for (int i = 11; i < 50; ++i) ASSERT_TRUE(BfFind(bf4, i)) << i;
+  for (int i = 60; i < 80; ++i) ASSERT_TRUE(BfFind(bf4, i)) << i;
   ASSERT_FALSE(BfFind(bf4, 81));
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/81c5b815/be/src/util/bloom-filter.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bloom-filter.cc b/be/src/util/bloom-filter.cc
index 2aadc05..45238b0 100644
--- a/be/src/util/bloom-filter.cc
+++ b/be/src/util/bloom-filter.cc
@@ -157,15 +157,16 @@ bool BloomFilter::BucketFind(
 namespace {
 // Computes out[i] |= in[i] for the arrays 'in' and 'out' of length 'n' using 
AVX
 // instructions. 'n' must be a multiple of 32.
-void __attribute__((target("avx"))) OrEqualArrayAvx(size_t n, const char* in, 
char* out) {
-  constexpr size_t REGISTER_SIZE = sizeof(__m256d);
-  DCHECK_EQ(n % REGISTER_SIZE, 0) << "Invalid Bloom Filter directory size";
-  const double* simd_in = reinterpret_cast<const double*>(in);
-  double* simd_out = reinterpret_cast<double*>(out);
-  const size_t simd_size = n / REGISTER_SIZE;
-  for (size_t i = 0; i < simd_size; i += REGISTER_SIZE / sizeof(simd_in[0])) {
-    _mm256_storeu_pd(simd_out + i,
-        _mm256_or_pd(_mm256_loadu_pd(simd_out + i), _mm256_loadu_pd(simd_in + 
i)));
+void __attribute__((target("avx")))
+OrEqualArrayAvx(size_t n, const char* __restrict__ in, char* __restrict__ out) 
{
+  constexpr size_t AVX_REGISTER_BYTES = sizeof(__m256d);
+  DCHECK_EQ(n % AVX_REGISTER_BYTES, 0) << "Invalid Bloom Filter directory 
size";
+  const char* const in_end = in + n;
+  for (; in != in_end; (in += AVX_REGISTER_BYTES), (out += 
AVX_REGISTER_BYTES)) {
+    const double* double_in = reinterpret_cast<const double*>(in);
+    double* double_out = reinterpret_cast<double*>(out);
+    _mm256_storeu_pd(double_out,
+        _mm256_or_pd(_mm256_loadu_pd(double_out), _mm256_loadu_pd(double_in)));
   }
 }
 } //namespace
@@ -173,14 +174,19 @@ void __attribute__((target("avx"))) 
OrEqualArrayAvx(size_t n, const char* in, ch
 void BloomFilter::Or(const TBloomFilter& in, TBloomFilter* out) {
   DCHECK(out != NULL);
   DCHECK_EQ(in.log_heap_space, out->log_heap_space);
+  if (&in == out) return;
   out->always_true |= in.always_true;
   if (out->always_true) {
     out->directory.resize(0);
     return;
   }
+  DCHECK_EQ(in.directory.size(), out->directory.size())
+      << "Equal log heap space " << in.log_heap_space
+      << ", but different directory sizes: " << in.directory.size() << ", "
+      << out->directory.size();
   // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, 
but it is not
   // written in a way that is very friendly to auto-vectorization. Instead, we 
manually
-  // vectorize, increasing the speed by up to 184x.
+  // vectorize, increasing the speed by up to 56x.
   //
   // TODO: Tune gcc flags to auto-vectorize the trivial loop instead of 
hand-vectorizing
   // it. This might not be possible.
@@ -188,17 +194,17 @@ void BloomFilter::Or(const TBloomFilter& in, 
TBloomFilter* out) {
     OrEqualArrayAvx(in.directory.size(), &in.directory[0], &out->directory[0]);
   } else {
     const __m128i* simd_in = reinterpret_cast<const 
__m128i*>(&in.directory[0]);
+    const __m128i* const simd_in_end =
+        reinterpret_cast<const __m128i*>(&in.directory[0] + 
in.directory.size());
     __m128i* simd_out = reinterpret_cast<__m128i*>(&out->directory[0]);
-    const size_t simd_size =
-        (in.directory.size() * sizeof(in.directory[0])) / sizeof(simd_in[0]);
     // in.directory has a size (in bytes) that is a multiple of 32. Since 
sizeof(__m128i)
     // == 16, we can do two _mm_or_si128's in each iteration without checking 
array
     // bounds.
-    for (size_t i = 0; i < simd_size; i += 2) {
-      _mm_storeu_si128(simd_out + i,
-          _mm_or_si128(_mm_loadu_si128(simd_out + i), _mm_loadu_si128(simd_in 
+ i)));
-      _mm_storeu_si128(simd_out + i + 1, _mm_or_si128(_mm_loadu_si128(simd_out 
+ i + 1),
-                                             _mm_loadu_si128(simd_in + i + 
1)));
+    while (simd_in != simd_in_end) {
+      for (int i = 0; i < 2; ++i, ++simd_in, ++simd_out) {
+        _mm_storeu_si128(
+            simd_out, _mm_or_si128(_mm_loadu_si128(simd_out), 
_mm_loadu_si128(simd_in)));
+      }
     }
   }
 }

Reply via email to