IMPALA-4381: Incorrect AVX version of BloomFilter::Or The iteration on the loop incrementing the loop variable by the wrong amount. This version is hopefully clearer. I have tested it on an AVX_enabled machine and I have inspected the assembly.
Change-Id: I18480c598fbf6b842398581acde6fc719c40ce27 Reviewed-on: http://gerrit.cloudera.org:8080/4866 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/81c5b815 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/81c5b815 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/81c5b815 Branch: refs/heads/master Commit: 81c5b8150f3ee3ad8a1e1c1b0e768164225e2251 Parents: dba30cc Author: Jim Apple <[email protected]> Authored: Thu Oct 27 09:09:20 2016 -0700 Committer: Internal Jenkins <[email protected]> Committed: Wed Nov 2 20:05:47 2016 +0000 ---------------------------------------------------------------------- be/src/benchmarks/bloom-filter-benchmark.cc | 167 ++++++++++++----------- be/src/util/bloom-filter-test.cc | 52 +++---- be/src/util/bloom-filter.cc | 40 +++--- 3 files changed, 136 insertions(+), 123 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/81c5b815/be/src/benchmarks/bloom-filter-benchmark.cc ---------------------------------------------------------------------- diff --git a/be/src/benchmarks/bloom-filter-benchmark.cc b/be/src/benchmarks/bloom-filter-benchmark.cc index d9019c8..792be19 100644 --- a/be/src/benchmarks/bloom-filter-benchmark.cc +++ b/be/src/benchmarks/bloom-filter-benchmark.cc @@ -47,65 +47,65 @@ using namespace impala; // initialize: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// ndv 10k fpp 10.0% 5.89e+03 5.98e+03 6.03e+03 1X 1X 1X -// ndv 10k fpp 1.0% 3.22e+03 3.25e+03 3.27e+03 0.546X 0.543X 0.542X -// ndv 10k fpp 0.1% 1.13e+03 1.17e+03 1.18e+03 0.191X 0.195X 0.195X -// ndv 1000k fpp 10.0% 3.85 3.93 3.93 0.000654X 0.000657X 0.000652X -// ndv 1000k fpp 1.0% 2.04 2.12 2.12 0.000346X 0.000354X 0.000351X -// ndv 1000k fpp 0.1% 2.04 2.12 2.12 0.000346X 0.000354X 0.000351X -// ndv 100000k fpp 10.0% 0.0281 0.029 0.0294 4.77e-06X 4.85e-06X 4.87e-06X -// ndv 100000k fpp 1.0% 0.0284 0.029 0.0298 4.82e-06X 4.85e-06X 4.93e-06X -// ndv 100000k fpp 0.1% 0.0144 0.0147 0.0149 2.44e-06X 2.47e-06X 2.47e-06X +// ndv 10k fpp 10.0% 5.92e+03 5.98e+03 6.03e+03 1X 1X 1X +// ndv 10k fpp 1.0% 3.17e+03 3.24e+03 3.26e+03 0.535X 0.542X 0.541X +// ndv 10k fpp 0.1% 1.16e+03 1.17e+03 1.18e+03 0.195X 0.195X 0.195X +// ndv 1000k fpp 10.0% 3.85 3.93 3.93 0.000651X 0.000657X 0.000652X +// ndv 1000k fpp 1.0% 2.08 2.12 2.12 0.000351X 0.000354X 0.000351X +// ndv 1000k fpp 0.1% 2.08 2.12 2.12 0.000351X 0.000354X 0.000351X +// ndv 100000k fpp 10.0% 0.0299 0.0304 0.031 5.06e-06X 5.09e-06X 5.14e-06X +// ndv 100000k fpp 1.0% 0.0295 0.0306 0.0311 4.98e-06X 5.12e-06X 5.15e-06X +// ndv 100000k fpp 0.1% 0.0151 0.0153 0.0154 2.55e-06X 2.55e-06X 2.55e-06X // // With AVX2: // // insert: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// ndv 10k fpp 10.0% 1.17e+05 1.23e+05 1.25e+05 1X 1X 1X -// ndv 10k fpp 1.0% 1.17e+05 1.24e+05 1.25e+05 1X 1X 1X -// ndv 10k fpp 0.1% 1.2e+05 1.23e+05 1.24e+05 1.02X 0.996X 0.991X -// ndv 1000k fpp 10.0% 1.1e+05 1.18e+05 1.2e+05 0.944X 0.959X 0.96X -// ndv 1000k fpp 1.0% 1.11e+05 1.16e+05 1.17e+05 0.954X 0.938X 0.934X -// ndv 1000k fpp 0.1% 9.73e+04 1.16e+05 1.17e+05 0.834X 0.937X 0.936X -// ndv 100000k fpp 10.0% 2.96e+04 4.19e+04 5.44e+04 0.254X 0.34X 0.436X -// ndv 100000k fpp 1.0% 2.92e+04 3.81e+04 4.89e+04 0.25X 0.308X 0.391X -// ndv 100000k fpp 0.1% 2.44e+04 3.28e+04 4.31e+04 0.209X 0.266X 0.345X +// ndv 10k fpp 10.0% 1.22e+05 1.23e+05 1.24e+05 1X 1X 1X +// ndv 10k fpp 1.0% 1.22e+05 1.23e+05 1.24e+05 0.998X 1X 1X +// ndv 10k fpp 0.1% 1.22e+05 1.23e+05 1.24e+05 1X 1X 1X +// ndv 1000k fpp 10.0% 1.16e+05 1.18e+05 1.2e+05 0.95X 0.964X 0.965X +// ndv 1000k fpp 1.0% 1.14e+05 1.15e+05 1.16e+05 0.935X 0.941X 0.939X +// ndv 1000k fpp 0.1% 1.14e+05 1.16e+05 1.17e+05 0.939X 0.945X 0.943X +// ndv 100000k fpp 10.0% 3.35e+04 4.22e+04 5.3e+04 0.275X 0.344X 0.428X +// ndv 100000k fpp 1.0% 3.16e+04 4.77e+04 5.78e+04 0.26X 0.388X 0.466X +// ndv 100000k fpp 0.1% 3e+04 3.7e+04 4.66e+04 0.246X 0.301X 0.376X // // find: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- // present ndv 10k fpp 10.0% 1.16e+05 1.17e+05 1.18e+05 1X 1X 1X -// absent ndv 10k fpp 10.0% 1.16e+05 1.17e+05 1.18e+05 1X 1X 0.998X -// present ndv 10k fpp 1.0% 1.15e+05 1.17e+05 1.18e+05 0.988X 1X 0.999X -// absent ndv 10k fpp 1.0% 1.14e+05 1.17e+05 1.19e+05 0.978X 1X 1X -// present ndv 10k fpp 0.1% 1.09e+05 1.17e+05 1.18e+05 0.939X 1X 1X -// absent ndv 10k fpp 0.1% 1.13e+05 1.17e+05 1.18e+05 0.97X 1X 1X -// present ndv 1000k fpp 10.0% 1.09e+05 1.13e+05 1.15e+05 0.942X 0.968X 0.97X -// absent ndv 1000k fpp 10.0% 1.09e+05 1.15e+05 1.16e+05 0.937X 0.982X 0.982X -// present ndv 1000k fpp 1.0% 9.44e+04 1.12e+05 1.13e+05 0.814X 0.952X 0.951X -// absent ndv 1000k fpp 1.0% 1.02e+05 1.14e+05 1.15e+05 0.877X 0.973X 0.972X -// present ndv 1000k fpp 0.1% 1.01e+05 1.11e+05 1.12e+05 0.868X 0.951X 0.949X -// absent ndv 1000k fpp 0.1% 1.08e+05 1.14e+05 1.15e+05 0.927X 0.975X 0.975X -// present ndv 100000k fpp 10.0% 3.18e+04 3.94e+04 5.18e+04 0.274X 0.336X 0.437X -// absent ndv 100000k fpp 10.0% 2.74e+04 3.07e+04 3.49e+04 0.236X 0.262X 0.294X -// present ndv 100000k fpp 1.0% 3.07e+04 4.29e+04 5.51e+04 0.265X 0.366X 0.465X -// absent ndv 100000k fpp 1.0% 2.67e+04 2.9e+04 3.25e+04 0.23X 0.248X 0.274X -// present ndv 100000k fpp 0.1% 2.78e+04 3.88e+04 4.9e+04 0.24X 0.331X 0.413X -// absent ndv 100000k fpp 0.1% 2.44e+04 2.84e+04 3.02e+04 0.211X 0.242X 0.255X +// absent ndv 10k fpp 10.0% 1.15e+05 1.17e+05 1.18e+05 0.996X 0.998X 1X +// present ndv 10k fpp 1.0% 1.16e+05 1.17e+05 1.18e+05 0.999X 0.996X 1X +// absent ndv 10k fpp 1.0% 1.16e+05 1.17e+05 1.18e+05 1X 0.998X 0.999X +// present ndv 10k fpp 0.1% 1.16e+05 1.17e+05 1.18e+05 0.999X 0.997X 0.997X +// absent ndv 10k fpp 0.1% 1.16e+05 1.17e+05 1.18e+05 1X 0.996X 0.998X +// present ndv 1000k fpp 10.0% 1.09e+05 1.12e+05 1.14e+05 0.936X 0.958X 0.964X +// absent ndv 1000k fpp 10.0% 1.07e+05 1.14e+05 1.15e+05 0.921X 0.976X 0.976X +// present ndv 1000k fpp 1.0% 1.05e+05 1.1e+05 1.12e+05 0.906X 0.943X 0.946X +// absent ndv 1000k fpp 1.0% 1.11e+05 1.13e+05 1.14e+05 0.961X 0.966X 0.969X +// present ndv 1000k fpp 0.1% 9.78e+04 1.11e+05 1.12e+05 0.844X 0.944X 0.946X +// absent ndv 1000k fpp 0.1% 1.08e+05 1.13e+05 1.14e+05 0.93X 0.967X 0.97X +// present ndv 100000k fpp 10.0% 3.85e+04 4.53e+04 6.12e+04 0.332X 0.387X 0.518X +// absent ndv 100000k fpp 10.0% 2.54e+04 3.01e+04 3.26e+04 0.219X 0.257X 0.276X +// present ndv 100000k fpp 1.0% 3.3e+04 4.5e+04 6.06e+04 0.284X 0.384X 0.514X +// absent ndv 100000k fpp 1.0% 2.67e+04 3.01e+04 3.2e+04 0.23X 0.257X 0.271X +// present ndv 100000k fpp 0.1% 3.12e+04 4.25e+04 5.15e+04 0.269X 0.362X 0.436X +// absent ndv 100000k fpp 0.1% 2.39e+04 2.69e+04 2.84e+04 0.206X 0.229X 0.24X // // union: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// ndv 10k fpp 10.0% 1.81e+04 1.84e+04 1.86e+04 1X 1X 1X -// ndv 10k fpp 1.0% 8.25e+03 8.39e+03 8.47e+03 0.455X 0.455X 0.455X -// ndv 10k fpp 0.1% 4.02e+03 4.31e+03 4.35e+03 0.222X 0.234X 0.234X -// ndv 1000k fpp 10.0% 105 111 112 0.00577X 0.00603X 0.00602X -// ndv 1000k fpp 1.0% 45.9 46.4 46.9 0.00253X 0.00252X 0.00252X -// ndv 1000k fpp 0.1% 46.2 46.6 46.9 0.00255X 0.00253X 0.00252X -// ndv 100000k fpp 10.0% 0.2 0.2 0.2 1.1e-05X 1.08e-05X 1.07e-05X -// ndv 100000k fpp 1.0% 0.2 0.2 0.2 1.1e-05X 1.08e-05X 1.07e-05X -// ndv 100000k fpp 0.1% 0.133 0.143 0.145 7.35e-06X 7.75e-06X 7.79e-06X +// ndv 10k fpp 10.0% 5.43e+03 5.63e+03 5.67e+03 1X 1X 1X +// ndv 10k fpp 1.0% 2.82e+03 2.84e+03 2.87e+03 0.52X 0.505X 0.507X +// ndv 10k fpp 0.1% 780 803 812 0.144X 0.143X 0.143X +// ndv 1000k fpp 10.0% 16.2 16.5 16.7 0.00298X 0.00292X 0.00294X +// ndv 1000k fpp 1.0% 7.75 8.04 8.11 0.00143X 0.00143X 0.00143X +// ndv 1000k fpp 0.1% 7.96 8.11 8.11 0.00147X 0.00144X 0.00143X +// ndv 100000k fpp 10.0% 0.045 0.0472 0.0478 8.29e-06X 8.38e-06X 8.44e-06X +// ndv 100000k fpp 1.0% 0.045 0.0474 0.0478 8.29e-06X 8.42e-06X 8.44e-06X +// ndv 100000k fpp 0.1% 0.023 0.0235 0.0238 4.23e-06X 4.17e-06X 4.2e-06X // // // Without AVX or AVX2: @@ -113,50 +113,50 @@ using namespace impala; // insert: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// ndv 10k fpp 10.0% 9.27e+04 9.33e+04 9.4e+04 1X 1X 1X -// ndv 10k fpp 1.0% 9.43e+04 9.49e+04 9.61e+04 1.02X 1.02X 1.02X -// ndv 10k fpp 0.1% 9.36e+04 9.5e+04 9.58e+04 1.01X 1.02X 1.02X -// ndv 1000k fpp 10.0% 8.4e+04 9.49e+04 9.61e+04 0.906X 1.02X 1.02X -// ndv 1000k fpp 1.0% 7.64e+04 9.34e+04 9.45e+04 0.824X 1X 1.01X -// ndv 1000k fpp 0.1% 8.24e+04 9.34e+04 9.44e+04 0.888X 1X 1X -// ndv 100000k fpp 10.0% 3.22e+04 4e+04 5.03e+04 0.347X 0.429X 0.535X -// ndv 100000k fpp 1.0% 2.77e+04 3.6e+04 4.8e+04 0.298X 0.386X 0.51X -// ndv 100000k fpp 0.1% 2.54e+04 2.93e+04 4.32e+04 0.274X 0.314X 0.46X +// ndv 10k fpp 10.0% 9.47e+04 9.52e+04 9.6e+04 1X 1X 1X +// ndv 10k fpp 1.0% 9.45e+04 9.53e+04 9.59e+04 0.998X 1X 0.998X +// ndv 10k fpp 0.1% 9.2e+04 9.56e+04 9.64e+04 0.972X 1X 1X +// ndv 1000k fpp 10.0% 9.2e+04 9.46e+04 9.57e+04 0.972X 0.993X 0.997X +// ndv 1000k fpp 1.0% 8.49e+04 9.32e+04 9.45e+04 0.896X 0.979X 0.984X +// ndv 1000k fpp 0.1% 8.37e+04 9.35e+04 9.47e+04 0.884X 0.981X 0.986X +// ndv 100000k fpp 10.0% 4.03e+04 5.1e+04 5.83e+04 0.425X 0.536X 0.607X +// ndv 100000k fpp 1.0% 3.2e+04 3.95e+04 5.11e+04 0.337X 0.415X 0.532X +// ndv 100000k fpp 0.1% 3.82e+04 4.52e+04 5.19e+04 0.404X 0.474X 0.54X // // find: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// present ndv 10k fpp 10.0% 1.3e+05 1.31e+05 1.33e+05 1X 1X 1X -// absent ndv 10k fpp 10.0% 8.74e+04 8.83e+04 8.92e+04 0.674X 0.673X 0.671X -// present ndv 10k fpp 1.0% 1.25e+05 1.3e+05 1.31e+05 0.96X 0.991X 0.988X -// absent ndv 10k fpp 1.0% 1.04e+05 1.06e+05 1.07e+05 0.805X 0.809X 0.807X -// present ndv 10k fpp 0.1% 1.28e+05 1.3e+05 1.31e+05 0.986X 0.988X 0.984X -// absent ndv 10k fpp 0.1% 1.69e+05 1.72e+05 1.74e+05 1.3X 1.31X 1.31X -// present ndv 1000k fpp 10.0% 9.33e+04 9.6e+04 9.69e+04 0.719X 0.732X 0.729X -// absent ndv 1000k fpp 10.0% 5.99e+04 6.07e+04 6.12e+04 0.462X 0.462X 0.461X -// present ndv 1000k fpp 1.0% 9.48e+04 1.01e+05 1.02e+05 0.731X 0.768X 0.768X -// absent ndv 1000k fpp 1.0% 9.49e+04 9.67e+04 9.74e+04 0.731X 0.737X 0.734X -// present ndv 1000k fpp 0.1% 8.46e+04 9.3e+04 9.41e+04 0.652X 0.709X 0.709X -// absent ndv 1000k fpp 0.1% 9.05e+04 9.18e+04 9.28e+04 0.697X 0.7X 0.699X -// present ndv 100000k fpp 10.0% 2.6e+04 2.88e+04 3.11e+04 0.201X 0.22X 0.235X -// absent ndv 100000k fpp 10.0% 2.88e+04 2.99e+04 3.08e+04 0.222X 0.228X 0.232X -// present ndv 100000k fpp 1.0% 2.34e+04 2.76e+04 2.91e+04 0.18X 0.21X 0.219X -// absent ndv 100000k fpp 1.0% 2.86e+04 2.97e+04 3.03e+04 0.22X 0.227X 0.228X -// present ndv 100000k fpp 0.1% 2.34e+04 2.65e+04 2.81e+04 0.18X 0.202X 0.211X -// absent ndv 100000k fpp 0.1% 3.73e+04 3.85e+04 3.91e+04 0.287X 0.293X 0.295X +// present ndv 10k fpp 10.0% 1.25e+05 1.3e+05 1.31e+05 1X 1X 1X +// absent ndv 10k fpp 10.0% 7.91e+04 7.99e+04 8.06e+04 0.633X 0.614X 0.613X +// present ndv 10k fpp 1.0% 1.26e+05 1.32e+05 1.33e+05 1.01X 1.01X 1.01X +// absent ndv 10k fpp 1.0% 9.99e+04 1.01e+05 1.02e+05 0.799X 0.779X 0.777X +// present ndv 10k fpp 0.1% 1.25e+05 1.29e+05 1.29e+05 0.999X 0.989X 0.985X +// absent ndv 10k fpp 0.1% 1.52e+05 1.66e+05 1.68e+05 1.21X 1.28X 1.28X +// present ndv 1000k fpp 10.0% 9.23e+04 9.61e+04 9.71e+04 0.739X 0.739X 0.739X +// absent ndv 1000k fpp 10.0% 5.77e+04 5.84e+04 5.88e+04 0.462X 0.449X 0.448X +// present ndv 1000k fpp 1.0% 7.25e+04 9.08e+04 9.33e+04 0.581X 0.698X 0.71X +// absent ndv 1000k fpp 1.0% 7.6e+04 8.97e+04 9.08e+04 0.608X 0.69X 0.691X +// present ndv 1000k fpp 0.1% 8.65e+04 9.35e+04 9.43e+04 0.692X 0.719X 0.717X +// absent ndv 1000k fpp 0.1% 8.33e+04 8.98e+04 9.07e+04 0.667X 0.69X 0.69X +// present ndv 100000k fpp 10.0% 2.74e+04 3.06e+04 3.37e+04 0.219X 0.236X 0.256X +// absent ndv 100000k fpp 10.0% 2.88e+04 2.98e+04 3.03e+04 0.231X 0.229X 0.231X +// present ndv 100000k fpp 1.0% 2.29e+04 2.82e+04 2.95e+04 0.184X 0.217X 0.224X +// absent ndv 100000k fpp 1.0% 2.84e+04 2.94e+04 3.01e+04 0.227X 0.226X 0.229X +// present ndv 100000k fpp 0.1% 2.34e+04 2.72e+04 3.09e+04 0.187X 0.209X 0.235X +// absent ndv 100000k fpp 0.1% 3.3e+04 3.84e+04 3.96e+04 0.264X 0.295X 0.301X // // union: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// ndv 10k fpp 10.0% 3.06e+03 3.1e+03 3.12e+03 1X 1X 1X -// ndv 10k fpp 1.0% 1.51e+03 1.55e+03 1.57e+03 0.493X 0.502X 0.503X -// ndv 10k fpp 0.1% 748 775 782 0.244X 0.25X 0.251X -// ndv 1000k fpp 10.0% 19.6 20 20.2 0.0064X 0.00646X 0.00647X -// ndv 1000k fpp 1.0% 9.41 10 10.1 0.00307X 0.00324X 0.00323X -// ndv 1000k fpp 0.1% 9.9 10 10.1 0.00323X 0.00324X 0.00323X -// ndv 100000k fpp 10.0% 0.0671 0.0714 0.0725 2.19e-05X 2.3e-05X 2.32e-05X -// ndv 100000k fpp 1.0% 0.0676 0.0709 0.0719 2.21e-05X 2.29e-05X 2.31e-05X -// ndv 100000k fpp 0.1% 0.0338 0.035 0.0356 1.1e-05X 1.13e-05X 1.14e-05X +// ndv 10k fpp 10.0% 3.9e+03 3.96e+03 3.99e+03 1X 1X 1X +// ndv 10k fpp 1.0% 1.9e+03 1.95e+03 1.96e+03 0.487X 0.492X 0.491X +// ndv 10k fpp 0.1% 630 638 643 0.161X 0.161X 0.161X +// ndv 1000k fpp 10.0% 15.5 15.8 15.9 0.00397X 0.00399X 0.00399X +// ndv 1000k fpp 1.0% 7.52 7.74 7.88 0.00193X 0.00196X 0.00197X +// ndv 1000k fpp 0.1% 7.46 7.88 7.89 0.00191X 0.00199X 0.00198X +// ndv 100000k fpp 10.0% 0.0452 0.0474 0.0478 1.16e-05X 1.2e-05X 1.2e-05X +// ndv 100000k fpp 1.0% 0.0452 0.0474 0.0478 1.16e-05X 1.2e-05X 1.2e-05X +// ndv 100000k fpp 0.1% 0.0231 0.0235 0.0239 5.92e-06X 5.93e-06X 5.98e-06X // Make a random uint32_t, avoiding the absent high bit and the low-entropy low bits // produced by rand(). @@ -255,16 +255,17 @@ namespace either { struct TestData { explicit TestData(int log_heap_size) { BloomFilter bf(log_heap_size); - BloomFilter::ToThrift(&bf, &tbf); + BloomFilter::ToThrift(&bf, &tbf1); + BloomFilter::ToThrift(&bf, &tbf2); } - TBloomFilter tbf; + TBloomFilter tbf1, tbf2; }; void Benchmark(int batch_size, void* data) { TestData* d = reinterpret_cast<TestData*>(data); for (int i = 0; i < batch_size; ++i) { - BloomFilter::Or(d->tbf, &d->tbf); + BloomFilter::Or(d->tbf1, &d->tbf2); } } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/81c5b815/be/src/util/bloom-filter-test.cc ---------------------------------------------------------------------- diff --git a/be/src/util/bloom-filter-test.cc b/be/src/util/bloom-filter-test.cc index d47a7e0..9e63227 100644 --- a/be/src/util/bloom-filter-test.cc +++ b/be/src/util/bloom-filter-test.cc @@ -60,6 +60,26 @@ bool BfFind(BloomFilter& bf, uint32_t h) { } } +// Computes union of 'x' and 'y'. Computes twice with AVX enabled and disabled and +// verifies both produce the same result. 'success' is set to true if both union +// computations returned the same result and set to false otherwise. +TBloomFilter BfUnion(const BloomFilter& x, const BloomFilter& y, bool* success) { + TBloomFilter thrift_x, thrift_y; + BloomFilter::ToThrift(&x, &thrift_x); + BloomFilter::ToThrift(&y, &thrift_y); + BloomFilter::Or(thrift_x, &thrift_y); + { + CpuInfo::TempDisable t1(CpuInfo::AVX); + CpuInfo::TempDisable t2(CpuInfo::AVX2); + TBloomFilter thrift_x2, thrift_y2; + BloomFilter::ToThrift(&x, &thrift_x2); + BloomFilter::ToThrift(&y, &thrift_y2); + BloomFilter::Or(thrift_x2, &thrift_y2); + *success = thrift_y.directory == thrift_y2.directory; + } + return thrift_y; +} + } // namespace namespace impala { @@ -252,34 +272,20 @@ TEST(BloomFilter, ThriftOr) { for (int i = 60; i < 80; ++i) BfInsert(bf2, i); for (int i = 0; i < 10; ++i) BfInsert(bf1, i); - TBloomFilter bf1_thrift; - TBloomFilter bf2_thrift; - - // Create TBloomFilter with BloomFilter values. - BloomFilter::ToThrift(&bf1, &bf1_thrift); - BloomFilter::ToThrift(&bf2, &bf2_thrift); - - // Or the TBloomFilters. - BloomFilter::Or(bf1_thrift, &bf2_thrift); - - // Apply aggregated TBloomFilter to BloomFilter to verify values with BfFind(). - BloomFilter bf3(bf2_thrift); - for (int i = 0; i < 10; ++i) ASSERT_TRUE(BfFind(bf3, i)); - for (int i = 60; i < 80; ++i) ASSERT_TRUE(BfFind(bf3, i)); + bool success; + BloomFilter bf3(BfUnion(bf1, bf2, &success)); + ASSERT_TRUE(success) << "SIMD BloomFilter::Union error"; + for (int i = 0; i < 10; ++i) ASSERT_TRUE(BfFind(bf3, i)) << i; + for (int i = 60; i < 80; ++i) ASSERT_TRUE(BfFind(bf3, i)) << i; // Insert another value to aggregated BloomFilter. for (int i = 11; i < 50; ++i) BfInsert(bf3, i); - // Convert to TBloomFilter again and do Or(). - TBloomFilter bf3_thrift; - BloomFilter::ToThrift(&bf3, &bf3_thrift); - - BloomFilter::Or(bf1_thrift, &bf3_thrift); - // Apply TBloomFilter back to BloomFilter and verify if aggregation was correct. - BloomFilter bf4(bf3_thrift); - for (int i = 11; i < 50; ++i) ASSERT_TRUE(BfFind(bf4, i)); - for (int i = 60; i < 80; ++i) ASSERT_TRUE(BfFind(bf4, i)); + BloomFilter bf4(BfUnion(bf1, bf3, &success)); + ASSERT_TRUE(success) << "SIMD BloomFilter::Union error"; + for (int i = 11; i < 50; ++i) ASSERT_TRUE(BfFind(bf4, i)) << i; + for (int i = 60; i < 80; ++i) ASSERT_TRUE(BfFind(bf4, i)) << i; ASSERT_FALSE(BfFind(bf4, 81)); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/81c5b815/be/src/util/bloom-filter.cc ---------------------------------------------------------------------- diff --git a/be/src/util/bloom-filter.cc b/be/src/util/bloom-filter.cc index 2aadc05..45238b0 100644 --- a/be/src/util/bloom-filter.cc +++ b/be/src/util/bloom-filter.cc @@ -157,15 +157,16 @@ bool BloomFilter::BucketFind( namespace { // Computes out[i] |= in[i] for the arrays 'in' and 'out' of length 'n' using AVX // instructions. 'n' must be a multiple of 32. -void __attribute__((target("avx"))) OrEqualArrayAvx(size_t n, const char* in, char* out) { - constexpr size_t REGISTER_SIZE = sizeof(__m256d); - DCHECK_EQ(n % REGISTER_SIZE, 0) << "Invalid Bloom Filter directory size"; - const double* simd_in = reinterpret_cast<const double*>(in); - double* simd_out = reinterpret_cast<double*>(out); - const size_t simd_size = n / REGISTER_SIZE; - for (size_t i = 0; i < simd_size; i += REGISTER_SIZE / sizeof(simd_in[0])) { - _mm256_storeu_pd(simd_out + i, - _mm256_or_pd(_mm256_loadu_pd(simd_out + i), _mm256_loadu_pd(simd_in + i))); +void __attribute__((target("avx"))) +OrEqualArrayAvx(size_t n, const char* __restrict__ in, char* __restrict__ out) { + constexpr size_t AVX_REGISTER_BYTES = sizeof(__m256d); + DCHECK_EQ(n % AVX_REGISTER_BYTES, 0) << "Invalid Bloom Filter directory size"; + const char* const in_end = in + n; + for (; in != in_end; (in += AVX_REGISTER_BYTES), (out += AVX_REGISTER_BYTES)) { + const double* double_in = reinterpret_cast<const double*>(in); + double* double_out = reinterpret_cast<double*>(out); + _mm256_storeu_pd(double_out, + _mm256_or_pd(_mm256_loadu_pd(double_out), _mm256_loadu_pd(double_in))); } } } //namespace @@ -173,14 +174,19 @@ void __attribute__((target("avx"))) OrEqualArrayAvx(size_t n, const char* in, ch void BloomFilter::Or(const TBloomFilter& in, TBloomFilter* out) { DCHECK(out != NULL); DCHECK_EQ(in.log_heap_space, out->log_heap_space); + if (&in == out) return; out->always_true |= in.always_true; if (out->always_true) { out->directory.resize(0); return; } + DCHECK_EQ(in.directory.size(), out->directory.size()) + << "Equal log heap space " << in.log_heap_space + << ", but different directory sizes: " << in.directory.size() << ", " + << out->directory.size(); // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not // written in a way that is very friendly to auto-vectorization. Instead, we manually - // vectorize, increasing the speed by up to 184x. + // vectorize, increasing the speed by up to 56x. // // TODO: Tune gcc flags to auto-vectorize the trivial loop instead of hand-vectorizing // it. This might not be possible. @@ -188,17 +194,17 @@ void BloomFilter::Or(const TBloomFilter& in, TBloomFilter* out) { OrEqualArrayAvx(in.directory.size(), &in.directory[0], &out->directory[0]); } else { const __m128i* simd_in = reinterpret_cast<const __m128i*>(&in.directory[0]); + const __m128i* const simd_in_end = + reinterpret_cast<const __m128i*>(&in.directory[0] + in.directory.size()); __m128i* simd_out = reinterpret_cast<__m128i*>(&out->directory[0]); - const size_t simd_size = - (in.directory.size() * sizeof(in.directory[0])) / sizeof(simd_in[0]); // in.directory has a size (in bytes) that is a multiple of 32. Since sizeof(__m128i) // == 16, we can do two _mm_or_si128's in each iteration without checking array // bounds. - for (size_t i = 0; i < simd_size; i += 2) { - _mm_storeu_si128(simd_out + i, - _mm_or_si128(_mm_loadu_si128(simd_out + i), _mm_loadu_si128(simd_in + i))); - _mm_storeu_si128(simd_out + i + 1, _mm_or_si128(_mm_loadu_si128(simd_out + i + 1), - _mm_loadu_si128(simd_in + i + 1))); + while (simd_in != simd_in_end) { + for (int i = 0; i < 2; ++i, ++simd_in, ++simd_out) { + _mm_storeu_si128( + simd_out, _mm_or_si128(_mm_loadu_si128(simd_out), _mm_loadu_si128(simd_in))); + } } } }
