This is an automated email from the ASF dual-hosted git repository. boroknagyz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push: new 1776c7a Revert "IMPALA-8710: Increase allowed bit width to 64 for bit packing" 1776c7a is described below commit 1776c7a54adb929bbd41eea50aed7665282f46f0 Author: Zoltan Borok-Nagy <borokna...@cloudera.com> AuthorDate: Fri Jul 5 15:15:57 2019 +0000 Revert "IMPALA-8710: Increase allowed bit width to 64 for bit packing" This reverts commit b1cbf9e6b786132e86699cbb1e472ec98499bb11. Change-Id: If18a083c72ed036f0b493506a43e20b2c2450a41 Reviewed-on: http://gerrit.cloudera.org:8080/13808 Reviewed-by: Attila Jeges <atti...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> --- be/src/benchmarks/bit-packing-benchmark.cc | 198 ++++++++++++++--------------- be/src/util/CMakeLists.txt | 2 +- be/src/util/bit-packing-test.cc | 26 ++-- be/src/util/bit-packing.h | 10 +- be/src/util/bit-packing.inline.h | 103 +++++++-------- be/src/util/bit-stream-utils-test.cc | 147 ++------------------- be/src/util/bit-stream-utils.h | 39 ++---- be/src/util/bit-stream-utils.inline.h | 87 +++---------- be/src/util/rle-encoding.h | 6 +- be/src/util/rle-test.cc | 2 +- 10 files changed, 205 insertions(+), 415 deletions(-) diff --git a/be/src/benchmarks/bit-packing-benchmark.cc b/be/src/benchmarks/bit-packing-benchmark.cc index 6c895ad..79ac68b 100644 --- a/be/src/benchmarks/bit-packing-benchmark.cc +++ b/be/src/benchmarks/bit-packing-benchmark.cc @@ -24,237 +24,237 @@ // Unpack32Scalar internally. // // -// Machine Info: Intel(R) Core(TM) i7-7700 CPU @ 3.60GHz +// Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz // Unpack32Values bit_width 0:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.05e+04 1.06e+04 1.07e+04 1X 1X 1X -// Unpack32Scalar 1.29e+05 1.32e+05 1.34e+05 12.3X 12.5X 12.5X -// UnpackScalar 3.18e+05 3.23e+05 3.26e+05 30.3X 30.4X 30.5X +// BitReader 1.57e+04 1.59e+04 1.6e+04 1X 1X 1X +// Unpack32Scalar 1.34e+05 1.35e+05 1.36e+05 8.51X 8.49X 8.51X +// UnpackScalar 2.08e+05 2.1e+05 2.12e+05 13.3X 13.2X 13.2X // // Unpack32Values bit_width 1:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.28e+04 1.29e+04 1.31e+04 1X 1X 1X -// Unpack32Scalar 8.22e+04 8.42e+04 8.5e+04 6.4X 6.51X 6.5X -// UnpackScalar 7.96e+04 8.08e+04 8.16e+04 6.2X 6.24X 6.24X +// BitReader 1.19e+04 1.2e+04 1.2e+04 1X 1X 1X +// Unpack32Scalar 8.89e+04 8.94e+04 9.04e+04 7.48X 7.46X 7.51X +// UnpackScalar 9.72e+04 9.8e+04 9.86e+04 8.18X 8.18X 8.19X // // Unpack32Values bit_width 2:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- // BitReader 1.18e+04 1.19e+04 1.2e+04 1X 1X 1X -// Unpack32Scalar 8.37e+04 8.41e+04 8.49e+04 7.06X 7.06X 7.06X -// UnpackScalar 7.6e+04 7.66e+04 7.73e+04 6.42X 6.44X 6.43X +// Unpack32Scalar 8.84e+04 8.91e+04 8.99e+04 7.49X 7.48X 7.5X +// UnpackScalar 9.68e+04 9.76e+04 9.84e+04 8.2X 8.19X 8.21X // // Unpack32Values bit_width 3:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.26e+04 1.27e+04 1.28e+04 1X 1X 1X -// Unpack32Scalar 8.31e+04 8.37e+04 8.44e+04 6.61X 6.6X 6.61X -// UnpackScalar 9.42e+04 9.54e+04 9.61e+04 7.49X 7.52X 7.52X +// BitReader 1.16e+04 1.17e+04 1.18e+04 1X 1X 1X +// Unpack32Scalar 8.67e+04 8.72e+04 8.79e+04 7.45X 7.42X 7.43X +// UnpackScalar 9.6e+04 9.66e+04 9.74e+04 8.25X 8.22X 8.24X // // Unpack32Values bit_width 4:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.26e+04 1.27e+04 1.28e+04 1X 1X 1X -// Unpack32Scalar 8.4e+04 8.46e+04 8.52e+04 6.68X 6.68X 6.67X -// UnpackScalar 7.46e+04 7.52e+04 7.59e+04 5.93X 5.93X 5.94X +// BitReader 1.08e+04 1.09e+04 1.1e+04 1X 1X 1X +// Unpack32Scalar 9.13e+04 9.19e+04 9.25e+04 8.44X 8.43X 8.42X +// UnpackScalar 9.65e+04 9.69e+04 9.78e+04 8.91X 8.89X 8.9X // // Unpack32Values bit_width 5:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.23e+04 1.25e+04 1.25e+04 1X 1X 1X -// Unpack32Scalar 8.25e+04 8.32e+04 8.39e+04 6.68X 6.67X 6.69X -// UnpackScalar 9.37e+04 9.43e+04 9.52e+04 7.6X 7.56X 7.58X +// BitReader 1.14e+04 1.15e+04 1.16e+04 1X 1X 1X +// Unpack32Scalar 8.35e+04 8.42e+04 8.49e+04 7.3X 7.31X 7.31X +// UnpackScalar 9.41e+04 9.48e+04 9.56e+04 8.22X 8.22X 8.24X // // Unpack32Values bit_width 6:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.23e+04 1.24e+04 1.25e+04 1X 1X 1X -// Unpack32Scalar 8.23e+04 8.32e+04 8.39e+04 6.69X 6.7X 6.71X -// UnpackScalar 9.28e+04 9.42e+04 9.52e+04 7.54X 7.59X 7.61X +// BitReader 1.14e+04 1.15e+04 1.16e+04 1X 1X 1X +// Unpack32Scalar 8.46e+04 8.53e+04 8.6e+04 7.4X 7.41X 7.41X +// UnpackScalar 9.35e+04 9.41e+04 9.51e+04 8.18X 8.16X 8.2X // // Unpack32Values bit_width 7:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.2e+04 1.23e+04 1.24e+04 1X 1X 1X -// Unpack32Scalar 8.19e+04 8.26e+04 8.32e+04 6.8X 6.73X 6.72X -// UnpackScalar 9.26e+04 9.36e+04 9.46e+04 7.69X 7.62X 7.64X +// BitReader 1.09e+04 1.1e+04 1.11e+04 1X 1X 1X +// Unpack32Scalar 8.11e+04 8.16e+04 8.25e+04 7.44X 7.44X 7.45X +// UnpackScalar 9.16e+04 9.21e+04 9.3e+04 8.4X 8.4X 8.39X // // Unpack32Values bit_width 8:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.15e+04 1.2e+04 1.23e+04 1X 1X 1X -// Unpack32Scalar 6.33e+04 8.31e+04 8.51e+04 5.5X 6.9X 6.94X -// UnpackScalar 5.64e+04 7e+04 7.29e+04 4.91X 5.82X 5.95X +// BitReader 1.14e+04 1.15e+04 1.16e+04 1X 1X 1X +// Unpack32Scalar 9.02e+04 9.07e+04 9.14e+04 7.9X 7.9X 7.91X +// UnpackScalar 9.48e+04 9.55e+04 9.63e+04 8.31X 8.33X 8.33X // // Unpack32Values bit_width 9:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.07e+04 1.14e+04 1.16e+04 1X 1X 1X -// Unpack32Scalar 6.33e+04 8.16e+04 8.27e+04 5.9X 7.12X 7.14X -// UnpackScalar 6.95e+04 9.22e+04 9.32e+04 6.47X 8.05X 8.05X +// BitReader 1.11e+04 1.12e+04 1.13e+04 1X 1X 1X +// Unpack32Scalar 7.94e+04 7.97e+04 8.06e+04 7.14X 7.12X 7.14X +// UnpackScalar 8.78e+04 8.83e+04 8.9e+04 7.89X 7.88X 7.89X // // Unpack32Values bit_width 10:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.18e+04 1.19e+04 1.2e+04 1X 1X 1X -// Unpack32Scalar 8.13e+04 8.18e+04 8.23e+04 6.89X 6.85X 6.84X -// UnpackScalar 9.17e+04 9.26e+04 9.34e+04 7.76X 7.76X 7.75X +// BitReader 1.1e+04 1.11e+04 1.12e+04 1X 1X 1X +// Unpack32Scalar 8.07e+04 8.14e+04 8.21e+04 7.31X 7.32X 7.34X +// UnpackScalar 8.95e+04 9.02e+04 9.09e+04 8.11X 8.12X 8.12X // // Unpack32Values bit_width 11:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.17e+04 1.18e+04 1.19e+04 1X 1X 1X -// Unpack32Scalar 8.07e+04 8.14e+04 8.22e+04 6.89X 6.9X 6.92X -// UnpackScalar 9.13e+04 9.22e+04 9.29e+04 7.79X 7.82X 7.83X +// BitReader 1.09e+04 1.1e+04 1.11e+04 1X 1X 1X +// Unpack32Scalar 7.63e+04 7.69e+04 7.75e+04 6.99X 6.99X 6.99X +// UnpackScalar 8.55e+04 8.61e+04 8.69e+04 7.83X 7.83X 7.84X // // Unpack32Values bit_width 12:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.17e+04 1.17e+04 1.19e+04 1X 1X 1X -// Unpack32Scalar 8.07e+04 8.14e+04 8.2e+04 6.92X 6.93X 6.92X -// UnpackScalar 8.94e+04 9.14e+04 9.23e+04 7.67X 7.78X 7.79X +// BitReader 1.09e+04 1.1e+04 1.1e+04 1X 1X 1X +// Unpack32Scalar 8.23e+04 8.29e+04 8.35e+04 7.55X 7.56X 7.57X +// UnpackScalar 9.06e+04 9.12e+04 9.19e+04 8.31X 8.31X 8.33X // // Unpack32Values bit_width 13:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.13e+04 1.15e+04 1.16e+04 1X 1X 1X -// Unpack32Scalar 7.85e+04 8.05e+04 8.14e+04 6.93X 6.98X 6.99X -// UnpackScalar 8.77e+04 9.09e+04 9.17e+04 7.74X 7.88X 7.87X +// BitReader 1.07e+04 1.08e+04 1.09e+04 1X 1X 1X +// Unpack32Scalar 7.42e+04 7.47e+04 7.55e+04 6.92X 6.9X 6.92X +// UnpackScalar 8.16e+04 8.23e+04 8.29e+04 7.6X 7.6X 7.61X // // Unpack32Values bit_width 14:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.14e+04 1.15e+04 1.16e+04 1X 1X 1X -// Unpack32Scalar 7.81e+04 8e+04 8.06e+04 6.84X 6.95X 6.94X -// UnpackScalar 8.87e+04 9.01e+04 9.1e+04 7.77X 7.83X 7.84X +// BitReader 1.07e+04 1.08e+04 1.09e+04 1X 1X 1X +// Unpack32Scalar 7.58e+04 7.62e+04 7.68e+04 7.08X 7.08X 7.08X +// UnpackScalar 8.33e+04 8.38e+04 8.46e+04 7.78X 7.78X 7.79X // // Unpack32Values bit_width 15:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.12e+04 1.13e+04 1.14e+04 1X 1X 1X -// Unpack32Scalar 7.66e+04 7.96e+04 8.06e+04 6.85X 7.01X 7.04X -// UnpackScalar 8.81e+04 8.94e+04 9.04e+04 7.87X 7.88X 7.9X +// BitReader 1.06e+04 1.06e+04 1.07e+04 1X 1X 1X +// Unpack32Scalar 7.16e+04 7.22e+04 7.29e+04 6.78X 6.79X 6.79X +// UnpackScalar 7.96e+04 8.05e+04 8.09e+04 7.54X 7.57X 7.54X // // Unpack32Values bit_width 16:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.13e+04 1.13e+04 1.14e+04 1X 1X 1X -// Unpack32Scalar 8.11e+04 8.19e+04 8.27e+04 7.2X 7.22X 7.23X -// UnpackScalar 8.82e+04 8.91e+04 8.97e+04 7.84X 7.85X 7.85X +// BitReader 1.08e+04 1.08e+04 1.09e+04 1X 1X 1X +// Unpack32Scalar 8.71e+04 8.76e+04 8.83e+04 8.09X 8.09X 8.08X +// UnpackScalar 9.22e+04 9.3e+04 9.37e+04 8.56X 8.58X 8.57X // // Unpack32Values bit_width 17:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.11e+04 1.12e+04 1.13e+04 1X 1X 1X -// Unpack32Scalar 7.75e+04 7.86e+04 7.91e+04 7X 7.04X 7.03X -// UnpackScalar 8.75e+04 8.82e+04 8.89e+04 7.89X 7.91X 7.9X +// BitReader 1.04e+04 1.04e+04 1.05e+04 1X 1X 1X +// Unpack32Scalar 6.98e+04 7.04e+04 7.09e+04 6.73X 6.74X 6.74X +// UnpackScalar 7.73e+04 7.78e+04 7.85e+04 7.45X 7.45X 7.47X // // Unpack32Values bit_width 18:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.09e+04 1.11e+04 1.12e+04 1X 1X 1X -// Unpack32Scalar 7.71e+04 7.81e+04 7.88e+04 7.06X 7.02X 7.03X -// UnpackScalar 8.68e+04 8.81e+04 8.89e+04 7.95X 7.92X 7.93X +// BitReader 1.03e+04 1.04e+04 1.05e+04 1X 1X 1X +// Unpack32Scalar 7.1e+04 7.17e+04 7.22e+04 6.86X 6.88X 6.87X +// UnpackScalar 7.77e+04 7.82e+04 7.89e+04 7.51X 7.5X 7.51X // // Unpack32Values bit_width 19:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.09e+04 1.1e+04 1.11e+04 1X 1X 1X -// Unpack32Scalar 7.69e+04 7.75e+04 7.83e+04 7.06X 7.07X 7.07X -// UnpackScalar 8.62e+04 8.72e+04 8.78e+04 7.91X 7.96X 7.93X +// BitReader 1.02e+04 1.03e+04 1.04e+04 1X 1X 1X +// Unpack32Scalar 6.74e+04 6.8e+04 6.85e+04 6.59X 6.6X 6.61X +// UnpackScalar 7.43e+04 7.49e+04 7.54e+04 7.26X 7.27X 7.28X // // Unpack32Values bit_width 20:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.07e+04 1.1e+04 1.11e+04 1X 1X 1X -// Unpack32Scalar 7.58e+04 7.74e+04 7.82e+04 7.1X 7.06X 7.07X -// UnpackScalar 8.43e+04 8.64e+04 8.74e+04 7.89X 7.89X 7.9X +// BitReader 1.02e+04 1.03e+04 1.03e+04 1X 1X 1X +// Unpack32Scalar 7.28e+04 7.34e+04 7.4e+04 7.15X 7.15X 7.15X +// UnpackScalar 7.94e+04 8.02e+04 8.07e+04 7.8X 7.81X 7.8X // // Unpack32Values bit_width 21:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.07e+04 1.08e+04 1.09e+04 1X 1X 1X -// Unpack32Scalar 7.58e+04 7.66e+04 7.76e+04 7.07X 7.09X 7.14X -// UnpackScalar 8.51e+04 8.61e+04 8.69e+04 7.95X 7.97X 8X +// BitReader 1.01e+04 1.01e+04 1.02e+04 1X 1X 1X +// Unpack32Scalar 6.56e+04 6.62e+04 6.67e+04 6.53X 6.54X 6.54X +// UnpackScalar 7.1e+04 7.15e+04 7.19e+04 7.06X 7.06X 7.06X // // Unpack32Values bit_width 22:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.05e+04 1.08e+04 1.09e+04 1X 1X 1X -// Unpack32Scalar 7.49e+04 7.62e+04 7.71e+04 7.1X 7.08X 7.1X -// UnpackScalar 8.39e+04 8.59e+04 8.65e+04 7.96X 7.98X 7.97X +// BitReader 1e+04 1.01e+04 1.02e+04 1X 1X 1X +// Unpack32Scalar 6.68e+04 6.73e+04 6.79e+04 6.68X 6.68X 6.68X +// UnpackScalar 7.35e+04 7.41e+04 7.46e+04 7.34X 7.35X 7.35X // // Unpack32Values bit_width 23:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.06e+04 1.07e+04 1.08e+04 1X 1X 1X -// Unpack32Scalar 7.44e+04 7.54e+04 7.62e+04 7.03X 7.07X 7.08X -// UnpackScalar 8.44e+04 8.54e+04 8.61e+04 7.97X 8X 8X +// BitReader 9.87e+03 9.95e+03 1e+04 1X 1X 1X +// Unpack32Scalar 6.44e+04 6.48e+04 6.53e+04 6.52X 6.52X 6.51X +// UnpackScalar 6.93e+04 6.97e+04 7.04e+04 7.03X 7.01X 7.02X // // Unpack32Values bit_width 24:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.05e+04 1.06e+04 1.07e+04 1X 1X 1X -// Unpack32Scalar 7.54e+04 7.62e+04 7.69e+04 7.17X 7.18X 7.2X -// UnpackScalar 8.42e+04 8.51e+04 8.62e+04 8.01X 8.01X 8.07X +// BitReader 9.93e+03 1e+04 1.01e+04 1X 1X 1X +// Unpack32Scalar 7.44e+04 7.49e+04 7.55e+04 7.49X 7.49X 7.49X +// UnpackScalar 8.12e+04 8.17e+04 8.27e+04 8.18X 8.17X 8.2X // // Unpack32Values bit_width 25:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.04e+04 1.05e+04 1.05e+04 1X 1X 1X -// Unpack32Scalar 7.42e+04 7.49e+04 7.57e+04 7.15X 7.17X 7.18X -// UnpackScalar 8.32e+04 8.43e+04 8.51e+04 8.01X 8.07X 8.07X +// BitReader 9.71e+03 9.79e+03 9.86e+03 1X 1X 1X +// Unpack32Scalar 6.12e+04 6.16e+04 6.22e+04 6.31X 6.29X 6.31X +// UnpackScalar 6.44e+04 6.48e+04 6.53e+04 6.64X 6.62X 6.62X // // Unpack32Values bit_width 26:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.02e+04 1.04e+04 1.05e+04 1X 1X 1X -// Unpack32Scalar 7.24e+04 7.46e+04 7.55e+04 7.12X 7.16X 7.19X -// UnpackScalar 8.15e+04 8.43e+04 8.51e+04 8.01X 8.09X 8.1X +// BitReader 9.67e+03 9.74e+03 9.81e+03 1X 1X 1X +// Unpack32Scalar 6.21e+04 6.26e+04 6.31e+04 6.42X 6.42X 6.43X +// UnpackScalar 6.53e+04 6.59e+04 6.64e+04 6.75X 6.77X 6.76X // // Unpack32Values bit_width 27:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.01e+04 1.03e+04 1.04e+04 1X 1X 1X -// Unpack32Scalar 7.26e+04 7.4e+04 7.47e+04 7.18X 7.18X 7.2X -// UnpackScalar 8.13e+04 8.35e+04 8.43e+04 8.03X 8.11X 8.12X +// BitReader 9.56e+03 9.62e+03 9.7e+03 1X 1X 1X +// Unpack32Scalar 5.99e+04 6.03e+04 6.09e+04 6.27X 6.27X 6.28X +// UnpackScalar 6.32e+04 6.35e+04 6.42e+04 6.61X 6.6X 6.62X // // Unpack32Values bit_width 28:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1e+04 1.03e+04 1.03e+04 1X 1X 1X -// Unpack32Scalar 7.07e+04 7.41e+04 7.48e+04 7.05X 7.22X 7.23X -// UnpackScalar 7.97e+04 8.34e+04 8.42e+04 7.96X 8.13X 8.14X +// BitReader 9.53e+03 9.61e+03 9.66e+03 1X 1X 1X +// Unpack32Scalar 6.37e+04 6.42e+04 6.47e+04 6.69X 6.68X 6.7X +// UnpackScalar 6.68e+04 6.73e+04 6.77e+04 7.01X 7X 7.01X // // Unpack32Values bit_width 29:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 1.01e+04 1.01e+04 1.02e+04 1X 1X 1X -// Unpack32Scalar 7.06e+04 7.27e+04 7.35e+04 7.02X 7.18X 7.19X -// UnpackScalar 8e+04 8.23e+04 8.32e+04 7.96X 8.12X 8.14X +// BitReader 9.41e+03 9.46e+03 9.55e+03 1X 1X 1X +// Unpack32Scalar 5.79e+04 5.82e+04 5.87e+04 6.15X 6.15X 6.14X +// UnpackScalar 6.08e+04 6.11e+04 6.16e+04 6.46X 6.46X 6.46X // // Unpack32Values bit_width 30:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 9.95e+03 1.01e+04 1.02e+04 1X 1X 1X -// Unpack32Scalar 7e+04 7.23e+04 7.32e+04 7.04X 7.16X 7.18X -// UnpackScalar 7.9e+04 8.18e+04 8.24e+04 7.94X 8.09X 8.08X +// BitReader 9.37e+03 9.45e+03 9.52e+03 1X 1X 1X +// Unpack32Scalar 5.87e+04 5.92e+04 5.96e+04 6.26X 6.27X 6.26X +// UnpackScalar 6.16e+04 6.2e+04 6.26e+04 6.58X 6.56X 6.57X // // Unpack32Values bit_width 31:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 9.06e+03 9.33e+03 9.43e+03 1X 1X 1X -// Unpack32Scalar 6.52e+04 7.17e+04 7.26e+04 7.19X 7.69X 7.7X -// UnpackScalar 7.67e+04 8.1e+04 8.19e+04 8.46X 8.69X 8.69X +// BitReader 9.26e+03 9.33e+03 9.41e+03 1X 1X 1X +// Unpack32Scalar 5.59e+04 5.63e+04 5.67e+04 6.03X 6.03X 6.03X +// UnpackScalar 5.85e+04 5.89e+04 5.94e+04 6.31X 6.31X 6.31X // // Unpack32Values bit_width 32:Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile // (relative) (relative) (relative) // --------------------------------------------------------------------------------------------------------- -// BitReader 9.73e+03 9.96e+03 1e+04 1X 1X 1X -// Unpack32Scalar 1.06e+05 1.1e+05 1.11e+05 10.9X 11X 11.1X -// UnpackScalar 1.41e+05 1.49e+05 1.5e+05 14.5X 14.9X 15X +// BitReader 9.89e+03 9.96e+03 1e+04 1X 1X 1X +// Unpack32Scalar 9.83e+04 9.96e+04 1.01e+05 9.95X 10X 10X +// UnpackScalar 8.24e+04 8.36e+04 8.44e+04 8.34X 8.4X 8.41X #include <cmath> #include <cstdio> #include <cstdlib> diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt index 21942d3..b12ae2d 100644 --- a/be/src/util/CMakeLists.txt +++ b/be/src/util/CMakeLists.txt @@ -159,7 +159,7 @@ target_link_libraries(loggingsupport ${IMPALA_LINK_LIBS_DYNAMIC_TARGETS}) ADD_UNIFIED_BE_LSAN_TEST(benchmark-test "BenchmarkTest.*") ADD_UNIFIED_BE_LSAN_TEST(bitmap-test "Bitmap.*") ADD_UNIFIED_BE_LSAN_TEST(bit-packing-test "BitPackingTest.*") -ADD_UNIFIED_BE_LSAN_TEST(bit-stream-utils-test "BitArray.*:VLQInt.*") +ADD_UNIFIED_BE_LSAN_TEST(bit-stream-utils-test "BitArray.*") ADD_UNIFIED_BE_LSAN_TEST(bit-util-test "BitUtil.*") ADD_UNIFIED_BE_LSAN_TEST(blocking-queue-test "BlockingQueueTest.*") ADD_UNIFIED_BE_LSAN_TEST(bloom-filter-test "BloomFilter.*:BloomFilterTest.*") diff --git a/be/src/util/bit-packing-test.cc b/be/src/util/bit-packing-test.cc index 8010fe3..2cf86ba 100644 --- a/be/src/util/bit-packing-test.cc +++ b/be/src/util/bit-packing-test.cc @@ -32,8 +32,8 @@ using std::mt19937; namespace impala { namespace { -uint64_t ComputeMask(int bit_width) { - return (bit_width < 64) ? ((1UL << bit_width) - 1) : ~0UL; +uint32_t ComputeMask(int bit_width) { + return (bit_width < 32) ? ((1U << bit_width) - 1) : ~0U; } } @@ -45,19 +45,19 @@ uint64_t ComputeMask(int bit_width) { /// /// This is to test that we do not overrun either the input or output buffer for smaller /// batch sizes. -void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values, +void UnpackSubset(const uint32_t* in, const uint8_t* packed, int num_in_values, int bit_width, bool aligned); /// Test a packing/unpacking round-trip of the 'num_in_values' values in 'in', /// packed with 'bit_width'. If 'aligned' is true, buffers for packed and unpacked data /// are allocated at a 64-byte aligned address. Otherwise the buffers are misaligned /// by 1 byte from a 64-byte aligned address. -void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool aligned) { +void PackUnpack(const uint32_t* in, int num_in_values, int bit_width, bool aligned) { LOG(INFO) << "num_in_values = " << num_in_values << " bit_width = " << bit_width << " aligned = " << aligned; // Mask out higher bits so that the values to pack are in range. - const uint64_t mask = ComputeMask(bit_width); + const uint32_t mask = ComputeMask(bit_width); const int misalignment = aligned ? 0 : 1; const int bytes_required = BitUtil::RoundUpNumBytes(bit_width * num_in_values); @@ -79,8 +79,8 @@ void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool align for (const int num_to_unpack : {num_in_values, num_in_values + 1, num_in_values + 77}) { LOG(INFO) << "Unpacking " << num_to_unpack; // Size buffer exactly so that ASAN can detect reads/writes that overrun the buffer. - AlignedAllocation out_storage(num_to_unpack * sizeof(uint64_t) + misalignment); - uint64_t* out = reinterpret_cast<uint64_t*>(out_storage.data() + misalignment); + AlignedAllocation out_storage(num_to_unpack * sizeof(uint32_t) + misalignment); + uint32_t* out = reinterpret_cast<uint32_t*>(out_storage.data() + misalignment); const auto result = BitPacking::UnpackValues( bit_width, packed, writer.bytes_written(), num_to_unpack, out); ASSERT_EQ(packed + writer.bytes_written(), result.first) @@ -105,7 +105,7 @@ void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool align UnpackSubset(in, packed, num_in_values, bit_width, aligned); } -void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values, +void UnpackSubset(const uint32_t* in, const uint8_t* packed, int num_in_values, int bit_width, bool aligned) { const int misalignment = aligned ? 0 : 1; for (int num_to_unpack : {1, 10, 77, num_in_values - 7}) { @@ -116,8 +116,8 @@ void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values, AlignedAllocation packed_copy_storage(bytes_to_read + misalignment); uint8_t* packed_copy = packed_copy_storage.data() + misalignment; memcpy(packed_copy, packed, bytes_to_read); - AlignedAllocation out_storage(num_to_unpack * sizeof(uint64_t) + misalignment); - uint64_t* out = reinterpret_cast<uint64_t*>(out_storage.data() + misalignment); + AlignedAllocation out_storage(num_to_unpack * sizeof(uint32_t) + misalignment); + uint32_t* out = reinterpret_cast<uint32_t*>(out_storage.data() + misalignment); const auto result = BitPacking::UnpackValues( bit_width, packed_copy, bytes_to_read, num_to_unpack, out); ASSERT_EQ(packed_copy + bytes_to_read, result.first) << "Read wrong # of bytes"; @@ -132,9 +132,9 @@ void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values, TEST(BitPackingTest, RandomUnpack) { constexpr int NUM_IN_VALUES = 64 * 1024; - uint64_t in[NUM_IN_VALUES]; + uint32_t in[NUM_IN_VALUES]; mt19937 rng; - uniform_int_distribution<uint64_t> dist; + uniform_int_distribution<uint32_t> dist; std::generate(std::begin(in), std::end(in), [&rng, &dist] { return dist(rng); }); // Test various odd input lengths to exercise boundary cases for full and partial @@ -145,7 +145,7 @@ TEST(BitPackingTest, RandomUnpack) { lengths.push_back(i); } - for (int bit_width = 0; bit_width <= 64; ++bit_width) { + for (int bit_width = 0; bit_width <= 32; ++bit_width) { for (const int length : lengths) { // Test that unpacking to/from aligned and unaligned memory works. for (const bool aligned : {true, false}) { diff --git a/be/src/util/bit-packing.h b/be/src/util/bit-packing.h index d449048..c70b55e 100644 --- a/be/src/util/bit-packing.h +++ b/be/src/util/bit-packing.h @@ -40,17 +40,15 @@ namespace impala { /// in the two output bytes: [ 0 0 1 | 0 0 0 0 1 ] [ x x x x x x | 1 0 ] /// lower bits of 17--^ 1 next value ^--upper bits of 17 /// -/// Bit widths from 0 to 64 are supported (0 bit width means that every value is 0). +/// Bit widths from 0 to 32 are supported (0 bit width means that every value is 0). /// The batched unpacking functions operate on batches of 32 values. This batch size /// is convenient because for every supported bit width, the end of a 32 value batch /// falls on a byte boundary. It is also large enough to amortise loop overheads. class BitPacking { public: - static constexpr int MAX_BITWIDTH = sizeof(uint64_t) * 8; - /// Unpack bit-packed values with 'bit_width' from 'in' to 'out'. Keeps unpacking until /// either all 'in_bytes' are read or 'num_values' values are unpacked. 'out' must have - /// enough space for 'num_values'. 0 <= 'bit_width' <= 64 and 'bit_width' <= # of bits + /// enough space for 'num_values'. 0 <= 'bit_width' <= 32 and 'bit_width' <= # of bits /// in OutType. 'in' must point to 'in_bytes' of addressable memory. /// /// Returns a pointer to the byte after the last byte of 'in' that was read and also the @@ -88,7 +86,7 @@ class BitPacking { /// Unpack exactly 32 values of 'bit_width' from 'in' to 'out'. 'in' must point to /// 'in_bytes' of addressable memory, and 'in_bytes' must be at least /// (32 * bit_width / 8). 'out' must have space for 32 OutType values. - /// 0 <= 'bit_width' <= 64 and 'bit_width' <= # of bits in OutType. + /// 0 <= 'bit_width' <= 32 and 'bit_width' <= # of bits in OutType. template <typename OutType> static const uint8_t* Unpack32Values(int bit_width, const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ out); @@ -115,7 +113,7 @@ class BitPacking { /// 'num_values' must be at most 31. 'in' must point to 'in_bytes' of addressable /// memory, and 'in_bytes' must be at least ceil(num_values * bit_width / 8). /// 'out' must have space for 'num_values' OutType values. - /// 0 <= 'bit_width' <= 64 and 'bit_width' <= # of bits in OutType. + /// 0 <= 'bit_width' <= 32 and 'bit_width' <= # of bits in OutType. template <typename OutType, int BIT_WIDTH> static const uint8_t* UnpackUpTo31Values(const uint8_t* __restrict__ in, int64_t in_bytes, int num_values, OutType* __restrict__ out); diff --git a/be/src/util/bit-packing.inline.h b/be/src/util/bit-packing.inline.h index bbaa23b..8cebe40 100644 --- a/be/src/util/bit-packing.inline.h +++ b/be/src/util/bit-packing.inline.h @@ -58,8 +58,8 @@ std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues(int bit_width, return UnpackValues<OutType, i>(in, in_bytes, num_values, out); switch (bit_width) { - // Expand cases from 0 to 64. - BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore); + // Expand cases from 0 to 32. + BOOST_PP_REPEAT_FROM_TO(0, 33, UNPACK_VALUES_CASE, ignore); default: DCHECK(false); return std::make_pair(nullptr, -1); @@ -77,14 +77,12 @@ std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues( const int64_t remainder_values = values_to_read % BATCH_SIZE; const uint8_t* in_pos = in; OutType* out_pos = out; - // First unpack as many full batches as possible. for (int64_t i = 0; i < batches_to_read; ++i) { in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos); out_pos += BATCH_SIZE; in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT; } - // Then unpack the final partial batch. if (remainder_values > 0) { in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>( @@ -105,14 +103,15 @@ std::pair<const uint8_t*, int64_t> BitPacking::UnpackAndDecodeValues(int bit_wid in, in_bytes, dict, dict_len, num_values, out, stride, decode_error); switch (bit_width) { - // Expand cases from 0 to 64. - BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore); + // Expand cases from 0 to 32. + BOOST_PP_REPEAT_FROM_TO(0, 33, UNPACK_VALUES_CASE, ignore); default: DCHECK(false); return std::make_pair(nullptr, -1); } #pragma pop_macro("UNPACK_VALUES_CASE") } + template <typename OutType, int BIT_WIDTH> std::pair<const uint8_t*, int64_t> BitPacking::UnpackAndDecodeValues( const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ dict, @@ -150,57 +149,49 @@ std::pair<const uint8_t*, int64_t> BitPacking::UnpackAndDecodeValues( // bpacking.c at https://github.com/lemire/FrameOfReference/, but is much more compact // because it uses templates rather than source-level unrolling of all combinations. // -// After the template parameters are expanded and constants are propagated, all branches +// After the template parameters is expanded and constants are propagated, all branches // and offset/shift calculations should be optimized out, leaving only shifts by constants // and bitmasks by constants. Calls to this must be stamped out manually or with // BOOST_PP_REPEAT_FROM_TO: experimentation revealed that the GCC 4.9.2 optimiser was // not able to fully propagate constants and remove branches when this was called from // inside a for loop with constant bounds with VALUE_IDX changed to a function argument. -// -// We compute how many 32 bit words we have to read, which is either 1, 2 or 3. If it is -// at least 2, the first two 32 bit words are read as one 64 bit word. Even if only one -// word needs to be read, we try to read 64 bits if it does not lead to buffer overflow -// because benchmarks show that it has a positive effect on performance. template <int BIT_WIDTH, int VALUE_IDX> -inline uint64_t ALWAYS_INLINE UnpackValue(const uint8_t* __restrict__ in_buf) { - if (BIT_WIDTH == 0) return 0; - - constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH; - constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32; - constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH; - constexpr int LAST_WORD_IDX = BitUtil::RoundUpNumi32(LAST_BIT_IDX); - constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX; - static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded."); - - constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32; - constexpr uint64_t mask = BIT_WIDTH == 64 ? ~0L : (1UL << BIT_WIDTH) - 1; - const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf); - - // Avoid reading past the end of the buffer. - constexpr bool CAN_READ_64 = FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32; - - // We do not try to read 64 bits when the bit width is a power of two (unless it is - // necessary) because performance benchmarks show that it is better this way. We can - // revisit it when we update the compiler version. - constexpr bool READ_32_BITS = WORDS_TO_READ == 1 - && (!CAN_READ_64 || BitUtil::IsPowerOf2(BIT_WIDTH)); - - if (READ_32_BITS) { - uint32_t word = in[FIRST_WORD_IDX]; - word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0; - return word & mask; - } - - uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX); - word >>= FIRST_BIT_OFFSET; - - if (WORDS_TO_READ > 2) { - constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET; - uint64_t extra_word = in[FIRST_WORD_IDX + 2]; - word |= extra_word << USEFUL_BITS; +inline uint32_t ALWAYS_INLINE UnpackValue(const uint8_t* __restrict__ in_buf) { + constexpr uint32_t LOAD_BIT_WIDTH = sizeof(uint32_t) * CHAR_BIT; + static_assert(BIT_WIDTH <= LOAD_BIT_WIDTH, "BIT_WIDTH > LOAD_BIT_WIDTH"); + static_assert(VALUE_IDX >= 0 && VALUE_IDX < 32, "0 <= VALUE_IDX < 32"); + // The index of the first bit of the value, relative to the start of 'in_buf'. + constexpr uint32_t FIRST_BIT = VALUE_IDX * BIT_WIDTH; + constexpr uint32_t IN_WORD_IDX = FIRST_BIT / LOAD_BIT_WIDTH; + constexpr uint32_t FIRST_BIT_OFFSET = FIRST_BIT % LOAD_BIT_WIDTH; + // Index of bit after last bit of this value, relative to start of IN_WORD_IDX. + constexpr uint32_t END_BIT_OFFSET = FIRST_BIT_OFFSET + BIT_WIDTH; + + const uint32_t* in_words = reinterpret_cast<const uint32_t*>(in_buf); + // The lower bits of the value come from the first word. + const uint32_t lower_bits = + BIT_WIDTH > 0 ? in_words[IN_WORD_IDX] >> FIRST_BIT_OFFSET : 0U; + if (END_BIT_OFFSET < LOAD_BIT_WIDTH) { + // All bits of the value are in the first word, but we need to mask out upper bits + // that belong to the next value. + return lower_bits % (1UL << BIT_WIDTH); + } if (END_BIT_OFFSET == LOAD_BIT_WIDTH) { + // This value was exactly the uppermost bits of the first word - no masking required. + return lower_bits; + } else { + DCHECK_GT(END_BIT_OFFSET, LOAD_BIT_WIDTH); + DCHECK_LT(VALUE_IDX, 31) + << "Should not go down this branch for last value with no trailing bits."; + // Value is split between words, so grab trailing bits from the next word. + // Force into [0, LOAD_BIT_WIDTH) to avoid spurious shift >= width of type warning. + constexpr uint32_t NUM_TRAILING_BITS = + END_BIT_OFFSET < LOAD_BIT_WIDTH ? 0 : END_BIT_OFFSET - LOAD_BIT_WIDTH; + const uint32_t trailing_bits = in_words[IN_WORD_IDX + 1] % (1UL << NUM_TRAILING_BITS); + // Force into [0, LOAD_BIT_WIDTH) to avoid spurious shift >= width of type warning. + constexpr uint32_t TRAILING_BITS_SHIFT = + BIT_WIDTH == 32 ? 0 : (BIT_WIDTH - NUM_TRAILING_BITS); + return lower_bits | (trailing_bits << TRAILING_BITS_SHIFT); } - - return word & mask; } template <typename OutType> @@ -219,7 +210,7 @@ template <typename OutType, int BIT_WIDTH> const uint8_t* BitPacking::Unpack32Values( const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ out) { static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low"); - static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high"); + static_assert(BIT_WIDTH <= 32, "BIT_WIDTH > 32"); DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output"; constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH); DCHECK_GE(in_bytes, BYTES_TO_READ); @@ -242,8 +233,8 @@ const uint8_t* BitPacking::Unpack32Values(int bit_width, const uint8_t* __restri case i: return Unpack32Values<OutType, i>(in, in_bytes, out); switch (bit_width) { - // Expand cases from 0 to 64. - BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore); + // Expand cases from 0 to 32. + BOOST_PP_REPEAT_FROM_TO(0, 33, UNPACK_VALUES_CASE, ignore); default: DCHECK(false); return in; } #pragma pop_macro("UNPACK_VALUES_CASE") @@ -254,7 +245,7 @@ const uint8_t* BitPacking::UnpackAndDecode32Values(const uint8_t* __restrict__ i int64_t in_bytes, OutType* __restrict__ dict, int64_t dict_len, OutType* __restrict__ out, int64_t stride, bool* __restrict__ decode_error) { static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low"); - static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high"); + static_assert(BIT_WIDTH <= 32, "BIT_WIDTH > 32"); constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH); DCHECK_GE(in_bytes, BYTES_TO_READ); // TODO: this could be optimised further by using SIMD instructions. @@ -278,7 +269,7 @@ template <typename OutType, int BIT_WIDTH> const uint8_t* BitPacking::UnpackUpTo31Values(const uint8_t* __restrict__ in, int64_t in_bytes, int num_values, OutType* __restrict__ out) { static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low"); - static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high"); + static_assert(BIT_WIDTH <= 32, "BIT_WIDTH > 32"); DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output"; constexpr int MAX_BATCH_SIZE = 31; const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH); @@ -319,7 +310,7 @@ const uint8_t* BitPacking::UnpackAndDecodeUpTo31Values(const uint8_t* __restrict int64_t in_bytes, OutType* __restrict__ dict, int64_t dict_len, int num_values, OutType* __restrict__ out, int64_t stride, bool* __restrict__ decode_error) { static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low"); - static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high"); + static_assert(BIT_WIDTH <= 32, "BIT_WIDTH > 32"); constexpr int MAX_BATCH_SIZE = 31; const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH); DCHECK_GE(in_bytes, BYTES_TO_READ); diff --git a/be/src/util/bit-stream-utils-test.cc b/be/src/util/bit-stream-utils-test.cc index aa157e3..e02e3c3 100644 --- a/be/src/util/bit-stream-utils-test.cc +++ b/be/src/util/bit-stream-utils-test.cc @@ -151,16 +151,15 @@ TEST(BitArray, TestIntSkip) { TestSkipping<int>(buffer, bytes_written, bit_width, 8, 56); } -// Writes 'num_vals' values with width 'bit_width' starting from 'start' and increasing -// and reads them back. -void TestBitArrayValues(int bit_width, uint64_t start, uint64_t num_vals) { +// Writes 'num_vals' values with width 'bit_width' and reads them back. +void TestBitArrayValues(int bit_width, int num_vals) { const int len = BitUtil::Ceil(bit_width * num_vals, 8); - const uint64_t mask = bit_width == 64 ? ~0UL : (1UL << bit_width) - 1; + const int64_t mod = bit_width == 64 ? 1 : 1LL << bit_width; uint8_t buffer[(len > 0) ? len : 1]; BitWriter writer(buffer, len); - for (uint64_t i = 0; i < num_vals; ++i) { - bool result = writer.PutValue((start + i) & mask, bit_width); + for (int i = 0; i < num_vals; ++i) { + bool result = writer.PutValue(i % mod, bit_width); EXPECT_TRUE(result); } writer.Flush(); @@ -173,20 +172,19 @@ void TestBitArrayValues(int bit_width, uint64_t start, uint64_t num_vals) { // Unpack all values at once with one batched reader and in small batches with the // other batched reader. vector<int64_t> batch_vals(num_vals); - const uint64_t BATCH_SIZE = 32; + const int BATCH_SIZE = 32; vector<int64_t> batch_vals2(BATCH_SIZE); EXPECT_EQ(num_vals, reader.UnpackBatch(bit_width, num_vals, batch_vals.data())); - for (uint64_t i = 0; i < num_vals; ++i) { + for (int i = 0; i < num_vals; ++i) { if (i % BATCH_SIZE == 0) { int num_to_unpack = min(BATCH_SIZE, num_vals - i); EXPECT_EQ(num_to_unpack, reader2.UnpackBatch(bit_width, num_to_unpack, batch_vals2.data())); } - EXPECT_EQ((start + i) & mask, batch_vals[i]); - EXPECT_EQ((start + i) & mask, batch_vals2[i % BATCH_SIZE]); + EXPECT_EQ(i % mod, batch_vals[i]); + EXPECT_EQ(i % mod, batch_vals2[i % BATCH_SIZE]); } - EXPECT_EQ(reader.bytes_left(), 0); EXPECT_EQ(reader2.bytes_left(), 0); reader.Reset(buffer, len); @@ -196,132 +194,13 @@ void TestBitArrayValues(int bit_width, uint64_t start, uint64_t num_vals) { TEST(BitArray, TestValues) { for (int width = 0; width <= MAX_WIDTH; ++width) { - TestBitArrayValues(width, 0, 1); - TestBitArrayValues(width, 0, 2); + TestBitArrayValues(width, 1); + TestBitArrayValues(width, 2); // Don't write too many values - TestBitArrayValues(width, 0, (width < 12) ? (1 << width) : 4096); - TestBitArrayValues(width, 0, 1024); - - // Also test values that need bitwidth > 32. - TestBitArrayValues(width, 1099511627775LL, 1024); + TestBitArrayValues(width, (width < 12) ? (1 << width) : 4096); + TestBitArrayValues(width, 1024); } } -template<typename UINT_T> -void TestUleb128Encode(const UINT_T value, std::vector<uint8_t> expected_bytes) { - const int len = BatchedBitReader::max_vlq_byte_len<UINT_T>(); - - std::vector<uint8_t> buffer(len, 0); - BitWriter writer(buffer.data(), len); - - const bool success = writer.PutUleb128(value); - EXPECT_TRUE(success); - - if (expected_bytes.size() < len) { - // Allow the user to input fewer bytes than the maximum. - expected_bytes.resize(len, 0); - } - - EXPECT_EQ(expected_bytes, buffer); } -TEST(VLQInt, TestPutUleb128) { - TestUleb128Encode<uint32_t>(5, {0x05}); - TestUleb128Encode<uint32_t>(2018, {0xe2, 0x0f}); - TestUleb128Encode<uint32_t>(25248, {0xa0, 0xc5, 0x01}); - TestUleb128Encode<uint32_t>(55442211, {0xa3, 0xf6, 0xb7, 0x1a}); - TestUleb128Encode<uint32_t>(4294967295, {0xff, 0xff, 0xff, 0xff, 0x0f}); - - TestUleb128Encode<uint64_t>(5, {0x05}); - TestUleb128Encode<uint64_t>(55442211, {0xa3, 0xf6, 0xb7, 0x1a}); - TestUleb128Encode<uint64_t>(1649267441664, {0x80, 0x80, 0x80, 0x80, 0x80, 0x30}); - TestUleb128Encode<uint64_t>(60736917191292289, - {0x81, 0xeb, 0xc1, 0xaf, 0xf8, 0xfc, 0xf1, 0x6b}); - TestUleb128Encode<uint64_t>(18446744073709551615U, - {0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x01}); -} - -template<typename UINT_T> -void TestUleb128Decode(const UINT_T expected_value, const std::vector<uint8_t>& bytes) { - BatchedBitReader reader(bytes.data(), bytes.size()); - UINT_T value; - const bool success = reader.GetUleb128<UINT_T>(&value); - EXPECT_TRUE(success); - EXPECT_EQ(expected_value, value); -} - -TEST(VLQInt, TestGetUleb128) { - TestUleb128Decode<uint32_t>(5, {0x05}); - TestUleb128Decode<uint32_t>(2018, {0xe2, 0x0f}); - TestUleb128Decode<uint32_t>(25248, {0xa0, 0xc5, 0x01}); - TestUleb128Decode<uint32_t>(55442211, {0xa3, 0xf6, 0xb7, 0x1a}); - TestUleb128Decode<uint32_t>(4294967295, {0xff, 0xff, 0xff, 0xff, 0x0f}); - - TestUleb128Decode<uint64_t>(5, {0x05}); - TestUleb128Decode<uint64_t>(55442211, {0xa3, 0xf6, 0xb7, 0x1a}); - TestUleb128Decode<uint64_t>(1649267441664, {0x80, 0x80, 0x80, 0x80, 0x80, 0x30}); - TestUleb128Decode<uint64_t>(60736917191292289, - {0x81, 0xeb, 0xc1, 0xaf, 0xf8, 0xfc, 0xf1, 0x6b}); - TestUleb128Decode<uint64_t>(18446744073709551615U, - {0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x01}); -} - -template<typename INT_T> -void TestZigZagEncode(const INT_T value, std::vector<uint8_t> expected_bytes) { - const int len = BatchedBitReader::max_vlq_byte_len<INT_T>(); - - std::vector<uint8_t> buffer(len, 0); - BitWriter writer(buffer.data(), len); - - const bool success = writer.PutZigZagInteger(value); - EXPECT_TRUE(success); - - if (expected_bytes.size() < len) { - // Allow the user to input fewer bytes than the maximum. - expected_bytes.resize(len, 0); - } - - EXPECT_EQ(expected_bytes, buffer); -} - -TEST(VLQInt, TestPutZigZagInteger) { - TestZigZagEncode<int32_t>(-3, {0x05}); - TestZigZagEncode<int32_t>(-2485, {0xe9, 0x26}); - TestZigZagEncode<int32_t>(5648448, {0x80, 0xc1, 0xb1, 0x5}); - - TestZigZagEncode<int64_t>(1629267541664, {0xc0, 0xfa, 0xcd, 0xfe, 0xea, 0x5e}); - TestZigZagEncode<int64_t>(-9223372036854775808U, // Most negative int64_t. - {0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x1}); -} - -template<typename INT_T> -void TestZigZagDecode(const INT_T expected_value, std::vector<uint8_t> bytes) { - BatchedBitReader reader(bytes.data(), bytes.size()); - INT_T value; - const bool success = reader.GetZigZagInteger<INT_T>(&value); - EXPECT_TRUE(success); - EXPECT_EQ(expected_value, value); -} - -TEST(VLQInt, TestGetZigZagInteger) { - TestZigZagDecode<int32_t>(-3, {0x05}); - TestZigZagDecode<int32_t>(-2485, {0xe9, 0x26}); - TestZigZagDecode<int32_t>(5648448, {0x80, 0xc1, 0xb1, 0x5}); - - TestZigZagDecode<int64_t>(1629267541664, {0xc0, 0xfa, 0xcd, 0xfe, 0xea, 0x5e}); - TestZigZagDecode<int64_t>(-9223372036854775808U, // Most negative int64_t. - {0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x1}); -} - -TEST(VLQInt, TestMaxVlqByteLen) { - EXPECT_EQ(2, BatchedBitReader::max_vlq_byte_len<uint8_t>()); - EXPECT_EQ(3, BatchedBitReader::max_vlq_byte_len<uint16_t>()); - EXPECT_EQ(5, BatchedBitReader::max_vlq_byte_len<uint32_t>()); - EXPECT_EQ(10, BatchedBitReader::max_vlq_byte_len<uint64_t>()); -} - -} diff --git a/be/src/util/bit-stream-utils.h b/be/src/util/bit-stream-utils.h index 230c9dd..9889f35 100644 --- a/be/src/util/bit-stream-utils.h +++ b/be/src/util/bit-stream-utils.h @@ -23,17 +23,16 @@ #include <string.h> #include "common/compiler-util.h" #include "common/logging.h" -#include "util/bit-packing.h" #include "util/bit-util.h" namespace impala { -/// Utility class to write bit/byte streams. This class can write data to either be +/// Utility class to write bit/byte streams. This class can write data to either be /// bit packed or byte aligned (and a single stream that has a mix of both). /// This class does not allocate memory. class BitWriter { public: - /// buffer: buffer to write bits to. Buffer should be preallocated with + /// buffer: buffer to write bits to. Buffer should be preallocated with /// 'buffer_len' bytes. BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), @@ -65,16 +64,7 @@ class BitWriter { /// Write an unsigned ULEB-128 encoded int to the buffer. Return false if there was not /// enough room. The value is written byte aligned. For more details on ULEB-128: /// https://en.wikipedia.org/wiki/LEB128 - /// UINT_T must be an unsigned integer type. - template<typename UINT_T> - bool PutUleb128(UINT_T v); - - /// Write a ZigZag encoded int to the buffer. Return false if there was not enough - /// room. The value is written byte aligned. For more details on ZigZag encoding: - /// https://developers.google.com/protocol-buffers/docs/encoding#signed-integers - /// INT_T must be a signed integer type. - template<typename INT_T> - bool PutZigZagInteger(INT_T v); + bool PutUleb128Int(uint32_t v); /// Get a pointer to the next aligned byte and advance the underlying buffer /// by num_bytes. @@ -87,7 +77,7 @@ class BitWriter { void Flush(bool align=false); /// Maximum supported bitwidth for writer. - static const int MAX_BITWIDTH = 64; + static const int MAX_BITWIDTH = 32; private: uint8_t* buffer_; @@ -167,29 +157,16 @@ class BatchedBitReader { /// at the beginning of a byte. Return false if there were not enough bytes in the /// buffer or the int is invalid. For more details on ULEB-128: /// https://en.wikipedia.org/wiki/LEB128 - /// UINT_T must be an unsigned integer type. - template<typename UINT_T> - bool GetUleb128(UINT_T* v); - - /// Read a ZigZag encoded int from the stream. The encoded int must start at the - /// beginning of a byte. Return false if there were not enough bytes in the buffer or - /// the int is invalid. For more details on ZigZag encoding: - /// https://developers.google.com/protocol-buffers/docs/encoding#signed-integers - /// INT_T must be a signed integer type. - template<typename INT_T> - bool GetZigZagInteger(INT_T* v); + bool GetUleb128Int(uint32_t* v); /// Returns the number of bytes left in the stream. int bytes_left() { return buffer_end_ - buffer_pos_; } - /// Maximum byte length of a vlq encoded integer of type T. - template <typename T> - static constexpr int max_vlq_byte_len() { - return BitUtil::Ceil(sizeof(T) * 8, 7); - } + /// Maximum byte length of a vlq encoded int + static const int MAX_VLQ_BYTE_LEN = 5; /// Maximum supported bitwidth for reader. - static const int MAX_BITWIDTH = BitPacking::MAX_BITWIDTH; + static const int MAX_BITWIDTH = 32; private: /// Current read position in the buffer. diff --git a/be/src/util/bit-stream-utils.inline.h b/be/src/util/bit-stream-utils.inline.h index 868b562..18df34c 100644 --- a/be/src/util/bit-stream-utils.inline.h +++ b/be/src/util/bit-stream-utils.inline.h @@ -21,13 +21,14 @@ #include "util/bit-stream-utils.h" #include "common/compiler-util.h" +#include "util/bit-packing.h" namespace impala { inline bool BitWriter::PutValue(uint64_t v, int num_bits) { + // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases) DCHECK_LE(num_bits, MAX_BITWIDTH); - DCHECK(num_bits == MAX_BITWIDTH || v >> num_bits == 0) - << "v = " << v << ", num_bits = " << num_bits; + DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits; if (UNLIKELY(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false; @@ -37,20 +38,11 @@ inline bool BitWriter::PutValue(uint64_t v, int num_bits) { if (UNLIKELY(bit_offset_ >= 64)) { // Flush buffered_values_ and write out bits of v that did not fit memcpy(buffer_ + byte_offset_, &buffered_values_, 8); + buffered_values_ = 0; byte_offset_ += 8; bit_offset_ -= 64; - - // Shifting with the same or greater amount than the number of bits in the number is - // undefined behaviour. - int shift = num_bits - bit_offset_; - - if (LIKELY(shift < 64)) { - buffered_values_ = v >> shift; - } else { - buffered_values_ = 0; - } + buffered_values_ = v >> (num_bits - bit_offset_); } - DCHECK_LT(bit_offset_, 64); return true; } @@ -84,40 +76,16 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) { return true; } -template<typename UINT_T> -inline bool BitWriter::PutUleb128(UINT_T v) { - static_assert(std::is_integral<UINT_T>::value, "Integral type required."); - static_assert(std::is_unsigned<UINT_T>::value, "Unsigned type required."); - static_assert(!std::is_same<UINT_T, bool>::value, "Bools are not supported."); - - constexpr UINT_T mask_lowest_7_bits = std::numeric_limits<UINT_T>::max() ^ 0x7Fu; - +inline bool BitWriter::PutUleb128Int(uint32_t v) { bool result = true; - while ((v & mask_lowest_7_bits) != 0L) { - result &= PutAligned<uint8_t>((v & 0x7Fu) | 0x80u, 1); + while ((v & 0xFFFFFF80) != 0L) { + result &= PutAligned<uint8_t>((v & 0x7F) | 0x80, 1); v >>= 7; } - result &= PutAligned<uint8_t>(v & 0x7Fu, 1); + result &= PutAligned<uint8_t>(v & 0x7F, 1); return result; } -template<typename INT_T> -bool BitWriter::PutZigZagInteger(INT_T v) { - static_assert(std::is_integral<INT_T>::value, "Integral type required."); - static_assert(std::is_signed<INT_T>::value, "Signed type required."); - - using UINT_T = std::make_unsigned_t<INT_T>; - constexpr int bit_length_minus_one = sizeof(INT_T) * 8 - 1; - - /// For negative values, << results in undefined behaviour (prior to C++20), so we treat - /// `v` as unsigned. - const UINT_T v_unsigned = v; - - /// For the right shift, we rely on implementation defined behaviour as we expect it to - /// be an arithmetic right shift (bits equal to the MSB are shifted in). - return PutUleb128((v_unsigned << 1) ^ (v >> bit_length_minus_one)); -} - template<typename T> inline int BatchedBitReader::UnpackBatch(int bit_width, int num_values, T* v) { DCHECK(buffer_pos_ != nullptr); @@ -178,42 +146,19 @@ inline bool BatchedBitReader::GetBytes(int num_bytes, T* v) { return true; } -template<typename UINT_T> -inline bool BatchedBitReader::GetUleb128(UINT_T* v) { - static_assert(std::is_integral<UINT_T>::value, "Integral type required."); - static_assert(std::is_unsigned<UINT_T>::value, "Unsigned type required."); - static_assert(!std::is_same<UINT_T, bool>::value, "Bools are not supported."); - +inline bool BatchedBitReader::GetUleb128Int(uint32_t* v) { *v = 0; int shift = 0; uint8_t byte = 0; do { - if (UNLIKELY(shift >= max_vlq_byte_len<UINT_T>() * 7)) return false; + if (UNLIKELY(shift >= MAX_VLQ_BYTE_LEN * 7)) return false; if (!GetBytes(1, &byte)) return false; - - /// We need to convert 'byte' to UINT_T so that the result of the bitwise and - /// operation is at least as long an integer as '*v', otherwise the shift may be too - /// big and lead to undefined behaviour. - const UINT_T byte_as_UINT_T = byte; - *v |= (byte_as_UINT_T & 0x7Fu) << shift; + // The constant below must be explicitly unsigned to ensure that the result of the + // bitwise-and is unsigned, so that the left shift is always defined behavior under + // the C++ standard. + *v |= (byte & 0x7Fu) << shift; shift += 7; - } while ((byte & 0x80u) != 0); - return true; -} - -template<typename INT_T> -bool BatchedBitReader::GetZigZagInteger(INT_T* v) { - static_assert(std::is_integral<INT_T>::value, "Integral type required."); - static_assert(std::is_signed<INT_T>::value, "Signed type required."); - - using UINT_T = std::make_unsigned_t<INT_T>; - - UINT_T v_unsigned; - if (UNLIKELY(!GetUleb128<UINT_T>(&v_unsigned))) return false; - - /// Here we rely on implementation defined behaviour in converting UINT_T to INT_T. - *v = (v_unsigned >> 1) ^ -(v_unsigned & 1u); - + } while ((byte & 0x80) != 0); return true; } } diff --git a/be/src/util/rle-encoding.h b/be/src/util/rle-encoding.h index 322b5d7..e4b87d0 100644 --- a/be/src/util/rle-encoding.h +++ b/be/src/util/rle-encoding.h @@ -244,7 +244,7 @@ class RleEncoder { 1 + BitUtil::Ceil(MAX_VALUES_PER_LITERAL_RUN * bit_width, 8); /// Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value. int max_repeated_run_size = - BatchedBitReader::max_vlq_byte_len<uint32_t>() + BitUtil::Ceil(bit_width, 8); + BatchedBitReader::MAX_VLQ_BYTE_LEN + BitUtil::Ceil(bit_width, 8); return std::max(max_literal_run_size, max_repeated_run_size); } @@ -406,7 +406,7 @@ inline void RleEncoder::FlushRepeatedRun() { bool result = true; // The lsb of 0 indicates this is a repeated run uint32_t indicator_value = static_cast<uint32_t>(repeat_count_) << 1; - result &= bit_writer_.PutUleb128<uint32_t>(indicator_value); + result &= bit_writer_.PutUleb128Int(indicator_value); result &= bit_writer_.PutAligned(current_value_, BitUtil::Ceil(bit_width_, 8)); DCHECK(result); num_buffered_values_ = 0; @@ -665,7 +665,7 @@ inline void RleBatchDecoder<T>::NextCounts() { // Read the next run's indicator int, it could be a literal or repeated run. // The int is encoded as a ULEB128-encoded value. uint32_t indicator_value = 0; - if (UNLIKELY(!bit_reader_.GetUleb128<uint32_t>(&indicator_value))) return; + if (UNLIKELY(!bit_reader_.GetUleb128Int(&indicator_value))) return; // lsb indicates if it is a literal run or repeated run bool is_literal = indicator_value & 1; diff --git a/be/src/util/rle-test.cc b/be/src/util/rle-test.cc index f74b8c3..4bdcff1 100644 --- a/be/src/util/rle-test.cc +++ b/be/src/util/rle-test.cc @@ -623,7 +623,7 @@ TEST_F(RleTest, RepeatCountOverflow) { // are 1. const uint32_t REPEATED_RUN_HEADER = 0xfffffffe; const uint32_t LITERAL_RUN_HEADER = 0xffffffff; - writer.PutUleb128<uint32_t>(literal_run ? LITERAL_RUN_HEADER : REPEATED_RUN_HEADER); + writer.PutUleb128Int(literal_run ? LITERAL_RUN_HEADER : REPEATED_RUN_HEADER); writer.Flush(); RleBatchDecoder<uint64_t> decoder(buffer, BUFFER_LEN, 1);