IMPALA-4123: Fast bit unpacking

Adds utility functions for fast unpacking of batches of bit-packed
values. These support reading batches of any number of values provided
that the start of the batch is aligned to a byte boundary. Callers that
want to read smaller batches that don't align to byte boundaries will
need to implement their own buffering.

The unpacking code uses only portable C++ and no SIMD intrinsics, but is
fairly efficient because unpacking a full batch of 32 values compiles
down to 32-bit loads, shifts by constants, masks by constants, bitwise
ors when a value straddles 32-bit words and stores. Further speedups
should be possible using SIMD intrinsics.

Testing:
Added unit tests for unpacking, exhaustively covering different
bitwidths with additional test dimensions (memory alignment, various
input sizes, etc).

Tested under ASAN to ensure the bit unpacking doesn't read past the end
of buffers.

Perf:
Added microbenchmark that shows on average an 8-9x speedup over the
existing BitReader code.

Change-Id: I12db69409483d208cd4c0f41c27a78aeb6cd3622
Reviewed-on: http://gerrit.cloudera.org:8080/4494
Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/07da7679
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/07da7679
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/07da7679

Branch: refs/heads/hadoop-next
Commit: 07da7679d1755ada836706f752d8078260a76244
Parents: ef762b7
Author: Tim Armstrong <tarmstr...@cloudera.com>
Authored: Wed Sep 14 10:44:08 2016 -0700
Committer: Internal Jenkins <cloudera-hud...@gerrit.cloudera.org>
Committed: Tue Oct 18 02:53:16 2016 +0000

----------------------------------------------------------------------
 be/src/benchmarks/CMakeLists.txt           |  31 ++-
 be/src/benchmarks/bit-packing-benchmark.cc | 347 ++++++++++++++++++++++++
 be/src/benchmarks/bswap-benchmark.cc       |  23 +-
 be/src/exprs/expr-test.cc                  |   5 +-
 be/src/testutil/mem-util.h                 |  57 ++++
 be/src/util/CMakeLists.txt                 |   1 +
 be/src/util/bit-packing-test.cc            | 159 +++++++++++
 be/src/util/bit-packing.h                  |  92 +++++++
 be/src/util/bit-packing.inline.h           | 202 ++++++++++++++
 be/src/util/bit-stream-utils.h             |  16 +-
 be/src/util/bit-stream-utils.inline.h      |   2 +-
 be/src/util/bit-util.h                     |  68 +++--
 be/src/util/openssl-util-test.cc           |  10 +-
 13 files changed, 929 insertions(+), 84 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07da7679/be/src/benchmarks/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/CMakeLists.txt b/be/src/benchmarks/CMakeLists.txt
index 0a28dce..ba8bcfc 100644
--- a/be/src/benchmarks/CMakeLists.txt
+++ b/be/src/benchmarks/CMakeLists.txt
@@ -27,28 +27,29 @@ FUNCTION(ADD_BE_BENCHMARK BENCHMARK_NAME)
   TARGET_LINK_LIBRARIES(${BENCHMARK_NAME} ${IMPALA_LINK_LIBS})
 ENDFUNCTION()
 
-ADD_BE_BENCHMARK(parse-timestamp-benchmark)
-ADD_BE_BENCHMARK(string-search-benchmark)
 ADD_BE_BENCHMARK(atod-benchmark)
 ADD_BE_BENCHMARK(atof-benchmark)
 ADD_BE_BENCHMARK(atoi-benchmark)
-ADD_BE_BENCHMARK(lock-benchmark)
-ADD_BE_BENCHMARK(thread-create-benchmark)
-ADD_BE_BENCHMARK(tuple-layout-benchmark)
-ADD_BE_BENCHMARK(string-benchmark)
-ADD_BE_BENCHMARK(rle-benchmark)
-ADD_BE_BENCHMARK(string-compare-benchmark)
-ADD_BE_BENCHMARK(multiint-benchmark)
-ADD_BE_BENCHMARK(status-benchmark)
-ADD_BE_BENCHMARK(row-batch-serialize-benchmark)
-ADD_BE_BENCHMARK(overflow-benchmark)
-ADD_BE_BENCHMARK(bloom-filter-benchmark)
-ADD_BE_BENCHMARK(int-hash-benchmark)
 ADD_BE_BENCHMARK(bitmap-benchmark)
+ADD_BE_BENCHMARK(bit-packing-benchmark)
+ADD_BE_BENCHMARK(bloom-filter-benchmark)
+ADD_BE_BENCHMARK(bswap-benchmark)
 ADD_BE_BENCHMARK(expr-benchmark)
 ADD_BE_BENCHMARK(hash-benchmark)
 ADD_BE_BENCHMARK(in-predicate-benchmark)
+ADD_BE_BENCHMARK(int-hash-benchmark)
+ADD_BE_BENCHMARK(lock-benchmark)
+ADD_BE_BENCHMARK(multiint-benchmark)
 ADD_BE_BENCHMARK(network-perf-benchmark)
-ADD_BE_BENCHMARK(bswap-benchmark)
+ADD_BE_BENCHMARK(overflow-benchmark)
+ADD_BE_BENCHMARK(parse-timestamp-benchmark)
+ADD_BE_BENCHMARK(rle-benchmark)
+ADD_BE_BENCHMARK(row-batch-serialize-benchmark)
+ADD_BE_BENCHMARK(status-benchmark)
+ADD_BE_BENCHMARK(string-benchmark)
+ADD_BE_BENCHMARK(string-compare-benchmark)
+ADD_BE_BENCHMARK(string-search-benchmark)
+ADD_BE_BENCHMARK(thread-create-benchmark)
+ADD_BE_BENCHMARK(tuple-layout-benchmark)
 
 target_link_libraries(hash-benchmark Experiments)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07da7679/be/src/benchmarks/bit-packing-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/bit-packing-benchmark.cc 
b/be/src/benchmarks/bit-packing-benchmark.cc
new file mode 100644
index 0000000..6e80d83
--- /dev/null
+++ b/be/src/benchmarks/bit-packing-benchmark.cc
@@ -0,0 +1,347 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Test bit packing performance when unpacking data for all supported 
bit-widths.
+// This compares:
+// * BitReader - the original bit reader that unpacks a value at a time.
+// * Unpack32Scalar - a batched implementation using scalar operations to 
unpack batches
+//    of 32 values.
+// * UnpackScalar - an implementation that can unpack a variable number of 
values, using
+//   Unpack32Scalar internally.
+//
+//
+// Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
+// Unpack32Values bit_width 0:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.57e+04 1.59e+04  1.6e+04    
     1X         1X         1X
+//                      Unpack32Scalar           1.34e+05 1.35e+05 1.36e+05    
  8.51X      8.49X      8.51X
+//                        UnpackScalar           2.08e+05  2.1e+05 2.12e+05    
  13.3X      13.2X      13.2X
+//
+// Unpack32Values bit_width 1:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.19e+04  1.2e+04  1.2e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.89e+04 8.94e+04 9.04e+04    
  7.48X      7.46X      7.51X
+//                        UnpackScalar           9.72e+04  9.8e+04 9.86e+04    
  8.18X      8.18X      8.19X
+//
+// Unpack32Values bit_width 2:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.18e+04 1.19e+04  1.2e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.84e+04 8.91e+04 8.99e+04    
  7.49X      7.48X       7.5X
+//                        UnpackScalar           9.68e+04 9.76e+04 9.84e+04    
   8.2X      8.19X      8.21X
+//
+// Unpack32Values bit_width 3:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.16e+04 1.17e+04 1.18e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.67e+04 8.72e+04 8.79e+04    
  7.45X      7.42X      7.43X
+//                        UnpackScalar            9.6e+04 9.66e+04 9.74e+04    
  8.25X      8.22X      8.24X
+//
+// Unpack32Values bit_width 4:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.08e+04 1.09e+04  1.1e+04    
     1X         1X         1X
+//                      Unpack32Scalar           9.13e+04 9.19e+04 9.25e+04    
  8.44X      8.43X      8.42X
+//                        UnpackScalar           9.65e+04 9.69e+04 9.78e+04    
  8.91X      8.89X       8.9X
+//
+// Unpack32Values bit_width 5:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.14e+04 1.15e+04 1.16e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.35e+04 8.42e+04 8.49e+04    
   7.3X      7.31X      7.31X
+//                        UnpackScalar           9.41e+04 9.48e+04 9.56e+04    
  8.22X      8.22X      8.24X
+//
+// Unpack32Values bit_width 6:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.14e+04 1.15e+04 1.16e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.46e+04 8.53e+04  8.6e+04    
   7.4X      7.41X      7.41X
+//                        UnpackScalar           9.35e+04 9.41e+04 9.51e+04    
  8.18X      8.16X       8.2X
+//
+// Unpack32Values bit_width 7:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.09e+04  1.1e+04 1.11e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.11e+04 8.16e+04 8.25e+04    
  7.44X      7.44X      7.45X
+//                        UnpackScalar           9.16e+04 9.21e+04  9.3e+04    
   8.4X       8.4X      8.39X
+//
+// Unpack32Values bit_width 8:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.14e+04 1.15e+04 1.16e+04    
     1X         1X         1X
+//                      Unpack32Scalar           9.02e+04 9.07e+04 9.14e+04    
   7.9X       7.9X      7.91X
+//                        UnpackScalar           9.48e+04 9.55e+04 9.63e+04    
  8.31X      8.33X      8.33X
+//
+// Unpack32Values bit_width 9:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.11e+04 1.12e+04 1.13e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.94e+04 7.97e+04 8.06e+04    
  7.14X      7.12X      7.14X
+//                        UnpackScalar           8.78e+04 8.83e+04  8.9e+04    
  7.89X      7.88X      7.89X
+//
+// Unpack32Values bit_width 10:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader            1.1e+04 1.11e+04 1.12e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.07e+04 8.14e+04 8.21e+04    
  7.31X      7.32X      7.34X
+//                        UnpackScalar           8.95e+04 9.02e+04 9.09e+04    
  8.11X      8.12X      8.12X
+//
+// Unpack32Values bit_width 11:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.09e+04  1.1e+04 1.11e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.63e+04 7.69e+04 7.75e+04    
  6.99X      6.99X      6.99X
+//                        UnpackScalar           8.55e+04 8.61e+04 8.69e+04    
  7.83X      7.83X      7.84X
+//
+// Unpack32Values bit_width 12:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.09e+04  1.1e+04  1.1e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.23e+04 8.29e+04 8.35e+04    
  7.55X      7.56X      7.57X
+//                        UnpackScalar           9.06e+04 9.12e+04 9.19e+04    
  8.31X      8.31X      8.33X
+//
+// Unpack32Values bit_width 13:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.07e+04 1.08e+04 1.09e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.42e+04 7.47e+04 7.55e+04    
  6.92X       6.9X      6.92X
+//                        UnpackScalar           8.16e+04 8.23e+04 8.29e+04    
   7.6X       7.6X      7.61X
+//
+// Unpack32Values bit_width 14:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.07e+04 1.08e+04 1.09e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.58e+04 7.62e+04 7.68e+04    
  7.08X      7.08X      7.08X
+//                        UnpackScalar           8.33e+04 8.38e+04 8.46e+04    
  7.78X      7.78X      7.79X
+//
+// Unpack32Values bit_width 15:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.06e+04 1.06e+04 1.07e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.16e+04 7.22e+04 7.29e+04    
  6.78X      6.79X      6.79X
+//                        UnpackScalar           7.96e+04 8.05e+04 8.09e+04    
  7.54X      7.57X      7.54X
+//
+// Unpack32Values bit_width 16:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.08e+04 1.08e+04 1.09e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.71e+04 8.76e+04 8.83e+04    
  8.09X      8.09X      8.08X
+//                        UnpackScalar           9.22e+04  9.3e+04 9.37e+04    
  8.56X      8.58X      8.57X
+//
+// Unpack32Values bit_width 17:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.04e+04 1.04e+04 1.05e+04    
     1X         1X         1X
+//                      Unpack32Scalar           6.98e+04 7.04e+04 7.09e+04    
  6.73X      6.74X      6.74X
+//                        UnpackScalar           7.73e+04 7.78e+04 7.85e+04    
  7.45X      7.45X      7.47X
+//
+// Unpack32Values bit_width 18:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.03e+04 1.04e+04 1.05e+04    
     1X         1X         1X
+//                      Unpack32Scalar            7.1e+04 7.17e+04 7.22e+04    
  6.86X      6.88X      6.87X
+//                        UnpackScalar           7.77e+04 7.82e+04 7.89e+04    
  7.51X       7.5X      7.51X
+//
+// Unpack32Values bit_width 19:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.02e+04 1.03e+04 1.04e+04    
     1X         1X         1X
+//                      Unpack32Scalar           6.74e+04  6.8e+04 6.85e+04    
  6.59X       6.6X      6.61X
+//                        UnpackScalar           7.43e+04 7.49e+04 7.54e+04    
  7.26X      7.27X      7.28X
+//
+// Unpack32Values bit_width 20:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.02e+04 1.03e+04 1.03e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.28e+04 7.34e+04  7.4e+04    
  7.15X      7.15X      7.15X
+//                        UnpackScalar           7.94e+04 8.02e+04 8.07e+04    
   7.8X      7.81X       7.8X
+//
+// Unpack32Values bit_width 21:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           1.01e+04 1.01e+04 1.02e+04    
     1X         1X         1X
+//                      Unpack32Scalar           6.56e+04 6.62e+04 6.67e+04    
  6.53X      6.54X      6.54X
+//                        UnpackScalar            7.1e+04 7.15e+04 7.19e+04    
  7.06X      7.06X      7.06X
+//
+// Unpack32Values bit_width 22:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader              1e+04 1.01e+04 1.02e+04    
     1X         1X         1X
+//                      Unpack32Scalar           6.68e+04 6.73e+04 6.79e+04    
  6.68X      6.68X      6.68X
+//                        UnpackScalar           7.35e+04 7.41e+04 7.46e+04    
  7.34X      7.35X      7.35X
+//
+// Unpack32Values bit_width 23:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           9.87e+03 9.95e+03    1e+04    
     1X         1X         1X
+//                      Unpack32Scalar           6.44e+04 6.48e+04 6.53e+04    
  6.52X      6.52X      6.51X
+//                        UnpackScalar           6.93e+04 6.97e+04 7.04e+04    
  7.03X      7.01X      7.02X
+//
+// Unpack32Values bit_width 24:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           9.93e+03    1e+04 1.01e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.44e+04 7.49e+04 7.55e+04    
  7.49X      7.49X      7.49X
+//                        UnpackScalar           8.12e+04 8.17e+04 8.27e+04    
  8.18X      8.17X       8.2X
+//
+// Unpack32Values bit_width 25:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           9.71e+03 9.79e+03 9.86e+03    
     1X         1X         1X
+//                      Unpack32Scalar           6.12e+04 6.16e+04 6.22e+04    
  6.31X      6.29X      6.31X
+//                        UnpackScalar           6.44e+04 6.48e+04 6.53e+04    
  6.64X      6.62X      6.62X
+//
+// Unpack32Values bit_width 26:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           9.67e+03 9.74e+03 9.81e+03    
     1X         1X         1X
+//                      Unpack32Scalar           6.21e+04 6.26e+04 6.31e+04    
  6.42X      6.42X      6.43X
+//                        UnpackScalar           6.53e+04 6.59e+04 6.64e+04    
  6.75X      6.77X      6.76X
+//
+// Unpack32Values bit_width 27:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           9.56e+03 9.62e+03  9.7e+03    
     1X         1X         1X
+//                      Unpack32Scalar           5.99e+04 6.03e+04 6.09e+04    
  6.27X      6.27X      6.28X
+//                        UnpackScalar           6.32e+04 6.35e+04 6.42e+04    
  6.61X       6.6X      6.62X
+//
+// Unpack32Values bit_width 28:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           9.53e+03 9.61e+03 9.66e+03    
     1X         1X         1X
+//                      Unpack32Scalar           6.37e+04 6.42e+04 6.47e+04    
  6.69X      6.68X       6.7X
+//                        UnpackScalar           6.68e+04 6.73e+04 6.77e+04    
  7.01X         7X      7.01X
+//
+// Unpack32Values bit_width 29:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           9.41e+03 9.46e+03 9.55e+03    
     1X         1X         1X
+//                      Unpack32Scalar           5.79e+04 5.82e+04 5.87e+04    
  6.15X      6.15X      6.14X
+//                        UnpackScalar           6.08e+04 6.11e+04 6.16e+04    
  6.46X      6.46X      6.46X
+//
+// Unpack32Values bit_width 30:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           9.37e+03 9.45e+03 9.52e+03    
     1X         1X         1X
+//                      Unpack32Scalar           5.87e+04 5.92e+04 5.96e+04    
  6.26X      6.27X      6.26X
+//                        UnpackScalar           6.16e+04  6.2e+04 6.26e+04    
  6.58X      6.56X      6.57X
+//
+// Unpack32Values bit_width 31:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           9.26e+03 9.33e+03 9.41e+03    
     1X         1X         1X
+//                      Unpack32Scalar           5.59e+04 5.63e+04 5.67e+04    
  6.03X      6.03X      6.03X
+//                        UnpackScalar           5.85e+04 5.89e+04 5.94e+04    
  6.31X      6.31X      6.31X
+//
+// Unpack32Values bit_width 32:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
+//                                                                          
(relative) (relative) (relative)
+// 
---------------------------------------------------------------------------------------------------------
+//                           BitReader           9.89e+03 9.96e+03    1e+04    
     1X         1X         1X
+//                      Unpack32Scalar           9.83e+04 9.96e+04 1.01e+05    
  9.95X        10X        10X
+//                        UnpackScalar           8.24e+04 8.36e+04 8.44e+04    
  8.34X       8.4X      8.41X
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "gutil/strings/substitute.h"
+#include "util/benchmark.h"
+#include "util/bit-packing.inline.h"
+#include "util/bit-stream-utils.inline.h"
+#include "util/cpu-info.h"
+
+#include "common/names.h"
+
+using namespace impala;
+
+constexpr int NUM_OUT_VALUES = 1024 * 1024;
+static_assert(NUM_OUT_VALUES % 32 == 0, "NUM_OUT_VALUES must be divisible by 
32");
+
+uint32_t out_buffer[NUM_OUT_VALUES];
+
+struct BenchmarkParams {
+  int bit_width;
+  const uint8_t* data;
+  int64_t data_len;
+};
+
+/// Benchmark calling BitReader::GetValue() in a loop to unpack 32 * 
'batch_size' values.
+void BitReaderBenchmark(int batch_size, void* data) {
+  const BenchmarkParams* p = reinterpret_cast<BenchmarkParams*>(data);
+  BitReader reader(p->data, p->data_len);
+  for (int i = 0; i < batch_size; ++i) {
+    for (int j = 0; j < 32; ++j) {
+      const int64_t offset = (i * 32 + j) % NUM_OUT_VALUES;
+      if (UNLIKELY(!reader.GetValue<uint32_t>(p->bit_width, 
&out_buffer[offset]))) {
+        reader.Reset(p->data, p->data_len);
+        const bool success = reader.GetValue<uint32_t>(p->bit_width, 
&out_buffer[offset]);
+        DCHECK(success);
+      }
+    }
+  }
+}
+
+/// Benchmark calling Unpack32Values() in a loop to unpack 32 * 'batch_size' 
values.
+void Unpack32Benchmark(int batch_size, void* data) {
+  const BenchmarkParams* p = reinterpret_cast<BenchmarkParams*>(data);
+  const uint8_t* pos = reinterpret_cast<const uint8_t*>(p->data);
+  const uint8_t* const data_end = pos + p->data_len;
+  for (int i = 0; i < batch_size; ++i) {
+    if (UNLIKELY(pos >= data_end)) pos = reinterpret_cast<const 
uint8_t*>(p->data);
+    const int64_t offset = (i * 32) % NUM_OUT_VALUES;
+    pos = BitPacking::Unpack32Values(
+        p->bit_width, pos, data_end - pos, out_buffer + offset);
+  }
+}
+
+/// Benchmark calling UnpackValues() to unpack 32 * 'batch_size' values.
+void UnpackBenchmark(int batch_size, void* data) {
+  const BenchmarkParams* p = reinterpret_cast<BenchmarkParams*>(data);
+  const int64_t total_values_to_unpack = 32L * batch_size;
+  for (int64_t unpacked = 0; unpacked < total_values_to_unpack;
+       unpacked += NUM_OUT_VALUES) {
+    const int64_t unpack_batch =
+        min<int64_t>(NUM_OUT_VALUES, total_values_to_unpack - unpacked);
+    BitPacking::UnpackValues(
+        p->bit_width, p->data, p->data_len, unpack_batch, out_buffer);
+  }
+}
+
+int main(int argc, char **argv) {
+  CpuInfo::Init();
+  cout << endl << Benchmark::GetMachineInfo() << endl;
+
+  for (int bit_width = 0; bit_width <= 32; ++bit_width) {
+    Benchmark suite(Substitute("Unpack32Values bit_width $0", bit_width));
+    const int64_t data_len = NUM_OUT_VALUES * bit_width / 8;
+    vector<uint8_t> data(data_len);
+    std::iota(data.begin(), data.end(), 0);
+    BenchmarkParams params{bit_width, data.data(), data_len};
+    suite.AddBenchmark(Substitute("BitReader", bit_width), BitReaderBenchmark, 
&params);
+    suite.AddBenchmark(
+        Substitute("Unpack32Scalar", bit_width), Unpack32Benchmark, &params);
+    suite.AddBenchmark(Substitute("UnpackScalar", bit_width), UnpackBenchmark, 
&params);
+    cout << suite.Measure() << endl;
+  }
+  return 0;
+}
+

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07da7679/be/src/benchmarks/bswap-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/bswap-benchmark.cc 
b/be/src/benchmarks/bswap-benchmark.cc
index 6add717..f62d4fc 100644
--- a/be/src/benchmarks/bswap-benchmark.cc
+++ b/be/src/benchmarks/bswap-benchmark.cc
@@ -25,6 +25,7 @@
 #include "gutil/strings/substitute.h"
 #include "exec/parquet-common.h"
 #include "runtime/decimal-value.h"
+#include "testutil/mem-util.h"
 #include "util/benchmark.h"
 #include "util/cpu-info.h"
 #include "util/bit-util.h"
@@ -116,18 +117,6 @@ void TestSIMDSwap(int batch_size, void* d) {
   BitUtil::ByteSwap(data->outbuffer, data->inbuffer, data->num_values);
 }
 
-// Allocate 64-byte (an x86-64 cache line) aligned memory so it does not 
straddle cache
-// line boundaries. This is sufficient to meet alignment requirements for all 
SIMD
-// instructions, at least up to AVX-512.
-// Exit process if allocation fails.
-void* AllocateAligned(size_t size) {
-  void* ptr;
-  if (posix_memalign(&ptr, 64, size) != 0) {
-    LOG(FATAL) << "Failed to allocate " << size;
-  }
-  return ptr;
-}
-
 // Benchmark routine for FastScalar/"Pure" SSSE3/"Pure" AVX2/SIMD approaches
 void PerfBenchmark() {
   // Measure perf both when memory is perfectly aligned for SIMD and also 
misaligned.
@@ -135,18 +124,16 @@ void PerfBenchmark() {
   const vector<int> misalignments({0, 1, 4, max_misalignment});
   const int data_len = 1 << 20;
 
-  const unique_ptr<uint8_t, decltype(free)*> inbuffer(
-      reinterpret_cast<uint8_t*>(AllocateAligned(data_len + 
max_misalignment)), free);
-  const unique_ptr<uint8_t, decltype(free)*> outbuffer(
-      reinterpret_cast<uint8_t*>(AllocateAligned(data_len + 
max_misalignment)), free);
+  AlignedAllocation inbuffer(data_len + max_misalignment);
+  AlignedAllocation outbuffer(data_len + max_misalignment);
 
   for (const int misalign : misalignments) {
     Benchmark suite(Substitute("ByteSwap benchmark misalignment=$0", 
misalign));
     TestData data;
 
     data.num_values = data_len;
-    data.inbuffer = inbuffer.get() + misalign;
-    data.outbuffer = outbuffer.get() + misalign;
+    data.inbuffer = inbuffer.data() + misalign;
+    data.outbuffer = outbuffer.data() + misalign;
     InitData(data.inbuffer, data_len);
 
     const int baseline = suite.AddBenchmark("FastScalar", TestFastScalarSwap, 
&data, -1);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07da7679/be/src/exprs/expr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index 88565e1..0a6b720 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -15,16 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <math.h>
+#include <time.h>
 #include <limits>
 #include <map>
-#include <math.h>
 #include <string>
-#include <time.h>
 
 #include <boost/date_time/c_local_time_adjustor.hpp>
 #include <boost/date_time/posix_time/posix_time.hpp>
 #include <boost/lexical_cast.hpp>
-#include <boost/random/mersenne_twister.hpp>
 #include <boost/regex.hpp>
 #include <boost/unordered_map.hpp>
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07da7679/be/src/testutil/mem-util.h
----------------------------------------------------------------------
diff --git a/be/src/testutil/mem-util.h b/be/src/testutil/mem-util.h
new file mode 100644
index 0000000..78b7b48
--- /dev/null
+++ b/be/src/testutil/mem-util.h
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_TESTUTIL_MEM_UTIL_H_
+#define IMPALA_TESTUTIL_MEM_UTIL_H_
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "gutil/macros.h"
+
+namespace impala {
+
+/// Allocate 64-byte (an x86-64 cache line) aligned memory so it does not 
straddle cache
+/// line boundaries. This is sufficient to meet alignment requirements for all 
SIMD
+/// instructions, at least up to AVX-512.
+/// Exits process if allocation fails so should be used for tests and 
benchmarks only.
+inline uint8_t* AllocateAligned(size_t size) {
+  void* ptr;
+  if (posix_memalign(&ptr, 64, size) != 0) {
+    LOG(FATAL) << "Failed to allocate " << size;
+  }
+  return reinterpret_cast<uint8_t*>(ptr);
+}
+
+/// Scoped allocation with 64-bit alignment.
+/// Exits process if allocation fails so should be used for tests and 
benchmarks only.
+class AlignedAllocation {
+ public:
+  AlignedAllocation(size_t bytes) : data_(AllocateAligned(bytes)) {}
+  ~AlignedAllocation() { free(data_); }
+
+  uint8_t* data() { return data_; }
+ private:
+  DISALLOW_COPY_AND_ASSIGN(AlignedAllocation);
+
+  uint8_t* data_;
+};
+
+}
+
+#endif
+

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07da7679/be/src/util/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt
index 0dfd12e..ecc222a 100644
--- a/be/src/util/CMakeLists.txt
+++ b/be/src/util/CMakeLists.txt
@@ -105,6 +105,7 @@ target_link_libraries(loggingsupport 
${IMPALA_LINK_LIBS_DYNAMIC_TARGETS})
 
 ADD_BE_TEST(benchmark-test)
 ADD_BE_TEST(bitmap-test)
+ADD_BE_TEST(bit-packing-test)
 ADD_BE_TEST(bit-util-test)
 ADD_BE_TEST(blocking-queue-test)
 ADD_BE_TEST(bloom-filter-test)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07da7679/be/src/util/bit-packing-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bit-packing-test.cc b/be/src/util/bit-packing-test.cc
new file mode 100644
index 0000000..bedf178
--- /dev/null
+++ b/be/src/util/bit-packing-test.cc
@@ -0,0 +1,159 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include "testutil/gtest-util.h"
+#include "testutil/mem-util.h"
+#include "util/bit-packing.inline.h"
+#include "util/bit-stream-utils.inline.h"
+
+#include "common/names.h"
+
+using std::uniform_int_distribution;
+using std::mt19937;
+
+namespace impala {
+
+namespace {
+uint32_t ComputeMask(int bit_width) {
+  return (1U << bit_width) - 1;
+}
+}
+
+/// Test unpacking a subarray of values to/from smaller buffers that are sized 
to exactly
+/// fit the the input and output. 'in' is the original unpacked input, 
'packed' is the
+/// bit-packed data. The test copies 'num_in_values' packed values to a 
smaller temporary
+/// buffer, then unpacks them to another temporary buffer. Both buffers are 
sized to the
+/// minimum number of bytes required to fit the packed/unpacked data.
+///
+/// This is to test that we do not overrun either the input or output buffer 
for smaller
+/// batch sizes.
+void UnpackSubset(const uint32_t* in, const uint8_t* packed, int num_in_values,
+    int bit_width, bool aligned);
+
+/// Test a packing/unpacking round-trip of the 'num_in_values' values in 'in',
+/// packed with 'bit_width'. If 'aligned' is true, buffers for packed and 
unpacked data
+/// are allocated at a 64-byte aligned address. Otherwise the buffers are 
misaligned
+/// by 1 byte from a 64-byte aligned address.
+void PackUnpack(const uint32_t* in, int num_in_values, int bit_width, bool 
aligned) {
+  LOG(INFO) << "num_in_values = " << num_in_values << " bit_width = " << 
bit_width
+            << " aligned = " << aligned;
+
+  // Mask out higher bits so that the values to pack are in range.
+  const uint32_t mask = ComputeMask(bit_width);
+  const int misalignment = aligned ? 0 : 1;
+
+  const int bytes_required = BitUtil::RoundUpNumBytes(bit_width * 
num_in_values);
+  AlignedAllocation storage(bytes_required + misalignment);
+  uint8_t* packed = storage.data() + misalignment;
+
+  BitWriter writer(packed, bytes_required);
+  if (bit_width > 0) {
+    for (int i = 0; i < num_in_values; ++i) {
+      ASSERT_TRUE(writer.PutValue(in[i] & mask, bit_width));
+    }
+  }
+  writer.Flush();
+  LOG(INFO) << "Wrote " << writer.bytes_written() << " bytes.";
+
+  // Test unpacking all the values. Trying to unpack extra values should have 
the same
+  // result because the input buffer size 'num_in_values' limits the number of 
values to
+  // return.
+  for (const int num_to_unpack : {num_in_values, num_in_values + 1, 
num_in_values + 77}) {
+    LOG(INFO) << "Unpacking " << num_to_unpack;
+    // Size buffer exactly so that ASAN can detect reads/writes that overrun 
the buffer.
+    AlignedAllocation out_storage(num_to_unpack * sizeof(uint32_t) + 
misalignment);
+    uint32_t* out = reinterpret_cast<uint32_t*>(out_storage.data() + 
misalignment);
+    const auto result = BitPacking::UnpackValues(
+        bit_width, packed, writer.bytes_written(), num_to_unpack, out);
+    ASSERT_EQ(packed + writer.bytes_written(), result.first)
+        << "Unpacked different # of bytes from the # written";
+    if (bit_width == 0) {
+      // If no bits, we can get back as many as we ask for.
+      ASSERT_EQ(num_to_unpack, result.second) << "Unpacked wrong # of values";
+    } else if (bit_width < CHAR_BIT) {
+      // We may get back some garbage values that we didn't actually pack if we
+      // didn't use all of the trailing byte.
+      const int max_packed_values = writer.bytes_written() * CHAR_BIT / 
bit_width;
+      ASSERT_EQ(min(num_to_unpack, max_packed_values), result.second)
+          << "Unpacked wrong # of values";
+    } else {
+      ASSERT_EQ(num_in_values, result.second) << "Unpacked wrong # of values";
+    }
+
+    for (int i = 0; i < num_in_values; ++i) {
+      EXPECT_EQ(in[i] & mask, out[i]) << "Didn't get back input value " << i;
+    }
+  }
+  UnpackSubset(in, packed, num_in_values, bit_width, aligned);
+}
+
+void UnpackSubset(const uint32_t* in, const uint8_t* packed, int num_in_values,
+    int bit_width, bool aligned) {
+  const int misalignment = aligned ? 0 : 1;
+  for (int num_to_unpack : {1, 10, 77, num_in_values - 7}) {
+    if (num_to_unpack < 0 || num_to_unpack > num_in_values) continue;
+
+    // Size buffers exactly so that ASAN can detect buffer overruns.
+    const int64_t bytes_to_read = BitUtil::RoundUpNumBytes(num_to_unpack * 
bit_width);
+    AlignedAllocation packed_copy_storage(bytes_to_read + misalignment);
+    uint8_t* packed_copy = packed_copy_storage.data() + misalignment;
+    memcpy(packed_copy, packed, bytes_to_read);
+    AlignedAllocation out_storage(num_to_unpack * sizeof(uint32_t) + 
misalignment);
+    uint32_t* out = reinterpret_cast<uint32_t*>(out_storage.data() + 
misalignment);
+    const auto result = BitPacking::UnpackValues(
+        bit_width, packed_copy, bytes_to_read, num_to_unpack, out);
+    ASSERT_EQ(packed_copy + bytes_to_read, result.first) << "Read wrong # of 
bytes";
+    ASSERT_EQ(num_to_unpack, result.second) << "Unpacked wrong # of values";
+
+    for (int i = 0; i < num_to_unpack; ++i) {
+      ASSERT_EQ(in[i] & ComputeMask(bit_width), out[i]) << "Didn't get back 
input value "
+                                                         << i;
+    }
+  }
+}
+
+TEST(BitPackingTest, RandomUnpack) {
+  constexpr int NUM_IN_VALUES = 64 * 1024;
+  uint32_t in[NUM_IN_VALUES];
+  mt19937 rng;
+  uniform_int_distribution<uint32_t> dist;
+  std::generate(std::begin(in), std::end(in), [&rng, &dist] { return 
dist(rng); });
+
+  // Test various odd input lengths to exercise boundary cases for full and 
partial
+  // batches of 32.
+  vector<int> lengths{NUM_IN_VALUES, NUM_IN_VALUES - 1, NUM_IN_VALUES - 16,
+      NUM_IN_VALUES - 19, NUM_IN_VALUES - 31};
+  for (int i = 0; i < 32; ++i) {
+    lengths.push_back(i);
+  }
+
+  for (int bit_width = 0; bit_width <= 32; ++bit_width) {
+    for (const int length : lengths) {
+      // Test that unpacking to/from aligned and unaligned memory works.
+      for (const bool aligned : {true, false}) {
+        PackUnpack(in, length, bit_width, aligned);
+      }
+    }
+  }
+}
+}
+
+IMPALA_TEST_MAIN();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07da7679/be/src/util/bit-packing.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-packing.h b/be/src/util/bit-packing.h
new file mode 100644
index 0000000..62e5e88
--- /dev/null
+++ b/be/src/util/bit-packing.h
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_UTIL_BIT_PACKING_H
+#define IMPALA_UTIL_BIT_PACKING_H
+
+namespace impala {
+
+#include <cstdint>
+
+#include <utility>
+
+/// Utilities for manipulating bit-packed values. Bit-packing is a technique 
for
+/// compressing integer values that do not use the full range of the integer 
type.
+/// E.g. an array of uint32_t values with range [0, 31] only uses the lower 5 
bits
+/// of every uint32_t value, or an array of 0/1 booleans only uses the lowest 
bit
+/// of each integer.
+///
+/// Bit-packing always has a "bit width" parameter that determines the range of
+/// representable unsigned values: [0, 2^bit_width - 1]. The packed 
representation
+/// is logically the concatenatation of the lower bits of the input values (in
+/// little-endian order). E.g. the values 1, 2, 3, 4 packed with bit width 4 
results
+/// in the two output bytes: [ 0 0 1 0 | 0 0 0 1 ] [ 0 1 0 0 | 0 0 1 1 ]
+///                               2         1           4         3
+///
+/// Packed values can be split across words, e.g. packing 1, 17 with bit_width 
5 results
+/// in the two output bytes: [ 0 0 1 | 0 0 0 0 1 ] [ x x x x x x | 1 0 ]
+///            lower bits of 17--^         1         next value     ^--upper 
bits of 17
+///
+/// Bit widths from 0 to 32 are supported (0 bit width means that every value 
is 0).
+/// The batched unpacking functions operate on batches of 32 values. This 
batch size
+/// is convenient because for every supported bit width, the end of a 32 value 
batch
+/// falls on a byte boundary. It is also large enough to amortise loop 
overheads.
+class BitPacking {
+ public:
+  /// Unpack bit-packed values with 'bit_width' from 'in' to 'out'. Keeps 
unpacking until
+  /// either all 'in_bytes' are read or 'num_values' values are unpacked. 
'out' must have
+  /// enough space for 'num_values'. 0 <= 'bit_width' <= 32 and 'bit_width' <= 
# of bits
+  /// in OutType. 'in' must point to 'in_bytes' of addressable memory.
+  ///
+  /// Returns a pointer to the byte after the last byte of 'in' that was read 
and also the
+  /// number of values that were read. If the caller wants to continue reading 
packed
+  /// values after the last one returned, it must ensure that the next value 
to unpack
+  /// starts at a byte boundary. This is true if 'num_values' is a multiple of 
32, or
+  /// more generally if (bit_width * num_values) % 8 == 0.
+  template <typename OutType>
+  static std::pair<const uint8_t*, int64_t> UnpackValues(int bit_width,
+      const uint8_t* __restrict__ in, int64_t in_bytes, int64_t num_values,
+      OutType* __restrict__ out);
+
+  /// Unpack exactly 32 values of 'bit_width' from 'in' to 'out'. 'in' must 
point to
+  /// 'in_bytes' of addressable memory, and 'in_bytes' must be at least
+  /// (32 * bit_width / 8). 'out' must have space for 32 OutType values.
+  /// 0 <= 'bit_width' <= 32 and 'bit_width' <= # of bits in OutType.
+  template <typename OutType>
+  static const uint8_t* Unpack32Values(int bit_width, const uint8_t* 
__restrict__ in,
+      int64_t in_bytes, OutType* __restrict__ out);
+
+ private:
+  /// Implementation of Unpack32Values() that uses 32-bit integer loads to
+  /// unpack values with the given BIT_WIDTH from 'in' to 'out'.
+  template <typename OutType, int BIT_WIDTH>
+  static const uint8_t* Unpack32Values(
+      const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ 
out);
+
+  /// Function that unpacks 'num_values' values with the given BIT_WIDTH from 
'in' to
+  /// 'out'. 'num_values' can be at most 32. The version with 'bit_width' as 
an argument
+  /// dispatches based on 'bit_width' to the appropriate templated 
implementation.
+  template <typename OutType, int BIT_WIDTH>
+  static const uint8_t* UnpackUpTo32Values(const uint8_t* __restrict__ in,
+      int64_t in_bytes, int num_values, OutType* __restrict__ out);
+  template <typename OutType>
+  static const uint8_t* UnpackUpTo32Values(int bit_width, const uint8_t* 
__restrict__ in,
+      int64_t in_bytes, int num_values, OutType* __restrict__ out);
+};
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07da7679/be/src/util/bit-packing.inline.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-packing.inline.h b/be/src/util/bit-packing.inline.h
new file mode 100644
index 0000000..37d51ab
--- /dev/null
+++ b/be/src/util/bit-packing.inline.h
@@ -0,0 +1,202 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_UTIL_BIT_PACKING_INLINE_H
+#define IMPALA_UTIL_BIT_PACKING_INLINE_H
+
+#include "util/bit-packing.h"
+
+#include <algorithm>
+#include <type_traits>
+
+#include <boost/preprocessor/repetition/repeat_from_to.hpp>
+
+#include "common/compiler-util.h"
+#include "common/logging.h"
+#include "util/bit-util.h"
+
+namespace impala {
+
+template <typename OutType>
+std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues(int bit_width,
+    const uint8_t* __restrict__ in, int64_t in_bytes, int64_t num_values,
+    OutType* __restrict__ out) {
+  constexpr int BATCH_SIZE = 32;
+  const int64_t max_input_values =
+      bit_width ? (in_bytes * CHAR_BIT) / bit_width : num_values;
+  const int64_t values_to_read = std::min(num_values, max_input_values);
+  const int64_t batches_to_read = values_to_read / BATCH_SIZE;
+  const int64_t remainder_values = values_to_read % BATCH_SIZE;
+  const uint8_t* in_pos = in;
+  OutType* out_pos = out;
+  // First unpack as many full batches as possible.
+  for (int64_t i = 0; i < batches_to_read; ++i) {
+    in_pos = Unpack32Values<OutType>(bit_width, in_pos, in_bytes, out_pos);
+    out_pos += BATCH_SIZE;
+    in_bytes -= (BATCH_SIZE * bit_width) / CHAR_BIT;
+  }
+  // Then unpack the final partial batch.
+  if (remainder_values > 0) {
+    in_pos = UnpackUpTo32Values<OutType>(bit_width,
+        in_pos, in_bytes, remainder_values, out_pos);
+  }
+  return std::make_pair(in_pos, values_to_read);
+}
+
+// Loop body of unrolled loop that unpacks the value. BIT_WIDTH is the bit 
width of
+// the packed values. 'in_buf' is the start of the input buffer and 'out_vals' 
is the
+// start of the output values array. This function unpacks the VALUE_IDX'th 
packed value
+// from 'in_buf'.
+//
+// This implements essentially the same algorithm as the (Apache-licensed) 
code in
+// bpacking.c at https://github.com/lemire/FrameOfReference/, but is much more 
compact
+// because it uses templates rather than source-level unrolling of all 
combinations.
+//
+// After the template parameters is expanded and constants are propagated, all 
branches
+// and offset/shift calculations should be optimized out, leaving only shifts 
by constants
+// and bitmasks by constants. Calls to this must be stamped out manually or 
with
+// BOOST_PP_REPEAT_FROM_TO: experimentation revealed that the GCC 4.9.2 
optimiser was
+// not able to fully propagate constants and remove branches when this was 
called from
+// inside a for loop with constant bounds with VALUE_IDX changed to a function 
argument.
+template <int BIT_WIDTH, int VALUE_IDX>
+inline uint32_t ALWAYS_INLINE UnpackValue(const uint8_t* __restrict__ in_buf) {
+  constexpr uint32_t LOAD_BIT_WIDTH = sizeof(uint32_t) * CHAR_BIT;
+  static_assert(BIT_WIDTH <= LOAD_BIT_WIDTH, "BIT_WIDTH > LOAD_BIT_WIDTH");
+  static_assert(VALUE_IDX >= 0 && VALUE_IDX < 32, "0 <= VALUE_IDX < 32");
+  // The index of the first bit of the value, relative to the start of 
'in_buf'.
+  constexpr uint32_t FIRST_BIT = VALUE_IDX * BIT_WIDTH;
+  constexpr uint32_t IN_WORD_IDX = FIRST_BIT / LOAD_BIT_WIDTH;
+  constexpr uint32_t FIRST_BIT_OFFSET = FIRST_BIT % LOAD_BIT_WIDTH;
+  // Index of bit after last bit of this value, relative to start of 
IN_WORD_IDX.
+  constexpr uint32_t END_BIT_OFFSET = FIRST_BIT_OFFSET + BIT_WIDTH;
+
+  const uint32_t* in_words = reinterpret_cast<const uint32_t*>(in_buf);
+  // The lower bits of the value come from the first word.
+  const uint32_t lower_bits =
+      BIT_WIDTH > 0 ? in_words[IN_WORD_IDX] >> FIRST_BIT_OFFSET : 0U;
+  if (END_BIT_OFFSET < LOAD_BIT_WIDTH) {
+    // All bits of the value are in the first word, but we need to mask out 
upper bits
+    // that belong to the next value.
+    return lower_bits % (1UL << BIT_WIDTH);
+  } if (END_BIT_OFFSET == LOAD_BIT_WIDTH) {
+    // This value was exactly the uppermost bits of the first word - no 
masking required.
+    return lower_bits;
+  } else {
+    DCHECK_GT(END_BIT_OFFSET, LOAD_BIT_WIDTH);
+    DCHECK_LT(VALUE_IDX, 31)
+        << "Should not go down this branch for last value with no trailing 
bits.";
+    // Value is split between words, so grab trailing bits from the next word.
+    // Force into [0, LOAD_BIT_WIDTH) to avoid spurious shift >= width of type 
warning.
+    constexpr uint32_t NUM_TRAILING_BITS =
+        END_BIT_OFFSET < LOAD_BIT_WIDTH ? 0 : END_BIT_OFFSET - LOAD_BIT_WIDTH;
+    const uint32_t trailing_bits = in_words[IN_WORD_IDX + 1] % (1UL << 
NUM_TRAILING_BITS);
+    // Force into [0, LOAD_BIT_WIDTH) to avoid spurious shift >= width of type 
warning.
+    constexpr uint32_t TRAILING_BITS_SHIFT =
+        BIT_WIDTH == 32 ? 0 : (BIT_WIDTH - NUM_TRAILING_BITS);
+    return lower_bits | (trailing_bits << TRAILING_BITS_SHIFT);
+  }
+}
+
+template <typename OutType, int BIT_WIDTH>
+const uint8_t* BitPacking::Unpack32Values(
+    const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ 
out) {
+  static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
+  static_assert(BIT_WIDTH <= 32, "BIT_WIDTH > 32");
+  static_assert(
+      BIT_WIDTH <= sizeof(OutType) * CHAR_BIT, "BIT_WIDTH too high for output 
type");
+  constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
+  DCHECK_GE(in_bytes, BYTES_TO_READ);
+
+// Call UnpackValue for 0 <= i < 32.
+#pragma push_macro("UNPACK_VALUES_CALL")
+#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
+  out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i>(in));
+  BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
+#pragma pop_macro("UNPACK_VALUES_CALL")
+  return in + BYTES_TO_READ;
+}
+
+template <typename OutType>
+const uint8_t* BitPacking::Unpack32Values(int bit_width, const uint8_t* 
__restrict__ in,
+    int64_t in_bytes, OutType* __restrict__ out) {
+  switch (bit_width) {
+    // Expand cases from 0 to 32.
+#pragma push_macro("UNPACK_VALUES_CASE")
+#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
+    case i: return Unpack32Values<OutType, i>(in, in_bytes, out);
+    BOOST_PP_REPEAT_FROM_TO(0, 33, UNPACK_VALUES_CASE, ignore);
+#pragma pop_macro("UNPACK_VALUES_CASE")
+    default: DCHECK(false); return in;
+  }
+}
+
+template <typename OutType>
+const uint8_t* BitPacking::UnpackUpTo32Values(int bit_width, const uint8_t* 
__restrict__ in,
+    int64_t in_bytes, int num_values, OutType* __restrict__ out) {
+  switch (bit_width) {
+    // Expand cases from 0 to 32.
+#pragma push_macro("UNPACK_VALUES_CASE")
+#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
+    case i: return UnpackUpTo32Values<OutType, i>(in, in_bytes, num_values, 
out);
+    BOOST_PP_REPEAT_FROM_TO(0, 33, UNPACK_VALUES_CASE, ignore);
+#pragma pop_macro("UNPACK_VALUES_CASE")
+    default: DCHECK(false); return in;
+  }
+}
+
+template <typename OutType, int BIT_WIDTH>
+const uint8_t* BitPacking::UnpackUpTo32Values(const uint8_t* __restrict__ in,
+    int64_t in_bytes, int num_values, OutType* __restrict__ out) {
+  static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
+  static_assert(BIT_WIDTH <= 32, "BIT_WIDTH > 32");
+  static_assert(
+      BIT_WIDTH <= sizeof(OutType) * CHAR_BIT, "BIT_WIDTH too high for output 
type");
+  constexpr int MAX_BATCH_SIZE = 31;
+  const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
+  DCHECK_GE(in_bytes, BYTES_TO_READ);
+  DCHECK_LE(num_values, MAX_BATCH_SIZE);
+
+  // Make sure the buffer is at least 1 byte.
+  constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ?
+    (BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
+  uint8_t tmp_buffer[TMP_BUFFER_SIZE];
+
+  const uint8_t* in_buffer = in;
+  // Copy into padded temporary buffer to avoid reading past the end of 'in' 
if the
+  // last 32-bit load would go past the end of the buffer.
+  if (BitUtil::RoundUp(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
+    memcpy(tmp_buffer, in, BYTES_TO_READ);
+    in_buffer = tmp_buffer;
+  }
+
+  // Use switch with fall-through cases to minimise branching.
+  switch (num_values) {
+// Expand cases from 31 down to 1.
+#pragma push_macro("UNPACK_VALUES_CASE")
+#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
+  case 31 - i: out[30 - i] = \
+      static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i>(in_buffer));
+    BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
+#pragma pop_macro("UNPACK_VALUES_CASE")
+    case 0: break;
+    default: DCHECK(false);
+  }
+  return in + BYTES_TO_READ;
+}
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07da7679/be/src/util/bit-stream-utils.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-stream-utils.h b/be/src/util/bit-stream-utils.h
index ce159cb..5acdeee 100644
--- a/be/src/util/bit-stream-utils.h
+++ b/be/src/util/bit-stream-utils.h
@@ -98,13 +98,19 @@ class BitWriter {
 class BitReader {
  public:
   /// 'buffer' is the buffer to read from.  The buffer's length is 
'buffer_len'.
-  BitReader(uint8_t* buffer, int buffer_len) {
-    Reset(buffer, buffer_len);
-  }
+  /// Does not take ownership of the buffer.
+  BitReader(const uint8_t* buffer, int buffer_len) { Reset(buffer, 
buffer_len); }
 
   BitReader() : buffer_(NULL), max_bytes_(0) {}
 
-  void Reset(uint8_t* buffer, int buffer_len) {
+  // The implicit copy constructor is left defined. If a BitReader is copied, 
the
+  // two copies do not share any state. Invoking functions on either copy 
continues
+  // reading from the current read position without modifying the state of the 
other
+  // copy.
+
+  /// Resets the read to start reading from the start of 'buffer'. The buffer's
+  /// length is 'buffer_len'. Does not take ownership of the buffer.
+  void Reset(const uint8_t* buffer, int buffer_len) {
     buffer_ = buffer;
     max_bytes_ = buffer_len;
     byte_offset_ = 0;
@@ -141,7 +147,7 @@ class BitReader {
   static const int MAX_BITWIDTH = 32;
 
  private:
-  uint8_t* buffer_;
+  const uint8_t* buffer_;
   int max_bytes_;
 
   /// Bytes are memcpy'd from buffer_ and values are read from this variable. 
This is

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07da7679/be/src/util/bit-stream-utils.inline.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-stream-utils.inline.h 
b/be/src/util/bit-stream-utils.inline.h
index fd77974..41648e3 100644
--- a/be/src/util/bit-stream-utils.inline.h
+++ b/be/src/util/bit-stream-utils.inline.h
@@ -86,7 +86,7 @@ inline bool BitWriter::PutVlqInt(int32_t v) {
 
 template<typename T>
 inline bool BitReader::GetValue(int num_bits, T* v) {
-  DCHECK(buffer_ != NULL);
+  DCHECK(num_bits == 0 || buffer_ != NULL);
   // TODO: revisit this limit if necessary
   DCHECK_LE(num_bits, MAX_BITWIDTH);
   DCHECK_LE(num_bits, sizeof(T) * 8);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07da7679/be/src/util/bit-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h
index f947a17..33dd02b 100644
--- a/be/src/util/bit-util.h
+++ b/be/src/util/bit-util.h
@@ -41,17 +41,17 @@ using boost::make_unsigned;
 class BitUtil {
  public:
   /// Returns the ceil of value/divisor
-  static inline int64_t Ceil(int64_t value, int64_t divisor) {
+  constexpr static inline int64_t Ceil(int64_t value, int64_t divisor) {
     return value / divisor + (value % divisor != 0);
   }
 
   /// Returns 'value' rounded up to the nearest multiple of 'factor'
-  static inline int64_t RoundUp(int64_t value, int64_t factor) {
+  constexpr static inline int64_t RoundUp(int64_t value, int64_t factor) {
     return (value + (factor - 1)) / factor * factor;
   }
 
   /// Returns 'value' rounded down to the nearest multiple of 'factor'
-  static inline int64_t RoundDown(int64_t value, int64_t factor) {
+  constexpr static inline int64_t RoundDown(int64_t value, int64_t factor) {
     return (value / factor) * factor;
   }
 
@@ -85,34 +85,28 @@ class BitUtil {
   /// Specialized round up and down functions for frequently used factors,
   /// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64).
   /// Returns the rounded up number of bytes that fit the number of bits.
-  static inline uint32_t RoundUpNumBytes(uint32_t bits) {
+  constexpr static inline uint32_t RoundUpNumBytes(uint32_t bits) {
     return (bits + 7) >> 3;
   }
 
   /// Returns the rounded down number of bytes that fit the number of bits.
-  static inline uint32_t RoundDownNumBytes(uint32_t bits) {
-    return bits >> 3;
-  }
+  constexpr static inline uint32_t RoundDownNumBytes(uint32_t bits) { return 
bits >> 3; }
 
   /// Returns the rounded up to 32 multiple. Used for conversions of bits to 
i32.
-  static inline uint32_t RoundUpNumi32(uint32_t bits) {
+  constexpr static inline uint32_t RoundUpNumi32(uint32_t bits) {
     return (bits + 31) >> 5;
   }
 
   /// Returns the rounded up 32 multiple.
-  static inline uint32_t RoundDownNumi32(uint32_t bits) {
-    return bits >> 5;
-  }
+  constexpr static inline uint32_t RoundDownNumi32(uint32_t bits) { return 
bits >> 5; }
 
   /// Returns the rounded up to 64 multiple. Used for conversions of bits to 
i64.
-  static inline uint32_t RoundUpNumi64(uint32_t bits) {
+  constexpr static inline uint32_t RoundUpNumi64(uint32_t bits) {
     return (bits + 63) >> 6;
   }
 
   /// Returns the rounded down to 64 multiple.
-  static inline uint32_t RoundDownNumi64(uint32_t bits) {
-    return bits >> 6;
-  }
+  constexpr static inline uint32_t RoundDownNumi64(uint32_t bits) { return 
bits >> 6; }
 
   /// Non hw accelerated pop count.
   /// TODO: we don't use this in any perf sensitive code paths currently.  
There
@@ -172,51 +166,51 @@ class BitUtil {
   /// swap for len > 16.
   static void ByteSwap(void* dest, const void* source, int len);
 
-  /// Converts to big endian format (if not already in big endian) from the
-  /// machine's native endian format.
+/// Converts to big endian format (if not already in big endian) from the
+/// machine's native endian format.
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-  static inline int64_t  ToBigEndian(int64_t value)  { return ByteSwap(value); 
}
+  static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); }
   static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); 
}
-  static inline int32_t  ToBigEndian(int32_t value)  { return ByteSwap(value); 
}
+  static inline int32_t ToBigEndian(int32_t value) { return ByteSwap(value); }
   static inline uint32_t ToBigEndian(uint32_t value) { return ByteSwap(value); 
}
-  static inline int16_t  ToBigEndian(int16_t value)  { return ByteSwap(value); 
}
+  static inline int16_t ToBigEndian(int16_t value) { return ByteSwap(value); }
   static inline uint16_t ToBigEndian(uint16_t value) { return ByteSwap(value); 
}
 #else
-  static inline int64_t  ToBigEndian(int64_t val)  { return val; }
+  static inline int64_t ToBigEndian(int64_t val) { return val; }
   static inline uint64_t ToBigEndian(uint64_t val) { return val; }
-  static inline int32_t  ToBigEndian(int32_t val)  { return val; }
+  static inline int32_t ToBigEndian(int32_t val) { return val; }
   static inline uint32_t ToBigEndian(uint32_t val) { return val; }
-  static inline int16_t  ToBigEndian(int16_t val)  { return val; }
+  static inline int16_t ToBigEndian(int16_t val) { return val; }
   static inline uint16_t ToBigEndian(uint16_t val) { return val; }
 #endif
 
-  /// Converts from big endian format to the machine's native endian format.
+/// Converts from big endian format to the machine's native endian format.
 #if __BYTE_ORDER == __LITTLE_ENDIAN
-  static inline int64_t  FromBigEndian(int64_t value)  { return 
ByteSwap(value); }
+  static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); 
}
   static inline uint64_t FromBigEndian(uint64_t value) { return 
ByteSwap(value); }
-  static inline int32_t  FromBigEndian(int32_t value)  { return 
ByteSwap(value); }
+  static inline int32_t FromBigEndian(int32_t value) { return ByteSwap(value); 
}
   static inline uint32_t FromBigEndian(uint32_t value) { return 
ByteSwap(value); }
-  static inline int16_t  FromBigEndian(int16_t value)  { return 
ByteSwap(value); }
+  static inline int16_t FromBigEndian(int16_t value) { return ByteSwap(value); 
}
   static inline uint16_t FromBigEndian(uint16_t value) { return 
ByteSwap(value); }
 #else
-  static inline int64_t  FromBigEndian(int64_t val)  { return val; }
+  static inline int64_t FromBigEndian(int64_t val) { return val; }
   static inline uint64_t FromBigEndian(uint64_t val) { return val; }
-  static inline int32_t  FromBigEndian(int32_t val)  { return val; }
+  static inline int32_t FromBigEndian(int32_t val) { return val; }
   static inline uint32_t FromBigEndian(uint32_t val) { return val; }
-  static inline int16_t  FromBigEndian(int16_t val)  { return val; }
+  static inline int16_t FromBigEndian(int16_t val) { return val; }
   static inline uint16_t FromBigEndian(uint16_t val) { return val; }
 #endif
 
   /// Returns true if 'value' is a non-negative 32-bit integer.
-  static inline bool IsNonNegative32Bit(int64_t value) {
+  constexpr static inline bool IsNonNegative32Bit(int64_t value) {
     return static_cast<uint64_t>(value) <= std::numeric_limits<int32_t>::max();
   }
 
   /// Logical right shift for signed integer types
   /// This is needed because the C >> operator does arithmetic right shift
   /// Negative shift amounts lead to undefined behavior
-  template<typename T>
-  static T ShiftRightLogical(T v, int shift) {
+  template <typename T>
+  constexpr static T ShiftRightLogical(T v, int shift) {
     // Conversion to unsigned ensures most significant bits always filled with 
0's
     return static_cast<typename make_unsigned<T>::type>(v) >> shift;
   }
@@ -230,15 +224,15 @@ class BitUtil {
 
   /// Set a specific bit to 1
   /// Behavior when bitpos is negative is undefined
-  template<typename T>
-  static T SetBit(T v, int bitpos) {
+  template <typename T>
+  constexpr static T SetBit(T v, int bitpos) {
     return v | (static_cast<T>(0x1) << bitpos);
   }
 
   /// Set a specific bit to 0
   /// Behavior when bitpos is negative is undefined
-  template<typename T>
-  static T UnsetBit(T v, int bitpos) {
+  template <typename T>
+  constexpr static T UnsetBit(T v, int bitpos) {
     return v & ~(static_cast<T>(0x1) << bitpos);
   }
 };

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07da7679/be/src/util/openssl-util-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/openssl-util-test.cc b/be/src/util/openssl-util-test.cc
index b0238bf..ef1b28e 100644
--- a/be/src/util/openssl-util-test.cc
+++ b/be/src/util/openssl-util-test.cc
@@ -15,17 +15,17 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <random>
+
 #include <gtest/gtest.h>
 #include <openssl/rand.h>
-#include <boost/random/mersenne_twister.hpp>
-#include <boost/random/uniform_int.hpp>
 
 #include "common/init.h"
 #include "testutil/gtest-util.h"
 #include "util/openssl-util.h"
 
-using boost::uniform_int;
-using boost::mt19937_64;
+using std::uniform_int_distribution;
+using std::mt19937_64;
 
 namespace impala {
 
@@ -40,7 +40,7 @@ class OpenSSLUtilTest : public ::testing::Test {
     DCHECK_EQ(len % 8, 0);
     for (int64_t i = 0; i < len; i += sizeof(uint64_t)) {
       *(reinterpret_cast<uint64_t*>(&data[i])) =
-          uniform_int<uint64_t>(0, numeric_limits<uint64_t>::max())(rng_);
+          uniform_int_distribution<uint64_t>(0, 
numeric_limits<uint64_t>::max())(rng_);
     }
   }
 

Reply via email to