[impala] branch master updated: Revert "IMPALA-8710: Increase allowed bit width to 64 for bit packing"

boroknagyz Mon, 08 Jul 2019 02:25:56 -0700

This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git



The following commit(s) were added to refs/heads/master by this push:
     new 1776c7a  Revert "IMPALA-8710: Increase allowed bit width to 64 for bit 
packing"
1776c7a is described below

commit 1776c7a54adb929bbd41eea50aed7665282f46f0
Author: Zoltan Borok-Nagy <borokna...@cloudera.com>
AuthorDate: Fri Jul 5 15:15:57 2019 +0000

    Revert "IMPALA-8710: Increase allowed bit width to 64 for bit packing"
    
    This reverts commit b1cbf9e6b786132e86699cbb1e472ec98499bb11.
    
    Change-Id: If18a083c72ed036f0b493506a43e20b2c2450a41
    Reviewed-on: http://gerrit.cloudera.org:8080/13808
    Reviewed-by: Attila Jeges <atti...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 be/src/benchmarks/bit-packing-benchmark.cc | 198 ++++++++++++++---------------
 be/src/util/CMakeLists.txt                 |   2 +-
 be/src/util/bit-packing-test.cc            |  26 ++--
 be/src/util/bit-packing.h                  |  10 +-
 be/src/util/bit-packing.inline.h           | 103 +++++++--------
 be/src/util/bit-stream-utils-test.cc       | 147 ++-------------------
 be/src/util/bit-stream-utils.h             |  39 ++----
 be/src/util/bit-stream-utils.inline.h      |  87 +++----------
 be/src/util/rle-encoding.h                 |   6 +-
 be/src/util/rle-test.cc                    |   2 +-
 10 files changed, 205 insertions(+), 415 deletions(-)

diff --git a/be/src/benchmarks/bit-packing-benchmark.cc 
b/be/src/benchmarks/bit-packing-benchmark.cc
index 6c895ad..79ac68b 100644
--- a/be/src/benchmarks/bit-packing-benchmark.cc
+++ b/be/src/benchmarks/bit-packing-benchmark.cc
@@ -24,237 +24,237 @@
 //   Unpack32Scalar internally.
 //
 //
-// Machine Info: Intel(R) Core(TM) i7-7700 CPU @ 3.60GHz
+// Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
 // Unpack32Values bit_width 0:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.05e+04 1.06e+04 1.07e+04    
     1X         1X         1X
-//                      Unpack32Scalar           1.29e+05 1.32e+05 1.34e+05    
  12.3X      12.5X      12.5X
-//                        UnpackScalar           3.18e+05 3.23e+05 3.26e+05    
  30.3X      30.4X      30.5X
+//                           BitReader           1.57e+04 1.59e+04  1.6e+04    
     1X         1X         1X
+//                      Unpack32Scalar           1.34e+05 1.35e+05 1.36e+05    
  8.51X      8.49X      8.51X
+//                        UnpackScalar           2.08e+05  2.1e+05 2.12e+05    
  13.3X      13.2X      13.2X
 //
 // Unpack32Values bit_width 1:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.28e+04 1.29e+04 1.31e+04    
     1X         1X         1X
-//                      Unpack32Scalar           8.22e+04 8.42e+04  8.5e+04    
   6.4X      6.51X       6.5X
-//                        UnpackScalar           7.96e+04 8.08e+04 8.16e+04    
   6.2X      6.24X      6.24X
+//                           BitReader           1.19e+04  1.2e+04  1.2e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.89e+04 8.94e+04 9.04e+04    
  7.48X      7.46X      7.51X
+//                        UnpackScalar           9.72e+04  9.8e+04 9.86e+04    
  8.18X      8.18X      8.19X
 //
 // Unpack32Values bit_width 2:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
 //                           BitReader           1.18e+04 1.19e+04  1.2e+04    
     1X         1X         1X
-//                      Unpack32Scalar           8.37e+04 8.41e+04 8.49e+04    
  7.06X      7.06X      7.06X
-//                        UnpackScalar            7.6e+04 7.66e+04 7.73e+04    
  6.42X      6.44X      6.43X
+//                      Unpack32Scalar           8.84e+04 8.91e+04 8.99e+04    
  7.49X      7.48X       7.5X
+//                        UnpackScalar           9.68e+04 9.76e+04 9.84e+04    
   8.2X      8.19X      8.21X
 //
 // Unpack32Values bit_width 3:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.26e+04 1.27e+04 1.28e+04    
     1X         1X         1X
-//                      Unpack32Scalar           8.31e+04 8.37e+04 8.44e+04    
  6.61X       6.6X      6.61X
-//                        UnpackScalar           9.42e+04 9.54e+04 9.61e+04    
  7.49X      7.52X      7.52X
+//                           BitReader           1.16e+04 1.17e+04 1.18e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.67e+04 8.72e+04 8.79e+04    
  7.45X      7.42X      7.43X
+//                        UnpackScalar            9.6e+04 9.66e+04 9.74e+04    
  8.25X      8.22X      8.24X
 //
 // Unpack32Values bit_width 4:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.26e+04 1.27e+04 1.28e+04    
     1X         1X         1X
-//                      Unpack32Scalar            8.4e+04 8.46e+04 8.52e+04    
  6.68X      6.68X      6.67X
-//                        UnpackScalar           7.46e+04 7.52e+04 7.59e+04    
  5.93X      5.93X      5.94X
+//                           BitReader           1.08e+04 1.09e+04  1.1e+04    
     1X         1X         1X
+//                      Unpack32Scalar           9.13e+04 9.19e+04 9.25e+04    
  8.44X      8.43X      8.42X
+//                        UnpackScalar           9.65e+04 9.69e+04 9.78e+04    
  8.91X      8.89X       8.9X
 //
 // Unpack32Values bit_width 5:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.23e+04 1.25e+04 1.25e+04    
     1X         1X         1X
-//                      Unpack32Scalar           8.25e+04 8.32e+04 8.39e+04    
  6.68X      6.67X      6.69X
-//                        UnpackScalar           9.37e+04 9.43e+04 9.52e+04    
   7.6X      7.56X      7.58X
+//                           BitReader           1.14e+04 1.15e+04 1.16e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.35e+04 8.42e+04 8.49e+04    
   7.3X      7.31X      7.31X
+//                        UnpackScalar           9.41e+04 9.48e+04 9.56e+04    
  8.22X      8.22X      8.24X
 //
 // Unpack32Values bit_width 6:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.23e+04 1.24e+04 1.25e+04    
     1X         1X         1X
-//                      Unpack32Scalar           8.23e+04 8.32e+04 8.39e+04    
  6.69X       6.7X      6.71X
-//                        UnpackScalar           9.28e+04 9.42e+04 9.52e+04    
  7.54X      7.59X      7.61X
+//                           BitReader           1.14e+04 1.15e+04 1.16e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.46e+04 8.53e+04  8.6e+04    
   7.4X      7.41X      7.41X
+//                        UnpackScalar           9.35e+04 9.41e+04 9.51e+04    
  8.18X      8.16X       8.2X
 //
 // Unpack32Values bit_width 7:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader            1.2e+04 1.23e+04 1.24e+04    
     1X         1X         1X
-//                      Unpack32Scalar           8.19e+04 8.26e+04 8.32e+04    
   6.8X      6.73X      6.72X
-//                        UnpackScalar           9.26e+04 9.36e+04 9.46e+04    
  7.69X      7.62X      7.64X
+//                           BitReader           1.09e+04  1.1e+04 1.11e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.11e+04 8.16e+04 8.25e+04    
  7.44X      7.44X      7.45X
+//                        UnpackScalar           9.16e+04 9.21e+04  9.3e+04    
   8.4X       8.4X      8.39X
 //
 // Unpack32Values bit_width 8:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.15e+04  1.2e+04 1.23e+04    
     1X         1X         1X
-//                      Unpack32Scalar           6.33e+04 8.31e+04 8.51e+04    
   5.5X       6.9X      6.94X
-//                        UnpackScalar           5.64e+04    7e+04 7.29e+04    
  4.91X      5.82X      5.95X
+//                           BitReader           1.14e+04 1.15e+04 1.16e+04    
     1X         1X         1X
+//                      Unpack32Scalar           9.02e+04 9.07e+04 9.14e+04    
   7.9X       7.9X      7.91X
+//                        UnpackScalar           9.48e+04 9.55e+04 9.63e+04    
  8.31X      8.33X      8.33X
 //
 // Unpack32Values bit_width 9:Function  iters/ms   10%ile   50%ile   90%ile    
 10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.07e+04 1.14e+04 1.16e+04    
     1X         1X         1X
-//                      Unpack32Scalar           6.33e+04 8.16e+04 8.27e+04    
   5.9X      7.12X      7.14X
-//                        UnpackScalar           6.95e+04 9.22e+04 9.32e+04    
  6.47X      8.05X      8.05X
+//                           BitReader           1.11e+04 1.12e+04 1.13e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.94e+04 7.97e+04 8.06e+04    
  7.14X      7.12X      7.14X
+//                        UnpackScalar           8.78e+04 8.83e+04  8.9e+04    
  7.89X      7.88X      7.89X
 //
 // Unpack32Values bit_width 10:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.18e+04 1.19e+04  1.2e+04    
     1X         1X         1X
-//                      Unpack32Scalar           8.13e+04 8.18e+04 8.23e+04    
  6.89X      6.85X      6.84X
-//                        UnpackScalar           9.17e+04 9.26e+04 9.34e+04    
  7.76X      7.76X      7.75X
+//                           BitReader            1.1e+04 1.11e+04 1.12e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.07e+04 8.14e+04 8.21e+04    
  7.31X      7.32X      7.34X
+//                        UnpackScalar           8.95e+04 9.02e+04 9.09e+04    
  8.11X      8.12X      8.12X
 //
 // Unpack32Values bit_width 11:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.17e+04 1.18e+04 1.19e+04    
     1X         1X         1X
-//                      Unpack32Scalar           8.07e+04 8.14e+04 8.22e+04    
  6.89X       6.9X      6.92X
-//                        UnpackScalar           9.13e+04 9.22e+04 9.29e+04    
  7.79X      7.82X      7.83X
+//                           BitReader           1.09e+04  1.1e+04 1.11e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.63e+04 7.69e+04 7.75e+04    
  6.99X      6.99X      6.99X
+//                        UnpackScalar           8.55e+04 8.61e+04 8.69e+04    
  7.83X      7.83X      7.84X
 //
 // Unpack32Values bit_width 12:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.17e+04 1.17e+04 1.19e+04    
     1X         1X         1X
-//                      Unpack32Scalar           8.07e+04 8.14e+04  8.2e+04    
  6.92X      6.93X      6.92X
-//                        UnpackScalar           8.94e+04 9.14e+04 9.23e+04    
  7.67X      7.78X      7.79X
+//                           BitReader           1.09e+04  1.1e+04  1.1e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.23e+04 8.29e+04 8.35e+04    
  7.55X      7.56X      7.57X
+//                        UnpackScalar           9.06e+04 9.12e+04 9.19e+04    
  8.31X      8.31X      8.33X
 //
 // Unpack32Values bit_width 13:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.13e+04 1.15e+04 1.16e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.85e+04 8.05e+04 8.14e+04    
  6.93X      6.98X      6.99X
-//                        UnpackScalar           8.77e+04 9.09e+04 9.17e+04    
  7.74X      7.88X      7.87X
+//                           BitReader           1.07e+04 1.08e+04 1.09e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.42e+04 7.47e+04 7.55e+04    
  6.92X       6.9X      6.92X
+//                        UnpackScalar           8.16e+04 8.23e+04 8.29e+04    
   7.6X       7.6X      7.61X
 //
 // Unpack32Values bit_width 14:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.14e+04 1.15e+04 1.16e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.81e+04    8e+04 8.06e+04    
  6.84X      6.95X      6.94X
-//                        UnpackScalar           8.87e+04 9.01e+04  9.1e+04    
  7.77X      7.83X      7.84X
+//                           BitReader           1.07e+04 1.08e+04 1.09e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.58e+04 7.62e+04 7.68e+04    
  7.08X      7.08X      7.08X
+//                        UnpackScalar           8.33e+04 8.38e+04 8.46e+04    
  7.78X      7.78X      7.79X
 //
 // Unpack32Values bit_width 15:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.12e+04 1.13e+04 1.14e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.66e+04 7.96e+04 8.06e+04    
  6.85X      7.01X      7.04X
-//                        UnpackScalar           8.81e+04 8.94e+04 9.04e+04    
  7.87X      7.88X       7.9X
+//                           BitReader           1.06e+04 1.06e+04 1.07e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.16e+04 7.22e+04 7.29e+04    
  6.78X      6.79X      6.79X
+//                        UnpackScalar           7.96e+04 8.05e+04 8.09e+04    
  7.54X      7.57X      7.54X
 //
 // Unpack32Values bit_width 16:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.13e+04 1.13e+04 1.14e+04    
     1X         1X         1X
-//                      Unpack32Scalar           8.11e+04 8.19e+04 8.27e+04    
   7.2X      7.22X      7.23X
-//                        UnpackScalar           8.82e+04 8.91e+04 8.97e+04    
  7.84X      7.85X      7.85X
+//                           BitReader           1.08e+04 1.08e+04 1.09e+04    
     1X         1X         1X
+//                      Unpack32Scalar           8.71e+04 8.76e+04 8.83e+04    
  8.09X      8.09X      8.08X
+//                        UnpackScalar           9.22e+04  9.3e+04 9.37e+04    
  8.56X      8.58X      8.57X
 //
 // Unpack32Values bit_width 17:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.11e+04 1.12e+04 1.13e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.75e+04 7.86e+04 7.91e+04    
     7X      7.04X      7.03X
-//                        UnpackScalar           8.75e+04 8.82e+04 8.89e+04    
  7.89X      7.91X       7.9X
+//                           BitReader           1.04e+04 1.04e+04 1.05e+04    
     1X         1X         1X
+//                      Unpack32Scalar           6.98e+04 7.04e+04 7.09e+04    
  6.73X      6.74X      6.74X
+//                        UnpackScalar           7.73e+04 7.78e+04 7.85e+04    
  7.45X      7.45X      7.47X
 //
 // Unpack32Values bit_width 18:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.09e+04 1.11e+04 1.12e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.71e+04 7.81e+04 7.88e+04    
  7.06X      7.02X      7.03X
-//                        UnpackScalar           8.68e+04 8.81e+04 8.89e+04    
  7.95X      7.92X      7.93X
+//                           BitReader           1.03e+04 1.04e+04 1.05e+04    
     1X         1X         1X
+//                      Unpack32Scalar            7.1e+04 7.17e+04 7.22e+04    
  6.86X      6.88X      6.87X
+//                        UnpackScalar           7.77e+04 7.82e+04 7.89e+04    
  7.51X       7.5X      7.51X
 //
 // Unpack32Values bit_width 19:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.09e+04  1.1e+04 1.11e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.69e+04 7.75e+04 7.83e+04    
  7.06X      7.07X      7.07X
-//                        UnpackScalar           8.62e+04 8.72e+04 8.78e+04    
  7.91X      7.96X      7.93X
+//                           BitReader           1.02e+04 1.03e+04 1.04e+04    
     1X         1X         1X
+//                      Unpack32Scalar           6.74e+04  6.8e+04 6.85e+04    
  6.59X       6.6X      6.61X
+//                        UnpackScalar           7.43e+04 7.49e+04 7.54e+04    
  7.26X      7.27X      7.28X
 //
 // Unpack32Values bit_width 20:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.07e+04  1.1e+04 1.11e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.58e+04 7.74e+04 7.82e+04    
   7.1X      7.06X      7.07X
-//                        UnpackScalar           8.43e+04 8.64e+04 8.74e+04    
  7.89X      7.89X       7.9X
+//                           BitReader           1.02e+04 1.03e+04 1.03e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.28e+04 7.34e+04  7.4e+04    
  7.15X      7.15X      7.15X
+//                        UnpackScalar           7.94e+04 8.02e+04 8.07e+04    
   7.8X      7.81X       7.8X
 //
 // Unpack32Values bit_width 21:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.07e+04 1.08e+04 1.09e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.58e+04 7.66e+04 7.76e+04    
  7.07X      7.09X      7.14X
-//                        UnpackScalar           8.51e+04 8.61e+04 8.69e+04    
  7.95X      7.97X         8X
+//                           BitReader           1.01e+04 1.01e+04 1.02e+04    
     1X         1X         1X
+//                      Unpack32Scalar           6.56e+04 6.62e+04 6.67e+04    
  6.53X      6.54X      6.54X
+//                        UnpackScalar            7.1e+04 7.15e+04 7.19e+04    
  7.06X      7.06X      7.06X
 //
 // Unpack32Values bit_width 22:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.05e+04 1.08e+04 1.09e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.49e+04 7.62e+04 7.71e+04    
   7.1X      7.08X       7.1X
-//                        UnpackScalar           8.39e+04 8.59e+04 8.65e+04    
  7.96X      7.98X      7.97X
+//                           BitReader              1e+04 1.01e+04 1.02e+04    
     1X         1X         1X
+//                      Unpack32Scalar           6.68e+04 6.73e+04 6.79e+04    
  6.68X      6.68X      6.68X
+//                        UnpackScalar           7.35e+04 7.41e+04 7.46e+04    
  7.34X      7.35X      7.35X
 //
 // Unpack32Values bit_width 23:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.06e+04 1.07e+04 1.08e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.44e+04 7.54e+04 7.62e+04    
  7.03X      7.07X      7.08X
-//                        UnpackScalar           8.44e+04 8.54e+04 8.61e+04    
  7.97X         8X         8X
+//                           BitReader           9.87e+03 9.95e+03    1e+04    
     1X         1X         1X
+//                      Unpack32Scalar           6.44e+04 6.48e+04 6.53e+04    
  6.52X      6.52X      6.51X
+//                        UnpackScalar           6.93e+04 6.97e+04 7.04e+04    
  7.03X      7.01X      7.02X
 //
 // Unpack32Values bit_width 24:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.05e+04 1.06e+04 1.07e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.54e+04 7.62e+04 7.69e+04    
  7.17X      7.18X       7.2X
-//                        UnpackScalar           8.42e+04 8.51e+04 8.62e+04    
  8.01X      8.01X      8.07X
+//                           BitReader           9.93e+03    1e+04 1.01e+04    
     1X         1X         1X
+//                      Unpack32Scalar           7.44e+04 7.49e+04 7.55e+04    
  7.49X      7.49X      7.49X
+//                        UnpackScalar           8.12e+04 8.17e+04 8.27e+04    
  8.18X      8.17X       8.2X
 //
 // Unpack32Values bit_width 25:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.04e+04 1.05e+04 1.05e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.42e+04 7.49e+04 7.57e+04    
  7.15X      7.17X      7.18X
-//                        UnpackScalar           8.32e+04 8.43e+04 8.51e+04    
  8.01X      8.07X      8.07X
+//                           BitReader           9.71e+03 9.79e+03 9.86e+03    
     1X         1X         1X
+//                      Unpack32Scalar           6.12e+04 6.16e+04 6.22e+04    
  6.31X      6.29X      6.31X
+//                        UnpackScalar           6.44e+04 6.48e+04 6.53e+04    
  6.64X      6.62X      6.62X
 //
 // Unpack32Values bit_width 26:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.02e+04 1.04e+04 1.05e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.24e+04 7.46e+04 7.55e+04    
  7.12X      7.16X      7.19X
-//                        UnpackScalar           8.15e+04 8.43e+04 8.51e+04    
  8.01X      8.09X       8.1X
+//                           BitReader           9.67e+03 9.74e+03 9.81e+03    
     1X         1X         1X
+//                      Unpack32Scalar           6.21e+04 6.26e+04 6.31e+04    
  6.42X      6.42X      6.43X
+//                        UnpackScalar           6.53e+04 6.59e+04 6.64e+04    
  6.75X      6.77X      6.76X
 //
 // Unpack32Values bit_width 27:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.01e+04 1.03e+04 1.04e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.26e+04  7.4e+04 7.47e+04    
  7.18X      7.18X       7.2X
-//                        UnpackScalar           8.13e+04 8.35e+04 8.43e+04    
  8.03X      8.11X      8.12X
+//                           BitReader           9.56e+03 9.62e+03  9.7e+03    
     1X         1X         1X
+//                      Unpack32Scalar           5.99e+04 6.03e+04 6.09e+04    
  6.27X      6.27X      6.28X
+//                        UnpackScalar           6.32e+04 6.35e+04 6.42e+04    
  6.61X       6.6X      6.62X
 //
 // Unpack32Values bit_width 28:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader              1e+04 1.03e+04 1.03e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.07e+04 7.41e+04 7.48e+04    
  7.05X      7.22X      7.23X
-//                        UnpackScalar           7.97e+04 8.34e+04 8.42e+04    
  7.96X      8.13X      8.14X
+//                           BitReader           9.53e+03 9.61e+03 9.66e+03    
     1X         1X         1X
+//                      Unpack32Scalar           6.37e+04 6.42e+04 6.47e+04    
  6.69X      6.68X       6.7X
+//                        UnpackScalar           6.68e+04 6.73e+04 6.77e+04    
  7.01X         7X      7.01X
 //
 // Unpack32Values bit_width 29:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           1.01e+04 1.01e+04 1.02e+04    
     1X         1X         1X
-//                      Unpack32Scalar           7.06e+04 7.27e+04 7.35e+04    
  7.02X      7.18X      7.19X
-//                        UnpackScalar              8e+04 8.23e+04 8.32e+04    
  7.96X      8.12X      8.14X
+//                           BitReader           9.41e+03 9.46e+03 9.55e+03    
     1X         1X         1X
+//                      Unpack32Scalar           5.79e+04 5.82e+04 5.87e+04    
  6.15X      6.15X      6.14X
+//                        UnpackScalar           6.08e+04 6.11e+04 6.16e+04    
  6.46X      6.46X      6.46X
 //
 // Unpack32Values bit_width 30:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           9.95e+03 1.01e+04 1.02e+04    
     1X         1X         1X
-//                      Unpack32Scalar              7e+04 7.23e+04 7.32e+04    
  7.04X      7.16X      7.18X
-//                        UnpackScalar            7.9e+04 8.18e+04 8.24e+04    
  7.94X      8.09X      8.08X
+//                           BitReader           9.37e+03 9.45e+03 9.52e+03    
     1X         1X         1X
+//                      Unpack32Scalar           5.87e+04 5.92e+04 5.96e+04    
  6.26X      6.27X      6.26X
+//                        UnpackScalar           6.16e+04  6.2e+04 6.26e+04    
  6.58X      6.56X      6.57X
 //
 // Unpack32Values bit_width 31:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           9.06e+03 9.33e+03 9.43e+03    
     1X         1X         1X
-//                      Unpack32Scalar           6.52e+04 7.17e+04 7.26e+04    
  7.19X      7.69X       7.7X
-//                        UnpackScalar           7.67e+04  8.1e+04 8.19e+04    
  8.46X      8.69X      8.69X
+//                           BitReader           9.26e+03 9.33e+03 9.41e+03    
     1X         1X         1X
+//                      Unpack32Scalar           5.59e+04 5.63e+04 5.67e+04    
  6.03X      6.03X      6.03X
+//                        UnpackScalar           5.85e+04 5.89e+04 5.94e+04    
  6.31X      6.31X      6.31X
 //
 // Unpack32Values bit_width 32:Function  iters/ms   10%ile   50%ile   90%ile   
  10%ile     50%ile     90%ile
 //                                                                          
(relative) (relative) (relative)
 // 
---------------------------------------------------------------------------------------------------------
-//                           BitReader           9.73e+03 9.96e+03    1e+04    
     1X         1X         1X
-//                      Unpack32Scalar           1.06e+05  1.1e+05 1.11e+05    
  10.9X        11X      11.1X
-//                        UnpackScalar           1.41e+05 1.49e+05  1.5e+05    
  14.5X      14.9X        15X
+//                           BitReader           9.89e+03 9.96e+03    1e+04    
     1X         1X         1X
+//                      Unpack32Scalar           9.83e+04 9.96e+04 1.01e+05    
  9.95X        10X        10X
+//                        UnpackScalar           8.24e+04 8.36e+04 8.44e+04    
  8.34X       8.4X      8.41X
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt
index 21942d3..b12ae2d 100644
--- a/be/src/util/CMakeLists.txt
+++ b/be/src/util/CMakeLists.txt
@@ -159,7 +159,7 @@ target_link_libraries(loggingsupport 
${IMPALA_LINK_LIBS_DYNAMIC_TARGETS})
 ADD_UNIFIED_BE_LSAN_TEST(benchmark-test "BenchmarkTest.*")
 ADD_UNIFIED_BE_LSAN_TEST(bitmap-test "Bitmap.*")
 ADD_UNIFIED_BE_LSAN_TEST(bit-packing-test "BitPackingTest.*")
-ADD_UNIFIED_BE_LSAN_TEST(bit-stream-utils-test "BitArray.*:VLQInt.*")
+ADD_UNIFIED_BE_LSAN_TEST(bit-stream-utils-test "BitArray.*")
 ADD_UNIFIED_BE_LSAN_TEST(bit-util-test "BitUtil.*")
 ADD_UNIFIED_BE_LSAN_TEST(blocking-queue-test "BlockingQueueTest.*")
 ADD_UNIFIED_BE_LSAN_TEST(bloom-filter-test "BloomFilter.*:BloomFilterTest.*")
diff --git a/be/src/util/bit-packing-test.cc b/be/src/util/bit-packing-test.cc
index 8010fe3..2cf86ba 100644
--- a/be/src/util/bit-packing-test.cc
+++ b/be/src/util/bit-packing-test.cc
@@ -32,8 +32,8 @@ using std::mt19937;
 namespace impala {
 
 namespace {
-uint64_t ComputeMask(int bit_width) {
-  return (bit_width < 64) ? ((1UL << bit_width) - 1) : ~0UL;
+uint32_t ComputeMask(int bit_width) {
+  return (bit_width < 32) ? ((1U << bit_width) - 1) : ~0U;
 }
 }
 
@@ -45,19 +45,19 @@ uint64_t ComputeMask(int bit_width) {
 ///
 /// This is to test that we do not overrun either the input or output buffer 
for smaller
 /// batch sizes.
-void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
+void UnpackSubset(const uint32_t* in, const uint8_t* packed, int num_in_values,
     int bit_width, bool aligned);
 
 /// Test a packing/unpacking round-trip of the 'num_in_values' values in 'in',
 /// packed with 'bit_width'. If 'aligned' is true, buffers for packed and 
unpacked data
 /// are allocated at a 64-byte aligned address. Otherwise the buffers are 
misaligned
 /// by 1 byte from a 64-byte aligned address.
-void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool 
aligned) {
+void PackUnpack(const uint32_t* in, int num_in_values, int bit_width, bool 
aligned) {
   LOG(INFO) << "num_in_values = " << num_in_values << " bit_width = " << 
bit_width
             << " aligned = " << aligned;
 
   // Mask out higher bits so that the values to pack are in range.
-  const uint64_t mask = ComputeMask(bit_width);
+  const uint32_t mask = ComputeMask(bit_width);
   const int misalignment = aligned ? 0 : 1;
 
   const int bytes_required = BitUtil::RoundUpNumBytes(bit_width * 
num_in_values);
@@ -79,8 +79,8 @@ void PackUnpack(const uint64_t* in, int num_in_values, int 
bit_width, bool align
   for (const int num_to_unpack : {num_in_values, num_in_values + 1, 
num_in_values + 77}) {
     LOG(INFO) << "Unpacking " << num_to_unpack;
     // Size buffer exactly so that ASAN can detect reads/writes that overrun 
the buffer.
-    AlignedAllocation out_storage(num_to_unpack * sizeof(uint64_t) + 
misalignment);
-    uint64_t* out = reinterpret_cast<uint64_t*>(out_storage.data() + 
misalignment);
+    AlignedAllocation out_storage(num_to_unpack * sizeof(uint32_t) + 
misalignment);
+    uint32_t* out = reinterpret_cast<uint32_t*>(out_storage.data() + 
misalignment);
     const auto result = BitPacking::UnpackValues(
         bit_width, packed, writer.bytes_written(), num_to_unpack, out);
     ASSERT_EQ(packed + writer.bytes_written(), result.first)
@@ -105,7 +105,7 @@ void PackUnpack(const uint64_t* in, int num_in_values, int 
bit_width, bool align
   UnpackSubset(in, packed, num_in_values, bit_width, aligned);
 }
 
-void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
+void UnpackSubset(const uint32_t* in, const uint8_t* packed, int num_in_values,
     int bit_width, bool aligned) {
   const int misalignment = aligned ? 0 : 1;
   for (int num_to_unpack : {1, 10, 77, num_in_values - 7}) {
@@ -116,8 +116,8 @@ void UnpackSubset(const uint64_t* in, const uint8_t* 
packed, int num_in_values,
     AlignedAllocation packed_copy_storage(bytes_to_read + misalignment);
     uint8_t* packed_copy = packed_copy_storage.data() + misalignment;
     memcpy(packed_copy, packed, bytes_to_read);
-    AlignedAllocation out_storage(num_to_unpack * sizeof(uint64_t) + 
misalignment);
-    uint64_t* out = reinterpret_cast<uint64_t*>(out_storage.data() + 
misalignment);
+    AlignedAllocation out_storage(num_to_unpack * sizeof(uint32_t) + 
misalignment);
+    uint32_t* out = reinterpret_cast<uint32_t*>(out_storage.data() + 
misalignment);
     const auto result = BitPacking::UnpackValues(
         bit_width, packed_copy, bytes_to_read, num_to_unpack, out);
     ASSERT_EQ(packed_copy + bytes_to_read, result.first) << "Read wrong # of 
bytes";
@@ -132,9 +132,9 @@ void UnpackSubset(const uint64_t* in, const uint8_t* 
packed, int num_in_values,
 
 TEST(BitPackingTest, RandomUnpack) {
   constexpr int NUM_IN_VALUES = 64 * 1024;
-  uint64_t in[NUM_IN_VALUES];
+  uint32_t in[NUM_IN_VALUES];
   mt19937 rng;
-  uniform_int_distribution<uint64_t> dist;
+  uniform_int_distribution<uint32_t> dist;
   std::generate(std::begin(in), std::end(in), [&rng, &dist] { return 
dist(rng); });
 
   // Test various odd input lengths to exercise boundary cases for full and 
partial
@@ -145,7 +145,7 @@ TEST(BitPackingTest, RandomUnpack) {
     lengths.push_back(i);
   }
 
-  for (int bit_width = 0; bit_width <= 64; ++bit_width) {
+  for (int bit_width = 0; bit_width <= 32; ++bit_width) {
     for (const int length : lengths) {
       // Test that unpacking to/from aligned and unaligned memory works.
       for (const bool aligned : {true, false}) {
diff --git a/be/src/util/bit-packing.h b/be/src/util/bit-packing.h
index d449048..c70b55e 100644
--- a/be/src/util/bit-packing.h
+++ b/be/src/util/bit-packing.h
@@ -40,17 +40,15 @@ namespace impala {
 /// in the two output bytes: [ 0 0 1 | 0 0 0 0 1 ] [ x x x x x x | 1 0 ]
 ///            lower bits of 17--^         1         next value     ^--upper 
bits of 17
 ///
-/// Bit widths from 0 to 64 are supported (0 bit width means that every value 
is 0).
+/// Bit widths from 0 to 32 are supported (0 bit width means that every value 
is 0).
 /// The batched unpacking functions operate on batches of 32 values. This 
batch size
 /// is convenient because for every supported bit width, the end of a 32 value 
batch
 /// falls on a byte boundary. It is also large enough to amortise loop 
overheads.
 class BitPacking {
  public:
-  static constexpr int MAX_BITWIDTH = sizeof(uint64_t) * 8;
-
   /// Unpack bit-packed values with 'bit_width' from 'in' to 'out'. Keeps 
unpacking until
   /// either all 'in_bytes' are read or 'num_values' values are unpacked. 
'out' must have
-  /// enough space for 'num_values'. 0 <= 'bit_width' <= 64 and 'bit_width' <= 
# of bits
+  /// enough space for 'num_values'. 0 <= 'bit_width' <= 32 and 'bit_width' <= 
# of bits
   /// in OutType. 'in' must point to 'in_bytes' of addressable memory.
   ///
   /// Returns a pointer to the byte after the last byte of 'in' that was read 
and also the
@@ -88,7 +86,7 @@ class BitPacking {
   /// Unpack exactly 32 values of 'bit_width' from 'in' to 'out'. 'in' must 
point to
   /// 'in_bytes' of addressable memory, and 'in_bytes' must be at least
   /// (32 * bit_width / 8). 'out' must have space for 32 OutType values.
-  /// 0 <= 'bit_width' <= 64 and 'bit_width' <= # of bits in OutType.
+  /// 0 <= 'bit_width' <= 32 and 'bit_width' <= # of bits in OutType.
   template <typename OutType>
   static const uint8_t* Unpack32Values(int bit_width, const uint8_t* 
__restrict__ in,
       int64_t in_bytes, OutType* __restrict__ out);
@@ -115,7 +113,7 @@ class BitPacking {
   /// 'num_values' must be at most 31. 'in' must point to 'in_bytes' of 
addressable
   /// memory, and 'in_bytes' must be at least ceil(num_values * bit_width / 8).
   /// 'out' must have space for 'num_values' OutType values.
-  /// 0 <= 'bit_width' <= 64 and 'bit_width' <= # of bits in OutType.
+  /// 0 <= 'bit_width' <= 32 and 'bit_width' <= # of bits in OutType.
   template <typename OutType, int BIT_WIDTH>
   static const uint8_t* UnpackUpTo31Values(const uint8_t* __restrict__ in,
       int64_t in_bytes, int num_values, OutType* __restrict__ out);
diff --git a/be/src/util/bit-packing.inline.h b/be/src/util/bit-packing.inline.h
index bbaa23b..8cebe40 100644
--- a/be/src/util/bit-packing.inline.h
+++ b/be/src/util/bit-packing.inline.h
@@ -58,8 +58,8 @@ std::pair<const uint8_t*, int64_t> 
BitPacking::UnpackValues(int bit_width,
     return UnpackValues<OutType, i>(in, in_bytes, num_values, out);
 
   switch (bit_width) {
-    // Expand cases from 0 to 64.
-    BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore);
+    // Expand cases from 0 to 32.
+    BOOST_PP_REPEAT_FROM_TO(0, 33, UNPACK_VALUES_CASE, ignore);
     default:
       DCHECK(false);
       return std::make_pair(nullptr, -1);
@@ -77,14 +77,12 @@ std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues(
   const int64_t remainder_values = values_to_read % BATCH_SIZE;
   const uint8_t* in_pos = in;
   OutType* out_pos = out;
-
   // First unpack as many full batches as possible.
   for (int64_t i = 0; i < batches_to_read; ++i) {
     in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
     out_pos += BATCH_SIZE;
     in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
   }
-
   // Then unpack the final partial batch.
   if (remainder_values > 0) {
     in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(
@@ -105,14 +103,15 @@ std::pair<const uint8_t*, int64_t> 
BitPacking::UnpackAndDecodeValues(int bit_wid
         in, in_bytes, dict, dict_len, num_values, out, stride, decode_error);
 
   switch (bit_width) {
-    // Expand cases from 0 to 64.
-    BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore);
+    // Expand cases from 0 to 32.
+    BOOST_PP_REPEAT_FROM_TO(0, 33, UNPACK_VALUES_CASE, ignore);
     default:
       DCHECK(false);
       return std::make_pair(nullptr, -1);
   }
 #pragma pop_macro("UNPACK_VALUES_CASE")
 }
+
 template <typename OutType, int BIT_WIDTH>
 std::pair<const uint8_t*, int64_t> BitPacking::UnpackAndDecodeValues(
     const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ 
dict,
@@ -150,57 +149,49 @@ std::pair<const uint8_t*, int64_t> 
BitPacking::UnpackAndDecodeValues(
 // bpacking.c at https://github.com/lemire/FrameOfReference/, but is much more 
compact
 // because it uses templates rather than source-level unrolling of all 
combinations.
 //
-// After the template parameters are expanded and constants are propagated, 
all branches
+// After the template parameters is expanded and constants are propagated, all 
branches
 // and offset/shift calculations should be optimized out, leaving only shifts 
by constants
 // and bitmasks by constants. Calls to this must be stamped out manually or 
with
 // BOOST_PP_REPEAT_FROM_TO: experimentation revealed that the GCC 4.9.2 
optimiser was
 // not able to fully propagate constants and remove branches when this was 
called from
 // inside a for loop with constant bounds with VALUE_IDX changed to a function 
argument.
-//
-// We compute how many 32 bit words we have to read, which is either 1, 2 or 
3. If it is
-// at least 2, the first two 32 bit words are read as one 64 bit word. Even if 
only one
-// word needs to be read, we try to read 64 bits if it does not lead to buffer 
overflow
-// because benchmarks show that it has a positive effect on performance.
 template <int BIT_WIDTH, int VALUE_IDX>
-inline uint64_t ALWAYS_INLINE UnpackValue(const uint8_t* __restrict__ in_buf) {
-  if (BIT_WIDTH == 0) return 0;
-
-  constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
-  constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
-  constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
-  constexpr int LAST_WORD_IDX = BitUtil::RoundUpNumi32(LAST_BIT_IDX);
-  constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
-  static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be 
loaded.");
-
-  constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
-  constexpr uint64_t mask = BIT_WIDTH == 64 ? ~0L : (1UL << BIT_WIDTH) - 1;
-  const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
-
-  // Avoid reading past the end of the buffer.
-  constexpr bool CAN_READ_64 = FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= 
BIT_WIDTH * 32;
-
-  // We do not try to read 64 bits when the bit width is a power of two 
(unless it is
-  // necessary) because performance benchmarks show that it is better this 
way. We can
-  // revisit it when we update the compiler version.
-  constexpr bool READ_32_BITS = WORDS_TO_READ == 1
-      && (!CAN_READ_64 || BitUtil::IsPowerOf2(BIT_WIDTH));
-
-  if (READ_32_BITS) {
-    uint32_t word = in[FIRST_WORD_IDX];
-    word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
-    return word & mask;
-  }
-
-  uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
-  word >>= FIRST_BIT_OFFSET;
-
-  if (WORDS_TO_READ > 2) {
-    constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - 
FIRST_BIT_OFFSET;
-    uint64_t extra_word = in[FIRST_WORD_IDX + 2];
-    word |= extra_word << USEFUL_BITS;
+inline uint32_t ALWAYS_INLINE UnpackValue(const uint8_t* __restrict__ in_buf) {
+  constexpr uint32_t LOAD_BIT_WIDTH = sizeof(uint32_t) * CHAR_BIT;
+  static_assert(BIT_WIDTH <= LOAD_BIT_WIDTH, "BIT_WIDTH > LOAD_BIT_WIDTH");
+  static_assert(VALUE_IDX >= 0 && VALUE_IDX < 32, "0 <= VALUE_IDX < 32");
+  // The index of the first bit of the value, relative to the start of 
'in_buf'.
+  constexpr uint32_t FIRST_BIT = VALUE_IDX * BIT_WIDTH;
+  constexpr uint32_t IN_WORD_IDX = FIRST_BIT / LOAD_BIT_WIDTH;
+  constexpr uint32_t FIRST_BIT_OFFSET = FIRST_BIT % LOAD_BIT_WIDTH;
+  // Index of bit after last bit of this value, relative to start of 
IN_WORD_IDX.
+  constexpr uint32_t END_BIT_OFFSET = FIRST_BIT_OFFSET + BIT_WIDTH;
+
+  const uint32_t* in_words = reinterpret_cast<const uint32_t*>(in_buf);
+  // The lower bits of the value come from the first word.
+  const uint32_t lower_bits =
+      BIT_WIDTH > 0 ? in_words[IN_WORD_IDX] >> FIRST_BIT_OFFSET : 0U;
+  if (END_BIT_OFFSET < LOAD_BIT_WIDTH) {
+    // All bits of the value are in the first word, but we need to mask out 
upper bits
+    // that belong to the next value.
+    return lower_bits % (1UL << BIT_WIDTH);
+  } if (END_BIT_OFFSET == LOAD_BIT_WIDTH) {
+    // This value was exactly the uppermost bits of the first word - no 
masking required.
+    return lower_bits;
+  } else {
+    DCHECK_GT(END_BIT_OFFSET, LOAD_BIT_WIDTH);
+    DCHECK_LT(VALUE_IDX, 31)
+        << "Should not go down this branch for last value with no trailing 
bits.";
+    // Value is split between words, so grab trailing bits from the next word.
+    // Force into [0, LOAD_BIT_WIDTH) to avoid spurious shift >= width of type 
warning.
+    constexpr uint32_t NUM_TRAILING_BITS =
+        END_BIT_OFFSET < LOAD_BIT_WIDTH ? 0 : END_BIT_OFFSET - LOAD_BIT_WIDTH;
+    const uint32_t trailing_bits = in_words[IN_WORD_IDX + 1] % (1UL << 
NUM_TRAILING_BITS);
+    // Force into [0, LOAD_BIT_WIDTH) to avoid spurious shift >= width of type 
warning.
+    constexpr uint32_t TRAILING_BITS_SHIFT =
+        BIT_WIDTH == 32 ? 0 : (BIT_WIDTH - NUM_TRAILING_BITS);
+    return lower_bits | (trailing_bits << TRAILING_BITS_SHIFT);
   }
-
-  return word & mask;
 }
 
 template <typename OutType>
@@ -219,7 +210,7 @@ template <typename OutType, int BIT_WIDTH>
 const uint8_t* BitPacking::Unpack32Values(
     const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ 
out) {
   static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
-  static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
+  static_assert(BIT_WIDTH <= 32, "BIT_WIDTH > 32");
   DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for 
output";
   constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
   DCHECK_GE(in_bytes, BYTES_TO_READ);
@@ -242,8 +233,8 @@ const uint8_t* BitPacking::Unpack32Values(int bit_width, 
const uint8_t* __restri
     case i: return Unpack32Values<OutType, i>(in, in_bytes, out);
 
   switch (bit_width) {
-    // Expand cases from 0 to 64.
-    BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore);
+    // Expand cases from 0 to 32.
+    BOOST_PP_REPEAT_FROM_TO(0, 33, UNPACK_VALUES_CASE, ignore);
     default: DCHECK(false); return in;
   }
 #pragma pop_macro("UNPACK_VALUES_CASE")
@@ -254,7 +245,7 @@ const uint8_t* BitPacking::UnpackAndDecode32Values(const 
uint8_t* __restrict__ i
     int64_t in_bytes, OutType* __restrict__ dict, int64_t dict_len,
     OutType* __restrict__ out, int64_t stride, bool* __restrict__ 
decode_error) {
   static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
-  static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
+  static_assert(BIT_WIDTH <= 32, "BIT_WIDTH > 32");
   constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
   DCHECK_GE(in_bytes, BYTES_TO_READ);
   // TODO: this could be optimised further by using SIMD instructions.
@@ -278,7 +269,7 @@ template <typename OutType, int BIT_WIDTH>
 const uint8_t* BitPacking::UnpackUpTo31Values(const uint8_t* __restrict__ in,
     int64_t in_bytes, int num_values, OutType* __restrict__ out) {
   static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
-  static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
+  static_assert(BIT_WIDTH <= 32, "BIT_WIDTH > 32");
   DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for 
output";
   constexpr int MAX_BATCH_SIZE = 31;
   const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
@@ -319,7 +310,7 @@ const uint8_t* 
BitPacking::UnpackAndDecodeUpTo31Values(const uint8_t* __restrict
       int64_t in_bytes, OutType* __restrict__ dict, int64_t dict_len, int 
num_values,
       OutType* __restrict__ out, int64_t stride, bool* __restrict__ 
decode_error) {
   static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
-  static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
+  static_assert(BIT_WIDTH <= 32, "BIT_WIDTH > 32");
   constexpr int MAX_BATCH_SIZE = 31;
   const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
   DCHECK_GE(in_bytes, BYTES_TO_READ);
diff --git a/be/src/util/bit-stream-utils-test.cc 
b/be/src/util/bit-stream-utils-test.cc
index aa157e3..e02e3c3 100644
--- a/be/src/util/bit-stream-utils-test.cc
+++ b/be/src/util/bit-stream-utils-test.cc
@@ -151,16 +151,15 @@ TEST(BitArray, TestIntSkip) {
   TestSkipping<int>(buffer, bytes_written, bit_width, 8, 56);
 }
 
-// Writes 'num_vals' values with width 'bit_width' starting from 'start' and 
increasing
-// and reads them back.
-void TestBitArrayValues(int bit_width, uint64_t start, uint64_t num_vals) {
+// Writes 'num_vals' values with width 'bit_width' and reads them back.
+void TestBitArrayValues(int bit_width, int num_vals) {
   const int len = BitUtil::Ceil(bit_width * num_vals, 8);
-  const uint64_t mask = bit_width == 64 ? ~0UL : (1UL << bit_width) - 1;
+  const int64_t mod = bit_width == 64 ? 1 : 1LL << bit_width;
 
   uint8_t buffer[(len > 0) ? len : 1];
   BitWriter writer(buffer, len);
-  for (uint64_t i = 0; i < num_vals; ++i) {
-    bool result = writer.PutValue((start + i) & mask, bit_width);
+  for (int i = 0; i < num_vals; ++i) {
+    bool result = writer.PutValue(i % mod, bit_width);
     EXPECT_TRUE(result);
   }
   writer.Flush();
@@ -173,20 +172,19 @@ void TestBitArrayValues(int bit_width, uint64_t start, 
uint64_t num_vals) {
     // Unpack all values at once with one batched reader and in small batches 
with the
     // other batched reader.
     vector<int64_t> batch_vals(num_vals);
-    const uint64_t BATCH_SIZE = 32;
+    const int BATCH_SIZE = 32;
     vector<int64_t> batch_vals2(BATCH_SIZE);
     EXPECT_EQ(num_vals,
         reader.UnpackBatch(bit_width, num_vals, batch_vals.data()));
-    for (uint64_t i = 0; i < num_vals; ++i) {
+    for (int i = 0; i < num_vals; ++i) {
       if (i % BATCH_SIZE == 0) {
         int num_to_unpack = min(BATCH_SIZE, num_vals - i);
         EXPECT_EQ(num_to_unpack,
            reader2.UnpackBatch(bit_width, num_to_unpack, batch_vals2.data()));
       }
-      EXPECT_EQ((start + i) & mask, batch_vals[i]);
-      EXPECT_EQ((start + i) & mask, batch_vals2[i % BATCH_SIZE]);
+      EXPECT_EQ(i % mod, batch_vals[i]);
+      EXPECT_EQ(i % mod, batch_vals2[i % BATCH_SIZE]);
     }
-
     EXPECT_EQ(reader.bytes_left(), 0);
     EXPECT_EQ(reader2.bytes_left(), 0);
     reader.Reset(buffer, len);
@@ -196,132 +194,13 @@ void TestBitArrayValues(int bit_width, uint64_t start, 
uint64_t num_vals) {
 
 TEST(BitArray, TestValues) {
   for (int width = 0; width <= MAX_WIDTH; ++width) {
-    TestBitArrayValues(width, 0, 1);
-    TestBitArrayValues(width, 0, 2);
+    TestBitArrayValues(width, 1);
+    TestBitArrayValues(width, 2);
     // Don't write too many values
-    TestBitArrayValues(width, 0, (width < 12) ? (1 << width) : 4096);
-    TestBitArrayValues(width, 0, 1024);
-
-    // Also test values that need bitwidth > 32.
-    TestBitArrayValues(width, 1099511627775LL, 1024);
+    TestBitArrayValues(width, (width < 12) ? (1 << width) : 4096);
+    TestBitArrayValues(width, 1024);
   }
 }
 
-template<typename UINT_T>
-void TestUleb128Encode(const UINT_T value, std::vector<uint8_t> 
expected_bytes) {
-  const int len = BatchedBitReader::max_vlq_byte_len<UINT_T>();
-
-  std::vector<uint8_t> buffer(len, 0);
-  BitWriter writer(buffer.data(), len);
-
-  const bool success = writer.PutUleb128(value);
-  EXPECT_TRUE(success);
-
-  if (expected_bytes.size() < len) {
-    // Allow the user to input fewer bytes than the maximum.
-    expected_bytes.resize(len, 0);
-  }
-
-  EXPECT_EQ(expected_bytes, buffer);
 }
 
-TEST(VLQInt, TestPutUleb128) {
-  TestUleb128Encode<uint32_t>(5, {0x05});
-  TestUleb128Encode<uint32_t>(2018, {0xe2, 0x0f});
-  TestUleb128Encode<uint32_t>(25248, {0xa0, 0xc5, 0x01});
-  TestUleb128Encode<uint32_t>(55442211, {0xa3, 0xf6, 0xb7, 0x1a});
-  TestUleb128Encode<uint32_t>(4294967295, {0xff, 0xff, 0xff, 0xff, 0x0f});
-
-  TestUleb128Encode<uint64_t>(5, {0x05});
-  TestUleb128Encode<uint64_t>(55442211, {0xa3, 0xf6, 0xb7, 0x1a});
-  TestUleb128Encode<uint64_t>(1649267441664, {0x80, 0x80, 0x80, 0x80, 0x80, 
0x30});
-  TestUleb128Encode<uint64_t>(60736917191292289,
-                             {0x81, 0xeb, 0xc1, 0xaf, 0xf8, 0xfc, 0xf1, 0x6b});
-  TestUleb128Encode<uint64_t>(18446744073709551615U,
-                             {0xff, 0xff, 0xff, 0xff, 0xff,
-                              0xff, 0xff, 0xff, 0xff, 0x01});
-}
-
-template<typename UINT_T>
-void TestUleb128Decode(const UINT_T expected_value, const 
std::vector<uint8_t>& bytes) {
-  BatchedBitReader reader(bytes.data(), bytes.size());
-  UINT_T value;
-  const bool success = reader.GetUleb128<UINT_T>(&value);
-  EXPECT_TRUE(success);
-  EXPECT_EQ(expected_value, value);
-}
-
-TEST(VLQInt, TestGetUleb128) {
-  TestUleb128Decode<uint32_t>(5, {0x05});
-  TestUleb128Decode<uint32_t>(2018, {0xe2, 0x0f});
-  TestUleb128Decode<uint32_t>(25248, {0xa0, 0xc5, 0x01});
-  TestUleb128Decode<uint32_t>(55442211, {0xa3, 0xf6, 0xb7, 0x1a});
-  TestUleb128Decode<uint32_t>(4294967295, {0xff, 0xff, 0xff, 0xff, 0x0f});
-
-  TestUleb128Decode<uint64_t>(5, {0x05});
-  TestUleb128Decode<uint64_t>(55442211, {0xa3, 0xf6, 0xb7, 0x1a});
-  TestUleb128Decode<uint64_t>(1649267441664, {0x80, 0x80, 0x80, 0x80, 0x80, 
0x30});
-  TestUleb128Decode<uint64_t>(60736917191292289,
-                             {0x81, 0xeb, 0xc1, 0xaf, 0xf8, 0xfc, 0xf1, 0x6b});
-  TestUleb128Decode<uint64_t>(18446744073709551615U,
-                             {0xff, 0xff, 0xff, 0xff, 0xff,
-                              0xff, 0xff, 0xff, 0xff, 0x01});
-}
-
-template<typename INT_T>
-void TestZigZagEncode(const INT_T value, std::vector<uint8_t> expected_bytes) {
-  const int len = BatchedBitReader::max_vlq_byte_len<INT_T>();
-
-  std::vector<uint8_t> buffer(len, 0);
-  BitWriter writer(buffer.data(), len);
-
-  const bool success = writer.PutZigZagInteger(value);
-  EXPECT_TRUE(success);
-
-  if (expected_bytes.size() < len) {
-    // Allow the user to input fewer bytes than the maximum.
-    expected_bytes.resize(len, 0);
-  }
-
-  EXPECT_EQ(expected_bytes, buffer);
-}
-
-TEST(VLQInt, TestPutZigZagInteger) {
-  TestZigZagEncode<int32_t>(-3, {0x05});
-  TestZigZagEncode<int32_t>(-2485, {0xe9, 0x26});
-  TestZigZagEncode<int32_t>(5648448, {0x80, 0xc1, 0xb1, 0x5});
-
-  TestZigZagEncode<int64_t>(1629267541664, {0xc0, 0xfa, 0xcd, 0xfe, 0xea, 
0x5e});
-  TestZigZagEncode<int64_t>(-9223372036854775808U, // Most negative int64_t.
-                           {0xff, 0xff, 0xff, 0xff, 0xff,
-                            0xff, 0xff, 0xff, 0xff, 0x1});
-}
-
-template<typename INT_T>
-void TestZigZagDecode(const INT_T expected_value, std::vector<uint8_t> bytes) {
-  BatchedBitReader reader(bytes.data(), bytes.size());
-  INT_T value;
-  const bool success = reader.GetZigZagInteger<INT_T>(&value);
-  EXPECT_TRUE(success);
-  EXPECT_EQ(expected_value, value);
-}
-
-TEST(VLQInt, TestGetZigZagInteger) {
-  TestZigZagDecode<int32_t>(-3, {0x05});
-  TestZigZagDecode<int32_t>(-2485, {0xe9, 0x26});
-  TestZigZagDecode<int32_t>(5648448, {0x80, 0xc1, 0xb1, 0x5});
-
-  TestZigZagDecode<int64_t>(1629267541664, {0xc0, 0xfa, 0xcd, 0xfe, 0xea, 
0x5e});
-  TestZigZagDecode<int64_t>(-9223372036854775808U, // Most negative int64_t.
-                           {0xff, 0xff, 0xff, 0xff, 0xff,
-                            0xff, 0xff, 0xff, 0xff, 0x1});
-}
-
-TEST(VLQInt, TestMaxVlqByteLen) {
-  EXPECT_EQ(2, BatchedBitReader::max_vlq_byte_len<uint8_t>());
-  EXPECT_EQ(3, BatchedBitReader::max_vlq_byte_len<uint16_t>());
-  EXPECT_EQ(5, BatchedBitReader::max_vlq_byte_len<uint32_t>());
-  EXPECT_EQ(10, BatchedBitReader::max_vlq_byte_len<uint64_t>());
-}
-
-}
diff --git a/be/src/util/bit-stream-utils.h b/be/src/util/bit-stream-utils.h
index 230c9dd..9889f35 100644
--- a/be/src/util/bit-stream-utils.h
+++ b/be/src/util/bit-stream-utils.h
@@ -23,17 +23,16 @@
 #include <string.h>
 #include "common/compiler-util.h"
 #include "common/logging.h"
-#include "util/bit-packing.h"
 #include "util/bit-util.h"
 
 namespace impala {
 
-/// Utility class to write bit/byte streams. This class can write data to 
either be
+/// Utility class to write bit/byte streams.  This class can write data to 
either be
 /// bit packed or byte aligned (and a single stream that has a mix of both).
 /// This class does not allocate memory.
 class BitWriter {
  public:
-  /// buffer: buffer to write bits to. Buffer should be preallocated with
+  /// buffer: buffer to write bits to.  Buffer should be preallocated with
   /// 'buffer_len' bytes.
   BitWriter(uint8_t* buffer, int buffer_len) :
       buffer_(buffer),
@@ -65,16 +64,7 @@ class BitWriter {
   /// Write an unsigned ULEB-128 encoded int to the buffer. Return false if 
there was not
   /// enough room. The value is written byte aligned. For more details on 
ULEB-128:
   /// https://en.wikipedia.org/wiki/LEB128
-  /// UINT_T must be an unsigned integer type.
-  template<typename UINT_T>
-  bool PutUleb128(UINT_T v);
-
-  /// Write a ZigZag encoded int to the buffer. Return false if there was not 
enough
-  /// room. The value is written byte aligned. For more details on ZigZag 
encoding:
-  /// 
https://developers.google.com/protocol-buffers/docs/encoding#signed-integers
-  /// INT_T must be a signed integer type.
-  template<typename INT_T>
-  bool PutZigZagInteger(INT_T v);
+  bool PutUleb128Int(uint32_t v);
 
   /// Get a pointer to the next aligned byte and advance the underlying buffer
   /// by num_bytes.
@@ -87,7 +77,7 @@ class BitWriter {
   void Flush(bool align=false);
 
   /// Maximum supported bitwidth for writer.
-  static const int MAX_BITWIDTH = 64;
+  static const int MAX_BITWIDTH = 32;
 
  private:
   uint8_t* buffer_;
@@ -167,29 +157,16 @@ class BatchedBitReader {
   /// at the beginning of a byte. Return false if there were not enough bytes 
in the
   /// buffer or the int is invalid. For more details on ULEB-128:
   /// https://en.wikipedia.org/wiki/LEB128
-  /// UINT_T must be an unsigned integer type.
-  template<typename UINT_T>
-  bool GetUleb128(UINT_T* v);
-
-  /// Read a ZigZag encoded int from the stream. The encoded int must start at 
the
-  /// beginning of a byte. Return false if there were not enough bytes in the 
buffer or
-  /// the int is invalid. For more details on ZigZag encoding:
-  /// 
https://developers.google.com/protocol-buffers/docs/encoding#signed-integers
-  /// INT_T must be a signed integer type.
-  template<typename INT_T>
-  bool GetZigZagInteger(INT_T* v);
+  bool GetUleb128Int(uint32_t* v);
 
   /// Returns the number of bytes left in the stream.
   int bytes_left() { return buffer_end_ - buffer_pos_; }
 
-  /// Maximum byte length of a vlq encoded integer of type T.
-  template <typename T>
-  static constexpr int max_vlq_byte_len() {
-    return BitUtil::Ceil(sizeof(T) * 8, 7);
-  }
+  /// Maximum byte length of a vlq encoded int
+  static const int MAX_VLQ_BYTE_LEN = 5;
 
   /// Maximum supported bitwidth for reader.
-  static const int MAX_BITWIDTH = BitPacking::MAX_BITWIDTH;
+  static const int MAX_BITWIDTH = 32;
 
  private:
   /// Current read position in the buffer.
diff --git a/be/src/util/bit-stream-utils.inline.h 
b/be/src/util/bit-stream-utils.inline.h
index 868b562..18df34c 100644
--- a/be/src/util/bit-stream-utils.inline.h
+++ b/be/src/util/bit-stream-utils.inline.h
@@ -21,13 +21,14 @@
 #include "util/bit-stream-utils.h"
 
 #include "common/compiler-util.h"
+#include "util/bit-packing.h"
 
 namespace impala {
 
 inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
+  // TODO: revisit this limit if necessary (can be raised to 64 by fixing some 
edge cases)
   DCHECK_LE(num_bits, MAX_BITWIDTH);
-  DCHECK(num_bits == MAX_BITWIDTH || v >> num_bits == 0)
-      << "v = " << v << ", num_bits = " << num_bits;
+  DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits;
 
   if (UNLIKELY(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) 
return false;
 
@@ -37,20 +38,11 @@ inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
   if (UNLIKELY(bit_offset_ >= 64)) {
     // Flush buffered_values_ and write out bits of v that did not fit
     memcpy(buffer_ + byte_offset_, &buffered_values_, 8);
+    buffered_values_ = 0;
     byte_offset_ += 8;
     bit_offset_ -= 64;
-
-    // Shifting with the same or greater amount than the number of bits in the 
number is
-    // undefined behaviour.
-    int shift = num_bits - bit_offset_;
-
-    if (LIKELY(shift < 64)) {
-      buffered_values_ = v >> shift;
-    } else {
-      buffered_values_ = 0;
-    }
+    buffered_values_ = v >> (num_bits - bit_offset_);
   }
-
   DCHECK_LT(bit_offset_, 64);
   return true;
 }
@@ -84,40 +76,16 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) {
   return true;
 }
 
-template<typename UINT_T>
-inline bool BitWriter::PutUleb128(UINT_T v) {
-  static_assert(std::is_integral<UINT_T>::value, "Integral type required.");
-  static_assert(std::is_unsigned<UINT_T>::value, "Unsigned type required.");
-  static_assert(!std::is_same<UINT_T, bool>::value, "Bools are not 
supported.");
-
-  constexpr UINT_T mask_lowest_7_bits = std::numeric_limits<UINT_T>::max() ^ 
0x7Fu;
-
+inline bool BitWriter::PutUleb128Int(uint32_t v) {
   bool result = true;
-  while ((v & mask_lowest_7_bits) != 0L) {
-    result &= PutAligned<uint8_t>((v & 0x7Fu) | 0x80u, 1);
+  while ((v & 0xFFFFFF80) != 0L) {
+    result &= PutAligned<uint8_t>((v & 0x7F) | 0x80, 1);
     v >>= 7;
   }
-  result &= PutAligned<uint8_t>(v & 0x7Fu, 1);
+  result &= PutAligned<uint8_t>(v & 0x7F, 1);
   return result;
 }
 
-template<typename INT_T>
-bool BitWriter::PutZigZagInteger(INT_T v) {
-  static_assert(std::is_integral<INT_T>::value, "Integral type required.");
-  static_assert(std::is_signed<INT_T>::value, "Signed type required.");
-
-  using UINT_T = std::make_unsigned_t<INT_T>;
-  constexpr int bit_length_minus_one = sizeof(INT_T) * 8 - 1;
-
-  /// For negative values, << results in undefined behaviour (prior to C++20), 
so we treat
-  /// `v` as unsigned.
-  const UINT_T v_unsigned = v;
-
-  /// For the right shift, we rely on implementation defined behaviour as we 
expect it to
-  /// be an arithmetic right shift (bits equal to the MSB are shifted in).
-  return PutUleb128((v_unsigned << 1) ^ (v >> bit_length_minus_one));
-}
-
 template<typename T>
 inline int BatchedBitReader::UnpackBatch(int bit_width, int num_values, T* v) {
   DCHECK(buffer_pos_ != nullptr);
@@ -178,42 +146,19 @@ inline bool BatchedBitReader::GetBytes(int num_bytes, T* 
v) {
   return true;
 }
 
-template<typename UINT_T>
-inline bool BatchedBitReader::GetUleb128(UINT_T* v) {
-  static_assert(std::is_integral<UINT_T>::value, "Integral type required.");
-  static_assert(std::is_unsigned<UINT_T>::value, "Unsigned type required.");
-  static_assert(!std::is_same<UINT_T, bool>::value, "Bools are not 
supported.");
-
+inline bool BatchedBitReader::GetUleb128Int(uint32_t* v) {
   *v = 0;
   int shift = 0;
   uint8_t byte = 0;
   do {
-    if (UNLIKELY(shift >= max_vlq_byte_len<UINT_T>() * 7)) return false;
+    if (UNLIKELY(shift >= MAX_VLQ_BYTE_LEN * 7)) return false;
     if (!GetBytes(1, &byte)) return false;
-
-    /// We need to convert 'byte' to UINT_T so that the result of the bitwise 
and
-    /// operation is at least as long an integer as '*v', otherwise the shift 
may be too
-    /// big and lead to undefined behaviour.
-    const UINT_T byte_as_UINT_T = byte;
-    *v |= (byte_as_UINT_T & 0x7Fu) << shift;
+    // The constant below must be explicitly unsigned to ensure that the 
result of the
+    // bitwise-and is unsigned, so that the left shift is always defined 
behavior under
+    // the C++ standard.
+    *v |= (byte & 0x7Fu) << shift;
     shift += 7;
-  } while ((byte & 0x80u) != 0);
-  return true;
-}
-
-template<typename INT_T>
-bool BatchedBitReader::GetZigZagInteger(INT_T* v) {
-  static_assert(std::is_integral<INT_T>::value, "Integral type required.");
-  static_assert(std::is_signed<INT_T>::value, "Signed type required.");
-
-  using UINT_T = std::make_unsigned_t<INT_T>;
-
-  UINT_T v_unsigned;
-  if (UNLIKELY(!GetUleb128<UINT_T>(&v_unsigned))) return false;
-
-  /// Here we rely on implementation defined behaviour in converting UINT_T to 
INT_T.
-  *v = (v_unsigned >> 1) ^ -(v_unsigned & 1u);
-
+  } while ((byte & 0x80) != 0);
   return true;
 }
 }
diff --git a/be/src/util/rle-encoding.h b/be/src/util/rle-encoding.h
index 322b5d7..e4b87d0 100644
--- a/be/src/util/rle-encoding.h
+++ b/be/src/util/rle-encoding.h
@@ -244,7 +244,7 @@ class RleEncoder {
         1 + BitUtil::Ceil(MAX_VALUES_PER_LITERAL_RUN * bit_width, 8);
     /// Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value.
     int max_repeated_run_size =
-        BatchedBitReader::max_vlq_byte_len<uint32_t>() + 
BitUtil::Ceil(bit_width, 8);
+        BatchedBitReader::MAX_VLQ_BYTE_LEN + BitUtil::Ceil(bit_width, 8);
     return std::max(max_literal_run_size, max_repeated_run_size);
   }
 
@@ -406,7 +406,7 @@ inline void RleEncoder::FlushRepeatedRun() {
   bool result = true;
   // The lsb of 0 indicates this is a repeated run
   uint32_t indicator_value = static_cast<uint32_t>(repeat_count_) << 1;
-  result &= bit_writer_.PutUleb128<uint32_t>(indicator_value);
+  result &= bit_writer_.PutUleb128Int(indicator_value);
   result &= bit_writer_.PutAligned(current_value_, BitUtil::Ceil(bit_width_, 
8));
   DCHECK(result);
   num_buffered_values_ = 0;
@@ -665,7 +665,7 @@ inline void RleBatchDecoder<T>::NextCounts() {
   // Read the next run's indicator int, it could be a literal or repeated run.
   // The int is encoded as a ULEB128-encoded value.
   uint32_t indicator_value = 0;
-  if (UNLIKELY(!bit_reader_.GetUleb128<uint32_t>(&indicator_value))) return;
+  if (UNLIKELY(!bit_reader_.GetUleb128Int(&indicator_value))) return;
 
   // lsb indicates if it is a literal run or repeated run
   bool is_literal = indicator_value & 1;
diff --git a/be/src/util/rle-test.cc b/be/src/util/rle-test.cc
index f74b8c3..4bdcff1 100644
--- a/be/src/util/rle-test.cc
+++ b/be/src/util/rle-test.cc
@@ -623,7 +623,7 @@ TEST_F(RleTest, RepeatCountOverflow) {
     // are 1.
     const uint32_t REPEATED_RUN_HEADER = 0xfffffffe;
     const uint32_t LITERAL_RUN_HEADER = 0xffffffff;
-    writer.PutUleb128<uint32_t>(literal_run ? LITERAL_RUN_HEADER : 
REPEATED_RUN_HEADER);
+    writer.PutUleb128Int(literal_run ? LITERAL_RUN_HEADER : 
REPEATED_RUN_HEADER);
     writer.Flush();
 
     RleBatchDecoder<uint64_t> decoder(buffer, BUFFER_LEN, 1);

[impala] branch master updated: Revert "IMPALA-8710: Increase allowed bit width to 64 for bit packing"

Reply via email to