Repository: kudu
Updated Branches:
  refs/heads/master 04eeacddd -> 88755e98d


KUDU-1448 (take 2). Enable AVX2 bitshuffle at runtime

This uses some trickery when building bitshuffle so that we build it
twice: once with normal flags and a second time with -mavx2. Enabling
AVX2 turns on some faster code paths which have been previously
benchmarked to yield a ~30% end-to-end improvement in some Impala scans.

In order to avoid multiply-defined symbols, the build uses 'objcopy' to
rename the symbols to add a '_avx2' suffix before linking. Then a new
wrapper function calls the correct variant of the function based on the
current CPU.

A previous version of this patch tried to use the 'ifunc' attribute, but
ran into some issues with ld.so on el6, likely caused by this bug:
https://patchwork.ozlabs.org/patch/506223/

I'd previously attempted to do this upstream in the bitshuffle library
but it was somewhat difficult as the library makes heavy use of macros,
etc. In that prior attempt, I gave up after many hours of work, whereas
this was comparatively quite simple (only an hour of work!)

This also fixes a bug in the cpuid detection which we inherited
from Chromium. I filed the bug upstream here:
https://bugs.chromium.org/p/chromium/issues/detail?id=667457

I tested on my laptop which has AVX2 and verified that the AVX2 variant
functions were getting called.

Change-Id: Ic115040f5bcdf3aa12a285ad23704be1b0bcb7c6
Previously-Reviewed-on: http://gerrit.cloudera.org:8080/5166
Reviewed-on: http://gerrit.cloudera.org:8080/5247
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/88755e98
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/88755e98
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/88755e98

Branch: refs/heads/master
Commit: 88755e98d4692368639f563bcd1c02cfbd71178f
Parents: 04eeacd
Author: Todd Lipcon <[email protected]>
Authored: Mon Nov 21 12:03:26 2016 -0800
Committer: Adar Dembo <[email protected]>
Committed: Mon Nov 28 23:39:39 2016 +0000

----------------------------------------------------------------------
 src/kudu/cfile/CMakeLists.txt             |  1 +
 src/kudu/cfile/bitshuffle_arch_wrapper.cc | 84 ++++++++++++++++++++++++++
 src/kudu/cfile/bitshuffle_arch_wrapper.h  | 33 ++++++++++
 src/kudu/cfile/bshuf_block.cc             |  3 +-
 src/kudu/cfile/bshuf_block.h              | 11 ++--
 src/kudu/gutil/cpu.cc                     |  2 +-
 thirdparty/build-definitions.sh           | 45 ++++++++++++--
 7 files changed, 167 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/src/kudu/cfile/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/CMakeLists.txt b/src/kudu/cfile/CMakeLists.txt
index ad2a961..f192c6d 100644
--- a/src/kudu/cfile/CMakeLists.txt
+++ b/src/kudu/cfile/CMakeLists.txt
@@ -32,6 +32,7 @@ add_library(cfile
   binary_dict_block.cc
   binary_plain_block.cc
   binary_prefix_block.cc
+  bitshuffle_arch_wrapper.cc
   block_cache.cc
   block_compression.cc
   bloomfile.cc

http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/src/kudu/cfile/bitshuffle_arch_wrapper.cc
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/bitshuffle_arch_wrapper.cc 
b/src/kudu/cfile/bitshuffle_arch_wrapper.cc
new file mode 100644
index 0000000..c5ce71a
--- /dev/null
+++ b/src/kudu/cfile/bitshuffle_arch_wrapper.cc
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "kudu/cfile/bitshuffle_arch_wrapper.h"
+
+#include <stdint.h>
+
+// Include the bitshuffle header once to get the default (non-AVX2)
+// symbols.
+#include <bitshuffle.h>
+
+// Include the bitshuffle header again, but this time importing the
+// AVX2-compiled symbols by defining some macros.
+#undef BITSHUFFLE_H
+#define bshuf_compress_lz4_bound bshuf_compress_lz4_bound_avx2
+#define bshuf_compress_lz4 bshuf_compress_lz4_avx2
+#define bshuf_decompress_lz4 bshuf_decompress_lz4_avx2
+#include <bitshuffle.h> // NOLINT(*)
+#undef bshuf_compress_lz4_bound
+#undef bshuf_compress_lz4
+#undef bshuf_decompress_lz4
+
+#include "kudu/gutil/cpu.h"
+
+using base::CPU;
+
+namespace kudu {
+namespace bitshuffle {
+
+
+// Function pointers which will be assigned the correct implementation
+// for the runtime architecture.
+namespace {
+decltype(&bshuf_compress_lz4_bound) g_bshuf_compress_lz4_bound;
+decltype(&bshuf_compress_lz4) g_bshuf_compress_lz4;
+decltype(&bshuf_decompress_lz4) g_bshuf_decompress_lz4;
+} // anonymous namespace
+
+// When this translation unit is initialized, figure out the current CPU and
+// assign the correct function for this architecture.
+//
+// This avoids an expensive 'cpuid' call in the hot path, and also avoids
+// the cost of a 'std::once' call.
+__attribute__((constructor))
+void SelectBitshuffleFunctions() {
+  if (CPU().has_avx2()) {
+    g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_avx2;
+    g_bshuf_compress_lz4 = bshuf_compress_lz4_avx2;
+    g_bshuf_decompress_lz4 = bshuf_decompress_lz4_avx2;
+  } else {
+    g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound;
+    g_bshuf_compress_lz4 = bshuf_compress_lz4;
+    g_bshuf_decompress_lz4 = bshuf_decompress_lz4;
+  }
+}
+
+int64_t compress_lz4(void* in, void* out, size_t size,
+                     size_t elem_size, size_t block_size) {
+  return g_bshuf_compress_lz4(in, out, size, elem_size, block_size);
+}
+int64_t decompress_lz4(void* in, void* out, size_t size,
+                       size_t elem_size, size_t block_size) {
+  return g_bshuf_decompress_lz4(in, out, size, elem_size, block_size);
+}
+size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size) {
+  return g_bshuf_compress_lz4_bound(size, elem_size, block_size);
+}
+
+} // namespace bitshuffle
+} // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/src/kudu/cfile/bitshuffle_arch_wrapper.h
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/bitshuffle_arch_wrapper.h 
b/src/kudu/cfile/bitshuffle_arch_wrapper.h
new file mode 100644
index 0000000..cf5fd73
--- /dev/null
+++ b/src/kudu/cfile/bitshuffle_arch_wrapper.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+// This namespace has wrappers for the Bitshuffle library which do runtime 
dispatch to
+// either AVX2-accelerated or regular SSE2 implementations based on the 
available CPU.
+namespace kudu {
+namespace bitshuffle {
+
+// See <bitshuffle.h> for documentation on these functions.
+size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size);
+int64_t compress_lz4(void* in, void* out, size_t size, size_t elem_size, 
size_t block_size);
+int64_t decompress_lz4(void* in, void* out, size_t size, size_t elem_size, 
size_t block_size);
+
+} // namespace bitshuffle
+} // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/src/kudu/cfile/bshuf_block.cc
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/bshuf_block.cc b/src/kudu/cfile/bshuf_block.cc
index 019d141..e71c6cc 100644
--- a/src/kudu/cfile/bshuf_block.cc
+++ b/src/kudu/cfile/bshuf_block.cc
@@ -91,7 +91,8 @@ Status BShufBlockDecoder<UINT32>::Expand() {
     decoded_.resize(num_elems_after_padding_ * size_of_elem_);
     uint8_t* in = const_cast<uint8_t*>(&data_[kHeaderSize]);
 
-    bytes = bshuf_decompress_lz4(in, decoded_.data(), 
num_elems_after_padding_, size_of_elem_, 0);
+    bytes = bitshuffle::decompress_lz4(in, decoded_.data(), 
num_elems_after_padding_,
+                                       size_of_elem_, 0);
     if (PREDICT_FALSE(bytes < 0)) {
       // Ideally, this should not happen.
       AbortWithBitShuffleError(bytes);

http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/src/kudu/cfile/bshuf_block.h
----------------------------------------------------------------------
diff --git a/src/kudu/cfile/bshuf_block.h b/src/kudu/cfile/bshuf_block.h
index 7730842..22b207c 100644
--- a/src/kudu/cfile/bshuf_block.h
+++ b/src/kudu/cfile/bshuf_block.h
@@ -25,8 +25,8 @@
 
 #include <algorithm>
 #include <stdint.h>
-#include <bitshuffle.h>
 
+#include "kudu/cfile/bitshuffle_arch_wrapper.h"
 #include "kudu/cfile/block_encodings.h"
 #include "kudu/cfile/cfile_util.h"
 #include "kudu/common/columnblock.h"
@@ -142,12 +142,12 @@ class BShufBlockBuilder : public BlockBuilder {
 
     int num_elems_after_padding = count_ + NumOfPaddingNeeded();
     buffer_.resize(kMaxHeaderSize +
-                   bshuf_compress_lz4_bound(num_elems_after_padding, 
final_size_of_type, 0));
+                   bitshuffle::compress_lz4_bound(num_elems_after_padding, 
final_size_of_type, 0));
 
     InlineEncodeFixed32(&buffer_[0], ordinal_pos);
     InlineEncodeFixed32(&buffer_[4], count_);
-    int64_t bytes = bshuf_compress_lz4(data_.data(), &buffer_[kMaxHeaderSize],
-                                    num_elems_after_padding, 
final_size_of_type, 0);
+    int64_t bytes = bitshuffle::compress_lz4(data_.data(), 
&buffer_[kMaxHeaderSize],
+                                             num_elems_after_padding, 
final_size_of_type, 0);
     if (PREDICT_FALSE(bytes < 0)) {
       // This means the bitshuffle function fails.
       // Ideally, this should not happen.
@@ -337,7 +337,8 @@ class BShufBlockDecoder : public BlockDecoder {
       int64_t bytes;
       decoded_.resize(num_elems_after_padding_ * size_of_type);
       uint8_t* in = const_cast<uint8_t*>(&data_[kHeaderSize]);
-      bytes = bshuf_decompress_lz4(in, decoded_.data(), 
num_elems_after_padding_, size_of_type, 0);
+      bytes = bitshuffle::decompress_lz4(in, decoded_.data(), 
num_elems_after_padding_,
+                                         size_of_type, 0);
       if (PREDICT_FALSE(bytes < 0)) {
         // Ideally, this should not happen.
         AbortWithBitShuffleError(bytes);

http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/src/kudu/gutil/cpu.cc
----------------------------------------------------------------------
diff --git a/src/kudu/gutil/cpu.cc b/src/kudu/gutil/cpu.cc
index c6bf41f..f4c3885a 100644
--- a/src/kudu/gutil/cpu.cc
+++ b/src/kudu/gutil/cpu.cc
@@ -68,7 +68,7 @@ void __cpuid(int cpu_info[4], int info_type) {
   __asm__ volatile (
     "cpuid\n"
     : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), 
"=d"(cpu_info[3])
-    : "a"(info_type)
+    : "a"(info_type), "c"(0)
   );
 }
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/thirdparty/build-definitions.sh
----------------------------------------------------------------------
diff --git a/thirdparty/build-definitions.sh b/thirdparty/build-definitions.sh
index d90f2c1..923a98b 100644
--- a/thirdparty/build-definitions.sh
+++ b/thirdparty/build-definitions.sh
@@ -407,11 +407,46 @@ build_bitshuffle() {
   mkdir -p $BITSHUFFLE_BDIR
   pushd $BITSHUFFLE_BDIR
   # bitshuffle depends on lz4, therefore set the flag I$PREFIX/include
-  ${CC:-gcc} $EXTRA_CFLAGS -std=c99 -I$PREFIX/include -O3 -DNDEBUG -fPIC -c \
-    "$BITSHUFFLE_SOURCE/src/bitshuffle_core.c" \
-    "$BITSHUFFLE_SOURCE/src/bitshuffle.c" \
-    "$BITSHUFFLE_SOURCE/src/iochain.c"
-  ar rs bitshuffle.a bitshuffle_core.o bitshuffle.o iochain.o
+
+  # This library has significant optimizations when built with -mavx2. However,
+  # we still need to support non-AVX2-capable hardware. So, we build it twice,
+  # once with the flag and once without, and use some linker tricks to
+  # suffix the AVX2 symbols with '_avx2'. OSX doesn't have objcopy, so we only
+  # do this trick on Linux.
+  if [ -n "$OS_LINUX" ]; then
+    arches="default avx2"
+  else
+    arches="default"
+  fi
+  to_link=""
+  for arch in $arches ; do
+    arch_flag=""
+    if [ "$arch" == "avx2" ]; then
+      arch_flag="-mavx2"
+    fi
+    tmp_obj=bitshuffle_${arch}_tmp.o
+    dst_obj=bitshuffle_${arch}.o
+    ${CC:-gcc} $EXTRA_CFLAGS $arch_flag -std=c99 -I$PREFIX/include -O3 
-DNDEBUG -fPIC -c \
+      "$BITSHUFFLE_SOURCE/src/bitshuffle_core.c" \
+      "$BITSHUFFLE_SOURCE/src/bitshuffle.c" \
+      "$BITSHUFFLE_SOURCE/src/iochain.c"
+    # Merge the object files together to produce a combined .o file.
+    ld -r -o $tmp_obj bitshuffle_core.o bitshuffle.o iochain.o
+    # For the AVX2 symbols, suffix them.
+    if [ "$arch" == "avx2" ]; then
+      # Create a mapping file with '<old_sym> <suffixed_sym>' on each line.
+      nm --defined-only --extern-only $tmp_obj | while read addr type sym ; do
+        echo ${sym} ${sym}_${arch}
+      done > renames.txt
+      objcopy --redefine-syms=renames.txt $tmp_obj $dst_obj
+    else
+      mv $tmp_obj $dst_obj
+    fi
+    to_link="$to_link $dst_obj"
+  done
+
+  rm -f bitshuffle.a
+  ar rs bitshuffle.a $to_link
   cp bitshuffle.a $PREFIX/lib/
   cp $BITSHUFFLE_SOURCE/src/bitshuffle.h $PREFIX/include/bitshuffle.h
   cp $BITSHUFFLE_SOURCE/src/bitshuffle_core.h $PREFIX/include/bitshuffle_core.h

Reply via email to