Repository: kudu Updated Branches: refs/heads/master 04eeacddd -> 88755e98d
KUDU-1448 (take 2). Enable AVX2 bitshuffle at runtime This uses some trickery when building bitshuffle so that we build it twice: once with normal flags and a second time with -mavx2. Enabling AVX2 turns on some faster code paths which have been previously benchmarked to yield a ~30% end-to-end improvement in some Impala scans. In order to avoid multiply-defined symbols, the build uses 'objcopy' to rename the symbols to add a '_avx2' suffix before linking. Then a new wrapper function calls the correct variant of the function based on the current CPU. A previous version of this patch tried to use the 'ifunc' attribute, but ran into some issues with ld.so on el6, likely caused by this bug: https://patchwork.ozlabs.org/patch/506223/ I'd previously attempted to do this upstream in the bitshuffle library but it was somewhat difficult as the library makes heavy use of macros, etc. In that prior attempt, I gave up after many hours of work, whereas this was comparatively quite simple (only an hour of work!) This also fixes a bug in the cpuid detection which we inherited from Chromium. I filed the bug upstream here: https://bugs.chromium.org/p/chromium/issues/detail?id=667457 I tested on my laptop which has AVX2 and verified that the AVX2 variant functions were getting called. Change-Id: Ic115040f5bcdf3aa12a285ad23704be1b0bcb7c6 Previously-Reviewed-on: http://gerrit.cloudera.org:8080/5166 Reviewed-on: http://gerrit.cloudera.org:8080/5247 Tested-by: Kudu Jenkins Reviewed-by: Adar Dembo <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/88755e98 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/88755e98 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/88755e98 Branch: refs/heads/master Commit: 88755e98d4692368639f563bcd1c02cfbd71178f Parents: 04eeacd Author: Todd Lipcon <[email protected]> Authored: Mon Nov 21 12:03:26 2016 -0800 Committer: Adar Dembo <[email protected]> Committed: Mon Nov 28 23:39:39 2016 +0000 ---------------------------------------------------------------------- src/kudu/cfile/CMakeLists.txt | 1 + src/kudu/cfile/bitshuffle_arch_wrapper.cc | 84 ++++++++++++++++++++++++++ src/kudu/cfile/bitshuffle_arch_wrapper.h | 33 ++++++++++ src/kudu/cfile/bshuf_block.cc | 3 +- src/kudu/cfile/bshuf_block.h | 11 ++-- src/kudu/gutil/cpu.cc | 2 +- thirdparty/build-definitions.sh | 45 ++++++++++++-- 7 files changed, 167 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/src/kudu/cfile/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/src/kudu/cfile/CMakeLists.txt b/src/kudu/cfile/CMakeLists.txt index ad2a961..f192c6d 100644 --- a/src/kudu/cfile/CMakeLists.txt +++ b/src/kudu/cfile/CMakeLists.txt @@ -32,6 +32,7 @@ add_library(cfile binary_dict_block.cc binary_plain_block.cc binary_prefix_block.cc + bitshuffle_arch_wrapper.cc block_cache.cc block_compression.cc bloomfile.cc http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/src/kudu/cfile/bitshuffle_arch_wrapper.cc ---------------------------------------------------------------------- diff --git a/src/kudu/cfile/bitshuffle_arch_wrapper.cc b/src/kudu/cfile/bitshuffle_arch_wrapper.cc new file mode 100644 index 0000000..c5ce71a --- /dev/null +++ b/src/kudu/cfile/bitshuffle_arch_wrapper.cc @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/cfile/bitshuffle_arch_wrapper.h" + +#include <stdint.h> + +// Include the bitshuffle header once to get the default (non-AVX2) +// symbols. +#include <bitshuffle.h> + +// Include the bitshuffle header again, but this time importing the +// AVX2-compiled symbols by defining some macros. +#undef BITSHUFFLE_H +#define bshuf_compress_lz4_bound bshuf_compress_lz4_bound_avx2 +#define bshuf_compress_lz4 bshuf_compress_lz4_avx2 +#define bshuf_decompress_lz4 bshuf_decompress_lz4_avx2 +#include <bitshuffle.h> // NOLINT(*) +#undef bshuf_compress_lz4_bound +#undef bshuf_compress_lz4 +#undef bshuf_decompress_lz4 + +#include "kudu/gutil/cpu.h" + +using base::CPU; + +namespace kudu { +namespace bitshuffle { + + +// Function pointers which will be assigned the correct implementation +// for the runtime architecture. +namespace { +decltype(&bshuf_compress_lz4_bound) g_bshuf_compress_lz4_bound; +decltype(&bshuf_compress_lz4) g_bshuf_compress_lz4; +decltype(&bshuf_decompress_lz4) g_bshuf_decompress_lz4; +} // anonymous namespace + +// When this translation unit is initialized, figure out the current CPU and +// assign the correct function for this architecture. +// +// This avoids an expensive 'cpuid' call in the hot path, and also avoids +// the cost of a 'std::once' call. +__attribute__((constructor)) +void SelectBitshuffleFunctions() { + if (CPU().has_avx2()) { + g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_avx2; + g_bshuf_compress_lz4 = bshuf_compress_lz4_avx2; + g_bshuf_decompress_lz4 = bshuf_decompress_lz4_avx2; + } else { + g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound; + g_bshuf_compress_lz4 = bshuf_compress_lz4; + g_bshuf_decompress_lz4 = bshuf_decompress_lz4; + } +} + +int64_t compress_lz4(void* in, void* out, size_t size, + size_t elem_size, size_t block_size) { + return g_bshuf_compress_lz4(in, out, size, elem_size, block_size); +} +int64_t decompress_lz4(void* in, void* out, size_t size, + size_t elem_size, size_t block_size) { + return g_bshuf_decompress_lz4(in, out, size, elem_size, block_size); +} +size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size) { + return g_bshuf_compress_lz4_bound(size, elem_size, block_size); +} + +} // namespace bitshuffle +} // namespace kudu http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/src/kudu/cfile/bitshuffle_arch_wrapper.h ---------------------------------------------------------------------- diff --git a/src/kudu/cfile/bitshuffle_arch_wrapper.h b/src/kudu/cfile/bitshuffle_arch_wrapper.h new file mode 100644 index 0000000..cf5fd73 --- /dev/null +++ b/src/kudu/cfile/bitshuffle_arch_wrapper.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include <stddef.h> +#include <stdint.h> + +// This namespace has wrappers for the Bitshuffle library which do runtime dispatch to +// either AVX2-accelerated or regular SSE2 implementations based on the available CPU. +namespace kudu { +namespace bitshuffle { + +// See <bitshuffle.h> for documentation on these functions. +size_t compress_lz4_bound(size_t size, size_t elem_size, size_t block_size); +int64_t compress_lz4(void* in, void* out, size_t size, size_t elem_size, size_t block_size); +int64_t decompress_lz4(void* in, void* out, size_t size, size_t elem_size, size_t block_size); + +} // namespace bitshuffle +} // namespace kudu http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/src/kudu/cfile/bshuf_block.cc ---------------------------------------------------------------------- diff --git a/src/kudu/cfile/bshuf_block.cc b/src/kudu/cfile/bshuf_block.cc index 019d141..e71c6cc 100644 --- a/src/kudu/cfile/bshuf_block.cc +++ b/src/kudu/cfile/bshuf_block.cc @@ -91,7 +91,8 @@ Status BShufBlockDecoder<UINT32>::Expand() { decoded_.resize(num_elems_after_padding_ * size_of_elem_); uint8_t* in = const_cast<uint8_t*>(&data_[kHeaderSize]); - bytes = bshuf_decompress_lz4(in, decoded_.data(), num_elems_after_padding_, size_of_elem_, 0); + bytes = bitshuffle::decompress_lz4(in, decoded_.data(), num_elems_after_padding_, + size_of_elem_, 0); if (PREDICT_FALSE(bytes < 0)) { // Ideally, this should not happen. AbortWithBitShuffleError(bytes); http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/src/kudu/cfile/bshuf_block.h ---------------------------------------------------------------------- diff --git a/src/kudu/cfile/bshuf_block.h b/src/kudu/cfile/bshuf_block.h index 7730842..22b207c 100644 --- a/src/kudu/cfile/bshuf_block.h +++ b/src/kudu/cfile/bshuf_block.h @@ -25,8 +25,8 @@ #include <algorithm> #include <stdint.h> -#include <bitshuffle.h> +#include "kudu/cfile/bitshuffle_arch_wrapper.h" #include "kudu/cfile/block_encodings.h" #include "kudu/cfile/cfile_util.h" #include "kudu/common/columnblock.h" @@ -142,12 +142,12 @@ class BShufBlockBuilder : public BlockBuilder { int num_elems_after_padding = count_ + NumOfPaddingNeeded(); buffer_.resize(kMaxHeaderSize + - bshuf_compress_lz4_bound(num_elems_after_padding, final_size_of_type, 0)); + bitshuffle::compress_lz4_bound(num_elems_after_padding, final_size_of_type, 0)); InlineEncodeFixed32(&buffer_[0], ordinal_pos); InlineEncodeFixed32(&buffer_[4], count_); - int64_t bytes = bshuf_compress_lz4(data_.data(), &buffer_[kMaxHeaderSize], - num_elems_after_padding, final_size_of_type, 0); + int64_t bytes = bitshuffle::compress_lz4(data_.data(), &buffer_[kMaxHeaderSize], + num_elems_after_padding, final_size_of_type, 0); if (PREDICT_FALSE(bytes < 0)) { // This means the bitshuffle function fails. // Ideally, this should not happen. @@ -337,7 +337,8 @@ class BShufBlockDecoder : public BlockDecoder { int64_t bytes; decoded_.resize(num_elems_after_padding_ * size_of_type); uint8_t* in = const_cast<uint8_t*>(&data_[kHeaderSize]); - bytes = bshuf_decompress_lz4(in, decoded_.data(), num_elems_after_padding_, size_of_type, 0); + bytes = bitshuffle::decompress_lz4(in, decoded_.data(), num_elems_after_padding_, + size_of_type, 0); if (PREDICT_FALSE(bytes < 0)) { // Ideally, this should not happen. AbortWithBitShuffleError(bytes); http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/src/kudu/gutil/cpu.cc ---------------------------------------------------------------------- diff --git a/src/kudu/gutil/cpu.cc b/src/kudu/gutil/cpu.cc index c6bf41f..f4c3885a 100644 --- a/src/kudu/gutil/cpu.cc +++ b/src/kudu/gutil/cpu.cc @@ -68,7 +68,7 @@ void __cpuid(int cpu_info[4], int info_type) { __asm__ volatile ( "cpuid\n" : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type) + : "a"(info_type), "c"(0) ); } http://git-wip-us.apache.org/repos/asf/kudu/blob/88755e98/thirdparty/build-definitions.sh ---------------------------------------------------------------------- diff --git a/thirdparty/build-definitions.sh b/thirdparty/build-definitions.sh index d90f2c1..923a98b 100644 --- a/thirdparty/build-definitions.sh +++ b/thirdparty/build-definitions.sh @@ -407,11 +407,46 @@ build_bitshuffle() { mkdir -p $BITSHUFFLE_BDIR pushd $BITSHUFFLE_BDIR # bitshuffle depends on lz4, therefore set the flag I$PREFIX/include - ${CC:-gcc} $EXTRA_CFLAGS -std=c99 -I$PREFIX/include -O3 -DNDEBUG -fPIC -c \ - "$BITSHUFFLE_SOURCE/src/bitshuffle_core.c" \ - "$BITSHUFFLE_SOURCE/src/bitshuffle.c" \ - "$BITSHUFFLE_SOURCE/src/iochain.c" - ar rs bitshuffle.a bitshuffle_core.o bitshuffle.o iochain.o + + # This library has significant optimizations when built with -mavx2. However, + # we still need to support non-AVX2-capable hardware. So, we build it twice, + # once with the flag and once without, and use some linker tricks to + # suffix the AVX2 symbols with '_avx2'. OSX doesn't have objcopy, so we only + # do this trick on Linux. + if [ -n "$OS_LINUX" ]; then + arches="default avx2" + else + arches="default" + fi + to_link="" + for arch in $arches ; do + arch_flag="" + if [ "$arch" == "avx2" ]; then + arch_flag="-mavx2" + fi + tmp_obj=bitshuffle_${arch}_tmp.o + dst_obj=bitshuffle_${arch}.o + ${CC:-gcc} $EXTRA_CFLAGS $arch_flag -std=c99 -I$PREFIX/include -O3 -DNDEBUG -fPIC -c \ + "$BITSHUFFLE_SOURCE/src/bitshuffle_core.c" \ + "$BITSHUFFLE_SOURCE/src/bitshuffle.c" \ + "$BITSHUFFLE_SOURCE/src/iochain.c" + # Merge the object files together to produce a combined .o file. + ld -r -o $tmp_obj bitshuffle_core.o bitshuffle.o iochain.o + # For the AVX2 symbols, suffix them. + if [ "$arch" == "avx2" ]; then + # Create a mapping file with '<old_sym> <suffixed_sym>' on each line. + nm --defined-only --extern-only $tmp_obj | while read addr type sym ; do + echo ${sym} ${sym}_${arch} + done > renames.txt + objcopy --redefine-syms=renames.txt $tmp_obj $dst_obj + else + mv $tmp_obj $dst_obj + fi + to_link="$to_link $dst_obj" + done + + rm -f bitshuffle.a + ar rs bitshuffle.a $to_link cp bitshuffle.a $PREFIX/lib/ cp $BITSHUFFLE_SOURCE/src/bitshuffle.h $PREFIX/include/bitshuffle.h cp $BITSHUFFLE_SOURCE/src/bitshuffle_core.h $PREFIX/include/bitshuffle_core.h
