IMPALA-2809: Improve scalar ByteSwap(). This patch improves our ByteSwap() function by handling more byte sizes in the fast path, as opposed to the loop-based slow path.
ByteSwap() is used heavily in when scanning Parquet decimals. Before this patch, VTune showed ByteSwap() among the top three worst cycle offenders when running TPCH-Q6 on my local setup with a large lineitem table. After this patch, ByteSwap() shows no significant contribution to the overall cycles spent. There was a measurable improvement of a few percent for TPCH-Q6. Change-Id: I4f462e6bdb022db46b48889a6a7426120a80d9b4 Reviewed-on: http://gerrit.cloudera.org:8080/3033 Reviewed-by: Dan Hecht <dhe...@cloudera.com> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/0306dd57 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/0306dd57 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/0306dd57 Branch: refs/heads/master Commit: 0306dd576d05b6773e380d93af8b3dab90ecdd32 Parents: 46c3e43 Author: Youwei Wang <youwei.a.w...@intel.com> Authored: Wed May 11 15:27:27 2016 -0700 Committer: Tim Armstrong <tarmstr...@cloudera.com> Committed: Fri May 13 15:52:53 2016 -0700 ---------------------------------------------------------------------- be/src/util/bit-util-test.cc | 24 ++++++- be/src/util/bit-util.h | 40 +++--------- be/src/util/bit-util.inline.h | 130 +++++++++++++++++++++++++++++++++++++ be/src/util/decimal-util.h | 2 +- 4 files changed, 162 insertions(+), 34 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0306dd57/be/src/util/bit-util-test.cc ---------------------------------------------------------------------- diff --git a/be/src/util/bit-util-test.cc b/be/src/util/bit-util-test.cc index f376737..e2bdd97 100644 --- a/be/src/util/bit-util-test.cc +++ b/be/src/util/bit-util-test.cc @@ -21,7 +21,7 @@ #include <gtest/gtest.h> #include "gutil/bits.h" -#include "util/bit-util.h" +#include "util/bit-util.inline.h" #include "util/cpu-info.h" #include "common/names.h" @@ -111,6 +111,28 @@ TEST(BitUtil, ByteSwap) { EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0)), 0); EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0x1122)), 0x2211); + + // Test ByteSwap() with an input/output buffer, swapping up to 32 bytes. + int buf_size = 32; + uint8_t src_buf[buf_size]; + for (int i = 0; i < buf_size; ++i) { + src_buf[i] = i; + } + uint8_t dst_buf[buf_size]; + for (int i = 0; i < buf_size; ++i) { + // Init dst buffer and swap i bytes. + memset(dst_buf, 0, buf_size); + BitUtil::ByteSwap(dst_buf, src_buf, i); + // Validate the swap results. + for (int j = 0; j < i; ++j) { + EXPECT_EQ(dst_buf[j], i - j - 1); + EXPECT_EQ(dst_buf[j], src_buf[i - j - 1]); + } + // Check that the dst buffer is otherwise unmodified. + for (int j = i; j < buf_size; ++j) { + EXPECT_EQ(dst_buf[j], 0); + } + } } TEST(BitUtil, Log2) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0306dd57/be/src/util/bit-util.h ---------------------------------------------------------------------- diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h index eed6df3..cae8212 100644 --- a/be/src/util/bit-util.h +++ b/be/src/util/bit-util.h @@ -148,49 +148,25 @@ class BitUtil { return __builtin_bswap64(value); } static inline uint64_t ByteSwap(uint64_t value) { - return static_cast<uint64_t>(__builtin_bswap64(value)); + return __builtin_bswap64(value); } static inline int32_t ByteSwap(int32_t value) { return __builtin_bswap32(value); } static inline uint32_t ByteSwap(uint32_t value) { - return static_cast<uint32_t>(__builtin_bswap32(value)); + return __builtin_bswap32(value); } static inline int16_t ByteSwap(int16_t value) { - return (((value >> 8) & 0xff) | ((value & 0xff) << 8)); + return __builtin_bswap16(value); } static inline uint16_t ByteSwap(uint16_t value) { - return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value))); + return __builtin_bswap16(value); } - /// Write the swapped bytes into dst. Src and st cannot overlap. - static inline void ByteSwap(void* dst, const void* src, int len) { - switch (len) { - case 1: - *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src); - return; - case 2: - *reinterpret_cast<int16_t*>(dst) = - ByteSwap(*reinterpret_cast<const int16_t*>(src)); - return; - case 4: - *reinterpret_cast<int32_t*>(dst) = - ByteSwap(*reinterpret_cast<const int32_t*>(src)); - return; - case 8: - *reinterpret_cast<int64_t*>(dst) = - ByteSwap(*reinterpret_cast<const int64_t*>(src)); - return; - default: break; - } - - uint8_t* d = reinterpret_cast<uint8_t*>(dst); - const uint8_t* s = reinterpret_cast<const uint8_t*>(src); - for (int i = 0; i < len; ++i) { - d[i] = s[len - i - 1]; - } - - } + /// Write the swapped bytes into dest; source and dest must not overlap. + /// This function is optimized for len <= 16. It reverts to a slow loop-based + /// swap for len > 16. + static inline void ByteSwap(void* dest, const void* source, int len); /// Converts to big endian format (if not already in big endian) from the /// machine's native endian format. http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0306dd57/be/src/util/bit-util.inline.h ---------------------------------------------------------------------- diff --git a/be/src/util/bit-util.inline.h b/be/src/util/bit-util.inline.h new file mode 100644 index 0000000..33b44c6 --- /dev/null +++ b/be/src/util/bit-util.inline.h @@ -0,0 +1,130 @@ +// Copyright 2016 Cloudera Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#ifndef IMPALA_BIT_UTIL_INLINE_H +#define IMPALA_BIT_UTIL_INLINE_H + +#include "util/bit-util.h" + +namespace impala { + +inline void BitUtil::ByteSwap(void* dest, const void* source, int len) { + uint8_t* dst = reinterpret_cast<uint8_t*>(dest); + const uint8_t* src = reinterpret_cast<const uint8_t*>(source); + switch (len) { + case 1: + *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src); + return; + case 2: + *reinterpret_cast<uint16_t*>(dst) = + ByteSwap(*reinterpret_cast<const uint16_t*>(src)); + return; + case 3: + *reinterpret_cast<uint16_t*>(dst + 1) = + ByteSwap(*reinterpret_cast<const uint16_t*>(src)); + *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 2); + return; + case 4: + *reinterpret_cast<uint32_t*>(dst) = + ByteSwap(*reinterpret_cast<const uint32_t*>(src)); + return; + case 5: + *reinterpret_cast<uint32_t*>(dst + 1) = + ByteSwap(*reinterpret_cast<const uint32_t*>(src)); + *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 4); + return; + case 6: + *reinterpret_cast<uint32_t*>(dst + 2) = + ByteSwap(*reinterpret_cast<const uint32_t*>(src)); + *reinterpret_cast<uint16_t*>(dst) = + ByteSwap(*reinterpret_cast<const uint16_t*>(src + 4)); + return; + case 7: + *reinterpret_cast<uint32_t*>(dst + 3) = + ByteSwap(*reinterpret_cast<const uint32_t*>(src)); + *reinterpret_cast<uint16_t*>(dst + 1) = + ByteSwap(*reinterpret_cast<const uint16_t*>(src + 4)); + *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 6); + return; + case 8: + *reinterpret_cast<uint64_t*>(dst) = + ByteSwap(*reinterpret_cast<const uint64_t*>(src)); + return; + case 9: + *reinterpret_cast<uint64_t*>(dst + 1) = + ByteSwap(*reinterpret_cast<const uint64_t*>(src)); + *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 8); + return; + case 10: + *reinterpret_cast<uint64_t*>(dst + 2) = + ByteSwap(*reinterpret_cast<const uint64_t*>(src)); + *reinterpret_cast<uint16_t*>(dst) = + ByteSwap(*reinterpret_cast<const uint16_t*>(src + 8)); + return; + case 11: + *reinterpret_cast<uint64_t*>(dst + 3) = + ByteSwap(*reinterpret_cast<const uint64_t*>(src)); + *reinterpret_cast<uint16_t*>(dst + 1) = + ByteSwap(*reinterpret_cast<const uint16_t*>(src + 8)); + *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 10); + return; + case 12: + *reinterpret_cast<uint64_t*>(dst + 4) = + ByteSwap(*reinterpret_cast<const uint64_t*>(src)); + *reinterpret_cast<uint32_t*>(dst) = + ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8)); + return; + case 13: + *reinterpret_cast<uint64_t*>(dst + 5) = + ByteSwap(*reinterpret_cast<const uint64_t*>(src)); + *reinterpret_cast<uint32_t*>(dst + 1) = + ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8)); + *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 12); + return; + case 14: + *reinterpret_cast<uint64_t*>(dst + 6) = + ByteSwap(*reinterpret_cast<const uint64_t*>(src)); + *reinterpret_cast<uint32_t*>(dst + 2) = + ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8)); + *reinterpret_cast<uint16_t*>(dst) = + ByteSwap(*reinterpret_cast<const uint16_t*>(src + 12)); + return; + case 15: + *reinterpret_cast<uint64_t*>(dst + 7) = + ByteSwap(*reinterpret_cast<const uint64_t*>(src)); + *reinterpret_cast<uint32_t*>(dst + 3) = + ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8)); + *reinterpret_cast<uint16_t*>(dst + 1) = + ByteSwap(*reinterpret_cast<const uint16_t*>(src + 12)); + *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 14); + return; + case 16: + *reinterpret_cast<uint64_t*>(dst + 8) = + ByteSwap(*reinterpret_cast<const uint64_t*>(src)); + *reinterpret_cast<uint64_t*>(dst) = + ByteSwap(*reinterpret_cast<const uint64_t*>(src + 8)); + return; + default: + // Revert to slow loop-based swap. + for (int i = 0; i < len; ++i) { + dst[i] = src[len - i - 1]; + } + return; + } +} + +} + +#endif http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0306dd57/be/src/util/decimal-util.h ---------------------------------------------------------------------- diff --git a/be/src/util/decimal-util.h b/be/src/util/decimal-util.h index 79b7130..2ce69bc 100644 --- a/be/src/util/decimal-util.h +++ b/be/src/util/decimal-util.h @@ -22,7 +22,7 @@ #include "runtime/multi-precision.h" #include "runtime/types.h" -#include "util/bit-util.h" +#include "util/bit-util.inline.h" namespace impala {