This is an automated email from the ASF dual-hosted git repository. adar pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 1c60956335675ea0b6ba87a5548cce37634e8306 Author: Bankim Bhavsar <[email protected]> AuthorDate: Fri Mar 13 23:04:22 2020 -0700 [util] Minor changes in BlockBloomFilter requested by Impala 1) Do not include glog/logging.h and gflags/gflags.h in public header file for utilities like block_bloom_filter.h and hash_util.h as it causes problems with code-gen/IR in Impala. 2) Public OrEqualArray() static function in BlockBloomFilter to avoid a copy. This change forced using static function pointers that are initialized only once. Change-Id: I5944f80f4c071ce787eded3f5b41d3bc56560cd0 Reviewed-on: http://gerrit.cloudera.org:8080/15450 Tested-by: Adar Dembo <[email protected]> Reviewed-by: Adar Dembo <[email protected]> --- src/kudu/util/block_bloom_filter-test.cc | 11 ++++++ src/kudu/util/block_bloom_filter.cc | 59 +++++++++++++++++++++++++------- src/kudu/util/block_bloom_filter.h | 47 +++++++++++++++++-------- src/kudu/util/block_bloom_filter_avx2.cc | 5 ++- src/kudu/util/hash_util.h | 8 +++-- 5 files changed, 100 insertions(+), 30 deletions(-) diff --git a/src/kudu/util/block_bloom_filter-test.cc b/src/kudu/util/block_bloom_filter-test.cc index 8e12099..fa9b989 100644 --- a/src/kudu/util/block_bloom_filter-test.cc +++ b/src/kudu/util/block_bloom_filter-test.cc @@ -20,6 +20,7 @@ #include <cmath> // IWYU pragma: keep #include <cstdint> #include <cstdlib> +#include <cstring> #include <iosfwd> #include <memory> #include <unordered_set> @@ -60,6 +61,7 @@ class BlockBloomFilterTest : public KuduTest { BlockBloomFilter* CreateBloomFilter(size_t log_space_bytes) { FLAGS_disable_blockbloomfilter_avx2 = (MakeRand() & 0x1) == 0; + BlockBloomFilter::InitializeFunctionPtrs(); unique_ptr<BlockBloomFilter> bf(new BlockBloomFilter(allocator_)); CHECK_OK(bf->Init(log_space_bytes, FAST_HASH, 0)); @@ -318,5 +320,14 @@ TEST_F(BlockBloomFilterTest, Or) { Status s = bf4->Or(*bf5); ASSERT_TRUE(s.IsInvalidArgument()); ASSERT_STR_CONTAINS(s.ToString(), "Directory size don't match"); + + // Test the public OrEqualArray() function. + static constexpr size_t kNumBytes = 64; + unique_ptr<uint8_t[]> a_ptr(new uint8_t[kNumBytes]); + unique_ptr<uint8_t[]> b_ptr(new uint8_t[kNumBytes]); + memset(a_ptr.get(), 0xDE, kNumBytes); + memset(b_ptr.get(), 0, kNumBytes); + ASSERT_OK(BlockBloomFilter::OrEqualArray(kNumBytes, a_ptr.get(), b_ptr.get())); + ASSERT_EQ(0, memcmp(a_ptr.get(), b_ptr.get(), kNumBytes)); } } // namespace kudu diff --git a/src/kudu/util/block_bloom_filter.cc b/src/kudu/util/block_bloom_filter.cc index 40a3d64..bb48aa0 100644 --- a/src/kudu/util/block_bloom_filter.cc +++ b/src/kudu/util/block_bloom_filter.cc @@ -24,10 +24,12 @@ #include <cmath> #include <cstdlib> #include <cstring> +#include <mutex> #include <string> #include <gflags/gflags.h> +#include "kudu/gutil/cpu.h" #include "kudu/gutil/strings/substitute.h" #include "kudu/util/block_bloom_filter.pb.h" #include "kudu/util/flag_tags.h" @@ -43,22 +45,24 @@ DEFINE_bool(disable_blockbloomfilter_avx2, false, "that doesn't support AVX2."); TAG_FLAG(disable_blockbloomfilter_avx2, hidden); +// Flag used to initialize the static function pointers for the BlockBloomFilter class. +static std::once_flag g_init_func_ptrs_flag; + namespace kudu { +// Initialize the static member variables from BlockBloomFilter class. constexpr uint32_t BlockBloomFilter::kRehash[8] __attribute__((aligned(32))); const base::CPU BlockBloomFilter::kCpu = base::CPU(); // constexpr data member requires initialization in the class declaration. // Hence no duplicate initialization in the definition here. constexpr BlockBloomFilter* const BlockBloomFilter::kAlwaysTrueFilter; -BlockBloomFilter::BlockBloomFilter(BlockBloomFilterBufferAllocatorIf* buffer_allocator) : - always_false_(true), - buffer_allocator_(buffer_allocator), - log_num_buckets_(0), - directory_mask_(0), - directory_(nullptr), - hash_algorithm_(UNKNOWN_HASH), - hash_seed_(0) { +decltype(&BlockBloomFilter::BucketInsert) BlockBloomFilter::bucket_insert_func_ptr_ = nullptr; +decltype(&BlockBloomFilter::BucketFind) BlockBloomFilter::bucket_find_func_ptr_ = nullptr; +decltype(&BlockBloomFilter::OrEqualArrayNoAVX2) BlockBloomFilter::or_equal_array_func_ptr_ = + nullptr; + +void BlockBloomFilter::InitializeFunctionPtrs() { #ifdef USE_AVX2 if (has_avx2()) { bucket_insert_func_ptr_ = &BlockBloomFilter::BucketInsertAVX2; @@ -67,15 +71,30 @@ BlockBloomFilter::BlockBloomFilter(BlockBloomFilterBufferAllocatorIf* buffer_all } else { bucket_insert_func_ptr_ = &BlockBloomFilter::BucketInsert; bucket_find_func_ptr_ = &BlockBloomFilter::BucketFind; - or_equal_array_func_ptr_ = &BlockBloomFilter::OrEqualArray; + or_equal_array_func_ptr_ = &BlockBloomFilter::OrEqualArrayNoAVX2; } #else bucket_insert_func_ptr_ = &BlockBloomFilter::BucketInsert; bucket_find_func_ptr_ = &BlockBloomFilter::BucketFind; - or_equal_array_func_ptr_ = &BlockBloomFilter::OrEqualArray; + or_equal_array_func_ptr_ = &BlockBloomFilter::OrEqualArrayNoAVX2; #endif } +BlockBloomFilter::BlockBloomFilter(BlockBloomFilterBufferAllocatorIf* buffer_allocator) : + always_false_(true), + buffer_allocator_(buffer_allocator), + log_num_buckets_(0), + directory_mask_(0), + directory_(nullptr), + hash_algorithm_(UNKNOWN_HASH), + hash_seed_(0) { + std::call_once(g_init_func_ptrs_flag, InitializeFunctionPtrs); + + DCHECK_NOTNULL(bucket_insert_func_ptr_); + DCHECK_NOTNULL(bucket_find_func_ptr_); + DCHECK_NOTNULL(or_equal_array_func_ptr_); +} + BlockBloomFilter::~BlockBloomFilter() { Close(); } @@ -273,8 +292,20 @@ bool BlockBloomFilter::operator!=(const BlockBloomFilter& rhs) const { return !(rhs == *this); } -void BlockBloomFilter::OrEqualArray(size_t n, const uint8_t* __restrict__ in, - uint8_t* __restrict__ out) { +Status BlockBloomFilter::OrEqualArray(size_t n, const uint8_t* __restrict__ in, + uint8_t* __restrict__ out) { + if ((n % kBucketByteSize) != 0) { + return Status::InvalidArgument(Substitute("Input size $0 not a multiple of 32-bytes", n)); + } + + std::call_once(g_init_func_ptrs_flag, InitializeFunctionPtrs); + DCHECK_NOTNULL(or_equal_array_func_ptr_); + (*or_equal_array_func_ptr_)(n, in, out); + return Status::OK(); +} + +void BlockBloomFilter::OrEqualArrayNoAVX2(size_t n, const uint8_t* __restrict__ in, + uint8_t* __restrict__ out) { // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not // written in a way that is very friendly to auto-vectorization. Instead, we manually // vectorize, increasing the speed by up to 56x. @@ -323,6 +354,10 @@ Status BlockBloomFilter::Or(const BlockBloomFilter& other) { return Status::OK(); } +bool BlockBloomFilter::has_avx2() { + return !FLAGS_disable_blockbloomfilter_avx2 && kCpu.has_avx2(); +} + shared_ptr<DefaultBlockBloomFilterBufferAllocator> DefaultBlockBloomFilterBufferAllocator::GetSingletonSharedPtr() { // Meyer's Singleton. diff --git a/src/kudu/util/block_bloom_filter.h b/src/kudu/util/block_bloom_filter.h index 6a9a991..35b2166 100644 --- a/src/kudu/util/block_bloom_filter.h +++ b/src/kudu/util/block_bloom_filter.h @@ -23,10 +23,9 @@ #include <limits> #include <memory> -#include <gflags/gflags_declare.h> -#include <glog/logging.h> +// Including glog/logging.h causes problems while compiling in Apache Impala for codegen. +// IWYU pragma: no_include <glog/logging.h> -#include "kudu/gutil/cpu.h" #include "kudu/gutil/macros.h" #include "kudu/gutil/port.h" #include "kudu/util/hash.pb.h" @@ -35,13 +34,15 @@ #include "kudu/util/slice.h" #include "kudu/util/status.h" +namespace base { +class CPU; +} // namespace base + namespace kudu { class Arena; class BlockBloomFilterPB; } // namespace kudu -DECLARE_bool(disable_blockbloomfilter_avx2); - namespace kudu { // Forward declaration. @@ -159,6 +160,11 @@ class BlockBloomFilter { // - Or'ing with kAlwaysTrueFilter is disallowed. Status Or(const BlockBloomFilter& other); + // Computes out[i] |= in[i] for the arrays 'in' and 'out' of length 'n' bytes where 'n' + // is multiple of 32-bytes. + static Status OrEqualArray(size_t n, const uint8_t* __restrict__ in, + uint8_t* __restrict__ out); + // Returns whether the Bloom filter is empty and hence would return false for all lookups. bool always_false() const { return always_false_; @@ -195,6 +201,8 @@ class BlockBloomFilter { // log2(number of bytes in a bucket) static constexpr int kLogBucketByteSize = 5; + // Bucket size in bytes. + static constexpr size_t kBucketByteSize = 1UL << kLogBucketByteSize; static_assert((1 << kLogBucketWordBits) == std::numeric_limits<BucketWord>::digits, "BucketWord must have a bit-width that is be a power of 2, like 64 for uint64_t."); @@ -229,8 +237,10 @@ class BlockBloomFilter { bool BucketFind(uint32_t bucket_idx, uint32_t hash) const noexcept; - // Computes out[i] |= in[i] for the arrays 'in' and 'out' of length 'n'. - static void OrEqualArray(size_t n, const uint8_t* __restrict__ in, uint8_t* __restrict__ out); + // Computes out[i] |= in[i] for the arrays 'in' and 'out' of length 'n' without using AVX2 + // operations. + static void OrEqualArrayNoAVX2(size_t n, const uint8_t* __restrict__ in, + uint8_t* __restrict__ out); #ifdef USE_AVX2 // Same as Insert(), but skips the CPU check and assumes that AVX2 is available. @@ -250,11 +260,16 @@ class BlockBloomFilter { uint8_t* __restrict__ out) __attribute__((target("avx2"))); #endif - // Function pointers initialized in constructor to avoid run-time cost - // in hot-path of Find and Insert operations. - decltype(&BlockBloomFilter::BucketInsert) bucket_insert_func_ptr_; - decltype(&BlockBloomFilter::BucketFind) bucket_find_func_ptr_; - decltype(&BlockBloomFilter::OrEqualArray) or_equal_array_func_ptr_; + // Function pointers initialized just once to avoid run-time cost + // in hot-path of Find, Insert and Or operations. + static decltype(&BlockBloomFilter::BucketInsert) bucket_insert_func_ptr_; + static decltype(&BlockBloomFilter::BucketFind) bucket_find_func_ptr_; + static decltype(&BlockBloomFilter::OrEqualArrayNoAVX2) or_equal_array_func_ptr_; + + // Helper function to initialize the function pointers above based on whether + // compiler and/or CPU at run-time supports AVX2. + // It also helps testing both AVX2 and non-AVX2 code paths. + static void InitializeFunctionPtrs(); // Size of the internal directory structure in bytes. int64_t directory_size() const { @@ -262,9 +277,7 @@ class BlockBloomFilter { } // Detect at run-time whether CPU supports AVX2 - static bool has_avx2() { - return !FLAGS_disable_blockbloomfilter_avx2 && kCpu.has_avx2(); - } + static bool has_avx2(); // Some constants used in hashing. #defined for efficiency reasons. #define BLOOM_HASH_CONSTANTS \ @@ -292,6 +305,10 @@ class BlockBloomFilter { } DISALLOW_COPY_AND_ASSIGN(BlockBloomFilter); + + // Allow BlockBloomFilterTest unit test to invoke the private InitializeFunctionPtrs() + // function to test both AVX2 and non-AVX2 code paths. + friend class BlockBloomFilterTest; }; // Generic interface to allocate and de-allocate memory for the BlockBloomFilter. diff --git a/src/kudu/util/block_bloom_filter_avx2.cc b/src/kudu/util/block_bloom_filter_avx2.cc index 93c3ff6..551c75d 100644 --- a/src/kudu/util/block_bloom_filter_avx2.cc +++ b/src/kudu/util/block_bloom_filter_avx2.cc @@ -82,8 +82,11 @@ void BlockBloomFilter::InsertAvx2(const uint32_t hash) noexcept { void BlockBloomFilter::OrEqualArrayAVX2(size_t n, const uint8_t* __restrict__ in, uint8_t* __restrict__ out) { - constexpr size_t kAVXRegisterBytes = sizeof(__m256d); + static constexpr size_t kAVXRegisterBytes = sizeof(__m256d); + static_assert(kAVXRegisterBytes == kBucketByteSize, + "Unexpected AVX register bytes"); DCHECK_EQ(n % kAVXRegisterBytes, 0) << "Invalid Bloom filter directory size"; + const uint8_t* const in_end = in + n; for (; in != in_end; (in += kAVXRegisterBytes), (out += kAVXRegisterBytes)) { const double* double_in = reinterpret_cast<const double*>(in); diff --git a/src/kudu/util/hash_util.h b/src/kudu/util/hash_util.h index f9ac3f7..71055e1 100644 --- a/src/kudu/util/hash_util.h +++ b/src/kudu/util/hash_util.h @@ -19,7 +19,9 @@ #define KUDU_UTIL_HASH_UTIL_H #include <cstdint> -#include <glog/logging.h> + +// Including glog/logging.h causes problems while compiling in Apache Impala for codegen. +// IWYU pragma: no_include <glog/logging.h> #include "kudu/gutil/port.h" #include "kudu/util/hash.pb.h" @@ -134,7 +136,9 @@ class HashUtil { case FAST_HASH: return FastHash32(data.data(), data.size(), seed); default: - LOG(FATAL) << "Not implemented 32-bit hash function: " << hash_algorithm; + // Can't use LOG(FATAL)/CHECK() since including glog/logging.h causes problems + // with code-gen in Impala. + abort(); } }
