This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 8867a826bca [opt](arm) Optimize the BlockBloomFilter::bucket_find on
ARM platform… (#43508)
8867a826bca is described below
commit 8867a826bca6982e6fc88672409212b704ab75d7
Author: Mryange <[email protected]>
AuthorDate: Sun Nov 10 18:23:58 2024 +0800
[opt](arm) Optimize the BlockBloomFilter::bucket_find on ARM platform…
(#43508)
…s using NEON instructions. (#38888)
https://github.com/apache/doris/pull/38888
## Proposed changes
```
--------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------
BM_BucketFindNeon 8.14 ns 8.14 ns 344002441
BM_BucketFindNative 17.5 ns 17.5 ns 160152491
```
---
be/src/exprs/block_bloom_filter.hpp | 36 ++++++++++++++++++++++++++++++++-
be/src/exprs/block_bloom_filter_impl.cc | 29 +++++++++++++++++++++++---
2 files changed, 61 insertions(+), 4 deletions(-)
diff --git a/be/src/exprs/block_bloom_filter.hpp
b/be/src/exprs/block_bloom_filter.hpp
index f31d7f7d4c0..b7d488a3003 100644
--- a/be/src/exprs/block_bloom_filter.hpp
+++ b/be/src/exprs/block_bloom_filter.hpp
@@ -124,6 +124,39 @@ public:
return false;
}
+#ifdef __ARM_NEON
+ void make_find_mask(uint32_t key, uint32x4_t* masks) const noexcept {
+ uint32x4_t hash_data_1 = vdupq_n_u32(key);
+ uint32x4_t hash_data_2 = vdupq_n_u32(key);
+
+ uint32x4_t rehash_1 = vld1q_u32(&kRehash[0]);
+ uint32x4_t rehash_2 = vld1q_u32(&kRehash[4]);
+
+ // masks[i] = key * kRehash[i];
+ hash_data_1 = vmulq_u32(rehash_1, hash_data_1);
+ hash_data_2 = vmulq_u32(rehash_2, hash_data_2);
+ // masks[i] = masks[i] >> shift_num;
+ hash_data_1 = vshrq_n_u32(hash_data_1, shift_num);
+ hash_data_2 = vshrq_n_u32(hash_data_2, shift_num);
+
+ const uint32x4_t ones = vdupq_n_u32(1);
+
+ // masks[i] = 0x1 << masks[i];
+ masks[0] = vshlq_u32(ones, reinterpret_cast<int32x4_t>(hash_data_1));
+ masks[1] = vshlq_u32(ones, reinterpret_cast<int32x4_t>(hash_data_2));
+ }
+#else
+ void make_find_mask(uint32_t key, uint32_t* masks) const noexcept {
+ for (int i = 0; i < kBucketWords; ++i) {
+ masks[i] = key * kRehash[i];
+
+ masks[i] = masks[i] >> shift_num;
+
+ masks[i] = 0x1 << masks[i];
+ }
+ }
+#endif
+
// Computes the logical OR of this filter with 'other' and stores the
result in this
// filter.
// Notes:
@@ -163,7 +196,8 @@ private:
// log2(number of bits in a BucketWord)
static constexpr int kLogBucketWordBits = 5;
static constexpr BucketWord kBucketWordMask = (1 << kLogBucketWordBits) -
1;
-
+ // (>> 27) is equivalent to (mod 32)
+ static constexpr auto shift_num = ((1 << kLogBucketWordBits) -
kLogBucketWordBits);
// log2(number of bytes in a bucket)
static constexpr int kLogBucketByteSize = 5;
// Bucket size in bytes.
diff --git a/be/src/exprs/block_bloom_filter_impl.cc
b/be/src/exprs/block_bloom_filter_impl.cc
index d285edcb310..e89b9142266 100644
--- a/be/src/exprs/block_bloom_filter_impl.cc
+++ b/be/src/exprs/block_bloom_filter_impl.cc
@@ -138,14 +138,37 @@ void BlockBloomFilter::bucket_insert(const uint32_t
bucket_idx, const uint32_t h
}
bool BlockBloomFilter::bucket_find(const uint32_t bucket_idx, const uint32_t
hash) const noexcept {
+#if defined(__ARM_NEON)
+ uint32x4_t masks[2];
+
+ uint32x4_t directory_1 = vld1q_u32(&_directory[bucket_idx][0]);
+ uint32x4_t directory_2 = vld1q_u32(&_directory[bucket_idx][4]);
+
+ make_find_mask(hash, masks);
+ // The condition for returning true is that all the bits in
_directory[bucket_idx][i] specified by masks[i] are 1.
+ // This can be equivalently expressed as all the bits in not(
_directory[bucket_idx][i]) specified by masks[i] are 0.
+ // vbicq_u32(vec1, vec2) : Result of (vec1 AND NOT vec2)
+ // If true is returned, out_1 and out_2 should be all zeros.
+ uint32x4_t out_1 = vbicq_u32(masks[0], directory_1);
+ uint32x4_t out_2 = vbicq_u32(masks[1], directory_2);
+
+ out_1 = vorrq_u32(out_1, out_2);
+
+ uint32x2_t low = vget_low_u32(out_1);
+ uint32x2_t high = vget_high_u32(out_1);
+ low = vorr_u32(low, high);
+ uint32_t res = vget_lane_u32(low, 0) | vget_lane_u32(low, 1);
+ return !(res);
+#else
+ uint32_t masks[kBucketWords];
+ make_find_mask(hash, masks);
for (int i = 0; i < kBucketWords; ++i) {
- BucketWord hval = (kRehash[i] * hash) >> ((1 << kLogBucketWordBits) -
kLogBucketWordBits);
- hval = 1U << hval;
- if (!(DCHECK_NOTNULL(_directory)[bucket_idx][i] & hval)) {
+ if ((DCHECK_NOTNULL(_directory)[bucket_idx][i] & masks[i]) == 0) {
return false;
}
}
return true;
+#endif
}
void BlockBloomFilter::insert_no_avx2(const uint32_t hash) noexcept {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]