This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 6244527a8b1 branch-4.0: [opt](arm) Improve count_zero_num performance
with NEON intrinsics (#58803)
6244527a8b1 is described below
commit 6244527a8b186ade2e5f5ff3db0f3abbede3b2f7
Author: Zhiguo Wu <[email protected]>
AuthorDate: Tue Dec 9 15:38:52 2025 +0800
branch-4.0: [opt](arm) Improve count_zero_num performance with NEON
intrinsics (#58803)
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #58615
Problem Summary: Fix the SSE path and port the NEON code from #58615, without
the benchmark and unit tests.
### Release note
None
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here, e.g.
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
be/src/util/simd/bits.h | 62 ++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 51 insertions(+), 11 deletions(-)
diff --git a/be/src/util/simd/bits.h b/be/src/util/simd/bits.h
index 186e43746d2..01ac30d45ba 100644
--- a/be/src/util/simd/bits.h
+++ b/be/src/util/simd/bits.h
@@ -22,7 +22,7 @@
#include <type_traits>
#include <vector>
-#if defined(__ARM_NEON) && defined(__aarch64__)
+#if defined(__ARM_NEON)
#include <arm_neon.h>
#endif
@@ -130,7 +130,21 @@ template <typename T>
inline T count_zero_num(const int8_t* __restrict data, T size) {
T num = 0;
const int8_t* end = data + size;
-#if defined(__SSE2__) && defined(__POPCNT__)
+#if defined(__ARM_NEON)
+ const int8_t* end64 = data + (size / 64 * 64);
+
+ for (; data < end64; data += 64) {
+ auto a0 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data)), 7);
+ auto a1 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data + 16)), 7);
+ auto a2 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data + 32)), 7);
+ auto a3 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data + 48)), 7);
+
+ auto s0 = vaddq_u8(a0, a1);
+ auto s1 = vaddq_u8(a2, a3);
+ auto s = vaddq_u8(s0, s1);
+ num += vaddvq_u8(s);
+ }
+#elif defined(__SSE2__) && defined(__POPCNT__)
const __m128i zero16 = _mm_setzero_si128();
const int8_t* end64 = data + (size / 64 * 64);
@@ -160,8 +174,28 @@ template <typename T>
inline T count_zero_num(const int8_t* __restrict data, const uint8_t*
__restrict null_map, T size) {
T num = 0;
const int8_t* end = data + size;
-#if defined(__SSE2__) && defined(__POPCNT__)
+#if defined(__ARM_NEON)
+ const int8_t* end64 = data + (size / 64 * 64);
+
+ for (; data < end64; data += 64, null_map += 64) {
+ auto a0 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data)), 7);
+ auto a1 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data + 16)), 7);
+ auto a2 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data + 32)), 7);
+ auto a3 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data + 48)), 7);
+
+ auto r0 = vorrq_u8(a0, vld1q_u8(null_map));
+ auto r1 = vorrq_u8(a1, vld1q_u8(null_map + 16));
+ auto r2 = vorrq_u8(a2, vld1q_u8(null_map + 32));
+ auto r3 = vorrq_u8(a3, vld1q_u8(null_map + 48));
+
+ auto s0 = vaddq_u8(r0, r1);
+ auto s1 = vaddq_u8(r2, r3);
+ auto s = vaddq_u8(s0, s1);
+ num += vaddvq_u8(s);
+ }
+#elif defined(__SSE2__) && defined(__POPCNT__)
const __m128i zero16 = _mm_setzero_si128();
+ const __m128i one16 = _mm_set1_epi8(1);
const int8_t* end64 = data + (size / 64 * 64);
for (; data < end64; data += 64, null_map += 64) {
@@ -169,25 +203,31 @@ inline T count_zero_num(const int8_t* __restrict data,
const uint8_t* __restrict
static_cast<uint64_t>(_mm_movemask_epi8(_mm_or_si128(
_mm_cmpeq_epi8(_mm_loadu_si128(reinterpret_cast<const
__m128i*>(data)),
zero16),
- _mm_loadu_si128(reinterpret_cast<const
__m128i*>(null_map))))) |
+ _mm_cmpeq_epi8(_mm_loadu_si128(reinterpret_cast<const
__m128i*>(null_map)),
+ one16)))) |
(static_cast<uint64_t>(_mm_movemask_epi8(_mm_or_si128(
_mm_cmpeq_epi8(
_mm_loadu_si128(reinterpret_cast<const
__m128i*>(data + 16)),
zero16),
- _mm_loadu_si128(reinterpret_cast<const
__m128i*>(null_map + 16)))))
+ _mm_cmpeq_epi8(
+ _mm_loadu_si128(reinterpret_cast<const
__m128i*>(null_map + 16)),
+ one16))))
<< 16U) |
(static_cast<uint64_t>(_mm_movemask_epi8(_mm_or_si128(
_mm_cmpeq_epi8(
_mm_loadu_si128(reinterpret_cast<const
__m128i*>(data + 32)),
zero16),
- _mm_loadu_si128(reinterpret_cast<const
__m128i*>(null_map + 32)))))
+ _mm_cmpeq_epi8(
+ _mm_loadu_si128(reinterpret_cast<const
__m128i*>(null_map + 32)),
+ one16))))
<< 32U) |
(static_cast<uint64_t>(_mm_movemask_epi8(_mm_or_si128(
- _mm_cmpeq_epi8(
- _mm_loadu_si128(reinterpret_cast<const
__m128i*>(data + 48)),
- zero16),
- _mm_loadu_si128(reinterpret_cast<const
__m128i*>(null_map + 48)))))
- << 48U));
+ _mm_cmpeq_epi8(_mm_loadu_si128(reinterpret_cast<const
__m128i*>(data + 48)),
+ zero16),
+ _mm_cmpeq_epi8(
+ _mm_loadu_si128(reinterpret_cast<const
__m128i*>(null_map + 48)),
+ one16)))))
+ << 48U);
}
#endif
for (; data < end; ++data, ++null_map) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]