(doris) branch branch-4.0 updated: branch-4.0: [opt](arm) Improve count_zero_num performance with NEON intrinsics (#58803)

yiguolei Mon, 08 Dec 2025 23:39:05 -0800

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new 6244527a8b1 branch-4.0: [opt](arm) Improve count_zero_num performance 
with NEON intrinsics (#58803)
6244527a8b1 is described below

commit 6244527a8b186ade2e5f5ff3db0f3abbede3b2f7
Author: Zhiguo Wu <[email protected]>
AuthorDate: Tue Dec 9 15:38:52 2025 +0800

    branch-4.0: [opt](arm) Improve count_zero_num performance with NEON 
intrinsics (#58803)
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Related PR: #58615
    
    Problem Summary: Fix the SSE implementation and port the NEON code from
    #58615, without the benchmark and unit tests.
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test <!-- At least one of them must be included. -->
        - [ ] Regression test
        - [ ] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [ ] No need to test or manual test. Explain why:
            - [ ] This is a refactor/code format and no logic has been changed.
            - [ ] Previous test can cover this change.
            - [ ] No code files have been changed.
            - [ ] Other reason <!-- Add your reason?  -->
    
    - Behavior changed:
        - [ ] No.
        - [ ] Yes. <!-- Explain the behavior change -->
    
    - Does this need documentation?
        - [ ] No.
        - [ ] Yes. <!-- Add document PR link here. e.g.:
    https://github.com/apache/doris-website/pull/1214 -->
    
    ### Check List (For Reviewer who merge this PR)
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
---
 be/src/util/simd/bits.h | 62 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 51 insertions(+), 11 deletions(-)

diff --git a/be/src/util/simd/bits.h b/be/src/util/simd/bits.h
index 186e43746d2..01ac30d45ba 100644
--- a/be/src/util/simd/bits.h
+++ b/be/src/util/simd/bits.h
@@ -22,7 +22,7 @@
 #include <type_traits>
 #include <vector>
 
-#if defined(__ARM_NEON) && defined(__aarch64__)
+#if defined(__ARM_NEON)
 #include <arm_neon.h>
 #endif
 
@@ -130,7 +130,21 @@ template <typename T>
 inline T count_zero_num(const int8_t* __restrict data, T size) {
     T num = 0;
     const int8_t* end = data + size;
-#if defined(__SSE2__) && defined(__POPCNT__)
+#if defined(__ARM_NEON)
+    const int8_t* end64 = data + (size / 64 * 64);
+
+    for (; data < end64; data += 64) {
+        auto a0 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data)), 7);
+        auto a1 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data + 16)), 7);
+        auto a2 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data + 32)), 7);
+        auto a3 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data + 48)), 7);
+
+        auto s0 = vaddq_u8(a0, a1);
+        auto s1 = vaddq_u8(a2, a3);
+        auto s = vaddq_u8(s0, s1);
+        num += vaddvq_u8(s);
+    }
+#elif defined(__SSE2__) && defined(__POPCNT__)
     const __m128i zero16 = _mm_setzero_si128();
     const int8_t* end64 = data + (size / 64 * 64);
 
@@ -160,8 +174,28 @@ template <typename T>
 inline T count_zero_num(const int8_t* __restrict data, const uint8_t* 
__restrict null_map, T size) {
     T num = 0;
     const int8_t* end = data + size;
-#if defined(__SSE2__) && defined(__POPCNT__)
+#if defined(__ARM_NEON)
+    const int8_t* end64 = data + (size / 64 * 64);
+
+    for (; data < end64; data += 64, null_map += 64) {
+        auto a0 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data)), 7);
+        auto a1 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data + 16)), 7);
+        auto a2 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data + 32)), 7);
+        auto a3 = vshrq_n_u8(vceqzq_s8(vld1q_s8(data + 48)), 7);
+
+        auto r0 = vorrq_u8(a0, vld1q_u8(null_map));
+        auto r1 = vorrq_u8(a1, vld1q_u8(null_map + 16));
+        auto r2 = vorrq_u8(a2, vld1q_u8(null_map + 32));
+        auto r3 = vorrq_u8(a3, vld1q_u8(null_map + 48));
+
+        auto s0 = vaddq_u8(r0, r1);
+        auto s1 = vaddq_u8(r2, r3);
+        auto s = vaddq_u8(s0, s1);
+        num += vaddvq_u8(s);
+    }
+#elif defined(__SSE2__) && defined(__POPCNT__)
     const __m128i zero16 = _mm_setzero_si128();
+    const __m128i one16 = _mm_set1_epi8(1);
     const int8_t* end64 = data + (size / 64 * 64);
 
     for (; data < end64; data += 64, null_map += 64) {
@@ -169,25 +203,31 @@ inline T count_zero_num(const int8_t* __restrict data, 
const uint8_t* __restrict
                 static_cast<uint64_t>(_mm_movemask_epi8(_mm_or_si128(
                         _mm_cmpeq_epi8(_mm_loadu_si128(reinterpret_cast<const 
__m128i*>(data)),
                                        zero16),
-                        _mm_loadu_si128(reinterpret_cast<const 
__m128i*>(null_map))))) |
+                        _mm_cmpeq_epi8(_mm_loadu_si128(reinterpret_cast<const 
__m128i*>(null_map)),
+                                       one16)))) |
                 (static_cast<uint64_t>(_mm_movemask_epi8(_mm_or_si128(
                          _mm_cmpeq_epi8(
                                  _mm_loadu_si128(reinterpret_cast<const 
__m128i*>(data + 16)),
                                  zero16),
-                         _mm_loadu_si128(reinterpret_cast<const 
__m128i*>(null_map + 16)))))
+                         _mm_cmpeq_epi8(
+                                 _mm_loadu_si128(reinterpret_cast<const 
__m128i*>(null_map + 16)),
+                                 one16))))
                  << 16U) |
                 (static_cast<uint64_t>(_mm_movemask_epi8(_mm_or_si128(
                          _mm_cmpeq_epi8(
                                  _mm_loadu_si128(reinterpret_cast<const 
__m128i*>(data + 32)),
                                  zero16),
-                         _mm_loadu_si128(reinterpret_cast<const 
__m128i*>(null_map + 32)))))
+                         _mm_cmpeq_epi8(
+                                 _mm_loadu_si128(reinterpret_cast<const 
__m128i*>(null_map + 32)),
+                                 one16))))
                  << 32U) |
                 (static_cast<uint64_t>(_mm_movemask_epi8(_mm_or_si128(
-                         _mm_cmpeq_epi8(
-                                 _mm_loadu_si128(reinterpret_cast<const 
__m128i*>(data + 48)),
-                                 zero16),
-                         _mm_loadu_si128(reinterpret_cast<const 
__m128i*>(null_map + 48)))))
-                 << 48U));
+                        _mm_cmpeq_epi8(_mm_loadu_si128(reinterpret_cast<const 
__m128i*>(data + 48)),
+                                       zero16),
+                        _mm_cmpeq_epi8(
+                                _mm_loadu_si128(reinterpret_cast<const 
__m128i*>(null_map + 48)),
+                                one16)))))
+                        << 48U);
     }
 #endif
     for (; data < end; ++data, ++null_map) {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch branch-4.0 updated: branch-4.0: [opt](arm) Improve count_zero_num performance with NEON intrinsics (#58803)

Reply via email to