This is an automated email from the ASF dual-hosted git repository.
zhangstar333 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 6ea6c2fb238 [refine](bits) refine bytes_mask_to_bits_mask code (#38360)
6ea6c2fb238 is described below
commit 6ea6c2fb2381b422caf157db3c617045aecf7be1
Author: Mryange <[email protected]>
AuthorDate: Mon Aug 19 15:28:39 2024 +0800
[refine](bits) refine bytes_mask_to_bits_mask code (#38360)
## Proposed changes
The previous code only considered the x86 architecture, and
_mm_movemask_epi8 has no single corresponding instruction on ARM NEON.
Following the article linked below, we abstract the overall mask logic
and, for ARM, optimize using the technique that article describes:
Benchmark (4 runs each):
filter function origin:  0.711375 s, 0.7154 s, 0.71782 s, 0.715296 s
filter function arm opt: 0.559854 s, 0.559854 s, 0.559854 s, 0.559854 s
[link](https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon?CommentId=af187ac6-ae00-4e4d-bbf0-e142187aa92e)
---
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 30 ++++----
be/src/util/simd/bits.h | 80 ++++++++++++++++++++--
be/src/vec/columns/column_decimal.cpp | 37 +++++-----
be/src/vec/columns/column_vector.cpp | 38 +++++-----
be/src/vec/columns/columns_common.cpp | 37 +++++-----
5 files changed, 144 insertions(+), 78 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 2cec6f48f6b..8fa1a81540a 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -2223,23 +2223,21 @@ uint16_t
SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_
uint32_t sel_pos = 0;
const uint32_t sel_end = sel_pos + selected_size;
- static constexpr size_t SIMD_BYTES = 32;
+ static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES *
SIMD_BYTES;
while (sel_pos < sel_end_simd) {
- auto mask = simd::bytes32_mask_to_bits32_mask(_ret_flags.data() +
sel_pos);
+ auto mask = simd::bytes_mask_to_bits_mask(_ret_flags.data() + sel_pos);
if (0 == mask) {
//pass
- } else if (0xffffffff == mask) {
+ } else if (simd::bits_mask_all() == mask) {
for (uint32_t i = 0; i < SIMD_BYTES; i++) {
sel_rowid_idx[new_size++] = sel_pos + i;
}
} else {
- while (mask) {
- const size_t bit_pos = __builtin_ctzll(mask);
- sel_rowid_idx[new_size++] = sel_pos + bit_pos;
- mask = mask & (mask - 1);
- }
+ simd::iterate_through_bits_mask(
+ [&](const size_t bit_pos) { sel_rowid_idx[new_size++] =
sel_pos + bit_pos; },
+ mask);
}
sel_pos += SIMD_BYTES;
}
@@ -2709,23 +2707,23 @@ uint16_t
SegmentIterator::_evaluate_common_expr_filter(uint16_t* sel_rowid_idx,
uint16_t new_size = 0;
uint32_t sel_pos = 0;
const uint32_t sel_end = selected_size;
- static constexpr size_t SIMD_BYTES = 32;
+ static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES *
SIMD_BYTES;
while (sel_pos < sel_end_simd) {
- auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos + sel_pos);
+ auto mask = simd::bytes_mask_to_bits_mask(filt_pos + sel_pos);
if (0 == mask) {
//pass
- } else if (0xffffffff == mask) {
+ } else if (simd::bits_mask_all() == mask) {
for (uint32_t i = 0; i < SIMD_BYTES; i++) {
sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + i];
}
} else {
- while (mask) {
- const size_t bit_pos = __builtin_ctzll(mask);
- sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos +
bit_pos];
- mask = mask & (mask - 1);
- }
+ simd::iterate_through_bits_mask(
+ [&](const size_t bit_pos) {
+ sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos
+ bit_pos];
+ },
+ mask);
}
sel_pos += SIMD_BYTES;
}
diff --git a/be/src/util/simd/bits.h b/be/src/util/simd/bits.h
index a36a95b6eef..7e2e7c82025 100644
--- a/be/src/util/simd/bits.h
+++ b/be/src/util/simd/bits.h
@@ -21,19 +21,58 @@
#include <cstring>
#include <vector>
+#if defined(__ARM_NEON) && defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
#include "util/sse_util.hpp"
namespace doris {
namespace simd {
-/// todo(zeno) Compile add avx512 parameter, modify it to
bytes64_mask_to_bits64_mask
-/// Transform 32-byte mask to 32-bit mask
+consteval auto bits_mask_length() {
+#if defined(__ARM_NEON) && defined(__aarch64__)
+ return 16;
+#else
+ return 32;
+#endif
+}
+
+#if defined(__ARM_NEON) && defined(__aarch64__)
+inline uint64_t get_nibble_mask(uint8x16_t values) {
+ // It produces 4-bit out of each byte, alternating between the high 4-bits
and low 4-bits of the 16-byte vector.
+ // Given that the comparison operators give a 16-byte result of 0x00 or
0xff, the result is close to being a PMOVMSKB,
+ // the only difference is that every matching bit is repeated 4 times and
is a 64-bit integer.
+ //
https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon?CommentId=af187ac6-ae00-4e4d-bbf0-e142187aa92e
+ return
vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(values),
4)), 0);
+}
+/*
+Input 16 bytes of data and convert it into a 64-bit integer, where one bit
appears 4 times.
+Compare with bytes32_mask_to_bits32_mask, a u8 array with a length of 32
+ std::vector<uint8_t> vec = {1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0};
+
+bytes32_mask_to_bits32_mask 0100 0000 0000 0000,1101 0000 0000 0011
+
+
+ (1101 0000 0000 0011)
+bytes16_mask_to_bits64_mask 1111 1111 0000 1111,0000 0000 0000 0000,0000
0000 0000 0000,0000 0000 1111 1111
+ (0100 0000 0000 0000)
+ 0000 1111 0000 0000,0000 0000 0000 0000,0000
0000 0000 0000,0000 0000 0000 0000
+*/
+
+inline uint64_t bytes16_mask_to_bits64_mask(const uint8_t* data) {
+ const uint8x16_t vfilter = vld1q_u8(data);
+ return get_nibble_mask(vmvnq_u8(vceqzq_u8(vfilter)));
+}
+#endif
+
inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
#ifdef __AVX2__
auto zero32 = _mm256_setzero_si256();
uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(
_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const
__m256i*>(data)), zero32)));
-#elif defined(__SSE2__) || defined(__aarch64__)
+#elif defined(__SSE2__)
auto zero16 = _mm_setzero_si128();
uint32_t mask =
(static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpgt_epi8(
@@ -51,8 +90,39 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t*
data) {
return mask;
}
-inline uint32_t bytes32_mask_to_bits32_mask(const bool* data) {
- return bytes32_mask_to_bits32_mask(reinterpret_cast<const uint8_t*>(data));
+inline auto bytes_mask_to_bits_mask(const uint8_t* data) {
+#if defined(__ARM_NEON) && defined(__aarch64__)
+ return bytes16_mask_to_bits64_mask(data);
+#else
+ return bytes32_mask_to_bits32_mask(data);
+#endif
+}
+
+inline constexpr auto bits_mask_all() {
+#if defined(__ARM_NEON) && defined(__aarch64__)
+ return 0xffff'ffff'ffff'ffffULL;
+#else
+ return 0xffffffff;
+#endif
+}
+
+template <typename Func>
+void iterate_through_bits_mask(Func func,
decltype(bytes_mask_to_bits_mask(nullptr)) mask) {
+#if defined(__ARM_NEON) && defined(__aarch64__)
+ mask &= 0x8888'8888'8888'8888ULL;
+ while (mask) {
+ const auto index = __builtin_ctzll(mask) >> 2;
+ func(index);
+ mask &= mask - 1;
+ }
+
+#else
+ while (mask) {
+ const auto bit_pos = __builtin_ctzll(mask);
+ func(bit_pos);
+ mask = mask & (mask - 1);
+ }
+#endif
}
inline size_t count_zero_num(const int8_t* __restrict data, size_t size) {
diff --git a/be/src/vec/columns/column_decimal.cpp
b/be/src/vec/columns/column_decimal.cpp
index 65e8c9d79ac..beeb6224c22 100644
--- a/be/src/vec/columns/column_decimal.cpp
+++ b/be/src/vec/columns/column_decimal.cpp
@@ -337,20 +337,18 @@ ColumnPtr ColumnDecimal<T>::filter(const IColumn::Filter&
filt, ssize_t result_s
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES`
values.
*/
- static constexpr size_t SIMD_BYTES = 32;
+ static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
while (filt_pos < filt_end_sse) {
- uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos);
-
- if (0xFFFFFFFF == mask) {
+ auto mask = simd::bytes_mask_to_bits_mask(filt_pos);
+ if (0 == mask) {
+ //pass
+ } else if (simd::bits_mask_all() == mask) {
res_data.insert(data_pos, data_pos + SIMD_BYTES);
} else {
- while (mask) {
- const size_t idx = __builtin_ctzll(mask);
- res_data.push_back(data_pos[idx]);
- mask = mask & (mask - 1);
- }
+ simd::iterate_through_bits_mask(
+ [&](const size_t bit_pos) {
res_data.push_back(data_pos[bit_pos]); }, mask);
}
filt_pos += SIMD_BYTES;
@@ -382,22 +380,23 @@ size_t ColumnDecimal<T>::filter(const IColumn::Filter&
filter) {
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES`
values.
*/
- static constexpr size_t SIMD_BYTES = 32;
+ static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES;
while (filter_pos < filter_end_sse) {
- uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos);
-
- if (0xFFFFFFFF == mask) {
+ auto mask = simd::bytes_mask_to_bits_mask(filter_pos);
+ if (0 == mask) {
+ //pass
+ } else if (simd::bits_mask_all() == mask) {
memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES);
result_data += SIMD_BYTES;
} else {
- while (mask) {
- const size_t idx = __builtin_ctzll(mask);
- *result_data = data_pos[idx];
- ++result_data;
- mask = mask & (mask - 1);
- }
+ simd::iterate_through_bits_mask(
+ [&](const size_t idx) {
+ *result_data = data_pos[idx];
+ ++result_data;
+ },
+ mask);
}
filter_pos += SIMD_BYTES;
diff --git a/be/src/vec/columns/column_vector.cpp
b/be/src/vec/columns/column_vector.cpp
index 590e2047cab..3d34bd5d55b 100644
--- a/be/src/vec/columns/column_vector.cpp
+++ b/be/src/vec/columns/column_vector.cpp
@@ -406,20 +406,19 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter&
filt, ssize_t result_si
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES`
values.
*/
- static constexpr size_t SIMD_BYTES = 32;
+ static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
while (filt_pos < filt_end_sse) {
- uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos);
-
- if (0xFFFFFFFF == mask) {
+ auto mask = simd::bytes_mask_to_bits_mask(filt_pos);
+ if (0 == mask) {
+ //pass
+ } else if (simd::bits_mask_all() == mask) {
res_data.insert(data_pos, data_pos + SIMD_BYTES);
} else {
- while (mask) {
- const size_t idx = __builtin_ctzll(mask);
- res_data.push_back_without_reserve(data_pos[idx]);
- mask = mask & (mask - 1);
- }
+ simd::iterate_through_bits_mask(
+ [&](const size_t idx) {
res_data.push_back_without_reserve(data_pos[idx]); },
+ mask);
}
filt_pos += SIMD_BYTES;
@@ -453,22 +452,23 @@ size_t ColumnVector<T>::filter(const IColumn::Filter&
filter) {
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES`
values.
*/
- static constexpr size_t SIMD_BYTES = 32;
+ static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES;
while (filter_pos < filter_end_sse) {
- uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos);
-
- if (0xFFFFFFFF == mask) {
+ auto mask = simd::bytes_mask_to_bits_mask(filter_pos);
+ if (0 == mask) {
+ //pass
+ } else if (simd::bits_mask_all() == mask) {
memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES);
result_data += SIMD_BYTES;
} else {
- while (mask) {
- const size_t idx = __builtin_ctzll(mask);
- *result_data = data_pos[idx];
- ++result_data;
- mask = mask & (mask - 1);
- }
+ simd::iterate_through_bits_mask(
+ [&](const size_t idx) {
+ *result_data = data_pos[idx];
+ ++result_data;
+ },
+ mask);
}
filter_pos += SIMD_BYTES;
diff --git a/be/src/vec/columns/columns_common.cpp
b/be/src/vec/columns/columns_common.cpp
index d1f7df85433..0671e9abd85 100644
--- a/be/src/vec/columns/columns_common.cpp
+++ b/be/src/vec/columns/columns_common.cpp
@@ -182,13 +182,14 @@ void filter_arrays_impl_generic(const PaddedPODArray<T>&
src_elems,
memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size *
sizeof(T));
};
- static constexpr size_t SIMD_BYTES = 32;
+ static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
while (filt_pos < filt_end_aligned) {
- auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos);
-
- if (mask == 0xffffffff) {
+ auto mask = simd::bytes_mask_to_bits_mask(filt_pos);
+ if (0 == mask) {
+ //pass
+ } else if (mask == simd::bits_mask_all()) {
/// SIMD_BYTES consecutive rows pass the filter
const auto first = offsets_pos == offsets_begin;
@@ -203,11 +204,8 @@ void filter_arrays_impl_generic(const PaddedPODArray<T>&
src_elems,
res_elems.resize(elems_size_old + chunk_size);
memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset],
chunk_size * sizeof(T));
} else {
- while (mask) {
- const size_t bit_pos = __builtin_ctzll(mask);
- copy_array(offsets_pos + bit_pos);
- mask = mask & (mask - 1);
- }
+ simd::iterate_through_bits_mask(
+ [&](const size_t bit_pos) { copy_array(offsets_pos +
bit_pos); }, mask);
}
filt_pos += SIMD_BYTES;
@@ -259,13 +257,14 @@ size_t
filter_arrays_impl_generic_without_reserving(PaddedPODArray<T>& elems,
result_data += arr_size;
};
- static constexpr size_t SIMD_BYTES = 32;
+ static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const auto filter_end_aligned = filter_pos + size / SIMD_BYTES *
SIMD_BYTES;
while (filter_pos < filter_end_aligned) {
- auto mask = simd::bytes32_mask_to_bits32_mask(filter_pos);
-
- if (mask == 0xffffffff) {
+ auto mask = simd::bytes_mask_to_bits_mask(filter_pos);
+ if (0 == mask) {
+ //pass
+ } else if (mask == simd::bits_mask_all()) {
/// SIMD_BYTES consecutive rows pass the filter
const auto first = offsets_pos == offsets_begin;
@@ -281,12 +280,12 @@ size_t
filter_arrays_impl_generic_without_reserving(PaddedPODArray<T>& elems,
result_data += chunk_size;
result_size += SIMD_BYTES;
} else {
- while (mask) {
- const size_t bit_pos = __builtin_ctzll(mask);
- copy_array(offsets_pos + bit_pos);
- ++result_size;
- mask = mask & (mask - 1);
- }
+ simd::iterate_through_bits_mask(
+ [&](const size_t bit_pos) {
+ copy_array(offsets_pos + bit_pos);
+ ++result_size;
+ },
+ mask);
}
filter_pos += SIMD_BYTES;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]