This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 6b1abbd Implement SIMD comparison operations for types with less than
4 lanes (i128) (#1146)
6b1abbd is described below
commit 6b1abbd6507e944cbe4fdceebc71f05219d330bd
Author: Jörn Horstmann <[email protected]>
AuthorDate: Mon Jan 10 22:49:46 2022 +0100
Implement SIMD comparison operations for types with less than 4 lanes
(i128) (#1146)
* Implement simd mask creation for 128 bit types
* Adjust comparison kernels to always append 64 bit chunks
* Only append minimal number of bytes
* Add benchmark for MonthDayNano comparison
* Fix typo in comment
Co-authored-by: Paddy Horan <[email protected]>
* Fix typo in comment
Co-authored-by: Paddy Horan <[email protected]>
Co-authored-by: Paddy Horan <[email protected]>
---
arrow/benches/comparison_kernels.rs | 14 +++-
arrow/src/compute/kernels/comparison.rs | 122 ++++++++++++++++++--------------
arrow/src/datatypes/numeric.rs | 29 +++++++-
3 files changed, 110 insertions(+), 55 deletions(-)
diff --git a/arrow/benches/comparison_kernels.rs
b/arrow/benches/comparison_kernels.rs
index 94ff7df..cf9ccdd 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -22,7 +22,7 @@ use criterion::Criterion;
extern crate arrow;
use arrow::compute::*;
-use arrow::datatypes::ArrowNumericType;
+use arrow::datatypes::{ArrowNumericType, IntervalMonthDayNanoType};
use arrow::util::bench_util::*;
use arrow::{array::*, datatypes::Float32Type};
@@ -138,6 +138,11 @@ fn add_benchmark(c: &mut Criterion) {
let arr_a = create_primitive_array_with_seed::<Float32Type>(size, 0.0, 42);
let arr_b = create_primitive_array_with_seed::<Float32Type>(size, 0.0, 43);
+ let arr_month_day_nano_a =
+ create_primitive_array_with_seed::<IntervalMonthDayNanoType>(size,
0.0, 43);
+ let arr_month_day_nano_b =
+ create_primitive_array_with_seed::<IntervalMonthDayNanoType>(size,
0.0, 43);
+
let arr_string = create_string_array::<i32>(size, 0.0);
c.bench_function("eq Float32", |b| b.iter(|| bench_eq(&arr_a, &arr_b)));
@@ -170,6 +175,13 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_gt_eq_scalar(&arr_a, 1.0))
});
+ c.bench_function("eq MonthDayNano", |b| {
+ b.iter(|| bench_eq(&arr_month_day_nano_a, &arr_month_day_nano_b))
+ });
+ c.bench_function("eq scalar MonthDayNano", |b| {
+ b.iter(|| bench_eq_scalar(&arr_month_day_nano_a, 123))
+ });
+
c.bench_function("like_utf8 scalar equals", |b| {
b.iter(|| bench_like_utf8_scalar(&arr_string, "xxxx"))
});
diff --git a/arrow/src/compute/kernels/comparison.rs
b/arrow/src/compute/kernels/comparison.rs
index c7bee58..9390d84 100644
--- a/arrow/src/compute/kernels/comparison.rs
+++ b/arrow/src/compute/kernels/comparison.rs
@@ -1471,14 +1471,21 @@ where
let null_bit_buffer = combine_option_bitmap(left.data_ref(),
right.data_ref(), len)?;
+ // we process the data in chunks so that each iteration results in one u64
of comparison result bits
+ const CHUNK_SIZE: usize = 64;
let lanes = T::lanes();
+
+ // this is currently the case for all our datatypes and allows us to
always append full bytes
+ assert!(
+ lanes <= CHUNK_SIZE,
+ "Number of vector lanes must be at most 64"
+ );
+
let buffer_size = bit_util::ceil(len, 8);
let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size,
false);
- // this is currently the case for all our datatypes and allows us to
always append full bytes
- assert_eq!(lanes % 8, 0, "Number of vector lanes must be multiple of 8");
- let mut left_chunks = left.values().chunks_exact(lanes);
- let mut right_chunks = right.values().chunks_exact(lanes);
+ let mut left_chunks = left.values().chunks_exact(CHUNK_SIZE);
+ let mut right_chunks = right.values().chunks_exact(CHUNK_SIZE);
// safety: result is newly created above, always written as a T below
let result_chunks = unsafe { result.typed_data_mut() };
@@ -1486,15 +1493,22 @@ where
.borrow_mut()
.zip(right_chunks.borrow_mut())
.fold(result_chunks, |result_slice, (left_slice, right_slice)| {
- let simd_left = T::load(left_slice);
- let simd_right = T::load(right_slice);
- let simd_result = simd_op(simd_left, simd_right);
+ let mut i = 0;
+ let mut bitmask = 0_u64;
+ while i < CHUNK_SIZE {
+ let simd_left = T::load(&left_slice[i..]);
+ let simd_right = T::load(&right_slice[i..]);
+ let simd_result = simd_op(simd_left, simd_right);
- let bitmask = T::mask_to_u64(&simd_result);
+ let m = T::mask_to_u64(&simd_result);
+ bitmask |= m << (i / lanes);
+
+ i += lanes;
+ }
let bytes = bitmask.to_le_bytes();
- result_slice[0..lanes / 8].copy_from_slice(&bytes[0..lanes / 8]);
+ result_slice[0..8].copy_from_slice(&bytes);
- &mut result_slice[lanes / 8..]
+ &mut result_slice[8..]
});
let left_remainder = left_chunks.remainder();
@@ -1502,22 +1516,20 @@ where
assert_eq!(left_remainder.len(), right_remainder.len());
- let remainder_bitmask = left_remainder
- .iter()
- .zip(right_remainder.iter())
- .enumerate()
- .fold(0_u64, |mut mask, (i, (scalar_left, scalar_right))| {
- let bit = if scalar_op(*scalar_left, *scalar_right) {
- 1_u64
- } else {
- 0_u64
- };
- mask |= bit << i;
- mask
- });
- let remainder_mask_as_bytes =
-
&remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
- result_remainder.copy_from_slice(remainder_mask_as_bytes);
+ if !left_remainder.is_empty() {
+ let remainder_bitmask = left_remainder
+ .iter()
+ .zip(right_remainder.iter())
+ .enumerate()
+ .fold(0_u64, |mut mask, (i, (scalar_left, scalar_right))| {
+ let bit = scalar_op(*scalar_left, *scalar_right) as u64;
+ mask |= bit << i;
+ mask
+ });
+ let remainder_mask_as_bytes =
+
&remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
+ result_remainder.copy_from_slice(remainder_mask_as_bytes);
+ }
let data = unsafe {
ArrayData::new_unchecked(
@@ -1551,16 +1563,20 @@ where
let len = left.len();
+ // we process the data in chunks so that each iteration results in one u64
of comparison result bits
+ const CHUNK_SIZE: usize = 64;
let lanes = T::lanes();
- let buffer_size = bit_util::ceil(len, 8);
- let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size,
false);
// this is currently the case for all our datatypes and allows us to
always append full bytes
assert!(
- lanes % 8 == 0,
- "Number of vector lanes must be multiple of 8"
+ lanes <= CHUNK_SIZE,
+ "Number of vector lanes must be at most 64"
);
- let mut left_chunks = left.values().chunks_exact(lanes);
+
+ let buffer_size = bit_util::ceil(len, 8);
+ let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size,
false);
+
+ let mut left_chunks = left.values().chunks_exact(CHUNK_SIZE);
let simd_right = T::init(right);
// safety: result is newly created above, always written as a T below
@@ -1569,34 +1585,38 @@ where
left_chunks
.borrow_mut()
.fold(result_chunks, |result_slice, left_slice| {
- let simd_left = T::load(left_slice);
- let simd_result = simd_op(simd_left, simd_right);
+ let mut i = 0;
+ let mut bitmask = 0_u64;
+ while i < CHUNK_SIZE {
+ let simd_left = T::load(&left_slice[i..]);
+ let simd_result = simd_op(simd_left, simd_right);
+
+ let m = T::mask_to_u64(&simd_result);
+ bitmask |= m << (i / lanes);
- let bitmask = T::mask_to_u64(&simd_result);
+ i += lanes;
+ }
let bytes = bitmask.to_le_bytes();
- result_slice[0..lanes / 8].copy_from_slice(&bytes[0..lanes /
8]);
+ result_slice[0..8].copy_from_slice(&bytes);
- &mut result_slice[lanes / 8..]
+ &mut result_slice[8..]
});
let left_remainder = left_chunks.remainder();
- let remainder_bitmask =
- left_remainder
- .iter()
- .enumerate()
- .fold(0_u64, |mut mask, (i, scalar_left)| {
- let bit = if scalar_op(*scalar_left, right) {
- 1_u64
- } else {
- 0_u64
- };
+ if !left_remainder.is_empty() {
+ let remainder_bitmask = left_remainder.iter().enumerate().fold(
+ 0_u64,
+ |mut mask, (i, scalar_left)| {
+ let bit = scalar_op(*scalar_left, right) as u64;
mask |= bit << i;
mask
- });
- let remainder_mask_as_bytes =
-
&remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
- result_remainder.copy_from_slice(remainder_mask_as_bytes);
+ },
+ );
+ let remainder_mask_as_bytes =
+
&remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
+ result_remainder.copy_from_slice(remainder_mask_as_bytes);
+ }
let null_bit_buffer = left
.data_ref()
@@ -2723,8 +2743,6 @@ mod tests {
);
}
- // Fails when simd is enabled:
https://github.com/apache/arrow-rs/issues/1136
- #[cfg(not(feature = "simd"))]
#[test]
fn test_interval_array() {
let a = IntervalDayTimeArray::from(
diff --git a/arrow/src/datatypes/numeric.rs b/arrow/src/datatypes/numeric.rs
index cbb953c..2ff53c9 100644
--- a/arrow/src/datatypes/numeric.rs
+++ b/arrow/src/datatypes/numeric.rs
@@ -147,6 +147,20 @@ macro_rules! make_numeric_type {
// this match will get removed by the compiler since the
number of lanes is known at
// compile-time for each concrete numeric type
match Self::lanes() {
+ 4 => {
+ // the bit position in each lane indicates the index
of that lane
+ let vecidx = i128x4::new(1, 2, 4, 8);
+
+ // broadcast the lowermost 8 bits of mask to each lane
+ let vecmask = i128x4::splat((mask & 0x0F) as i128);
+ // compute whether the bit corresponding to each lanes
index is set
+ let vecmask = (vecidx & vecmask).eq(vecidx);
+
+ // transmute is necessary because the different match
arms return different
+ // mask types, at runtime only one of those
expressions will exist per type,
+ // with the type being equal to `SimdMask`.
+ unsafe { std::mem::transmute(vecmask) }
+ }
8 => {
// the bit position in each lane indicates the index
of that lane
let vecidx = i64x8::new(1, 2, 4, 8, 16, 32, 64, 128);
@@ -448,11 +462,11 @@ macro_rules! make_float_numeric_type {
make_float_numeric_type!(Float32Type, f32x16);
make_float_numeric_type!(Float64Type, f64x8);
-#[cfg(all(test, simd_x86))]
+#[cfg(all(test, feature = "simd"))]
mod tests {
use crate::datatypes::{
ArrowNumericType, Float32Type, Float64Type, Int32Type, Int64Type,
Int8Type,
- UInt16Type,
+ IntervalMonthDayNanoType, UInt16Type,
};
use packed_simd::*;
use FromCast;
@@ -471,6 +485,17 @@ mod tests {
}
#[test]
+ fn test_mask_i128() {
+ let mask = 0b1101;
+ let actual = IntervalMonthDayNanoType::mask_from_u64(mask);
+ let expected = expected_mask!(i128, mask);
+ let expected =
+
m128x4::from_cast(i128x4::from_slice_unaligned(expected.as_slice()));
+
+ assert_eq!(expected, actual);
+ }
+
+ #[test]
fn test_mask_f64() {
let mask = 0b10101010;
let actual = Float64Type::mask_from_u64(mask);