[arrow-rs] branch master updated: Implement SIMD comparison operations for types with less than 4 lanes (i128) (#1146)

alamb Mon, 10 Jan 2022 13:49:59 -0800

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/master by this push:
     new 6b1abbd  Implement SIMD comparison operations for types with less than 
4 lanes (i128) (#1146)
6b1abbd is described below

commit 6b1abbd6507e944cbe4fdceebc71f05219d330bd
Author: Jörn Horstmann <[email protected]>
AuthorDate: Mon Jan 10 22:49:46 2022 +0100

    Implement SIMD comparison operations for types with less than 4 lanes 
(i128) (#1146)
    
    * Implement simd mask creation for 128 bit types
    
    * Adjust comparison kernels to always append 64 bit chunks
    
    * Only append minimal number of bytes
    
    * Add benchmark for MonthDayNano comparison
    
    * Fix typo in comment
    
    Co-authored-by: Paddy Horan <[email protected]>
    
    * Fix typo in comment
    
    Co-authored-by: Paddy Horan <[email protected]>
    
    Co-authored-by: Paddy Horan <[email protected]>
---
 arrow/benches/comparison_kernels.rs     |  14 +++-
 arrow/src/compute/kernels/comparison.rs | 122 ++++++++++++++++++--------------
 arrow/src/datatypes/numeric.rs          |  29 +++++++-
 3 files changed, 110 insertions(+), 55 deletions(-)

diff --git a/arrow/benches/comparison_kernels.rs 
b/arrow/benches/comparison_kernels.rs
index 94ff7df..cf9ccdd 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -22,7 +22,7 @@ use criterion::Criterion;
 extern crate arrow;
 
 use arrow::compute::*;
-use arrow::datatypes::ArrowNumericType;
+use arrow::datatypes::{ArrowNumericType, IntervalMonthDayNanoType};
 use arrow::util::bench_util::*;
 use arrow::{array::*, datatypes::Float32Type};
 
@@ -138,6 +138,11 @@ fn add_benchmark(c: &mut Criterion) {
     let arr_a = create_primitive_array_with_seed::<Float32Type>(size, 0.0, 42);
     let arr_b = create_primitive_array_with_seed::<Float32Type>(size, 0.0, 43);
 
+    let arr_month_day_nano_a =
+        create_primitive_array_with_seed::<IntervalMonthDayNanoType>(size, 
0.0, 43);
+    let arr_month_day_nano_b =
+        create_primitive_array_with_seed::<IntervalMonthDayNanoType>(size, 
0.0, 43);
+
     let arr_string = create_string_array::<i32>(size, 0.0);
 
     c.bench_function("eq Float32", |b| b.iter(|| bench_eq(&arr_a, &arr_b)));
@@ -170,6 +175,13 @@ fn add_benchmark(c: &mut Criterion) {
         b.iter(|| bench_gt_eq_scalar(&arr_a, 1.0))
     });
 
+    c.bench_function("eq MonthDayNano", |b| {
+        b.iter(|| bench_eq(&arr_month_day_nano_a, &arr_month_day_nano_b))
+    });
+    c.bench_function("eq scalar MonthDayNano", |b| {
+        b.iter(|| bench_eq_scalar(&arr_month_day_nano_a, 123))
+    });
+
     c.bench_function("like_utf8 scalar equals", |b| {
         b.iter(|| bench_like_utf8_scalar(&arr_string, "xxxx"))
     });
diff --git a/arrow/src/compute/kernels/comparison.rs 
b/arrow/src/compute/kernels/comparison.rs
index c7bee58..9390d84 100644
--- a/arrow/src/compute/kernels/comparison.rs
+++ b/arrow/src/compute/kernels/comparison.rs
@@ -1471,14 +1471,21 @@ where
 
     let null_bit_buffer = combine_option_bitmap(left.data_ref(), 
right.data_ref(), len)?;
 
+    // we process the data in chunks so that each iteration results in one u64 
of comparison result bits
+    const CHUNK_SIZE: usize = 64;
     let lanes = T::lanes();
+
+    // this is currently the case for all our datatypes and allows us to 
always append full bytes
+    assert!(
+        lanes <= CHUNK_SIZE,
+        "Number of vector lanes must be at most 64"
+    );
+
     let buffer_size = bit_util::ceil(len, 8);
     let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, 
false);
 
-    // this is currently the case for all our datatypes and allows us to 
always append full bytes
-    assert_eq!(lanes % 8, 0, "Number of vector lanes must be multiple of 8");
-    let mut left_chunks = left.values().chunks_exact(lanes);
-    let mut right_chunks = right.values().chunks_exact(lanes);
+    let mut left_chunks = left.values().chunks_exact(CHUNK_SIZE);
+    let mut right_chunks = right.values().chunks_exact(CHUNK_SIZE);
 
     // safety: result is newly created above, always written as a T below
     let result_chunks = unsafe { result.typed_data_mut() };
@@ -1486,15 +1493,22 @@ where
         .borrow_mut()
         .zip(right_chunks.borrow_mut())
         .fold(result_chunks, |result_slice, (left_slice, right_slice)| {
-            let simd_left = T::load(left_slice);
-            let simd_right = T::load(right_slice);
-            let simd_result = simd_op(simd_left, simd_right);
+            let mut i = 0;
+            let mut bitmask = 0_u64;
+            while i < CHUNK_SIZE {
+                let simd_left = T::load(&left_slice[i..]);
+                let simd_right = T::load(&right_slice[i..]);
+                let simd_result = simd_op(simd_left, simd_right);
 
-            let bitmask = T::mask_to_u64(&simd_result);
+                let m = T::mask_to_u64(&simd_result);
+                bitmask |= m << (i / lanes);
+
+                i += lanes;
+            }
             let bytes = bitmask.to_le_bytes();
-            result_slice[0..lanes / 8].copy_from_slice(&bytes[0..lanes / 8]);
+            result_slice[0..8].copy_from_slice(&bytes);
 
-            &mut result_slice[lanes / 8..]
+            &mut result_slice[8..]
         });
 
     let left_remainder = left_chunks.remainder();
@@ -1502,22 +1516,20 @@ where
 
     assert_eq!(left_remainder.len(), right_remainder.len());
 
-    let remainder_bitmask = left_remainder
-        .iter()
-        .zip(right_remainder.iter())
-        .enumerate()
-        .fold(0_u64, |mut mask, (i, (scalar_left, scalar_right))| {
-            let bit = if scalar_op(*scalar_left, *scalar_right) {
-                1_u64
-            } else {
-                0_u64
-            };
-            mask |= bit << i;
-            mask
-        });
-    let remainder_mask_as_bytes =
-        
&remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
-    result_remainder.copy_from_slice(remainder_mask_as_bytes);
+    if !left_remainder.is_empty() {
+        let remainder_bitmask = left_remainder
+            .iter()
+            .zip(right_remainder.iter())
+            .enumerate()
+            .fold(0_u64, |mut mask, (i, (scalar_left, scalar_right))| {
+                let bit = scalar_op(*scalar_left, *scalar_right) as u64;
+                mask |= bit << i;
+                mask
+            });
+        let remainder_mask_as_bytes =
+            
&remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
+        result_remainder.copy_from_slice(remainder_mask_as_bytes);
+    }
 
     let data = unsafe {
         ArrayData::new_unchecked(
@@ -1551,16 +1563,20 @@ where
 
     let len = left.len();
 
+    // we process the data in chunks so that each iteration results in one u64 
of comparison result bits
+    const CHUNK_SIZE: usize = 64;
     let lanes = T::lanes();
-    let buffer_size = bit_util::ceil(len, 8);
-    let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, 
false);
 
     // this is currently the case for all our datatypes and allows us to 
always append full bytes
     assert!(
-        lanes % 8 == 0,
-        "Number of vector lanes must be multiple of 8"
+        lanes <= CHUNK_SIZE,
+        "Number of vector lanes must be at most 64"
     );
-    let mut left_chunks = left.values().chunks_exact(lanes);
+
+    let buffer_size = bit_util::ceil(len, 8);
+    let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, 
false);
+
+    let mut left_chunks = left.values().chunks_exact(CHUNK_SIZE);
     let simd_right = T::init(right);
 
     // safety: result is newly created above, always written as a T below
@@ -1569,34 +1585,38 @@ where
         left_chunks
             .borrow_mut()
             .fold(result_chunks, |result_slice, left_slice| {
-                let simd_left = T::load(left_slice);
-                let simd_result = simd_op(simd_left, simd_right);
+                let mut i = 0;
+                let mut bitmask = 0_u64;
+                while i < CHUNK_SIZE {
+                    let simd_left = T::load(&left_slice[i..]);
+                    let simd_result = simd_op(simd_left, simd_right);
+
+                    let m = T::mask_to_u64(&simd_result);
+                    bitmask |= m << (i / lanes);
 
-                let bitmask = T::mask_to_u64(&simd_result);
+                    i += lanes;
+                }
                 let bytes = bitmask.to_le_bytes();
-                result_slice[0..lanes / 8].copy_from_slice(&bytes[0..lanes / 
8]);
+                result_slice[0..8].copy_from_slice(&bytes);
 
-                &mut result_slice[lanes / 8..]
+                &mut result_slice[8..]
             });
 
     let left_remainder = left_chunks.remainder();
 
-    let remainder_bitmask =
-        left_remainder
-            .iter()
-            .enumerate()
-            .fold(0_u64, |mut mask, (i, scalar_left)| {
-                let bit = if scalar_op(*scalar_left, right) {
-                    1_u64
-                } else {
-                    0_u64
-                };
+    if !left_remainder.is_empty() {
+        let remainder_bitmask = left_remainder.iter().enumerate().fold(
+            0_u64,
+            |mut mask, (i, scalar_left)| {
+                let bit = scalar_op(*scalar_left, right) as u64;
                 mask |= bit << i;
                 mask
-            });
-    let remainder_mask_as_bytes =
-        
&remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
-    result_remainder.copy_from_slice(remainder_mask_as_bytes);
+            },
+        );
+        let remainder_mask_as_bytes =
+            
&remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
+        result_remainder.copy_from_slice(remainder_mask_as_bytes);
+    }
 
     let null_bit_buffer = left
         .data_ref()
@@ -2723,8 +2743,6 @@ mod tests {
         );
     }
 
-    // Fails when simd is enabled: 
https://github.com/apache/arrow-rs/issues/1136
-    #[cfg(not(feature = "simd"))]
     #[test]
     fn test_interval_array() {
         let a = IntervalDayTimeArray::from(
diff --git a/arrow/src/datatypes/numeric.rs b/arrow/src/datatypes/numeric.rs
index cbb953c..2ff53c9 100644
--- a/arrow/src/datatypes/numeric.rs
+++ b/arrow/src/datatypes/numeric.rs
@@ -147,6 +147,20 @@ macro_rules! make_numeric_type {
                 // this match will get removed by the compiler since the 
number of lanes is known at
                 // compile-time for each concrete numeric type
                 match Self::lanes() {
+                    4 => {
+                        // the bit position in each lane indicates the index 
of that lane
+                        let vecidx = i128x4::new(1, 2, 4, 8);
+
+                        // broadcast the lowermost 8 bits of mask to each lane
+                        let vecmask = i128x4::splat((mask & 0x0F) as i128);
+                        // compute whether the bit corresponding to each lanes 
index is set
+                        let vecmask = (vecidx & vecmask).eq(vecidx);
+
+                        // transmute is necessary because the different match 
arms return different
+                        // mask types, at runtime only one of those 
expressions will exist per type,
+                        // with the type being equal to `SimdMask`.
+                        unsafe { std::mem::transmute(vecmask) }
+                    }
                     8 => {
                         // the bit position in each lane indicates the index 
of that lane
                         let vecidx = i64x8::new(1, 2, 4, 8, 16, 32, 64, 128);
@@ -448,11 +462,11 @@ macro_rules! make_float_numeric_type {
 make_float_numeric_type!(Float32Type, f32x16);
 make_float_numeric_type!(Float64Type, f64x8);
 
-#[cfg(all(test, simd_x86))]
+#[cfg(all(test, feature = "simd"))]
 mod tests {
     use crate::datatypes::{
         ArrowNumericType, Float32Type, Float64Type, Int32Type, Int64Type, 
Int8Type,
-        UInt16Type,
+        IntervalMonthDayNanoType, UInt16Type,
     };
     use packed_simd::*;
     use FromCast;
@@ -471,6 +485,17 @@ mod tests {
     }
 
     #[test]
+    fn test_mask_i128() {
+        let mask = 0b1101;
+        let actual = IntervalMonthDayNanoType::mask_from_u64(mask);
+        let expected = expected_mask!(i128, mask);
+        let expected =
+            
m128x4::from_cast(i128x4::from_slice_unaligned(expected.as_slice()));
+
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
     fn test_mask_f64() {
         let mask = 0b10101010;
         let actual = Float64Type::mask_from_u64(mask);

[arrow-rs] branch master updated: Implement SIMD comparison operations for types with less than 4 lanes (i128) (#1146)

Reply via email to