[arrow-rs] branch master updated: Remove explicit simd arithmetic kernels except for division/modulo (#1221)

alamb Mon, 24 Jan 2022 11:50:43 -0800

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/master by this push:
     new 2d6352b  Remove explicit simd arithmetic kernels except for 
division/modulo (#1221)
2d6352b is described below

commit 2d6352b3b35a17cef2618f6475b7230ce0146d55
Author: Jörn Horstmann <[email protected]>
AuthorDate: Mon Jan 24 20:50:30 2022 +0100

    Remove explicit simd arithmetic kernels except for division/modulo (#1221)
    
    * Extend arithmetic benchmarks
    
    * Remove explicit simd arithmetic except for div/mod because 
autovectorization generates better code
    
    * Remove unneeded return keywords
---
 arrow/benches/arithmetic_kernels.rs     |  62 +++++---
 arrow/src/buffer/immutable.rs           |   1 +
 arrow/src/compute/kernels/arithmetic.rs | 242 ++++----------------------------
 3 files changed, 68 insertions(+), 237 deletions(-)

diff --git a/arrow/benches/arithmetic_kernels.rs 
b/arrow/benches/arithmetic_kernels.rs
index bbe4123..4be4a26 100644
--- a/arrow/benches/arithmetic_kernels.rs
+++ b/arrow/benches/arithmetic_kernels.rs
@@ -24,7 +24,6 @@ use std::sync::Arc;
 
 extern crate arrow;
 
-use arrow::compute::kernels::limit::*;
 use arrow::util::bench_util::*;
 use arrow::{array::*, datatypes::Float32Type};
 use arrow::{compute::kernels::arithmetic::*, util::test_util::seedable_rng};
@@ -59,44 +58,69 @@ fn bench_divide(arr_a: &ArrayRef, arr_b: &ArrayRef) {
     criterion::black_box(divide(arr_a, arr_b).unwrap());
 }
 
+fn bench_divide_unchecked(arr_a: &ArrayRef, arr_b: &ArrayRef) {
+    let arr_a = arr_a.as_any().downcast_ref::<Float32Array>().unwrap();
+    let arr_b = arr_b.as_any().downcast_ref::<Float32Array>().unwrap();
+    criterion::black_box(divide_unchecked(arr_a, arr_b).unwrap());
+}
+
 fn bench_divide_scalar(array: &ArrayRef, divisor: f32) {
     let array = array.as_any().downcast_ref::<Float32Array>().unwrap();
     criterion::black_box(divide_scalar(array, divisor).unwrap());
 }
 
-fn bench_limit(arr_a: &ArrayRef, max: usize) {
-    criterion::black_box(limit(arr_a, max));
+fn bench_modulo(arr_a: &ArrayRef, arr_b: &ArrayRef) {
+    let arr_a = arr_a.as_any().downcast_ref::<Float32Array>().unwrap();
+    let arr_b = arr_b.as_any().downcast_ref::<Float32Array>().unwrap();
+    criterion::black_box(modulus(arr_a, arr_b).unwrap());
+}
+
+fn bench_modulo_scalar(array: &ArrayRef, divisor: f32) {
+    let array = array.as_any().downcast_ref::<Float32Array>().unwrap();
+    criterion::black_box(modulus_scalar(array, divisor).unwrap());
 }
 
 fn add_benchmark(c: &mut Criterion) {
-    let arr_a = create_array(512, false);
-    let arr_b = create_array(512, false);
+    const BATCH_SIZE: usize = 64 * 1024;
+    let arr_a = create_array(BATCH_SIZE, false);
+    let arr_b = create_array(BATCH_SIZE, false);
     let scalar = seedable_rng().gen();
 
-    c.bench_function("add 512", |b| b.iter(|| bench_add(&arr_a, &arr_b)));
-    c.bench_function("subtract 512", |b| {
-        b.iter(|| bench_subtract(&arr_a, &arr_b))
+    c.bench_function("add", |b| b.iter(|| bench_add(&arr_a, &arr_b)));
+    c.bench_function("subtract", |b| b.iter(|| bench_subtract(&arr_a, 
&arr_b)));
+    c.bench_function("multiply", |b| b.iter(|| bench_multiply(&arr_a, 
&arr_b)));
+    c.bench_function("divide", |b| b.iter(|| bench_divide(&arr_a, &arr_b)));
+    c.bench_function("divide_unchecked", |b| {
+        b.iter(|| bench_divide_unchecked(&arr_a, &arr_b))
     });
-    c.bench_function("multiply 512", |b| {
-        b.iter(|| bench_multiply(&arr_a, &arr_b))
-    });
-    c.bench_function("divide 512", |b| b.iter(|| bench_divide(&arr_a, 
&arr_b)));
-    c.bench_function("divide_scalar 512", |b| {
+    c.bench_function("divide_scalar", |b| {
         b.iter(|| bench_divide_scalar(&arr_a, scalar))
     });
-    c.bench_function("limit 512, 512", |b| b.iter(|| bench_limit(&arr_a, 
512)));
+    c.bench_function("modulo", |b| b.iter(|| bench_modulo(&arr_a, &arr_b)));
+    c.bench_function("modulo_scalar", |b| {
+        b.iter(|| bench_modulo_scalar(&arr_a, scalar))
+    });
 
-    let arr_a_nulls = create_array(512, false);
-    let arr_b_nulls = create_array(512, false);
-    c.bench_function("add_nulls_512", |b| {
+    let arr_a_nulls = create_array(BATCH_SIZE, true);
+    let arr_b_nulls = create_array(BATCH_SIZE, true);
+    c.bench_function("add_nulls", |b| {
         b.iter(|| bench_add(&arr_a_nulls, &arr_b_nulls))
     });
-    c.bench_function("divide_nulls_512", |b| {
+    c.bench_function("divide_nulls", |b| {
         b.iter(|| bench_divide(&arr_a_nulls, &arr_b_nulls))
     });
-    c.bench_function("divide_scalar_nulls_512", |b| {
+    c.bench_function("divide_nulls_unchecked", |b| {
+        b.iter(|| bench_divide_unchecked(&arr_a_nulls, &arr_b_nulls))
+    });
+    c.bench_function("divide_scalar_nulls", |b| {
         b.iter(|| bench_divide_scalar(&arr_a_nulls, scalar))
     });
+    c.bench_function("modulo_nulls", |b| {
+        b.iter(|| bench_modulo(&arr_a_nulls, &arr_b_nulls))
+    });
+    c.bench_function("modulo_scalar_nulls", |b| {
+        b.iter(|| bench_modulo_scalar(&arr_a_nulls, scalar))
+    });
 }
 
 criterion_group!(benches, add_benchmark);
diff --git a/arrow/src/buffer/immutable.rs b/arrow/src/buffer/immutable.rs
index 0823de5..a3de0d4 100644
--- a/arrow/src/buffer/immutable.rs
+++ b/arrow/src/buffer/immutable.rs
@@ -153,6 +153,7 @@ impl Buffer {
     ///
     /// Note that this should be used cautiously, and the returned pointer 
should not be
     /// stored anywhere, to avoid dangling pointers.
+    #[inline]
     pub fn as_ptr(&self) -> *const u8 {
         unsafe { self.data.ptr().as_ptr().add(self.offset) }
     }
diff --git a/arrow/src/compute/kernels/arithmetic.rs 
b/arrow/src/compute/kernels/arithmetic.rs
index 4eb9576..dcdfa95 100644
--- a/arrow/src/compute/kernels/arithmetic.rs
+++ b/arrow/src/compute/kernels/arithmetic.rs
@@ -29,7 +29,6 @@ use num::{One, Zero};
 use crate::buffer::Buffer;
 #[cfg(feature = "simd")]
 use crate::buffer::MutableBuffer;
-#[cfg(not(feature = "simd"))]
 use crate::compute::kernels::arity::unary;
 use crate::compute::util::combine_option_bitmap;
 use crate::datatypes;
@@ -42,65 +41,6 @@ use std::borrow::BorrowMut;
 #[cfg(feature = "simd")]
 use std::slice::{ChunksExact, ChunksExactMut};
 
-/// Creates a new PrimitiveArray by applying `simd_op` to the input `array`.
-/// If the length of the array is not multiple of the number of vector lanes
-/// then the remainder of the array will be calculated using `scalar_op`.
-/// Any operation on a `NULL` value will result in a `NULL` value in the 
output.
-#[cfg(feature = "simd")]
-fn simd_unary_math_op<T, SIMD_OP, SCALAR_OP>(
-    array: &PrimitiveArray<T>,
-    simd_op: SIMD_OP,
-    scalar_op: SCALAR_OP,
-) -> PrimitiveArray<T>
-where
-    T: ArrowNumericType,
-    SIMD_OP: Fn(T::Simd) -> T::Simd,
-    SCALAR_OP: Fn(T::Native) -> T::Native,
-{
-    let lanes = T::lanes();
-    let buffer_size = array.len() * std::mem::size_of::<T::Native>();
-
-    let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, 
false);
-
-    // safety: result is newly created above, always written as a T below
-    let mut result_chunks = unsafe { 
result.typed_data_mut().chunks_exact_mut(lanes) };
-    let mut array_chunks = array.values().chunks_exact(lanes);
-
-    result_chunks
-        .borrow_mut()
-        .zip(array_chunks.borrow_mut())
-        .for_each(|(result_slice, input_slice)| {
-            let simd_input = T::load(input_slice);
-            let simd_result = T::unary_op(simd_input, &simd_op);
-            T::write(simd_result, result_slice);
-        });
-
-    let result_remainder = result_chunks.into_remainder();
-    let array_remainder = array_chunks.remainder();
-
-    result_remainder.into_iter().zip(array_remainder).for_each(
-        |(scalar_result, scalar_input)| {
-            *scalar_result = scalar_op(*scalar_input);
-        },
-    );
-
-    let data = unsafe {
-        ArrayData::new_unchecked(
-            T::DATA_TYPE,
-            array.len(),
-            None,
-            array
-                .data_ref()
-                .null_buffer()
-                .map(|b| b.bit_slice(array.offset(), array.len())),
-            0,
-            vec![result.into()],
-            vec![],
-        )
-    };
-    PrimitiveArray::<T>::from(data)
-}
-
 /// Helper function to perform math lambda function on values from two arrays. 
If either
 /// left or right value is null then the output value is also null, so `1 + 
null` is
 /// `null`.
@@ -227,79 +167,6 @@ where
     Ok(PrimitiveArray::<T>::from(data))
 }
 
-/// Creates a new PrimitiveArray by applying `simd_op` to the `left` and 
`right` input arrays.
-/// If the length of the arrays is not multiple of the number of vector lanes
-/// then the remainder of the array will be calculated using `scalar_op`.
-/// Any operation on a `NULL` value will result in a `NULL` value in the 
output.
-///
-/// # Errors
-///
-/// This function errors if the arrays have different lengths
-#[cfg(feature = "simd")]
-fn simd_math_op<T, SIMD_OP, SCALAR_OP>(
-    left: &PrimitiveArray<T>,
-    right: &PrimitiveArray<T>,
-    simd_op: SIMD_OP,
-    scalar_op: SCALAR_OP,
-) -> Result<PrimitiveArray<T>>
-where
-    T: ArrowNumericType,
-    SIMD_OP: Fn(T::Simd, T::Simd) -> T::Simd,
-    SCALAR_OP: Fn(T::Native, T::Native) -> T::Native,
-{
-    if left.len() != right.len() {
-        return Err(ArrowError::ComputeError(
-            "Cannot perform math operation on arrays of different 
length".to_string(),
-        ));
-    }
-
-    let null_bit_buffer =
-        combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?;
-
-    let lanes = T::lanes();
-    let buffer_size = left.len() * std::mem::size_of::<T::Native>();
-    let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, 
false);
-
-    // safety: result is newly created above, always written as a T below
-    let mut result_chunks = unsafe { 
result.typed_data_mut().chunks_exact_mut(lanes) };
-    let mut left_chunks = left.values().chunks_exact(lanes);
-    let mut right_chunks = right.values().chunks_exact(lanes);
-
-    result_chunks
-        .borrow_mut()
-        .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut()))
-        .for_each(|(result_slice, (left_slice, right_slice))| {
-            let simd_left = T::load(left_slice);
-            let simd_right = T::load(right_slice);
-            let simd_result = T::bin_op(simd_left, simd_right, &simd_op);
-            T::write(simd_result, result_slice);
-        });
-
-    let result_remainder = result_chunks.into_remainder();
-    let left_remainder = left_chunks.remainder();
-    let right_remainder = right_chunks.remainder();
-
-    result_remainder
-        .iter_mut()
-        .zip(left_remainder.iter().zip(right_remainder.iter()))
-        .for_each(|(scalar_result, (scalar_left, scalar_right))| {
-            *scalar_result = scalar_op(*scalar_left, *scalar_right);
-        });
-
-    let data = unsafe {
-        ArrayData::new_unchecked(
-            T::DATA_TYPE,
-            left.len(),
-            None,
-            null_bit_buffer,
-            0,
-            vec![result.into()],
-            vec![],
-        )
-    };
-    Ok(PrimitiveArray::<T>::from(data))
-}
-
 /// Calculates the modulus operation `left % right` on two SIMD inputs.
 /// The lower-most bits of `valid_mask` specify which vector lanes are 
considered as valid.
 ///
@@ -566,10 +433,7 @@ where
     T: ArrowNumericType,
     T::Native: Add<Output = T::Native>,
 {
-    #[cfg(feature = "simd")]
-    return simd_math_op(&left, &right, |a, b| a + b, |a, b| a + b);
-    #[cfg(not(feature = "simd"))]
-    return math_op(left, right, |a, b| a + b);
+    math_op(left, right, |a, b| a + b)
 }
 
 /// Add every value in an array by a scalar. If any value in the array is null 
then the
@@ -582,17 +446,7 @@ where
     T: datatypes::ArrowNumericType,
     T::Native: Add<Output = T::Native>,
 {
-    #[cfg(feature = "simd")]
-    {
-        let scalar_vector = T::init(scalar);
-        return Ok(simd_unary_math_op(
-            array,
-            |x| x + scalar_vector,
-            |x| x + scalar,
-        ));
-    }
-    #[cfg(not(feature = "simd"))]
-    return Ok(unary(array, |value| value + scalar));
+    Ok(unary(array, |value| value + scalar))
 }
 
 /// Perform `left - right` operation on two arrays. If either left or right 
value is null
@@ -605,10 +459,7 @@ where
     T: datatypes::ArrowNumericType,
     T::Native: Sub<Output = T::Native>,
 {
-    #[cfg(feature = "simd")]
-    return simd_math_op(&left, &right, |a, b| a - b, |a, b| a - b);
-    #[cfg(not(feature = "simd"))]
-    return math_op(left, right, |a, b| a - b);
+    math_op(left, right, |a, b| a - b)
 }
 
 /// Subtract every value in an array by a scalar. If any value in the array is 
null then the
@@ -625,17 +476,7 @@ where
         + Div<Output = T::Native>
         + Zero,
 {
-    #[cfg(feature = "simd")]
-    {
-        let scalar_vector = T::init(scalar);
-        return Ok(simd_unary_math_op(
-            array,
-            |x| x - scalar_vector,
-            |x| x - scalar,
-        ));
-    }
-    #[cfg(not(feature = "simd"))]
-    return Ok(unary(array, |value| value - scalar));
+    Ok(unary(array, |value| value - scalar))
 }
 
 /// Perform `-` operation on an array. If value is null then the result is 
also null.
@@ -644,13 +485,7 @@ where
     T: datatypes::ArrowNumericType,
     T::Native: Neg<Output = T::Native>,
 {
-    #[cfg(feature = "simd")]
-    {
-        let zero_vector = T::init(T::default_value());
-        return Ok(simd_unary_math_op(array, |x| zero_vector - x, |x| -x));
-    }
-    #[cfg(not(feature = "simd"))]
-    return Ok(unary(array, |x| -x));
+    Ok(unary(array, |x| -x))
 }
 
 /// Raise array with floating point values to the power of a scalar.
@@ -662,17 +497,7 @@ where
     T: datatypes::ArrowFloatNumericType,
     T::Native: Pow<T::Native, Output = T::Native>,
 {
-    #[cfg(feature = "simd")]
-    {
-        let raise_vector = T::init(raise);
-        return Ok(simd_unary_math_op(
-            array,
-            |x| T::pow(x, raise_vector),
-            |x| x.pow(raise),
-        ));
-    }
-    #[cfg(not(feature = "simd"))]
-    return Ok(unary(array, |x| x.pow(raise)));
+    Ok(unary(array, |x| x.pow(raise)))
 }
 
 /// Perform `left * right` operation on two arrays. If either left or right 
value is null
@@ -685,10 +510,7 @@ where
     T: datatypes::ArrowNumericType,
     T::Native: Mul<Output = T::Native>,
 {
-    #[cfg(feature = "simd")]
-    return simd_math_op(&left, &right, |a, b| a * b, |a, b| a * b);
-    #[cfg(not(feature = "simd"))]
-    return math_op(left, right, |a, b| a * b);
+    math_op(left, right, |a, b| a * b)
 }
 
 /// Multiply every value in an array by a scalar. If any value in the array is 
null then the
@@ -707,17 +529,7 @@ where
         + Zero
         + One,
 {
-    #[cfg(feature = "simd")]
-    {
-        let scalar_vector = T::init(scalar);
-        return Ok(simd_unary_math_op(
-            array,
-            |x| x * scalar_vector,
-            |x| x * scalar,
-        ));
-    }
-    #[cfg(not(feature = "simd"))]
-    return Ok(unary(array, |value| value * scalar));
+    Ok(unary(array, |value| value * scalar))
 }
 
 /// Perform `left % right` operation on two arrays. If either left or right 
value is null
@@ -756,6 +568,20 @@ where
     return math_checked_divide_op(left, right, |a, b| a / b);
 }
 
+/// Perform `left / right` operation on two arrays without checking for 
division by zero.
+/// The result of dividing by zero follows normal floating point rules.
+/// If either left or right value is null then the result is also null. If any 
right hand value is zero then the result of this operation is infinite or NaN instead of an error.
+pub fn divide_unchecked<T>(
+    left: &PrimitiveArray<T>,
+    right: &PrimitiveArray<T>,
+) -> Result<PrimitiveArray<T>>
+where
+    T: datatypes::ArrowFloatNumericType,
+    T::Native: Div<Output = T::Native>,
+{
+    math_op(left, right, |a, b| a / b)
+}
+
 /// Modulus every value in an array by a scalar. If any value in the array is 
null then the
 /// result is also null. If the scalar is zero then the result of this 
operation will be
 /// `Err(ArrowError::DivideByZero)`.
@@ -771,17 +597,7 @@ where
         return Err(ArrowError::DivideByZero);
     }
 
-    #[cfg(feature = "simd")]
-    {
-        let modulo_vector = T::init(modulo);
-        return Ok(simd_unary_math_op(
-            &array,
-            |a| a % modulo_vector,
-            |a| a % modulo,
-        ));
-    }
-    #[cfg(not(feature = "simd"))]
-    return Ok(unary(array, |a| a % modulo));
+    Ok(unary(array, |a| a % modulo))
 }
 
 /// Divide every value in an array by a scalar. If any value in the array is 
null then the
@@ -798,17 +614,7 @@ where
     if divisor.is_zero() {
         return Err(ArrowError::DivideByZero);
     }
-    #[cfg(feature = "simd")]
-    {
-        let divisor_vector = T::init(divisor);
-        return Ok(simd_unary_math_op(
-            &array,
-            |a| a / divisor_vector,
-            |a| a / divisor,
-        ));
-    }
-    #[cfg(not(feature = "simd"))]
-    return Ok(unary(array, |a| a / divisor));
+    Ok(unary(array, |a| a / divisor))
 }
 
 #[cfg(test)]

[arrow-rs] branch master updated: Remove explicit simd arithmetic kernels except for division/modulo (#1221)

Reply via email to