[GitHub] [arrow-rs] tustvold commented on a diff in pull request #4560: Restructure `sum` for better auto-vectorization for floats

via GitHub Sun, 30 Jul 2023 13:57:17 -0700


tustvold commented on code in PR #4560:
URL: https://github.com/apache/arrow-rs/pull/4560#discussion_r1278611116



##########
arrow-arith/src/aggregate.rs:
##########
@@ -285,44 +285,178 @@ where
         return None;
     }
 
-    let data: &[T::Native] = array.values();
+    fn sum_impl_integer<T>(array: &PrimitiveArray<T>) -> Option<T::Native>
+    where
+        T: ArrowNumericType,
+        T::Native: ArrowNativeTypeOp,
+    {
+        let data: &[T::Native] = array.values();
 
-    match array.nulls() {
-        None => {
-            let sum = data.iter().fold(T::default_value(), |accumulator, 
value| {
-                accumulator.add_wrapping(*value)
-            });
+        match array.nulls() {
+            None => {
+                let sum = data.iter().fold(T::default_value(), |accumulator, 
value| {
+                    accumulator.add_wrapping(*value)
+                });
 
-            Some(sum)
+                Some(sum)
+            }
+            Some(nulls) => {
+                let mut sum = T::default_value();
+                let data_chunks = data.chunks_exact(64);
+                let remainder = data_chunks.remainder();
+
+                let bit_chunks = nulls.inner().bit_chunks();
+                data_chunks
+                    .zip(bit_chunks.iter())
+                    .for_each(|(chunk, mask)| {
+                        // index_mask has value 1 << i in the loop
+                        let mut index_mask = 1;
+                        chunk.iter().for_each(|value| {
+                            if (mask & index_mask) != 0 {
+                                sum = sum.add_wrapping(*value);
+                            }
+                            index_mask <<= 1;
+                        });
+                    });
+
+                let remainder_bits = bit_chunks.remainder_bits();
+
+                remainder.iter().enumerate().for_each(|(i, value)| {
+                    if remainder_bits & (1 << i) != 0 {
+                        sum = sum.add_wrapping(*value);
+                    }
+                });
+
+                Some(sum)
+            }
         }
-        Some(nulls) => {
-            let mut sum = T::default_value();
-            let data_chunks = data.chunks_exact(64);
-            let remainder = data_chunks.remainder();
-
-            let bit_chunks = nulls.inner().bit_chunks();
-            data_chunks
-                .zip(bit_chunks.iter())
-                .for_each(|(chunk, mask)| {
-                    // index_mask has value 1 << i in the loop
-                    let mut index_mask = 1;
-                    chunk.iter().for_each(|value| {
-                        if (mask & index_mask) != 0 {
-                            sum = sum.add_wrapping(*value);
+    }
+
+    fn sum_impl_floating<T, const LANES: usize>(
+        array: &PrimitiveArray<T>,
+    ) -> Option<T::Native>
+    where
+        T: ArrowNumericType,
+        T::Native: ArrowNativeTypeOp,
+    {
+        let data: &[T::Native] = array.values();
+        let mut chunk_acc = [T::default_value(); LANES];
+        let mut rem_acc = T::default_value();
+
+        match array.nulls() {
+            None => {
+                let data_chunks = data.chunks_exact(LANES);
+                let remainder = data_chunks.remainder();
+
+                data_chunks.for_each(|chunk| {
+                    let chunk: [T::Native; LANES] = chunk.try_into().unwrap();
+
+                    for i in 0..LANES {
+                        chunk_acc[i] = chunk_acc[i].add_wrapping(chunk[i]);
+                    }
+                });
+
+                remainder.iter().copied().for_each(|value| {
+                    rem_acc = rem_acc.add_wrapping(value);
+                });
+
+                let mut reduced = T::default_value();
+                for v in chunk_acc {
+                    reduced = reduced.add_wrapping(v);
+                }
+                let sum = reduced.add_wrapping(rem_acc);
+
+                Some(sum)
+            }
+            Some(nulls) => {
+                // process data in chunks of 64 elements since we also get 64 
bits of validity information at a time
+                let data_chunks = data.chunks_exact(64);
+                let remainder = data_chunks.remainder();
+
+                let bit_chunks = nulls.inner().bit_chunks();
+                let remainder_bits = bit_chunks.remainder_bits();
+
+                data_chunks.zip(bit_chunks).for_each(|(chunk, mut mask)| {
+                    // split chunks further into slices corresponding to the 
vector length
+                    // the compiler is able to unroll this inner loop and 
remove bounds checks
+                    // since the outer chunk size (64) is always a multiple of 
the number of lanes
+                    chunk.chunks_exact(LANES).for_each(|chunk| {
+                        let mut chunk: [T::Native; LANES] = 
chunk.try_into().unwrap();
+
+                        for i in 0..LANES {
+                            if mask & (1 << i) == 0 {
+                                chunk[i] = T::default_value();
+                            }
+                            chunk_acc[i] = chunk_acc[i].add_wrapping(chunk[i]);
                         }
-                        index_mask <<= 1;
-                    });
+
+                        mask >>= LANES;
+                    })
                 });
 
-            let remainder_bits = bit_chunks.remainder_bits();
+                remainder.iter().enumerate().for_each(|(i, value)| {
+                    if remainder_bits & (1 << i) != 0 {
+                        rem_acc = rem_acc.add_wrapping(*value);
+                    }
+                });
 
-            remainder.iter().enumerate().for_each(|(i, value)| {
-                if remainder_bits & (1 << i) != 0 {
-                    sum = sum.add_wrapping(*value);
+                let mut reduced = T::default_value();
+                for v in chunk_acc {
+                    reduced = reduced.add_wrapping(v);
                 }
-            });
+                let sum = reduced.add_wrapping(rem_acc);
 
-            Some(sum)
+                Some(sum)
+            }
+        }
+    }
+
+    match T::DATA_TYPE {
+        DataType::Timestamp(_, _)
+        | DataType::Time32(_)
+        | DataType::Time64(_)
+        | DataType::Date32
+        | DataType::Date64
+        | DataType::Duration(_)
+        | DataType::Interval(_)
+        | DataType::Int8
+        | DataType::Int16
+        | DataType::Int32
+        | DataType::Int64
+        | DataType::UInt8
+        | DataType::UInt16
+        | DataType::UInt32
+        | DataType::UInt64 => sum_impl_integer(array),
+        DataType::Float16
+        | DataType::Float32
+        | DataType::Float64
+        | DataType::Decimal128(_, _)
+        | DataType::Decimal256(_, _) => match T::lanes() {

Review Comment:
   Why are the decimal types (`Decimal128`, `Decimal256`) in the floating-point branch here?



##########
arrow-arith/src/aggregate.rs:
##########
@@ -285,44 +285,178 @@ where
         return None;
     }
 
-    let data: &[T::Native] = array.values();
+    fn sum_impl_integer<T>(array: &PrimitiveArray<T>) -> Option<T::Native>
+    where
+        T: ArrowNumericType,
+        T::Native: ArrowNativeTypeOp,
+    {
+        let data: &[T::Native] = array.values();
 
-    match array.nulls() {
-        None => {
-            let sum = data.iter().fold(T::default_value(), |accumulator, 
value| {
-                accumulator.add_wrapping(*value)
-            });
+        match array.nulls() {
+            None => {
+                let sum = data.iter().fold(T::default_value(), |accumulator, 
value| {
+                    accumulator.add_wrapping(*value)
+                });
 
-            Some(sum)
+                Some(sum)
+            }
+            Some(nulls) => {
+                let mut sum = T::default_value();
+                let data_chunks = data.chunks_exact(64);
+                let remainder = data_chunks.remainder();
+
+                let bit_chunks = nulls.inner().bit_chunks();
+                data_chunks
+                    .zip(bit_chunks.iter())
+                    .for_each(|(chunk, mask)| {
+                        // index_mask has value 1 << i in the loop
+                        let mut index_mask = 1;
+                        chunk.iter().for_each(|value| {
+                            if (mask & index_mask) != 0 {
+                                sum = sum.add_wrapping(*value);
+                            }
+                            index_mask <<= 1;
+                        });
+                    });
+
+                let remainder_bits = bit_chunks.remainder_bits();
+
+                remainder.iter().enumerate().for_each(|(i, value)| {
+                    if remainder_bits & (1 << i) != 0 {
+                        sum = sum.add_wrapping(*value);
+                    }
+                });
+
+                Some(sum)
+            }
         }
-        Some(nulls) => {
-            let mut sum = T::default_value();
-            let data_chunks = data.chunks_exact(64);
-            let remainder = data_chunks.remainder();
-
-            let bit_chunks = nulls.inner().bit_chunks();
-            data_chunks
-                .zip(bit_chunks.iter())
-                .for_each(|(chunk, mask)| {
-                    // index_mask has value 1 << i in the loop
-                    let mut index_mask = 1;
-                    chunk.iter().for_each(|value| {
-                        if (mask & index_mask) != 0 {
-                            sum = sum.add_wrapping(*value);
+    }
+
+    fn sum_impl_floating<T, const LANES: usize>(
+        array: &PrimitiveArray<T>,
+    ) -> Option<T::Native>
+    where
+        T: ArrowNumericType,
+        T::Native: ArrowNativeTypeOp,
+    {
+        let data: &[T::Native] = array.values();
+        let mut chunk_acc = [T::default_value(); LANES];
+        let mut rem_acc = T::default_value();
+
+        match array.nulls() {
+            None => {
+                let data_chunks = data.chunks_exact(LANES);
+                let remainder = data_chunks.remainder();
+
+                data_chunks.for_each(|chunk| {
+                    let chunk: [T::Native; LANES] = chunk.try_into().unwrap();
+
+                    for i in 0..LANES {
+                        chunk_acc[i] = chunk_acc[i].add_wrapping(chunk[i]);
+                    }
+                });
+
+                remainder.iter().copied().for_each(|value| {
+                    rem_acc = rem_acc.add_wrapping(value);
+                });
+
+                let mut reduced = T::default_value();
+                for v in chunk_acc {
+                    reduced = reduced.add_wrapping(v);
+                }
+                let sum = reduced.add_wrapping(rem_acc);
+
+                Some(sum)
+            }
+            Some(nulls) => {
+                // process data in chunks of 64 elements since we also get 64 
bits of validity information at a time
+                let data_chunks = data.chunks_exact(64);
+                let remainder = data_chunks.remainder();
+
+                let bit_chunks = nulls.inner().bit_chunks();
+                let remainder_bits = bit_chunks.remainder_bits();
+
+                data_chunks.zip(bit_chunks).for_each(|(chunk, mut mask)| {
+                    // split chunks further into slices corresponding to the 
vector length
+                    // the compiler is able to unroll this inner loop and 
remove bounds checks
+                    // since the outer chunk size (64) is always a multiple of 
the number of lanes
+                    chunk.chunks_exact(LANES).for_each(|chunk| {
+                        let mut chunk: [T::Native; LANES] = 
chunk.try_into().unwrap();
+
+                        for i in 0..LANES {
+                            if mask & (1 << i) == 0 {
+                                chunk[i] = T::default_value();
+                            }
+                            chunk_acc[i] = chunk_acc[i].add_wrapping(chunk[i]);
                         }
-                        index_mask <<= 1;
-                    });
+
+                        mask >>= LANES;
+                    })
                 });
 
-            let remainder_bits = bit_chunks.remainder_bits();
+                remainder.iter().enumerate().for_each(|(i, value)| {
+                    if remainder_bits & (1 << i) != 0 {
+                        rem_acc = rem_acc.add_wrapping(*value);
+                    }
+                });
 
-            remainder.iter().enumerate().for_each(|(i, value)| {
-                if remainder_bits & (1 << i) != 0 {
-                    sum = sum.add_wrapping(*value);
+                let mut reduced = T::default_value();
+                for v in chunk_acc {
+                    reduced = reduced.add_wrapping(v);
                 }
-            });
+                let sum = reduced.add_wrapping(rem_acc);
 
-            Some(sum)
+                Some(sum)
+            }
+        }
+    }
+
+    match T::DATA_TYPE {
+        DataType::Timestamp(_, _)
+        | DataType::Time32(_)
+        | DataType::Time64(_)
+        | DataType::Date32
+        | DataType::Date64
+        | DataType::Duration(_)
+        | DataType::Interval(_)
+        | DataType::Int8
+        | DataType::Int16
+        | DataType::Int32
+        | DataType::Int64
+        | DataType::UInt8
+        | DataType::UInt16
+        | DataType::UInt32
+        | DataType::UInt64 => sum_impl_integer(array),
+        DataType::Float16
+        | DataType::Float32
+        | DataType::Float64
+        | DataType::Decimal128(_, _)
+        | DataType::Decimal256(_, _) => match T::lanes() {
+            1 => sum_impl_floating::<T, 1>(array),
+            2 => sum_impl_floating::<T, 2>(array),
+            4 => sum_impl_floating::<T, 4>(array),

Review Comment:
   It occurs to me that we only have 3 floating point types, so we could just 
dispatch each of them to `sum_impl_floating` with the appropriate constant 
specified, without needing `ArrowNumericType`?



##########
arrow-array/src/numeric.rs:
##########
@@ -113,10 +113,13 @@ where
 
 /// A subtype of primitive type that represents numeric values.
 #[cfg(not(feature = "simd"))]
-pub trait ArrowNumericType: ArrowPrimitiveType {}
+pub trait ArrowNumericType: ArrowPrimitiveType {
+    /// The number of SIMD lanes available
+    fn lanes() -> usize;

Review Comment:
   It feels a little off to define this for all the types, but then only use it 
for a special case of floats :thinking:  



##########
arrow-arith/src/aggregate.rs:
##########
@@ -285,44 +285,178 @@ where
         return None;
     }
 
-    let data: &[T::Native] = array.values();
+    fn sum_impl_integer<T>(array: &PrimitiveArray<T>) -> Option<T::Native>
+    where
+        T: ArrowNumericType,
+        T::Native: ArrowNativeTypeOp,
+    {
+        let data: &[T::Native] = array.values();
 
-    match array.nulls() {
-        None => {
-            let sum = data.iter().fold(T::default_value(), |accumulator, 
value| {
-                accumulator.add_wrapping(*value)
-            });
+        match array.nulls() {
+            None => {
+                let sum = data.iter().fold(T::default_value(), |accumulator, 
value| {
+                    accumulator.add_wrapping(*value)
+                });
 
-            Some(sum)
+                Some(sum)
+            }
+            Some(nulls) => {
+                let mut sum = T::default_value();
+                let data_chunks = data.chunks_exact(64);
+                let remainder = data_chunks.remainder();
+
+                let bit_chunks = nulls.inner().bit_chunks();
+                data_chunks
+                    .zip(bit_chunks.iter())
+                    .for_each(|(chunk, mask)| {
+                        // index_mask has value 1 << i in the loop
+                        let mut index_mask = 1;
+                        chunk.iter().for_each(|value| {
+                            if (mask & index_mask) != 0 {
+                                sum = sum.add_wrapping(*value);
+                            }
+                            index_mask <<= 1;
+                        });
+                    });
+
+                let remainder_bits = bit_chunks.remainder_bits();
+
+                remainder.iter().enumerate().for_each(|(i, value)| {
+                    if remainder_bits & (1 << i) != 0 {
+                        sum = sum.add_wrapping(*value);
+                    }
+                });
+
+                Some(sum)
+            }
         }
-        Some(nulls) => {
-            let mut sum = T::default_value();
-            let data_chunks = data.chunks_exact(64);
-            let remainder = data_chunks.remainder();
-
-            let bit_chunks = nulls.inner().bit_chunks();
-            data_chunks
-                .zip(bit_chunks.iter())
-                .for_each(|(chunk, mask)| {
-                    // index_mask has value 1 << i in the loop
-                    let mut index_mask = 1;
-                    chunk.iter().for_each(|value| {
-                        if (mask & index_mask) != 0 {
-                            sum = sum.add_wrapping(*value);
+    }
+
+    fn sum_impl_floating<T, const LANES: usize>(
+        array: &PrimitiveArray<T>,
+    ) -> Option<T::Native>
+    where
+        T: ArrowNumericType,
+        T::Native: ArrowNativeTypeOp,
+    {
+        let data: &[T::Native] = array.values();
+        let mut chunk_acc = [T::default_value(); LANES];
+        let mut rem_acc = T::default_value();
+
+        match array.nulls() {
+            None => {
+                let data_chunks = data.chunks_exact(LANES);
+                let remainder = data_chunks.remainder();
+
+                data_chunks.for_each(|chunk| {
+                    let chunk: [T::Native; LANES] = chunk.try_into().unwrap();
+
+                    for i in 0..LANES {
+                        chunk_acc[i] = chunk_acc[i].add_wrapping(chunk[i]);
+                    }
+                });
+
+                remainder.iter().copied().for_each(|value| {
+                    rem_acc = rem_acc.add_wrapping(value);
+                });
+
+                let mut reduced = T::default_value();
+                for v in chunk_acc {
+                    reduced = reduced.add_wrapping(v);
+                }
+                let sum = reduced.add_wrapping(rem_acc);
+
+                Some(sum)
+            }
+            Some(nulls) => {
+                // process data in chunks of 64 elements since we also get 64 
bits of validity information at a time
+                let data_chunks = data.chunks_exact(64);
+                let remainder = data_chunks.remainder();
+
+                let bit_chunks = nulls.inner().bit_chunks();
+                let remainder_bits = bit_chunks.remainder_bits();
+
+                data_chunks.zip(bit_chunks).for_each(|(chunk, mut mask)| {
+                    // split chunks further into slices corresponding to the 
vector length
+                    // the compiler is able to unroll this inner loop and 
remove bounds checks
+                    // since the outer chunk size (64) is always a multiple of 
the number of lanes
+                    chunk.chunks_exact(LANES).for_each(|chunk| {
+                        let mut chunk: [T::Native; LANES] = 
chunk.try_into().unwrap();
+
+                        for i in 0..LANES {
+                            if mask & (1 << i) == 0 {
+                                chunk[i] = T::default_value();
+                            }
+                            chunk_acc[i] = chunk_acc[i].add_wrapping(chunk[i]);
                         }
-                        index_mask <<= 1;
-                    });
+
+                        mask >>= LANES;
+                    })
                 });
 
-            let remainder_bits = bit_chunks.remainder_bits();
+                remainder.iter().enumerate().for_each(|(i, value)| {
+                    if remainder_bits & (1 << i) != 0 {
+                        rem_acc = rem_acc.add_wrapping(*value);
+                    }
+                });
 
-            remainder.iter().enumerate().for_each(|(i, value)| {
-                if remainder_bits & (1 << i) != 0 {
-                    sum = sum.add_wrapping(*value);
+                let mut reduced = T::default_value();
+                for v in chunk_acc {
+                    reduced = reduced.add_wrapping(v);
                 }
-            });
+                let sum = reduced.add_wrapping(rem_acc);
 
-            Some(sum)
+                Some(sum)
+            }
+        }
+    }
+
+    match T::DATA_TYPE {

Review Comment:
   This match block is kind of grim, but I don't have a better solution off the 
top of my head... Perhaps some sort of trait :thinking: 



##########
arrow/benches/aggregate_kernels.rs:
##########
@@ -17,42 +17,79 @@
 
 #[macro_use]
 extern crate criterion;
+use arrow_array::types::{
+    Float64Type, TimestampMillisecondType, UInt16Type, UInt32Type, UInt64Type, 
UInt8Type,
+};
+use arrow_array::ArrowNumericType;
 use criterion::Criterion;
 
 extern crate arrow;
 
 use arrow::compute::kernels::aggregate::*;
 use arrow::util::bench_util::*;
 use arrow::{array::*, datatypes::Float32Type};
+use rand::distributions::Standard;
+use rand::prelude::Distribution;
 
-fn bench_sum(arr_a: &Float32Array) {
+fn bench_sum<T: ArrowNumericType>(arr_a: &PrimitiveArray<T>) {
     criterion::black_box(sum(arr_a).unwrap());
 }
 
-fn bench_min(arr_a: &Float32Array) {
+fn bench_min<T: ArrowNumericType>(arr_a: &PrimitiveArray<T>) {
     criterion::black_box(min(arr_a).unwrap());
 }
 
-fn bench_max(arr_a: &Float32Array) {
+fn bench_max<T: ArrowNumericType>(arr_a: &PrimitiveArray<T>) {
     criterion::black_box(max(arr_a).unwrap());
 }
 
 fn bench_min_string(arr_a: &StringArray) {
     criterion::black_box(min_string(arr_a).unwrap());
 }
 
+fn sum_min_max_bench<T>(
+    c: &mut Criterion,
+    size: usize,
+    null_density: f32,
+    description: &str,
+) where
+    T: ArrowNumericType,
+    Standard: Distribution<T::Native>,
+{
+    let arr_a = create_primitive_array::<T>(size, null_density);
+
+    c.bench_function(&format!("sum {size} {description}"), |b| {
+        b.iter(|| bench_sum(&arr_a))
+    });
+    c.bench_function(&format!("min {size} {description}"), |b| {
+        b.iter(|| bench_min(&arr_a))
+    });
+    c.bench_function(&format!("max {size} {description}"), |b| {
+        b.iter(|| bench_max(&arr_a))
+    });
+}
+
 fn add_benchmark(c: &mut Criterion) {
-    let arr_a = create_primitive_array::<Float32Type>(512, 0.0);
+    sum_min_max_bench::<UInt8Type>(c, 512, 0.0, "u8 no nulls");
+    sum_min_max_bench::<UInt8Type>(c, 512, 0.5, "u8 50% nulls");
+
+    sum_min_max_bench::<UInt16Type>(c, 512, 0.0, "u16 no nulls");
+    sum_min_max_bench::<UInt16Type>(c, 512, 0.5, "u16 50% nulls");
+
+    sum_min_max_bench::<UInt32Type>(c, 512, 0.0, "u32 no nulls");
+    sum_min_max_bench::<UInt32Type>(c, 512, 0.5, "u32 50% nulls");
+
+    sum_min_max_bench::<UInt64Type>(c, 512, 0.0, "u64 no nulls");
+    sum_min_max_bench::<UInt64Type>(c, 512, 0.5, "u64 50% nulls");
 
-    c.bench_function("sum 512", |b| b.iter(|| bench_sum(&arr_a)));
-    c.bench_function("min 512", |b| b.iter(|| bench_min(&arr_a)));
-    c.bench_function("max 512", |b| b.iter(|| bench_max(&arr_a)));
+    sum_min_max_bench::<TimestampMillisecondType>(c, 512, 0.0, "ts_millis no 
nulls");
+    sum_min_max_bench::<TimestampMillisecondType>(c, 512, 0.5, "ts_millis 50% 
nulls");

Review Comment:
   FWIW, arithmetic on timestamps, as this benchmark does, is not especially 
meaningful: adding two timestamps doesn't yield a timestamp. 
`DurationMillisecondType` might be a more meaningful choice.



##########
arrow-arith/src/aggregate.rs:
##########
@@ -285,44 +285,178 @@ where
         return None;
     }
 
-    let data: &[T::Native] = array.values();
+    fn sum_impl_integer<T>(array: &PrimitiveArray<T>) -> Option<T::Native>

Review Comment:
   FWIW if you changed the signature to
   
   ```
   fn sum_impl_integer<T: ArrowNativeType>(values: &[T], nulls: Option<&NullBuffer>) -> Option<T>
   ```
   
   It would potentially save on codegen, as it would be instantiated per native 
type rather than per primitive type.



##########
arrow-arith/src/aggregate.rs:
##########
@@ -285,44 +285,178 @@ where
         return None;
     }
 
-    let data: &[T::Native] = array.values();
+    fn sum_impl_integer<T>(array: &PrimitiveArray<T>) -> Option<T::Native>
+    where
+        T: ArrowNumericType,
+        T::Native: ArrowNativeTypeOp,
+    {
+        let data: &[T::Native] = array.values();
 
-    match array.nulls() {
-        None => {
-            let sum = data.iter().fold(T::default_value(), |accumulator, 
value| {
-                accumulator.add_wrapping(*value)
-            });
+        match array.nulls() {
+            None => {
+                let sum = data.iter().fold(T::default_value(), |accumulator, 
value| {
+                    accumulator.add_wrapping(*value)
+                });
 
-            Some(sum)
+                Some(sum)
+            }
+            Some(nulls) => {
+                let mut sum = T::default_value();
+                let data_chunks = data.chunks_exact(64);
+                let remainder = data_chunks.remainder();
+
+                let bit_chunks = nulls.inner().bit_chunks();
+                data_chunks
+                    .zip(bit_chunks.iter())
+                    .for_each(|(chunk, mask)| {
+                        // index_mask has value 1 << i in the loop
+                        let mut index_mask = 1;
+                        chunk.iter().for_each(|value| {
+                            if (mask & index_mask) != 0 {
+                                sum = sum.add_wrapping(*value);
+                            }
+                            index_mask <<= 1;
+                        });
+                    });
+
+                let remainder_bits = bit_chunks.remainder_bits();
+
+                remainder.iter().enumerate().for_each(|(i, value)| {
+                    if remainder_bits & (1 << i) != 0 {
+                        sum = sum.add_wrapping(*value);
+                    }
+                });
+
+                Some(sum)
+            }
         }
-        Some(nulls) => {
-            let mut sum = T::default_value();
-            let data_chunks = data.chunks_exact(64);
-            let remainder = data_chunks.remainder();
-
-            let bit_chunks = nulls.inner().bit_chunks();
-            data_chunks
-                .zip(bit_chunks.iter())
-                .for_each(|(chunk, mask)| {
-                    // index_mask has value 1 << i in the loop
-                    let mut index_mask = 1;
-                    chunk.iter().for_each(|value| {
-                        if (mask & index_mask) != 0 {
-                            sum = sum.add_wrapping(*value);
+    }
+
+    fn sum_impl_floating<T, const LANES: usize>(

Review Comment:
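   Same comment as on `sum_impl_integer`: taking the values slice and the 
optional null buffer directly would let this be instantiated per native type 
rather than per primitive type.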



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow-rs] tustvold commented on a diff in pull request #4560: Restructure `sum` for better auto-vectorization for floats

Reply via email to