[arrow-rs] branch master updated: Remove simd and avx512 bitwise kernels in favor of autovectorization (#1830)

nevime Sun, 12 Jun 2022 10:09:17 -0700

This is an automated email from the ASF dual-hosted git repository.

nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/master by this push:
     new fb697ce43 Remove simd and avx512 bitwise kernels in favor of 
autovectorization (#1830)
fb697ce43 is described below

commit fb697ce4351fae39ebac810508ecc31583c6cdd7
Author: Jörn Horstmann <[email protected]>
AuthorDate: Sun Jun 12 19:09:02 2022 +0200

    Remove simd and avx512 bitwise kernels in favor of autovectorization (#1830)
    
    * Remove simd and avx512 bitwise kernels since they are actually slightly 
slower than the autovectorized version
    
    * Add notes about target-cpu to README
---
 arrow/Cargo.toml                |   1 -
 arrow/README.md                 |  14 ++
 arrow/benches/buffer_bit_ops.rs |  61 ++++++--
 arrow/src/arch/avx512.rs        |  73 ----------
 arrow/src/arch/mod.rs           |  22 ---
 arrow/src/buffer/ops.rs         | 307 +---------------------------------------
 arrow/src/lib.rs                |   4 -
 7 files changed, 69 insertions(+), 413 deletions(-)

diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index ebcdd9e7a..3f69888d5 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -61,7 +61,6 @@ bitflags = "1.2.1"
 
 [features]
 default = ["csv", "ipc", "test_utils"]
-avx512 = []
 csv = ["csv_crate"]
 ipc = ["flatbuffers"]
 simd = ["packed_simd"]
diff --git a/arrow/README.md b/arrow/README.md
index 67de57ff0..28240e77d 100644
--- a/arrow/README.md
+++ b/arrow/README.md
@@ -100,3 +100,17 @@ cargo run --example read_csv
 ```
 
 [arrow]: https://arrow.apache.org/
+
+
+## Performance
+
+Most of the compute kernels benefit a lot from being optimized for a specific 
CPU target.
+This is especially so on x86-64 since without specifying a target the compiler 
can only assume support for SSE2 vector instructions.
+Passing one of the following values as `-Ctarget-cpu=value` in `RUSTFLAGS` can 
therefore improve performance significantly:
+
+ - `native`: Target the exact features of the CPU that the build is running on.
+   This should give the best performance when building and running locally, 
but should be used carefully, for example when building in a CI pipeline or when 
shipping pre-compiled software, since the resulting binary may not run on other CPUs.
+ - `x86-64-v3`: Includes AVX2 support and is close to the Intel `haswell` 
architecture released in 2013; it should be supported by any recent Intel or 
AMD CPU.
+ - `x86-64-v4`: Includes AVX512 support, available on Intel `skylake` server 
and `icelake`/`tigerlake`/`rocketlake` laptop and desktop processors.
+
+These flags should be used in addition to the `simd` feature, since they will 
also affect the code generated by the simd library. 
\ No newline at end of file
diff --git a/arrow/benches/buffer_bit_ops.rs b/arrow/benches/buffer_bit_ops.rs
index 063f39c92..6c6bb0463 100644
--- a/arrow/benches/buffer_bit_ops.rs
+++ b/arrow/benches/buffer_bit_ops.rs
@@ -17,11 +17,14 @@
 
 #[macro_use]
 extern crate criterion;
-use criterion::Criterion;
+
+use criterion::{Criterion, Throughput};
 
 extern crate arrow;
 
-use arrow::buffer::{Buffer, MutableBuffer};
+use arrow::buffer::{
+    buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer,
+};
 
 ///  Helper function to create arrays
 fn create_buffer(size: usize) -> Buffer {
@@ -42,17 +45,59 @@ fn bench_buffer_or(left: &Buffer, right: &Buffer) {
     criterion::black_box((left | right).unwrap());
 }
 
+fn bench_buffer_not(buffer: &Buffer) {
+    criterion::black_box(!buffer);
+}
+
+fn bench_buffer_and_with_offsets(
+    left: &Buffer,
+    left_offset: usize,
+    right: &Buffer,
+    right_offset: usize,
+    len: usize,
+) {
+    criterion::black_box(buffer_bin_and(left, left_offset, right, 
right_offset, len));
+}
+
+fn bench_buffer_or_with_offsets(
+    left: &Buffer,
+    left_offset: usize,
+    right: &Buffer,
+    right_offset: usize,
+    len: usize,
+) {
+    criterion::black_box(buffer_bin_or(left, left_offset, right, right_offset, 
len));
+}
+
+fn bench_buffer_not_with_offsets(buffer: &Buffer, offset: usize, len: usize) {
+    criterion::black_box(buffer_unary_not(buffer, offset, len));
+}
+
 fn bit_ops_benchmark(c: &mut Criterion) {
     let left = create_buffer(512 * 10);
     let right = create_buffer(512 * 10);
 
-    c.bench_function("buffer_bit_ops and", |b| {
-        b.iter(|| bench_buffer_and(&left, &right))
-    });
+    c.benchmark_group("buffer_binary_ops")
+        .throughput(Throughput::Bytes(3 * left.len() as u64))
+        .bench_function("and", |b| b.iter(|| bench_buffer_and(&left, &right)))
+        .bench_function("or", |b| b.iter(|| bench_buffer_or(&left, &right)))
+        .bench_function("and_with_offset", |b| {
+            b.iter(|| {
+                bench_buffer_and_with_offsets(&left, 1, &right, 2, left.len() 
* 8 - 5)
+            })
+        })
+        .bench_function("or_with_offset", |b| {
+            b.iter(|| {
+                bench_buffer_or_with_offsets(&left, 1, &right, 2, left.len() * 
8 - 5)
+            })
+        });
 
-    c.bench_function("buffer_bit_ops or", |b| {
-        b.iter(|| bench_buffer_or(&left, &right))
-    });
+    c.benchmark_group("buffer_unary_ops")
+        .throughput(Throughput::Bytes(2 * left.len() as u64))
+        .bench_function("not", |b| b.iter(|| bench_buffer_not(&left)))
+        .bench_function("not_with_offset", |b| {
+            b.iter(|| bench_buffer_not_with_offsets(&left, 1, left.len() * 8 - 
5))
+        });
 }
 
 criterion_group!(benches, bit_ops_benchmark);
diff --git a/arrow/src/arch/avx512.rs b/arrow/src/arch/avx512.rs
deleted file mode 100644
index 264532f35..000000000
--- a/arrow/src/arch/avx512.rs
+++ /dev/null
@@ -1,73 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-pub(crate) const AVX512_U8X64_LANES: usize = 64;
-
-#[target_feature(enable = "avx512f")]
-pub(crate) unsafe fn avx512_bin_and(left: &[u8], right: &[u8], res: &mut [u8]) 
{
-    use core::arch::x86_64::{__m512i, _mm512_and_si512, _mm512_loadu_epi64};
-
-    let l: __m512i = _mm512_loadu_epi64(left.as_ptr() as *const _);
-    let r: __m512i = _mm512_loadu_epi64(right.as_ptr() as *const _);
-    let f = _mm512_and_si512(l, r);
-    let s = &f as *const __m512i as *const u8;
-    let d = res.get_unchecked_mut(0) as *mut _ as *mut u8;
-    std::ptr::copy_nonoverlapping(s, d, std::mem::size_of::<__m512i>());
-}
-
-#[target_feature(enable = "avx512f")]
-pub(crate) unsafe fn avx512_bin_or(left: &[u8], right: &[u8], res: &mut [u8]) {
-    use core::arch::x86_64::{__m512i, _mm512_loadu_epi64, _mm512_or_si512};
-
-    let l: __m512i = _mm512_loadu_epi64(left.as_ptr() as *const _);
-    let r: __m512i = _mm512_loadu_epi64(right.as_ptr() as *const _);
-    let f = _mm512_or_si512(l, r);
-    let s = &f as *const __m512i as *const u8;
-    let d = res.get_unchecked_mut(0) as *mut _ as *mut u8;
-    std::ptr::copy_nonoverlapping(s, d, std::mem::size_of::<__m512i>());
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_bitwise_and_avx512() {
-        let buf1 = [0b00110011u8; 64];
-        let buf2 = [0b11110000u8; 64];
-        let mut buf3 = [0b00000000; 64];
-        unsafe {
-            avx512_bin_and(&buf1, &buf2, &mut buf3);
-        };
-        for i in buf3.iter() {
-            assert_eq!(&0b00110000u8, i);
-        }
-    }
-
-    #[test]
-    fn test_bitwise_or_avx512() {
-        let buf1 = [0b00010011u8; 64];
-        let buf2 = [0b11100000u8; 64];
-        let mut buf3 = [0b00000000; 64];
-        unsafe {
-            avx512_bin_or(&buf1, &buf2, &mut buf3);
-        };
-        for i in buf3.iter() {
-            assert_eq!(&0b11110011u8, i);
-        }
-    }
-}
diff --git a/arrow/src/arch/mod.rs b/arrow/src/arch/mod.rs
deleted file mode 100644
index 56d8f4c0e..000000000
--- a/arrow/src/arch/mod.rs
+++ /dev/null
@@ -1,22 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-///
-/// Arch module contains architecture specific code.
-/// Be aware that not all machines have these specific operations available.
-#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
-pub(crate) mod avx512;
diff --git a/arrow/src/buffer/ops.rs b/arrow/src/buffer/ops.rs
index e0086a1a8..b3571d174 100644
--- a/arrow/src/buffer/ops.rs
+++ b/arrow/src/buffer/ops.rs
@@ -15,110 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#[cfg(feature = "simd")]
-use crate::util::bit_util;
-#[cfg(feature = "simd")]
-use packed_simd::u8x64;
-
-#[cfg(feature = "avx512")]
-use crate::arch::avx512::*;
-use crate::util::bit_util::ceil;
-#[cfg(any(feature = "simd", feature = "avx512"))]
-use std::borrow::BorrowMut;
-
 use super::{Buffer, MutableBuffer};
-
-/// Apply a bitwise operation `simd_op` / `scalar_op` to two inputs using simd 
instructions and return the result as a Buffer.
-/// The `simd_op` functions gets applied on chunks of 64 bytes (512 bits) at a 
time
-/// and the `scalar_op` gets applied to remaining bytes.
-/// Contrary to the non-simd version `bitwise_bin_op_helper`, the offset and 
length is specified in bytes
-/// and this version does not support operations starting at arbitrary bit 
offsets.
-#[cfg(feature = "simd")]
-pub fn bitwise_bin_op_simd_helper<SI, SC>(
-    left: &Buffer,
-    left_offset: usize,
-    right: &Buffer,
-    right_offset: usize,
-    len: usize,
-    simd_op: SI,
-    scalar_op: SC,
-) -> Buffer
-where
-    SI: Fn(u8x64, u8x64) -> u8x64,
-    SC: Fn(u8, u8) -> u8,
-{
-    let mut result = MutableBuffer::new(len).with_bitset(len, false);
-    let lanes = u8x64::lanes();
-
-    let mut left_chunks = left.as_slice()[left_offset..].chunks_exact(lanes);
-    let mut right_chunks = 
right.as_slice()[right_offset..].chunks_exact(lanes);
-    let mut result_chunks = result.as_slice_mut().chunks_exact_mut(lanes);
-
-    result_chunks
-        .borrow_mut()
-        .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut()))
-        .for_each(|(res, (left, right))| {
-            unsafe { bit_util::bitwise_bin_op_simd(&left, &right, res, 
&simd_op) };
-        });
-
-    result_chunks
-        .into_remainder()
-        .iter_mut()
-        .zip(
-            left_chunks
-                .remainder()
-                .iter()
-                .zip(right_chunks.remainder().iter()),
-        )
-        .for_each(|(res, (left, right))| {
-            *res = scalar_op(*left, *right);
-        });
-
-    result.into()
-}
-
-/// Apply a bitwise operation `simd_op` / `scalar_op` to one input using simd 
instructions and return the result as a Buffer.
-/// The `simd_op` functions gets applied on chunks of 64 bytes (512 bits) at a 
time
-/// and the `scalar_op` gets applied to remaining bytes.
-/// Contrary to the non-simd version `bitwise_unary_op_helper`, the offset and 
length is specified in bytes
-/// and this version does not support operations starting at arbitrary bit 
offsets.
-#[cfg(feature = "simd")]
-pub fn bitwise_unary_op_simd_helper<SI, SC>(
-    left: &Buffer,
-    left_offset: usize,
-    len: usize,
-    simd_op: SI,
-    scalar_op: SC,
-) -> Buffer
-where
-    SI: Fn(u8x64) -> u8x64,
-    SC: Fn(u8) -> u8,
-{
-    let mut result = MutableBuffer::new(len).with_bitset(len, false);
-    let lanes = u8x64::lanes();
-
-    let mut left_chunks = left.as_slice()[left_offset..].chunks_exact(lanes);
-    let mut result_chunks = result.as_slice_mut().chunks_exact_mut(lanes);
-
-    result_chunks
-        .borrow_mut()
-        .zip(left_chunks.borrow_mut())
-        .for_each(|(res, left)| unsafe {
-            let data_simd = u8x64::from_slice_unaligned_unchecked(left);
-            let simd_result = simd_op(data_simd);
-            simd_result.write_to_slice_unaligned_unchecked(res);
-        });
-
-    result_chunks
-        .into_remainder()
-        .iter_mut()
-        .zip(left_chunks.remainder().iter())
-        .for_each(|(res, left)| {
-            *res = scalar_op(*left);
-        });
-
-    result.into()
-}
+use crate::util::bit_util::ceil;
 
 /// Apply a bitwise operation `op` to two inputs and return the result as a 
Buffer.
 /// The inputs are treated as bitmaps, meaning that offsets and length are 
specified in number of bits.
@@ -189,100 +87,6 @@ where
     result.into()
 }
 
-#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
-pub fn buffer_bin_and(
-    left: &Buffer,
-    left_offset_in_bits: usize,
-    right: &Buffer,
-    right_offset_in_bits: usize,
-    len_in_bits: usize,
-) -> Buffer {
-    if left_offset_in_bits % 8 == 0
-        && right_offset_in_bits % 8 == 0
-        && len_in_bits % 8 == 0
-    {
-        let len = len_in_bits / 8;
-        let left_offset = left_offset_in_bits / 8;
-        let right_offset = right_offset_in_bits / 8;
-
-        let mut result = MutableBuffer::new(len).with_bitset(len, false);
-
-        let mut left_chunks =
-            left.as_slice()[left_offset..].chunks_exact(AVX512_U8X64_LANES);
-        let mut right_chunks =
-            right.as_slice()[right_offset..].chunks_exact(AVX512_U8X64_LANES);
-        let mut result_chunks =
-            result.as_slice_mut().chunks_exact_mut(AVX512_U8X64_LANES);
-
-        result_chunks
-            .borrow_mut()
-            .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut()))
-            .for_each(|(res, (left, right))| unsafe {
-                avx512_bin_and(left, right, res);
-            });
-
-        result_chunks
-            .into_remainder()
-            .iter_mut()
-            .zip(
-                left_chunks
-                    .remainder()
-                    .iter()
-                    .zip(right_chunks.remainder().iter()),
-            )
-            .for_each(|(res, (left, right))| {
-                *res = *left & *right;
-            });
-
-        result.into()
-    } else {
-        bitwise_bin_op_helper(
-            &left,
-            left_offset_in_bits,
-            right,
-            right_offset_in_bits,
-            len_in_bits,
-            |a, b| a & b,
-        )
-    }
-}
-
-#[cfg(all(feature = "simd", not(feature = "avx512")))]
-pub fn buffer_bin_and(
-    left: &Buffer,
-    left_offset_in_bits: usize,
-    right: &Buffer,
-    right_offset_in_bits: usize,
-    len_in_bits: usize,
-) -> Buffer {
-    if left_offset_in_bits % 8 == 0
-        && right_offset_in_bits % 8 == 0
-        && len_in_bits % 8 == 0
-    {
-        bitwise_bin_op_simd_helper(
-            &left,
-            left_offset_in_bits / 8,
-            &right,
-            right_offset_in_bits / 8,
-            len_in_bits / 8,
-            |a, b| a & b,
-            |a, b| a & b,
-        )
-    } else {
-        bitwise_bin_op_helper(
-            &left,
-            left_offset_in_bits,
-            right,
-            right_offset_in_bits,
-            len_in_bits,
-            |a, b| a & b,
-        )
-    }
-}
-
-// Note: do not target specific features like x86 without considering
-// other targets like wasm32, as those would fail to build
-#[cfg(all(not(any(feature = "simd", feature = "avx512"))))]
 pub fn buffer_bin_and(
     left: &Buffer,
     left_offset_in_bits: usize,
@@ -300,98 +104,6 @@ pub fn buffer_bin_and(
     )
 }
 
-#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
-pub fn buffer_bin_or(
-    left: &Buffer,
-    left_offset_in_bits: usize,
-    right: &Buffer,
-    right_offset_in_bits: usize,
-    len_in_bits: usize,
-) -> Buffer {
-    if left_offset_in_bits % 8 == 0
-        && right_offset_in_bits % 8 == 0
-        && len_in_bits % 8 == 0
-    {
-        let len = len_in_bits / 8;
-        let left_offset = left_offset_in_bits / 8;
-        let right_offset = right_offset_in_bits / 8;
-
-        let mut result = MutableBuffer::new(len).with_bitset(len, false);
-
-        let mut left_chunks =
-            left.as_slice()[left_offset..].chunks_exact(AVX512_U8X64_LANES);
-        let mut right_chunks =
-            right.as_slice()[right_offset..].chunks_exact(AVX512_U8X64_LANES);
-        let mut result_chunks =
-            result.as_slice_mut().chunks_exact_mut(AVX512_U8X64_LANES);
-
-        result_chunks
-            .borrow_mut()
-            .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut()))
-            .for_each(|(res, (left, right))| unsafe {
-                avx512_bin_or(left, right, res);
-            });
-
-        result_chunks
-            .into_remainder()
-            .iter_mut()
-            .zip(
-                left_chunks
-                    .remainder()
-                    .iter()
-                    .zip(right_chunks.remainder().iter()),
-            )
-            .for_each(|(res, (left, right))| {
-                *res = *left | *right;
-            });
-
-        result.into()
-    } else {
-        bitwise_bin_op_helper(
-            &left,
-            left_offset_in_bits,
-            right,
-            right_offset_in_bits,
-            len_in_bits,
-            |a, b| a | b,
-        )
-    }
-}
-
-#[cfg(all(feature = "simd", not(feature = "avx512")))]
-pub fn buffer_bin_or(
-    left: &Buffer,
-    left_offset_in_bits: usize,
-    right: &Buffer,
-    right_offset_in_bits: usize,
-    len_in_bits: usize,
-) -> Buffer {
-    if left_offset_in_bits % 8 == 0
-        && right_offset_in_bits % 8 == 0
-        && len_in_bits % 8 == 0
-    {
-        bitwise_bin_op_simd_helper(
-            &left,
-            left_offset_in_bits / 8,
-            &right,
-            right_offset_in_bits / 8,
-            len_in_bits / 8,
-            |a, b| a | b,
-            |a, b| a | b,
-        )
-    } else {
-        bitwise_bin_op_helper(
-            &left,
-            left_offset_in_bits,
-            right,
-            right_offset_in_bits,
-            len_in_bits,
-            |a, b| a | b,
-        )
-    }
-}
-
-#[cfg(all(not(any(feature = "simd", feature = "avx512"))))]
 pub fn buffer_bin_or(
     left: &Buffer,
     left_offset_in_bits: usize,
@@ -414,20 +126,5 @@ pub fn buffer_unary_not(
     offset_in_bits: usize,
     len_in_bits: usize,
 ) -> Buffer {
-    // SIMD implementation if available and byte-aligned
-    #[cfg(feature = "simd")]
-    if offset_in_bits % 8 == 0 && len_in_bits % 8 == 0 {
-        return bitwise_unary_op_simd_helper(
-            &left,
-            offset_in_bits / 8,
-            len_in_bits / 8,
-            |a| !a,
-            |a| !a,
-        );
-    }
-    // Default implementation
-    #[allow(unreachable_code)]
-    {
-        bitwise_unary_op_helper(left, offset_in_bits, len_in_bits, |a| !a)
-    }
+    bitwise_unary_op_helper(left, offset_in_bits, len_in_bits, |a| !a)
 }
diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs
index 0cb77a360..95c69ca0b 100644
--- a/arrow/src/lib.rs
+++ b/arrow/src/lib.rs
@@ -225,14 +225,10 @@
 //! [issue tracker]: https://github.com/apache/arrow-rs/issues
 //!
 
-#![cfg_attr(feature = "avx512", feature(stdsimd))]
-#![cfg_attr(feature = "avx512", feature(repr_simd))]
-#![cfg_attr(feature = "avx512", feature(avx512_target_feature))]
 #![deny(clippy::redundant_clone)]
 #![warn(missing_debug_implementations)]
 
 pub mod alloc;
-mod arch;
 pub mod array;
 pub mod bitmap;
 pub mod buffer;

[arrow-rs] branch master updated: Remove simd and avx512 bitwise kernels in favor of autovectorization (#1830)

Reply via email to