This is an automated email from the ASF dual-hosted git repository.
nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new fb697ce43 Remove simd and avx512 bitwise kernels in favor of
autovectorization (#1830)
fb697ce43 is described below
commit fb697ce4351fae39ebac810508ecc31583c6cdd7
Author: Jörn Horstmann <[email protected]>
AuthorDate: Sun Jun 12 19:09:02 2022 +0200
Remove simd and avx512 bitwise kernels in favor of autovectorization (#1830)
* Remove simd and avx512 bitwise kernels since they are actually slightly
slower than the autovectorized version
* Add notes about target-cpu to README
---
arrow/Cargo.toml | 1 -
arrow/README.md | 14 ++
arrow/benches/buffer_bit_ops.rs | 61 ++++++--
arrow/src/arch/avx512.rs | 73 ----------
arrow/src/arch/mod.rs | 22 ---
arrow/src/buffer/ops.rs | 307 +---------------------------------------
arrow/src/lib.rs | 4 -
7 files changed, 69 insertions(+), 413 deletions(-)
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index ebcdd9e7a..3f69888d5 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -61,7 +61,6 @@ bitflags = "1.2.1"
[features]
default = ["csv", "ipc", "test_utils"]
-avx512 = []
csv = ["csv_crate"]
ipc = ["flatbuffers"]
simd = ["packed_simd"]
diff --git a/arrow/README.md b/arrow/README.md
index 67de57ff0..28240e77d 100644
--- a/arrow/README.md
+++ b/arrow/README.md
@@ -100,3 +100,17 @@ cargo run --example read_csv
```
[arrow]: https://arrow.apache.org/
+
+
+## Performance
+
+Most of the compute kernels benefit a lot from being optimized for a specific CPU target.
+This is especially true on x86-64, since without a specified target the compiler can only assume support for SSE2 vector instructions.
+Passing one of the following values as `-Ctarget-cpu=value` in `RUSTFLAGS` can therefore improve performance significantly:
+
+ - `native`: Target the exact features of the CPU that the build is running on.
+ This should give the best performance when building and running locally, but should be used carefully, for example when building in a CI pipeline or when shipping pre-compiled software.
+ - `x86-64-v3`: Includes AVX2 support and is close to the Intel `haswell` architecture released in 2013; it should be supported by any recent Intel or AMD CPU.
+ - `x86-64-v4`: Includes AVX512 support, available on Intel `skylake` server and `icelake`/`tigerlake`/`rocketlake` laptop and desktop processors.
+
+These flags should be used in addition to the `simd` feature, since they will also affect the code generated by the SIMD library.
\ No newline at end of file
diff --git a/arrow/benches/buffer_bit_ops.rs b/arrow/benches/buffer_bit_ops.rs
index 063f39c92..6c6bb0463 100644
--- a/arrow/benches/buffer_bit_ops.rs
+++ b/arrow/benches/buffer_bit_ops.rs
@@ -17,11 +17,14 @@
#[macro_use]
extern crate criterion;
-use criterion::Criterion;
+
+use criterion::{Criterion, Throughput};
extern crate arrow;
-use arrow::buffer::{Buffer, MutableBuffer};
+use arrow::buffer::{
+ buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer,
+};
/// Helper function to create arrays
fn create_buffer(size: usize) -> Buffer {
@@ -42,17 +45,59 @@ fn bench_buffer_or(left: &Buffer, right: &Buffer) {
criterion::black_box((left | right).unwrap());
}
+fn bench_buffer_not(buffer: &Buffer) {
+ criterion::black_box(!buffer);
+}
+
+fn bench_buffer_and_with_offsets(
+ left: &Buffer,
+ left_offset: usize,
+ right: &Buffer,
+ right_offset: usize,
+ len: usize,
+) {
+ criterion::black_box(buffer_bin_and(left, left_offset, right,
right_offset, len));
+}
+
+fn bench_buffer_or_with_offsets(
+ left: &Buffer,
+ left_offset: usize,
+ right: &Buffer,
+ right_offset: usize,
+ len: usize,
+) {
+ criterion::black_box(buffer_bin_or(left, left_offset, right, right_offset,
len));
+}
+
+fn bench_buffer_not_with_offsets(buffer: &Buffer, offset: usize, len: usize) {
+ criterion::black_box(buffer_unary_not(buffer, offset, len));
+}
+
fn bit_ops_benchmark(c: &mut Criterion) {
let left = create_buffer(512 * 10);
let right = create_buffer(512 * 10);
- c.bench_function("buffer_bit_ops and", |b| {
- b.iter(|| bench_buffer_and(&left, &right))
- });
+ c.benchmark_group("buffer_binary_ops")
+ .throughput(Throughput::Bytes(3 * left.len() as u64))
+ .bench_function("and", |b| b.iter(|| bench_buffer_and(&left, &right)))
+ .bench_function("or", |b| b.iter(|| bench_buffer_or(&left, &right)))
+ .bench_function("and_with_offset", |b| {
+ b.iter(|| {
+ bench_buffer_and_with_offsets(&left, 1, &right, 2, left.len()
* 8 - 5)
+ })
+ })
+ .bench_function("or_with_offset", |b| {
+ b.iter(|| {
+ bench_buffer_or_with_offsets(&left, 1, &right, 2, left.len() *
8 - 5)
+ })
+ });
- c.bench_function("buffer_bit_ops or", |b| {
- b.iter(|| bench_buffer_or(&left, &right))
- });
+ c.benchmark_group("buffer_unary_ops")
+ .throughput(Throughput::Bytes(2 * left.len() as u64))
+ .bench_function("not", |b| b.iter(|| bench_buffer_not(&left)))
+ .bench_function("not_with_offset", |b| {
+ b.iter(|| bench_buffer_not_with_offsets(&left, 1, left.len() * 8 -
5))
+ });
}
criterion_group!(benches, bit_ops_benchmark);
diff --git a/arrow/src/arch/avx512.rs b/arrow/src/arch/avx512.rs
deleted file mode 100644
index 264532f35..000000000
--- a/arrow/src/arch/avx512.rs
+++ /dev/null
@@ -1,73 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-pub(crate) const AVX512_U8X64_LANES: usize = 64;
-
-#[target_feature(enable = "avx512f")]
-pub(crate) unsafe fn avx512_bin_and(left: &[u8], right: &[u8], res: &mut [u8])
{
- use core::arch::x86_64::{__m512i, _mm512_and_si512, _mm512_loadu_epi64};
-
- let l: __m512i = _mm512_loadu_epi64(left.as_ptr() as *const _);
- let r: __m512i = _mm512_loadu_epi64(right.as_ptr() as *const _);
- let f = _mm512_and_si512(l, r);
- let s = &f as *const __m512i as *const u8;
- let d = res.get_unchecked_mut(0) as *mut _ as *mut u8;
- std::ptr::copy_nonoverlapping(s, d, std::mem::size_of::<__m512i>());
-}
-
-#[target_feature(enable = "avx512f")]
-pub(crate) unsafe fn avx512_bin_or(left: &[u8], right: &[u8], res: &mut [u8]) {
- use core::arch::x86_64::{__m512i, _mm512_loadu_epi64, _mm512_or_si512};
-
- let l: __m512i = _mm512_loadu_epi64(left.as_ptr() as *const _);
- let r: __m512i = _mm512_loadu_epi64(right.as_ptr() as *const _);
- let f = _mm512_or_si512(l, r);
- let s = &f as *const __m512i as *const u8;
- let d = res.get_unchecked_mut(0) as *mut _ as *mut u8;
- std::ptr::copy_nonoverlapping(s, d, std::mem::size_of::<__m512i>());
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn test_bitwise_and_avx512() {
- let buf1 = [0b00110011u8; 64];
- let buf2 = [0b11110000u8; 64];
- let mut buf3 = [0b00000000; 64];
- unsafe {
- avx512_bin_and(&buf1, &buf2, &mut buf3);
- };
- for i in buf3.iter() {
- assert_eq!(&0b00110000u8, i);
- }
- }
-
- #[test]
- fn test_bitwise_or_avx512() {
- let buf1 = [0b00010011u8; 64];
- let buf2 = [0b11100000u8; 64];
- let mut buf3 = [0b00000000; 64];
- unsafe {
- avx512_bin_or(&buf1, &buf2, &mut buf3);
- };
- for i in buf3.iter() {
- assert_eq!(&0b11110011u8, i);
- }
- }
-}
diff --git a/arrow/src/arch/mod.rs b/arrow/src/arch/mod.rs
deleted file mode 100644
index 56d8f4c0e..000000000
--- a/arrow/src/arch/mod.rs
+++ /dev/null
@@ -1,22 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-///
-/// Arch module contains architecture specific code.
-/// Be aware that not all machines have these specific operations available.
-#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
-pub(crate) mod avx512;
diff --git a/arrow/src/buffer/ops.rs b/arrow/src/buffer/ops.rs
index e0086a1a8..b3571d174 100644
--- a/arrow/src/buffer/ops.rs
+++ b/arrow/src/buffer/ops.rs
@@ -15,110 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-#[cfg(feature = "simd")]
-use crate::util::bit_util;
-#[cfg(feature = "simd")]
-use packed_simd::u8x64;
-
-#[cfg(feature = "avx512")]
-use crate::arch::avx512::*;
-use crate::util::bit_util::ceil;
-#[cfg(any(feature = "simd", feature = "avx512"))]
-use std::borrow::BorrowMut;
-
use super::{Buffer, MutableBuffer};
-
-/// Apply a bitwise operation `simd_op` / `scalar_op` to two inputs using simd
instructions and return the result as a Buffer.
-/// The `simd_op` functions gets applied on chunks of 64 bytes (512 bits) at a
time
-/// and the `scalar_op` gets applied to remaining bytes.
-/// Contrary to the non-simd version `bitwise_bin_op_helper`, the offset and
length is specified in bytes
-/// and this version does not support operations starting at arbitrary bit
offsets.
-#[cfg(feature = "simd")]
-pub fn bitwise_bin_op_simd_helper<SI, SC>(
- left: &Buffer,
- left_offset: usize,
- right: &Buffer,
- right_offset: usize,
- len: usize,
- simd_op: SI,
- scalar_op: SC,
-) -> Buffer
-where
- SI: Fn(u8x64, u8x64) -> u8x64,
- SC: Fn(u8, u8) -> u8,
-{
- let mut result = MutableBuffer::new(len).with_bitset(len, false);
- let lanes = u8x64::lanes();
-
- let mut left_chunks = left.as_slice()[left_offset..].chunks_exact(lanes);
- let mut right_chunks =
right.as_slice()[right_offset..].chunks_exact(lanes);
- let mut result_chunks = result.as_slice_mut().chunks_exact_mut(lanes);
-
- result_chunks
- .borrow_mut()
- .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut()))
- .for_each(|(res, (left, right))| {
- unsafe { bit_util::bitwise_bin_op_simd(&left, &right, res,
&simd_op) };
- });
-
- result_chunks
- .into_remainder()
- .iter_mut()
- .zip(
- left_chunks
- .remainder()
- .iter()
- .zip(right_chunks.remainder().iter()),
- )
- .for_each(|(res, (left, right))| {
- *res = scalar_op(*left, *right);
- });
-
- result.into()
-}
-
-/// Apply a bitwise operation `simd_op` / `scalar_op` to one input using simd
instructions and return the result as a Buffer.
-/// The `simd_op` functions gets applied on chunks of 64 bytes (512 bits) at a
time
-/// and the `scalar_op` gets applied to remaining bytes.
-/// Contrary to the non-simd version `bitwise_unary_op_helper`, the offset and
length is specified in bytes
-/// and this version does not support operations starting at arbitrary bit
offsets.
-#[cfg(feature = "simd")]
-pub fn bitwise_unary_op_simd_helper<SI, SC>(
- left: &Buffer,
- left_offset: usize,
- len: usize,
- simd_op: SI,
- scalar_op: SC,
-) -> Buffer
-where
- SI: Fn(u8x64) -> u8x64,
- SC: Fn(u8) -> u8,
-{
- let mut result = MutableBuffer::new(len).with_bitset(len, false);
- let lanes = u8x64::lanes();
-
- let mut left_chunks = left.as_slice()[left_offset..].chunks_exact(lanes);
- let mut result_chunks = result.as_slice_mut().chunks_exact_mut(lanes);
-
- result_chunks
- .borrow_mut()
- .zip(left_chunks.borrow_mut())
- .for_each(|(res, left)| unsafe {
- let data_simd = u8x64::from_slice_unaligned_unchecked(left);
- let simd_result = simd_op(data_simd);
- simd_result.write_to_slice_unaligned_unchecked(res);
- });
-
- result_chunks
- .into_remainder()
- .iter_mut()
- .zip(left_chunks.remainder().iter())
- .for_each(|(res, left)| {
- *res = scalar_op(*left);
- });
-
- result.into()
-}
+use crate::util::bit_util::ceil;
/// Apply a bitwise operation `op` to two inputs and return the result as a
Buffer.
/// The inputs are treated as bitmaps, meaning that offsets and length are
specified in number of bits.
@@ -189,100 +87,6 @@ where
result.into()
}
-#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
-pub fn buffer_bin_and(
- left: &Buffer,
- left_offset_in_bits: usize,
- right: &Buffer,
- right_offset_in_bits: usize,
- len_in_bits: usize,
-) -> Buffer {
- if left_offset_in_bits % 8 == 0
- && right_offset_in_bits % 8 == 0
- && len_in_bits % 8 == 0
- {
- let len = len_in_bits / 8;
- let left_offset = left_offset_in_bits / 8;
- let right_offset = right_offset_in_bits / 8;
-
- let mut result = MutableBuffer::new(len).with_bitset(len, false);
-
- let mut left_chunks =
- left.as_slice()[left_offset..].chunks_exact(AVX512_U8X64_LANES);
- let mut right_chunks =
- right.as_slice()[right_offset..].chunks_exact(AVX512_U8X64_LANES);
- let mut result_chunks =
- result.as_slice_mut().chunks_exact_mut(AVX512_U8X64_LANES);
-
- result_chunks
- .borrow_mut()
- .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut()))
- .for_each(|(res, (left, right))| unsafe {
- avx512_bin_and(left, right, res);
- });
-
- result_chunks
- .into_remainder()
- .iter_mut()
- .zip(
- left_chunks
- .remainder()
- .iter()
- .zip(right_chunks.remainder().iter()),
- )
- .for_each(|(res, (left, right))| {
- *res = *left & *right;
- });
-
- result.into()
- } else {
- bitwise_bin_op_helper(
- &left,
- left_offset_in_bits,
- right,
- right_offset_in_bits,
- len_in_bits,
- |a, b| a & b,
- )
- }
-}
-
-#[cfg(all(feature = "simd", not(feature = "avx512")))]
-pub fn buffer_bin_and(
- left: &Buffer,
- left_offset_in_bits: usize,
- right: &Buffer,
- right_offset_in_bits: usize,
- len_in_bits: usize,
-) -> Buffer {
- if left_offset_in_bits % 8 == 0
- && right_offset_in_bits % 8 == 0
- && len_in_bits % 8 == 0
- {
- bitwise_bin_op_simd_helper(
- &left,
- left_offset_in_bits / 8,
- &right,
- right_offset_in_bits / 8,
- len_in_bits / 8,
- |a, b| a & b,
- |a, b| a & b,
- )
- } else {
- bitwise_bin_op_helper(
- &left,
- left_offset_in_bits,
- right,
- right_offset_in_bits,
- len_in_bits,
- |a, b| a & b,
- )
- }
-}
-
-// Note: do not target specific features like x86 without considering
-// other targets like wasm32, as those would fail to build
-#[cfg(all(not(any(feature = "simd", feature = "avx512"))))]
pub fn buffer_bin_and(
left: &Buffer,
left_offset_in_bits: usize,
@@ -300,98 +104,6 @@ pub fn buffer_bin_and(
)
}
-#[cfg(all(target_arch = "x86_64", feature = "avx512"))]
-pub fn buffer_bin_or(
- left: &Buffer,
- left_offset_in_bits: usize,
- right: &Buffer,
- right_offset_in_bits: usize,
- len_in_bits: usize,
-) -> Buffer {
- if left_offset_in_bits % 8 == 0
- && right_offset_in_bits % 8 == 0
- && len_in_bits % 8 == 0
- {
- let len = len_in_bits / 8;
- let left_offset = left_offset_in_bits / 8;
- let right_offset = right_offset_in_bits / 8;
-
- let mut result = MutableBuffer::new(len).with_bitset(len, false);
-
- let mut left_chunks =
- left.as_slice()[left_offset..].chunks_exact(AVX512_U8X64_LANES);
- let mut right_chunks =
- right.as_slice()[right_offset..].chunks_exact(AVX512_U8X64_LANES);
- let mut result_chunks =
- result.as_slice_mut().chunks_exact_mut(AVX512_U8X64_LANES);
-
- result_chunks
- .borrow_mut()
- .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut()))
- .for_each(|(res, (left, right))| unsafe {
- avx512_bin_or(left, right, res);
- });
-
- result_chunks
- .into_remainder()
- .iter_mut()
- .zip(
- left_chunks
- .remainder()
- .iter()
- .zip(right_chunks.remainder().iter()),
- )
- .for_each(|(res, (left, right))| {
- *res = *left | *right;
- });
-
- result.into()
- } else {
- bitwise_bin_op_helper(
- &left,
- left_offset_in_bits,
- right,
- right_offset_in_bits,
- len_in_bits,
- |a, b| a | b,
- )
- }
-}
-
-#[cfg(all(feature = "simd", not(feature = "avx512")))]
-pub fn buffer_bin_or(
- left: &Buffer,
- left_offset_in_bits: usize,
- right: &Buffer,
- right_offset_in_bits: usize,
- len_in_bits: usize,
-) -> Buffer {
- if left_offset_in_bits % 8 == 0
- && right_offset_in_bits % 8 == 0
- && len_in_bits % 8 == 0
- {
- bitwise_bin_op_simd_helper(
- &left,
- left_offset_in_bits / 8,
- &right,
- right_offset_in_bits / 8,
- len_in_bits / 8,
- |a, b| a | b,
- |a, b| a | b,
- )
- } else {
- bitwise_bin_op_helper(
- &left,
- left_offset_in_bits,
- right,
- right_offset_in_bits,
- len_in_bits,
- |a, b| a | b,
- )
- }
-}
-
-#[cfg(all(not(any(feature = "simd", feature = "avx512"))))]
pub fn buffer_bin_or(
left: &Buffer,
left_offset_in_bits: usize,
@@ -414,20 +126,5 @@ pub fn buffer_unary_not(
offset_in_bits: usize,
len_in_bits: usize,
) -> Buffer {
- // SIMD implementation if available and byte-aligned
- #[cfg(feature = "simd")]
- if offset_in_bits % 8 == 0 && len_in_bits % 8 == 0 {
- return bitwise_unary_op_simd_helper(
- &left,
- offset_in_bits / 8,
- len_in_bits / 8,
- |a| !a,
- |a| !a,
- );
- }
- // Default implementation
- #[allow(unreachable_code)]
- {
- bitwise_unary_op_helper(left, offset_in_bits, len_in_bits, |a| !a)
- }
+ bitwise_unary_op_helper(left, offset_in_bits, len_in_bits, |a| !a)
}
diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs
index 0cb77a360..95c69ca0b 100644
--- a/arrow/src/lib.rs
+++ b/arrow/src/lib.rs
@@ -225,14 +225,10 @@
//! [issue tracker]: https://github.com/apache/arrow-rs/issues
//!
-#![cfg_attr(feature = "avx512", feature(stdsimd))]
-#![cfg_attr(feature = "avx512", feature(repr_simd))]
-#![cfg_attr(feature = "avx512", feature(avx512_target_feature))]
#![deny(clippy::redundant_clone)]
#![warn(missing_debug_implementations)]
pub mod alloc;
-mod arch;
pub mod array;
pub mod bitmap;
pub mod buffer;