This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new f9e4cf5 Improve performance of dictionary kernels, add benchmark and
add `take_iter_unchecked` (#1372)
f9e4cf5 is described below
commit f9e4cf5777e16d9b1329017b1bb1d1e6c44334de
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Thu Mar 3 03:34:00 2022 -0800
Improve performance of dictionary kernels, add benchmark and add
`take_iter_unchecked` (#1372)
* Add benchmark and take_iter_unchecked.
* Add Safety section for clippy
* Update arrow/src/compute/kernels/comparison.rs
Co-authored-by: Andrew Lamb <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow/benches/comparison_kernels.rs | 21 ++++++++++++++++++++-
arrow/src/array/array_binary.rs | 11 +++++++++++
arrow/src/array/array_boolean.rs | 11 +++++++++++
arrow/src/array/array_primitive.rs | 11 +++++++++++
arrow/src/array/array_string.rs | 11 +++++++++++
arrow/src/compute/kernels/comparison.rs | 33 ++++++++++++++++++++-------------
6 files changed, 84 insertions(+), 14 deletions(-)
diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs
index cf9ccdd..4dced67 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -24,7 +24,7 @@ extern crate arrow;
use arrow::compute::*;
use arrow::datatypes::{ArrowNumericType, IntervalMonthDayNanoType};
use arrow::util::bench_util::*;
-use arrow::{array::*, datatypes::Float32Type};
+use arrow::{array::*, datatypes::Float32Type, datatypes::Int32Type};
fn bench_eq<T>(arr_a: &PrimitiveArray<T>, arr_b: &PrimitiveArray<T>)
where
@@ -133,6 +133,18 @@ fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) {
.unwrap();
}
+fn bench_dict_eq<T>(arr_a: &DictionaryArray<T>, arr_b: &DictionaryArray<T>)
+where
+ T: ArrowNumericType,
+{
+ cmp_dict_utf8::<T, i32, _>(
+ criterion::black_box(arr_a),
+ criterion::black_box(arr_b),
+ |a, b| a == b,
+ )
+ .unwrap();
+}
+
fn add_benchmark(c: &mut Criterion) {
let size = 65536;
let arr_a = create_primitive_array_with_seed::<Float32Type>(size, 0.0, 42);
@@ -249,6 +261,13 @@ fn add_benchmark(c: &mut Criterion) {
c.bench_function("egexp_matches_utf8 scalar ends with", |b| {
b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$"))
});
+
+ let dict_arr_a = create_string_dict_array::<Int32Type>(size, 0.0);
+ let dict_arr_b = create_string_dict_array::<Int32Type>(size, 0.0);
+
+ c.bench_function("dict eq string", |b| {
+ b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b))
+ });
}
criterion_group!(benches, add_benchmark);
diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs
index 40a5ee6..d9118dd 100644
--- a/arrow/src/array/array_binary.rs
+++ b/arrow/src/array/array_binary.rs
@@ -209,6 +209,17 @@ impl<OffsetSize: BinaryOffsetSizeTrait> GenericBinaryArray<OffsetSize> {
) -> impl Iterator<Item = Option<&[u8]>> + 'a {
indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
}
+
+ /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
+ /// # Safety
+ ///
+ /// caller must ensure that the offsets in the iterator are less than the array len()
+ pub unsafe fn take_iter_unchecked<'a>(
+ &'a self,
+ indexes: impl Iterator<Item = Option<usize>> + 'a,
+ ) -> impl Iterator<Item = Option<&[u8]>> + 'a {
+ indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
+ }
}
impl<'a, T: BinaryOffsetSizeTrait> GenericBinaryArray<T> {
diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs
index ca3bb2d..12cecd4 100644
--- a/arrow/src/array/array_boolean.rs
+++ b/arrow/src/array/array_boolean.rs
@@ -130,6 +130,17 @@ impl BooleanArray {
) -> impl Iterator<Item = Option<bool>> + 'a {
indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
}
+
+ /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
+ /// # Safety
+ ///
+ /// caller must ensure that the offsets in the iterator are less than the array len()
+ pub unsafe fn take_iter_unchecked<'a>(
+ &'a self,
+ indexes: impl Iterator<Item = Option<usize>> + 'a,
+ ) -> impl Iterator<Item = Option<bool>> + 'a {
+ indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
+ }
}
impl Array for BooleanArray {
diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs
index 0d18032..79aa8e6 100644
--- a/arrow/src/array/array_primitive.rs
+++ b/arrow/src/array/array_primitive.rs
@@ -154,6 +154,17 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
) -> impl Iterator<Item = Option<T::Native>> + 'a {
indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
}
+
+ /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
+ /// # Safety
+ ///
+ /// caller must ensure that the offsets in the iterator are less than the array len()
+ pub unsafe fn take_iter_unchecked<'a>(
+ &'a self,
+ indexes: impl Iterator<Item = Option<usize>> + 'a,
+ ) -> impl Iterator<Item = Option<T::Native>> + 'a {
+ indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
+ }
}
impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs
index 95c7cd6..b17534a 100644
--- a/arrow/src/array/array_string.rs
+++ b/arrow/src/array/array_string.rs
@@ -180,6 +180,17 @@ impl<OffsetSize: StringOffsetSizeTrait> GenericStringArray<OffsetSize> {
) -> impl Iterator<Item = Option<&str>> + 'a {
indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
}
+
+ /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
+ /// # Safety
+ ///
+ /// caller must ensure that the offsets in the iterator are less than the array len()
+ pub unsafe fn take_iter_unchecked<'a>(
+ &'a self,
+ indexes: impl Iterator<Item = Option<usize>> + 'a,
+ ) -> impl Iterator<Item = Option<&str>> + 'a {
+ indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
+ }
}
impl<'a, Ptr, OffsetSize: StringOffsetSizeTrait> FromIterator<&'a Option<Ptr>>
diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs
index d1c33e3..1154076 100644
--- a/arrow/src/compute/kernels/comparison.rs
+++ b/arrow/src/compute/kernels/comparison.rs
@@ -2214,19 +2214,26 @@ macro_rules! compare_dict_op {
));
}
- let left_iter = $left
- .values()
- .as_any()
- .downcast_ref::<$value_ty>()
- .unwrap()
- .take_iter($left.keys_iter());
-
- let right_iter = $right
- .values()
- .as_any()
- .downcast_ref::<$value_ty>()
- .unwrap()
- .take_iter($right.keys_iter());
+ // Safety justification: Since the inputs are valid Arrow arrays, all values are
+ // valid indexes into the dictionary (which is verified during construction)
+
+ let left_iter = unsafe {
+ $left
+ .values()
+ .as_any()
+ .downcast_ref::<$value_ty>()
+ .unwrap()
+ .take_iter_unchecked($left.keys_iter())
+ };
+
+ let right_iter = unsafe {
+ $right
+ .values()
+ .as_any()
+ .downcast_ref::<$value_ty>()
+ .unwrap()
+ .take_iter_unchecked($right.keys_iter())
+ };
let result = left_iter
.zip(right_iter)