This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new f9e4cf5  Improve performance of dictionary kernels, add benchmark and 
add `take_iter_unchecked` (#1372)
f9e4cf5 is described below

commit f9e4cf5777e16d9b1329017b1bb1d1e6c44334de
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Thu Mar 3 03:34:00 2022 -0800

    Improve performance of dictionary kernels, add benchmark and add 
`take_iter_unchecked` (#1372)
    
    * Add benchmark and take_iter_unchecked.
    
    * Add Safety section for clippy
    
    * Update arrow/src/compute/kernels/comparison.rs
    
    Co-authored-by: Andrew Lamb <[email protected]>
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow/benches/comparison_kernels.rs     | 21 ++++++++++++++++++++-
 arrow/src/array/array_binary.rs         | 11 +++++++++++
 arrow/src/array/array_boolean.rs        | 11 +++++++++++
 arrow/src/array/array_primitive.rs      | 11 +++++++++++
 arrow/src/array/array_string.rs         | 11 +++++++++++
 arrow/src/compute/kernels/comparison.rs | 33 ++++++++++++++++++++-------------
 6 files changed, 84 insertions(+), 14 deletions(-)

diff --git a/arrow/benches/comparison_kernels.rs 
b/arrow/benches/comparison_kernels.rs
index cf9ccdd..4dced67 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -24,7 +24,7 @@ extern crate arrow;
 use arrow::compute::*;
 use arrow::datatypes::{ArrowNumericType, IntervalMonthDayNanoType};
 use arrow::util::bench_util::*;
-use arrow::{array::*, datatypes::Float32Type};
+use arrow::{array::*, datatypes::Float32Type, datatypes::Int32Type};
 
 fn bench_eq<T>(arr_a: &PrimitiveArray<T>, arr_b: &PrimitiveArray<T>)
 where
@@ -133,6 +133,18 @@ fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, 
value_b: &str) {
     .unwrap();
 }
 
+fn bench_dict_eq<T>(arr_a: &DictionaryArray<T>, arr_b: &DictionaryArray<T>)
+where
+    T: ArrowNumericType,
+{
+    cmp_dict_utf8::<T, i32, _>(
+        criterion::black_box(arr_a),
+        criterion::black_box(arr_b),
+        |a, b| a == b,
+    )
+    .unwrap();
+}
+
 fn add_benchmark(c: &mut Criterion) {
     let size = 65536;
     let arr_a = create_primitive_array_with_seed::<Float32Type>(size, 0.0, 42);
@@ -249,6 +261,13 @@ fn add_benchmark(c: &mut Criterion) {
     c.bench_function("egexp_matches_utf8 scalar ends with", |b| {
         b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$"))
     });
+
+    let dict_arr_a = create_string_dict_array::<Int32Type>(size, 0.0);
+    let dict_arr_b = create_string_dict_array::<Int32Type>(size, 0.0);
+
+    c.bench_function("dict eq string", |b| {
+        b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b))
+    });
 }
 
 criterion_group!(benches, add_benchmark);
diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs
index 40a5ee6..d9118dd 100644
--- a/arrow/src/array/array_binary.rs
+++ b/arrow/src/array/array_binary.rs
@@ -209,6 +209,17 @@ impl<OffsetSize: BinaryOffsetSizeTrait> 
GenericBinaryArray<OffsetSize> {
     ) -> impl Iterator<Item = Option<&[u8]>> + 'a {
         indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
     }
+
+    /// Returns an iterator that returns the values of `array.value(i)` for an 
iterator with each element `i`
+    /// # Safety
+    ///
+    /// caller must ensure that the indexes in the iterator are less than the 
array len()
+    pub unsafe fn take_iter_unchecked<'a>(
+        &'a self,
+        indexes: impl Iterator<Item = Option<usize>> + 'a,
+    ) -> impl Iterator<Item = Option<&[u8]>> + 'a {
+        indexes.map(|opt_index| opt_index.map(|index| 
self.value_unchecked(index)))
+    }
 }
 
 impl<'a, T: BinaryOffsetSizeTrait> GenericBinaryArray<T> {
diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs
index ca3bb2d..12cecd4 100644
--- a/arrow/src/array/array_boolean.rs
+++ b/arrow/src/array/array_boolean.rs
@@ -130,6 +130,17 @@ impl BooleanArray {
     ) -> impl Iterator<Item = Option<bool>> + 'a {
         indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
     }
+
+    /// Returns an iterator that returns the values of `array.value(i)` for an 
iterator with each element `i`
+    /// # Safety
+    ///
+    /// caller must ensure that the indexes in the iterator are less than the 
array len()
+    pub unsafe fn take_iter_unchecked<'a>(
+        &'a self,
+        indexes: impl Iterator<Item = Option<usize>> + 'a,
+    ) -> impl Iterator<Item = Option<bool>> + 'a {
+        indexes.map(|opt_index| opt_index.map(|index| 
self.value_unchecked(index)))
+    }
 }
 
 impl Array for BooleanArray {
diff --git a/arrow/src/array/array_primitive.rs 
b/arrow/src/array/array_primitive.rs
index 0d18032..79aa8e6 100644
--- a/arrow/src/array/array_primitive.rs
+++ b/arrow/src/array/array_primitive.rs
@@ -154,6 +154,17 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
     ) -> impl Iterator<Item = Option<T::Native>> + 'a {
         indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
     }
+
+    /// Returns an iterator that returns the values of `array.value(i)` for an 
iterator with each element `i`
+    /// # Safety
+    ///
+    /// caller must ensure that the indexes in the iterator are less than the 
array len()
+    pub unsafe fn take_iter_unchecked<'a>(
+        &'a self,
+        indexes: impl Iterator<Item = Option<usize>> + 'a,
+    ) -> impl Iterator<Item = Option<T::Native>> + 'a {
+        indexes.map(|opt_index| opt_index.map(|index| 
self.value_unchecked(index)))
+    }
 }
 
 impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs
index 95c7cd6..b17534a 100644
--- a/arrow/src/array/array_string.rs
+++ b/arrow/src/array/array_string.rs
@@ -180,6 +180,17 @@ impl<OffsetSize: StringOffsetSizeTrait> 
GenericStringArray<OffsetSize> {
     ) -> impl Iterator<Item = Option<&str>> + 'a {
         indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
     }
+
+    /// Returns an iterator that returns the values of `array.value(i)` for an 
iterator with each element `i`
+    /// # Safety
+    ///
+    /// caller must ensure that the indexes in the iterator are less than the 
array len()
+    pub unsafe fn take_iter_unchecked<'a>(
+        &'a self,
+        indexes: impl Iterator<Item = Option<usize>> + 'a,
+    ) -> impl Iterator<Item = Option<&str>> + 'a {
+        indexes.map(|opt_index| opt_index.map(|index| 
self.value_unchecked(index)))
+    }
 }
 
 impl<'a, Ptr, OffsetSize: StringOffsetSizeTrait> FromIterator<&'a Option<Ptr>>
diff --git a/arrow/src/compute/kernels/comparison.rs 
b/arrow/src/compute/kernels/comparison.rs
index d1c33e3..1154076 100644
--- a/arrow/src/compute/kernels/comparison.rs
+++ b/arrow/src/compute/kernels/comparison.rs
@@ -2214,19 +2214,26 @@ macro_rules! compare_dict_op {
             ));
         }
 
-        let left_iter = $left
-            .values()
-            .as_any()
-            .downcast_ref::<$value_ty>()
-            .unwrap()
-            .take_iter($left.keys_iter());
-
-        let right_iter = $right
-            .values()
-            .as_any()
-            .downcast_ref::<$value_ty>()
-            .unwrap()
-            .take_iter($right.keys_iter());
+        // Safety justification: Since the inputs are valid Arrow arrays, all 
keys are
+        // valid indexes into the dictionary values (which is verified during 
construction)
+
+        let left_iter = unsafe {
+            $left
+                .values()
+                .as_any()
+                .downcast_ref::<$value_ty>()
+                .unwrap()
+                .take_iter_unchecked($left.keys_iter())
+        };
+
+        let right_iter = unsafe {
+            $right
+                .values()
+                .as_any()
+                .downcast_ref::<$value_ty>()
+                .unwrap()
+                .take_iter_unchecked($right.keys_iter())
+        };
 
         let result = left_iter
             .zip(right_iter)

Reply via email to