This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 62e878e12 Specialize filter kernel for binary arrays (#2969) (#2971)
62e878e12 is described below
commit 62e878e12229c7bc911e3096390fd72a8e20bda2
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Wed Nov 2 07:53:17 2022 +1300
Specialize filter kernel for binary arrays (#2969) (#2971)
* Generalize filter byte array (#2969)
* Fix doc
* Update comment
---
arrow-select/src/filter.rs | 56 +++++++++++++++++++++++-----------------------
1 file changed, 28 insertions(+), 28 deletions(-)
diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs
index 71175ca57..4596afc87 100644
--- a/arrow-select/src/filter.rs
+++ b/arrow-select/src/filter.rs
@@ -17,12 +17,11 @@
//! Defines filter kernels
-use std::ops::AddAssign;
use std::sync::Arc;
-use num::Zero;
-
use arrow_array::builder::BooleanBufferBuilder;
+use arrow_array::cast::{as_generic_binary_array, as_largestring_array, as_string_array};
+use arrow_array::types::ByteArrayType;
use arrow_array::*;
use arrow_buffer::bit_util;
use arrow_buffer::{buffer::buffer_bin_and, Buffer, MutableBuffer};
@@ -355,18 +354,16 @@ fn filter_array(
Ok(Arc::new(filter_boolean(values, predicate)))
}
DataType::Utf8 => {
- let values = values
- .as_any()
- .downcast_ref::<GenericStringArray<i32>>()
- .unwrap();
- Ok(Arc::new(filter_string::<i32>(values, predicate)))
+ Ok(Arc::new(filter_bytes(as_string_array(values), predicate)))
}
DataType::LargeUtf8 => {
- let values = values
- .as_any()
- .downcast_ref::<GenericStringArray<i64>>()
- .unwrap();
- Ok(Arc::new(filter_string::<i64>(values, predicate)))
+ Ok(Arc::new(filter_bytes(as_largestring_array(values), predicate)))
+ }
+ DataType::Binary => {
+ Ok(Arc::new(filter_bytes(as_generic_binary_array::<i32>(values), predicate)))
+ }
+ DataType::LargeBinary => {
+ Ok(Arc::new(filter_bytes(as_generic_binary_array::<i64>(values), predicate)))
}
DataType::Dictionary(_, _) => downcast_dictionary_array! {
values => Ok(Arc::new(filter_dict(values, predicate))),
@@ -545,11 +542,11 @@ where
PrimitiveArray::from(data)
}
-/// [`FilterString`] is created from a source [`GenericStringArray`] and can be
-/// used to build a new [`GenericStringArray`] by copying values from the source
+/// [`FilterBytes`] is created from a source [`GenericByteArray`] and can be
+/// used to build a new [`GenericByteArray`] by copying values from the source
///
/// TODO(raphael): Could this be used for the take kernel as well?
-struct FilterString<'a, OffsetSize> {
+struct FilterBytes<'a, OffsetSize> {
src_offsets: &'a [OffsetSize],
src_values: &'a [u8],
dst_offsets: MutableBuffer,
@@ -557,15 +554,18 @@ struct FilterString<'a, OffsetSize> {
cur_offset: OffsetSize,
}
-impl<'a, OffsetSize> FilterString<'a, OffsetSize>
+impl<'a, OffsetSize> FilterBytes<'a, OffsetSize>
where
- OffsetSize: Zero + AddAssign + OffsetSizeTrait,
+ OffsetSize: OffsetSizeTrait,
{
- fn new(capacity: usize, array: &'a GenericStringArray<OffsetSize>) -> Self {
+ fn new<T>(capacity: usize, array: &'a GenericByteArray<T>) -> Self
+ where
+ T: ByteArrayType<Offset = OffsetSize>,
+ {
let num_offsets_bytes = (capacity + 1) * std::mem::size_of::<OffsetSize>();
let mut dst_offsets = MutableBuffer::new(num_offsets_bytes);
let dst_values = MutableBuffer::new(0);
- let cur_offset = OffsetSize::zero();
+ let cur_offset = OffsetSize::from_usize(0).unwrap();
dst_offsets.push(cur_offset);
Self {
@@ -622,21 +622,21 @@ where
}
}
-/// `filter` implementation for string arrays
+/// `filter` implementation for byte arrays
///
/// Note: NULLs with a non-zero slot length in `array` will have the corresponding
/// data copied across. This allows handling the null mask separately from the data
-fn filter_string<OffsetSize>(
- array: &GenericStringArray<OffsetSize>,
+fn filter_bytes<T>(
+ array: &GenericByteArray<T>,
predicate: &FilterPredicate,
-) -> GenericStringArray<OffsetSize>
+) -> GenericByteArray<T>
where
- OffsetSize: Zero + AddAssign + OffsetSizeTrait,
+ T: ByteArrayType,
{
let data = array.data();
assert_eq!(data.buffers().len(), 2);
assert_eq!(data.child_data().len(), 0);
- let mut filter = FilterString::new(predicate.count, array);
+ let mut filter = FilterBytes::new(predicate.count, array);
match &predicate.strategy {
IterationStrategy::SlicesIterator => {
@@ -650,7 +650,7 @@ where
IterationStrategy::All | IterationStrategy::None => unreachable!(),
}
- let mut builder = ArrayDataBuilder::new(data.data_type().clone())
+ let mut builder = ArrayDataBuilder::new(T::DATA_TYPE)
.len(predicate.count)
.add_buffer(filter.dst_offsets.into())
.add_buffer(filter.dst_values.into());
@@ -660,7 +660,7 @@ where
}
let data = unsafe { builder.build_unchecked() };
- GenericStringArray::from(data)
+ GenericByteArray::from(data)
}
/// `filter` implementation for dictionaries