This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 376c9ad ARROW-4854: [Rust] Use zero-copy slice for limit kernel
376c9ad is described below
commit 376c9ad63b21d872ecf9cbac221ff8b37a08b3af
Author: Neville Dipale <[email protected]>
AuthorDate: Thu Mar 21 14:29:49 2019 +0100
ARROW-4854: [Rust] Use zero-copy slice for limit kernel
Has the benefit of being faster and supporting all array types
Author: Neville Dipale <[email protected]>
Closes #4000 from nevi-me/ARROW-4854 and squashes the following commits:
304647bc <Neville Dipale> ARROW-4854: Use zero-copy slice for limit kernel
---
rust/arrow/src/compute/array_ops.rs | 173 ++++++++++++++++++++++++++----------
1 file changed, 126 insertions(+), 47 deletions(-)
diff --git a/rust/arrow/src/compute/array_ops.rs b/rust/arrow/src/compute/array_ops.rs
index 088661d..d9de6d7 100644
--- a/rust/arrow/src/compute/array_ops.rs
+++ b/rust/arrow/src/compute/array_ops.rs
@@ -153,6 +153,7 @@ macro_rules! filter_array {
}};
}
+/// Returns the array, taking only the elements matching the filter
pub fn filter(array: &Array, filter: &BooleanArray) -> Result<ArrayRef> {
match array.data_type() {
DataType::UInt8 => filter_array!(array, filter, UInt8Array),
@@ -183,61 +184,25 @@ pub fn filter(array: &Array, filter: &BooleanArray) -> Result<ArrayRef> {
}
}
-macro_rules! limit_array {
- ($array:expr, $num_elements:expr, $array_type:ident) => {{
- let b = $array.as_any().downcast_ref::<$array_type>().unwrap();
- let mut builder = $array_type::builder($num_elements);
- for i in 0..$num_elements {
- if b.is_null(i) {
- builder.append_null()?;
- } else {
- builder.append_value(b.value(i))?;
- }
- }
- Ok(Arc::new(builder.finish()))
- }};
-}
-
/// Returns the array, taking only the number of elements specified
///
-/// Returns the whole array if the number of elements specified is larger than the length
-/// of the array
+/// Limit performs a zero-copy slice of the array, and is a convenience method on slice
+/// where:
+/// * it performs a bounds-check on the array
+/// * it slices from offset 0
pub fn limit(array: &ArrayRef, num_elements: usize) -> Result<ArrayRef> {
- if num_elements >= array.len() {
- return Ok(array.clone());
- }
-
- match array.data_type() {
- DataType::UInt8 => limit_array!(array, num_elements, UInt8Array),
- DataType::UInt16 => limit_array!(array, num_elements, UInt16Array),
- DataType::UInt32 => limit_array!(array, num_elements, UInt32Array),
- DataType::UInt64 => limit_array!(array, num_elements, UInt64Array),
- DataType::Int8 => limit_array!(array, num_elements, Int8Array),
- DataType::Int16 => limit_array!(array, num_elements, Int16Array),
- DataType::Int32 => limit_array!(array, num_elements, Int32Array),
- DataType::Int64 => limit_array!(array, num_elements, Int64Array),
- DataType::Float32 => limit_array!(array, num_elements, Float32Array),
- DataType::Float64 => limit_array!(array, num_elements, Float64Array),
- DataType::Boolean => limit_array!(array, num_elements, BooleanArray),
- DataType::Utf8 => {
- let b = array.as_any().downcast_ref::<BinaryArray>().unwrap();
- let mut values: Vec<&[u8]> = Vec::with_capacity(num_elements);
- for i in 0..num_elements {
- values.push(b.value(i));
- }
- Ok(Arc::new(BinaryArray::from(values)))
- }
- other => Err(ArrowError::ComputeError(format!(
- "limit not supported for {:?}",
- other
- ))),
- }
+ let lim = num_elements.min(array.len());
+ Ok(array.slice(0, lim))
}
#[cfg(test)]
mod tests {
use super::*;
- use crate::array::{ArrayRef, Float64Array, Int32Array};
+ use crate::array::*;
+ use crate::array_data::ArrayData;
+ use crate::buffer::Buffer;
+ use crate::datatypes::{Field, ToByteSlice};
+ use crate::util::bit_util;
use std::sync::Arc;
@@ -355,4 +320,118 @@ mod tests {
assert_eq!(8, c.value(3));
assert_eq!(9, c.value(4));
}
+
+ #[test]
+ fn test_list_array_limit() {
+ // adapted from crate::array::test::test_list_array_slice
+ // Construct a value array
+ let value_data = ArrayData::builder(DataType::Int32)
+ .len(10)
+ .add_buffer(Buffer::from(
+ &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9].to_byte_slice(),
+ ))
+ .build();
+
+ // Construct a buffer for value offsets, for the nested array:
+ // [[0, 1], null, [2, 3], null, [4, 5], null, [6, 7, 8], null, [9]]
+ let value_offsets =
+ Buffer::from(&[0, 2, 2, 4, 4, 6, 6, 9, 9, 10].to_byte_slice());
+ // 01010101 00000001
+ let mut null_bits: [u8; 2] = [0; 2];
+ bit_util::set_bit(&mut null_bits, 0);
+ bit_util::set_bit(&mut null_bits, 2);
+ bit_util::set_bit(&mut null_bits, 4);
+ bit_util::set_bit(&mut null_bits, 6);
+ bit_util::set_bit(&mut null_bits, 8);
+
+ // Construct a list array from the above two
+ let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data = ArrayData::builder(list_data_type.clone())
+ .len(9)
+ .add_buffer(value_offsets.clone())
+ .add_child_data(value_data.clone())
+ .null_bit_buffer(Buffer::from(null_bits))
+ .build();
+ let list_array: ArrayRef = Arc::new(ListArray::from(list_data));
+
+ let limit_array = limit(&list_array, 6).unwrap();
+ assert_eq!(6, limit_array.len());
+ assert_eq!(0, limit_array.offset());
+ assert_eq!(3, limit_array.null_count());
+
+ // Check offset and length for each non-null value.
+ let limit_array: &ListArray =
+ limit_array.as_any().downcast_ref::<ListArray>().unwrap();
+ for i in 0..limit_array.len() {
+ let offset = limit_array.value_offset(i);
+ let length = limit_array.value_length(i);
+ if i % 2 == 0 {
+ assert_eq!(2, length);
+ assert_eq!(i as i32, offset);
+ } else {
+ assert_eq!(0, length);
+ }
+ }
+ }
+
+ #[test]
+ fn test_struct_array_limit() {
+ // adapted from crate::array::test::test_struct_array_slice
+ let boolean_data = ArrayData::builder(DataType::Boolean)
+ .len(5)
+ .add_buffer(Buffer::from([0b00010000]))
+ .null_bit_buffer(Buffer::from([0b00010001]))
+ .build();
+ let int_data = ArrayData::builder(DataType::Int32)
+ .len(5)
+ .add_buffer(Buffer::from([0, 28, 42, 0, 0].to_byte_slice()))
+ .null_bit_buffer(Buffer::from([0b00000110]))
+ .build();
+
+ let mut field_types = vec![];
+ field_types.push(Field::new("a", DataType::Boolean, false));
+ field_types.push(Field::new("b", DataType::Int32, false));
+ let struct_array_data = ArrayData::builder(DataType::Struct(field_types))
+ .len(5)
+ .add_child_data(boolean_data.clone())
+ .add_child_data(int_data.clone())
+ .null_bit_buffer(Buffer::from([0b00010111]))
+ .build();
+ let struct_array = StructArray::from(struct_array_data);
+
+ assert_eq!(5, struct_array.len());
+ assert_eq!(1, struct_array.null_count());
+ assert_eq!(boolean_data, struct_array.column(0).data());
+ assert_eq!(int_data, struct_array.column(1).data());
+
+ let array: ArrayRef = Arc::new(struct_array);
+
+ let sliced_array = limit(&array, 3).unwrap();
+ let sliced_array = sliced_array.as_any().downcast_ref::<StructArray>().unwrap();
+ assert_eq!(3, sliced_array.len());
+ assert_eq!(0, sliced_array.offset());
+ assert_eq!(0, sliced_array.null_count());
+ assert!(sliced_array.is_valid(0));
+ assert!(sliced_array.is_valid(1));
+ assert!(sliced_array.is_valid(2));
+
+ let sliced_c0 = sliced_array.column(0);
+ let sliced_c0 = sliced_c0.as_any().downcast_ref::<BooleanArray>().unwrap();
+ assert_eq!(3, sliced_c0.len());
+ assert_eq!(0, sliced_c0.offset());
+ assert_eq!(2, sliced_c0.null_count());
+ assert!(sliced_c0.is_valid(0));
+ assert!(sliced_c0.is_null(1));
+ assert!(sliced_c0.is_null(2));
+ assert_eq!(false, sliced_c0.value(0));
+
+ let sliced_c1 = sliced_array.column(1);
+ let sliced_c1 = sliced_c1.as_any().downcast_ref::<Int32Array>().unwrap();
+ assert_eq!(3, sliced_c1.len());
+ assert_eq!(0, sliced_c1.offset());
+ assert_eq!(1, sliced_c1.null_count());
+ assert!(sliced_c1.is_null(0));
+ assert_eq!(28, sliced_c1.value(1));
+ assert_eq!(42, sliced_c1.value(2));
+ }
}