This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 20d81f578 Add FixedSizeBinaryArray::try_from_sparse_iter_with_size
(#3054)
20d81f578 is described below
commit 20d81f5784e5609666fa3d0599f2f95e5e5a40c6
Author: Max Burke <[email protected]>
AuthorDate: Sun Nov 13 10:23:29 2022 -0800
Add FixedSizeBinaryArray::try_from_sparse_iter_with_size (#3054)
* implement ord for FixedSizeBinary types
* add ord test
* add compare for fixed size binary arrays
* FixedSizeBinaryArray::try_from_sparse_iter has fails to generate a valid
array when the iterator only produces None values
* Tweak try_from_sparse_iter_with_size to take an Option<i32>; pass tests
* simplify try_from_sparse_iter_with_size, make size parameter non-optional
* add test for fixed size binary comparisons
* move tests to use FixedSizeBinaryArray::from_sparse_iter_with_size, add
docstring
* format + fix failing tests
* fix build
---
arrow-array/src/array/fixed_size_binary_array.rs | 121 +++++++++++++++++++++--
arrow-select/src/take.rs | 7 +-
arrow/src/array/ffi.rs | 3 +-
arrow/src/compute/kernels/comparison.rs | 41 ++++++++
arrow/src/compute/kernels/sort.rs | 15 ++-
arrow/src/compute/kernels/substring.rs | 3 +-
arrow/src/ffi.rs | 8 +-
arrow/src/util/bench_util.rs | 25 ++---
arrow/tests/array_transform.rs | 9 +-
9 files changed, 198 insertions(+), 34 deletions(-)
diff --git a/arrow-array/src/array/fixed_size_binary_array.rs
b/arrow-array/src/array/fixed_size_binary_array.rs
index f37d1e3e5..9bac49810 100644
--- a/arrow-array/src/array/fixed_size_binary_array.rs
+++ b/arrow-array/src/array/fixed_size_binary_array.rs
@@ -129,6 +129,9 @@ impl FixedSizeBinaryArray {
/// # Errors
///
/// Returns error if argument has length zero, or sizes of nested slices
don't match.
+ #[deprecated(
+ note = "This function will fail if the iterator produces only None
values; prefer `try_from_sparse_iter_with_size`"
+ )]
pub fn try_from_sparse_iter<T, U>(mut iter: T) -> Result<Self, ArrowError>
where
T: Iterator<Item = Option<U>>,
@@ -196,6 +199,86 @@ impl FixedSizeBinaryArray {
Ok(FixedSizeBinaryArray::from(array_data))
}
+ /// Create an array from an iterable argument of sparse byte slices.
+ /// Sparsity means that items returned by the iterator are optional, i.e
input argument can
+ /// contain `None` items. In cases where the iterator returns only `None`
values, this
+ /// also takes a size parameter to ensure that the a valid
FixedSizeBinaryArray is still
+ /// created.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use arrow_array::FixedSizeBinaryArray;
+ /// let input_arg = vec![
+ /// None,
+ /// Some(vec![7, 8]),
+ /// Some(vec![9, 10]),
+ /// None,
+ /// Some(vec![13, 14]),
+ /// None,
+ /// ];
+ /// let array =
FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(),
2).unwrap();
+ /// ```
+ ///
+ /// # Errors
+ ///
+ /// Returns error if argument has length zero, or sizes of nested slices
don't match.
+ pub fn try_from_sparse_iter_with_size<T, U>(
+ mut iter: T,
+ size: i32,
+ ) -> Result<Self, ArrowError>
+ where
+ T: Iterator<Item = Option<U>>,
+ U: AsRef<[u8]>,
+ {
+ let mut len = 0;
+ let mut byte = 0;
+ let mut null_buf = MutableBuffer::from_len_zeroed(0);
+ let mut buffer = MutableBuffer::from_len_zeroed(0);
+
+ iter.try_for_each(|item| -> Result<(), ArrowError> {
+ // extend null bitmask by one byte per each 8 items
+ if byte == 0 {
+ null_buf.push(0u8);
+ byte = 8;
+ }
+ byte -= 1;
+
+ if let Some(slice) = item {
+ let slice = slice.as_ref();
+ if size as usize != slice.len() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Nested array size mismatch: one is {}, and the other
is {}",
+ size,
+ slice.len()
+ )));
+ }
+
+ bit_util::set_bit(null_buf.as_slice_mut(), len);
+ buffer.extend_from_slice(slice);
+ } else {
+ buffer.extend_zeros(size as usize);
+ }
+
+ len += 1;
+
+ Ok(())
+ })?;
+
+ let array_data = unsafe {
+ ArrayData::new_unchecked(
+ DataType::FixedSizeBinary(size),
+ len,
+ None,
+ Some(null_buf.into()),
+ 0,
+ vec![buffer.into()],
+ vec![],
+ )
+ };
+ Ok(FixedSizeBinaryArray::from(array_data))
+ }
+
/// Create an array from an iterable argument of byte slices.
///
/// # Examples
@@ -333,6 +416,7 @@ impl From<FixedSizeListArray> for FixedSizeBinaryArray {
impl From<Vec<Option<&[u8]>>> for FixedSizeBinaryArray {
fn from(v: Vec<Option<&[u8]>>) -> Self {
+ #[allow(deprecated)]
Self::try_from_sparse_iter(v.into_iter()).unwrap()
}
}
@@ -561,6 +645,7 @@ mod tests {
fn test_all_none_fixed_size_binary_array_from_sparse_iter() {
let none_option: Option<[u8; 32]> = None;
let input_arg = vec![none_option, none_option, none_option];
+ #[allow(deprecated)]
let arr =
FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap();
assert_eq!(0, arr.value_length());
@@ -576,9 +661,31 @@ mod tests {
None,
Some(vec![13, 14]),
];
- let arr =
-
FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap();
+ #[allow(deprecated)]
+ let arr =
FixedSizeBinaryArray::try_from_sparse_iter(input_arg.iter().cloned())
+ .unwrap();
assert_eq!(2, arr.value_length());
+ assert_eq!(5, arr.len());
+
+ let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
+ input_arg.into_iter(),
+ 2,
+ )
+ .unwrap();
+ assert_eq!(2, arr.value_length());
+ assert_eq!(5, arr.len());
+ }
+
+ #[test]
+ fn test_fixed_size_binary_array_from_sparse_iter_with_size_all_none() {
+ let input_arg = vec![None, None, None, None, None] as
Vec<Option<Vec<u8>>>;
+
+ let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
+ input_arg.into_iter(),
+ 16,
+ )
+ .unwrap();
+ assert_eq!(16, arr.value_length());
assert_eq!(5, arr.len())
}
@@ -643,7 +750,9 @@ mod tests {
#[test]
fn fixed_size_binary_array_all_null() {
let data = vec![None] as Vec<Option<String>>;
- let array =
FixedSizeBinaryArray::try_from_sparse_iter(data.into_iter()).unwrap();
+ let array =
+
FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.into_iter(), 0)
+ .unwrap();
array
.data()
.validate_full()
@@ -652,16 +761,14 @@ mod tests {
#[test]
// Test for https://github.com/apache/arrow-rs/issues/1390
- #[should_panic(
- expected = "column types must match schema types, expected
FixedSizeBinary(2) but found FixedSizeBinary(0) at column index 0"
- )]
fn fixed_size_binary_array_all_null_in_batch_with_schema() {
let schema =
Schema::new(vec![Field::new("a", DataType::FixedSizeBinary(2),
true)]);
let none_option: Option<[u8; 2]> = None;
- let item = FixedSizeBinaryArray::try_from_sparse_iter(
+ let item = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
vec![none_option, none_option, none_option].into_iter(),
+ 2,
)
.unwrap();
diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs
index d34a88ba5..4af876a79 100644
--- a/arrow-select/src/take.rs
+++ b/arrow-select/src/take.rs
@@ -207,12 +207,12 @@ where
DataType::LargeBinary => {
Ok(Arc::new(take_bytes(as_generic_binary_array::<i64>(values),
indices)?))
}
- DataType::FixedSizeBinary(_) => {
+ DataType::FixedSizeBinary(size) => {
let values = values
.as_any()
.downcast_ref::<FixedSizeBinaryArray>()
.unwrap();
- Ok(Arc::new(take_fixed_size_binary(values, indices)?))
+ Ok(Arc::new(take_fixed_size_binary(values, indices, *size)?))
}
DataType::Null => {
// Take applied to a null array produces a null array.
@@ -769,6 +769,7 @@ where
fn take_fixed_size_binary<IndexType>(
values: &FixedSizeBinaryArray,
indices: &PrimitiveArray<IndexType>,
+ size: i32,
) -> Result<FixedSizeBinaryArray, ArrowError>
where
IndexType: ArrowPrimitiveType,
@@ -789,7 +790,7 @@ where
.collect::<Result<Vec<_>, ArrowError>>()?
.into_iter();
- FixedSizeBinaryArray::try_from_sparse_iter(array_iter)
+ FixedSizeBinaryArray::try_from_sparse_iter_with_size(array_iter, size)
}
/// `take` implementation for dictionary arrays
diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs
index 72030f900..a18f408a4 100644
--- a/arrow/src/array/ffi.rs
+++ b/arrow/src/array/ffi.rs
@@ -212,7 +212,8 @@ mod tests {
Some(vec![30, 30, 30]),
None,
];
- let array =
FixedSizeBinaryArray::try_from_sparse_iter(values.into_iter())?;
+ let array =
+
FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?;
let data = array.data();
test_round_trip(data)
diff --git a/arrow/src/compute/kernels/comparison.rs
b/arrow/src/compute/kernels/comparison.rs
index a286eedd1..4566b4969 100644
--- a/arrow/src/compute/kernels/comparison.rs
+++ b/arrow/src/compute/kernels/comparison.rs
@@ -2270,6 +2270,18 @@ macro_rules! typed_compares {
as_largestring_array($RIGHT),
$OP,
),
+ (DataType::FixedSizeBinary(_), DataType::FixedSizeBinary(_)) => {
+ let lhs = $LEFT
+ .as_any()
+ .downcast_ref::<FixedSizeBinaryArray>()
+ .unwrap();
+ let rhs = $RIGHT
+ .as_any()
+ .downcast_ref::<FixedSizeBinaryArray>()
+ .unwrap();
+
+ compare_op(lhs, rhs, $OP)
+ }
(DataType::Binary, DataType::Binary) => compare_op(
as_generic_binary_array::<i32>($LEFT),
as_generic_binary_array::<i32>($RIGHT),
@@ -5449,6 +5461,35 @@ mod tests {
);
}
+ #[test]
+ fn test_eq_dyn_neq_dyn_fixed_size_binary() {
+ use crate::array::FixedSizeBinaryArray;
+
+ let values1: Vec<Option<&[u8]>> =
+ vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x01])];
+ let values2: Vec<Option<&[u8]>> =
+ vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x00])];
+
+ let array1 =
+
FixedSizeBinaryArray::try_from_sparse_iter_with_size(values1.into_iter(), 2)
+ .unwrap();
+ let array2 =
+
FixedSizeBinaryArray::try_from_sparse_iter_with_size(values2.into_iter(), 2)
+ .unwrap();
+
+ let result = eq_dyn(&array1, &array2).unwrap();
+ assert_eq!(
+ BooleanArray::from(vec![Some(true), None, Some(false)]),
+ result
+ );
+
+ let result = neq_dyn(&array1, &array2).unwrap();
+ assert_eq!(
+ BooleanArray::from(vec![Some(false), None, Some(true)]),
+ result
+ );
+ }
+
#[test]
#[cfg(feature = "dyn_cmp_dict")]
fn test_eq_dyn_neq_dyn_dictionary_i8_array() {
diff --git a/arrow/src/compute/kernels/sort.rs
b/arrow/src/compute/kernels/sort.rs
index 97b0758e5..81895760e 100644
--- a/arrow/src/compute/kernels/sort.rs
+++ b/arrow/src/compute/kernels/sort.rs
@@ -1437,17 +1437,24 @@ mod tests {
fixed_length: Option<i32>,
) {
// Fixed size binary array
- if fixed_length.is_some() {
+ if let Some(length) = fixed_length {
let input = Arc::new(
-
FixedSizeBinaryArray::try_from_sparse_iter(data.iter().cloned()).unwrap(),
+ FixedSizeBinaryArray::try_from_sparse_iter_with_size(
+ data.iter().cloned(),
+ length,
+ )
+ .unwrap(),
);
let sorted = match limit {
Some(_) => sort_limit(&(input as ArrayRef), options,
limit).unwrap(),
None => sort(&(input as ArrayRef), options).unwrap(),
};
let expected = Arc::new(
-
FixedSizeBinaryArray::try_from_sparse_iter(expected_data.iter().cloned())
- .unwrap(),
+ FixedSizeBinaryArray::try_from_sparse_iter_with_size(
+ expected_data.iter().cloned(),
+ length,
+ )
+ .unwrap(),
) as ArrayRef;
assert_eq!(&sorted, &expected);
diff --git a/arrow/src/compute/kernels/substring.rs
b/arrow/src/compute/kernels/substring.rs
index f52ddb3bc..76568ae0d 100644
--- a/arrow/src/compute/kernels/substring.rs
+++ b/arrow/src/compute/kernels/substring.rs
@@ -713,8 +713,9 @@ mod tests {
.as_any()
.downcast_ref::<FixedSizeBinaryArray>()
.unwrap();
- let expected = FixedSizeBinaryArray::try_from_sparse_iter(
+ let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
vec![None, Some(b"rrow")].into_iter(),
+ 4,
)
.unwrap();
assert_eq!(result, &expected);
diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs
index 95e6dce3c..03c265318 100644
--- a/arrow/src/ffi.rs
+++ b/arrow/src/ffi.rs
@@ -1231,7 +1231,8 @@ mod tests {
Some(vec![30, 30, 30]),
None,
];
- let array =
FixedSizeBinaryArray::try_from_sparse_iter(values.into_iter())?;
+ let array =
+
FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?;
// export it
let array = ArrowArray::try_from(array.into_data())?;
@@ -1250,7 +1251,7 @@ mod tests {
// verify
assert_eq!(
array,
- &FixedSizeBinaryArray::try_from_sparse_iter(
+ &FixedSizeBinaryArray::try_from_sparse_iter_with_size(
vec![
None,
Some(vec![10, 10, 10]),
@@ -1265,7 +1266,8 @@ mod tests {
Some(vec![30, 30, 30]),
None,
]
- .into_iter()
+ .into_iter(),
+ 3
)?
);
diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs
index d07443301..6420b6346 100644
--- a/arrow/src/util/bench_util.rs
+++ b/arrow/src/util/bench_util.rs
@@ -176,17 +176,20 @@ pub fn create_fsb_array(
) -> FixedSizeBinaryArray {
let rng = &mut seedable_rng();
- FixedSizeBinaryArray::try_from_sparse_iter((0..size).map(|_| {
- if rng.gen::<f32>() < null_density {
- None
- } else {
- let value = rng
- .sample_iter::<u8, _>(Standard)
- .take(value_len)
- .collect::<Vec<u8>>();
- Some(value)
- }
- }))
+ FixedSizeBinaryArray::try_from_sparse_iter_with_size(
+ (0..size).map(|_| {
+ if rng.gen::<f32>() < null_density {
+ None
+ } else {
+ let value = rng
+ .sample_iter::<u8, _>(Standard)
+ .take(value_len)
+ .collect::<Vec<u8>>();
+ Some(value)
+ }
+ }),
+ value_len as i32,
+ )
.unwrap()
}
diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs
index 3619abacd..03942be10 100644
--- a/arrow/tests/array_transform.rs
+++ b/arrow/tests/array_transform.rs
@@ -868,7 +868,7 @@ fn test_list_of_strings_append() {
#[test]
fn test_fixed_size_binary_append() {
let a = vec![Some(vec![1, 2]), Some(vec![3, 4]), Some(vec![5, 6])];
- let a = FixedSizeBinaryArray::try_from_sparse_iter(a.into_iter())
+ let a =
FixedSizeBinaryArray::try_from_sparse_iter_with_size(a.into_iter(), 2)
.expect("Failed to create FixedSizeBinaryArray from iterable");
let b = vec![
@@ -879,7 +879,7 @@ fn test_fixed_size_binary_append() {
Some(vec![13, 14]),
None,
];
- let b = FixedSizeBinaryArray::try_from_sparse_iter(b.into_iter())
+ let b =
FixedSizeBinaryArray::try_from_sparse_iter_with_size(b.into_iter(), 2)
.expect("Failed to create FixedSizeBinaryArray from iterable");
let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false,
10);
@@ -911,8 +911,9 @@ fn test_fixed_size_binary_append() {
Some(vec![9, 10]),
// b[4..4]
];
- let expected =
FixedSizeBinaryArray::try_from_sparse_iter(expected.into_iter())
- .expect("Failed to create FixedSizeBinaryArray from iterable");
+ let expected =
+
FixedSizeBinaryArray::try_from_sparse_iter_with_size(expected.into_iter(), 2)
+ .expect("Failed to create FixedSizeBinaryArray from iterable");
assert_eq!(&result, expected.data());
}