This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new f4ee8b9ac Handle empty offsets buffer (#1824) (#2836)
f4ee8b9ac is described below
commit f4ee8b9acbd2ad3110dfc1bf3cb8b93bd876adb5
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Oct 13 06:11:27 2022 +0100
Handle empty offsets buffer (#1824) (#2836)
* Handle empty offsets buffer (#1824)
* Review feedback
---
arrow-array/src/array/binary_array.rs | 30 +++++++++++++++++++++++--
arrow-array/src/array/list_array.rs | 42 +++++++++++++++++++++++++++++++++--
arrow-array/src/array/string_array.rs | 31 +++++++++++++++++++++++---
3 files changed, 96 insertions(+), 7 deletions(-)
diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs
index cb168daf0..851fb60c0 100644
--- a/arrow-array/src/array/binary_array.rs
+++ b/arrow-array/src/array/binary_array.rs
@@ -17,7 +17,10 @@
use crate::iterator::GenericBinaryIter;
use crate::raw_pointer::RawPtrBox;
-use crate::{print_long_array, Array, ArrayAccessor, GenericListArray, OffsetSizeTrait};
+use crate::{
+ empty_offsets, print_long_array, Array, ArrayAccessor, GenericListArray,
+ OffsetSizeTrait,
+};
use arrow_buffer::{bit_util, Buffer, MutableBuffer};
use arrow_data::ArrayData;
use arrow_schema::DataType;
@@ -286,7 +289,11 @@ impl<OffsetSize: OffsetSizeTrait> From<ArrayData> for GenericBinaryArray<OffsetS
2,
"BinaryArray data should contain 2 buffers only (offsets and values)"
);
- let offsets = data.buffers()[0].as_ptr();
+ // Handle case of empty offsets
+ let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
+ true => empty_offsets::<OffsetSize>().as_ptr() as *const _,
+ false => data.buffers()[0].as_ptr(),
+ };
let values = data.buffers()[1].as_ptr();
Self {
data,
@@ -845,4 +852,23 @@ mod tests {
.validate_full()
.expect("All null array has valid array data");
}
+
+ #[test]
+ fn test_empty_offsets() {
+ let string = BinaryArray::from(
+ ArrayData::builder(DataType::Binary)
+ .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
+ .build()
+ .unwrap(),
+ );
+ assert_eq!(string.value_offsets(), &[0]);
+ let string = LargeBinaryArray::from(
+ ArrayData::builder(DataType::LargeBinary)
+ .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
+ .build()
+ .unwrap(),
+ );
+ assert_eq!(string.len(), 0);
+ assert_eq!(string.value_offsets(), &[0]);
+ }
}
diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs
index b45a0f925..3022db023 100644
--- a/arrow-array/src/array/list_array.rs
+++ b/arrow-array/src/array/list_array.rs
@@ -43,6 +43,17 @@ impl OffsetSizeTrait for i64 {
const PREFIX: &'static str = "Large";
}
+/// Returns a slice of `OffsetSize` consisting of a single zero value
+#[inline]
+pub(crate) fn empty_offsets<OffsetSize: OffsetSizeTrait>() -> &'static [OffsetSize] {
+ static OFFSET: &[i64] = &[0];
+ // SAFETY:
+ // OffsetSize is ArrowNativeType and is therefore trivially transmutable
+ let (prefix, val, suffix) = unsafe { OFFSET.align_to::<OffsetSize>() };
+ assert!(prefix.is_empty() && suffix.is_empty());
+ val
+}
+
/// Generic struct for a variable-size list array.
///
/// Columnar format in Apache Arrow:
@@ -240,8 +251,13 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
}
let values = make_array(values);
- let value_offsets = data.buffers()[0].as_ptr();
- let value_offsets = unsafe { RawPtrBox::<OffsetSize>::new(value_offsets) };
+ // Handle case of empty offsets
+ let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
+ true => empty_offsets::<OffsetSize>().as_ptr() as *const _,
+ false => data.buffers()[0].as_ptr(),
+ };
+
+ let value_offsets = unsafe { RawPtrBox::new(offsets) };
Ok(Self {
data,
values,
@@ -941,4 +957,26 @@ mod tests {
false,
);
}
+
+ #[test]
+ fn test_empty_offsets() {
+ let f = Box::new(Field::new("element", DataType::Int32, true));
+ let string = ListArray::from(
+ ArrayData::builder(DataType::List(f.clone()))
+ .buffers(vec![Buffer::from(&[])])
+ .add_child_data(ArrayData::new_empty(&DataType::Int32))
+ .build()
+ .unwrap(),
+ );
+ assert_eq!(string.value_offsets(), &[0]);
+ let string = LargeListArray::from(
+ ArrayData::builder(DataType::LargeList(f))
+ .buffers(vec![Buffer::from(&[])])
+ .add_child_data(ArrayData::new_empty(&DataType::Int32))
+ .build()
+ .unwrap(),
+ );
+ assert_eq!(string.len(), 0);
+ assert_eq!(string.value_offsets(), &[0]);
+ }
}
diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs
index 22ad81eaa..7e2ed3667 100644
--- a/arrow-array/src/array/string_array.rs
+++ b/arrow-array/src/array/string_array.rs
@@ -18,8 +18,8 @@
use crate::iterator::GenericStringIter;
use crate::raw_pointer::RawPtrBox;
use crate::{
- print_long_array, Array, ArrayAccessor, GenericBinaryArray, GenericListArray,
- OffsetSizeTrait,
+ empty_offsets, print_long_array, Array, ArrayAccessor, GenericBinaryArray,
+ GenericListArray, OffsetSizeTrait,
};
use arrow_buffer::{bit_util, Buffer, MutableBuffer};
use arrow_data::ArrayData;
@@ -370,7 +370,11 @@ impl<OffsetSize: OffsetSizeTrait> From<ArrayData> for GenericStringArray<OffsetS
2,
"StringArray data should contain 2 buffers only (offsets and values)"
);
- let offsets = data.buffers()[0].as_ptr();
+ // Handle case of empty offsets
+ let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
+ true => empty_offsets::<OffsetSize>().as_ptr() as *const _,
+ false => data.buffers()[0].as_ptr(),
+ };
let values = data.buffers()[1].as_ptr();
Self {
data,
@@ -823,4 +827,25 @@ mod tests {
fn test_large_string_array_from_list_array_wrong_type() {
_test_generic_string_array_from_list_array_wrong_type::<i32>();
}
+
+ #[test]
+ fn test_empty_offsets() {
+ let string = StringArray::from(
+ ArrayData::builder(DataType::Utf8)
+ .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
+ .build()
+ .unwrap(),
+ );
+ assert_eq!(string.len(), 0);
+ assert_eq!(string.value_offsets(), &[0]);
+
+ let string = LargeStringArray::from(
+ ArrayData::builder(DataType::LargeUtf8)
+ .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
+ .build()
+ .unwrap(),
+ );
+ assert_eq!(string.len(), 0);
+ assert_eq!(string.value_offsets(), &[0]);
+ }
}