This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new f4ee8b9ac Handle empty offsets buffer (#1824) (#2836)
f4ee8b9ac is described below

commit f4ee8b9acbd2ad3110dfc1bf3cb8b93bd876adb5
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Oct 13 06:11:27 2022 +0100

    Handle empty offsets buffer (#1824) (#2836)
    
    * Handle empty offsets buffer (#1824)
    
    * Review feedback
---
 arrow-array/src/array/binary_array.rs | 30 +++++++++++++++++++++++--
 arrow-array/src/array/list_array.rs   | 42 +++++++++++++++++++++++++++++++++--
 arrow-array/src/array/string_array.rs | 31 +++++++++++++++++++++++---
 3 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/arrow-array/src/array/binary_array.rs 
b/arrow-array/src/array/binary_array.rs
index cb168daf0..851fb60c0 100644
--- a/arrow-array/src/array/binary_array.rs
+++ b/arrow-array/src/array/binary_array.rs
@@ -17,7 +17,10 @@
 
 use crate::iterator::GenericBinaryIter;
 use crate::raw_pointer::RawPtrBox;
-use crate::{print_long_array, Array, ArrayAccessor, GenericListArray, 
OffsetSizeTrait};
+use crate::{
+    empty_offsets, print_long_array, Array, ArrayAccessor, GenericListArray,
+    OffsetSizeTrait,
+};
 use arrow_buffer::{bit_util, Buffer, MutableBuffer};
 use arrow_data::ArrayData;
 use arrow_schema::DataType;
@@ -286,7 +289,11 @@ impl<OffsetSize: OffsetSizeTrait> From<ArrayData> for 
GenericBinaryArray<OffsetS
             2,
             "BinaryArray data should contain 2 buffers only (offsets and 
values)"
         );
-        let offsets = data.buffers()[0].as_ptr();
+        // Handle case of empty offsets
+        let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
+            true => empty_offsets::<OffsetSize>().as_ptr() as *const _,
+            false => data.buffers()[0].as_ptr(),
+        };
         let values = data.buffers()[1].as_ptr();
         Self {
             data,
@@ -845,4 +852,23 @@ mod tests {
             .validate_full()
             .expect("All null array has valid array data");
     }
+
+    #[test]
+    fn test_empty_offsets() {
+        let string = BinaryArray::from(
+            ArrayData::builder(DataType::Binary)
+                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
+                .build()
+                .unwrap(),
+        );
+        assert_eq!(string.value_offsets(), &[0]);
+        let string = LargeBinaryArray::from(
+            ArrayData::builder(DataType::LargeBinary)
+                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
+                .build()
+                .unwrap(),
+        );
+        assert_eq!(string.len(), 0);
+        assert_eq!(string.value_offsets(), &[0]);
+    }
 }
diff --git a/arrow-array/src/array/list_array.rs 
b/arrow-array/src/array/list_array.rs
index b45a0f925..3022db023 100644
--- a/arrow-array/src/array/list_array.rs
+++ b/arrow-array/src/array/list_array.rs
@@ -43,6 +43,17 @@ impl OffsetSizeTrait for i64 {
     const PREFIX: &'static str = "Large";
 }
 
+/// Returns a slice of `OffsetSize` consisting of a single zero value
+#[inline]
+pub(crate) fn empty_offsets<OffsetSize: OffsetSizeTrait>() -> &'static 
[OffsetSize] {
+    static OFFSET: &[i64] = &[0];
+    // SAFETY:
+    // OffsetSize is ArrowNativeType and is therefore trivially transmutable
+    let (prefix, val, suffix) = unsafe { OFFSET.align_to::<OffsetSize>() };
+    assert!(prefix.is_empty() && suffix.is_empty());
+    val
+}
+
 /// Generic struct for a variable-size list array.
 ///
 /// Columnar format in Apache Arrow:
@@ -240,8 +251,13 @@ impl<OffsetSize: OffsetSizeTrait> 
GenericListArray<OffsetSize> {
         }
 
         let values = make_array(values);
-        let value_offsets = data.buffers()[0].as_ptr();
-        let value_offsets = unsafe { 
RawPtrBox::<OffsetSize>::new(value_offsets) };
+        // Handle case of empty offsets
+        let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
+            true => empty_offsets::<OffsetSize>().as_ptr() as *const _,
+            false => data.buffers()[0].as_ptr(),
+        };
+
+        let value_offsets = unsafe { RawPtrBox::new(offsets) };
         Ok(Self {
             data,
             values,
@@ -941,4 +957,26 @@ mod tests {
             false,
         );
     }
+
+    #[test]
+    fn test_empty_offsets() {
+        let f = Box::new(Field::new("element", DataType::Int32, true));
+        let string = ListArray::from(
+            ArrayData::builder(DataType::List(f.clone()))
+                .buffers(vec![Buffer::from(&[])])
+                .add_child_data(ArrayData::new_empty(&DataType::Int32))
+                .build()
+                .unwrap(),
+        );
+        assert_eq!(string.value_offsets(), &[0]);
+        let string = LargeListArray::from(
+            ArrayData::builder(DataType::LargeList(f))
+                .buffers(vec![Buffer::from(&[])])
+                .add_child_data(ArrayData::new_empty(&DataType::Int32))
+                .build()
+                .unwrap(),
+        );
+        assert_eq!(string.len(), 0);
+        assert_eq!(string.value_offsets(), &[0]);
+    }
 }
diff --git a/arrow-array/src/array/string_array.rs 
b/arrow-array/src/array/string_array.rs
index 22ad81eaa..7e2ed3667 100644
--- a/arrow-array/src/array/string_array.rs
+++ b/arrow-array/src/array/string_array.rs
@@ -18,8 +18,8 @@
 use crate::iterator::GenericStringIter;
 use crate::raw_pointer::RawPtrBox;
 use crate::{
-    print_long_array, Array, ArrayAccessor, GenericBinaryArray, 
GenericListArray,
-    OffsetSizeTrait,
+    empty_offsets, print_long_array, Array, ArrayAccessor, GenericBinaryArray,
+    GenericListArray, OffsetSizeTrait,
 };
 use arrow_buffer::{bit_util, Buffer, MutableBuffer};
 use arrow_data::ArrayData;
@@ -370,7 +370,11 @@ impl<OffsetSize: OffsetSizeTrait> From<ArrayData> for 
GenericStringArray<OffsetS
             2,
             "StringArray data should contain 2 buffers only (offsets and 
values)"
         );
-        let offsets = data.buffers()[0].as_ptr();
+        // Handle case of empty offsets
+        let offsets = match data.is_empty() && data.buffers()[0].is_empty() {
+            true => empty_offsets::<OffsetSize>().as_ptr() as *const _,
+            false => data.buffers()[0].as_ptr(),
+        };
         let values = data.buffers()[1].as_ptr();
         Self {
             data,
@@ -823,4 +827,25 @@ mod tests {
     fn test_large_string_array_from_list_array_wrong_type() {
         _test_generic_string_array_from_list_array_wrong_type::<i32>();
     }
+
+    #[test]
+    fn test_empty_offsets() {
+        let string = StringArray::from(
+            ArrayData::builder(DataType::Utf8)
+                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
+                .build()
+                .unwrap(),
+        );
+        assert_eq!(string.len(), 0);
+        assert_eq!(string.value_offsets(), &[0]);
+
+        let string = LargeStringArray::from(
+            ArrayData::builder(DataType::LargeUtf8)
+                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
+                .build()
+                .unwrap(),
+        );
+        assert_eq!(string.len(), 0);
+        assert_eq!(string.value_offsets(), &[0]);
+    }
 }

Reply via email to