This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new cd39b8c024e Compute data buffer length by using start and end values 
in offset buffer (#5741)
cd39b8c024e is described below

commit cd39b8c024ef09ed80f725b34ba06a45a0b7eedc
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Sun May 12 09:01:21 2024 -0700

    Compute data buffer length by using start and end values in offset buffer 
(#5741)
    
    * Compute data buffer length by offset buffer start and end values
    
    * Update code comment
    
    * Add unit test
    
    * Add round_trip check
    
    * Fix clippy
---
 arrow-array/src/ffi.rs                | 50 ++++++++++++++++++++++++++++++-----
 arrow-data/src/equal/variable_size.rs | 21 ++++++++-------
 2 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs
index 7b988bb0747..088a0a6ab58 100644
--- a/arrow-array/src/ffi.rs
+++ b/arrow-array/src/ffi.rs
@@ -425,24 +425,32 @@ impl<'a> ImportedArrowArray<'a> {
                 (length + 1) * (bits / 8)
             }
             (DataType::Utf8, 2) | (DataType::Binary, 2) => {
-                // the len of the data buffer (buffer 2) equals the last value 
of the offset buffer (buffer 1)
+                // the len of the data buffer (buffer 2) equals the difference 
between the last value
+                // and the first value of the offset buffer (buffer 1).
                 let len = self.buffer_len(1, dt)?;
                 // first buffer is the null buffer => add(1)
                 // we assume that pointer is aligned for `i32`, as Utf8 uses 
`i32` offsets.
                 #[allow(clippy::cast_ptr_alignment)]
                 let offset_buffer = self.array.buffer(1) as *const i32;
+                // get first offset
+                let start = (unsafe { *offset_buffer.add(0) }) as usize;
                 // get last offset
-                (unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as 
usize
+                let end = (unsafe { *offset_buffer.add(len / size_of::<i32>() 
- 1) }) as usize;
+                end - start
             }
             (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => {
-                // the len of the data buffer (buffer 2) equals the last value 
of the offset buffer (buffer 1)
+                // the len of the data buffer (buffer 2) equals the difference 
between the last value
+                // and the first value of the offset buffer (buffer 1).
                 let len = self.buffer_len(1, dt)?;
                 // first buffer is the null buffer => add(1)
                 // we assume that pointer is aligned for `i64`, as Large uses 
`i64` offsets.
                 #[allow(clippy::cast_ptr_alignment)]
                 let offset_buffer = self.array.buffer(1) as *const i64;
+                // get first offset
+                let start = (unsafe { *offset_buffer.add(0) }) as usize;
                 // get last offset
-                (unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as 
usize
+                let end = (unsafe { *offset_buffer.add(len / size_of::<i64>() 
- 1) }) as usize;
+                end - start
             }
             // buffer len of primitive types
             _ => {
@@ -1216,7 +1224,7 @@ mod tests_to_then_from_ffi {
 mod tests_from_ffi {
     use std::sync::Arc;
 
-    use arrow_buffer::{bit_util, buffer::Buffer};
+    use arrow_buffer::{bit_util, buffer::Buffer, MutableBuffer, OffsetBuffer};
     use arrow_data::ArrayData;
     use arrow_schema::{DataType, Field};
 
@@ -1228,7 +1236,7 @@ mod tests_from_ffi {
         ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema},
     };
 
-    use super::Result;
+    use super::{ImportedArrowArray, Result};
 
     fn test_round_trip(expected: &ArrayData) -> Result<()> {
         // here we export the array
@@ -1420,4 +1428,34 @@ mod tests_from_ffi {
         let data = array.into_data();
         test_round_trip(&data)
     }
+
+    #[test]
+    fn test_empty_string_with_non_zero_offset() -> Result<()> {
+        // Simulate an empty string array with a non-zero offset from a 
producer
+        let data: Buffer = MutableBuffer::new(0).into();
+        let offsets = OffsetBuffer::new(vec![123].into());
+        let string_array =
+            unsafe { StringArray::new_unchecked(offsets.clone(), data.clone(), 
None) };
+
+        let data = string_array.into_data();
+
+        let array = FFI_ArrowArray::new(&data);
+        let schema = FFI_ArrowSchema::try_from(data.data_type())?;
+
+        let dt = DataType::try_from(&schema)?;
+        let array = Arc::new(array);
+        let imported_array = ImportedArrowArray {
+            array: &array,
+            data_type: dt,
+            owner: &array,
+        };
+
+        let offset_buf_len = imported_array.buffer_len(1, 
&imported_array.data_type)?;
+        let data_buf_len = imported_array.buffer_len(2, 
&imported_array.data_type)?;
+
+        assert_eq!(offset_buf_len, 4);
+        assert_eq!(data_buf_len, 0);
+
+        test_round_trip(&imported_array.consume()?)
+    }
 }
diff --git a/arrow-data/src/equal/variable_size.rs 
b/arrow-data/src/equal/variable_size.rs
index 92f00818b4a..d6e8e6a9548 100644
--- a/arrow-data/src/equal/variable_size.rs
+++ b/arrow-data/src/equal/variable_size.rs
@@ -32,17 +32,18 @@ fn offset_value_equal<T: ArrowNativeType + Integer>(
 ) -> bool {
     let lhs_start = lhs_offsets[lhs_pos].as_usize();
     let rhs_start = rhs_offsets[rhs_pos].as_usize();
-    let lhs_len = lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos];
-    let rhs_len = rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos];
+    let lhs_len = (lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos])
+        .to_usize()
+        .unwrap();
+    let rhs_len = (rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos])
+        .to_usize()
+        .unwrap();
 
-    lhs_len == rhs_len
-        && equal_len(
-            lhs_values,
-            rhs_values,
-            lhs_start,
-            rhs_start,
-            lhs_len.to_usize().unwrap(),
-        )
+    if lhs_len == 0 && rhs_len == 0 {
+        return true;
+    }
+
+    lhs_len == rhs_len && equal_len(lhs_values, rhs_values, lhs_start, 
rhs_start, lhs_len)
 }
 
 pub(super) fn variable_sized_equal<T: ArrowNativeType + Integer>(

Reply via email to