This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 20a569a733f fix: Adjust FFI_ArrowArray offset based on the offset of 
offset buffer (#5895)
20a569a733f is described below

commit 20a569a733f450e463eea2d635d052958a71f750
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Wed Jun 19 18:43:05 2024 -0700

    fix: Adjust FFI_ArrowArray offset based on the offset of offset buffer 
(#5895)
    
    * fix: Adjust FFI_ArrowArray offset based on the offset of offset buffer
    
    * For review
    
    * Fix clippy
    
    * For review
---
 arrow-array/src/ffi.rs               | 56 ++++++++++++++++++++++++++++++++++++
 arrow-buffer/src/buffer/immutable.rs | 13 +++++++++
 arrow-data/src/ffi.rs                | 55 ++++++++++++++++++++++++++++++++---
 3 files changed, 120 insertions(+), 4 deletions(-)

diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs
index 088a0a6ab58..fe755b91f76 100644
--- a/arrow-array/src/ffi.rs
+++ b/arrow-array/src/ffi.rs
@@ -1225,6 +1225,7 @@ mod tests_from_ffi {
     use std::sync::Arc;
 
     use arrow_buffer::{bit_util, buffer::Buffer, MutableBuffer, OffsetBuffer};
+    use arrow_data::transform::MutableArrayData;
     use arrow_data::ArrayData;
     use arrow_schema::{DataType, Field};
 
@@ -1234,6 +1235,7 @@ mod tests_from_ffi {
             Int32Array, Int64Array, StringArray, StructArray, UInt32Array, 
UInt64Array,
         },
         ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema},
+        make_array, ArrayRef,
     };
 
     use super::{ImportedArrowArray, Result};
@@ -1458,4 +1460,58 @@ mod tests_from_ffi {
 
         test_round_trip(&imported_array.consume()?)
     }
+
+    fn roundtrip_string_array(array: StringArray) -> StringArray {
+        let data = array.into_data();
+
+        let array = FFI_ArrowArray::new(&data);
+        let schema = FFI_ArrowSchema::try_from(data.data_type()).unwrap();
+
+        let array = unsafe { from_ffi(array, &schema) }.unwrap();
+        StringArray::from(array)
+    }
+
+    fn extend_array(array: &dyn Array) -> ArrayRef {
+        let len = array.len();
+        let data = array.to_data();
+
+        let mut mutable = MutableArrayData::new(vec![&data], false, len);
+        mutable.extend(0, 0, len);
+        make_array(mutable.freeze())
+    }
+
+    #[test]
+    fn test_extend_imported_string_slice() {
+        let mut strings = vec![];
+
+        for i in 0..1000 {
+            strings.push(format!("string: {}", i));
+        }
+
+        let string_array = StringArray::from(strings);
+
+        let imported = roundtrip_string_array(string_array.clone());
+        assert_eq!(imported.len(), 1000);
+        assert_eq!(imported.value(0), "string: 0");
+        assert_eq!(imported.value(499), "string: 499");
+
+        let copied = extend_array(&imported);
+        assert_eq!(
+            copied.as_any().downcast_ref::<StringArray>().unwrap(),
+            &imported
+        );
+
+        let slice = string_array.slice(500, 500);
+
+        let imported = roundtrip_string_array(slice);
+        assert_eq!(imported.len(), 500);
+        assert_eq!(imported.value(0), "string: 500");
+        assert_eq!(imported.value(499), "string: 999");
+
+        let copied = extend_array(&imported);
+        assert_eq!(
+            copied.as_any().downcast_ref::<StringArray>().unwrap(),
+            &imported
+        );
+    }
 }
diff --git a/arrow-buffer/src/buffer/immutable.rs 
b/arrow-buffer/src/buffer/immutable.rs
index f26cde05b7a..2c743842fb0 100644
--- a/arrow-buffer/src/buffer/immutable.rs
+++ b/arrow-buffer/src/buffer/immutable.rs
@@ -71,6 +71,19 @@ impl Buffer {
         }
     }
 
+    /// Returns the offset, in bytes, of `Self::ptr` to `Self::data`
+    ///
+    /// self.ptr and self.data can be different after slicing or advancing the 
buffer.
+    pub fn ptr_offset(&self) -> usize {
+        // Safety: `ptr` is always in bounds of `data`.
+        unsafe { self.ptr.offset_from(self.data.ptr().as_ptr()) as usize }
+    }
+
+    /// Returns the pointer to the start of the buffer without the offset.
+    pub fn data_ptr(&self) -> NonNull<u8> {
+        self.data.ptr()
+    }
+
     /// Create a [`Buffer`] from the provided [`Vec`] without copying
     #[inline]
     pub fn from_vec<T: ArrowNativeType>(vec: Vec<T>) -> Self {
diff --git a/arrow-data/src/ffi.rs b/arrow-data/src/ffi.rs
index 589f7dac6d1..7fe87d4ae10 100644
--- a/arrow-data/src/ffi.rs
+++ b/arrow-data/src/ffi.rs
@@ -131,6 +131,37 @@ impl FFI_ArrowArray {
             data.buffers().iter().map(|b| Some(b.clone())).collect()
         };
 
+        // Handle buffer offset for offset buffer.
+        let offset_offset = match data.data_type() {
+            DataType::Utf8 | DataType::Binary => {
+                // Offset buffer is possible a slice of the buffer.
+                // If we use slice pointer as exported buffer pointer, it will 
cause
+                // the consumer to calculate incorrect length of data buffer 
(buffer 1).
+                // We need to get the offset of the offset buffer and fill it 
in
+                // the `FFI_ArrowArray` offset field.
+                Some(data.buffers()[0].ptr_offset() / 
std::mem::size_of::<i32>())
+            }
+            DataType::LargeUtf8 | DataType::LargeBinary => {
+                // Offset buffer is possible a slice of the buffer.
+                // If we use slice pointer as exported buffer pointer, it will 
cause
+                // the consumer to calculate incorrect length of data buffer 
(buffer 1).
+                // We need to get the offset of the offset buffer and fill it 
in
+                // the `FFI_ArrowArray` offset field.
+                Some(data.buffers()[0].ptr_offset() / 
std::mem::size_of::<i64>())
+            }
+            _ => None,
+        };
+
+        let offset = if let Some(offset) = offset_offset {
+            if data.offset() != 0 {
+                // TODO: Adjust for data offset
+                panic!("The ArrayData of a slice offset buffer should not have 
offset");
+            }
+            offset
+        } else {
+            data.offset()
+        };
+
         // `n_buffers` is the number of buffers by the spec.
         let n_buffers = {
             data_layout.buffers.len() + {
@@ -143,9 +174,25 @@ impl FFI_ArrowArray {
 
         let buffers_ptr = buffers
             .iter()
-            .flat_map(|maybe_buffer| match maybe_buffer {
-                // note that `raw_data` takes into account the buffer's offset
-                Some(b) => Some(b.as_ptr() as *const c_void),
+            .enumerate()
+            .flat_map(|(buffer_idx, maybe_buffer)| match maybe_buffer {
+                Some(b) => {
+                    match (data.data_type(), buffer_idx) {
+                        (
+                            DataType::Utf8
+                            | DataType::LargeUtf8
+                            | DataType::Binary
+                            | DataType::LargeBinary,
+                            1,
+                        ) => {
+                            // For offset buffer, take original pointer 
without offset.
+                            // Buffer offset should be handled by 
`FFI_ArrowArray` offset field.
+                            Some(b.data_ptr().as_ptr() as *const c_void)
+                        }
+                        // For other buffers, note that `raw_data` takes into 
account the buffer's offset
+                        _ => Some(b.as_ptr() as *const c_void),
+                    }
+                }
                 // This is for null buffer. We only put a null pointer for
                 // null buffer if by spec it can contain null mask.
                 None if data_layout.can_contain_null_mask => 
Some(std::ptr::null()),
@@ -186,7 +233,7 @@ impl FFI_ArrowArray {
         Self {
             length: data.len() as i64,
             null_count: null_count as i64,
-            offset: data.offset() as i64,
+            offset: offset as i64,
             n_buffers,
             n_children,
             buffers: private_data.buffers_ptr.as_mut_ptr(),

Reply via email to