This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 20a569a733f fix: Adjust FFI_ArrowArray offset based on the offset of
offset buffer (#5895)
20a569a733f is described below
commit 20a569a733f450e463eea2d635d052958a71f750
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Wed Jun 19 18:43:05 2024 -0700
fix: Adjust FFI_ArrowArray offset based on the offset of offset buffer
(#5895)
* fix: Adjust FFI_ArrowArray offset based on the offset of offset buffer
* For review
* Fix clippy
* For review
---
arrow-array/src/ffi.rs | 56 ++++++++++++++++++++++++++++++++++++
arrow-buffer/src/buffer/immutable.rs | 13 +++++++++
arrow-data/src/ffi.rs | 55 ++++++++++++++++++++++++++++++++---
3 files changed, 120 insertions(+), 4 deletions(-)
diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs
index 088a0a6ab58..fe755b91f76 100644
--- a/arrow-array/src/ffi.rs
+++ b/arrow-array/src/ffi.rs
@@ -1225,6 +1225,7 @@ mod tests_from_ffi {
use std::sync::Arc;
use arrow_buffer::{bit_util, buffer::Buffer, MutableBuffer, OffsetBuffer};
+ use arrow_data::transform::MutableArrayData;
use arrow_data::ArrayData;
use arrow_schema::{DataType, Field};
@@ -1234,6 +1235,7 @@ mod tests_from_ffi {
Int32Array, Int64Array, StringArray, StructArray, UInt32Array,
UInt64Array,
},
ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema},
+ make_array, ArrayRef,
};
use super::{ImportedArrowArray, Result};
@@ -1458,4 +1460,58 @@ mod tests_from_ffi {
test_round_trip(&imported_array.consume()?)
}
+
+ fn roundtrip_string_array(array: StringArray) -> StringArray {
+ let data = array.into_data();
+
+ let array = FFI_ArrowArray::new(&data);
+ let schema = FFI_ArrowSchema::try_from(data.data_type()).unwrap();
+
+ let array = unsafe { from_ffi(array, &schema) }.unwrap();
+ StringArray::from(array)
+ }
+
+ fn extend_array(array: &dyn Array) -> ArrayRef {
+ let len = array.len();
+ let data = array.to_data();
+
+ let mut mutable = MutableArrayData::new(vec![&data], false, len);
+ mutable.extend(0, 0, len);
+ make_array(mutable.freeze())
+ }
+
+ #[test]
+ fn test_extend_imported_string_slice() {
+ let mut strings = vec![];
+
+ for i in 0..1000 {
+ strings.push(format!("string: {}", i));
+ }
+
+ let string_array = StringArray::from(strings);
+
+ let imported = roundtrip_string_array(string_array.clone());
+ assert_eq!(imported.len(), 1000);
+ assert_eq!(imported.value(0), "string: 0");
+ assert_eq!(imported.value(499), "string: 499");
+
+ let copied = extend_array(&imported);
+ assert_eq!(
+ copied.as_any().downcast_ref::<StringArray>().unwrap(),
+ &imported
+ );
+
+ let slice = string_array.slice(500, 500);
+
+ let imported = roundtrip_string_array(slice);
+ assert_eq!(imported.len(), 500);
+ assert_eq!(imported.value(0), "string: 500");
+ assert_eq!(imported.value(499), "string: 999");
+
+ let copied = extend_array(&imported);
+ assert_eq!(
+ copied.as_any().downcast_ref::<StringArray>().unwrap(),
+ &imported
+ );
+ }
}
diff --git a/arrow-buffer/src/buffer/immutable.rs
b/arrow-buffer/src/buffer/immutable.rs
index f26cde05b7a..2c743842fb0 100644
--- a/arrow-buffer/src/buffer/immutable.rs
+++ b/arrow-buffer/src/buffer/immutable.rs
@@ -71,6 +71,19 @@ impl Buffer {
}
}
+ /// Returns the offset, in bytes, of `Self::ptr` to `Self::data`
+ ///
+ /// self.ptr and self.data can be different after slicing or advancing the
buffer.
+ pub fn ptr_offset(&self) -> usize {
+ // Safety: `ptr` is always in bounds of `data`.
+ unsafe { self.ptr.offset_from(self.data.ptr().as_ptr()) as usize }
+ }
+
+ /// Returns the pointer to the start of the buffer without the offset.
+ pub fn data_ptr(&self) -> NonNull<u8> {
+ self.data.ptr()
+ }
+
/// Create a [`Buffer`] from the provided [`Vec`] without copying
#[inline]
pub fn from_vec<T: ArrowNativeType>(vec: Vec<T>) -> Self {
diff --git a/arrow-data/src/ffi.rs b/arrow-data/src/ffi.rs
index 589f7dac6d1..7fe87d4ae10 100644
--- a/arrow-data/src/ffi.rs
+++ b/arrow-data/src/ffi.rs
@@ -131,6 +131,37 @@ impl FFI_ArrowArray {
data.buffers().iter().map(|b| Some(b.clone())).collect()
};
+ // Handle buffer offset for offset buffer.
+ let offset_offset = match data.data_type() {
+ DataType::Utf8 | DataType::Binary => {
+ // Offset buffer is possible a slice of the buffer.
+ // If we use slice pointer as exported buffer pointer, it will
cause
+ // the consumer to calculate incorrect length of data buffer
(buffer 1).
+ // We need to get the offset of the offset buffer and fill it
in
+ // the `FFI_ArrowArray` offset field.
+ Some(data.buffers()[0].ptr_offset() /
std::mem::size_of::<i32>())
+ }
+ DataType::LargeUtf8 | DataType::LargeBinary => {
+ // Offset buffer is possible a slice of the buffer.
+ // If we use slice pointer as exported buffer pointer, it will
cause
+ // the consumer to calculate incorrect length of data buffer
(buffer 1).
+ // We need to get the offset of the offset buffer and fill it
in
+ // the `FFI_ArrowArray` offset field.
+ Some(data.buffers()[0].ptr_offset() /
std::mem::size_of::<i64>())
+ }
+ _ => None,
+ };
+
+ let offset = if let Some(offset) = offset_offset {
+ if data.offset() != 0 {
+ // TODO: Adjust for data offset
+ panic!("The ArrayData of a slice offset buffer should not have
offset");
+ }
+ offset
+ } else {
+ data.offset()
+ };
+
// `n_buffers` is the number of buffers by the spec.
let n_buffers = {
data_layout.buffers.len() + {
@@ -143,9 +174,25 @@ impl FFI_ArrowArray {
let buffers_ptr = buffers
.iter()
- .flat_map(|maybe_buffer| match maybe_buffer {
- // note that `raw_data` takes into account the buffer's offset
- Some(b) => Some(b.as_ptr() as *const c_void),
+ .enumerate()
+ .flat_map(|(buffer_idx, maybe_buffer)| match maybe_buffer {
+ Some(b) => {
+ match (data.data_type(), buffer_idx) {
+ (
+ DataType::Utf8
+ | DataType::LargeUtf8
+ | DataType::Binary
+ | DataType::LargeBinary,
+ 1,
+ ) => {
+ // For offset buffer, take original pointer
without offset.
+ // Buffer offset should be handled by
`FFI_ArrowArray` offset field.
+ Some(b.data_ptr().as_ptr() as *const c_void)
+ }
+ // For other buffers, note that `raw_data` takes into
account the buffer's offset
+ _ => Some(b.as_ptr() as *const c_void),
+ }
+ }
// This is for null buffer. We only put a null pointer for
// null buffer if by spec it can contain null mask.
None if data_layout.can_contain_null_mask =>
Some(std::ptr::null()),
@@ -186,7 +233,7 @@ impl FFI_ArrowArray {
Self {
length: data.len() as i64,
null_count: null_count as i64,
- offset: data.offset() as i64,
+ offset: offset as i64,
n_buffers,
n_children,
buffers: private_data.buffers_ptr.as_mut_ptr(),