This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new cd39b8c024e Compute data buffer length by using start and end values
in offset buffer (#5741)
cd39b8c024e is described below
commit cd39b8c024ef09ed80f725b34ba06a45a0b7eedc
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Sun May 12 09:01:21 2024 -0700
Compute data buffer length by using start and end values in offset buffer
(#5741)
* Compute data buffer length by offset buffer start and end values
* Update code comment
* Add unit test
* Add round_trip check
* Fix clippy
---
arrow-array/src/ffi.rs | 50 ++++++++++++++++++++++++++++++-----
arrow-data/src/equal/variable_size.rs | 21 ++++++++-------
2 files changed, 55 insertions(+), 16 deletions(-)
diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs
index 7b988bb0747..088a0a6ab58 100644
--- a/arrow-array/src/ffi.rs
+++ b/arrow-array/src/ffi.rs
@@ -425,24 +425,32 @@ impl<'a> ImportedArrowArray<'a> {
(length + 1) * (bits / 8)
}
(DataType::Utf8, 2) | (DataType::Binary, 2) => {
- // the len of the data buffer (buffer 2) equals the last value
of the offset buffer (buffer 1)
+ // the len of the data buffer (buffer 2) equals the difference
between the last value
+ // and the first value of the offset buffer (buffer 1).
let len = self.buffer_len(1, dt)?;
// first buffer is the null buffer => add(1)
// we assume that pointer is aligned for `i32`, as Utf8 uses
`i32` offsets.
#[allow(clippy::cast_ptr_alignment)]
let offset_buffer = self.array.buffer(1) as *const i32;
+ // get first offset
+ let start = (unsafe { *offset_buffer.add(0) }) as usize;
// get last offset
- (unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as
usize
+ let end = (unsafe { *offset_buffer.add(len / size_of::<i32>()
- 1) }) as usize;
+ end - start
}
(DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => {
- // the len of the data buffer (buffer 2) equals the last value
of the offset buffer (buffer 1)
+ // the len of the data buffer (buffer 2) equals the difference
between the last value
+ // and the first value of the offset buffer (buffer 1).
let len = self.buffer_len(1, dt)?;
// first buffer is the null buffer => add(1)
// we assume that pointer is aligned for `i64`, as Large uses
`i64` offsets.
#[allow(clippy::cast_ptr_alignment)]
let offset_buffer = self.array.buffer(1) as *const i64;
+ // get first offset
+ let start = (unsafe { *offset_buffer.add(0) }) as usize;
// get last offset
- (unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as
usize
+ let end = (unsafe { *offset_buffer.add(len / size_of::<i64>()
- 1) }) as usize;
+ end - start
}
// buffer len of primitive types
_ => {
@@ -1216,7 +1224,7 @@ mod tests_to_then_from_ffi {
mod tests_from_ffi {
use std::sync::Arc;
- use arrow_buffer::{bit_util, buffer::Buffer};
+ use arrow_buffer::{bit_util, buffer::Buffer, MutableBuffer, OffsetBuffer};
use arrow_data::ArrayData;
use arrow_schema::{DataType, Field};
@@ -1228,7 +1236,7 @@ mod tests_from_ffi {
ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema},
};
- use super::Result;
+ use super::{ImportedArrowArray, Result};
fn test_round_trip(expected: &ArrayData) -> Result<()> {
// here we export the array
@@ -1420,4 +1428,34 @@ mod tests_from_ffi {
let data = array.into_data();
test_round_trip(&data)
}
+
+ #[test]
+ fn test_empty_string_with_non_zero_offset() -> Result<()> {
+ // Simulate an empty string array with a non-zero offset from a
producer
+ let data: Buffer = MutableBuffer::new(0).into();
+ let offsets = OffsetBuffer::new(vec![123].into());
+ let string_array =
+ unsafe { StringArray::new_unchecked(offsets.clone(), data.clone(),
None) };
+
+ let data = string_array.into_data();
+
+ let array = FFI_ArrowArray::new(&data);
+ let schema = FFI_ArrowSchema::try_from(data.data_type())?;
+
+ let dt = DataType::try_from(&schema)?;
+ let array = Arc::new(array);
+ let imported_array = ImportedArrowArray {
+ array: &array,
+ data_type: dt,
+ owner: &array,
+ };
+
+ let offset_buf_len = imported_array.buffer_len(1,
&imported_array.data_type)?;
+ let data_buf_len = imported_array.buffer_len(2,
&imported_array.data_type)?;
+
+ assert_eq!(offset_buf_len, 4);
+ assert_eq!(data_buf_len, 0);
+
+ test_round_trip(&imported_array.consume()?)
+ }
}
diff --git a/arrow-data/src/equal/variable_size.rs
b/arrow-data/src/equal/variable_size.rs
index 92f00818b4a..d6e8e6a9548 100644
--- a/arrow-data/src/equal/variable_size.rs
+++ b/arrow-data/src/equal/variable_size.rs
@@ -32,17 +32,18 @@ fn offset_value_equal<T: ArrowNativeType + Integer>(
) -> bool {
let lhs_start = lhs_offsets[lhs_pos].as_usize();
let rhs_start = rhs_offsets[rhs_pos].as_usize();
- let lhs_len = lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos];
- let rhs_len = rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos];
+ let lhs_len = (lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos])
+ .to_usize()
+ .unwrap();
+ let rhs_len = (rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos])
+ .to_usize()
+ .unwrap();
- lhs_len == rhs_len
- && equal_len(
- lhs_values,
- rhs_values,
- lhs_start,
- rhs_start,
- lhs_len.to_usize().unwrap(),
- )
+ if lhs_len == 0 && rhs_len == 0 {
+ return true;
+ }
+
+ lhs_len == rhs_len && equal_len(lhs_values, rhs_values, lhs_start,
rhs_start, lhs_len)
}
pub(super) fn variable_sized_equal<T: ArrowNativeType + Integer>(