This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new f1ef71aa5a [arrow-array] use usize arithmetic in FixedSizeBinaryArray,
aggressive overflow checks (#9910)
f1ef71aa5a is described below
commit f1ef71aa5aab3d5f3e3ff46f63dd198cf5da7b38
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu May 14 16:42:58 2026 -0400
[arrow-array] use usize arithmetic in FixedSizeBinaryArray, aggressive
overflow checks (#9910)
# Which issue does this PR close?
- Closes #9906.
# Rationale for this change
`FixedSizeBinaryArray` still stores its public fixed width as `i32`,
which means internal address calculations rely on repeated conversions
between `i32` and pointer-sized offsets. We recently had issue with some
i32 based arithmetic overflowing (see
https://github.com/apache/arrow-rs/issues/9898)
To avoid inadvertently using i32 arithmetic, this PR proposes to change
the internal representation of the FixedSizeBinaryArray to use `usize`
and compute byte positions using `usize` ( pointer-sized arithmetic)
directly, with checked conversions only at the public API boundaries
that still require `i32`.
I am quite pleased it is a net reduction in lines of code (admittedly
most of that was the checks added in
https://github.com/apache/arrow-rs/pull/9872
# What changes are included in this PR?
- Store the fixed-width element size as `value_size: usize` inside
`FixedSizeBinaryArray`.
- Rewrite internal position calculations in accessors and slicing to use
`usize` arithmetic.
- Remove the old `validate_lengths` invariant that existed only to keep
internal `i32` offset arithmetic in range.
- Remove implicit `as` casts from the implementation and replace them
with checked conversions or typed bindings.
# Are these changes tested?
These changes are covered by CI.
# Are there any user-facing changes?
No.
---------
Co-authored-by: Adam Reeve <[email protected]>
Co-authored-by: Adam Reeve <[email protected]>
---
arrow-array/src/array/fixed_size_binary_array.rs | 153 ++++++++-------------
.../src/builder/fixed_size_binary_builder.rs | 13 +-
arrow-schema/src/datatype.rs | 8 +-
arrow-string/src/substring.rs | 59 ++++----
4 files changed, 102 insertions(+), 131 deletions(-)
diff --git a/arrow-array/src/array/fixed_size_binary_array.rs
b/arrow-array/src/array/fixed_size_binary_array.rs
index a54496addc..a44d21ad5b 100644
--- a/arrow-array/src/array/fixed_size_binary_array.rs
+++ b/arrow-array/src/array/fixed_size_binary_array.rs
@@ -88,11 +88,23 @@ use std::sync::Arc;
///
#[derive(Clone)]
pub struct FixedSizeBinaryArray {
- data_type: DataType, // Must be DataType::FixedSizeBinary(value_length)
+ /// Must be DataType::FixedSizeBinary(value_size)
+ data_type: DataType,
+ /// `len` values, each `value_size` bytes
value_data: Buffer,
+ /// Optional Null Buffer
nulls: Option<NullBuffer>,
+ /// Number of elements in the array
len: usize,
- value_length: i32,
+ /// size of each element, validated to fit in a positive i32
+ ///
+ /// Corresponds to the [`byteWidth` field] in the Arrow Spec
+ ///
+ /// note: Arrow stores `value_len` using i32. This implementation stores it
+ /// as a usize to ensure correct offset calculations.
+ ///
+ /// [`byteWidth` field]:
https://github.com/apache/arrow/blob/2a89d03bbefd620b42126b8e00f8ae57e99cd638/format/Schema.fbs#L211
+ value_size: usize,
}
impl FixedSizeBinaryArray {
@@ -124,7 +136,6 @@ impl FixedSizeBinaryArray {
/// * `value_length < 0`
/// * `values.len() / value_length != nulls.len()`
/// * `value_length == 0 && values.len() != 0`
- /// * `len * value_length > i32::MAX`
pub fn try_new(
value_length: i32,
values: Buffer,
@@ -163,44 +174,15 @@ impl FixedSizeBinaryArray {
}
};
- Self::validate_lengths(value_size, len)?;
-
Ok(Self {
data_type,
value_data: values,
- value_length,
+ value_size,
nulls,
len,
})
}
- /// Some calculations below use i32 arithmetic which can overflow when
- /// valid offsets are past i32::MAX. Until that is solved for real do not
- /// permit constructing any FixedSizeBinaryArray that has a valid offset
- /// past i32::MAX
- fn validate_lengths(value_size: usize, len: usize) -> Result<(),
ArrowError> {
- // the offset is also calculated for the next element (i + 1) so
- // check `len` (not last element index) to ensure that all offsets are
valid
- let max_offset = value_size.checked_mul(len).ok_or_else(|| {
- ArrowError::InvalidArgumentError(format!(
- "FixedSizeBinaryArray error: value size {value_size} * len
{len} exceeds maximum valid offset"
- ))
- })?;
-
- let max_valid_offset: usize = i32::MAX.try_into().map_err(|_| {
- ArrowError::InvalidArgumentError(format!(
- "FixedSizeBinaryArray error: maximum valid offset exceeds
i32::MAX, got {max_offset}"
- ))
- })?;
-
- if max_offset > max_valid_offset {
- return Err(ArrowError::InvalidArgumentError(format!(
- "FixedSizeBinaryArray error: value size {value_size} * length
{len} exceeds maximum valid offset of {max_valid_offset}"
- )));
- };
- Ok(())
- }
-
/// Create a new [`FixedSizeBinaryArray`] of length `len` where all values
are null
///
/// # Panics
@@ -209,26 +191,25 @@ impl FixedSizeBinaryArray {
///
/// * `value_length < 0`
/// * `value_length * len` would overflow `usize`
- /// * `value_length * len > i32::MAX`
/// * `value_length * len * 8` would overflow `usize`
pub fn new_null(value_length: i32, len: usize) -> Self {
const BITS_IN_A_BYTE: usize = 8;
let value_size = value_length.to_usize().unwrap();
- Self::validate_lengths(value_size, len).unwrap();
let capacity_in_bytes = value_size.checked_mul(len).unwrap();
let capacity_in_bits =
capacity_in_bytes.checked_mul(BITS_IN_A_BYTE).unwrap();
Self {
data_type: DataType::FixedSizeBinary(value_length),
value_data: MutableBuffer::new_null(capacity_in_bits).into(),
nulls: Some(NullBuffer::new_null(len)),
- value_length,
+ value_size,
len,
}
}
/// Deconstruct this array into its constituent parts
pub fn into_parts(self) -> (i32, Buffer, Option<NullBuffer>) {
- (self.value_length, self.value_data, self.nulls)
+ let value_length = self.value_length();
+ (value_length, self.value_data, self.nulls)
}
/// Returns the element at index `i` as a byte slice.
@@ -239,19 +220,14 @@ impl FixedSizeBinaryArray {
/// # Panics
/// Panics if index `i` is out of bounds.
pub fn value(&self, i: usize) -> &[u8] {
+ let len = self.len();
assert!(
- i < self.len(),
- "Trying to access an element at index {} from a
FixedSizeBinaryArray of length {}",
- i,
- self.len()
+ i < len,
+ "Trying to access an element at index {i} from a
FixedSizeBinaryArray of length {len}",
);
- let offset = i + self.offset();
+ let position = i * self.value_size;
unsafe {
- let pos = self.value_offset_at(offset);
- std::slice::from_raw_parts(
- self.value_data.as_ptr().offset(pos as isize),
- (self.value_offset_at(offset + 1) - pos) as usize,
- )
+ std::slice::from_raw_parts(self.value_data.as_ptr().add(position),
self.value_size)
}
}
@@ -265,30 +241,46 @@ impl FixedSizeBinaryArray {
/// Caller is responsible for ensuring that the index is within the bounds
/// of the array
pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
- let offset = i + self.offset();
- let pos = self.value_offset_at(offset);
+ let position = i * self.value_size;
unsafe {
- std::slice::from_raw_parts(
- self.value_data.as_ptr().offset(pos as isize),
- (self.value_offset_at(offset + 1) - pos) as usize,
- )
+ std::slice::from_raw_parts(self.value_data.as_ptr().add(position),
self.value_size)
}
}
/// Returns the offset for the element at index `i`.
///
/// Note this doesn't do any bound checking, for performance reason.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the computed byte offset exceeds `i32::MAX`.
+ #[deprecated(since = "59.0.0", note = "Use i * value_size() instead")]
#[inline]
pub fn value_offset(&self, i: usize) -> i32 {
- self.value_offset_at(self.offset() + i)
+ self.value_length() * i as i32
}
/// Returns the length for an element.
///
/// All elements have the same length as the array is a fixed size.
+ ///
+ /// Returns an `i32` to be compatible with the Arrow spec.
+ ///
+ /// Use [`Self::value_size`] to return a `usize`.
#[inline]
pub fn value_length(&self) -> i32 {
- self.value_length
+ // This is safe: constructor validated that value_size was a valid i32
+ self.value_size as i32
+ }
+
+ /// Return the length for an element, as a usize.
+ ///
+ /// All elements have the same length as the array is a fixed size.
+ ///
+ /// Note: This value will always fit, without overflow, into an i32
+ #[inline]
+ pub fn value_size(&self) -> usize {
+ self.value_size
}
/// Returns the values of this array.
@@ -311,14 +303,16 @@ impl FixedSizeBinaryArray {
offset.saturating_add(len) <= self.len,
"the length + offset of the sliced FixedSizeBinaryArray cannot
exceed the existing length"
);
-
- let size = self.value_length as usize;
+ let offset_bytes = offset
+ .checked_mul(self.value_size)
+ .expect("offset overflow");
+ let len_bytes = len.checked_mul(self.value_size).expect("offset
overflow");
Self {
data_type: self.data_type.clone(),
nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)),
- value_length: self.value_length,
- value_data: self.value_data.slice_with_length(offset * size, len *
size),
+ value_size: self.value_size,
+ value_data: self.value_data.slice_with_length(offset_bytes,
len_bytes),
len,
}
}
@@ -420,7 +414,6 @@ impl FixedSizeBinaryArray {
let nulls = NullBuffer::from_unsliced_buffer(null_buf, len);
let value_size = value_size.unwrap_or(0);
- Self::validate_lengths(value_size, len)?;
let value_length = value_size.try_into().map_err(|_| {
ArrowError::InvalidArgumentError(format!(
"FixedSizeBinaryArray value length exceeds i32, got
{value_size}"
@@ -430,7 +423,7 @@ impl FixedSizeBinaryArray {
data_type: DataType::FixedSizeBinary(value_length),
value_data: buffer.into(),
nulls,
- value_length,
+ value_size,
len,
})
}
@@ -514,14 +507,13 @@ impl FixedSizeBinaryArray {
})?;
let nulls = NullBuffer::from_unsliced_buffer(null_buf, len);
- Self::validate_lengths(value_size, len)?;
Ok(Self {
data_type: DataType::FixedSizeBinary(value_length),
value_data: buffer.into(),
nulls,
len,
- value_length,
+ value_size,
})
}
@@ -583,7 +575,6 @@ impl FixedSizeBinaryArray {
}
let value_size = value_size.unwrap_or(0);
- Self::validate_lengths(value_size, len)?;
let value_length = value_size.try_into().map_err(|_| {
ArrowError::InvalidArgumentError(format!(
"FixedSizeBinaryArray value length exceeds i32, got
{value_size}"
@@ -593,16 +584,11 @@ impl FixedSizeBinaryArray {
data_type: DataType::FixedSizeBinary(value_length),
value_data: buffer.into(),
nulls: None,
- value_length,
+ value_size,
len,
})
}
- #[inline]
- fn value_offset_at(&self, i: usize) -> i32 {
- self.value_length * i as i32
- }
-
/// constructs a new iterator
pub fn iter(&self) -> FixedSizeBinaryIter<'_> {
FixedSizeBinaryIter::new(self)
@@ -626,8 +612,6 @@ impl From<ArrayData> for FixedSizeBinaryArray {
let value_size = value_length
.to_usize()
.expect("FixedSizeBinaryArray value length must be non-negative");
- Self::validate_lengths(value_size, len)
- .expect("FixedSizeBinaryArray offsets must fit within i32");
let value_data = buffers[0].slice_with_length(
offset.checked_mul(value_size).expect("offset overflow"),
len.checked_mul(value_size).expect("length overflow"),
@@ -638,7 +622,7 @@ impl From<ArrayData> for FixedSizeBinaryArray {
nulls,
len,
value_data,
- value_length,
+ value_size,
}
}
}
@@ -849,7 +833,6 @@ mod tests {
fixed_size_binary_array.value(2)
);
assert_eq!(5, fixed_size_binary_array.value_length());
- assert_eq!(10, fixed_size_binary_array.value_offset(2));
for i in 0..3 {
assert!(fixed_size_binary_array.is_valid(i));
assert!(!fixed_size_binary_array.is_null(i));
@@ -872,9 +855,7 @@ mod tests {
fixed_size_binary_array.value(1)
);
assert_eq!(2, fixed_size_binary_array.len());
- assert_eq!(0, fixed_size_binary_array.value_offset(0));
assert_eq!(5, fixed_size_binary_array.value_length());
- assert_eq!(5, fixed_size_binary_array.value_offset(1));
}
#[test]
@@ -1122,26 +1103,6 @@ mod tests {
array.value(4);
}
- #[test]
- fn test_validate_lengths_allows_empty_array() {
- FixedSizeBinaryArray::validate_lengths(1024, 0).unwrap();
- }
-
- #[test]
- fn test_validate_lengths_allows_i32_max_offset() {
- FixedSizeBinaryArray::validate_lengths(1, i32::MAX as usize).unwrap();
- FixedSizeBinaryArray::validate_lengths(262_176, 8191).unwrap();
- }
-
- #[test]
- fn test_validate_lengths_rejects_offset_past_i32_max() {
- let err = FixedSizeBinaryArray::validate_lengths(262_177,
8192).unwrap_err();
- assert_eq!(
- err.to_string(),
- "Invalid argument error: FixedSizeBinaryArray error: value size
262177 * length 8192 exceeds maximum valid offset of 2147483647",
- );
- }
-
#[test]
fn test_constructors() {
let buffer = Buffer::from_vec(vec![0_u8; 10]);
diff --git a/arrow-array/src/builder/fixed_size_binary_builder.rs
b/arrow-array/src/builder/fixed_size_binary_builder.rs
index f6b4c33d94..b033ed0ff3 100644
--- a/arrow-array/src/builder/fixed_size_binary_builder.rs
+++ b/arrow-array/src/builder/fixed_size_binary_builder.rs
@@ -206,11 +206,11 @@ mod tests {
assert_eq!(&DataType::FixedSizeBinary(5), array.data_type());
assert_eq!(6, array.len());
assert_eq!(3, array.null_count());
- assert_eq!(10, array.value_offset(2));
- assert_eq!(15, array.value_offset(3));
assert_eq!(5, array.value_length());
+ assert_eq!(b"arrow", array.value(2));
assert!(array.is_null(3));
assert!(array.is_null(4));
+ assert_eq!(b"world", array.value(5));
}
#[test]
@@ -226,8 +226,8 @@ mod tests {
assert_eq!(&DataType::FixedSizeBinary(5), array.data_type());
assert_eq!(3, array.len());
assert_eq!(1, array.null_count());
- assert_eq!(10, array.value_offset(2));
assert_eq!(5, array.value_length());
+ assert_eq!(b"arrow", array.value(2));
// [b"finis", null, "clone"]
builder.append_value(b"finis").unwrap();
@@ -239,8 +239,8 @@ mod tests {
assert_eq!(&DataType::FixedSizeBinary(5), array.data_type());
assert_eq!(6, array.len());
assert_eq!(2, array.null_count());
- assert_eq!(25, array.value_offset(5));
assert_eq!(5, array.value_length());
+ assert_eq!(b"clone", array.value(5));
}
#[test]
@@ -256,7 +256,6 @@ mod tests {
assert_eq!(&DataType::FixedSizeBinary(0), array.data_type());
assert_eq!(3, array.len());
assert_eq!(1, array.null_count());
- assert_eq!(0, array.value_offset(2));
assert_eq!(0, array.value_length());
assert_eq!(b"", array.value(0));
assert_eq!(b"", array.value(2));
@@ -307,10 +306,6 @@ mod tests {
assert_eq!(&DataType::FixedSizeBinary(5), array.data_type());
assert_eq!(6, array.len());
assert_eq!(2, array.null_count());
- for i in 0..6 {
- assert_eq!(i * 5, array.value_offset(i as usize));
- }
-
assert_eq!(b"hello", array.value(0));
assert!(array.is_null(1));
assert_eq!(b"arrow", array.value(2));
diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs
index 2eae433f4d..927ad221e9 100644
--- a/arrow-schema/src/datatype.rs
+++ b/arrow-schema/src/datatype.rs
@@ -273,7 +273,11 @@ pub enum DataType {
/// of binary data in total.
Binary,
/// Opaque binary data of fixed size.
- /// Enum parameter specifies the number of bytes per value.
+ ///
+ /// Enum parameter specifies the number of bytes per value, defined by the
+ /// [`byteWidth` field] in the Arrow Spec
+ ///
+ /// [`byteWidth` field]:
https://github.com/apache/arrow/blob/2a89d03bbefd620b42126b8e00f8ae57e99cd638/format/Schema.fbs#L211
FixedSizeBinary(i32),
/// Opaque binary data of variable length and 64-bit offsets.
///
@@ -312,7 +316,6 @@ pub enum DataType {
///
/// A single List array can store up to [`i32::MAX`] elements in total.
List(FieldRef),
-
/// A list of some logical data type with variable length.
///
/// Logically the same as [`List`], but the internal representation
differs in how child
@@ -326,7 +329,6 @@ pub enum DataType {
///
/// A single LargeList array can store up to [`i64::MAX`] elements in
total.
LargeList(FieldRef),
-
/// A list of some logical data type with variable length and 64-bit
offsets.
///
/// Logically the same as [`LargeList`], but the internal representation
differs in how child
diff --git a/arrow-string/src/substring.rs b/arrow-string/src/substring.rs
index 3dc3bd695c..123c77cdf9 100644
--- a/arrow-string/src/substring.rs
+++ b/arrow-string/src/substring.rs
@@ -124,15 +124,20 @@ pub fn substring(
start as i32,
length.map(|e| e as i32),
),
- DataType::FixedSizeBinary(old_len) => fixed_size_binary_substring(
- array
- .as_any()
- .downcast_ref::<FixedSizeBinaryArray>()
- .expect("a fixed size binary is expected"),
- *old_len,
- start as i32,
- length.map(|e| e as i32),
- ),
+ DataType::FixedSizeBinary(old_len) => {
+ let old_len: usize = (*old_len)
+ .try_into()
+ .expect("negative FixedSizeBinary value length");
+ fixed_size_binary_substring(
+ array
+ .as_any()
+ .downcast_ref::<FixedSizeBinaryArray>()
+ .expect("a fixed size binary is expected"),
+ old_len,
+ start,
+ length,
+ )
+ }
DataType::LargeUtf8 => byte_substring(
array
.as_any()
@@ -322,31 +327,37 @@ where
fn fixed_size_binary_substring(
array: &FixedSizeBinaryArray,
- old_len: i32,
- start: i32,
- length: Option<i32>,
+ old_len: usize,
+ start: i64,
+ length: Option<u64>,
) -> Result<ArrayRef, ArrowError> {
- let new_start = if start >= 0 {
- start.min(old_len)
- } else {
- (old_len + start).max(0)
+ let new_start = match start.cmp(&0) {
+ Ordering::Greater =>
usize::try_from(start).unwrap_or(usize::MAX).min(old_len),
+ Ordering::Equal => 0,
+ Ordering::Less => {
+ let offset =
usize::try_from(start.unsigned_abs()).unwrap_or(usize::MAX);
+ old_len.saturating_sub(offset)
+ }
};
+
let new_len = match length {
- Some(len) => len.min(old_len - new_start),
+ Some(len) => usize::try_from(len)
+ .unwrap_or(usize::MAX)
+ .min(old_len - new_start),
None => old_len - new_start,
};
// build value buffer
let num_of_elements = array.len();
let data = array.value_data();
- let mut new_values = MutableBuffer::new(num_of_elements * (new_len as
usize));
+ let capacity = num_of_elements
+ .checked_mul(new_len)
+ .expect("capacity overflow");
+ let mut new_values = MutableBuffer::new(capacity);
(0..num_of_elements)
.map(|idx| {
- let offset = array.value_offset(idx);
- (
- (offset + new_start) as usize,
- (offset + new_start + new_len) as usize,
- )
+ let offset = idx * array.value_size();
+ (offset + new_start, offset + new_start + new_len)
})
.for_each(|(start, end)|
new_values.extend_from_slice(&data[start..end]));
@@ -363,6 +374,8 @@ fn fixed_size_binary_substring(
nulls = Some(NullBuffer::new_valid(num_of_elements));
}
+ let new_len: i32 = new_len.try_into().expect("new_len overflow");
+
Ok(Arc::new(FixedSizeBinaryArray::new(
new_len,
new_values.into(),