This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 75f7916d2a Prevent `FixedSizeBinaryArray` `i32` offset overflows (try
2) (#9872)
75f7916d2a is described below
commit 75f7916d2ae8d11320451f1c3e05680d69865d5f
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon May 4 16:39:38 2026 -0400
Prevent `FixedSizeBinaryArray` `i32` offset overflows (try 2) (#9872)
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/pull/9850
# Rationale for this change
`FixedSizeBinaryArray::value_offset_at` works use `i32` arithmetic which
can overflow. For offsets beyond `i32::MAX`, that can be bad
# What changes are included in this PR?
1. Prevent any FixedSizedBinaryArrays from being constructed where the
offset calculation could overflow
2. Add some other overflow checks
As @adamreeve [pointed
out](https://github.com/apache/arrow-rs/pull/9850#discussion_r3164949680)
on https://github.com/apache/arrow-rs/pull/9850 there are several places
where the `i32` arithmetic is problematic in `FixedSizeBinaryArray`. I
will fix them for real in a different, follow on PR, by switching to
entirely `usize` based arithmetic for offset calculations
However, since I hope to backport this PR to older releases, I would
like something that is easy to review and has the least potential for
unintended consequences.
# Are these changes tested?
I added unit tests. However, I can't find any way to fully trigger the
actual paths short of trying to allocate very large arrays, which I
don't think is appropriate for unit tests.
# Are there any user-facing changes?
Better limit checking
---
arrow-array/src/array/fixed_size_binary_array.rs | 120 ++++++++++++++++++++---
1 file changed, 107 insertions(+), 13 deletions(-)
diff --git a/arrow-array/src/array/fixed_size_binary_array.rs
b/arrow-array/src/array/fixed_size_binary_array.rs
index 928aa38a27..3c3dc8f589 100644
--- a/arrow-array/src/array/fixed_size_binary_array.rs
+++ b/arrow-array/src/array/fixed_size_binary_array.rs
@@ -108,7 +108,8 @@ impl FixedSizeBinaryArray {
/// Create a new [`Scalar`] from `value`
pub fn new_scalar(value: impl AsRef<[u8]>) -> Scalar<Self> {
let v = value.as_ref();
- Scalar::new(Self::new(v.len() as _, Buffer::from(v), None))
+ let size = i32::try_from(v.len()).expect("FixedSizeBinaryArray value
length exceeds i32");
+ Scalar::new(Self::new(size, Buffer::from(v), None))
}
/// Create a new [`FixedSizeBinaryArray`] from the provided parts,
returning an error on failure
@@ -121,6 +122,7 @@ impl FixedSizeBinaryArray {
/// * `size < 0`
/// * `values.len() / size != nulls.len()`
/// * `size == 0 && values.len() != 0`
+ /// * `len * size > i32::MAX`
pub fn try_new(
size: i32,
values: Buffer,
@@ -157,6 +159,8 @@ impl FixedSizeBinaryArray {
}
};
+ Self::validate_lengths(s, len)?;
+
Ok(Self {
data_type,
value_data: values,
@@ -166,6 +170,33 @@ impl FixedSizeBinaryArray {
})
}
+ /// Some calculations below use i32 arithmetic which can overflow when
+ /// valid offsets are past i32::MAX. Until that is solved for real do not
+ /// permit constructing any FixedSizeBinaryArray that has a valid offset
+ /// past i32::MAX
+ fn validate_lengths(value_size: usize, len: usize) -> Result<(),
ArrowError> {
+ // the offset is also calculated for the next element (i + 1) so
+ // check `len` (not last element index) to ensure that all offsets are
valid
+ let max_offset = value_size.checked_mul(len).ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "FixedSizeBinaryArray error: value size {value_size} * len
{len} exceeds maximum valid offset"
+ ))
+ })?;
+
+ let max_valid_offset: usize = i32::MAX.try_into().map_err(|_| {
+ ArrowError::InvalidArgumentError(format!(
+ "FixedSizeBinaryArray error: maximum valid offset exceeds
i32::MAX, got {max_offset}"
+ ))
+ })?;
+
+ if max_offset > max_valid_offset {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "FixedSizeBinaryArray error: value size {value_size} * length
{len} exceeds maximum valid offset of {max_valid_offset}"
+ )));
+ };
+ Ok(())
+ }
+
/// Create a new [`FixedSizeBinaryArray`] of length `len` where all values
are null
///
/// # Panics
@@ -174,12 +205,17 @@ impl FixedSizeBinaryArray {
///
/// * `size < 0`
/// * `size * len` would overflow `usize`
+ /// * `size * len > i32::MAX`
+ /// * `size * len * 8` would overflow `usize`
pub fn new_null(size: i32, len: usize) -> Self {
const BITS_IN_A_BYTE: usize = 8;
- let capacity_in_bytes =
size.to_usize().unwrap().checked_mul(len).unwrap();
+ let size_usize = size.to_usize().unwrap();
+ Self::validate_lengths(size_usize, len).unwrap();
+ let capacity_in_bytes = size_usize.checked_mul(len).unwrap();
+ let capacity_in_bits =
capacity_in_bytes.checked_mul(BITS_IN_A_BYTE).unwrap();
Self {
data_type: DataType::FixedSizeBinary(size),
- value_data: MutableBuffer::new_null(capacity_in_bytes *
BITS_IN_A_BYTE).into(),
+ value_data: MutableBuffer::new_null(capacity_in_bits).into(),
nulls: Some(NullBuffer::new_null(len)),
value_length: size,
len,
@@ -347,8 +383,16 @@ impl FixedSizeBinaryArray {
// Now that we know how large each element is we can
reserve
// sufficient capacity in the underlying mutable buffer for
// the data.
- buffer.reserve(iter_size_hint * len);
- buffer.extend_zeros(slice.len() * prepend);
+ if let Some(capacity) = iter_size_hint.checked_mul(len) {
+ buffer.reserve(capacity);
+ }
+ let prepend_zeros =
slice.len().checked_mul(prepend).ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "FixedSizeBinaryArray error: value size {} *
prepend {prepend} exceeds usize",
+ slice.len()
+ ))
+ })?;
+ buffer.extend_zeros(prepend_zeros);
}
bit_util::set_bit(null_buf.as_slice_mut(), len);
buffer.extend_from_slice(slice);
@@ -371,7 +415,13 @@ impl FixedSizeBinaryArray {
let nulls = NullBuffer::from_unsliced_buffer(null_buf, len);
- let size = size.unwrap_or(0) as i32;
+ let size = size.unwrap_or(0);
+ Self::validate_lengths(size, len)?;
+ let size = size.try_into().map_err(|_| {
+ ArrowError::InvalidArgumentError(format!(
+ "FixedSizeBinaryArray value length exceeds i32, got {size}"
+ ))
+ })?;
Ok(Self {
data_type: DataType::FixedSizeBinary(size),
value_data: buffer.into(),
@@ -410,12 +460,20 @@ impl FixedSizeBinaryArray {
T: Iterator<Item = Option<U>>,
U: AsRef<[u8]>,
{
+ let size_usize = size.to_usize().ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!("Size cannot be negative,
got {size}"))
+ })?;
let mut len = 0;
let mut byte = 0;
let iter_size_hint = iter.size_hint().0;
let mut null_buf = MutableBuffer::new(bit_util::ceil(iter_size_hint,
8));
- let mut buffer = MutableBuffer::new(iter_size_hint * (size as usize));
+ let capacity = iter_size_hint.checked_mul(size_usize).ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "FixedSizeBinaryArray error: value size {size_usize} * len
hint {iter_size_hint} exceeds usize"
+ ))
+ })?;
+ let mut buffer = MutableBuffer::new(capacity);
iter.try_for_each(|item| -> Result<(), ArrowError> {
// extend null bitmask by one byte per each 8 items
@@ -427,7 +485,7 @@ impl FixedSizeBinaryArray {
if let Some(slice) = item {
let slice = slice.as_ref();
- if size as usize != slice.len() {
+ if size_usize != slice.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Nested array size mismatch: one is {}, and the other
is {}",
size,
@@ -438,7 +496,7 @@ impl FixedSizeBinaryArray {
bit_util::set_bit(null_buf.as_slice_mut(), len);
buffer.extend_from_slice(slice);
} else {
- buffer.extend_zeros(size as usize);
+ buffer.extend_zeros(size_usize);
}
len += 1;
@@ -447,6 +505,7 @@ impl FixedSizeBinaryArray {
})?;
let nulls = NullBuffer::from_unsliced_buffer(null_buf, len);
+ Self::validate_lengths(size_usize, len)?;
Ok(Self {
data_type: DataType::FixedSizeBinary(size),
@@ -497,7 +556,9 @@ impl FixedSizeBinaryArray {
} else {
let len = slice.len();
size = Some(len);
- buffer.reserve(iter_size_hint * len);
+ if let Some(capacity) = iter_size_hint.checked_mul(len) {
+ buffer.reserve(capacity);
+ }
}
buffer.extend_from_slice(slice);
@@ -513,7 +574,13 @@ impl FixedSizeBinaryArray {
));
}
- let size = size.unwrap_or(0).try_into().unwrap();
+ let size = size.unwrap_or(0);
+ Self::validate_lengths(size, len)?;
+ let size = size.try_into().map_err(|_| {
+ ArrowError::InvalidArgumentError(format!(
+ "FixedSizeBinaryArray value length exceeds i32, got {size}"
+ ))
+ })?;
Ok(Self {
data_type: DataType::FixedSizeBinary(size),
value_data: buffer.into(),
@@ -548,8 +615,15 @@ impl From<ArrayData> for FixedSizeBinaryArray {
_ => panic!("Expected data type to be FixedSizeBinary"),
};
- let size = value_length as usize;
- let value_data = buffers[0].slice_with_length(offset * size, len *
size);
+ let size = value_length
+ .to_usize()
+ .expect("FixedSizeBinaryArray value length must be non-negative");
+ Self::validate_lengths(size, len)
+ .expect("FixedSizeBinaryArray offsets must fit within i32");
+ let value_data = buffers[0].slice_with_length(
+ offset.checked_mul(size).expect("offset overflow"),
+ len.checked_mul(size).expect("length overflow"),
+ );
Self {
data_type,
@@ -1040,6 +1114,26 @@ mod tests {
array.value(4);
}
+ #[test]
+ fn test_validate_lengths_allows_empty_array() {
+ FixedSizeBinaryArray::validate_lengths(1024, 0).unwrap();
+ }
+
+ #[test]
+ fn test_validate_lengths_allows_i32_max_offset() {
+ FixedSizeBinaryArray::validate_lengths(1, i32::MAX as usize).unwrap();
+ FixedSizeBinaryArray::validate_lengths(262_176, 8191).unwrap();
+ }
+
+ #[test]
+ fn test_validate_lengths_rejects_offset_past_i32_max() {
+ let err = FixedSizeBinaryArray::validate_lengths(262_177,
8192).unwrap_err();
+ assert_eq!(
+ err.to_string(),
+ "Invalid argument error: FixedSizeBinaryArray error: value size
262177 * length 8192 exceeds maximum valid offset of 2147483647",
+ );
+ }
+
#[test]
fn test_constructors() {
let buffer = Buffer::from_vec(vec![0_u8; 10]);