This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch 57_maintenance
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/57_maintenance by this push:
new 33d94791b9 [57_maintenance] Prevent ArrayData validation length
overflow (#9816) (#9925)
33d94791b9 is described below
commit 33d94791b9b7abed54a78e3bafad04b827051870
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed May 6 10:34:15 2026 -0400
[57_maintenance] Prevent ArrayData validation length overflow (#9816)
(#9925)
- Part of https://github.com/apache/arrow-rs/issues/9858
- Fixes https://github.com/apache/arrow-rs/issues/9900 in 57.x releases
This PR:
- Backports https://github.com/apache/arrow-rs/pull/9816 from @alamb to
the `57_maintenance` line
---
arrow-data/src/data.rs | 139 +++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 122 insertions(+), 17 deletions(-)
diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index df3b2c578f..4522cea544 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -253,6 +253,18 @@ pub struct ArrayData {
/// A thread-safe, shared reference to the Arrow array data.
pub type ArrayDataRef = Arc<ArrayData>;
+fn checked_len_plus_offset(
+ data_type: &DataType,
+ len: usize,
+ offset: usize,
+) -> Result<usize, ArrowError> {
+ len.checked_add(offset).ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "Length {len} with offset {offset} overflows usize for {data_type}"
+ ))
+ })
+}
+
impl ArrayData {
/// Create a new ArrayData instance;
///
@@ -321,7 +333,8 @@ impl ArrayData {
// because we use this buffer to calculate `null_count`
// in `Self::new_unchecked`.
if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
- let needed_len = bit_util::ceil(len + offset, 8);
+ let len_plus_offset = checked_len_plus_offset(&data_type, len,
offset)?;
+ let needed_len = bit_util::ceil(len_plus_offset, 8);
if null_bit_buffer.len() < needed_len {
return Err(ArrowError::InvalidArgumentError(format!(
"null_bit_buffer size too small. got {} needed {}",
@@ -755,8 +768,8 @@ impl ArrayData {
/// See [ArrayData::validate_data] to validate fully the offset content
/// and the validity of utf8 data
pub fn validate(&self) -> Result<(), ArrowError> {
- // Need at least this mich space in each buffer
- let len_plus_offset = self.len + self.offset;
+ // Need at least this much space in each buffer
+ let len_plus_offset = checked_len_plus_offset(&self.data_type,
self.len, self.offset)?;
// Check that the data layout conforms to the spec
let layout = layout(&self.data_type);
@@ -906,7 +919,9 @@ impl ArrayData {
return Ok(&[]);
}
- self.typed_buffer(0, self.len + 1)
+ let len = checked_len_plus_offset(&self.data_type, self.len, 1)?;
+
+ self.typed_buffer(0, len)
}
/// Returns a reference to the data in `buffers[idx]` as a typed slice
after validating
@@ -917,7 +932,14 @@ impl ArrayData {
) -> Result<&[T], ArrowError> {
let buffer = &self.buffers[idx];
- let required_len = (len + self.offset) * mem::size_of::<T>();
+ let required_elements = checked_len_plus_offset(&self.data_type, len,
self.offset)?;
+ let byte_width = mem::size_of::<T>();
+ let required_len =
required_elements.checked_mul(byte_width).ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "Buffer {idx} of {} byte length overflow: {} elements of {}
bytes exceeds usize",
+ self.data_type, required_elements, byte_width
+ ))
+ })?;
if buffer.len() < required_len {
return Err(ArrowError::InvalidArgumentError(format!(
@@ -929,7 +951,7 @@ impl ArrayData {
)));
}
- Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
+ Ok(&buffer.typed_data::<T>()[self.offset..required_elements])
}
/// Does a cheap sanity check that the `self.len` values in `buffer` are
valid
@@ -1113,13 +1135,15 @@ impl ArrayData {
for (i, (_, field)) in fields.iter().enumerate() {
let field_data = self.get_valid_child_data(i,
field.data_type())?;
- if mode == &UnionMode::Sparse && field_data.len <
(self.len + self.offset) {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Sparse union child array #{} has length smaller
than expected for union array ({} < {})",
- i,
- field_data.len,
- self.len + self.offset
- )));
+ if mode == &UnionMode::Sparse {
+ let len_plus_offset =
+ checked_len_plus_offset(&self.data_type, self.len,
self.offset)?;
+ if field_data.len < len_plus_offset {
+ return
Err(ArrowError::InvalidArgumentError(format!(
+ "Sparse union child array #{} has length
smaller than expected for union array ({} < {})",
+ i, field_data.len, len_plus_offset
+ )));
+ }
}
}
Ok(())
@@ -1495,7 +1519,7 @@ impl ArrayData {
where
T: ArrowNativeType + TryInto<i64> + num_traits::Num +
std::fmt::Display,
{
- let required_len = self.len + self.offset;
+ let required_len = checked_len_plus_offset(&self.data_type, self.len,
self.offset)?;
let buffer = &self.buffers[0];
// This should have been checked as part of `validate()` prior
@@ -1503,7 +1527,7 @@ impl ArrayData {
assert!(buffer.len() / mem::size_of::<T>() >= required_len);
// Justification: buffer size was validated above
- let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset
+ self.len];
+ let indexes: &[T] =
&buffer.typed_data::<T>()[self.offset..required_len];
indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
// Do not check the value is null (value can be arbitrary)
@@ -1553,10 +1577,11 @@ impl ArrayData {
Ok(())
})?;
- if prev_value.as_usize() < (self.offset + self.len) {
+ let len_plus_offset = checked_len_plus_offset(&self.data_type,
self.len, self.offset)?;
+ if prev_value.as_usize() < len_plus_offset {
return Err(ArrowError::InvalidArgumentError(format!(
"The offset + length of array should be less or equal to last
value in the run_ends array. The last value of run_ends array is {prev_value}
and offset + length of array is {}.",
- self.offset + self.len
+ len_plus_offset
)));
}
Ok(())
@@ -2289,6 +2314,86 @@ mod tests {
assert_eq!(data.null_count() - 1, new_data.null_count());
}
+ #[test]
+ fn test_typed_offsets_length_overflow() {
+ let data = ArrayData {
+ data_type: DataType::Binary,
+ len: usize::MAX,
+ offset: 0,
+ buffers: vec![Buffer::from_slice_ref([0_i32])],
+ child_data: vec![],
+ nulls: None,
+ };
+ let err = data.typed_offsets::<i32>().unwrap_err();
+
+ assert_eq!(
+ err.to_string(),
+ format!(
+ "Invalid argument error: Length {} with offset 1 overflows
usize for Binary",
+ usize::MAX
+ )
+ );
+ }
+
+ #[test]
+ fn test_validate_typed_buffer_length_overflow() {
+ let data = ArrayData {
+ data_type: DataType::Binary,
+ len: 0,
+ offset: 2,
+ buffers: vec![Buffer::from_slice_ref([0_i32])],
+ child_data: vec![],
+ nulls: None,
+ };
+ let err = data.typed_buffer::<i32>(0, usize::MAX).unwrap_err();
+
+ assert_eq!(
+ err.to_string(),
+ format!(
+ "Invalid argument error: Length {} with offset 2 overflows
usize for Binary",
+ usize::MAX
+ )
+ );
+ }
+
+ // Exercises ArrayData::try_new with len + offset overflowing
+ fn try_new_binary_length_offset_overflow() -> Result<ArrayData,
ArrowError> {
+ ArrayData::try_new(
+ DataType::Binary,
+ usize::MAX,
+ None,
+ 1,
+ vec![
+ Buffer::from_slice_ref([0_i32]),
+ Buffer::from_iter(std::iter::empty::<u8>()),
+ ],
+ vec![],
+ )
+ }
+
+ #[cfg(not(feature = "force_validate"))]
+ #[test]
+ fn test_try_new_length_offset_overflow() {
+ let err = try_new_binary_length_offset_overflow().unwrap_err();
+
+ assert_eq!(
+ err.to_string(),
+ format!(
+ "Invalid argument error: Length {} with offset 1 overflows
usize for Binary",
+ usize::MAX
+ )
+ );
+ }
+
+ #[cfg(feature = "force_validate")]
+ #[test]
+ #[should_panic(
+ expected = "Length 18446744073709551615 with offset 1 overflows usize
for Binary"
+ )]
+ fn test_try_new_length_offset_overflow_force_validate() {
+ try_new_binary_length_offset_overflow().unwrap();
+ }
+
#[test]
fn test_equality() {
let int_data = ArrayData::builder(DataType::Int32)