This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch 56_maintenance
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/56_maintenance by this push:
     new c1e887cdea [56_maintenance] Prevent ArrayData validation length 
overflow (#9816) (#9914)
c1e887cdea is described below

commit c1e887cdead7af41708bca10864f358c0fb06f43
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed May 6 10:33:49 2026 -0400

    [56_maintenance] Prevent ArrayData validation length overflow (#9816) 
(#9914)
    
    - Part of https://github.com/apache/arrow-rs/issues/9857
    - Fixes https://github.com/apache/arrow-rs/issues/9900 in 56.x releases
    
    This PR:
    - Backports https://github.com/apache/arrow-rs/pull/9816 from @alamb to
    the `56_maintenance` line
---
 arrow-data/src/data.rs | 137 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 122 insertions(+), 15 deletions(-)

diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index fca19bc3aa..c8202f5641 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -253,6 +253,18 @@ pub struct ArrayData {
 /// A thread-safe, shared reference to the Arrow array data.
 pub type ArrayDataRef = Arc<ArrayData>;
 
+fn checked_len_plus_offset(
+    data_type: &DataType,
+    len: usize,
+    offset: usize,
+) -> Result<usize, ArrowError> {
+    len.checked_add(offset).ok_or_else(|| {
+        ArrowError::InvalidArgumentError(format!(
+            "Length {len} with offset {offset} overflows usize for {data_type}"
+        ))
+    })
+}
+
 impl ArrayData {
     /// Create a new ArrayData instance;
     ///
@@ -321,7 +333,8 @@ impl ArrayData {
         // because we use this buffer to calculate `null_count`
         // in `Self::new_unchecked`.
         if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
-            let needed_len = bit_util::ceil(len + offset, 8);
+            let len_plus_offset = checked_len_plus_offset(&data_type, len, 
offset)?;
+            let needed_len = bit_util::ceil(len_plus_offset, 8);
             if null_bit_buffer.len() < needed_len {
                 return Err(ArrowError::InvalidArgumentError(format!(
                     "null_bit_buffer size too small. got {} needed {}",
@@ -748,8 +761,8 @@ impl ArrayData {
     /// See [ArrayData::validate_data] to validate fully the offset content
     /// and the validity of utf8 data
     pub fn validate(&self) -> Result<(), ArrowError> {
-        // Need at least this mich space in each buffer
-        let len_plus_offset = self.len + self.offset;
+        // Need at least this much space in each buffer
+        let len_plus_offset = checked_len_plus_offset(&self.data_type, 
self.len, self.offset)?;
 
         // Check that the data layout conforms to the spec
         let layout = layout(&self.data_type);
@@ -892,7 +905,9 @@ impl ArrayData {
             return Ok(&[]);
         }
 
-        self.typed_buffer(0, self.len + 1)
+        let len = checked_len_plus_offset(&self.data_type, self.len, 1)?;
+
+        self.typed_buffer(0, len)
     }
 
     /// Returns a reference to the data in `buffers[idx]` as a typed slice 
after validating
@@ -903,7 +918,14 @@ impl ArrayData {
     ) -> Result<&[T], ArrowError> {
         let buffer = &self.buffers[idx];
 
-        let required_len = (len + self.offset) * mem::size_of::<T>();
+        let required_elements = checked_len_plus_offset(&self.data_type, len, 
self.offset)?;
+        let byte_width = mem::size_of::<T>();
+        let required_len = 
required_elements.checked_mul(byte_width).ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Buffer {idx} of {} byte length overflow: {} elements of {} 
bytes exceeds usize",
+                self.data_type, required_elements, byte_width
+            ))
+        })?;
 
         if buffer.len() < required_len {
             return Err(ArrowError::InvalidArgumentError(format!(
@@ -915,7 +937,7 @@ impl ArrayData {
             )));
         }
 
-        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
+        Ok(&buffer.typed_data::<T>()[self.offset..required_elements])
     }
 
     /// Does a cheap sanity check that the `self.len` values in `buffer` are 
valid
@@ -1087,11 +1109,15 @@ impl ArrayData {
                 for (i, (_, field)) in fields.iter().enumerate() {
                     let field_data = self.get_valid_child_data(i, 
field.data_type())?;
 
-                    if mode == &UnionMode::Sparse && field_data.len < 
(self.len + self.offset) {
-                        return Err(ArrowError::InvalidArgumentError(format!(
-                            "Sparse union child array #{} has length smaller 
than expected for union array ({} < {})",
-                            i, field_data.len, self.len + self.offset
-                        )));
+                    if mode == &UnionMode::Sparse {
+                        let len_plus_offset =
+                            checked_len_plus_offset(&self.data_type, self.len, 
self.offset)?;
+                        if field_data.len < len_plus_offset {
+                            return 
Err(ArrowError::InvalidArgumentError(format!(
+                                "Sparse union child array #{} has length 
smaller than expected for union array ({} < {})",
+                                i, field_data.len, len_plus_offset
+                            )));
+                        }
                     }
                 }
                 Ok(())
@@ -1467,7 +1493,7 @@ impl ArrayData {
     where
         T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
     {
-        let required_len = self.len + self.offset;
+        let required_len = checked_len_plus_offset(&self.data_type, self.len, 
self.offset)?;
         let buffer = &self.buffers[0];
 
         // This should have been checked as part of `validate()` prior
@@ -1475,7 +1501,7 @@ impl ArrayData {
         assert!(buffer.len() / mem::size_of::<T>() >= required_len);
 
         // Justification: buffer size was validated above
-        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset 
+ self.len];
+        let indexes: &[T] = 
&buffer.typed_data::<T>()[self.offset..required_len];
 
         indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
             // Do not check the value is null (value can be arbitrary)
@@ -1525,10 +1551,11 @@ impl ArrayData {
             Ok(())
         })?;
 
-        if prev_value.as_usize() < (self.offset + self.len) {
+        let len_plus_offset = checked_len_plus_offset(&self.data_type, 
self.len, self.offset)?;
+        if prev_value.as_usize() < len_plus_offset {
             return Err(ArrowError::InvalidArgumentError(format!(
                 "The offset + length of array should be less or equal to last 
value in the run_ends array. The last value of run_ends array is {prev_value} 
and offset + length of array is {}.",
-                self.offset + self.len
+                len_plus_offset
             )));
         }
         Ok(())
@@ -2258,6 +2285,86 @@ mod tests {
         assert_eq!(data.null_count() - 1, new_data.null_count());
     }
 
+    #[test]
+    fn test_typed_offsets_length_overflow() {
+        let data = ArrayData {
+            data_type: DataType::Binary,
+            len: usize::MAX,
+            offset: 0,
+            buffers: vec![Buffer::from_slice_ref([0_i32])],
+            child_data: vec![],
+            nulls: None,
+        };
+        let err = data.typed_offsets::<i32>().unwrap_err();
+
+        assert_eq!(
+            err.to_string(),
+            format!(
+                "Invalid argument error: Length {} with offset 1 overflows 
usize for Binary",
+                usize::MAX
+            )
+        );
+    }
+
+    #[test]
+    fn test_validate_typed_buffer_length_overflow() {
+        let data = ArrayData {
+            data_type: DataType::Binary,
+            len: 0,
+            offset: 2,
+            buffers: vec![Buffer::from_slice_ref([0_i32])],
+            child_data: vec![],
+            nulls: None,
+        };
+        let err = data.typed_buffer::<i32>(0, usize::MAX).unwrap_err();
+
+        assert_eq!(
+            err.to_string(),
+            format!(
+                "Invalid argument error: Length {} with offset 2 overflows 
usize for Binary",
+                usize::MAX
+            )
+        );
+    }
+
+    // Exercises ArrayData::try_new with len + offset overflowing
+    fn try_new_binary_length_offset_overflow() -> Result<ArrayData, 
ArrowError> {
+        ArrayData::try_new(
+            DataType::Binary,
+            usize::MAX,
+            None,
+            1,
+            vec![
+                Buffer::from_slice_ref([0_i32]),
+                Buffer::from_iter(std::iter::empty::<u8>()),
+            ],
+            vec![],
+        )
+    }
+
+    #[cfg(not(feature = "force_validate"))]
+    #[test]
+    fn test_try_new_length_offset_overflow() {
+        let err = try_new_binary_length_offset_overflow().unwrap_err();
+
+        assert_eq!(
+            err.to_string(),
+            format!(
+                "Invalid argument error: Length {} with offset 1 overflows 
usize for Binary",
+                usize::MAX
+            )
+        );
+    }
+
+    #[cfg(feature = "force_validate")]
+    #[test]
+    #[should_panic(
+        expected = "Length 18446744073709551615 with offset 1 overflows usize 
for Binary"
+    )]
+    fn test_try_new_length_offset_overflow_force_validate() {
+        try_new_binary_length_offset_overflow().unwrap();
+    }
+
     #[test]
     fn test_equality() {
         let int_data = ArrayData::builder(DataType::Int32)

Reply via email to