This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 389b2b0a67 [Variant] Fix several overflow panic risks for 32-bit arch 
(#7752)
389b2b0a67 is described below

commit 389b2b0a67a10aba185677c3fc5c5e6f285d5a0d
Author: Ryan Johnson <[email protected]>
AuthorDate: Tue Jun 24 14:12:52 2025 -0700

    [Variant] Fix several overflow panic risks for 32-bit arch (#7752)
    
    # Which issue does this PR close?
    
    Part of
    * https://github.com/apache/arrow-rs/issues/6736
    
    # Rationale for this change
    
    The variant spec makes extensive use of 4-byte offsets. On a 32-bit
    system, this can lead to integer overflow panics when doing offset
    arithmetic on malicious or malformed variant data, because `usize` is 32
    bits. That is bad.
    
    # What changes are included in this PR?
    
    Use checked arithmetic in code that operates on offsets extracted from
    (untrusted) variant byte buffers.
    
    Of particular interest, we define and use a new
    `slice_from_slice_at_offset` helper, which safely adds an offset to a
    range.
    
    # Are there any user-facing changes?
    
    No.
---
 parquet-variant/src/decoder.rs          | 16 +++++-----
 parquet-variant/src/utils.rs            | 45 +++++++++++++++++++++-------
 parquet-variant/src/variant.rs          |  4 +--
 parquet-variant/src/variant/list.rs     | 39 +++++++++++-------------
 parquet-variant/src/variant/metadata.rs | 31 +++++++++++--------
 parquet-variant/src/variant/object.rs   | 53 +++++++++++++++++++++------------
 6 files changed, 117 insertions(+), 71 deletions(-)

diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs
index 7096b0a086..cb8336b5b8 100644
--- a/parquet-variant/src/decoder.rs
+++ b/parquet-variant/src/decoder.rs
@@ -14,7 +14,7 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-use crate::utils::{array_from_slice, slice_from_slice, string_from_slice};
+use crate::utils::{array_from_slice, slice_from_slice_at_offset, 
string_from_slice};
 use crate::ShortString;
 
 use arrow_schema::ArrowError;
@@ -23,6 +23,9 @@ use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, 
Utc};
 use std::array::TryFromSliceError;
 use std::num::TryFromIntError;
 
+// Makes the code a bit more readable
+pub(crate) const VARIANT_VALUE_HEADER_BYTES: usize = 1;
+
 #[derive(Debug, Clone, Copy, PartialEq)]
 pub enum VariantBasicType {
     Primitive = 0,
@@ -262,21 +265,19 @@ pub(crate) fn decode_timestampntz_micros(data: &[u8]) -> 
Result<NaiveDateTime, A
 /// Decodes a Binary from the value section of a variant.
 pub(crate) fn decode_binary(data: &[u8]) -> Result<&[u8], ArrowError> {
     let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
-    let value = slice_from_slice(data, 4..4 + len)?;
-    Ok(value)
+    slice_from_slice_at_offset(data, 4, 0..len)
 }
 
 /// Decodes a long string from the value section of a variant.
 pub(crate) fn decode_long_string(data: &[u8]) -> Result<&str, ArrowError> {
     let len = u32::from_le_bytes(array_from_slice(data, 0)?) as usize;
-    let string = string_from_slice(data, 4..4 + len)?;
-    Ok(string)
+    string_from_slice(data, 4, 0..len)
 }
 
 /// Decodes a short string from the value section of a variant.
 pub(crate) fn decode_short_string(metadata: u8, data: &[u8]) -> 
Result<ShortString, ArrowError> {
     let len = (metadata >> 2) as usize;
-    let string = string_from_slice(data, 0..len)?;
+    let string = string_from_slice(data, 0, 0..len)?;
     ShortString::try_new(string)
 }
 
@@ -518,10 +519,11 @@ mod tests {
 
         let width = OffsetSizeBytes::Two;
 
-        // dictionary_size starts immediately after the header
+        // dictionary_size starts immediately after the header byte
         let dict_size = width.unpack_usize(&buf, 1, 0).unwrap();
         assert_eq!(dict_size, 2);
 
+        // offset array immediately follows the dictionary size
         let first = width.unpack_usize(&buf, 1, 1).unwrap();
         assert_eq!(first, 0);
 
diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs
index 7a1b9f0399..e0f966cab8 100644
--- a/parquet-variant/src/utils.rs
+++ b/parquet-variant/src/utils.rs
@@ -21,6 +21,11 @@ use arrow_schema::ArrowError;
 use std::fmt::Debug;
 use std::slice::SliceIndex;
 
+/// Helper for reporting integer overflow errors in a consistent way.
+pub(crate) fn overflow_error(msg: &str) -> ArrowError {
+    ArrowError::InvalidArgumentError(format!("Integer overflow computing 
{msg}"))
+}
+
 #[inline]
 pub(crate) fn slice_from_slice<I: SliceIndex<[u8]> + Clone + Debug>(
     bytes: &[u8],
@@ -33,17 +38,33 @@ pub(crate) fn slice_from_slice<I: SliceIndex<[u8]> + Clone 
+ Debug>(
         ))
     })
 }
+
+/// Helper to safely slice bytes with offset calculations.
+///
+/// Equivalent to `slice_from_slice(bytes, (base_offset + 
range.start)..(base_offset + range.end))`
+/// but using checked addition to prevent integer overflow panics on 32-bit 
systems.
+#[inline]
+pub(crate) fn slice_from_slice_at_offset(
+    bytes: &[u8],
+    base_offset: usize,
+    range: Range<usize>,
+) -> Result<&[u8], ArrowError> {
+    let start_byte = base_offset
+        .checked_add(range.start)
+        .ok_or_else(|| overflow_error("slice start"))?;
+    let end_byte = base_offset
+        .checked_add(range.end)
+        .ok_or_else(|| overflow_error("slice end"))?;
+    slice_from_slice(bytes, start_byte..end_byte)
+}
+
 pub(crate) fn array_from_slice<const N: usize>(
     bytes: &[u8],
     offset: usize,
 ) -> Result<[u8; N], ArrowError> {
-    let bytes = slice_from_slice(bytes, offset..offset + N)?;
-    bytes.try_into().map_err(map_try_from_slice_error)
-}
-
-/// To be used in `map_err` when unpacking an integer from a slice of bytes.
-pub(crate) fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError {
-    ArrowError::InvalidArgumentError(e.to_string())
+    slice_from_slice_at_offset(bytes, offset, 0..N)?
+        .try_into()
+        .map_err(|e: TryFromSliceError| 
ArrowError::InvalidArgumentError(e.to_string()))
 }
 
 pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result<u8, ArrowError> {
@@ -53,9 +74,13 @@ pub(crate) fn first_byte_from_slice(slice: &[u8]) -> 
Result<u8, ArrowError> {
         .ok_or_else(|| ArrowError::InvalidArgumentError("Received empty 
bytes".to_string()))
 }
 
-/// Helper to get a &str from a slice based on range, if it's valid or an 
error otherwise
-pub(crate) fn string_from_slice(slice: &[u8], range: Range<usize>) -> 
Result<&str, ArrowError> {
-    str::from_utf8(slice_from_slice(slice, range)?)
+/// Helper to get a &str from a slice at the given offset and range, or an 
error if invalid.
+pub(crate) fn string_from_slice(
+    slice: &[u8],
+    offset: usize,
+    range: Range<usize>,
+) -> Result<&str, ArrowError> {
+    str::from_utf8(slice_from_slice_at_offset(slice, offset, range)?)
         .map_err(|_| ArrowError::InvalidArgumentError("invalid UTF-8 
string".to_string()))
 }
 
diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs
index b343a538d5..da3fbd36fc 100644
--- a/parquet-variant/src/variant.rs
+++ b/parquet-variant/src/variant.rs
@@ -168,13 +168,13 @@ impl<'a> TryFrom<&'a str> for ShortString<'a> {
     }
 }
 
-impl<'a> AsRef<str> for ShortString<'a> {
+impl AsRef<str> for ShortString<'_> {
     fn as_ref(&self) -> &str {
         self.0
     }
 }
 
-impl<'a> Deref for ShortString<'a> {
+impl Deref for ShortString<'_> {
     type Target = str;
 
     fn deref(&self) -> &Self::Target {
diff --git a/parquet-variant/src/variant/list.rs 
b/parquet-variant/src/variant/list.rs
index d9fd20eacc..703761b420 100644
--- a/parquet-variant/src/variant/list.rs
+++ b/parquet-variant/src/variant/list.rs
@@ -15,11 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 use crate::decoder::OffsetSizeBytes;
-use crate::utils::{first_byte_from_slice, slice_from_slice, 
validate_fallible_iterator};
+use crate::utils::{
+    first_byte_from_slice, overflow_error, slice_from_slice_at_offset, 
validate_fallible_iterator,
+};
 use crate::variant::{Variant, VariantMetadata};
 
 use arrow_schema::ArrowError;
 
+// The value header occupies one byte; use a named constant for readability
+const NUM_HEADER_BYTES: usize = 1;
+
 /// A parsed version of the variant array value header byte.
 #[derive(Clone, Debug, PartialEq)]
 pub(crate) struct VariantListHeader {
@@ -78,25 +83,16 @@ impl<'m, 'v> VariantList<'m, 'v> {
             false => OffsetSizeBytes::One,
         };
 
-        // Skip the header byte to read the num_elements
-        let num_elements = num_elements_size.unpack_usize(value, 1, 0)?;
-        let first_offset_byte = 1 + num_elements_size as usize;
-
-        let overflow =
-            || ArrowError::InvalidArgumentError("Variant value_byte_length 
overflow".into());
-
-        // 1.  num_elements + 1
-        let n_offsets = num_elements.checked_add(1).ok_or_else(overflow)?;
-
-        // 2.  (num_elements + 1) * offset_size
-        let value_bytes = n_offsets
-            .checked_mul(header.offset_size as usize)
-            .ok_or_else(overflow)?;
+        // Skip the header byte to read the num_elements; the offset array 
immediately follows
+        let num_elements = num_elements_size.unpack_usize(value, 
NUM_HEADER_BYTES, 0)?;
+        let first_offset_byte = NUM_HEADER_BYTES + num_elements_size as usize;
 
-        // 3.  first_offset_byte + ...
-        let first_value_byte = first_offset_byte
-            .checked_add(value_bytes)
-            .ok_or_else(overflow)?;
+        // (num_elements + 1) * offset_size + first_offset_byte
+        let first_value_byte = num_elements
+            .checked_add(1)
+            .and_then(|n| n.checked_mul(header.offset_size as usize))
+            .and_then(|n| n.checked_add(first_offset_byte))
+            .ok_or_else(|| overflow_error("offset of variant list values"))?;
 
         let new_self = Self {
             metadata,
@@ -139,9 +135,10 @@ impl<'m, 'v> VariantList<'m, 'v> {
         };
 
         // Read the value bytes from the offsets
-        let variant_value_bytes = slice_from_slice(
+        let variant_value_bytes = slice_from_slice_at_offset(
             self.value,
-            self.first_value_byte + unpack(index)?..self.first_value_byte + 
unpack(index + 1)?,
+            self.first_value_byte,
+            unpack(index)?..unpack(index + 1)?,
         )?;
         let variant = Variant::try_new_with_metadata(self.metadata, 
variant_value_bytes)?;
         Ok(variant)
diff --git a/parquet-variant/src/variant/metadata.rs 
b/parquet-variant/src/variant/metadata.rs
index bfefeb506d..8fff65a6ee 100644
--- a/parquet-variant/src/variant/metadata.rs
+++ b/parquet-variant/src/variant/metadata.rs
@@ -17,7 +17,8 @@
 
 use crate::decoder::OffsetSizeBytes;
 use crate::utils::{
-    first_byte_from_slice, slice_from_slice, string_from_slice, 
validate_fallible_iterator,
+    first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
+    validate_fallible_iterator,
 };
 
 use arrow_schema::ArrowError;
@@ -35,6 +36,9 @@ pub(crate) struct VariantMetadataHeader {
 // purposes and to make that visible.
 const CORRECT_VERSION_VALUE: u8 = 1;
 
+// The metadata header occupies one byte; use a named constant for readability
+const NUM_HEADER_BYTES: usize = 1;
+
 impl VariantMetadataHeader {
     /// Tries to construct the variant metadata header, which has the form
     ///
@@ -102,20 +106,22 @@ impl<'m> VariantMetadata<'m> {
         let header_byte = first_byte_from_slice(bytes)?;
         let header = VariantMetadataHeader::try_new(header_byte)?;
 
-        // Offset 1, index 0 because first element after header is dictionary 
size
-        let dict_size = header.offset_size.unpack_usize(bytes, 1, 0)?;
+        // First element after header is dictionary size
+        let dict_size = header
+            .offset_size
+            .unpack_usize(bytes, NUM_HEADER_BYTES, 0)?;
 
         // Calculate the starting offset of the dictionary string bytes.
         //
         // Value header, dict_size (offset_size bytes), and dict_size+1 offsets
-        // = 1 + offset_size + (dict_size + 1) * offset_size
-        // = (dict_size + 2) * offset_size + 1
+        // = NUM_HEADER_BYTES + offset_size + (dict_size + 1) * offset_size
+        // = (dict_size + 2) * offset_size + NUM_HEADER_BYTES
         let dictionary_key_start_byte = dict_size
             .checked_add(2)
             .and_then(|n| n.checked_mul(header.offset_size as usize))
-            .and_then(|n| n.checked_add(1))
-            .ok_or_else(|| ArrowError::InvalidArgumentError("metadata length 
overflow".into()))?;
-        println!("dictionary_key_start_byte: {dictionary_key_start_byte}");
+            .and_then(|n| n.checked_add(NUM_HEADER_BYTES))
+            .ok_or_else(|| overflow_error("offset of variant metadata 
dictionary"))?;
+
         let new_self = Self {
             bytes,
             header,
@@ -149,16 +155,17 @@ impl<'m> VariantMetadata<'m> {
     /// This offset is an index into the dictionary, at the boundary between 
string `i-1` and string
     /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
     fn get_offset(&self, i: usize) -> Result<usize, ArrowError> {
-        // Skipping the header byte (setting byte_offset = 1) and the 
dictionary_size (setting offset_index +1)
+        // Skip the header byte and the dictionary_size entry (by offset_index 
+ 1)
         let bytes = slice_from_slice(self.bytes, 
..self.dictionary_key_start_byte)?;
-        self.header.offset_size.unpack_usize(bytes, 1, i + 1)
+        self.header
+            .offset_size
+            .unpack_usize(bytes, NUM_HEADER_BYTES, i + 1)
     }
 
     /// Gets a dictionary entry by index
     pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
-        let dictionary_keys_bytes = slice_from_slice(self.bytes, 
self.dictionary_key_start_byte..)?;
         let byte_range = self.get_offset(i)?..self.get_offset(i + 1)?;
-        string_from_slice(dictionary_keys_bytes, byte_range)
+        string_from_slice(self.bytes, self.dictionary_key_start_byte, 
byte_range)
     }
 
     /// Get all dictionary entries as an Iterator of strings
diff --git a/parquet-variant/src/variant/object.rs 
b/parquet-variant/src/variant/object.rs
index 471b94ccdb..b52701f8bb 100644
--- a/parquet-variant/src/variant/object.rs
+++ b/parquet-variant/src/variant/object.rs
@@ -16,12 +16,16 @@
 // under the License.
 use crate::decoder::OffsetSizeBytes;
 use crate::utils::{
-    first_byte_from_slice, slice_from_slice, try_binary_search_range_by, 
validate_fallible_iterator,
+    first_byte_from_slice, overflow_error, slice_from_slice, 
try_binary_search_range_by,
+    validate_fallible_iterator,
 };
 use crate::variant::{Variant, VariantMetadata};
 
 use arrow_schema::ArrowError;
 
+// The value header occupies one byte; use a named constant for readability
+const NUM_HEADER_BYTES: usize = 1;
+
 /// Header structure for [`VariantObject`]
 #[derive(Clone, Debug, PartialEq)]
 pub(crate) struct VariantObjectHeader {
@@ -72,36 +76,43 @@ impl<'m, 'v> VariantObject<'m, 'v> {
         let header_byte = first_byte_from_slice(value)?;
         let header = VariantObjectHeader::try_new(header_byte)?;
 
-        // Determine num_elements size based on is_large flag
+        // Determine num_elements size based on is_large flag and fetch the 
value
         let num_elements_size = if header.is_large {
             OffsetSizeBytes::Four
         } else {
             OffsetSizeBytes::One
         };
+        let num_elements = num_elements_size.unpack_usize(value, 
NUM_HEADER_BYTES, 0)?;
+
+        // Calculate byte offsets for different sections with overflow 
protection
+        let field_ids_start_byte = NUM_HEADER_BYTES
+            .checked_add(num_elements_size as usize)
+            .ok_or_else(|| overflow_error("offset of variant object field 
ids"))?;
 
-        // Parse num_elements
-        let num_elements = num_elements_size.unpack_usize(value, 1, 0)?;
+        let field_offsets_start_byte = num_elements
+            .checked_mul(header.field_id_size as usize)
+            .and_then(|n| n.checked_add(field_ids_start_byte))
+            .ok_or_else(|| overflow_error("offset of variant object field 
offsets"))?;
 
-        // Calculate byte offsets for different sections
-        let field_ids_start_byte = 1 + num_elements_size as usize;
-        let field_offsets_start_byte =
-            field_ids_start_byte + num_elements * header.field_id_size as 
usize;
-        let values_start_byte =
-            field_offsets_start_byte + (num_elements + 1) * 
header.field_offset_size as usize;
+        let values_start_byte = num_elements
+            .checked_add(1)
+            .and_then(|n| n.checked_mul(header.field_offset_size as usize))
+            .and_then(|n| n.checked_add(field_offsets_start_byte))
+            .ok_or_else(|| overflow_error("offset of variant object field 
values"))?;
 
         // Spec says: "The last field_offset points to the byte after the end 
of the last value"
         //
         // Use the last offset as a bounds check. The iterator check below 
doesn't use it -- offsets
         // are not monotonic -- so we have to check separately here.
-        let last_field_offset =
-            header
-                .field_offset_size
-                .unpack_usize(value, field_offsets_start_byte, num_elements)?;
-        if values_start_byte + last_field_offset > value.len() {
+        let end_offset = header
+            .field_offset_size
+            .unpack_usize(value, field_offsets_start_byte, num_elements)?
+            .checked_add(values_start_byte)
+            .ok_or_else(|| overflow_error("end of variant object field 
values"))?;
+        if end_offset > value.len() {
             return Err(ArrowError::InvalidArgumentError(format!(
-                "Last field offset value {} at offset {} is outside the value 
slice of length {}",
-                last_field_offset,
-                values_start_byte,
+                "Last field offset value {} is outside the value slice of 
length {}",
+                end_offset,
                 value.len()
             )));
         }
@@ -140,7 +151,11 @@ impl<'m, 'v> VariantObject<'m, 'v> {
             self.field_offsets_start_byte,
             i,
         )?;
-        let value_bytes = slice_from_slice(self.value, self.values_start_byte 
+ start_offset..)?;
+        let value_start = self
+            .values_start_byte
+            .checked_add(start_offset)
+            .ok_or_else(|| overflow_error("offset of variant object field"))?;
+        let value_bytes = slice_from_slice(self.value, value_start..)?;
         Variant::try_new_with_metadata(self.metadata, value_bytes)
     }
 

Reply via email to