scovich commented on code in PR #7807:
URL: https://github.com/apache/arrow-rs/pull/7807#discussion_r2176115818


##########
parquet-variant/src/decoder.rs:
##########
@@ -132,23 +134,41 @@ impl OffsetSizeBytes {
 
     /// Return one unsigned little-endian value from `bytes`.
     ///
-    /// * `bytes` – the Variant-metadata buffer.
+    /// * `bytes` – the byte buffer to index
+    /// * `index` – 0-based index into the buffer
+    ///
+    /// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+    /// Three-byte values are zero-extended to 32 bits before the final
+    /// fallible cast to `usize`.
+    pub(crate) fn unpack_usize(&self, bytes: &[u8], index: usize) -> 
Result<usize, ArrowError> {
+        self.unpack_usize_at_offset(bytes, 0, index)
+    }
+
+    /// Return one unsigned little-endian value from `bytes`.
+    ///
+    /// * `bytes` – the byte buffer to index
     /// * `byte_offset` – number of bytes to skip **before** reading the first
-    ///   value (usually `1` to move past the header byte).
-    /// * `offset_index` – 0-based index **after** the skip
+    ///   value (e.g. `1` to move past a header byte).
+    /// * `offset_index` – 0-based index **after** the skipped bytes
     ///   (`0` is the first value, `1` the next, …).
     ///
     /// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
     /// Three-byte values are zero-extended to 32 bits before the final
     /// fallible cast to `usize`.
-    pub(crate) fn unpack_usize(
+    pub(crate) fn unpack_usize_at_offset(
         &self,
         bytes: &[u8],
         byte_offset: usize,  // how many bytes to skip
         offset_index: usize, // which offset in an array of offsets
     ) -> Result<usize, ArrowError> {
         use OffsetSizeBytes::*;
-        let offset = byte_offset + (*self as usize) * offset_index;
+
+        // Index into the byte array:
+        // byte_offset + (*self as usize) * offset_index
+        let offset = offset_index
+            .checked_mul(*self as usize)
+            .and_then(|n| n.checked_add(byte_offset))

Review Comment:
   This was unchecked arithmetic that risked overflow panic on 32-bit arch.



##########
parquet-variant/src/variant/list.rs:
##########
@@ -69,46 +134,88 @@ impl<'m, 'v> VariantList<'m, 'v> {
     /// # Validation
     ///
     /// This constructor verifies that `value` points to a valid variant array 
value. In particular,
-    /// that all offsets are in-bounds and point to valid objects.
-    // TODO: How to make the validation non-recursive while still making 
iterators safely infallible??
-    // See https://github.com/apache/arrow-rs/issues/7711
+    /// that all offsets are in-bounds and point to valid (recursively 
validated) objects.
     pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> 
Result<Self, ArrowError> {
+        Self::try_new_impl(metadata, value)?.validate()
+    }
+
+    pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
+        Self::try_new_impl(metadata, value).expect("Invalid variant list 
value")
+    }
+
+    /// Attempts to interpet `metadata` and `value` as a variant array, 
performing only basic
+    /// (constant-cost) [validation].
+    ///
+    /// [validation]: Self#Validation
+    pub(crate) fn try_new_impl(
+        metadata: VariantMetadata<'m>,
+        value: &'v [u8],
+    ) -> Result<Self, ArrowError> {
         let header_byte = first_byte_from_slice(value)?;
         let header = VariantListHeader::try_new(header_byte)?;
 
-        // The size of the num_elements entry in the array value_data is 4 
bytes if
-        // is_large is true, otherwise 1 byte.
-        let num_elements_size = match header.is_large {
-            true => OffsetSizeBytes::Four,
-            false => OffsetSizeBytes::One,
-        };
-
         // Skip the header byte to read the num_elements; the offset array 
immediately follows
-        let num_elements = num_elements_size.unpack_usize(value, 
NUM_HEADER_BYTES, 0)?;
-        let first_offset_byte = NUM_HEADER_BYTES + num_elements_size as usize;
+        let num_elements =
+            header
+                .num_elements_size
+                .unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?;
 
         // (num_elements + 1) * offset_size + first_offset_byte
         let first_value_byte = num_elements
             .checked_add(1)
-            .and_then(|n| n.checked_mul(header.offset_size as usize))
-            .and_then(|n| n.checked_add(first_offset_byte))
+            .and_then(|n| n.checked_mul(header.offset_size()))
+            .and_then(|n| n.checked_add(header.first_offset_byte()))
             .ok_or_else(|| overflow_error("offset of variant list values"))?;
 
-        let new_self = Self {
+        let mut new_self = Self {
             metadata,
             value,
             header,
             num_elements,
-            first_offset_byte,
             first_value_byte,
+            validated: false,
         };
 
-        // Iterate over all values of this array in order to validate the 
field_offset array and
-        // prove that the field values are all in bounds. Otherwise, `iter` 
might panic on `unwrap`.
-        validate_fallible_iterator(new_self.iter_checked())?;
+        // Validate just the first and last offset, ignoring the other offsets 
and all value bytes.
+        let first_offset = new_self.get_offset(0)?;
+        if first_offset != 0 {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "First offset is not zero: {first_offset}"
+            )));
+        }
+
+        // Use the last offset to upper-bound the value buffer
+        let last_offset = new_self
+            .get_offset(num_elements)?
+            .checked_add(first_value_byte)
+            .ok_or_else(|| overflow_error("variant array size"))?;
+        new_self.value = slice_from_slice(value, ..last_offset)?;
         Ok(new_self)
     }
 
+    /// True if this instance is fully [validated] for panic-free infallible 
accesses.
+    ///
+    /// [validated]: Self#Validation
+    pub fn is_validated(&self) -> bool {
+        self.validated
+    }
+
+    /// Performs a full [validation] of this variant array and returns the 
result.
+    ///
+    /// [validation]: Self#Validation
+    pub fn validate(mut self) -> Result<Self, ArrowError> {
+        if !self.validated {
+            // Validate the metadata dictionary, if not already validated.
+            self.metadata = self.metadata.validate()?;

Review Comment:
   important to validate metadata first, because it's passed by copy to all the 
children!



##########
parquet-variant/src/variant/list.rs:
##########
@@ -119,54 +226,51 @@ impl<'m, 'v> VariantList<'m, 'v> {
         self.len() == 0
     }
 
-    /// Returns element by index in `0..self.len()`, if any
+    /// Returns element by index in `0..self.len()`, if any. May panic if this 
list is [invalid].
+    ///
+    /// [invalid]: Self#Validation
     pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> {
-        if index >= self.num_elements {
-            return None;
-        }
-
-        match self.try_get(index) {
-            Ok(variant) => Some(variant),
-            Err(err) => panic!("validation error: {err}"),
-        }
+        (index < self.num_elements).then(|| {

Review Comment:
   I took the liberty of simplifying this code while also changing `panic` to 
`expect`.



##########
parquet-variant/src/variant/list.rs:
##########
@@ -119,54 +226,51 @@ impl<'m, 'v> VariantList<'m, 'v> {
         self.len() == 0
     }
 
-    /// Returns element by index in `0..self.len()`, if any
+    /// Returns element by index in `0..self.len()`, if any. May panic if this 
list is [invalid].
+    ///
+    /// [invalid]: Self#Validation
     pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> {
-        if index >= self.num_elements {
-            return None;
-        }
-
-        match self.try_get(index) {
-            Ok(variant) => Some(variant),
-            Err(err) => panic!("validation error: {err}"),
-        }
+        (index < self.num_elements).then(|| {
+            self.try_get_impl(index)
+                .and_then(Variant::validate)
+                .expect("Invalid variant array element")
+        })
     }
 
     /// Fallible version of `get`. Returns element by index, capturing 
validation errors
-    fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
-        if index >= self.num_elements {
-            return Err(ArrowError::InvalidArgumentError(format!(
-                "Index {} out of bounds for list of length {}",
-                index, self.num_elements,
-            )));
-        }
-
-        // Skip header and num_elements bytes to read the offsets
-        let unpack = |i| {
-            self.header
-                .offset_size
-                .unpack_usize(self.value, self.first_offset_byte, i)
-        };
+    pub fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> 
{
+        self.try_get_impl(index)?.validate()
+    }
 
-        // Read the value bytes from the offsets
-        let variant_value_bytes = slice_from_slice_at_offset(
-            self.value,
-            self.first_value_byte,
-            unpack(index)?..unpack(index + 1)?,
-        )?;
-        let variant = Variant::try_new_with_metadata(self.metadata, 
variant_value_bytes)?;
-        Ok(variant)
+    /// Fallible iteration over the elements of this list.
+    pub fn iter_try(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, 
ArrowError>> + '_ {
+        (0..self.len()).map(move |i| self.try_get_impl(i))

Review Comment:
   Oops, forgot to split out the `iter_try_impl` here!



##########
parquet-variant/src/variant/object.rs:
##########
@@ -29,111 +29,196 @@ const NUM_HEADER_BYTES: usize = 1;
 /// Header structure for [`VariantObject`]
 #[derive(Clone, Debug, PartialEq)]
 pub(crate) struct VariantObjectHeader {
-    field_offset_size: OffsetSizeBytes,
+    num_elements_size: OffsetSizeBytes,
     field_id_size: OffsetSizeBytes,
-    is_large: bool,
+    field_offset_size: OffsetSizeBytes,
 }
 
 impl VariantObjectHeader {
+    // Hide the ugly casting
+    const fn num_elements_size(&self) -> usize {
+        self.num_elements_size as _
+    }
+    const fn field_id_size(&self) -> usize {
+        self.field_id_size as _
+    }
+    const fn field_offset_size(&self) -> usize {
+        self.field_offset_size as _
+    }
+
+    // Avoid materializing this offset, since it's cheaply and safely 
computable
+    const fn field_ids_start_byte(&self) -> usize {
+        NUM_HEADER_BYTES + self.num_elements_size()
+    }
+
     pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
         // Parse the header byte to get object parameters
         let value_header = header_byte >> 2;
         let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
         let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 
bits
         let is_large = (value_header & 0x10) != 0; // 5th bit
-
+        let num_elements_size = match is_large {
+            true => OffsetSizeBytes::Four,
+            false => OffsetSizeBytes::One,
+        };
         Ok(Self {
-            field_offset_size: 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
+            num_elements_size,
             field_id_size: OffsetSizeBytes::try_new(field_id_size_minus_one)?,
-            is_large,
+            field_offset_size: 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
         })
     }
 }
 
 /// A [`Variant`] Object (struct with named fields).
+///
+/// See the [Variant spec] file for more information.
+///
+/// # Validation
+///
+/// Every instance of variant object is either _valid_ or _invalid_. depending 
on whether the
+/// underlying bytes are a valid encoding of a variant object subtype (see 
below).
+///
+/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully 
(and recursively)
+/// _validated_. They always contain _valid_ data, and infallible accesses 
such as iteration and
+/// indexing are panic-free. The validation cost is linear in the number of 
underlying bytes.
+///
+/// Instances produced by [`Self::new`] are _unvalidated_ and so they may 
contain either _valid_ or
+/// _invalid_ data. Infallible accesses such as iteration and indexing will 
panic if the underlying
+/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] 
and [`Self::get`] are
+/// provided as panic-free alternatives. [`Self::validate`] can also be used 
to _validate_ an
+/// _unvalidated_ instance, if desired.
+///
+/// _Unvalidated_ instances can be constructed in constant time. They can be 
useful if the caller
+/// knows the underlying bytes were already validated previously, or if the 
caller intends to
+/// perform a small number of (fallible) field accesses against a large object.
+///
+/// A _validated_ instance guarantees that:
+///
+/// - header byte is valid
+/// - num_elements is in bounds
+/// - field id array is in bounds
+/// - field offset array is in bounds
+/// - field value array is in bounds
+/// - all field ids are valid metadata dictionary entries (*)
+/// - field ids are lexically ordered according by their corresponding string 
values (*)
+/// - all field offsets are in bounds (*)
+/// - all field values are (recursively) _valid_ variant values (*)
+/// - the associated variant metadata is [valid] (*)
+///
+/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation 
checks (marked by `(*)`
+/// in the list above); it panics any of the other checks fails.
+///
+/// # Safety
+///
+/// Even an _invalid_ variant object instance is still _safe_ to use in the 
Rust sense. Accessing it
+/// with infallible methods may cause panics but will never lead to undefined 
behavior.
+///
+/// [valid]: VariantMetadata#Validation
+/// [Variant spec]: 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-object-basic_type2
 #[derive(Clone, Debug, PartialEq)]
 pub struct VariantObject<'m, 'v> {
     pub metadata: VariantMetadata<'m>,
     pub value: &'v [u8],
     header: VariantObjectHeader,
     num_elements: usize,
-    field_ids_start_byte: usize,
-    field_offsets_start_byte: usize,
-    values_start_byte: usize,
+    first_field_offset_byte: usize,
+    first_value_byte: usize,
+    validated: bool,
 }
 
 impl<'m, 'v> VariantObject<'m, 'v> {
-    /// Attempts to interpret `value` as a variant object value.
+    pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
+        Self::try_new_impl(metadata, value).expect("Invalid variant object")
+    }
+
+    /// Attempts to interpet `metadata` and `value` as a variant object.
     ///
     /// # Validation
     ///
     /// This constructor verifies that `value` points to a valid variant 
object value. In
     /// particular, that all field ids exist in `metadata`, and all offsets 
are in-bounds and point
     /// to valid objects.
-    // TODO: How to make the validation non-recursive while still making 
iterators safely infallible??
-    // See https://github.com/apache/arrow-rs/issues/7711
     pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> 
Result<Self, ArrowError> {
+        let mut new_self = Self::try_new_impl(metadata, value)?.validate()?;
+        new_self.validated = true;

Review Comment:
   Oops, not needed (`validate` already set the flag)



##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -204,8 +292,8 @@ mod tests {
         let md = VariantMetadata::try_new(bytes).expect("should parse");
         assert_eq!(md.dictionary_size(), 2);
         // Fields
-        assert_eq!(md.get(0).unwrap(), "cat");
-        assert_eq!(md.get(1).unwrap(), "dog");
+        assert_eq!(&md[0], "cat");

Review Comment:
   Unfortunately, we can only do this for `VariantMetadata`... `VariantObject` 
and `VariantList` need to return a value, not a reference, which `Index` trait 
doesn't allow.



##########
parquet-variant/src/variant/list.rs:
##########
@@ -28,39 +29,103 @@ const NUM_HEADER_BYTES: usize = 1;
 /// A parsed version of the variant array value header byte.
 #[derive(Clone, Debug, PartialEq)]
 pub(crate) struct VariantListHeader {
+    num_elements_size: OffsetSizeBytes,
     offset_size: OffsetSizeBytes,
-    is_large: bool,
 }
 
 impl VariantListHeader {
+    // Hide the ugly casting
+    const fn num_elements_size(&self) -> usize {
+        self.num_elements_size as _
+    }
+    const fn offset_size(&self) -> usize {
+        self.offset_size as _
+    }
+
+    // Avoid materializing this offset, since it's cheaply and safely 
computable
+    const fn first_offset_byte(&self) -> usize {
+        NUM_HEADER_BYTES + self.num_elements_size()
+    }
+
     pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
         // The 6 first bits to the left are the value_header and the 2 bits
         // to the right are the basic type, so we shift to get only the 
value_header
         let value_header = header_byte >> 2;
         let is_large = (value_header & 0x04) != 0; // 3rd bit from the right
         let field_offset_size_minus_one = value_header & 0x03; // Last two bits
+
+        // The size of the num_elements entry in the array value_data is 4 
bytes if
+        // is_large is true, otherwise 1 byte.
+        let num_elements_size = match is_large {
+            true => OffsetSizeBytes::Four,
+            false => OffsetSizeBytes::One,
+        };
         let offset_size = 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?;
 
         Ok(Self {
+            num_elements_size,
             offset_size,
-            is_large,
         })
     }
 }
 
 /// [`Variant`] Array.
 ///
+/// See the [Variant spec] for details.
+///
 /// NOTE: The "list" naming differs from the variant spec -- which calls it 
"array" -- in order to be
 /// consistent with Parquet and Arrow type naming. Otherwise, the name would 
conflict with the
 /// `VariantArray : Array` we must eventually define for variant-typed arrow 
arrays.
+///
+/// # Validation
+///
+/// Every instance of variant list is either _valid_ or _invalid_. depending 
on whether the
+/// underlying bytes are a valid encoding of a variant array (see below).
+///
+/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully 
_validated_. They always
+/// contain _valid_ data, and infallible accesses such as iteration and 
indexing are panic-free. The
+/// validation cost is linear in the number of underlying bytes.
+///
+/// Instances produced by [`Self::new`] are _unvalidated_ and so they may 
contain either _valid_ or
+/// _invalid_ data. Infallible accesses such as iteration and indexing will 
panic if the underlying
+/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] 
and [`Self::get`] are
+/// provided as panic-free alternatives. [`Self::validate`] can also be used 
to _validate_ an
+/// _unvalidated_ instance, if desired.
+///
+/// _Unvalidated_ instances can be constructed in constant time. This can be 
useful if the caller
+/// knows the underlying bytes were already validated previously, or if the 
caller intends to
+/// perform a small number of (fallible) accesses to a large list.
+///
+/// A _validated_ variant list instance guarantees that:
+///
+/// - header byte is valid
+/// - num_elements is in bounds
+/// - offset array content is in-bounds
+/// - first offset is zero
+/// - last offset is in-bounds
+/// - all other offsets are in-bounds (*)
+/// - all offsets are monotonically increasing (*)
+/// - all values are (recursively) valid variant objects (*)
+/// - the associated variant metadata is [valid] (*)
+///
+/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation 
checks (marked by `(*)`
+/// in the list above); it panics any of the other checks fails.
+///
+/// # Safety
+///
+/// Even an _invalid_ variant list instance is still _safe_ to use in the Rust 
sense. Accessing
+/// it with infallible methods may cause panics but will never lead to 
undefined behavior.
+///
+/// [valid]: VariantMetadata#Validation
+/// [Variant spec]: 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-array-basic_type3
 #[derive(Clone, Debug, PartialEq)]
 pub struct VariantList<'m, 'v> {
     pub metadata: VariantMetadata<'m>,
     pub value: &'v [u8],
     header: VariantListHeader,
     num_elements: usize,
-    first_offset_byte: usize,
     first_value_byte: usize,
+    validated: bool,

Review Comment:
   Validation is expensive enough I figured it was worth remembering whether it 
has been done. 
   Especially since the new member hides in the existing padding of each struct.



##########
parquet-variant/src/variant/list.rs:
##########
@@ -28,39 +29,103 @@ const NUM_HEADER_BYTES: usize = 1;
 /// A parsed version of the variant array value header byte.
 #[derive(Clone, Debug, PartialEq)]
 pub(crate) struct VariantListHeader {
+    num_elements_size: OffsetSizeBytes,

Review Comment:
   It turns out `is_large` isn't nearly as useful as the actual offset 
unpacker. That, combined with the `first_offset_byte` helper below, reduces the 
size of the corresponding variant struct.
   
   * See also https://github.com/apache/arrow-rs/issues/7831



##########
parquet-variant/src/decoder.rs:
##########
@@ -159,14 +179,14 @@ impl OffsetSizeBytes {
                 let mut buf = [0u8; 4];
                 buf[..3].copy_from_slice(&b3_chunks);
                 u32::from_le_bytes(buf)
-                    .try_into()
-                    .map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?
             }
-            Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
-                .try_into()
-                .map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))?,
+            Four => u32::from_le_bytes(array_from_slice(bytes, offset)?),
         };
-        Ok(result)
+
+        // Convert the u32 we extracted to usize (should always succeed on 32- 
and 64-bit arch)
+        result
+            .try_into()
+            .map_err(|e: TryFromIntError| 
ArrowError::InvalidArgumentError(e.to_string()))

Review Comment:
   Factored out the u32 -> usize conversion to reduce redundancy.



##########
parquet-variant/src/variant.rs:
##########
@@ -304,12 +381,45 @@ impl<'m, 'v> Variant<'m, 'v> {
             VariantBasicType::ShortString => {
                 
Variant::ShortString(decoder::decode_short_string(value_metadata, value_data)?)
             }
-            VariantBasicType::Object => 
Variant::Object(VariantObject::try_new(metadata, value)?),
-            VariantBasicType::Array => 
Variant::List(VariantList::try_new(metadata, value)?),
+            VariantBasicType::Object => {
+                Variant::Object(VariantObject::try_new_impl(metadata, value)?)
+            }
+            VariantBasicType::Array => 
Variant::List(VariantList::try_new_impl(metadata, value)?),

Review Comment:
   By directly invoking `try_new_impl` methods, we avoid recursive validation 
of array elements and object fields. Unfortunately this requires making them 
`pub(crate)`, when I would have preferred to keep them private.



##########
parquet-variant/src/decoder.rs:
##########
@@ -132,23 +134,41 @@ impl OffsetSizeBytes {
 
     /// Return one unsigned little-endian value from `bytes`.
     ///
-    /// * `bytes` – the Variant-metadata buffer.
+    /// * `bytes` – the byte buffer to index
+    /// * `index` – 0-based index into the buffer
+    ///
+    /// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
+    /// Three-byte values are zero-extended to 32 bits before the final
+    /// fallible cast to `usize`.
+    pub(crate) fn unpack_usize(&self, bytes: &[u8], index: usize) -> 
Result<usize, ArrowError> {
+        self.unpack_usize_at_offset(bytes, 0, index)

Review Comment:
   It turns out that a bunch of slice accesses were not correctly bounded; once 
I fixed that, most `unpack_usize` calls were passing offset 0. So I renamed the 
original method as `unpack_usize_at_offset`, and created a new 2-arg 
`unpack_usize` that doesn't take an offset.



##########
parquet-variant/src/variant/list.rs:
##########
@@ -69,46 +134,88 @@ impl<'m, 'v> VariantList<'m, 'v> {
     /// # Validation
     ///
     /// This constructor verifies that `value` points to a valid variant array 
value. In particular,
-    /// that all offsets are in-bounds and point to valid objects.
-    // TODO: How to make the validation non-recursive while still making 
iterators safely infallible??
-    // See https://github.com/apache/arrow-rs/issues/7711
+    /// that all offsets are in-bounds and point to valid (recursively 
validated) objects.
     pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> 
Result<Self, ArrowError> {
+        Self::try_new_impl(metadata, value)?.validate()
+    }
+
+    pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
+        Self::try_new_impl(metadata, value).expect("Invalid variant list 
value")
+    }
+
+    /// Attempts to interpet `metadata` and `value` as a variant array, 
performing only basic
+    /// (constant-cost) [validation].
+    ///
+    /// [validation]: Self#Validation
+    pub(crate) fn try_new_impl(
+        metadata: VariantMetadata<'m>,
+        value: &'v [u8],
+    ) -> Result<Self, ArrowError> {
         let header_byte = first_byte_from_slice(value)?;
         let header = VariantListHeader::try_new(header_byte)?;
 
-        // The size of the num_elements entry in the array value_data is 4 
bytes if
-        // is_large is true, otherwise 1 byte.
-        let num_elements_size = match header.is_large {
-            true => OffsetSizeBytes::Four,
-            false => OffsetSizeBytes::One,
-        };
-
         // Skip the header byte to read the num_elements; the offset array 
immediately follows
-        let num_elements = num_elements_size.unpack_usize(value, 
NUM_HEADER_BYTES, 0)?;
-        let first_offset_byte = NUM_HEADER_BYTES + num_elements_size as usize;
+        let num_elements =
+            header
+                .num_elements_size
+                .unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?;
 
         // (num_elements + 1) * offset_size + first_offset_byte
         let first_value_byte = num_elements
             .checked_add(1)
-            .and_then(|n| n.checked_mul(header.offset_size as usize))
-            .and_then(|n| n.checked_add(first_offset_byte))
+            .and_then(|n| n.checked_mul(header.offset_size()))
+            .and_then(|n| n.checked_add(header.first_offset_byte()))
             .ok_or_else(|| overflow_error("offset of variant list values"))?;
 
-        let new_self = Self {
+        let mut new_self = Self {
             metadata,
             value,
             header,
             num_elements,
-            first_offset_byte,
             first_value_byte,
+            validated: false,
         };
 
-        // Iterate over all values of this array in order to validate the 
field_offset array and
-        // prove that the field values are all in bounds. Otherwise, `iter` 
might panic on `unwrap`.
-        validate_fallible_iterator(new_self.iter_checked())?;
+        // Validate just the first and last offset, ignoring the other offsets 
and all value bytes.
+        let first_offset = new_self.get_offset(0)?;
+        if first_offset != 0 {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "First offset is not zero: {first_offset}"
+            )));
+        }
+
+        // Use the last offset to upper-bound the value buffer
+        let last_offset = new_self
+            .get_offset(num_elements)?
+            .checked_add(first_value_byte)
+            .ok_or_else(|| overflow_error("variant array size"))?;
+        new_self.value = slice_from_slice(value, ..last_offset)?;

Review Comment:
   This PR adds proper "region bounding" in a lot of places. Offset arrays, 
value bytes, etc are all bounded now, so we can't accidentally read garbage out 
of neighboring regions that happen to be reachable through the slice. I suspect 
this might help address issues like 
   - https://github.com/apache/arrow-rs/issues/7784
   
   ... but I have not tested that hypothesis yet.



##########
parquet-variant/src/variant/list.rs:
##########
@@ -69,46 +134,88 @@ impl<'m, 'v> VariantList<'m, 'v> {
     /// # Validation
     ///
     /// This constructor verifies that `value` points to a valid variant array 
value. In particular,
-    /// that all offsets are in-bounds and point to valid objects.
-    // TODO: How to make the validation non-recursive while still making 
iterators safely infallible??
-    // See https://github.com/apache/arrow-rs/issues/7711
+    /// that all offsets are in-bounds and point to valid (recursively 
validated) objects.
     pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> 
Result<Self, ArrowError> {
+        Self::try_new_impl(metadata, value)?.validate()
+    }
+
+    pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
+        Self::try_new_impl(metadata, value).expect("Invalid variant list 
value")
+    }
+
+    /// Attempts to interpet `metadata` and `value` as a variant array, 
performing only basic
+    /// (constant-cost) [validation].
+    ///
+    /// [validation]: Self#Validation
+    pub(crate) fn try_new_impl(
+        metadata: VariantMetadata<'m>,
+        value: &'v [u8],
+    ) -> Result<Self, ArrowError> {
         let header_byte = first_byte_from_slice(value)?;
         let header = VariantListHeader::try_new(header_byte)?;
 
-        // The size of the num_elements entry in the array value_data is 4 
bytes if
-        // is_large is true, otherwise 1 byte.
-        let num_elements_size = match header.is_large {
-            true => OffsetSizeBytes::Four,
-            false => OffsetSizeBytes::One,
-        };
-
         // Skip the header byte to read the num_elements; the offset array 
immediately follows
-        let num_elements = num_elements_size.unpack_usize(value, 
NUM_HEADER_BYTES, 0)?;
-        let first_offset_byte = NUM_HEADER_BYTES + num_elements_size as usize;
+        let num_elements =
+            header
+                .num_elements_size
+                .unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?;
 
         // (num_elements + 1) * offset_size + first_offset_byte
         let first_value_byte = num_elements
             .checked_add(1)
-            .and_then(|n| n.checked_mul(header.offset_size as usize))
-            .and_then(|n| n.checked_add(first_offset_byte))
+            .and_then(|n| n.checked_mul(header.offset_size()))
+            .and_then(|n| n.checked_add(header.first_offset_byte()))
             .ok_or_else(|| overflow_error("offset of variant list values"))?;
 
-        let new_self = Self {
+        let mut new_self = Self {
             metadata,
             value,
             header,
             num_elements,
-            first_offset_byte,
             first_value_byte,
+            validated: false,
         };
 
-        // Iterate over all values of this array in order to validate the 
field_offset array and
-        // prove that the field values are all in bounds. Otherwise, `iter` 
might panic on `unwrap`.
-        validate_fallible_iterator(new_self.iter_checked())?;
+        // Validate just the first and last offset, ignoring the other offsets 
and all value bytes.

Review Comment:
   These two validations were previously covered by the iterator validation, 
and is technically redundant if the caller will anyway `validate` the result. 
But it didn't seem worth the complexity (and potential bugs) to make this 
method aware of caller's (lack of) intent to validate.



##########
parquet-variant/src/variant/object.rs:
##########
@@ -29,111 +29,196 @@ const NUM_HEADER_BYTES: usize = 1;
 /// Header structure for [`VariantObject`]
 #[derive(Clone, Debug, PartialEq)]
 pub(crate) struct VariantObjectHeader {
-    field_offset_size: OffsetSizeBytes,
+    num_elements_size: OffsetSizeBytes,
     field_id_size: OffsetSizeBytes,
-    is_large: bool,
+    field_offset_size: OffsetSizeBytes,
 }
 
 impl VariantObjectHeader {
+    // Hide the ugly casting
+    const fn num_elements_size(&self) -> usize {
+        self.num_elements_size as _
+    }
+    const fn field_id_size(&self) -> usize {
+        self.field_id_size as _
+    }
+    const fn field_offset_size(&self) -> usize {
+        self.field_offset_size as _
+    }
+
+    // Avoid materializing this offset, since it's cheaply and safely 
computable
+    const fn field_ids_start_byte(&self) -> usize {
+        NUM_HEADER_BYTES + self.num_elements_size()
+    }
+
     pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
         // Parse the header byte to get object parameters
         let value_header = header_byte >> 2;
         let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
         let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 
bits
         let is_large = (value_header & 0x10) != 0; // 5th bit
-
+        let num_elements_size = match is_large {
+            true => OffsetSizeBytes::Four,
+            false => OffsetSizeBytes::One,
+        };
         Ok(Self {
-            field_offset_size: 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
+            num_elements_size,
             field_id_size: OffsetSizeBytes::try_new(field_id_size_minus_one)?,
-            is_large,
+            field_offset_size: 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
         })
     }
 }
 
 /// A [`Variant`] Object (struct with named fields).
+///
+/// See the [Variant spec] file for more information.
+///
+/// # Validation
+///
+/// Every instance of variant object is either _valid_ or _invalid_. depending 
on whether the
+/// underlying bytes are a valid encoding of a variant object subtype (see 
below).
+///
+/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully 
(and recursively)
+/// _validated_. They always contain _valid_ data, and infallible accesses 
such as iteration and
+/// indexing are panic-free. The validation cost is linear in the number of 
underlying bytes.
+///
+/// Instances produced by [`Self::new`] are _unvalidated_ and so they may 
contain either _valid_ or
+/// _invalid_ data. Infallible accesses such as iteration and indexing will 
panic if the underlying
+/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] 
and [`Self::get`] are
+/// provided as panic-free alternatives. [`Self::validate`] can also be used 
to _validate_ an
+/// _unvalidated_ instance, if desired.
+///
+/// _Unvalidated_ instances can be constructed in constant time. They can be 
useful if the caller
+/// knows the underlying bytes were already validated previously, or if the 
caller intends to
+/// perform a small number of (fallible) field accesses against a large object.
+///
+/// A _validated_ instance guarantees that:
+///
+/// - header byte is valid
+/// - num_elements is in bounds
+/// - field id array is in bounds
+/// - field offset array is in bounds
+/// - field value array is in bounds
+/// - all field ids are valid metadata dictionary entries (*)
+/// - field ids are lexically ordered according by their corresponding string 
values (*)
+/// - all field offsets are in bounds (*)
+/// - all field values are (recursively) _valid_ variant values (*)
+/// - the associated variant metadata is [valid] (*)
+///
+/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation 
checks (marked by `(*)`
+/// in the list above); it panics any of the other checks fails.
+///
+/// # Safety
+///
+/// Even an _invalid_ variant object instance is still _safe_ to use in the 
Rust sense. Accessing it
+/// with infallible methods may cause panics but will never lead to undefined 
behavior.
+///
+/// [valid]: VariantMetadata#Validation
+/// [Variant spec]: 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-object-basic_type2
 #[derive(Clone, Debug, PartialEq)]
 pub struct VariantObject<'m, 'v> {
     pub metadata: VariantMetadata<'m>,
     pub value: &'v [u8],
     header: VariantObjectHeader,
     num_elements: usize,
-    field_ids_start_byte: usize,
-    field_offsets_start_byte: usize,
-    values_start_byte: usize,
+    first_field_offset_byte: usize,
+    first_value_byte: usize,
+    validated: bool,
 }
 
 impl<'m, 'v> VariantObject<'m, 'v> {
-    /// Attempts to interpret `value` as a variant object value.
+    pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
+        Self::try_new_impl(metadata, value).expect("Invalid variant object")
+    }
+
+    /// Attempts to interpet `metadata` and `value` as a variant object.
     ///
     /// # Validation
     ///
     /// This constructor verifies that `value` points to a valid variant 
object value. In
     /// particular, that all field ids exist in `metadata`, and all offsets 
are in-bounds and point
     /// to valid objects.
-    // TODO: How to make the validation non-recursive while still making 
iterators safely infallible??
-    // See https://github.com/apache/arrow-rs/issues/7711
     pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> 
Result<Self, ArrowError> {
+        let mut new_self = Self::try_new_impl(metadata, value)?.validate()?;
+        new_self.validated = true;
+        Ok(new_self)
+    }
+
+    /// Attempts to interpet `metadata` and `value` as a variant object, 
performing only basic
+    /// (constant-cost) [validation].
+    ///
+    /// [validation]: Self#Validation
+    pub(crate) fn try_new_impl(
+        metadata: VariantMetadata<'m>,
+        value: &'v [u8],
+    ) -> Result<Self, ArrowError> {
         let header_byte = first_byte_from_slice(value)?;
         let header = VariantObjectHeader::try_new(header_byte)?;
 
         // Determine num_elements size based on is_large flag and fetch the 
value
-        let num_elements_size = if header.is_large {
-            OffsetSizeBytes::Four
-        } else {
-            OffsetSizeBytes::One
-        };
-        let num_elements = num_elements_size.unpack_usize(value, 
NUM_HEADER_BYTES, 0)?;
-
-        // Calculate byte offsets for different sections with overflow 
protection
-        let field_ids_start_byte = NUM_HEADER_BYTES
-            .checked_add(num_elements_size as usize)
-            .ok_or_else(|| overflow_error("offset of variant object field 
ids"))?;

Review Comment:
   This didn't need to be checked arithmetic... the maximum possible value is 5.



##########
parquet-variant/src/variant/object.rs:
##########
@@ -147,60 +232,73 @@ impl<'m, 'v> VariantObject<'m, 'v> {
     /// Get a field's value by index in `0..self.len()`
     ///
     /// # Panics
-    /// If the variant object is corrupted (e.g., invalid offsets or field 
IDs).
-    /// This should never happen since the constructor validates all data 
upfront.
+    ///
+    /// If the index is out of bounds. Also if variant object is corrupted 
(e.g., invalid offsets or
+    /// field IDs). The latter can only happen when working with an 
unvalidated object produced by
+    /// [`Self::new`].
     pub fn field(&self, i: usize) -> Option<Variant<'m, 'v>> {
-        Some(
-            self.try_field(i)
-                .expect("validation error after construction"),
-        )
+        (i < self.len()).then(|| self.try_field_impl(i).expect("Invalid object 
field value"))
     }
 
     /// Fallible version of `field`. Returns field value by index, capturing 
validation errors
-    fn try_field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
-        let start_offset = self.header.field_offset_size.unpack_usize(
-            self.value,
-            self.field_offsets_start_byte,
-            i,
-        )?;
-        let value_start = self
-            .values_start_byte
-            .checked_add(start_offset)
-            .ok_or_else(|| overflow_error("offset of variant object field"))?;
-        let value_bytes = slice_from_slice(self.value, value_start..)?;
+    pub fn try_field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+        self.try_field_impl(i)?.validate()
+    }
+
+    // Attempts to retrieve the ith field value from the value region of the 
byte buffer; it
+    // performs only basic (constant-cost) validation.
+    fn try_field_impl(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+        let value_bytes = slice_from_slice(self.value, 
self.first_value_byte..)?;
+        let value_bytes = slice_from_slice(value_bytes, 
self.get_offset(i)?..)?;

Review Comment:
   again, don't let the method access bytes from an obviously wrong part of the 
underlying buffer



##########
parquet-variant/src/variant/object.rs:
##########
@@ -29,111 +29,196 @@ const NUM_HEADER_BYTES: usize = 1;
 /// Header structure for [`VariantObject`]
 #[derive(Clone, Debug, PartialEq)]
 pub(crate) struct VariantObjectHeader {
-    field_offset_size: OffsetSizeBytes,
+    num_elements_size: OffsetSizeBytes,
     field_id_size: OffsetSizeBytes,
-    is_large: bool,
+    field_offset_size: OffsetSizeBytes,

Review Comment:
   Reordered to match the spec



##########
parquet-variant/src/variant/list.rs:
##########
@@ -119,54 +226,51 @@ impl<'m, 'v> VariantList<'m, 'v> {
         self.len() == 0
     }
 
-    /// Returns element by index in `0..self.len()`, if any
+    /// Returns element by index in `0..self.len()`, if any. May panic if this 
list is [invalid].
+    ///
+    /// [invalid]: Self#Validation
     pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> {
-        if index >= self.num_elements {
-            return None;
-        }
-
-        match self.try_get(index) {
-            Ok(variant) => Some(variant),
-            Err(err) => panic!("validation error: {err}"),
-        }
+        (index < self.num_elements).then(|| {
+            self.try_get_impl(index)
+                .and_then(Variant::validate)
+                .expect("Invalid variant array element")
+        })
     }
 
     /// Fallible version of `get`. Returns element by index, capturing 
validation errors
-    fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
-        if index >= self.num_elements {
-            return Err(ArrowError::InvalidArgumentError(format!(
-                "Index {} out of bounds for list of length {}",
-                index, self.num_elements,
-            )));
-        }
-
-        // Skip header and num_elements bytes to read the offsets
-        let unpack = |i| {
-            self.header
-                .offset_size
-                .unpack_usize(self.value, self.first_offset_byte, i)
-        };
+    pub fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> 
{
+        self.try_get_impl(index)?.validate()
+    }
 
-        // Read the value bytes from the offsets
-        let variant_value_bytes = slice_from_slice_at_offset(
-            self.value,
-            self.first_value_byte,
-            unpack(index)?..unpack(index + 1)?,
-        )?;
-        let variant = Variant::try_new_with_metadata(self.metadata, 
variant_value_bytes)?;
-        Ok(variant)
+    /// Fallible iteration over the elements of this list.
+    pub fn iter_try(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, 
ArrowError>> + '_ {
+        (0..self.len()).map(move |i| self.try_get_impl(i))
     }
 
-    /// Iterates over the values of this list
+    /// Iterates over the values of this list. When working with [unvalidated] 
input, consider
+    /// [`Self::iter_try`] to avoid panics due to invalid data.
+    ///
+    /// [unvalidated]: Self#Validation
     pub fn iter(&self) -> impl Iterator<Item = Variant<'m, 'v>> + '_ {
-        // NOTE: It is safe to unwrap because the constructor already made a 
successful traversal.
-        self.iter_checked().map(Result::unwrap)
+        self.iter_try()
+            .map(|result| result.expect("Invalid variant list entry"))
+    }
+
+    // Attempts to retrieve the ith offset from the offset array region of the 
byte buffer.
+    fn get_offset(&self, index: usize) -> Result<usize, ArrowError> {
+        let byte_range = 
self.header.first_offset_byte()..self.first_value_byte;

Review Comment:
   Don't let this method access obviously wrong parts of the value slice.



##########
parquet-variant/src/variant/object.rs:
##########
@@ -147,60 +232,73 @@ impl<'m, 'v> VariantObject<'m, 'v> {
     /// Get a field's value by index in `0..self.len()`
     ///
     /// # Panics
-    /// If the variant object is corrupted (e.g., invalid offsets or field 
IDs).
-    /// This should never happen since the constructor validates all data 
upfront.
+    ///
+    /// If the index is out of bounds. Also if variant object is corrupted 
(e.g., invalid offsets or
+    /// field IDs). The latter can only happen when working with an 
unvalidated object produced by
+    /// [`Self::new`].
     pub fn field(&self, i: usize) -> Option<Variant<'m, 'v>> {
-        Some(
-            self.try_field(i)
-                .expect("validation error after construction"),
-        )
+        (i < self.len()).then(|| self.try_field_impl(i).expect("Invalid object 
field value"))

Review Comment:
   missing bounds check!
   
   (the original code was incapable of returning None)



##########
parquet-variant/src/variant/object.rs:
##########
@@ -147,60 +232,73 @@ impl<'m, 'v> VariantObject<'m, 'v> {
     /// Get a field's value by index in `0..self.len()`
     ///
     /// # Panics
-    /// If the variant object is corrupted (e.g., invalid offsets or field 
IDs).
-    /// This should never happen since the constructor validates all data 
upfront.
+    ///
+    /// If the index is out of bounds. Also if variant object is corrupted 
(e.g., invalid offsets or
+    /// field IDs). The latter can only happen when working with an 
unvalidated object produced by
+    /// [`Self::new`].
     pub fn field(&self, i: usize) -> Option<Variant<'m, 'v>> {
-        Some(
-            self.try_field(i)
-                .expect("validation error after construction"),
-        )
+        (i < self.len()).then(|| self.try_field_impl(i).expect("Invalid object 
field value"))
     }
 
     /// Fallible version of `field`. Returns field value by index, capturing 
validation errors
-    fn try_field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
-        let start_offset = self.header.field_offset_size.unpack_usize(
-            self.value,
-            self.field_offsets_start_byte,
-            i,
-        )?;
-        let value_start = self
-            .values_start_byte
-            .checked_add(start_offset)
-            .ok_or_else(|| overflow_error("offset of variant object field"))?;
-        let value_bytes = slice_from_slice(self.value, value_start..)?;
+    pub fn try_field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+        self.try_field_impl(i)?.validate()
+    }
+
+    // Attempts to retrieve the ith field value from the value region of the 
byte buffer; it
+    // performs only basic (constant-cost) validation.
+    fn try_field_impl(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+        let value_bytes = slice_from_slice(self.value, 
self.first_value_byte..)?;
+        let value_bytes = slice_from_slice(value_bytes, 
self.get_offset(i)?..)?;
         Variant::try_new_with_metadata(self.metadata, value_bytes)
     }
 
+    // Attempts to retrieve the ith offset from the field offset region of the 
byte buffer.
+    fn get_offset(&self, i: usize) -> Result<usize, ArrowError> {
+        let byte_range = self.first_field_offset_byte..self.first_value_byte;
+        let field_offsets = slice_from_slice(self.value, byte_range)?;
+        self.header.field_offset_size.unpack_usize(field_offsets, i)
+    }
+
     /// Get a field's name by index in `0..self.len()`
     ///
     /// # Panics
     /// If the variant object is corrupted (e.g., invalid offsets or field 
IDs).
     /// This should never happen since the constructor validates all data 
upfront.
     pub fn field_name(&self, i: usize) -> Option<&'m str> {
-        Some(
+        (i < self.len()).then(|| {

Review Comment:
   Another missing bounds check



##########
parquet-variant/src/variant/object.rs:
##########
@@ -29,111 +29,196 @@ const NUM_HEADER_BYTES: usize = 1;
 /// Header structure for [`VariantObject`]
 #[derive(Clone, Debug, PartialEq)]
 pub(crate) struct VariantObjectHeader {
-    field_offset_size: OffsetSizeBytes,
+    num_elements_size: OffsetSizeBytes,
     field_id_size: OffsetSizeBytes,
-    is_large: bool,
+    field_offset_size: OffsetSizeBytes,
 }
 
 impl VariantObjectHeader {
+    // Hide the ugly casting
+    const fn num_elements_size(&self) -> usize {
+        self.num_elements_size as _
+    }
+    const fn field_id_size(&self) -> usize {
+        self.field_id_size as _
+    }
+    const fn field_offset_size(&self) -> usize {
+        self.field_offset_size as _
+    }
+
+    // Avoid materializing this offset, since it's cheaply and safely 
computable
+    const fn field_ids_start_byte(&self) -> usize {
+        NUM_HEADER_BYTES + self.num_elements_size()
+    }
+
     pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
         // Parse the header byte to get object parameters
         let value_header = header_byte >> 2;
         let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
         let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 
bits
         let is_large = (value_header & 0x10) != 0; // 5th bit
-
+        let num_elements_size = match is_large {
+            true => OffsetSizeBytes::Four,
+            false => OffsetSizeBytes::One,
+        };
         Ok(Self {
-            field_offset_size: 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
+            num_elements_size,
             field_id_size: OffsetSizeBytes::try_new(field_id_size_minus_one)?,
-            is_large,
+            field_offset_size: 
OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
         })
     }
 }
 
 /// A [`Variant`] Object (struct with named fields).
+///
+/// See the [Variant spec] file for more information.
+///
+/// # Validation
+///
+/// Every instance of variant object is either _valid_ or _invalid_. depending 
on whether the
+/// underlying bytes are a valid encoding of a variant object subtype (see 
below).
+///
+/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully 
(and recursively)
+/// _validated_. They always contain _valid_ data, and infallible accesses 
such as iteration and
+/// indexing are panic-free. The validation cost is linear in the number of 
underlying bytes.
+///
+/// Instances produced by [`Self::new`] are _unvalidated_ and so they may 
contain either _valid_ or
+/// _invalid_ data. Infallible accesses such as iteration and indexing will 
panic if the underlying
+/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] 
and [`Self::get`] are
+/// provided as panic-free alternatives. [`Self::validate`] can also be used 
to _validate_ an
+/// _unvalidated_ instance, if desired.
+///
+/// _Unvalidated_ instances can be constructed in constant time. They can be 
useful if the caller
+/// knows the underlying bytes were already validated previously, or if the 
caller intends to
+/// perform a small number of (fallible) field accesses against a large object.
+///
+/// A _validated_ instance guarantees that:
+///
+/// - header byte is valid
+/// - num_elements is in bounds
+/// - field id array is in bounds
+/// - field offset array is in bounds
+/// - field value array is in bounds
+/// - all field ids are valid metadata dictionary entries (*)
+/// - field ids are lexically ordered according by their corresponding string 
values (*)
+/// - all field offsets are in bounds (*)
+/// - all field values are (recursively) _valid_ variant values (*)
+/// - the associated variant metadata is [valid] (*)
+///
+/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation 
checks (marked by `(*)`
+/// in the list above); it panics any of the other checks fails.
+///
+/// # Safety
+///
+/// Even an _invalid_ variant object instance is still _safe_ to use in the 
Rust sense. Accessing it
+/// with infallible methods may cause panics but will never lead to undefined 
behavior.
+///
+/// [valid]: VariantMetadata#Validation
+/// [Variant spec]: 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-object-basic_type2
 #[derive(Clone, Debug, PartialEq)]
 pub struct VariantObject<'m, 'v> {
     pub metadata: VariantMetadata<'m>,
     pub value: &'v [u8],
     header: VariantObjectHeader,
     num_elements: usize,
-    field_ids_start_byte: usize,
-    field_offsets_start_byte: usize,
-    values_start_byte: usize,
+    first_field_offset_byte: usize,
+    first_value_byte: usize,
+    validated: bool,
 }
 
 impl<'m, 'v> VariantObject<'m, 'v> {
-    /// Attempts to interpret `value` as a variant object value.
+    pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
+        Self::try_new_impl(metadata, value).expect("Invalid variant object")
+    }
+
+    /// Attempts to interpet `metadata` and `value` as a variant object.
     ///
     /// # Validation
     ///
     /// This constructor verifies that `value` points to a valid variant 
object value. In
     /// particular, that all field ids exist in `metadata`, and all offsets 
are in-bounds and point
     /// to valid objects.
-    // TODO: How to make the validation non-recursive while still making 
iterators safely infallible??
-    // See https://github.com/apache/arrow-rs/issues/7711
     pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> 
Result<Self, ArrowError> {
+        let mut new_self = Self::try_new_impl(metadata, value)?.validate()?;
+        new_self.validated = true;
+        Ok(new_self)
+    }
+
+    /// Attempts to interpet `metadata` and `value` as a variant object, 
performing only basic
+    /// (constant-cost) [validation].
+    ///
+    /// [validation]: Self#Validation
+    pub(crate) fn try_new_impl(
+        metadata: VariantMetadata<'m>,
+        value: &'v [u8],
+    ) -> Result<Self, ArrowError> {
         let header_byte = first_byte_from_slice(value)?;
         let header = VariantObjectHeader::try_new(header_byte)?;
 
         // Determine num_elements size based on is_large flag and fetch the 
value
-        let num_elements_size = if header.is_large {
-            OffsetSizeBytes::Four
-        } else {
-            OffsetSizeBytes::One
-        };
-        let num_elements = num_elements_size.unpack_usize(value, 
NUM_HEADER_BYTES, 0)?;
-
-        // Calculate byte offsets for different sections with overflow 
protection
-        let field_ids_start_byte = NUM_HEADER_BYTES
-            .checked_add(num_elements_size as usize)
-            .ok_or_else(|| overflow_error("offset of variant object field 
ids"))?;
-
-        let field_offsets_start_byte = num_elements
-            .checked_mul(header.field_id_size as usize)
-            .and_then(|n| n.checked_add(field_ids_start_byte))
+        let num_elements =
+            header
+                .num_elements_size
+                .unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?;
+
+        // Calculate byte offsets for field offsets and values with overflow 
protection, and verify
+        // they're in bounds
+        let first_field_offset_byte = num_elements
+            .checked_mul(header.field_id_size())
+            .and_then(|n| n.checked_add(header.field_ids_start_byte()))
             .ok_or_else(|| overflow_error("offset of variant object field 
offsets"))?;
 
-        let values_start_byte = num_elements
+        let first_value_byte = num_elements

Review Comment:
   renamed to match similar patterns elsewhere



##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -154,29 +246,47 @@ impl<'m> VariantMetadata<'m> {
     /// This offset is an index into the dictionary, at the boundary between 
string `i-1` and string
     /// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
     fn get_offset(&self, i: usize) -> Result<usize, ArrowError> {
-        // Skip the header byte and the dictionary_size entry (by offset_index 
+ 1)
-        let bytes = slice_from_slice(self.bytes, 
..self.dictionary_key_start_byte)?;
-        self.header
-            .offset_size
-            .unpack_usize(bytes, NUM_HEADER_BYTES, i + 1)
+        let offset_byte_range = 
self.header.first_offset_byte()..self.first_value_byte;
+        let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
+        self.header.offset_size.unpack_usize(bytes, i)
     }
 
-    /// Gets a dictionary entry by index
+    /// Attempts to retrieve a dictionary entry by index, failing if out of 
bounds or if the
+    /// underlying bytes are [invalid].
+    ///
+    /// [invalid]: Self#Validation
     pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
         let byte_range = self.get_offset(i)?..self.get_offset(i + 1)?;
-        string_from_slice(self.bytes, self.dictionary_key_start_byte, 
byte_range)
+        string_from_slice(self.bytes, self.first_value_byte, byte_range)
+    }
+
+    /// Returns an iterator that attempts to visit all dictionary entries, 
producing `Err` if the
+    /// iterator encounters [invalid] data.
+    ///
+    /// [invalid]: Self#Validation
+    pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str, 
ArrowError>> + '_ {
+        (0..self.dictionary_size).map(move |i| self.get(i))
     }
 
-    /// Get all dictionary entries as an Iterator of strings
+    /// Iterates over all dictionary entries. When working with [unvalidated] 
input, consider
+    /// [`Self::iter_try`] to avoid panics due to invalid data.
+    ///
+    /// [unvalidated]: Self#Validation
     pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
-        // NOTE: It is safe to unwrap because the constructor already made a 
successful traversal.
-        self.iter_checked().map(Result::unwrap)
+        self.iter_try()

Review Comment:
   No need for an `iter_try_impl` here, because `VariantMetadata` is not a 
nested type.



##########
parquet-variant/src/variant/list.rs:
##########
@@ -119,54 +226,51 @@ impl<'m, 'v> VariantList<'m, 'v> {
         self.len() == 0
     }
 
-    /// Returns element by index in `0..self.len()`, if any
+    /// Returns element by index in `0..self.len()`, if any. May panic if this 
list is [invalid].
+    ///
+    /// [invalid]: Self#Validation
     pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> {
-        if index >= self.num_elements {
-            return None;
-        }
-
-        match self.try_get(index) {
-            Ok(variant) => Some(variant),
-            Err(err) => panic!("validation error: {err}"),
-        }
+        (index < self.num_elements).then(|| {
+            self.try_get_impl(index)
+                .and_then(Variant::validate)
+                .expect("Invalid variant array element")
+        })
     }
 
     /// Fallible version of `get`. Returns element by index, capturing 
validation errors
-    fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
-        if index >= self.num_elements {
-            return Err(ArrowError::InvalidArgumentError(format!(
-                "Index {} out of bounds for list of length {}",
-                index, self.num_elements,
-            )));
-        }
-
-        // Skip header and num_elements bytes to read the offsets
-        let unpack = |i| {
-            self.header
-                .offset_size
-                .unpack_usize(self.value, self.first_offset_byte, i)
-        };
+    pub fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> 
{
+        self.try_get_impl(index)?.validate()
+    }
 
-        // Read the value bytes from the offsets
-        let variant_value_bytes = slice_from_slice_at_offset(
-            self.value,
-            self.first_value_byte,
-            unpack(index)?..unpack(index + 1)?,
-        )?;
-        let variant = Variant::try_new_with_metadata(self.metadata, 
variant_value_bytes)?;
-        Ok(variant)
+    /// Fallible iteration over the elements of this list.
+    pub fn iter_try(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, 
ArrowError>> + '_ {
+        (0..self.len()).map(move |i| self.try_get_impl(i))
     }
 
-    /// Iterates over the values of this list
+    /// Iterates over the values of this list. When working with [unvalidated] 
input, consider
+    /// [`Self::iter_try`] to avoid panics due to invalid data.
+    ///
+    /// [unvalidated]: Self#Validation
     pub fn iter(&self) -> impl Iterator<Item = Variant<'m, 'v>> + '_ {
-        // NOTE: It is safe to unwrap because the constructor already made a 
successful traversal.
-        self.iter_checked().map(Result::unwrap)
+        self.iter_try()
+            .map(|result| result.expect("Invalid variant list entry"))

Review Comment:
   We don't try to `validate` because that would trigger the recursion the 
caller was trying to avoid.



##########
parquet-variant/src/variant/list.rs:
##########
@@ -119,54 +226,51 @@ impl<'m, 'v> VariantList<'m, 'v> {
         self.len() == 0
     }
 
-    /// Returns element by index in `0..self.len()`, if any
+    /// Returns element by index in `0..self.len()`, if any. May panic if this 
list is [invalid].
+    ///
+    /// [invalid]: Self#Validation
     pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> {
-        if index >= self.num_elements {
-            return None;
-        }
-
-        match self.try_get(index) {
-            Ok(variant) => Some(variant),
-            Err(err) => panic!("validation error: {err}"),
-        }
+        (index < self.num_elements).then(|| {
+            self.try_get_impl(index)
+                .and_then(Variant::validate)
+                .expect("Invalid variant array element")
+        })
     }
 
     /// Fallible version of `get`. Returns element by index, capturing 
validation errors
-    fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
-        if index >= self.num_elements {
-            return Err(ArrowError::InvalidArgumentError(format!(
-                "Index {} out of bounds for list of length {}",
-                index, self.num_elements,
-            )));
-        }
-
-        // Skip header and num_elements bytes to read the offsets
-        let unpack = |i| {
-            self.header
-                .offset_size
-                .unpack_usize(self.value, self.first_offset_byte, i)
-        };

Review Comment:
   The offset-wrangling logic was factored out as a new `get_offset` method 
below.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to