alamb commented on code in PR #7666: URL: https://github.com/apache/arrow-rs/pull/7666#discussion_r2150611864
########## parquet-variant/src/variant.rs: ########## @@ -377,30 +534,134 @@ impl<'m, 'v> VariantArray<'m, 'v> { .checked_add(value_bytes) .ok_or_else(overflow)?; - // Skip num_elements bytes to read the offsets - let start_field_offset_from_first_value_byte = - offset_size.unpack_usize(self.value_data, first_offset_byte, index)?; - let end_field_offset_from_first_value_byte = - offset_size.unpack_usize(self.value_data, first_offset_byte, index + 1)?; + // Verify that the last offset array entry is inside the value slice + let last_offset_byte = first_offset_byte + n_offsets * offset_size as usize; + if last_offset_byte > value.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Last offset array entry at offset {} with length {} is outside the value slice of length {}", + last_offset_byte, + offset_size as usize, + value.len() + ))); + } + + // Verify that the value of the last offset array entry fits inside the value slice + let last_offset = offset_size.unpack_usize(value, first_offset_byte, num_elements)?; + if first_value_byte + last_offset > value.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Last offset value {} at offset {} is outside the value slice of length {}", + last_offset, + first_value_byte, + value.len() + ))); + } + + Ok(Self { + offset_size, + is_large, + num_elements, + first_offset_byte, + first_value_byte, + }) + } + + /// Returns the number of elements in this list + pub fn num_elements(&self) -> usize { + self.num_elements + } + + /// Returns the offset size in bytes + pub fn offset_size(&self) -> usize { + self.offset_size as _ + } + + /// Returns whether this is a large list + pub fn is_large(&self) -> bool { + self.is_large + } + + /// Returns the byte offset where the offset array starts + pub fn first_offset_byte(&self) -> usize { + self.first_offset_byte + } + + /// Returns the byte offset where the values start + pub fn first_value_byte(&self) -> usize { + self.first_value_byte + } +} + +// NOTE: We differ from the variant spec and call it "list" instead of "array" in order to be +// consistent with parquet and arrow type naming. Otherwise, the name would conflict with the +// `VariantArray : Array` we must eventually define for variant-typed arrow arrays. +#[derive(Clone, Debug, PartialEq)] +pub struct VariantList<'m, 'v> { + pub metadata: VariantMetadata<'m>, + pub value: &'v [u8], + header: VariantListHeader, +} + +impl<'m, 'v> VariantList<'m, 'v> { + pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> { + Ok(Self { + metadata, + value, + header: VariantListHeader::try_new(value)?, + }) + } + + /// Return the length of this array + pub fn len(&self) -> usize { + self.header.num_elements() + } + + /// Is the array of zero length + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>, ArrowError> { Review Comment: Something I noticed here was that it would be really nice if: 1. This returned an Iterator rather than a `Result` 2. We could implement `iter()` and `IntoIterator` for VariantList That would make using it more ergonomic, though it would require either panic'ing or else validating the offsets on construction 🤔 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org