alamb commented on code in PR #7666:
URL: https://github.com/apache/arrow-rs/pull/7666#discussion_r2151997857
##########
parquet-variant/src/variant.rs:
##########
@@ -377,30 +534,134 @@ impl<'m, 'v> VariantArray<'m, 'v> {
.checked_add(value_bytes)
.ok_or_else(overflow)?;
- // Skip num_elements bytes to read the offsets
- let start_field_offset_from_first_value_byte =
- offset_size.unpack_usize(self.value_data, first_offset_byte,
index)?;
- let end_field_offset_from_first_value_byte =
- offset_size.unpack_usize(self.value_data, first_offset_byte, index
+ 1)?;
+ // Verify that the last offset array entry is inside the value slice
+ let last_offset_byte = first_offset_byte + n_offsets * offset_size as
usize;
+ if last_offset_byte > value.len() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Last offset array entry at offset {} with length {} is
outside the value slice of length {}",
+ last_offset_byte,
+ offset_size as usize,
+ value.len()
+ )));
+ }
+
+ // Verify that the value of the last offset array entry fits inside
the value slice
+ let last_offset = offset_size.unpack_usize(value, first_offset_byte,
num_elements)?;
+ if first_value_byte + last_offset > value.len() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Last offset value {} at offset {} is outside the value slice
of length {}",
+ last_offset,
+ first_value_byte,
+ value.len()
+ )));
+ }
+
+ Ok(Self {
+ offset_size,
+ is_large,
+ num_elements,
+ first_offset_byte,
+ first_value_byte,
+ })
+ }
+
+ /// Returns the number of elements in this list
+ pub fn num_elements(&self) -> usize {
+ self.num_elements
+ }
+
+ /// Returns the offset size in bytes
+ pub fn offset_size(&self) -> usize {
+ self.offset_size as _
+ }
+
+ /// Returns whether this is a large list
+ pub fn is_large(&self) -> bool {
+ self.is_large
+ }
+
+ /// Returns the byte offset where the offset array starts
+ pub fn first_offset_byte(&self) -> usize {
+ self.first_offset_byte
+ }
+
+ /// Returns the byte offset where the values start
+ pub fn first_value_byte(&self) -> usize {
+ self.first_value_byte
+ }
+}
+
+// NOTE: We differ from the variant spec and call it "list" instead of "array"
in order to be
+// consistent with parquet and arrow type naming. Otherwise, the name would
conflict with the
+// `VariantArray : Array` we must eventually define for variant-typed arrow
arrays.
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantList<'m, 'v> {
+ pub metadata: VariantMetadata<'m>,
+ pub value: &'v [u8],
+ header: VariantListHeader,
+}
+
+impl<'m, 'v> VariantList<'m, 'v> {
+ pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) ->
Result<Self, ArrowError> {
+ Ok(Self {
+ metadata,
+ value,
+ header: VariantListHeader::try_new(value)?,
+ })
+ }
+
+ /// Return the length of this array
+ pub fn len(&self) -> usize {
+ self.header.num_elements()
+ }
+
+ /// Is the array of zero length
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>,
ArrowError> {
Review Comment:
Yeah, I think in general this is hitting the tension you mentioned in the
earlier PRs of early vs late validation.
The more I think about it the more I think we should move the validation to
construction because:
1. The only reason to create a `Variant` in the first place is to access its
data, so I think we would end up validating it almost immediately on read
2. `Variants` are constructed once but read many times so validating up
front is probably faster
3. We could offer an `unchecked` variant for construction if performance
overhead is hight that skips the validation
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]