scovich commented on code in PR #7666:
URL: https://github.com/apache/arrow-rs/pull/7666#discussion_r2151042187
##########
parquet-variant/src/variant.rs:
##########
@@ -377,30 +534,134 @@ impl<'m, 'v> VariantArray<'m, 'v> {
.checked_add(value_bytes)
.ok_or_else(overflow)?;
- // Skip num_elements bytes to read the offsets
- let start_field_offset_from_first_value_byte =
- offset_size.unpack_usize(self.value_data, first_offset_byte,
index)?;
- let end_field_offset_from_first_value_byte =
- offset_size.unpack_usize(self.value_data, first_offset_byte, index
+ 1)?;
+ // Verify that the last offset array entry is inside the value slice
+ let last_offset_byte = first_offset_byte + n_offsets * offset_size as
usize;
+ if last_offset_byte > value.len() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Last offset array entry at offset {} with length {} is
outside the value slice of length {}",
+ last_offset_byte,
+ offset_size as usize,
+ value.len()
+ )));
+ }
+
+ // Verify that the value of the last offset array entry fits inside
the value slice
+ let last_offset = offset_size.unpack_usize(value, first_offset_byte,
num_elements)?;
+ if first_value_byte + last_offset > value.len() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Last offset value {} at offset {} is outside the value slice
of length {}",
+ last_offset,
+ first_value_byte,
+ value.len()
+ )));
+ }
+
+ Ok(Self {
+ offset_size,
+ is_large,
+ num_elements,
+ first_offset_byte,
+ first_value_byte,
+ })
+ }
+
+ /// Returns the number of elements in this list
+ pub fn num_elements(&self) -> usize {
+ self.num_elements
+ }
+
+ /// Returns the offset size in bytes
+ pub fn offset_size(&self) -> usize {
+ self.offset_size as _
+ }
+
+ /// Returns whether this is a large list
+ pub fn is_large(&self) -> bool {
+ self.is_large
+ }
+
+ /// Returns the byte offset where the offset array starts
+ pub fn first_offset_byte(&self) -> usize {
+ self.first_offset_byte
+ }
+
+ /// Returns the byte offset where the values start
+ pub fn first_value_byte(&self) -> usize {
+ self.first_value_byte
+ }
+}
+
+// NOTE: We differ from the variant spec and call it "list" instead of "array"
in order to be
+// consistent with parquet and arrow type naming. Otherwise, the name would
conflict with the
+// `VariantArray : Array` we must eventually define for variant-typed arrow
arrays.
+#[derive(Clone, Debug, PartialEq)]
+pub struct VariantList<'m, 'v> {
+ pub metadata: VariantMetadata<'m>,
+ pub value: &'v [u8],
+ header: VariantListHeader,
+}
+
+impl<'m, 'v> VariantList<'m, 'v> {
+ pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) ->
Result<Self, ArrowError> {
+ Ok(Self {
+ metadata,
+ value,
+ header: VariantListHeader::try_new(value)?,
+ })
+ }
+
+ /// Return the length of this array
+ pub fn len(&self) -> usize {
+ self.header.num_elements()
+ }
+
+ /// Is the array of zero length
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ pub fn values(&self) -> Result<impl Iterator<Item = Variant<'m, 'v>>,
ArrowError> {
Review Comment:
Hmm, this is tricky. Ideally we wouldn't even materialize the result just to
iterate over it, but that would require an iterator with `Item = Result` which
is yet more annoying.
Definitely not a fan of allowing untrusted input to cause a panic.
But if we want to make this method infallible, we'd need to pay the
validation cost in the constructor.
So it seems like we have two choices:
1. Pay O(n) to validate the offsets in the constructor, and `unwrap` here.
* PRO: Cleaner API, allows to iterate without materializing the result
first
2. Keep as-is or even return an iterator of result instead of result of
iterator
* PRO: Provably panic-free (no `unwrap` to reason about)
If we went for 1/, I'd favor an internal method that returns an iterator of
result. The constructor would instantiate the iterator and verify it's all-ok,
at which point we _know_ `values` can safely invoke `map(Result::unwrap)`
because we already consumed the iterator once. Otherwise, I worry the checks in
the constructor could diverge from the checks we trigger here, causing an
unexpected panic.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]