scovich commented on code in PR #7704:
URL: https://github.com/apache/arrow-rs/pull/7704#discussion_r2155543305
##########
parquet-variant/src/variant.rs:
##########
@@ -154,64 +152,35 @@ impl<'m> VariantMetadata<'m> {
}
pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
- let header = VariantMetadataHeader::try_new(bytes)?;
+ let header = first_byte_from_slice(bytes)?;
+ let header = VariantMetadataHeader::try_new(header)?;
+
// Offset 1, index 0 because first element after header is dictionary
size
let dict_size = header.offset_size.unpack_usize(bytes, 1, 0)?;
- // Check that we have the correct metadata length according to
dictionary_size, or return
- // error early.
- // Minimum number of bytes the metadata buffer must contain:
- // 1 byte header
- // + offset_size-byte `dictionary_size` field
- // + (dict_size + 1) offset entries, each `offset_size` bytes. (Table
size, essentially)
- // 1 + offset_size + (dict_size + 1) * offset_size
+ // Calculate the starting offset of the dictionary string bytes.
+ //
+ // Value header, dict_size (offset_size bytes), and dict_size+1 offsets
+ // = 1 + offset_size + (dict_size + 1) * offset_size
// = (dict_size + 2) * offset_size + 1
- let offset_size = header.offset_size as usize; // Cheap to copy
-
let dictionary_key_start_byte = dict_size
.checked_add(2)
- .and_then(|n| n.checked_mul(offset_size))
+ .and_then(|n| n.checked_mul(header.offset_size as usize))
.and_then(|n| n.checked_add(1))
.ok_or_else(|| ArrowError::InvalidArgumentError("metadata length
overflow".into()))?;
-
- if bytes.len() < dictionary_key_start_byte {
- return Err(ArrowError::InvalidArgumentError(
- "Metadata shorter than dictionary_size implies".to_string(),
- ));
- }
-
- // Check that all offsets are monotonically increasing
- let mut offsets = (0..=dict_size).map(|i|
header.offset_size.unpack_usize(bytes, 1, i + 1));
- let Some(Ok(mut end @ 0)) = offsets.next() else {
- return Err(ArrowError::InvalidArgumentError(
- "First offset is non-zero".to_string(),
- ));
- };
-
- for offset in offsets {
- let offset = offset?;
- if end >= offset {
- return Err(ArrowError::InvalidArgumentError(
- "Offsets are not monotonically increasing".to_string(),
- ));
- }
- end = offset;
- }
-
- // Verify the buffer covers the whole dictionary-string section
- if end > bytes.len() - dictionary_key_start_byte {
- // `prev` holds the last offset seen still
- return Err(ArrowError::InvalidArgumentError(
- "Last offset does not equal dictionary length".to_string(),
- ));
- }
-
- Ok(Self {
+ let s = Self {
bytes,
header,
dict_size,
dictionary_key_start_byte,
- })
+ };
+
+ // Verify that `iter` can safely `unwrap` the items produced by this
iterator.
+ //
+ // This has the side effect of validating the offset array and proving
that the string bytes
+ // are all in bounds.
Review Comment:
I took a stab at it, PTAL.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]