friendlymatthew commented on code in PR #7878:
URL: https://github.com/apache/arrow-rs/pull/7878#discussion_r2196412403
##########
parquet-variant/src/variant/object.rs:
##########
@@ -206,13 +205,118 @@ impl<'m, 'v> VariantObject<'m, 'v> {
/// [validation]: Self#Validation
pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
if !self.validated {
+ /*
+ (1) the associated variant metadata is [valid] (*)
+ (2) all field ids are valid metadata dictionary entries (*)
+ (3) field ids are lexically ordered according by their
corresponding string values (*)
+ (4) all field offsets are in bounds (*)
+ (5) all field values are (recursively) _valid_ variant values (*)
+
+ */
+
+ // (1)
// Validate the metadata dictionary first, if not already
validated, because we pass it
// by value to all the children (who would otherwise re-validate
it repeatedly).
self.metadata = self.metadata.with_full_validation()?;
- // Iterate over all string keys in this dictionary in order to
prove that the offset
- // array is valid, all offsets are in bounds, and all string bytes
are valid utf-8.
- validate_fallible_iterator(self.iter_try())?;
+ // (2)
+ // Field ids serve as indexes into the metadata buffer.
+ // As long as we guarantee the largest field id is < dictionary
size,
+ // we can guarantee all field ids are valid metadata dictionaries
+
+ // (2), (3)
+ let byte_range =
self.header.field_ids_start_byte()..self.first_field_offset_byte;
+ let field_id_bytes = slice_from_slice(self.value, byte_range)?;
+ // let field_id =
self.header.field_id_size.unpack_usize(field_id_bytes, i)?;
+
+ let field_id_chunks =
field_id_bytes.chunks_exact(self.header.field_id_size());
+ assert!(field_id_chunks.remainder().is_empty()); // guaranteed to
be none
+
+ let field_ids = field_id_chunks
+ .map(|chunk| match self.header.field_id_size {
+ OffsetSizeBytes::One => chunk[0] as usize,
+ OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0],
chunk[1]]) as usize,
+ OffsetSizeBytes::Three => {
+ u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0])
as usize
+ }
+ OffsetSizeBytes::Four => {
+ u32::from_le_bytes([chunk[0], chunk[1], chunk[2],
chunk[3]]) as usize
+ }
+ })
+ .collect::<Vec<_>>();
Review Comment:
Hm happy to change, but I think in this case the extra allocation isn’t
hurting perf very much and it makes the code a lot simpler to reason about
because we can split up the following checks better.
I could inline calls to `map_bytes_to_offsets` everywhere we loop over
`field_ids`, but that doesn’t seem much better than doing a single allocation
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]