scovich commented on code in PR #7878:
URL: https://github.com/apache/arrow-rs/pull/7878#discussion_r2190880881


##########
parquet-variant/src/variant/list.rs:
##########
@@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> {
     /// [validation]: Self#Validation
     pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
         if !self.validated {
+            /*
+            (1) the associated variant metadata is [valid] (*)
+            (2) all other offsets are in-bounds (*)
+            (3) all offsets are monotonically increasing (*)
+            (4) all values are (recursively) valid variant objects (*)
+            */
+
+            // (1)
             // Validate the metadata dictionary first, if not already 
validated, because we pass it
             // by value to all the children (who would otherwise re-validate 
it repeatedly).
             self.metadata = self.metadata.with_full_validation()?;
 
-            // Iterate over all string keys in this dictionary in order to 
prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            // (2), (4)
+            // if we check all values are valid by using offset lookups,
+            // we know all offsets are in bounds if correct
+
+            // note how we do this once!
+            let byte_range = 
self.header.first_offset_byte()..self.first_value_byte;
+            let offset_bytes = slice_from_slice(self.value, byte_range)?;
+
+            let offset_chunks = 
offset_bytes.chunks_exact(self.header.offset_size());
+            assert!(offset_chunks.remainder().is_empty()); // guaranteed by 
shallow validation

Review Comment:
   We're in a method that returns `Result`... why not use it instead of having 
to worry whether shallow validation covered the corner case?



##########
parquet-variant/src/variant/list.rs:
##########
@@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> {
     /// [validation]: Self#Validation
     pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
         if !self.validated {
+            /*
+            (1) the associated variant metadata is [valid] (*)
+            (2) all other offsets are in-bounds (*)
+            (3) all offsets are monotonically increasing (*)
+            (4) all values are (recursively) valid variant objects (*)
+            */
+
+            // (1)
             // Validate the metadata dictionary first, if not already 
validated, because we pass it
             // by value to all the children (who would otherwise re-validate 
it repeatedly).
             self.metadata = self.metadata.with_full_validation()?;
 
-            // Iterate over all string keys in this dictionary in order to 
prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            // (2), (4)
+            // if we check all values are valid by using offset lookups,
+            // we know all offsets are in bounds if correct
+
+            // note how we do this once!
+            let byte_range = 
self.header.first_offset_byte()..self.first_value_byte;
+            let offset_bytes = slice_from_slice(self.value, byte_range)?;
+
+            let offset_chunks = 
offset_bytes.chunks_exact(self.header.offset_size());
+            assert!(offset_chunks.remainder().is_empty()); // guaranteed by 
shallow validation
+
+            let offsets = offset_chunks
+                .map(|chunk| match self.header.offset_size {
+                    OffsetSizeBytes::One => chunk[0] as usize,
+                    OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], 
chunk[1]]) as usize,
+                    OffsetSizeBytes::Three => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) 
as usize
+                    }
+                    OffsetSizeBytes::Four => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 
chunk[3]]) as usize
+                    }
+                })
+                .collect::<Vec<_>>();
+
+            // (3)
+            let monotonic_offsets = offsets.is_sorted_by(|a, b| a < b);
+            if !monotonic_offsets {
+                return Err(ArrowError::InvalidArgumentError(
+                    "offsets are not monotonically increasing".to_string(),
+                ));
+            }
+
+            // (4)
+
+            let value_buffer = &self.value[self.first_value_byte..];
+
+            for i in 0..offsets.len() - 1 {
+                let start_offset = offsets[i];
+                let end_offset = offsets[i + 1];

Review Comment:
   consider doing:
   ```suggestion
               for i in 1..offsets.len() {
                   let start_offset = offsets[i - 1];
                   let end_offset = offsets[i];
   ```



##########
parquet-variant/src/variant/list.rs:
##########
@@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> {
     /// [validation]: Self#Validation
     pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
         if !self.validated {
+            /*
+            (1) the associated variant metadata is [valid] (*)
+            (2) all other offsets are in-bounds (*)
+            (3) all offsets are monotonically increasing (*)
+            (4) all values are (recursively) valid variant objects (*)
+            */
+
+            // (1)
             // Validate the metadata dictionary first, if not already 
validated, because we pass it
             // by value to all the children (who would otherwise re-validate 
it repeatedly).
             self.metadata = self.metadata.with_full_validation()?;
 
-            // Iterate over all string keys in this dictionary in order to 
prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            // (2), (4)
+            // if we check all values are valid by using offset lookups,
+            // we know all offsets are in bounds if correct
+
+            // note how we do this once!
+            let byte_range = 
self.header.first_offset_byte()..self.first_value_byte;
+            let offset_bytes = slice_from_slice(self.value, byte_range)?;
+
+            let offset_chunks = 
offset_bytes.chunks_exact(self.header.offset_size());
+            assert!(offset_chunks.remainder().is_empty()); // guaranteed by 
shallow validation
+
+            let offsets = offset_chunks
+                .map(|chunk| match self.header.offset_size {
+                    OffsetSizeBytes::One => chunk[0] as usize,
+                    OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], 
chunk[1]]) as usize,
+                    OffsetSizeBytes::Three => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) 
as usize
+                    }
+                    OffsetSizeBytes::Four => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 
chunk[3]]) as usize

Review Comment:
   Is this faster and/or simpler than try_into from slice to array?



##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -230,7 +227,98 @@ impl<'m> VariantMetadata<'m> {
         if !self.validated {
             // Iterate over all string keys in this dictionary in order to 
prove that the offset
             // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+
+            /*
+            As scovich pointed out, here are the things full validation must 
do:
+
+            (1) all other offsets are in-bounds (*)
+            (2) all offsets are monotonically increasing (*)
+            (3) all values are valid utf-8 (*)
+
+            I propose we add another validation check
+            (4) if sorted dictionary, check if dictionary fields are sorted
+
+                Doing this check will help us in objects with sorted 
dictionaries,
+                since we can guarantee sortedness by checking if field ids are 
increasing.
+            */
+
+            // (1) (2)
+            // Since shallow validation already computes the first and last 
offset
+            // if we guarantee monotonicity for all offsets, then we know they 
are all in-bounds
+
+            // notice how we do this ceremony only once!
+            let offset_byte_range = 
self.header.first_offset_byte()..self.first_value_byte;
+            let offset_bytes = slice_from_slice(self.bytes, 
offset_byte_range)?;
+
+            let offsets = offset_bytes
+                .chunks_exact(self.header.offset_size())
+                .map(|chunk| {
+                    // at this point, we know for a _fact_ that chunk will 
have `offset_size` bytes
+
+                    match self.header.offset_size {
+                        OffsetSizeBytes::One => chunk[0].into(),
+                        OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], 
chunk[1]]).into(),
+                        OffsetSizeBytes::Three => {
+                            u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 
0]) as usize
+                        }
+                        OffsetSizeBytes::Four => {
+                            u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 
chunk[3]]) as usize
+                        }
+                    }
+                })
+                .collect::<Vec<usize>>();
+
+            let offsets_monotonically_increasing = offsets.is_sorted_by(|a, b| 
a < b);
+
+            if !offsets_monotonically_increasing {
+                return Err(ArrowError::InvalidArgumentError(
+                    "offsets not monotonically increasing".to_string(),
+                ));
+            }
+
+            // (3)
+            // We don't take advantage of the values being packed side by side.
+            // For every value, we rerequest the entire value buffer and then 
slice into _just_ that value
+            // and parse into a str
+
+            // This looks like a great place to do vectorized utf8 validation
+            // plus, all subsequent attempts at parsing out values as a &str 
can be done using
+            // String::from_utf8_unchecked
+
+            let value_bytes = slice_from_slice(self.bytes, 
self.first_value_byte..)?;
+            let value_str = simdutf8::basic::from_utf8(value_bytes)
+                .map_err(|e| 
ArrowError::InvalidArgumentError(format!("{e:?}")))?;
+
+            // (4)
+            // if the metadata header marked this variant as having a sorted 
dictionary,
+            // we must check whether the fields are actually sorted
+
+            if self.header.is_sorted {
+                let mut prev_field_name = None;
+                let mut is_sorted = true;
+
+                for i in 0..offsets.len() - 1 {
+                    if !is_sorted {
+                        return Err(ArrowError::InvalidArgumentError(
+                            "variant marked as having sorted dictionary but is 
unsorted"
+                                .to_string(),
+                        ));
+                    }
+
+                    let offset_range = offsets[i]..offsets[i + 1];
+
+                    let field_name = value_str
+                        .get(offset_range)
+                        .ok_or_else(|| overflow_error("overflowed"))?;
+
+                    if let Some(prev_field_name) = prev_field_name {
+                        is_sorted = is_sorted && prev_field_name < field_name;
+                    }

Review Comment:
   by deferring the check to the next iteration, we risk missing the case where 
the last pair breaks ordering? Combined with the above, a better control flow 
might be (again, in a helper function so we can use `?`):
   ```rust
    let Some(prev_offset) = offsets.next().transpose()? else {
        return Ok(());
    };
       let field_names = offsets.scan(prev_offset, |prev_offset, offset| {
           let offset = offset?;
           if *prev_offset >= offset {
               return Err(... offset not monotonic ...);
           }
   
           let offset_range = prev_offset..offset;
           *prev_offset = offset;
   
        value_str
            .get(offset_range)
            .ok_or_else(|| overflow_error("overflowed"))
    });
   
       let Some(mut prev_field_name) = field_names.next().transpose()? else {
           return Ok(()); // empty dictionary, nothing to validate
       };
       
       for field_name in field_names {
           let field_name = field_name?;
           if self.header.is_sorted && prev_field_name >= field_name {
               return Err(... not sorted or has a duplicate key ...);
           }
           prev_field_name = field_name;
       } 
   ```



##########
parquet-variant/src/variant/list.rs:
##########
@@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> {
     /// [validation]: Self#Validation
     pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
         if !self.validated {
+            /*
+            (1) the associated variant metadata is [valid] (*)
+            (2) all other offsets are in-bounds (*)
+            (3) all offsets are monotonically increasing (*)
+            (4) all values are (recursively) valid variant objects (*)
+            */
+
+            // (1)
             // Validate the metadata dictionary first, if not already 
validated, because we pass it
             // by value to all the children (who would otherwise re-validate 
it repeatedly).
             self.metadata = self.metadata.with_full_validation()?;
 
-            // Iterate over all string keys in this dictionary in order to 
prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            // (2), (4)
+            // if we check all values are valid by using offset lookups,
+            // we know all offsets are in bounds if correct
+
+            // note how we do this once!
+            let byte_range = 
self.header.first_offset_byte()..self.first_value_byte;
+            let offset_bytes = slice_from_slice(self.value, byte_range)?;
+
+            let offset_chunks = 
offset_bytes.chunks_exact(self.header.offset_size());
+            assert!(offset_chunks.remainder().is_empty()); // guaranteed by 
shallow validation
+
+            let offsets = offset_chunks
+                .map(|chunk| match self.header.offset_size {
+                    OffsetSizeBytes::One => chunk[0] as usize,
+                    OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], 
chunk[1]]) as usize,
+                    OffsetSizeBytes::Three => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) 
as usize
+                    }
+                    OffsetSizeBytes::Four => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 
chunk[3]]) as usize
+                    }
+                })
+                .collect::<Vec<_>>();
+
+            // (3)
+            let monotonic_offsets = offsets.is_sorted_by(|a, b| a < b);

Review Comment:
   Update: The code below uses the materialized `offsets`.
   
   But for large objects, wouldn't it be cheaper (and simpler) to combine the 
checks instead of materializing the offset array? Most likely inside a helper 
method so the code can leverage `?`:
   ```rust
       let mut offsets = offset_chunks.map(|chunk| match chunk {
           ...
       });
    let Some(mut prev_offset) = offsets.next().transpose()? else {
        return Ok(());
    };
       for offset in offsets {
           let offset = offset?;
           if prev_offset >= offset {
               return Err(... not monotonic ...);
           }
   
           let value_bytes = slice_from_slice(value_buffer, 
prev_offset..offset)?;
           prev_offset = offset;
           let _ = Variant::try_new_with_metadata(self.metadata, value_bytes)?;
       };
       Ok(())
   }
   ```
   



##########
parquet-variant/src/variant/list.rs:
##########
@@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> {
     /// [validation]: Self#Validation
     pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
         if !self.validated {
+            /*
+            (1) the associated variant metadata is [valid] (*)
+            (2) all other offsets are in-bounds (*)
+            (3) all offsets are monotonically increasing (*)
+            (4) all values are (recursively) valid variant objects (*)
+            */
+
+            // (1)
             // Validate the metadata dictionary first, if not already 
validated, because we pass it
             // by value to all the children (who would otherwise re-validate 
it repeatedly).
             self.metadata = self.metadata.with_full_validation()?;
 
-            // Iterate over all string keys in this dictionary in order to 
prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            // (2), (4)
+            // if we check all values are valid by using offset lookups,
+            // we know all offsets are in bounds if correct
+
+            // note how we do this once!
+            let byte_range = 
self.header.first_offset_byte()..self.first_value_byte;
+            let offset_bytes = slice_from_slice(self.value, byte_range)?;
+
+            let offset_chunks = 
offset_bytes.chunks_exact(self.header.offset_size());
+            assert!(offset_chunks.remainder().is_empty()); // guaranteed by 
shallow validation
+
+            let offsets = offset_chunks
+                .map(|chunk| match self.header.offset_size {
+                    OffsetSizeBytes::One => chunk[0] as usize,
+                    OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], 
chunk[1]]) as usize,
+                    OffsetSizeBytes::Three => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) 
as usize
+                    }
+                    OffsetSizeBytes::Four => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 
chunk[3]]) as usize
+                    }
+                })
+                .collect::<Vec<_>>();
+
+            // (3)
+            let monotonic_offsets = offsets.is_sorted_by(|a, b| a < b);

Review Comment:
   Why not 
[`Iterator::is_sorted_by`](https://doc.rust-lang.org/std/iter/trait.Iterator.html?search=_sorted#method.is_sorted_by)?
   ```rust
   let offsets_monotonically_increasing = offset_chunks
       .map(...)
       .is_sorted_by(...);
   ```
   (again below)



##########
parquet-variant/src/variant/list.rs:
##########
@@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> {
     /// [validation]: Self#Validation
     pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
         if !self.validated {
+            /*
+            (1) the associated variant metadata is [valid] (*)
+            (2) all other offsets are in-bounds (*)
+            (3) all offsets are monotonically increasing (*)
+            (4) all values are (recursively) valid variant objects (*)
+            */
+
+            // (1)
             // Validate the metadata dictionary first, if not already 
validated, because we pass it
             // by value to all the children (who would otherwise re-validate 
it repeatedly).
             self.metadata = self.metadata.with_full_validation()?;
 
-            // Iterate over all string keys in this dictionary in order to 
prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            // (2), (4)
+            // if we check all values are valid by using offset lookups,
+            // we know all offsets are in bounds if correct
+
+            // note how we do this once!
+            let byte_range = 
self.header.first_offset_byte()..self.first_value_byte;
+            let offset_bytes = slice_from_slice(self.value, byte_range)?;
+
+            let offset_chunks = 
offset_bytes.chunks_exact(self.header.offset_size());
+            assert!(offset_chunks.remainder().is_empty()); // guaranteed by 
shallow validation
+
+            let offsets = offset_chunks
+                .map(|chunk| match self.header.offset_size {
+                    OffsetSizeBytes::One => chunk[0] as usize,
+                    OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], 
chunk[1]]) as usize,
+                    OffsetSizeBytes::Three => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) 
as usize
+                    }
+                    OffsetSizeBytes::Four => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 
chunk[3]]) as usize
+                    }
+                })
+                .collect::<Vec<_>>();
+
+            // (3)
+            let monotonic_offsets = offsets.is_sorted_by(|a, b| a < b);
+            if !monotonic_offsets {
+                return Err(ArrowError::InvalidArgumentError(
+                    "offsets are not monotonically increasing".to_string(),
+                ));
+            }
+
+            // (4)
+
+            let value_buffer = &self.value[self.first_value_byte..];

Review Comment:
   again, why risk panic?
   ```suggestion
               let value_buffer = slice_from_slice(self.value, 
self.first_value_byte..)?;
   ```



##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -230,7 +227,98 @@ impl<'m> VariantMetadata<'m> {
         if !self.validated {
             // Iterate over all string keys in this dictionary in order to 
prove that the offset
             // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+
+            /*
+            As scovich pointed out, here are the things full validation must 
do:
+
+            (1) all other offsets are in-bounds (*)
+            (2) all offsets are monotonically increasing (*)
+            (3) all values are valid utf-8 (*)
+
+            I propose we add another validation check
+            (4) if sorted dictionary, check if dictionary fields are sorted

Review Comment:
   👍 



##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -230,7 +227,98 @@ impl<'m> VariantMetadata<'m> {
         if !self.validated {
             // Iterate over all string keys in this dictionary in order to 
prove that the offset
             // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+
+            /*
+            As scovich pointed out, here are the things full validation must 
do:

Review Comment:
   aside: flattering, but probably not super helpful to reference random GitHub 
handles in the code?



##########
parquet-variant/src/variant/list.rs:
##########
@@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> {
     /// [validation]: Self#Validation
     pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
         if !self.validated {
+            /*
+            (1) the associated variant metadata is [valid] (*)
+            (2) all other offsets are in-bounds (*)
+            (3) all offsets are monotonically increasing (*)
+            (4) all values are (recursively) valid variant objects (*)
+            */
+
+            // (1)
             // Validate the metadata dictionary first, if not already 
validated, because we pass it
             // by value to all the children (who would otherwise re-validate 
it repeatedly).
             self.metadata = self.metadata.with_full_validation()?;
 
-            // Iterate over all string keys in this dictionary in order to 
prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            // (2), (4)
+            // if we check all values are valid by using offset lookups,
+            // we know all offsets are in bounds if correct
+
+            // note how we do this once!
+            let byte_range = 
self.header.first_offset_byte()..self.first_value_byte;
+            let offset_bytes = slice_from_slice(self.value, byte_range)?;
+
+            let offset_chunks = 
offset_bytes.chunks_exact(self.header.offset_size());
+            assert!(offset_chunks.remainder().is_empty()); // guaranteed by 
shallow validation
+
+            let offsets = offset_chunks
+                .map(|chunk| match self.header.offset_size {
+                    OffsetSizeBytes::One => chunk[0] as usize,
+                    OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], 
chunk[1]]) as usize,
+                    OffsetSizeBytes::Three => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) 
as usize
+                    }
+                    OffsetSizeBytes::Four => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 
chunk[3]]) as usize
+                    }
+                })
+                .collect::<Vec<_>>();
+
+            // (3)
+            let monotonic_offsets = offsets.is_sorted_by(|a, b| a < b);
+            if !monotonic_offsets {
+                return Err(ArrowError::InvalidArgumentError(
+                    "offsets are not monotonically increasing".to_string(),
+                ));
+            }
+
+            // (4)
+
+            let value_buffer = &self.value[self.first_value_byte..];
+
+            for i in 0..offsets.len() - 1 {
+                let start_offset = offsets[i];
+                let end_offset = offsets[i + 1];
+
+                let value_bytes = slice_from_slice(value_buffer, 
start_offset..end_offset)?;
+                Variant::try_new_with_metadata(self.metadata, value_bytes)?;

Review Comment:
   At some point, copying that fat metadata over and over will be a meaningful 
contributor to the total cost. 
   
   Are we _sure_ it's ~free? 
   
   If not sure, should we somehow allow creating variant instances with 
metadata references, at least internally for validation purposes?



##########
parquet-variant/src/variant/object.rs:
##########
@@ -206,13 +205,118 @@ impl<'m, 'v> VariantObject<'m, 'v> {
     /// [validation]: Self#Validation
     pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
         if !self.validated {
+            /*
+            (1) the associated variant metadata is [valid] (*)
+            (2) all field ids are valid metadata dictionary entries (*)
+            (3) field ids are lexically ordered according by their 
corresponding string values (*)
+            (4) all field offsets are in bounds (*)
+            (5) all field values are (recursively) _valid_ variant values (*)
+
+            */
+
+            // (1)
             // Validate the metadata dictionary first, if not already 
validated, because we pass it
             // by value to all the children (who would otherwise re-validate 
it repeatedly).
             self.metadata = self.metadata.with_full_validation()?;
 
-            // Iterate over all string keys in this dictionary in order to 
prove that the offset
-            // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+            // (2)
+            // Field ids serve as indexes into the metadata buffer.
+            // As long as we guarantee the largest field id is < dictionary 
size,
+            // we can guarantee all field ids are valid metadata dictionaries
+
+            // (2), (3)
+            let byte_range = 
self.header.field_ids_start_byte()..self.first_field_offset_byte;
+            let field_id_bytes = slice_from_slice(self.value, byte_range)?;
+            // let field_id = 
self.header.field_id_size.unpack_usize(field_id_bytes, i)?;
+
+            let field_id_chunks = 
field_id_bytes.chunks_exact(self.header.field_id_size());
+            assert!(field_id_chunks.remainder().is_empty()); // guaranteed to 
be none
+
+            let field_ids = field_id_chunks
+                .map(|chunk| match self.header.field_id_size {
+                    OffsetSizeBytes::One => chunk[0] as usize,
+                    OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], 
chunk[1]]) as usize,
+                    OffsetSizeBytes::Three => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) 
as usize
+                    }
+                    OffsetSizeBytes::Four => {
+                        u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 
chunk[3]]) as usize
+                    }
+                })
+                .collect::<Vec<_>>();

Review Comment:
   As above, I expect it will be straightforward to avoid materializing these 
iterators
   (again below)



##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -230,7 +227,98 @@ impl<'m> VariantMetadata<'m> {
         if !self.validated {
             // Iterate over all string keys in this dictionary in order to 
prove that the offset
             // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+
+            /*
+            As scovich pointed out, here are the things full validation must 
do:
+
+            (1) all other offsets are in-bounds (*)
+            (2) all offsets are monotonically increasing (*)
+            (3) all values are valid utf-8 (*)
+
+            I propose we add another validation check
+            (4) if sorted dictionary, check if dictionary fields are sorted
+
+                Doing this check will help us in objects with sorted 
dictionaries,
+                since we can guarantee sortedness by checking if field ids are 
increasing.
+            */
+
+            // (1) (2)
+            // Since shallow validation already computes the first and last 
offset
+            // if we guarantee monotonicity for all offsets, then we know they 
are all in-bounds
+
+            // notice how we do this ceremony only once!
+            let offset_byte_range = 
self.header.first_offset_byte()..self.first_value_byte;
+            let offset_bytes = slice_from_slice(self.bytes, 
offset_byte_range)?;
+
+            let offsets = offset_bytes
+                .chunks_exact(self.header.offset_size())
+                .map(|chunk| {
+                    // at this point, we know for a _fact_ that chunk will 
have `offset_size` bytes
+
+                    match self.header.offset_size {
+                        OffsetSizeBytes::One => chunk[0].into(),
+                        OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], 
chunk[1]]).into(),
+                        OffsetSizeBytes::Three => {
+                            u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 
0]) as usize
+                        }
+                        OffsetSizeBytes::Four => {
+                            u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 
chunk[3]]) as usize
+                        }
+                    }
+                })
+                .collect::<Vec<usize>>();
+
+            let offsets_monotonically_increasing = offsets.is_sorted_by(|a, b| 
a < b);

Review Comment:
   As above, no need to materialize the iterator if we combine the checks into 
a single loop?
   (but also see comment below)
   



##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -230,7 +227,98 @@ impl<'m> VariantMetadata<'m> {
         if !self.validated {
             // Iterate over all string keys in this dictionary in order to 
prove that the offset
             // array is valid, all offsets are in bounds, and all string bytes 
are valid utf-8.
-            validate_fallible_iterator(self.iter_try())?;
+
+            /*
+            As scovich pointed out, here are the things full validation must 
do:
+
+            (1) all other offsets are in-bounds (*)
+            (2) all offsets are monotonically increasing (*)
+            (3) all values are valid utf-8 (*)
+
+            I propose we add another validation check
+            (4) if sorted dictionary, check if dictionary fields are sorted
+
+                Doing this check will help us in objects with sorted 
dictionaries,
+                since we can guarantee sortedness by checking if field ids are 
increasing.
+            */
+
+            // (1) (2)
+            // Since shallow validation already computes the first and last 
offset
+            // if we guarantee monotonicity for all offsets, then we know they 
are all in-bounds
+
+            // notice how we do this ceremony only once!
+            let offset_byte_range = 
self.header.first_offset_byte()..self.first_value_byte;
+            let offset_bytes = slice_from_slice(self.bytes, 
offset_byte_range)?;
+
+            let offsets = offset_bytes
+                .chunks_exact(self.header.offset_size())
+                .map(|chunk| {
+                    // at this point, we know for a _fact_ that chunk will 
have `offset_size` bytes
+
+                    match self.header.offset_size {
+                        OffsetSizeBytes::One => chunk[0].into(),
+                        OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], 
chunk[1]]).into(),
+                        OffsetSizeBytes::Three => {
+                            u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 
0]) as usize
+                        }
+                        OffsetSizeBytes::Four => {
+                            u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 
chunk[3]]) as usize
+                        }
+                    }
+                })
+                .collect::<Vec<usize>>();
+
+            let offsets_monotonically_increasing = offsets.is_sorted_by(|a, b| 
a < b);
+
+            if !offsets_monotonically_increasing {
+                return Err(ArrowError::InvalidArgumentError(
+                    "offsets not monotonically increasing".to_string(),
+                ));
+            }
+
+            // (3)
+            // We don't take advantage of the values being packed side by side.
+            // For every value, we rerequest the entire value buffer and then 
slice into _just_ that value
+            // and parse into a str
+
+            // This looks like a great place to do vectorized utf8 validation
+            // plus, all subsequent attempts at parsing out values as a &str 
can be done using
+            // String::from_utf8_unchecked
+
+            let value_bytes = slice_from_slice(self.bytes, 
self.first_value_byte..)?;
+            let value_str = simdutf8::basic::from_utf8(value_bytes)
+                .map_err(|e| 
ArrowError::InvalidArgumentError(format!("{e:?}")))?;
+
+            // (4)
+            // if the metadata header marked this variant as having a sorted 
dictionary,
+            // we must check whether the fields are actually sorted
+
+            if self.header.is_sorted {
+                let mut prev_field_name = None;
+                let mut is_sorted = true;
+
+                for i in 0..offsets.len() - 1 {

Review Comment:
   as above, `1..offsets.len()` and adjust indexing below?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to