viirya commented on code in PR #7878:
URL: https://github.com/apache/arrow-rs/pull/7878#discussion_r2196635404
##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -228,9 +225,48 @@ impl<'m> VariantMetadata<'m> {
/// [validation]: Self#Validation
pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
if !self.validated {
- // Iterate over all string keys in this dictionary in order to prove that the offset
- // array is valid, all offsets are in bounds, and all string bytes are valid utf-8.
- validate_fallible_iterator(self.iter_try())?;
+ let offset_bytes = slice_from_slice(
+ self.bytes,
+ self.header.first_offset_byte()..self.first_value_byte,
+ )?;
+
+ let offsets =
+ map_bytes_to_offsets(offset_bytes,
self.header.offset_size).collect::<Vec<_>>();
+
+ // Validate offsets are in-bounds and monotonically increasing.
+ // Since shallow validation ensures the first and last offsets are in bounds, we can also verify all offsets
+ // are in-bounds by checking if offsets are monotonically increasing.
+ let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
+ if !are_offsets_monotonic {
+ return Err(ArrowError::InvalidArgumentError(
+ "offsets not monotonically increasing".to_string(),
+ ));
+ }
+
+ // Verify the string values in the dictionary are UTF-8 encoded strings.
+ let value_buffer = slice_from_slice(self.bytes,
self.first_value_byte..)?;
+ let value_str = simdutf8::basic::from_utf8(value_buffer)
Review Comment:
What is the difference between the currently used `str::from_utf8` and this
`simdutf8::basic::from_utf8`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]