scovich commented on code in PR #7878: URL: https://github.com/apache/arrow-rs/pull/7878#discussion_r2190880881
########## parquet-variant/src/variant/list.rs: ########## @@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> { /// [validation]: Self#Validation pub fn with_full_validation(mut self) -> Result<Self, ArrowError> { if !self.validated { + /* + (1) the associated variant metadata is [valid] (*) + (2) all other offsets are in-bounds (*) + (3) all offsets are monotonically increasing (*) + (4) all values are (recursively) valid variant objects (*) + */ + + // (1) // Validate the metadata dictionary first, if not already validated, because we pass it // by value to all the children (who would otherwise re-validate it repeatedly). self.metadata = self.metadata.with_full_validation()?; - // Iterate over all string keys in this dictionary in order to prove that the offset - // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + // (2), (4) + // if we check all values are valid by using offset lookups, + // we know all offsets are in bounds if correct + + // note how we do this once! + let byte_range = self.header.first_offset_byte()..self.first_value_byte; + let offset_bytes = slice_from_slice(self.value, byte_range)?; + + let offset_chunks = offset_bytes.chunks_exact(self.header.offset_size()); + assert!(offset_chunks.remainder().is_empty()); // guaranteed by shallow validation Review Comment: We're in a method that returns `Result`... why not use it instead of having to worry whether shallow validation covered the corner case? ########## parquet-variant/src/variant/list.rs: ########## @@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> { /// [validation]: Self#Validation pub fn with_full_validation(mut self) -> Result<Self, ArrowError> { if !self.validated { + /* + (1) the associated variant metadata is [valid] (*) + (2) all other offsets are in-bounds (*) + (3) all offsets are monotonically increasing (*) + (4) all values are (recursively) valid variant objects (*) + */ + + // (1) // Validate the metadata dictionary first, if not already validated, because we pass it // by value to all the children (who would otherwise re-validate it repeatedly). self.metadata = self.metadata.with_full_validation()?; - // Iterate over all string keys in this dictionary in order to prove that the offset - // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + // (2), (4) + // if we check all values are valid by using offset lookups, + // we know all offsets are in bounds if correct + + // note how we do this once! + let byte_range = self.header.first_offset_byte()..self.first_value_byte; + let offset_bytes = slice_from_slice(self.value, byte_range)?; + + let offset_chunks = offset_bytes.chunks_exact(self.header.offset_size()); + assert!(offset_chunks.remainder().is_empty()); // guaranteed by shallow validation + + let offsets = offset_chunks + .map(|chunk| match self.header.offset_size { + OffsetSizeBytes::One => chunk[0] as usize, + OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize, + OffsetSizeBytes::Three => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize + } + OffsetSizeBytes::Four => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize + } + }) + .collect::<Vec<_>>(); + + // (3) + let monotonic_offsets = offsets.is_sorted_by(|a, b| a < b); + if !monotonic_offsets { + return Err(ArrowError::InvalidArgumentError( + "offsets are not monotonically increasing".to_string(), + )); + } + + // (4) + + let value_buffer = &self.value[self.first_value_byte..]; + + for i in 0..offsets.len() - 1 { + let start_offset = offsets[i]; + let end_offset = offsets[i + 1]; Review Comment: consider doing: ```suggestion for i in 1..offsets.len() { let start_offset = offsets[i - 1]; let end_offset = offsets[i]; ``` ########## parquet-variant/src/variant/list.rs: ########## @@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> { /// [validation]: Self#Validation pub fn with_full_validation(mut self) -> Result<Self, ArrowError> { if !self.validated { + /* + (1) the associated variant metadata is [valid] (*) + (2) all other offsets are in-bounds (*) + (3) all offsets are monotonically increasing (*) + (4) all values are (recursively) valid variant objects (*) + */ + + // (1) // Validate the metadata dictionary first, if not already validated, because we pass it // by value to all the children (who would otherwise re-validate it repeatedly). self.metadata = self.metadata.with_full_validation()?; - // Iterate over all string keys in this dictionary in order to prove that the offset - // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + // (2), (4) + // if we check all values are valid by using offset lookups, + // we know all offsets are in bounds if correct + + // note how we do this once! + let byte_range = self.header.first_offset_byte()..self.first_value_byte; + let offset_bytes = slice_from_slice(self.value, byte_range)?; + + let offset_chunks = offset_bytes.chunks_exact(self.header.offset_size()); + assert!(offset_chunks.remainder().is_empty()); // guaranteed by shallow validation + + let offsets = offset_chunks + .map(|chunk| match self.header.offset_size { + OffsetSizeBytes::One => chunk[0] as usize, + OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize, + OffsetSizeBytes::Three => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize + } + OffsetSizeBytes::Four => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize Review Comment: Is this faster and/or simpler than try_into from slice to array? ########## parquet-variant/src/variant/metadata.rs: ########## @@ -230,7 +227,98 @@ impl<'m> VariantMetadata<'m> { if !self.validated { // Iterate over all string keys in this dictionary in order to prove that the offset // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + + /* + As scovich pointed out, here are the things full validation must do: + + (1) all other offsets are in-bounds (*) + (2) all offsets are monotonically increasing (*) + (3) all values are valid utf-8 (*) + + I propose we add another validation check + (4) if sorted dictionary, check if dictionary fields are sorted + + Doing this check will help us in objects with sorted dictionaries, + since we can guarantee sortedness by checking if field ids are increasing. + */ + + // (1) (2) + // Since shallow validation already computes the first and last offset + // if we guarantee monotonicity for all offsets, then we know they are all in-bounds + + // notice how we do this ceremony only once! + let offset_byte_range = self.header.first_offset_byte()..self.first_value_byte; + let offset_bytes = slice_from_slice(self.bytes, offset_byte_range)?; + + let offsets = offset_bytes + .chunks_exact(self.header.offset_size()) + .map(|chunk| { + // at this point, we know for a _fact_ that chunk will have `offset_size` bytes + + match self.header.offset_size { + OffsetSizeBytes::One => chunk[0].into(), + OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]).into(), + OffsetSizeBytes::Three => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize + } + OffsetSizeBytes::Four => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize + } + } + }) + .collect::<Vec<usize>>(); + + let offsets_monotonically_increasing = offsets.is_sorted_by(|a, b| a < b); + + if !offsets_monotonically_increasing { + return Err(ArrowError::InvalidArgumentError( + "offsets not monotonically increasing".to_string(), + )); + } + + // (3) + // We don't take advantage of the values being packed side by side. + // For every value, we rerequest the entire value buffer and then slice into _just_ that value + // and parse into a str + + // This looks like a great place to do vectorized utf8 validation + // plus, all subsequent attempts at parsing out values as a &str can be done using + // String::from_utf8_unchecked + + let value_bytes = slice_from_slice(self.bytes, self.first_value_byte..)?; + let value_str = simdutf8::basic::from_utf8(value_bytes) + .map_err(|e| ArrowError::InvalidArgumentError(format!("{e:?}")))?; + + // (4) + // if the metadata header marked this variant as having a sorted dictionary, + // we must check whether the fields are actually sorted + + if self.header.is_sorted { + let mut prev_field_name = None; + let mut is_sorted = true; + + for i in 0..offsets.len() - 1 { + if !is_sorted { + return Err(ArrowError::InvalidArgumentError( + "variant marked as having sorted dictionary but is unsorted" + .to_string(), + )); + } + + let offset_range = offsets[i]..offsets[i + 1]; + + let field_name = value_str + .get(offset_range) + .ok_or_else(|| overflow_error("overflowed"))?; + + if let Some(prev_field_name) = prev_field_name { + is_sorted = is_sorted && prev_field_name < field_name; + } Review Comment: by deferring the check to the next iteration, we risk missing the case where the last pair breaks ordering? Combined with the above, a better control flow might be (again, in a helper function so we can use `?`): ```rust let Some(prev_offset) = offset_chunks.next().transpose()? else { return Ok(()); }; let field_names = offsets.scan(prev_offset, |prev_offset, offset| { let offset = offset?; if *prev_offset >= offset { return Err(... offset not monotonic ...); } let offset_range = prev_offset..offset; *prev_offset = offset; value_str .get(offset_range) .ok_or_else(|| overflow_error("overflowed")) }; let Some(mut prev_field_name) = field_names.next().transpose()? else { return Ok(()); // empty dictionary, nothing to validate }; for field_name in field_names { let field_name = field_name?; if self.header.is_sorted && prev_field_name >= field_name { return Err(... not sorted or has a duplicate key ...); } prev_field_name = field_name; } ``` ########## parquet-variant/src/variant/list.rs: ########## @@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> { /// [validation]: Self#Validation pub fn with_full_validation(mut self) -> Result<Self, ArrowError> { if !self.validated { + /* + (1) the associated variant metadata is [valid] (*) + (2) all other offsets are in-bounds (*) + (3) all offsets are monotonically increasing (*) + (4) all values are (recursively) valid variant objects (*) + */ + + // (1) // Validate the metadata dictionary first, if not already validated, because we pass it // by value to all the children (who would otherwise re-validate it repeatedly). self.metadata = self.metadata.with_full_validation()?; - // Iterate over all string keys in this dictionary in order to prove that the offset - // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + // (2), (4) + // if we check all values are valid by using offset lookups, + // we know all offsets are in bounds if correct + + // note how we do this once! + let byte_range = self.header.first_offset_byte()..self.first_value_byte; + let offset_bytes = slice_from_slice(self.value, byte_range)?; + + let offset_chunks = offset_bytes.chunks_exact(self.header.offset_size()); + assert!(offset_chunks.remainder().is_empty()); // guaranteed by shallow validation + + let offsets = offset_chunks + .map(|chunk| match self.header.offset_size { + OffsetSizeBytes::One => chunk[0] as usize, + OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize, + OffsetSizeBytes::Three => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize + } + OffsetSizeBytes::Four => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize + } + }) + .collect::<Vec<_>>(); + + // (3) + let monotonic_offsets = offsets.is_sorted_by(|a, b| a < b); Review Comment: Update: The code below uses the materialized `offsets`. But for large objects, wouldn't it be cheaper (and simpler) to combine the checks instead of materializing the offset array? Most likely inside a helper method so the code can leverage `?`: ```rust let mut offsets = offset_chunks.map(|chunk| match chunk { ... }); let Some(mut prev_offset) = offset_chunks.next().transpose()? else { return Ok(()); }; for offset in offsets { let offset = offset?; if prev_offset >= offset { return Err(... not monotonic ...); } let value_bytes = slice_from_slice(value_buffer, prev_offset..offset)?; prev_offset = offset; let _ = Variant::try_new_with_metadata(self.metadata, value_bytes)?; }; Ok(()) } ``` ########## parquet-variant/src/variant/list.rs: ########## @@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> { /// [validation]: Self#Validation pub fn with_full_validation(mut self) -> Result<Self, ArrowError> { if !self.validated { + /* + (1) the associated variant metadata is [valid] (*) + (2) all other offsets are in-bounds (*) + (3) all offsets are monotonically increasing (*) + (4) all values are (recursively) valid variant objects (*) + */ + + // (1) // Validate the metadata dictionary first, if not already validated, because we pass it // by value to all the children (who would otherwise re-validate it repeatedly). self.metadata = self.metadata.with_full_validation()?; - // Iterate over all string keys in this dictionary in order to prove that the offset - // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + // (2), (4) + // if we check all values are valid by using offset lookups, + // we know all offsets are in bounds if correct + + // note how we do this once! + let byte_range = self.header.first_offset_byte()..self.first_value_byte; + let offset_bytes = slice_from_slice(self.value, byte_range)?; + + let offset_chunks = offset_bytes.chunks_exact(self.header.offset_size()); + assert!(offset_chunks.remainder().is_empty()); // guaranteed by shallow validation + + let offsets = offset_chunks + .map(|chunk| match self.header.offset_size { + OffsetSizeBytes::One => chunk[0] as usize, + OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize, + OffsetSizeBytes::Three => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize + } + OffsetSizeBytes::Four => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize + } + }) + .collect::<Vec<_>>(); + + // (3) + let monotonic_offsets = offsets.is_sorted_by(|a, b| a < b); Review Comment: Why not [`Iterator::is_sorted_by`](https://doc.rust-lang.org/std/iter/trait.Iterator.html?search=_sorted#method.is_sorted_by)? ```rust let offsets_monotonically_increasing = offset_chunks .map(...) .is_sorted_by(...); ``` (again below) ########## parquet-variant/src/variant/list.rs: ########## @@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> { /// [validation]: Self#Validation pub fn with_full_validation(mut self) -> Result<Self, ArrowError> { if !self.validated { + /* + (1) the associated variant metadata is [valid] (*) + (2) all other offsets are in-bounds (*) + (3) all offsets are monotonically increasing (*) + (4) all values are (recursively) valid variant objects (*) + */ + + // (1) // Validate the metadata dictionary first, if not already validated, because we pass it // by value to all the children (who would otherwise re-validate it repeatedly). self.metadata = self.metadata.with_full_validation()?; - // Iterate over all string keys in this dictionary in order to prove that the offset - // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + // (2), (4) + // if we check all values are valid by using offset lookups, + // we know all offsets are in bounds if correct + + // note how we do this once! + let byte_range = self.header.first_offset_byte()..self.first_value_byte; + let offset_bytes = slice_from_slice(self.value, byte_range)?; + + let offset_chunks = offset_bytes.chunks_exact(self.header.offset_size()); + assert!(offset_chunks.remainder().is_empty()); // guaranteed by shallow validation + + let offsets = offset_chunks + .map(|chunk| match self.header.offset_size { + OffsetSizeBytes::One => chunk[0] as usize, + OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize, + OffsetSizeBytes::Three => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize + } + OffsetSizeBytes::Four => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize + } + }) + .collect::<Vec<_>>(); + + // (3) + let monotonic_offsets = offsets.is_sorted_by(|a, b| a < b); + if !monotonic_offsets { + return Err(ArrowError::InvalidArgumentError( + "offsets are not monotonically increasing".to_string(), + )); + } + + // (4) + + let value_buffer = &self.value[self.first_value_byte..]; Review Comment: again, why risk panic? ```suggestion let value_buffer = slice_from_slice(self.value, self.first_value_byte..)?; ``` ########## parquet-variant/src/variant/metadata.rs: ########## @@ -230,7 +227,98 @@ impl<'m> VariantMetadata<'m> { if !self.validated { // Iterate over all string keys in this dictionary in order to prove that the offset // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + + /* + As scovich pointed out, here are the things full validation must do: + + (1) all other offsets are in-bounds (*) + (2) all offsets are monotonically increasing (*) + (3) all values are valid utf-8 (*) + + I propose we add another validation check + (4) if sorted dictionary, check if dictionary fields are sorted Review Comment: 👍 ########## parquet-variant/src/variant/metadata.rs: ########## @@ -230,7 +227,98 @@ impl<'m> VariantMetadata<'m> { if !self.validated { // Iterate over all string keys in this dictionary in order to prove that the offset // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + + /* + As scovich pointed out, here are the things full validation must do: Review Comment: aside: flattering, but prob not super helpful to reference random github handles in the code? ########## parquet-variant/src/variant/list.rs: ########## @@ -205,13 +204,62 @@ impl<'m, 'v> VariantList<'m, 'v> { /// [validation]: Self#Validation pub fn with_full_validation(mut self) -> Result<Self, ArrowError> { if !self.validated { + /* + (1) the associated variant metadata is [valid] (*) + (2) all other offsets are in-bounds (*) + (3) all offsets are monotonically increasing (*) + (4) all values are (recursively) valid variant objects (*) + */ + + // (1) // Validate the metadata dictionary first, if not already validated, because we pass it // by value to all the children (who would otherwise re-validate it repeatedly). self.metadata = self.metadata.with_full_validation()?; - // Iterate over all string keys in this dictionary in order to prove that the offset - // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + // (2), (4) + // if we check all values are valid by using offset lookups, + // we know all offsets are in bounds if correct + + // note how we do this once! + let byte_range = self.header.first_offset_byte()..self.first_value_byte; + let offset_bytes = slice_from_slice(self.value, byte_range)?; + + let offset_chunks = offset_bytes.chunks_exact(self.header.offset_size()); + assert!(offset_chunks.remainder().is_empty()); // guaranteed by shallow validation + + let offsets = offset_chunks + .map(|chunk| match self.header.offset_size { + OffsetSizeBytes::One => chunk[0] as usize, + OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize, + OffsetSizeBytes::Three => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize + } + OffsetSizeBytes::Four => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize + } + }) + .collect::<Vec<_>>(); + + // (3) + let monotonic_offsets = offsets.is_sorted_by(|a, b| a < b); + if !monotonic_offsets { + return Err(ArrowError::InvalidArgumentError( + "offsets are not monotonically increasing".to_string(), + )); + } + + // (4) + + let value_buffer = &self.value[self.first_value_byte..]; + + for i in 0..offsets.len() - 1 { + let start_offset = offsets[i]; + let end_offset = offsets[i + 1]; + + let value_bytes = slice_from_slice(value_buffer, start_offset..end_offset)?; + Variant::try_new_with_metadata(self.metadata, value_bytes)?; Review Comment: At some point, copying that fat metadata over and over will be a meaningful contributor to the total cost. Are we _sure_ it's ~free? If not sure, should we somehow allow creating variant instances with metadata references, at least internally for validation purposes? ########## parquet-variant/src/variant/object.rs: ########## @@ -206,13 +205,118 @@ impl<'m, 'v> VariantObject<'m, 'v> { /// [validation]: Self#Validation pub fn with_full_validation(mut self) -> Result<Self, ArrowError> { if !self.validated { + /* + (1) the associated variant metadata is [valid] (*) + (2) all field ids are valid metadata dictionary entries (*) + (3) field ids are lexically ordered according by their corresponding string values (*) + (4) all field offsets are in bounds (*) + (5) all field values are (recursively) _valid_ variant values (*) + + */ + + // (1) // Validate the metadata dictionary first, if not already validated, because we pass it // by value to all the children (who would otherwise re-validate it repeatedly). self.metadata = self.metadata.with_full_validation()?; - // Iterate over all string keys in this dictionary in order to prove that the offset - // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + // (2) + // Field ids serve as indexes into the metadata buffer. + // As long as we guarantee the largest field id is < dictionary size, + // we can guarantee all field ids are valid metadata dictionaries + + // (2), (3) + let byte_range = self.header.field_ids_start_byte()..self.first_field_offset_byte; + let field_id_bytes = slice_from_slice(self.value, byte_range)?; + // let field_id = self.header.field_id_size.unpack_usize(field_id_bytes, i)?; + + let field_id_chunks = field_id_bytes.chunks_exact(self.header.field_id_size()); + assert!(field_id_chunks.remainder().is_empty()); // guaranteed to be none + + let field_ids = field_id_chunks + .map(|chunk| match self.header.field_id_size { + OffsetSizeBytes::One => chunk[0] as usize, + OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]) as usize, + OffsetSizeBytes::Three => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize + } + OffsetSizeBytes::Four => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize + } + }) + .collect::<Vec<_>>(); Review Comment: As above, I expect it will be straightforward to avoid materializing these iterators (again below) ########## parquet-variant/src/variant/metadata.rs: ########## @@ -230,7 +227,98 @@ impl<'m> VariantMetadata<'m> { if !self.validated { // Iterate over all string keys in this dictionary in order to prove that the offset // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + + /* + As scovich pointed out, here are the things full validation must do: + + (1) all other offsets are in-bounds (*) + (2) all offsets are monotonically increasing (*) + (3) all values are valid utf-8 (*) + + I propose we add another validation check + (4) if sorted dictionary, check if dictionary fields are sorted + + Doing this check will help us in objects with sorted dictionaries, + since we can guarantee sortedness by checking if field ids are increasing. + */ + + // (1) (2) + // Since shallow validation already computes the first and last offset + // if we guarantee monotonicity for all offsets, then we know they are all in-bounds + + // notice how we do this ceremony only once! + let offset_byte_range = self.header.first_offset_byte()..self.first_value_byte; + let offset_bytes = slice_from_slice(self.bytes, offset_byte_range)?; + + let offsets = offset_bytes + .chunks_exact(self.header.offset_size()) + .map(|chunk| { + // at this point, we know for a _fact_ that chunk will have `offset_size` bytes + + match self.header.offset_size { + OffsetSizeBytes::One => chunk[0].into(), + OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]).into(), + OffsetSizeBytes::Three => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize + } + OffsetSizeBytes::Four => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize + } + } + }) + .collect::<Vec<usize>>(); + + let offsets_monotonically_increasing = offsets.is_sorted_by(|a, b| a < b); Review Comment: As above, no need to materialize the iterator if we combine the checks into a single loop? (but also see comment below) ########## parquet-variant/src/variant/metadata.rs: ########## @@ -230,7 +227,98 @@ impl<'m> VariantMetadata<'m> { if !self.validated { // Iterate over all string keys in this dictionary in order to prove that the offset // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. - validate_fallible_iterator(self.iter_try())?; + + /* + As scovich pointed out, here are the things full validation must do: + + (1) all other offsets are in-bounds (*) + (2) all offsets are monotonically increasing (*) + (3) all values are valid utf-8 (*) + + I propose we add another validation check + (4) if sorted dictionary, check if dictionary fields are sorted + + Doing this check will help us in objects with sorted dictionaries, + since we can guarantee sortedness by checking if field ids are increasing. + */ + + // (1) (2) + // Since shallow validation already computes the first and last offset + // if we guarantee monotonicity for all offsets, then we know they are all in-bounds + + // notice how we do this ceremony only once! + let offset_byte_range = self.header.first_offset_byte()..self.first_value_byte; + let offset_bytes = slice_from_slice(self.bytes, offset_byte_range)?; + + let offsets = offset_bytes + .chunks_exact(self.header.offset_size()) + .map(|chunk| { + // at this point, we know for a _fact_ that chunk will have `offset_size` bytes + + match self.header.offset_size { + OffsetSizeBytes::One => chunk[0].into(), + OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]]).into(), + OffsetSizeBytes::Three => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize + } + OffsetSizeBytes::Four => { + u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as usize + } + } + }) + .collect::<Vec<usize>>(); + + let offsets_monotonically_increasing = offsets.is_sorted_by(|a, b| a < b); + + if !offsets_monotonically_increasing { + return Err(ArrowError::InvalidArgumentError( + "offsets not monotonically increasing".to_string(), + )); + } + + // (3) + // We don't take advantage of the values being packed side by side. + // For every value, we rerequest the entire value buffer and then slice into _just_ that value + // and parse into a str + + // This looks like a great place to do vectorized utf8 validation + // plus, all subsequent attempts at parsing out values as a &str can be done using + // String::from_utf8_unchecked + + let value_bytes = slice_from_slice(self.bytes, self.first_value_byte..)?; + let value_str = simdutf8::basic::from_utf8(value_bytes) + .map_err(|e| ArrowError::InvalidArgumentError(format!("{e:?}")))?; + + // (4) + // if the metadata header marked this variant as having a sorted dictionary, + // we must check whether the fields are actually sorted + + if self.header.is_sorted { + let mut prev_field_name = None; + let mut is_sorted = true; + + for i in 0..offsets.len() - 1 { Review Comment: as above, `1..offsets.len()` and adjust indexing below? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org