scovich commented on code in PR #7871: URL: https://github.com/apache/arrow-rs/pull/7871#discussion_r2186470644
########## parquet-variant/src/variant.rs: ########## @@ -407,16 +412,16 @@ impl<'m, 'v> Variant<'m, 'v> { /// Variant leaf values are always valid by construction, but [objects] and [arrays] can be /// constructed in unvalidated (and potentially invalid) state. /// - /// If [`Self::is_validated`] is true, validation is a no-op. Otherwise, the cost is `O(m + v)` + /// If [`Self::is_fully_validated`] is true, validation is a no-op. Otherwise, the cost is `O(m + v)` /// where `m` and `v` are the sizes of metadata and value buffers, respectively. /// /// [objects]: VariantObject#Validation /// [arrays]: VariantList#Validation - pub fn validate(self) -> Result<Self, ArrowError> { + pub fn with_deep_validation(self) -> Result<Self, ArrowError> { Review Comment: Right now we have "full" and "deep" as ~synonyms? Should we pick just one? ```suggestion pub fn with_full_validation(self) -> Result<Self, ArrowError> { ``` (IMO `is_deeply_validated` doesn't sound quite as nice as `is_fully_validated`) ########## parquet-variant/src/variant/object.rs: ########## @@ -281,23 +286,23 @@ impl<'m, 'v> VariantObject<'m, 'v> { /// Returns an iterator of (name, value) pairs over the fields of this object. pub fn iter(&self) -> impl Iterator<Item = (&'m str, Variant<'m, 'v>)> + '_ { - self.iter_try_impl() + self.try_iter_with_shallow_validation() .map(|result| result.expect("Invalid variant object field value")) } /// Fallible iteration over the fields of this object. - pub fn iter_try( + pub fn try_iter( &self, ) -> impl Iterator<Item = Result<(&'m str, Variant<'m, 'v>), ArrowError>> + '_ { - self.iter_try_impl().map(|result| { + self.try_iter_with_shallow_validation().map(|result| { let (name, value) = result?; - Ok((name, value.validate()?)) + Ok((name, value.with_deep_validation()?)) }) } // Fallible iteration over the fields of this object that performs only shallow (constant-cost) // validation of field values. - fn iter_try_impl( + fn try_iter_with_shallow_validation( &self, ) -> impl Iterator<Item = Result<(&'m str, Variant<'m, 'v>), ArrowError>> + '_ { (0..self.num_elements).map(move |i| Ok((self.try_field_name(i)?, self.try_field(i)?))) Review Comment: A missed fix: ```suggestion (0..self.num_elements).map(move |i| { let field = self.try_field_with_shallow_validation(i)?; Ok((self.try_field_name(i)?, field)) }) ``` ########## parquet-variant/src/variant/list.rs: ########## @@ -232,52 +232,57 @@ impl<'m, 'v> VariantList<'m, 'v> { /// [invalid]: Self#Validation pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> { (index < self.num_elements).then(|| { - self.try_get_impl(index) - .and_then(Variant::validate) + self.try_get_with_shallow_validation(index) + .and_then(Variant::with_deep_validation) .expect("Invalid variant array element") }) } /// Fallible version of `get`. Returns element by index, capturing validation errors pub fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> { - self.try_get_impl(index)?.validate() + self.try_get_with_shallow_validation(index)? + .with_deep_validation() } - /// Fallible iteration over the elements of this list. - pub fn iter_try(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { - self.iter_try_impl().map(|result| result?.validate()) + // Fallible version of `get`, performing only basic (constant-time) validation. + fn try_get_with_shallow_validation(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> { + // Fetch the value bytes between the two offsets for this index, from the value array region + // of the byte buffer + let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?; + let value_bytes = + slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?; + + Variant::try_new_with_metadata_and_shallow_validation(self.metadata, value_bytes) } - // Fallible iteration that only performs basic (constant-time) validation. - fn iter_try_impl(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { - (0..self.len()).map(move |i| self.try_get_impl(i)) + /// Fallible iteration over the elements of this list. + pub fn try_iter(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { Review Comment: I actually had `try_iter` at first, but I changed because the name suggests something that returns a Result of Iterator, rather than an Iterator of Result. No strong opinions tho -- if people like `try_iter` better, let's go for it. ########## parquet-variant/src/variant.rs: ########## @@ -323,18 +325,19 @@ impl<'m, 'v> Variant<'m, 'v> { metadata: VariantMetadata<'m>, value: &'v [u8], ) -> Result<Self, ArrowError> { - Self::try_new_with_metadata_impl(metadata, value)?.validate() + Self::try_new_with_metadata_and_shallow_validation(metadata, value)?.with_deep_validation() } /// Similar to [`Self::try_new_with_metadata`], but [unvalidated]. /// /// [unvalidated]: Self#Validation pub fn new_with_metadata(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self { - Self::try_new_with_metadata_impl(metadata, value).expect("Invalid variant") + Self::try_new_with_metadata_and_shallow_validation(metadata, value) + .expect("Invalid variant") } // The actual constructor, which only performs shallow (constant-time) validation. - fn try_new_with_metadata_impl( + fn try_new_with_metadata_and_shallow_validation( Review Comment: Good thing this isn't part of the public API... quite a mouthful! ########## parquet-variant/src/variant/list.rs: ########## @@ -232,52 +232,57 @@ impl<'m, 'v> VariantList<'m, 'v> { /// [invalid]: Self#Validation pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> { (index < self.num_elements).then(|| { - self.try_get_impl(index) - .and_then(Variant::validate) + self.try_get_with_shallow_validation(index) + .and_then(Variant::with_deep_validation) .expect("Invalid variant array element") }) } /// Fallible version of `get`. Returns element by index, capturing validation errors pub fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> { - self.try_get_impl(index)?.validate() + self.try_get_with_shallow_validation(index)? + .with_deep_validation() } - /// Fallible iteration over the elements of this list. - pub fn iter_try(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { - self.iter_try_impl().map(|result| result?.validate()) + // Fallible version of `get`, performing only basic (constant-time) validation. + fn try_get_with_shallow_validation(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> { + // Fetch the value bytes between the two offsets for this index, from the value array region + // of the byte buffer + let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?; + let value_bytes = + slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?; + + Variant::try_new_with_metadata_and_shallow_validation(self.metadata, value_bytes) } - // Fallible iteration that only performs basic (constant-time) validation. - fn iter_try_impl(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { - (0..self.len()).map(move |i| self.try_get_impl(i)) + /// Fallible iteration over the elements of this list. + pub fn try_iter(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { + self.try_iter_with_shallow_validation() + .map(|result| result?.with_deep_validation()) } /// Iterates over the values of this list. When working with [unvalidated] input, consider - /// [`Self::iter_try`] to avoid panics due to invalid data. + /// [`Self::try_iter`] to avoid panics due to invalid data. /// /// [unvalidated]: Self#Validation pub fn iter(&self) -> impl Iterator<Item = Variant<'m, 'v>> + '_ { - self.iter_try_impl() + self.try_iter_with_shallow_validation() .map(|result| result.expect("Invalid variant list entry")) } + // Fallible iteration that only performs basic (constant-time) validation. + fn try_iter_with_shallow_validation( + &self, + ) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { + (0..self.len()).map(move |i| self.try_get_with_shallow_validation(i)) + } + // Attempts to retrieve the ith offset from the offset array region of the byte buffer. fn get_offset(&self, index: usize) -> Result<usize, ArrowError> { let byte_range = self.header.first_offset_byte()..self.first_value_byte; let offset_bytes = slice_from_slice(self.value, byte_range)?; self.header.offset_size.unpack_usize(offset_bytes, index) } - - // Fallible version of `get`, performing only basic (constant-time) validation. - fn try_get_impl(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> { Review Comment: note: moving and changing code in the same commit makes review a _lot_ more difficult, and refactor + bug fix in the same commit suffers a similar issue. I know this code inside out and it's still hard to be sure I'm catching everything -- imagine the burden on a reviewer who isn't as familiar with the code? In the future, you might consider curating a stack of three commits for changes like this: * code movement only (big, but trivial to review because it doesn't change anything) * function renames only (big, but purely mechanical and thus easy to review) * the actual fix (tiny, and therefore easy to review) For big changes, some of those commits might even be their own PR... but a commit stack is often good enough, if called out so reviewers know to look at the commits in isolation) ########## parquet-variant/src/variant/list.rs: ########## @@ -232,52 +232,57 @@ impl<'m, 'v> VariantList<'m, 'v> { /// [invalid]: Self#Validation pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> { (index < self.num_elements).then(|| { - self.try_get_impl(index) - .and_then(Variant::validate) + self.try_get_with_shallow_validation(index) + .and_then(Variant::with_deep_validation) .expect("Invalid variant array element") }) } /// Fallible version of `get`. Returns element by index, capturing validation errors pub fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> { - self.try_get_impl(index)?.validate() + self.try_get_with_shallow_validation(index)? + .with_deep_validation() } - /// Fallible iteration over the elements of this list. - pub fn iter_try(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { - self.iter_try_impl().map(|result| result?.validate()) + // Fallible version of `get`, performing only basic (constant-time) validation. + fn try_get_with_shallow_validation(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> { + // Fetch the value bytes between the two offsets for this index, from the value array region + // of the byte buffer + let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?; + let value_bytes = + slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?; + + Variant::try_new_with_metadata_and_shallow_validation(self.metadata, value_bytes) } - // Fallible iteration that only performs basic (constant-time) validation. - fn iter_try_impl(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { - (0..self.len()).map(move |i| self.try_get_impl(i)) + /// Fallible iteration over the elements of this list. + pub fn try_iter(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { + self.try_iter_with_shallow_validation() + .map(|result| result?.with_deep_validation()) } /// Iterates over the values of this list. When working with [unvalidated] input, consider - /// [`Self::iter_try`] to avoid panics due to invalid data. + /// [`Self::try_iter`] to avoid panics due to invalid data. /// /// [unvalidated]: Self#Validation pub fn iter(&self) -> impl Iterator<Item = Variant<'m, 'v>> + '_ { - self.iter_try_impl() + self.try_iter_with_shallow_validation() .map(|result| result.expect("Invalid variant list entry")) } + // Fallible iteration that only performs basic (constant-time) validation. + fn try_iter_with_shallow_validation( + &self, + ) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { + (0..self.len()).map(move |i| self.try_get_with_shallow_validation(i)) Review Comment: First fix -- good catch! ########## parquet-variant/src/variant.rs: ########## @@ -382,21 +385,23 @@ impl<'m, 'v> Variant<'m, 'v> { VariantBasicType::ShortString => { Variant::ShortString(decoder::decode_short_string(value_metadata, value_data)?) } - VariantBasicType::Object => { - Variant::Object(VariantObject::try_new_impl(metadata, value)?) - } - VariantBasicType::Array => Variant::List(VariantList::try_new_impl(metadata, value)?), + VariantBasicType::Object => Variant::Object( + VariantObject::try_new_with_shallow_validation(metadata, value)?, + ), + VariantBasicType::Array => Variant::List(VariantList::try_new_with_shallow_validation( + metadata, value, Review Comment: aside: I don't think fmt usually likes multiple args on a single line? (I sure sometimes _wish_ it did) ########## parquet-variant/src/variant/list.rs: ########## @@ -232,52 +232,57 @@ impl<'m, 'v> VariantList<'m, 'v> { /// [invalid]: Self#Validation pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> { (index < self.num_elements).then(|| { - self.try_get_impl(index) - .and_then(Variant::validate) + self.try_get_with_shallow_validation(index) + .and_then(Variant::with_deep_validation) .expect("Invalid variant array element") }) } /// Fallible version of `get`. Returns element by index, capturing validation errors pub fn try_get(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> { - self.try_get_impl(index)?.validate() + self.try_get_with_shallow_validation(index)? + .with_deep_validation() } - /// Fallible iteration over the elements of this list. - pub fn iter_try(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { - self.iter_try_impl().map(|result| result?.validate()) + // Fallible version of `get`, performing only basic (constant-time) validation. + fn try_get_with_shallow_validation(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> { + // Fetch the value bytes between the two offsets for this index, from the value array region + // of the byte buffer + let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?; + let value_bytes = + slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?; + + Variant::try_new_with_metadata_and_shallow_validation(self.metadata, value_bytes) } - // Fallible iteration that only performs basic (constant-time) validation. - fn iter_try_impl(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { - (0..self.len()).map(move |i| self.try_get_impl(i)) + /// Fallible iteration over the elements of this list. + pub fn try_iter(&self) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { + self.try_iter_with_shallow_validation() + .map(|result| result?.with_deep_validation()) } /// Iterates over the values of this list. When working with [unvalidated] input, consider - /// [`Self::iter_try`] to avoid panics due to invalid data. + /// [`Self::try_iter`] to avoid panics due to invalid data. /// /// [unvalidated]: Self#Validation pub fn iter(&self) -> impl Iterator<Item = Variant<'m, 'v>> + '_ { - self.iter_try_impl() + self.try_iter_with_shallow_validation() .map(|result| result.expect("Invalid variant list entry")) } + // Fallible iteration that only performs basic (constant-time) validation. + fn try_iter_with_shallow_validation( + &self, + ) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ { + (0..self.len()).map(move |i| self.try_get_with_shallow_validation(i)) + } + // Attempts to retrieve the ith offset from the offset array region of the byte buffer. fn get_offset(&self, index: usize) -> Result<usize, ArrowError> { let byte_range = self.header.first_offset_byte()..self.first_value_byte; let offset_bytes = slice_from_slice(self.value, byte_range)?; self.header.offset_size.unpack_usize(offset_bytes, index) } - - // Fallible version of `get`, performing only basic (constant-time) validation. - fn try_get_impl(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> { Review Comment: Plus, in my experience, it benefits the author as well. I can't count the number of times I've discovered unnecessarily ugliness, incomplete refactorings, test gaps, and even bugs (by inspection) when I did the work to tease apart a big complicated change into a stack of commits. A bit more work up front, but higher quality code with review stamps landing faster (sometimes a _lot_ faster). ########## parquet-variant/src/variant/list.rs: ########## @@ -136,18 +136,18 @@ impl<'m, 'v> VariantList<'m, 'v> { /// This constructor verifies that `value` points to a valid variant array value. In particular, /// that all offsets are in-bounds and point to valid (recursively validated) objects. pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> { - Self::try_new_impl(metadata, value)?.validate() + Self::try_new_with_shallow_validation(metadata, value)?.with_deep_validation() } pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self { - Self::try_new_impl(metadata, value).expect("Invalid variant list value") + Self::try_new_with_shallow_validation(metadata, value).expect("Invalid variant list value") Review Comment: Another long line fmt would split? ########## parquet-variant/src/variant.rs: ########## @@ -382,21 +385,23 @@ impl<'m, 'v> Variant<'m, 'v> { VariantBasicType::ShortString => { Variant::ShortString(decoder::decode_short_string(value_metadata, value_data)?) } - VariantBasicType::Object => { - Variant::Object(VariantObject::try_new_impl(metadata, value)?) - } - VariantBasicType::Array => Variant::List(VariantList::try_new_impl(metadata, value)?), + VariantBasicType::Object => Variant::Object( + VariantObject::try_new_with_shallow_validation(metadata, value)?, + ), + VariantBasicType::Array => Variant::List(VariantList::try_new_with_shallow_validation( + metadata, value, + )?), }; Ok(new_self) } /// True if this variant instance has already been [validated]. /// /// [validated]: Self#Validation - pub fn is_validated(&self) -> bool { + pub fn is_fully_validated(&self) -> bool { match self { Variant::List(list) => list.is_validated(), Review Comment: missed a rename? ```suggestion Variant::List(list) => list.is_fully_validated(), ``` ########## parquet-variant/src/variant/object.rs: ########## @@ -236,20 +236,25 @@ impl<'m, 'v> VariantObject<'m, 'v> { /// field IDs). The latter can only happen when working with an unvalidated object produced by /// [`Self::new`]. pub fn field(&self, i: usize) -> Option<Variant<'m, 'v>> { - (i < self.len()).then(|| self.try_field_impl(i).expect("Invalid object field value")) + (i < self.len()).then(|| { + self.try_field_with_shallow_validation(i) + .expect("Invalid object field value") + }) } /// Fallible version of `field`. Returns field value by index, capturing validation errors pub fn try_field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> { - self.try_field_impl(i)?.validate() + self.try_field_with_shallow_validation(i)? + .with_deep_validation() } // Attempts to retrieve the ith field value from the value region of the byte buffer; it // performs only basic (constant-cost) validation. - fn try_field_impl(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> { + fn try_field_with_shallow_validation(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> { let value_bytes = slice_from_slice(self.value, self.first_value_byte..)?; let value_bytes = slice_from_slice(value_bytes, self.get_offset(i)?..)?; - Variant::try_new_with_metadata(self.metadata, value_bytes) + + Variant::try_new_with_metadata_and_shallow_validation(self.metadata, value_bytes) Review Comment: Good catch. This and the one in ListBuilder were definitely an oversight. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org