scovich commented on code in PR #7807:
URL: https://github.com/apache/arrow-rs/pull/7807#discussion_r2176146408
##########
parquet-variant/src/variant/object.rs:
##########
@@ -29,111 +29,196 @@ const NUM_HEADER_BYTES: usize = 1;
/// Header structure for [`VariantObject`]
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct VariantObjectHeader {
- field_offset_size: OffsetSizeBytes,
+ num_elements_size: OffsetSizeBytes,
field_id_size: OffsetSizeBytes,
- is_large: bool,
+ field_offset_size: OffsetSizeBytes,
}
impl VariantObjectHeader {
+ // Hide the ugly casting
+ const fn num_elements_size(&self) -> usize {
+ self.num_elements_size as _
+ }
+ const fn field_id_size(&self) -> usize {
+ self.field_id_size as _
+ }
+ const fn field_offset_size(&self) -> usize {
+ self.field_offset_size as _
+ }
+
+ // Avoid materializing this offset, since it's cheaply and safely
computable
+ const fn field_ids_start_byte(&self) -> usize {
+ NUM_HEADER_BYTES + self.num_elements_size()
+ }
+
pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
// Parse the header byte to get object parameters
let value_header = header_byte >> 2;
let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2
bits
let is_large = (value_header & 0x10) != 0; // 5th bit
-
+ let num_elements_size = match is_large {
+ true => OffsetSizeBytes::Four,
+ false => OffsetSizeBytes::One,
+ };
Ok(Self {
- field_offset_size:
OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
+ num_elements_size,
field_id_size: OffsetSizeBytes::try_new(field_id_size_minus_one)?,
- is_large,
+ field_offset_size:
OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
})
}
}
/// A [`Variant`] Object (struct with named fields).
+///
+/// See the [Variant spec] file for more information.
+///
+/// # Validation
+///
+/// Every instance of variant object is either _valid_ or _invalid_. depending
on whether the
+/// underlying bytes are a valid encoding of a variant object subtype (see
below).
+///
+/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully
(and recursively)
+/// _validated_. They always contain _valid_ data, and infallible accesses
such as iteration and
+/// indexing are panic-free. The validation cost is linear in the number of
underlying bytes.
+///
+/// Instances produced by [`Self::new`] are _unvalidated_ and so they may
contain either _valid_ or
+/// _invalid_ data. Infallible accesses such as iteration and indexing will
panic if the underlying
+/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`]
and [`Self::get`] are
+/// provided as panic-free alternatives. [`Self::validate`] can also be used
to _validate_ an
+/// _unvalidated_ instance, if desired.
+///
+/// _Unvalidated_ instances can be constructed in constant time. They can be
useful if the caller
+/// knows the underlying bytes were already validated previously, or if the
caller intends to
+/// perform a small number of (fallible) field accesses against a large object.
+///
+/// A _validated_ instance guarantees that:
+///
+/// - header byte is valid
+/// - num_elements is in bounds
+/// - field id array is in bounds
+/// - field offset array is in bounds
+/// - field value array is in bounds
+/// - all field ids are valid metadata dictionary entries (*)
+/// - field ids are lexically ordered according by their corresponding string
values (*)
+/// - all field offsets are in bounds (*)
+/// - all field values are (recursively) _valid_ variant values (*)
+/// - the associated variant metadata is [valid] (*)
+///
+/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation
checks (marked by `(*)`
+/// in the list above); it panics any of the other checks fails.
+///
+/// # Safety
+///
+/// Even an _invalid_ variant object instance is still _safe_ to use in the
Rust sense. Accessing it
+/// with infallible methods may cause panics but will never lead to undefined
behavior.
+///
+/// [valid]: VariantMetadata#Validation
+/// [Variant spec]:
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-object-basic_type2
#[derive(Clone, Debug, PartialEq)]
pub struct VariantObject<'m, 'v> {
pub metadata: VariantMetadata<'m>,
pub value: &'v [u8],
header: VariantObjectHeader,
num_elements: usize,
- field_ids_start_byte: usize,
- field_offsets_start_byte: usize,
- values_start_byte: usize,
+ first_field_offset_byte: usize,
+ first_value_byte: usize,
+ validated: bool,
}
impl<'m, 'v> VariantObject<'m, 'v> {
- /// Attempts to interpret `value` as a variant object value.
+ pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
+ Self::try_new_impl(metadata, value).expect("Invalid variant object")
+ }
+
+ /// Attempts to interpet `metadata` and `value` as a variant object.
///
/// # Validation
///
/// This constructor verifies that `value` points to a valid variant
object value. In
/// particular, that all field ids exist in `metadata`, and all offsets
are in-bounds and point
/// to valid objects.
- // TODO: How to make the validation non-recursive while still making
iterators safely infallible??
- // See https://github.com/apache/arrow-rs/issues/7711
pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) ->
Result<Self, ArrowError> {
+ let mut new_self = Self::try_new_impl(metadata, value)?.validate()?;
+ new_self.validated = true;
Review Comment:
Fixed
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]