alamb commented on code in PR #8021:
URL: https://github.com/apache/arrow-rs/pull/8021#discussion_r2257590658
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -135,36 +140,189 @@ impl VariantArray {
self.inner
}
+ /// Return the shredding state of this `VariantArray`
+ pub fn shredding_state(&self) -> &ShreddingState {
+ &self.shredding_state
+ }
+
/// Return the [`Variant`] instance stored at the given row
///
- /// Panics if the index is out of bounds.
+ /// Consistently with other Arrow arrays types, this API requires you to
+ /// check for nulls first using [`Self::is_valid`].
+ ///
+ /// # Panics
+ /// * if the index is out of bounds
+ /// * if the array value is null
+ ///
+ /// If this is a shredded variant but has no value at the shredded
location, it
+ /// will return [`Variant::Null`].
+ ///
+ ///
+ /// # Performance Note
+ ///
+ /// This is certainly not the most efficient way to access values in a
+ /// `VariantArray`, but it is useful for testing and debugging.
///
/// Note: Does not do deep validation of the [`Variant`], so it is up to
the
/// caller to ensure that the metadata and value were constructed
correctly.
pub fn value(&self, index: usize) -> Variant {
- let metadata = self.metadata_field().as_binary_view().value(index);
- let value = self.value_field().as_binary_view().value(index);
- Variant::new(metadata, value)
+ match &self.shredding_state {
+ ShreddingState::Unshredded { metadata, value } => {
+ Variant::new(metadata.value(index), value.value(index))
+ }
+ ShreddingState::FullyShredded {
+ metadata: _,
+ typed_value,
+ } => {
+ if typed_value.is_null(index) {
+ Variant::Null
+ } else {
+ typed_value_to_variant(typed_value, index)
+ }
+ }
+ ShreddingState::PartiallyShredded {
+ metadata,
+ value,
+ typed_value,
+ } => {
+ if typed_value.is_null(index) {
+ Variant::new(metadata.value(index), value.value(index))
+ } else {
+ typed_value_to_variant(typed_value, index)
+ }
+ }
+ }
}
- fn find_metadata_field(array: &StructArray) -> Option<ArrayRef> {
- array.column_by_name("metadata").cloned()
+ /// Return a reference to the metadata field of the [`StructArray`]
+ pub fn metadata_field(&self) -> &BinaryViewArray {
+ self.shredding_state.metadata_field()
}
- fn find_value_field(array: &StructArray) -> Option<ArrayRef> {
- array.column_by_name("value").cloned()
+ /// Return a reference to the value field of the `StructArray`
+ pub fn value_field(&self) -> Option<&BinaryViewArray> {
+ self.shredding_state.value_field()
}
- /// Return a reference to the metadata field of the [`StructArray`]
- pub fn metadata_field(&self) -> &ArrayRef {
- // spec says fields order is not guaranteed, so we search by name
- &self.metadata_ref
+ /// Return a reference to the typed_value field of the `StructArray`, if
present
+ pub fn typed_value_field(&self) -> Option<&ArrayRef> {
+ self.shredding_state.typed_value_field()
}
+}
- /// Return a reference to the value field of the `StructArray`
- pub fn value_field(&self) -> &ArrayRef {
- // spec says fields order is not guaranteed, so we search by name
- &self.value_ref
+/// Variant arrays can be shredded in one of three states, encoded here
+#[derive(Debug)]
+pub enum ShreddingState {
+ /// This variant has no typed_value field
+ Unshredded {
+ metadata: BinaryViewArray,
+ value: BinaryViewArray,
+ },
+ /// This variant has a typed_value field and no value field
+ /// meaning it is fully shredded (aka the value is stored in typed_value)
+ FullyShredded {
+ metadata: BinaryViewArray,
+ typed_value: ArrayRef,
+ },
+ /// This variant has both a value field and a typed_value field
+ /// meaning it is partially shredded: first the typed_value is used, and
+ /// if that is null, the value field is used.
+ PartiallyShredded {
+ metadata: BinaryViewArray,
+ value: BinaryViewArray,
+ typed_value: ArrayRef,
+ },
+}
+
+impl ShreddingState {
+ /// try to create a new `ShreddingState` from the given fields
+ pub fn try_new(
+ metadata: BinaryViewArray,
+ value: Option<BinaryViewArray>,
+ typed_value: Option<ArrayRef>,
+ ) -> Result<Self, ArrowError> {
+ match (metadata, value, typed_value) {
+ (metadata, Some(value), Some(typed_value)) =>
Ok(Self::PartiallyShredded {
+ metadata,
+ value,
+ typed_value,
+ }),
+ (metadata, Some(value), None) => Ok(Self::Unshredded { metadata,
value }),
+ (metadata, None, Some(typed_value)) => Ok(Self::FullyShredded {
+ metadata,
+ typed_value,
+ }),
+ (_metadata_field, None, None) =>
Err(ArrowError::InvalidArgumentError(String::from(
+ "VariantArray has neither value nor typed_value field",
+ ))),
+ }
+ }
+
+ /// Return a reference to the metadata field
+ pub fn metadata_field(&self) -> &BinaryViewArray {
+ match self {
+ ShreddingState::Unshredded { metadata, .. } => metadata,
+ ShreddingState::FullyShredded { metadata, .. } => metadata,
+ ShreddingState::PartiallyShredded { metadata, .. } => metadata,
+ }
+ }
+
+ /// Return a reference to the value field, if present
+ pub fn value_field(&self) -> Option<&BinaryViewArray> {
+ match self {
+ ShreddingState::Unshredded { value, .. } => Some(value),
+ ShreddingState::FullyShredded { .. } => None,
+ ShreddingState::PartiallyShredded { value, .. } => Some(value),
+ }
+ }
+
+ /// Return a reference to the typed_value field, if present
+ pub fn typed_value_field(&self) -> Option<&ArrayRef> {
+ match self {
+ ShreddingState::Unshredded { .. } => None,
+ ShreddingState::FullyShredded { typed_value, .. } =>
Some(typed_value),
+ ShreddingState::PartiallyShredded { typed_value, .. } =>
Some(typed_value),
+ }
+ }
+
+ /// Slice all the underlying arrays
+ pub fn slice(&self, offset: usize, length: usize) -> Self {
+ match self {
+ ShreddingState::Unshredded { metadata, value } =>
ShreddingState::Unshredded {
+ metadata: metadata.slice(offset, length),
+ value: value.slice(offset, length),
+ },
+ ShreddingState::FullyShredded {
+ metadata,
+ typed_value,
+ } => ShreddingState::FullyShredded {
+ metadata: metadata.slice(offset, length),
+ typed_value: typed_value.slice(offset, length),
+ },
+ ShreddingState::PartiallyShredded {
+ metadata,
+ value,
+ typed_value,
+ } => ShreddingState::PartiallyShredded {
+ metadata: metadata.slice(offset, length),
+ value: value.slice(offset, length),
+ typed_value: typed_value.slice(offset, length),
+ },
+ }
+ }
+}
+
+/// returns the non-null element at index as a Variant
+fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant {
Review Comment:
> For public API -- 100% agree. This is a private internal API tho, so it
seems like we have room to do what we think makes the code simple/maintainable.
Pulling important work inside the method instead of requiring all callers to
remember it seems like a good example of that.
Sorry -- I wasn't clear -- the reason I was talking about
`VariantArray::value` is that it is the only caller of `typed_value_to_variant`
so if we return `Option` from this value, we would just be stuck at the same
place 🤔
> Here, we're accessing is_null and value both inside the loop -- and the
latter conditionally -- so I'd be very surprised if LLVM was willing to inject
vectorization code that requires touching values the code said not to touch.
Yeah I agree I don't think it will make any difference for performance with
Cariants. The primary reason in my mind is to be consistent with other APIs.
I think in an earlier version of this PR I actually had changed `value()`
return `Option`. Maybe changing the signature is a good idea 🤔
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]