sdf-jkl commented on code in PR #8354:
URL: https://github.com/apache/arrow-rs/pull/8354#discussion_r2461286479
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -870,6 +870,40 @@ impl From<BorrowedShreddingState<'_>> for ShreddingState {
}
}
+pub enum ShreddingStateCow<'a> {
+ Owned(ShreddingState),
+ Borrowed(BorrowedShreddingState<'a>),
+}
+
+impl<'a> From<ShreddingState> for ShreddingStateCow<'a> {
+ fn from(s: ShreddingState) -> Self {
+ Self::Owned(s)
+ }
+}
+impl<'a> From<BorrowedShreddingState<'a>> for ShreddingStateCow<'a> {
+ fn from(s: BorrowedShreddingState<'a>) -> Self {
+ Self::Borrowed(s)
+ }
+}
+
+impl<'a> ShreddingStateCow<'a> {
+ /// Always gives the caller a borrowed view, even if we own internally.
+ pub fn as_view(&self) -> BorrowedShreddingState<'_> {
+ match self {
+ ShreddingStateCow::Borrowed(b) => b.clone(),
+ ShreddingStateCow::Owned(o) => o.borrow(),
+ }
+ }
+
+ /// Materialize ownership when the caller needs to keep it.
+ pub fn into_owned(self) -> ShreddingState {
+ match self {
+ ShreddingStateCow::Borrowed(b) => b.into(),
+ ShreddingStateCow::Owned(o) => o,
+ }
+ }
+}
Review Comment:
Here is the new `ShreddingStateCow` enum implementation
##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -151,20 +205,21 @@ fn shredded_get_path(
// Peel away the prefix of path elements that traverses the shredded parts
of this variant
// column. Shredding will traverse the rest of the path on a per-row basis.
- let mut shredding_state = input.shredding_state().borrow();
+ let mut shredding_state =
ShreddingStateCow::Borrowed(input.shredding_state().borrow());
let mut accumulated_nulls = input.inner().nulls().cloned();
let mut path_index = 0;
for path_element in path {
- match follow_shredded_path_element(&shredding_state, path_element,
cast_options)? {
+ match follow_shredded_path_element(&shredding_state.as_view(),
path_element, cast_options)?
+ {
ShreddedPathStep::Success(state) => {
// Union nulls from the typed_value we just accessed
- if let Some(typed_value) = shredding_state.typed_value_field()
{
+ if let Some(typed_value) =
shredding_state.as_view().typed_value_field() {
accumulated_nulls = arrow::buffer::NullBuffer::union(
accumulated_nulls.as_ref(),
typed_value.nulls(),
);
}
- shredding_state = state;
+ shredding_state = ShreddingStateCow::Owned(state.into_owned());
Review Comment:
Here I could not come up with a way to make the `shredding_state` for the
next `path_element` be ither borrowed or owned depending on the
`follow_shredded_path_element` output.
Made it `into_owned()` just to pass the borrow checker.
##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -97,14 +98,67 @@ pub(crate) fn follow_shredded_path_element<'a>(
})?;
let state = BorrowedShreddingState::try_from(struct_array)?;
- Ok(ShreddedPathStep::Success(state))
+ Ok(ShreddedPathStep::Success(state.into()))
}
- VariantPathElement::Index { .. } => {
+ VariantPathElement::Index { index } => {
// TODO: Support array indexing. Among other things, it will
require slicing not
// only the array we have here, but also the corresponding
metadata and null masks.
- Err(ArrowError::NotYetImplemented(
- "Pathing into shredded variant array index".into(),
- ))
+ let Some(list_array) =
typed_value.as_any().downcast_ref::<GenericListArray<i32>>()
+ else {
+ // Downcast failure - if strict cast options are enabled, this
should be an error
+ if !cast_options.safe {
+ return Err(ArrowError::CastError(format!(
+ "Cannot access index '{}' on non-list type: {}",
+ index,
+ typed_value.data_type()
+ )));
+ }
+ // With safe cast options, return NULL (missing_path_step)
+ return Ok(missing_path_step());
+ };
+
+ let offsets = list_array.offsets();
+ let values = list_array.values(); // This is a StructArray
+
+ let Some(struct_array) =
values.as_any().downcast_ref::<StructArray>() else {
+ return Ok(missing_path_step());
+ };
+
+ let Some(typed_array) = struct_array.column_by_name("typed_value")
else {
+ return Ok(missing_path_step());
+ };
+
+ // Build the list of indices to take
+ let mut take_indices = Vec::with_capacity(list_array.len());
+ for i in 0..list_array.len() {
+ let start = offsets[i] as usize;
+ let end = offsets[i + 1] as usize;
+ let len = end - start;
+
+ if *index < len {
+ take_indices.push(Some((start + index) as u32));
+ } else {
+ take_indices.push(None);
+ }
+ }
+
+ let index_array = UInt32Array::from(take_indices);
+
+ // Use Arrow compute kernel to gather elements
+ let taken = take(typed_array, &index_array, None)?;
+
+ let metadata_array =
BinaryViewArray::from_iter_values(std::iter::repeat_n(
+ EMPTY_VARIANT_METADATA_BYTES,
+ taken.len(),
+ ));
+
+ let struct_array = &StructArrayBuilder::new()
+ .with_field("metadata", Arc::new(metadata_array), false)
+ .with_field("typed_value", taken, true)
+ .build();
+
+ let state = ShreddingState::try_from(struct_array)?;
+ Ok(ShreddedPathStep::Success(state.into()))
}
}
}
Review Comment:
When we use `variant_get` on `Struct` `Variant Array`'s it's relatively easy
to extract the `typed_value`. For example, if we extract a.b because on the
inside it's just:
```
VariantArray{
StructArray{
"typed_value":
StructArray{
"typed_value": PrimiteArray, <- We can directly borrow the
value into
ShreddingState::Success()
because the needed values in the array are contiguous
"value": VariantArray
```
But if we try to extract "typed_value" from a `List` `VariantArray` it gets
more complicated. For example, extracting 0.0:
```
VariantArray{
StructArray{
"typed_value":
ListArray{
Offsets
StructArray{
"typed_value": PrimiteArray, <- but the values are now
not contiguous, and the
output array can only be
extracted using offsets, no borrow available
"value": VariantArray
```
Because of this issue the output of `follow_shredded_path_element` ->
ShreddedPathStep::Success can end up receiving `BorrowedShreddingState` or
owned `ShreddingState`.
To make this work I added a `ShreddingStateCow` enum and made it the
`ShreddedPathStep::Success` input.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]