scovich commented on code in PR #9791:
URL: https://github.com/apache/arrow-rs/pull/9791#discussion_r3251117749


##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -809,125 +827,97 @@ impl From<ShreddedVariantFieldArray> for StructArray {
 ///
 /// [Parquet Variant Shredding Spec]: 
https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
 #[derive(Debug, Clone)]
-pub struct ShreddingState {
-    value: Option<ArrayRef>,
-    typed_value: Option<ArrayRef>,
+pub struct ShreddingState<'a> {
+    value: Option<Cow<'a, ArrayRef>>,
+    typed_value: Option<Cow<'a, ArrayRef>>,
 }
 
-impl ShreddingState {
-    /// Create a new `ShreddingState` from the given `value` and `typed_value` 
fields
-    ///
-    /// Note you can create a `ShreddingState` from a &[`StructArray`] using
-    /// `ShreddingState::try_from(&struct_array)`, for example:
+impl ShreddingState<'static> {
+    /// Create a new owned `ShreddingState` from the given `value` and 
`typed_value` fields
     ///
-    /// ```no_run
-    /// # use arrow::array::StructArray;
-    /// # use parquet_variant_compute::ShreddingState;
-    /// # fn get_struct_array() -> StructArray {
-    /// #   unimplemented!()
-    /// # }
-    /// let struct_array: StructArray = get_struct_array();
-    /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap();
-    /// ```
+    /// Use this when constructing a `ShreddingState` from freshly produced 
arrays (e.g., the
+    /// result of a `take` kernel call). For borrowing an existing 
`StructArray`'s columns, use
+    /// `ShreddingState::try_from(&struct_array)`.
     pub fn new(value: Option<ArrayRef>, typed_value: Option<ArrayRef>) -> Self 
{
-        Self { value, typed_value }
+        Self {
+            value: value.map(Cow::Owned),
+            typed_value: typed_value.map(Cow::Owned),
+        }
     }
+}
 
+impl<'a> ShreddingState<'a> {
     /// Return a reference to the value field, if present
     pub fn value_field(&self) -> Option<&ArrayRef> {
-        self.value.as_ref()
+        self.value.as_deref()
     }
 
     /// Return a reference to the typed_value field, if present
     pub fn typed_value_field(&self) -> Option<&ArrayRef> {
-        self.typed_value.as_ref()
+        self.typed_value.as_deref()
     }
 
-    /// Returns a borrowed version of this shredding state
-    pub fn borrow(&self) -> BorrowedShreddingState<'_> {
-        BorrowedShreddingState {
-            value: self.value_field(),
-            typed_value: self.typed_value_field(),
-        }
+    /// Consume self and return the `(value, typed_value)` Cows so callers can 
extract long-lived
+    /// borrows from `Cow::Borrowed` variants.
+    pub(crate) fn into_fields(self) -> (Option<Cow<'a, ArrayRef>>, 
Option<Cow<'a, ArrayRef>>) {
+        (self.value, self.typed_value)
     }
 
-    /// Slice all the underlying arrays
-    pub fn slice(&self, offset: usize, length: usize) -> Self {
-        Self {
-            value: self.value.as_ref().map(|v| v.slice(offset, length)),
-            typed_value: self.typed_value.as_ref().map(|tv| tv.slice(offset, 
length)),
+    /// Convert into a `'static`-lifetime state by cloning any borrowed Arcs. 
Cheap (just Arc
+    /// refcount bumps for each `Cow::Borrowed`; no-op for `Cow::Owned`).
+    pub fn into_static(self) -> ShreddingState<'static> {
+        ShreddingState {
+            value: self.value.map(|cow| Cow::Owned(cow.into_owned())),
+            typed_value: self.typed_value.map(|cow| 
Cow::Owned(cow.into_owned())),
         }
     }
-}
-
-/// Similar to [`ShreddingState`] except it holds borrowed references of the 
target arrays. Useful
-/// for avoiding clone operations when the caller does not need a 
self-standing shredding state.
-#[derive(Clone, Debug)]
-pub struct BorrowedShreddingState<'a> {
-    value: Option<&'a ArrayRef>,
-    typed_value: Option<&'a ArrayRef>,
-}
 
-impl<'a> BorrowedShreddingState<'a> {
-    /// Create a new `BorrowedShreddingState` from the given `value` and 
`typed_value` fields
-    ///
-    /// Note you can create a `BorrowedShreddingState` from a &[`StructArray`] 
using
-    /// `BorrowedShreddingState::try_from(&struct_array)`, for example:
+    /// Consume self and return the `(value, typed_value)` borrowed 
references, asserting that
+    /// the state was constructed from borrowed data. Panics on `Cow::Owned`.
     ///
-    /// ```no_run
-    /// # use arrow::array::StructArray;
-    /// # use parquet_variant_compute::BorrowedShreddingState;
-    /// # fn get_struct_array() -> StructArray {
-    /// #   unimplemented!()
-    /// # }
-    /// let struct_array: StructArray = get_struct_array();
-    /// let shredding_state = 
BorrowedShreddingState::try_from(&struct_array).unwrap();
-    /// ```
-    pub fn new(value: Option<&'a ArrayRef>, typed_value: Option<&'a ArrayRef>) 
-> Self {
-        Self { value, typed_value }
-    }
-
-    /// Return a reference to the value field, if present
-    pub fn value_field(&self) -> Option<&'a ArrayRef> {
-        self.value
+    /// Use this in code paths (like unshred) that only ever receive borrowed 
state and need
+    /// `&'a ArrayRef` references to store in lifetime-parameterized builders.
+    pub(crate) fn into_borrowed_fields(self) -> (Option<&'a ArrayRef>, 
Option<&'a ArrayRef>) {
+        fn expect_borrowed<'a>(opt: Option<Cow<'a, ArrayRef>>) -> Option<&'a 
ArrayRef> {
+            opt.map(|cow| match cow {
+                Cow::Borrowed(r) => r,
+                Cow::Owned(_) => unreachable!("expected Cow::Borrowed 
ShreddingState"),
+            })
+        }
+        (
+            expect_borrowed(self.value),
+            expect_borrowed(self.typed_value),
+        )
     }
 
-    /// Return a reference to the typed_value field, if present
-    pub fn typed_value_field(&self) -> Option<&'a ArrayRef> {
-        self.typed_value
+    /// Slice all the underlying arrays. Produces owned data.
+    pub fn slice(&self, offset: usize, length: usize) -> 
ShreddingState<'static> {
+        ShreddingState {
+            value: self
+                .value_field()
+                .map(|v| Cow::Owned(v.slice(offset, length))),
+            typed_value: self
+                .typed_value_field()
+                .map(|tv| Cow::Owned(tv.slice(offset, length))),
+        }
     }
 }
 
-impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
+impl<'a> TryFrom<&'a StructArray> for ShreddingState<'a> {
     type Error = ArrowError;
 
     fn try_from(inner_struct: &'a StructArray) -> Result<Self> {
         // The `value` column need not exist, but if it does it must be a 
binary type.
         let value = if let Some(value_col) = 
inner_struct.column_by_name("value") {
             validate_binary_array(value_col.as_ref(), "value")?;
-            Some(value_col)
+            Some(Cow::Borrowed(value_col))
         } else {
             None
         };

Review Comment:
   aside: isn't this just a `transpose`?
   ```rust
   let value = inner_struct
       .column_by_name("value")
       .map(|value| {
           validate_binary_array(...)?;
           Ok(Cow::Borrowed(value))
       }
       .transpose()?; 
   ```
   (tho it's not shorter, so meh)



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -25,74 +25,115 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
 use parquet_variant::{VariantPath, VariantPathElement};
 
 use crate::VariantArray;
-use crate::variant_array::BorrowedShreddingState;
+use crate::variant_array::{ShreddingState, validate_binary_array};
 use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
 
 use arrow::array::AsArray;
+use std::borrow::Cow;
 use std::sync::Arc;
 
 pub(crate) enum ShreddedPathStep<'a> {
-    /// Path step succeeded, return the new shredding state
-    Success(BorrowedShreddingState<'a>),
+    /// Path step succeeded.
+    Success {
+        /// The shredding state for the next step.
+        next_state: ShreddingState<'a>,
+        /// The `typed_value` column the step descended through. Returned so 
the caller can union
+        /// its nulls into the accumulated null buffer without needing to 
re-borrow from the
+        /// (now-consumed) input state.
+        consumed_typed_value: Cow<'a, ArrayRef>,

Review Comment:
   We can't just return a reference to the null buffer because of lifetime 
issues, I guess?



##########
parquet-variant-compute/src/unshred_variant.rs:
##########
@@ -175,12 +174,17 @@ impl<'a> UnshredVariantRowBuilder<'a> {
         }
     }
 
-    /// Creates a new UnshredVariantRowBuilder from shredding state
-    /// Returns None for None/None case - caller decides how to handle based 
on context
-    fn try_new_opt(shredding_state: BorrowedShreddingState<'a>) -> 
Result<Option<Self>> {
-        let value = shredding_state.value_field();
-        let typed_value = shredding_state.typed_value_field();
-        let Some(typed_value) = typed_value else {
+    /// Creates a new UnshredVariantRowBuilder from the `(value, typed_value)` 
pair of a shredded
+    /// variant struct. Returns None for the None/None case - caller decides 
how to handle based on
+    /// context.
+    fn try_new_opt(inner_struct: &'a StructArray) -> Result<Option<Self>> {

Review Comment:
   Sorry I was too slow -- I agree with your assessment (from five hours ago) 
that 1-2 arc bumps per path step should be acceptable. But it looks like you 
already implemented the Cow version (two hours ago)?



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -676,16 +687,23 @@ impl ShreddedVariantFieldArray {
             ));
         };
 
+        // Cache the (value, typed_value) lookup. try_from validates the value 
column type.
+        let shredding_state = 
ShreddingState::try_from(inner_struct)?.into_static();
+
         // Note this clone is cheap, it just bumps the ref count

Review Comment:
   aside: `inner_struct` is a `StructArray`, not an `ArrayRef`, so `clone` has 
to round trip through `ArrayData`, no?



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -25,74 +25,115 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
 use parquet_variant::{VariantPath, VariantPathElement};
 
 use crate::VariantArray;
-use crate::variant_array::BorrowedShreddingState;
+use crate::variant_array::{ShreddingState, validate_binary_array};
 use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
 
 use arrow::array::AsArray;
+use std::borrow::Cow;
 use std::sync::Arc;
 
 pub(crate) enum ShreddedPathStep<'a> {
-    /// Path step succeeded, return the new shredding state
-    Success(BorrowedShreddingState<'a>),
+    /// Path step succeeded.
+    Success {
+        /// The shredding state for the next step.
+        next_state: ShreddingState<'a>,
+        /// The `typed_value` column the step descended through. Returned so 
the caller can union
+        /// its nulls into the accumulated null buffer without needing to 
re-borrow from the
+        /// (now-consumed) input state.
+        consumed_typed_value: Cow<'a, ArrayRef>,
+    },
     /// The path element is not present in the `typed_value` column and there 
is no `value` column,
     /// so we know it does not exist. It, and all paths under it, are all-NULL.
     Missing,
-    /// The path element is not present in the `typed_value` column and must 
be retrieved from the `value`
-    /// column instead. The caller should be prepared to handle any value, 
including the requested
-    /// type, an arbitrary "wrong" type, or `Variant::Null`.
-    NotShredded,
+    /// The path element is not present in the `typed_value` column and must 
be retrieved from the
+    /// `value` column instead. The caller should be prepared to handle any 
value, including the
+    /// requested type, an arbitrary "wrong" type, or `Variant::Null`. The 
consumed `value` column
+    /// is returned so the caller can use it without re-borrowing from the 
(now-consumed) input.
+    NotShredded {
+        consumed_value: Option<Cow<'a, ArrayRef>>,
+    },
+}
+
+/// Walk into `typed_value`'s nested `StructArray` and look up `name`. Returns
+/// `Ok(Some(nested_struct))` on success, `Ok(None)` for a missing path 
(non-struct or missing
+/// field), and `Err` on an invalid nested struct.
+fn walk_field_step<'b>(typed_value: &'b ArrayRef, name: &str) -> 
Result<Option<&'b StructArray>> {
+    let Some(struct_array) = typed_value.as_struct_opt() else {
+        return Ok(None);
+    };
+    let Some(field) = struct_array.column_by_name(name) else {
+        return Ok(None);
+    };
+    let nested_struct = field.as_struct_opt().ok_or_else(|| {
+        ArrowError::InvalidArgumentError(format!(
+            "Expected Struct array while following path, got {}",
+            field.data_type(),
+        ))
+    })?;
+    Ok(Some(nested_struct))
 }
 
 /// Given a shredded variant field -- a `(value?, typed_value?)` pair -- try 
to take one path step
 /// deeper. For a `VariantPathElement::Field`, if there is no `typed_value` at 
this level, if
 /// `typed_value` is not a struct, or if the requested field name does not 
exist, traversal returns
 /// a missing-path step (`Missing` or `NotShredded` depending on whether 
`value` exists).
 ///
+/// The state is consumed by value so its lifetime `'a` (rooted at the 
original input) is preserved
+/// onto the returned step. For `Cow::Borrowed` inputs this lets the chain 
walk all the way down
+/// without per-step Arc bumps; for `Cow::Owned` inputs (e.g., `take`-kernel 
results) the function
+/// clones the new state's columns since the owned typed_value goes out of 
scope on return.
+///
 /// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe 
not even possible.
 pub(crate) fn follow_shredded_path_element<'a>(
-    shredding_state: &BorrowedShreddingState<'a>,
+    shredding_state: ShreddingState<'a>,
     path_element: &VariantPathElement<'_>,
     _cast_options: &CastOptions,
 ) -> Result<ShreddedPathStep<'a>> {
-    // If the requested path element is not present in `typed_value`, and 
`value` is missing, then
-    // we know it does not exist; it, and all paths under it, are all-NULL.
-    let missing_path_step = || match shredding_state.value_field() {
-        Some(_) => ShreddedPathStep::NotShredded,
+    let (value, typed_value) = shredding_state.into_fields();
+    let missing_path_step = |value: Option<Cow<'a, ArrayRef>>| match value {
+        Some(_) => ShreddedPathStep::NotShredded {
+            consumed_value: value,
+        },
         None => ShreddedPathStep::Missing,
     };
 
-    let Some(typed_value) = shredding_state.typed_value_field() else {
-        return Ok(missing_path_step());
+    let Some(typed_value) = typed_value else {
+        return Ok(missing_path_step(value));
     };
 
     match path_element {
-        VariantPathElement::Field { name } => {
-            // Try to step into the requested field name of a struct.
-            // First, try to downcast to StructArray
-            let Some(struct_array) = 
typed_value.as_any().downcast_ref::<StructArray>() else {
-                // Object field path step follows JSONPath semantics and 
returns missing path step (NotShredded/Missing) on non-struct path
-                return Ok(missing_path_step());
-            };
-
-            // Now try to find the column - missing column in a present struct 
is just missing data
-            let Some(field) = struct_array.column_by_name(name) else {
-                // Missing column in a present struct is just missing, not 
wrong - return Ok
-                return Ok(missing_path_step());
-            };
-
-            let struct_array = field.as_struct_opt().ok_or_else(|| {
-                // TODO: Should we blow up? Or just end the traversal and let 
the normal
-                // variant pathing code sort out the mess that it must anyway 
be
-                // prepared to handle?
-                ArrowError::InvalidArgumentError(format!(
-                    "Expected Struct array while following path, got {}",
-                    field.data_type(),
-                ))
-            })?;
-
-            let state = BorrowedShreddingState::try_from(struct_array)?;
-            Ok(ShreddedPathStep::Success(state))
-        }
+        VariantPathElement::Field { name } => match typed_value {
+            Cow::Borrowed(typed_value_ref) => {
+                let Some(nested) = walk_field_step(typed_value_ref, name)? 
else {
+                    return Ok(missing_path_step(value));
+                };
+                // nested: &'a StructArray, so try_from produces 
ShreddingState<'a> with
+                // Cow::Borrowed views into `nested`. No Arc bumps on this 
path.
+                let next_state = ShreddingState::try_from(nested)?;
+                Ok(ShreddedPathStep::Success {
+                    next_state,
+                    consumed_typed_value: Cow::Borrowed(typed_value_ref),
+                })
+            }
+            Cow::Owned(typed_value_arc) => {
+                let Some(nested) = walk_field_step(&typed_value_arc, name)? 
else {
+                    return Ok(missing_path_step(value));
+                };
+                // `typed_value_arc` is the owned input and is about to move 
into the returned
+                // `consumed_typed_value`. We cannot also borrow into it 
(self-referential), so we
+                // clone `nested`'s columns to produce a 
`ShreddingState<'static>` (coerces to 'a).

Review Comment:
   I don't understand how this could work? `'a` is the lifetime of the root, 
and some cloned Arc will have a "local" lifetime instead. How can that owned 
value coerce to `'a` lifetime? Or is `'a` actually something shorter-lived than 
the root?



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -25,74 +25,115 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
 use parquet_variant::{VariantPath, VariantPathElement};
 
 use crate::VariantArray;
-use crate::variant_array::BorrowedShreddingState;
+use crate::variant_array::{ShreddingState, validate_binary_array};
 use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
 
 use arrow::array::AsArray;
+use std::borrow::Cow;
 use std::sync::Arc;
 
 pub(crate) enum ShreddedPathStep<'a> {
-    /// Path step succeeded, return the new shredding state
-    Success(BorrowedShreddingState<'a>),
+    /// Path step succeeded.
+    Success {
+        /// The shredding state for the next step.
+        next_state: ShreddingState<'a>,
+        /// The `typed_value` column the step descended through. Returned so 
the caller can union
+        /// its nulls into the accumulated null buffer without needing to 
re-borrow from the
+        /// (now-consumed) input state.
+        consumed_typed_value: Cow<'a, ArrayRef>,
+    },
     /// The path element is not present in the `typed_value` column and there 
is no `value` column,
     /// so we know it does not exist. It, and all paths under it, are all-NULL.
     Missing,
-    /// The path element is not present in the `typed_value` column and must 
be retrieved from the `value`
-    /// column instead. The caller should be prepared to handle any value, 
including the requested
-    /// type, an arbitrary "wrong" type, or `Variant::Null`.
-    NotShredded,
+    /// The path element is not present in the `typed_value` column and must 
be retrieved from the
+    /// `value` column instead. The caller should be prepared to handle any 
value, including the
+    /// requested type, an arbitrary "wrong" type, or `Variant::Null`. The 
consumed `value` column
+    /// is returned so the caller can use it without re-borrowing from the 
(now-consumed) input.
+    NotShredded {
+        consumed_value: Option<Cow<'a, ArrayRef>>,
+    },
+}
+
+/// Walk into `typed_value`'s nested `StructArray` and look up `name`. Returns
+/// `Ok(Some(nested_struct))` on success, `Ok(None)` for a missing path 
(non-struct or missing
+/// field), and `Err` on an invalid nested struct.
+fn walk_field_step<'b>(typed_value: &'b ArrayRef, name: &str) -> 
Result<Option<&'b StructArray>> {
+    let Some(struct_array) = typed_value.as_struct_opt() else {
+        return Ok(None);
+    };
+    let Some(field) = struct_array.column_by_name(name) else {
+        return Ok(None);
+    };
+    let nested_struct = field.as_struct_opt().ok_or_else(|| {
+        ArrowError::InvalidArgumentError(format!(

Review Comment:
   This would mean the Array doesn't follow the shredding spec? Because the 
field should be a struct with value/typed_value children?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to