sdf-jkl commented on code in PR #9791:
URL: https://github.com/apache/arrow-rs/pull/9791#discussion_r3251649202


##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -25,74 +25,115 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
 use parquet_variant::{VariantPath, VariantPathElement};
 
 use crate::VariantArray;
-use crate::variant_array::BorrowedShreddingState;
+use crate::variant_array::{ShreddingState, validate_binary_array};
 use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
 
 use arrow::array::AsArray;
+use std::borrow::Cow;
 use std::sync::Arc;
 
 pub(crate) enum ShreddedPathStep<'a> {
-    /// Path step succeeded, return the new shredding state
-    Success(BorrowedShreddingState<'a>),
+    /// Path step succeeded.
+    Success {
+        /// The shredding state for the next step.
+        next_state: ShreddingState<'a>,
+        /// The `typed_value` column the step descended through. Returned so 
the caller can union
+        /// its nulls into the accumulated null buffer without needing to 
re-borrow from the
+        /// (now-consumed) input state.
+        consumed_typed_value: Cow<'a, ArrayRef>,

Review Comment:
   We'd clone otherwise, which is just an Arc Bump



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -25,74 +25,115 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
 use parquet_variant::{VariantPath, VariantPathElement};
 
 use crate::VariantArray;
-use crate::variant_array::BorrowedShreddingState;
+use crate::variant_array::{ShreddingState, validate_binary_array};
 use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
 
 use arrow::array::AsArray;
+use std::borrow::Cow;
 use std::sync::Arc;
 
 pub(crate) enum ShreddedPathStep<'a> {
-    /// Path step succeeded, return the new shredding state
-    Success(BorrowedShreddingState<'a>),
+    /// Path step succeeded.
+    Success {
+        /// The shredding state for the next step.
+        next_state: ShreddingState<'a>,
+        /// The `typed_value` column the step descended through. Returned so 
the caller can union
+        /// its nulls into the accumulated null buffer without needing to 
re-borrow from the
+        /// (now-consumed) input state.
+        consumed_typed_value: Cow<'a, ArrayRef>,
+    },
     /// The path element is not present in the `typed_value` column and there 
is no `value` column,
     /// so we know it does not exist. It, and all paths under it, are all-NULL.
     Missing,
-    /// The path element is not present in the `typed_value` column and must 
be retrieved from the `value`
-    /// column instead. The caller should be prepared to handle any value, 
including the requested
-    /// type, an arbitrary "wrong" type, or `Variant::Null`.
-    NotShredded,
+    /// The path element is not present in the `typed_value` column and must 
be retrieved from the
+    /// `value` column instead. The caller should be prepared to handle any 
value, including the
+    /// requested type, an arbitrary "wrong" type, or `Variant::Null`. The 
consumed `value` column
+    /// is returned so the caller can use it without re-borrowing from the 
(now-consumed) input.
+    NotShredded {
+        consumed_value: Option<Cow<'a, ArrayRef>>,
+    },
+}
+
+/// Walk into `typed_value`'s nested `StructArray` and look up `name`. Returns
+/// `Ok(Some(nested_struct))` on success, `Ok(None)` for a missing path 
(non-struct or missing
+/// field), and `Err` on an invalid nested struct.
+fn walk_field_step<'b>(typed_value: &'b ArrayRef, name: &str) -> 
Result<Option<&'b StructArray>> {
+    let Some(struct_array) = typed_value.as_struct_opt() else {
+        return Ok(None);
+    };
+    let Some(field) = struct_array.column_by_name(name) else {
+        return Ok(None);
+    };
+    let nested_struct = field.as_struct_opt().ok_or_else(|| {
+        ArrowError::InvalidArgumentError(format!(

Review Comment:
   Yes



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -25,74 +25,115 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
 use parquet_variant::{VariantPath, VariantPathElement};
 
 use crate::VariantArray;
-use crate::variant_array::BorrowedShreddingState;
+use crate::variant_array::{ShreddingState, validate_binary_array};
 use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
 
 use arrow::array::AsArray;
+use std::borrow::Cow;
 use std::sync::Arc;
 
 pub(crate) enum ShreddedPathStep<'a> {
-    /// Path step succeeded, return the new shredding state
-    Success(BorrowedShreddingState<'a>),
+    /// Path step succeeded.
+    Success {
+        /// The shredding state for the next step.
+        next_state: ShreddingState<'a>,
+        /// The `typed_value` column the step descended through. Returned so 
the caller can union
+        /// its nulls into the accumulated null buffer without needing to 
re-borrow from the
+        /// (now-consumed) input state.
+        consumed_typed_value: Cow<'a, ArrayRef>,
+    },
     /// The path element is not present in the `typed_value` column and there 
is no `value` column,
     /// so we know it does not exist. It, and all paths under it, are all-NULL.
     Missing,
-    /// The path element is not present in the `typed_value` column and must 
be retrieved from the `value`
-    /// column instead. The caller should be prepared to handle any value, 
including the requested
-    /// type, an arbitrary "wrong" type, or `Variant::Null`.
-    NotShredded,
+    /// The path element is not present in the `typed_value` column and must 
be retrieved from the
+    /// `value` column instead. The caller should be prepared to handle any 
value, including the
+    /// requested type, an arbitrary "wrong" type, or `Variant::Null`. The 
consumed `value` column
+    /// is returned so the caller can use it without re-borrowing from the 
(now-consumed) input.
+    NotShredded {
+        consumed_value: Option<Cow<'a, ArrayRef>>,
+    },
+}
+
+/// Walk into `typed_value`'s nested `StructArray` and look up `name`. Returns
+/// `Ok(Some(nested_struct))` on success, `Ok(None)` for a missing path 
(non-struct or missing
+/// field), and `Err` on an invalid nested struct.
+fn walk_field_step<'b>(typed_value: &'b ArrayRef, name: &str) -> 
Result<Option<&'b StructArray>> {
+    let Some(struct_array) = typed_value.as_struct_opt() else {
+        return Ok(None);
+    };
+    let Some(field) = struct_array.column_by_name(name) else {
+        return Ok(None);
+    };
+    let nested_struct = field.as_struct_opt().ok_or_else(|| {
+        ArrowError::InvalidArgumentError(format!(
+            "Expected Struct array while following path, got {}",
+            field.data_type(),
+        ))
+    })?;
+    Ok(Some(nested_struct))
 }
 
 /// Given a shredded variant field -- a `(value?, typed_value?)` pair -- try 
to take one path step
 /// deeper. For a `VariantPathElement::Field`, if there is no `typed_value` at 
this level, if
 /// `typed_value` is not a struct, or if the requested field name does not 
exist, traversal returns
 /// a missing-path step (`Missing` or `NotShredded` depending on whether 
`value` exists).
 ///
+/// The state is consumed by value so its lifetime `'a` (rooted at the 
original input) is preserved
+/// onto the returned step. For `Cow::Borrowed` inputs this lets the chain 
walk all the way down
+/// without per-step Arc bumps; for `Cow::Owned` inputs (e.g., `take`-kernel 
results) the function
+/// clones the new state's columns since the owned typed_value goes out of 
scope on return.
+///
 /// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe 
not even possible.
 pub(crate) fn follow_shredded_path_element<'a>(
-    shredding_state: &BorrowedShreddingState<'a>,
+    shredding_state: ShreddingState<'a>,
     path_element: &VariantPathElement<'_>,
     _cast_options: &CastOptions,
 ) -> Result<ShreddedPathStep<'a>> {
-    // If the requested path element is not present in `typed_value`, and 
`value` is missing, then
-    // we know it does not exist; it, and all paths under it, are all-NULL.
-    let missing_path_step = || match shredding_state.value_field() {
-        Some(_) => ShreddedPathStep::NotShredded,
+    let (value, typed_value) = shredding_state.into_fields();
+    let missing_path_step = |value: Option<Cow<'a, ArrayRef>>| match value {
+        Some(_) => ShreddedPathStep::NotShredded {
+            consumed_value: value,
+        },
         None => ShreddedPathStep::Missing,
     };
 
-    let Some(typed_value) = shredding_state.typed_value_field() else {
-        return Ok(missing_path_step());
+    let Some(typed_value) = typed_value else {
+        return Ok(missing_path_step(value));
     };
 
     match path_element {
-        VariantPathElement::Field { name } => {
-            // Try to step into the requested field name of a struct.
-            // First, try to downcast to StructArray
-            let Some(struct_array) = 
typed_value.as_any().downcast_ref::<StructArray>() else {
-                // Object field path step follows JSONPath semantics and 
returns missing path step (NotShredded/Missing) on non-struct path
-                return Ok(missing_path_step());
-            };
-
-            // Now try to find the column - missing column in a present struct 
is just missing data
-            let Some(field) = struct_array.column_by_name(name) else {
-                // Missing column in a present struct is just missing, not 
wrong - return Ok
-                return Ok(missing_path_step());
-            };
-
-            let struct_array = field.as_struct_opt().ok_or_else(|| {
-                // TODO: Should we blow up? Or just end the traversal and let 
the normal
-                // variant pathing code sort out the mess that it must anyway 
be
-                // prepared to handle?
-                ArrowError::InvalidArgumentError(format!(
-                    "Expected Struct array while following path, got {}",
-                    field.data_type(),
-                ))
-            })?;
-
-            let state = BorrowedShreddingState::try_from(struct_array)?;
-            Ok(ShreddedPathStep::Success(state))
-        }
+        VariantPathElement::Field { name } => match typed_value {
+            Cow::Borrowed(typed_value_ref) => {
+                let Some(nested) = walk_field_step(typed_value_ref, name)? 
else {
+                    return Ok(missing_path_step(value));
+                };
+                // nested: &'a StructArray, so try_from produces 
ShreddingState<'a> with
+                // Cow::Borrowed views into `nested`. No Arc bumps on this 
path.
+                let next_state = ShreddingState::try_from(nested)?;
+                Ok(ShreddedPathStep::Success {
+                    next_state,
+                    consumed_typed_value: Cow::Borrowed(typed_value_ref),
+                })
+            }
+            Cow::Owned(typed_value_arc) => {
+                let Some(nested) = walk_field_step(&typed_value_arc, name)? 
else {
+                    return Ok(missing_path_step(value));
+                };
+                // `typed_value_arc` is the owned input and is about to move 
into the returned
+                // `consumed_typed_value`. We cannot also borrow into it 
(self-referential), so we
+                // clone `nested`'s columns to produce a 
`ShreddingState<'static>` (coerces to 'a).

Review Comment:
   The `ShreddingState::new` returns a ShreddingState<'static> and it auto 
coerces into a <'a>. The values we pass in it are owned so it satisfies the 
<'a>.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to