scovich commented on code in PR #8021:
URL: https://github.com/apache/arrow-rs/pull/8021#discussion_r2257329656


##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -135,36 +142,195 @@ impl VariantArray {
         self.inner
     }
 
+    /// Return the shredding state of this `VariantArray`
+    pub fn shredding_state(&self) -> &ShreddingState {
+        &self.shredding_state
+    }
+
     /// Return the [`Variant`] instance stored at the given row
     ///
-    /// Panics if the index is out of bounds.
+    /// Consistent with other Arrow array types, this API requires you to
+    /// check for nulls first using [`Self::is_valid`].
+    ///
+    /// # Panics
+    /// * if the index is out of bounds
+    /// * if the array value is null
+    ///
+    /// If this is a shredded variant but has no value at the shredded location, it
+    /// will return [`Variant::Null`].
+    ///
+    ///
+    /// # Performance Note
+    ///
+    /// This is certainly not the most efficient way to access values in a
+    /// `VariantArray`, but it is useful for testing and debugging.
     ///
     /// Note: Does not do deep validation of the [`Variant`], so it is up to the
     /// caller to ensure that the metadata and value were constructed correctly.
     pub fn value(&self, index: usize) -> Variant {
-        let metadata = self.metadata_field().as_binary_view().value(index);
-        let value = self.value_field().as_binary_view().value(index);
-        Variant::new(metadata, value)
+        match &self.shredding_state {
+            ShreddingState::Unshredded { metadata, value } => {
+                Variant::new(metadata.value(index), value.value(index))
+            }
+            ShreddingState::FullyShredded { typed_value, .. } => {
+                if typed_value.is_null(index) {
+                    Variant::Null
+                } else {
+                    typed_value_to_variant(typed_value, index)
+                }
+            }
+            ShreddingState::PartiallyShredded {
+                metadata,
+                value,
+                typed_value,
+            } => {
+                if typed_value.is_null(index) {
+                    Variant::new(metadata.value(index), value.value(index))
+                } else {
+                    typed_value_to_variant(typed_value, index)
+                }
+            }
+        }
     }
 
-    fn find_metadata_field(array: &StructArray) -> Option<ArrayRef> {
-        array.column_by_name("metadata").cloned()
+    /// Return a reference to the metadata field of the [`StructArray`]
+    pub fn metadata_field(&self) -> &BinaryViewArray {
+        self.shredding_state.metadata_field()
     }
 
-    fn find_value_field(array: &StructArray) -> Option<ArrayRef> {
-        array.column_by_name("value").cloned()
+    /// Return a reference to the value field of the `StructArray`
+    pub fn value_field(&self) -> Option<&BinaryViewArray> {
+        self.shredding_state.value_field()
     }
 
-    /// Return a reference to the metadata field of the [`StructArray`]
-    pub fn metadata_field(&self) -> &ArrayRef {
-        // spec says fields order is not guaranteed, so we search by name
-        &self.metadata_ref
+    /// Return a reference to the typed_value field of the `StructArray`, if present
+    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
+        self.shredding_state.typed_value_field()
     }
+}
 
-    /// Return a reference to the value field of the `StructArray`
-    pub fn value_field(&self) -> &ArrayRef {
-        // spec says fields order is not guaranteed, so we search by name
-        &self.value_ref
+/// Variant arrays can be shredded in one of three states, encoded here
+#[derive(Debug)]
+pub enum ShreddingState {

Review Comment:
   For a future `FullyShreddedAllNull` variant (neither `value` nor `typed_value` present), would we still need to store the `metadata` even though it's never actually used? 🤔
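
   A hedged sketch of the shape I mean (purely hypothetical, not part of this PR; the variant name is illustrative):
   ```rust
   use arrow::array::BinaryViewArray;

   pub enum ShreddingStateSketch {
       // ...the three existing states would stay as-is...
       /// Neither `value` nor `typed_value` is present: every row is `Variant::Null`.
       AllNull {
           /// Possibly dead weight, since it would never be read for all-null rows?
           metadata: BinaryViewArray,
       },
   }
   ```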



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -93,35 +88,47 @@ impl VariantArray {
                 "Invalid VariantArray: requires StructArray as 
input".to_string(),
             ));
         };
-        // Ensure the StructArray has a metadata field of BinaryView
 
-        let Some(metadata_field) = VariantArray::find_metadata_field(inner) 
else {
+        // Note the specification allows for any order so we must search by 
name
+
+        // Ensure the StructArray has a metadata field of BinaryView
+        let Some(metadata_field) = inner.column_by_name("metadata") else {
             return Err(ArrowError::InvalidArgumentError(
                 "Invalid VariantArray: StructArray must contain a 'metadata' 
field".to_string(),
             ));
         };
-        if metadata_field.data_type() != &DataType::BinaryView {
+        let Some(metadata) = metadata_field.as_binary_view_opt() else {
             return Err(ArrowError::NotYetImplemented(format!(
                 "VariantArray 'metadata' field must be BinaryView, got {}",
                 metadata_field.data_type()
             )));
-        }
-        let Some(value_field) = VariantArray::find_value_field(inner) else {
-            return Err(ArrowError::InvalidArgumentError(
-                "Invalid VariantArray: StructArray must contain a 'value' 
field".to_string(),
-            ));
         };
-        if value_field.data_type() != &DataType::BinaryView {
-            return Err(ArrowError::NotYetImplemented(format!(
-                "VariantArray 'value' field must be BinaryView, got {}",
-                value_field.data_type()
-            )));
-        }
+
+        // Find the value field, if present
+        let value = inner
+            .column_by_name("value")
+            .map(|v| {
+                let Some(binary_view) = v.as_binary_view_opt() else {
+                    return Err(ArrowError::NotYetImplemented(format!(
+                        "VariantArray 'value' field must be BinaryView, got 
{}",
+                        v.data_type()
+                    )));
+                };
+                Ok(binary_view)

Review Comment:
   Isn't this just
   ```suggestion
                   v.as_binary_view_opt().ok_or_else(|| {
                       ArrowError::NotYetImplemented(format!(
                           "VariantArray 'value' field must be BinaryView, got {}",
                           v.data_type()
                       ))
                   })
   ```



##########
parquet-variant-compute/src/variant_get/output/primitive.rs:
##########
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::variant_get::output::OutputBuilder;
+use crate::VariantArray;
+use arrow::error::Result;
+
+use arrow::array::{
+    Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, 
NullBufferBuilder,
+    PrimitiveArray,
+};
+use arrow::compute::{cast_with_options, CastOptions};
+use arrow::datatypes::Int32Type;
+use arrow_schema::{ArrowError, FieldRef};
+use parquet_variant::{Variant, VariantPath};
+use std::marker::PhantomData;
+use std::sync::Arc;
+
+/// Trait for Arrow primitive types that can be used in the output builder
+///
+/// This just exists to add a generic way to convert from Variant to the 
primitive type
+pub(super) trait ArrowPrimitiveVariant: ArrowPrimitiveType {
+    /// Try to extract the primitive value from a Variant, returning None if it
+    /// cannot be converted
+    ///
+    /// TODO: figure out how to handle coercion/casting
+    fn from_variant(variant: &Variant) -> Option<Self::Native>;
+}
+
+/// Outputs Primitive arrays
+pub(super) struct PrimitiveOutputBuilder<'a, T: ArrowPrimitiveVariant> {
+    /// What path to extract
+    path: VariantPath<'a>,
+    /// Returned output type
+    as_type: FieldRef,
+    /// Controls the casting behavior (e.g. error vs substituting null on cast 
error).
+    cast_options: CastOptions<'a>,
+    /// Phantom data for the primitive type
+    _phantom: PhantomData<T>,
+}
+
+impl<'a, T: ArrowPrimitiveVariant> PrimitiveOutputBuilder<'a, T> {
+    pub(super) fn new(
+        path: VariantPath<'a>,
+        as_type: FieldRef,
+        cast_options: CastOptions<'a>,
+    ) -> Self {
+        Self {
+            path,
+            as_type,
+            cast_options,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for 
PrimitiveOutputBuilder<'a, T> {
+    fn partially_shredded(
+        &self,
+        variant_array: &VariantArray,
+        _metadata: &BinaryViewArray,
+        _value_field: &BinaryViewArray,
+        typed_value: &ArrayRef,
+    ) -> arrow::error::Result<ArrayRef> {
+        // build up the output array element by element
+        let mut nulls = NullBufferBuilder::new(variant_array.len());
+        let mut values = Vec::with_capacity(variant_array.len());
+        let typed_value =
+            cast_with_options(typed_value, self.as_type.data_type(), 
&self.cast_options)?;
+        // downcast to the primitive array (e.g. Int32Array, Float64Array, etc)
+        let typed_value = typed_value.as_primitive::<T>();
+
+        for i in 0..variant_array.len() {
+            if variant_array.is_null(i) {
+                nulls.append_null();
+                values.push(T::default_value()); // not used, placeholder
+                continue;
+            }
+
+            // if the typed value is null, decode the variant and extract the 
value
+            if typed_value.is_null(i) {
+                // todo follow path
+                let variant = variant_array.value(i);
+                let Some(value) = T::from_variant(&variant) else {
+                    if self.cast_options.safe {
+                        // safe mode: append null if we can't convert
+                        nulls.append_null();
+                        values.push(T::default_value()); // not used, 
placeholder
+                        continue;
+                    } else {
+                        return Err(ArrowError::CastError(format!(
+                            "Failed to extract primitive of type {} from 
variant {:?} at path {:?}",
+                            self.as_type.data_type(),
+                            variant,
+                            self.path
+                        )));
+                    }
+                };
+
+                nulls.append_non_null();
+                values.push(value)
+            } else {
+                // otherwise we have a typed value, so we can use it directly
+                nulls.append_non_null();
+                values.push(typed_value.value(i));
+            }
+        }
+
+        let nulls = nulls.finish();
+        let array = PrimitiveArray::<T>::new(values.into(), nulls)
+            .with_data_type(self.as_type.data_type().clone());
+        Ok(Arc::new(array))
+    }
+
+    fn fully_shredded(
+        &self,
+        _variant_array: &VariantArray,
+        _metadata: &BinaryViewArray,
+        typed_value: &ArrayRef,
+    ) -> arrow::error::Result<ArrayRef> {
+        // if the types match exactly, we can just return the typed_value
+        if typed_value.data_type() == self.as_type.data_type() {
+            Ok(typed_value.clone())
+        } else {
+            // TODO: try to cast the typed_value to the desired type?
+            Err(ArrowError::NotYetImplemented(format!(

Review Comment:
   I guess we could pursue three potential levels of casting:
   1. Widening casts (based on type). E.g. if the type is int32, we can also handle int8, int16 and even decimal4 (if scale=0). The conversion is guaranteed to succeed losslessly.
   2. Lossless narrowing casts (based on value). E.g. if the type is int32, we can still handle a small int64 value like 42 (see the sketch below). TBD: what should happen if any one value fails to cast?
   3. Converting casts. These are the lossy ones, e.g. converting 3.14f64 to 3i32, or 3i32 to string. I'm guessing we don't want to mess with these?
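
   A minimal sketch of the per-value check for level 2, assuming we only accept values that round-trip exactly (the helper name is made up):
   ```rust
   /// Accept an i64 variant value for an Int32 output only if it fits exactly;
   /// otherwise we hit the TBD policy above (error vs. substituting null).
   fn narrow_i64_to_i32(v: i64) -> Option<i32> {
       i32::try_from(v).ok() // Some(42) for 42i64, None for i64::MAX
   }
   ```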



##########
parquet-variant-compute/src/variant_get/output/primitive.rs:
##########
@@ -0,0 +1,171 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::variant_get::output::OutputBuilder;
+use crate::VariantArray;
+use arrow::error::Result;
+
+use arrow::array::{
+    Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, 
NullBufferBuilder,
+    PrimitiveArray,
+};
+use arrow::compute::{cast_with_options, CastOptions};
+use arrow::datatypes::Int32Type;
+use arrow_schema::{ArrowError, FieldRef};
+use parquet_variant::{Variant, VariantPath};
+use std::marker::PhantomData;
+use std::sync::Arc;
+
+/// Trait for Arrow primitive types that can be used in the output builder
+///
+/// This just exists to add a generic way to convert from Variant to the 
primitive type
+pub(super) trait ArrowPrimitiveVariant: ArrowPrimitiveType {
+    /// Try to extract the primitive value from a Variant, returning None if it
+    /// cannot be converted
+    ///
+    /// TODO: figure out how to handle coercion/casting
+    fn from_variant(variant: &Variant) -> Option<Self::Native>;
+}
+
+/// Outputs Primitive arrays
+pub(super) struct PrimitiveOutputBuilder<'a, T: ArrowPrimitiveVariant> {
+    /// What path to extract
+    path: VariantPath<'a>,
+    /// Returned output type
+    as_type: FieldRef,
+    /// Controls the casting behavior (e.g. error vs substituting null on cast 
error).
+    cast_options: CastOptions<'a>,
+    /// Phantom data for the primitive type
+    _phantom: PhantomData<T>,
+}
+
+impl<'a, T: ArrowPrimitiveVariant> PrimitiveOutputBuilder<'a, T> {
+    pub(super) fn new(
+        path: VariantPath<'a>,
+        as_type: FieldRef,
+        cast_options: CastOptions<'a>,
+    ) -> Self {
+        Self {
+            path,
+            as_type,
+            cast_options,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for 
PrimitiveOutputBuilder<'a, T> {
+    fn partially_shredded(
+        &self,
+        variant_array: &VariantArray,
+        _metadata: &BinaryViewArray,
+        _value_field: &BinaryViewArray,
+        typed_value: &ArrayRef,
+    ) -> arrow::error::Result<ArrayRef> {
+        // build up the output array element by element

Review Comment:
   It took a minute to realize that "partially shredded" here does _not_ refer 
to the [variant shredding 
spec](https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding)
 definition:
   > An object is partially shredded when the `value` is an object and the 
`typed_value` is a shredded object. Writers must not produce data where both 
`value` and `typed_value` are non-null, unless the Variant value is an object.
   
   and
   > The value column of a partially shredded object must never contain fields 
represented by the Parquet columns in `typed_value` (shredded fields). Readers 
may always assume that data is written correctly and that shredded fields in 
`typed_value` are not present in `value`.
   
   This function seems to actually refer to "imperfectly shredded" columns, 
where we need a `value` column as a fallback for input values that fail to 
shred? As opposed to "perfectly shredded" columns that only need a 
`typed_value` column (or where the `value` column is all-null).



##########
parquet-variant-compute/src/variant_get/mod.rs:
##########
@@ -0,0 +1,430 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use arrow::{
+    array::{Array, ArrayRef},
+    compute::CastOptions,
+    error::Result,
+};
+use arrow_schema::{ArrowError, FieldRef};
+use parquet_variant::VariantPath;
+
+use crate::variant_array::ShreddingState;
+use crate::variant_get::output::instantiate_output_builder;
+use crate::VariantArray;
+
+mod output;
+
+/// Returns an array with the specified path extracted from the variant values.
+///
+/// The return array type depends on the `as_type` field of the options 
parameter
+/// 1. `as_type: None`: a VariantArray is returned. The values in this new 
VariantArray will point
+///    to the specified path.
+/// 2. `as_type: Some(<specific field>)`: an array of the specified type is 
returned.
+pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
+    let variant_array: &VariantArray = 
input.as_any().downcast_ref().ok_or_else(|| {
+        ArrowError::InvalidArgumentError(
+            "expected a VariantArray as the input for variant_get".to_owned(),
+        )
+    })?;
+
+    // Create the output writer based on the specified output options
+    let output_builder = instantiate_output_builder(options.clone())?;
+
+    // Dispatch based on the shredding state of the input variant array
+    match variant_array.shredding_state() {
+        ShreddingState::PartiallyShredded {
+            metadata,
+            value,
+            typed_value,
+        } => output_builder.partially_shredded(variant_array, metadata, value, 
typed_value),
+        ShreddingState::FullyShredded {
+            metadata,
+            typed_value,
+        } => output_builder.fully_shredded(variant_array, metadata, 
typed_value),
+        ShreddingState::Unshredded { metadata, value } => {
+            output_builder.unshredded(variant_array, metadata, value)
+        }
+    }
+}
+
+/// Controls the action of the variant_get kernel.
+#[derive(Debug, Clone, Default)]
+pub struct GetOptions<'a> {
+    /// What path to extract
+    pub path: VariantPath<'a>,
+    /// if `as_type` is None, the returned array will itself be a VariantArray.
+    ///
+    /// if `as_type` is `Some(type)` the field is returned as the specified 
type.
+    pub as_type: Option<FieldRef>,

Review Comment:
   Presumably `None` and `Some(VariantType)` are equivalent?
   
   If so, should we just default-initialize `as_type: FieldRef` to 
`VariantType`?



##########
parquet-variant-compute/src/variant_get/output/mod.rs:
##########


Review Comment:
   Currently, the output builder seems to be fully column-oriented -- it 
assumes that all values for each leaf column are extracted in a tight loop. 
This can work for primitive builders, but nested builders will quickly run into 
pathing and efficiency problems.
   
   I think we'll need to do something similar to the JSON builder, with a 
row-oriented approach where each level of a nested builder receives an 
already-constructed `Variant` for the current row and does a field extract for 
each child builder; child builders can then cast the result directly or recurse 
further as needed (based on their own type). And then the top-level builder 
call would construct a `Variant` for each row to kick-start the process.
   
   But see the other comment -- to the extent that the shredding aligns nicely, we can hoist a subset of this per-row pathing from the append method up to columnar pathing in the builder's constructor and finalizer.
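
   To make the row-oriented idea concrete, a rough sketch of the kind of trait I have in mind (names are illustrative, not an existing arrow-rs API):
   ```rust
   use arrow::array::ArrayRef;
   use arrow_schema::ArrowError;
   use parquet_variant::Variant;

   /// One builder per output column/level; the parent hands each child an
   /// already-extracted `Variant` (or None for null) for the current row.
   trait RowOrientedBuilder {
       /// Extract this builder's field from `value`, cast it or recurse into
       /// child builders, and append the result for one row.
       fn append_row(&mut self, value: Option<&Variant>) -> Result<(), ArrowError>;

       /// Finish all rows and produce the output array.
       fn finish(&mut self) -> Result<ArrayRef, ArrowError>;
   }
   ```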



##########
parquet-variant-compute/src/variant_get/output/primitive.rs:
##########
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::variant_get::output::OutputBuilder;
+use crate::VariantArray;
+use arrow::error::Result;
+
+use arrow::array::{
+    Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, 
NullBufferBuilder,
+    PrimitiveArray,
+};
+use arrow::compute::{cast_with_options, CastOptions};
+use arrow::datatypes::Int32Type;
+use arrow_schema::{ArrowError, FieldRef};
+use parquet_variant::{Variant, VariantPath};
+use std::marker::PhantomData;
+use std::sync::Arc;
+
+/// Trait for Arrow primitive types that can be used in the output builder
+///
+/// This just exists to add a generic way to convert from Variant to the 
primitive type
+pub(super) trait ArrowPrimitiveVariant: ArrowPrimitiveType {
+    /// Try to extract the primitive value from a Variant, returning None if it
+    /// cannot be converted
+    ///
+    /// TODO: figure out how to handle coercion/casting
+    fn from_variant(variant: &Variant) -> Option<Self::Native>;
+}
+
+/// Outputs Primitive arrays
+pub(super) struct PrimitiveOutputBuilder<'a, T: ArrowPrimitiveVariant> {
+    /// What path to extract
+    path: VariantPath<'a>,
+    /// Returned output type
+    as_type: FieldRef,
+    /// Controls the casting behavior (e.g. error vs substituting null on cast 
error).
+    cast_options: CastOptions<'a>,
+    /// Phantom data for the primitive type
+    _phantom: PhantomData<T>,
+}
+
+impl<'a, T: ArrowPrimitiveVariant> PrimitiveOutputBuilder<'a, T> {
+    pub(super) fn new(
+        path: VariantPath<'a>,
+        as_type: FieldRef,
+        cast_options: CastOptions<'a>,
+    ) -> Self {
+        Self {
+            path,
+            as_type,
+            cast_options,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for 
PrimitiveOutputBuilder<'a, T> {
+    fn partially_shredded(
+        &self,
+        variant_array: &VariantArray,
+        _metadata: &BinaryViewArray,
+        _value_field: &BinaryViewArray,
+        typed_value: &ArrayRef,
+    ) -> arrow::error::Result<ArrayRef> {
+        // build up the output array element by element
+        let mut nulls = NullBufferBuilder::new(variant_array.len());
+        let mut values = Vec::with_capacity(variant_array.len());
+        let typed_value =
+            cast_with_options(typed_value, self.as_type.data_type(), 
&self.cast_options)?;
+        // downcast to the primitive array (e.g. Int32Array, Float64Array, etc)
+        let typed_value = typed_value.as_primitive::<T>();
+
+        for i in 0..variant_array.len() {
+            if variant_array.is_null(i) {
+                nulls.append_null();
+                values.push(T::default_value()); // not used, placeholder
+                continue;
+            }
+
+            // if the typed value is null, decode the variant and extract the 
value
+            if typed_value.is_null(i) {
+                // todo follow path

Review Comment:
   Ah, I think I see -- for a strongly-typed builder, the caller could do all 
the pathing and just pass in the correct leaf column here. But this is variant, 
so we'd actually need to extract the variant leaf value (using a path) on a 
per-row basis. We'd either have to do _all_ the pathing here (recursively), or the caller would have to extract the struct/array once and recurse on its fields. And the latter would require a row-oriented builder, whereas this builder seems to be column-oriented.
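
   For illustration only, the per-row recursion I mean, written over a toy stand-in type (this is NOT the `parquet_variant` API, just the shape of the recursion a nested builder would drive):
   ```rust
   use std::collections::HashMap;

   /// Toy stand-in for a variant value.
   enum ToyVariant {
       Int(i32),
       Object(HashMap<String, ToyVariant>),
   }

   /// Walk a field path one row at a time; each level peels one path element,
   /// which is what a row-oriented nested builder would do per child.
   fn follow_path<'a>(mut v: &'a ToyVariant, path: &[&str]) -> Option<&'a ToyVariant> {
       for field in path {
           match v {
               ToyVariant::Object(fields) => v = fields.get(*field)?,
               ToyVariant::Int(_) => return None, // path goes deeper than the value
           }
       }
       Some(v)
   }
   ```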



##########
parquet-variant-compute/src/variant_get/mod.rs:
##########
@@ -0,0 +1,430 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use arrow::{
+    array::{Array, ArrayRef},
+    compute::CastOptions,
+    error::Result,
+};
+use arrow_schema::{ArrowError, FieldRef};
+use parquet_variant::VariantPath;
+
+use crate::variant_array::ShreddingState;
+use crate::variant_get::output::instantiate_output_builder;
+use crate::VariantArray;
+
+mod output;
+
+/// Returns an array with the specified path extracted from the variant values.
+///
+/// The return array type depends on the `as_type` field of the options 
parameter
+/// 1. `as_type: None`: a VariantArray is returned. The values in this new 
VariantArray will point
+///    to the specified path.
+/// 2. `as_type: Some(<specific field>)`: an array of the specified type is 
returned.
+pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
+    let variant_array: &VariantArray = 
input.as_any().downcast_ref().ok_or_else(|| {
+        ArrowError::InvalidArgumentError(
+            "expected a VariantArray as the input for variant_get".to_owned(),
+        )
+    })?;
+
+    // Create the output writer based on the specified output options
+    let output_builder = instantiate_output_builder(options.clone())?;
+
+    // Dispatch based on the shredding state of the input variant array
+    match variant_array.shredding_state() {

Review Comment:
   One (big) complexity here -- the requested type could differ wildly from how 
the input data are currently shredded.
   
   Pathing-wise, we'd need to drill as deeply as possible into the variant 
array's physical shredding schema, following the requested path. In the happy 
path, the requested path was shredded and we just have to deal with the 
corresponding `(value, typed_value)` pair of columns. 
   
   <details>
   <summary>Example</summary>
   
   Suppose we're trying to extract `v:a.b::INT` from a `VariantArray` with the 
following physical layout:
   ```
   v: VARIANT {
       metadata: BINARY,
       value: BINARY,
       typed_value: STRUCT {
           a: SHREDDED_VARIANT {
               value: BINARY,
               typed_value: STRUCT {
                   b: SHREDDED_VARIANT {
                       value: BINARY,
                       typed_value: INT,
                   },
               },
           },
       },
   }
   ```
   
   Then we're actually doing `v.typed_value.a.typed_value.b::INT` -- the 
builder only has to worry about reconciling 
`v.typed_value.a.typed_value.b.value` with 
`v.typed_value.a.typed_value.b.typed_value` (which can be passed in directly at 
construction time)
   
   This case has an even easier example, if the caller requested `v:a` (an 
untyped variant extract) -- in that case we simply combine `v.typed_value.a` 
with `v.metadata` into a new VariantArray -- no casting needed because 
`VariantArray` can capture the full structure `v:a` no matter how it was 
physically shredded.
   
   </details>
   
   In a less-happy path, we will hit a partially shredded object where the 
requested path element is _not_ available in `typed_value`, and we must go 
row-oriented from there (with the remaining path extraction performed on a 
per-row basis):
   
   <details>
   <summary>Example</summary>
   
   Suppose we're trying to extract `v:a.b::INT` from a `VariantArray` with the 
following physical layout:
   ```
   v: VARIANT {
       metadata: BINARY,
       value: BINARY,
       typed_value: STRUCT {
           a: SHREDDED_VARIANT {
               value: BINARY,
               typed_value: STRUCT {
                   x: SHREDDED_VARIANT {
                       value: BINARY,
                       typed_value: INT,
                   },
               },
           },
       },
   }
   ```
   
   Then we're actually doing `v.typed_value.a:b::INT` -- we have to pass 
`v.typed_value.a.value` into the builder, which has to extract field `b` from 
every row.
   
   </details>
   
   In an unhappy path, the partial shredding of the input disagrees with the 
output's requested shredding, and we have to build up a struct from both 
shredded and unshredded paths.
   
   <details>
   <summary>Example</summary>
   
   Suppose the requested schema asks for both `v:a.b::INT` _and_ `v:a.x::INT`, 
when the VariantArray's layout is:
   ```
   v: VARIANT {
       metadata: BINARY,
       value: BINARY,
       typed_value: STRUCT {
           a: SHREDDED_VARIANT {
               value: BINARY,
               typed_value: STRUCT {
                   b: SHREDDED_VARIANT {
                       value: BINARY,
                       typed_value: INT,
                   },
               },
           },
       },
   }
   ```
    Now we have to actually build the struct `v.a` with two fields by unioning the shredded (`v.typed_value.a.typed_value.b::INT`) and unshredded (`v.typed_value.a.value:x::INT`) bits.
   
   </details>
   
   In a _really_ unhappy case, the caller requested a shredded variant type 
that disagrees with the input data's shredding. So some fields have to be 
shredded and others unshredded.
   
   <details>
   <summary>Example</summary>
   
    Suppose we're trying to extract `v:a` as a partially shredded variant
with `x` shredded and `b` _not_ shredded... from a `VariantArray` where `b` is 
shredded and `x` is _not_ shredded:
   ```
   v: VARIANT {
       metadata: BINARY,
       value: BINARY,
       typed_value: STRUCT {
           a: SHREDDED_VARIANT {
               value: BINARY,
               typed_value: STRUCT {
                   b: SHREDDED_VARIANT {
                       value: BINARY,
                       typed_value: INT,
                   },
               },
           },
       },
   }
   ```
   
   Then not only would we need to path into 
`v.typed_value.a.typed_value.b::VARIANT` (which requires unshredding it and 
which produces an `o.typed_value.a.value_b` column), we'd also have to path 
into `v.typed_value.a.value::INT` (which requires shredding it and which 
contributes to _both_ `o.typed_value.a.value_x` _and_ 
`o.typed_value.a.typed_value` columns). And then we have to variant-union 
`o.typed_value.a.value_b` and `o.typed_value.a.value_x` columns into a single
`o.typed_value.a.value` column, and pack everything into a new `VariantArray`.
   
   </details>



##########
parquet-variant-compute/src/variant_get/mod.rs:
##########
@@ -0,0 +1,430 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use arrow::{
+    array::{Array, ArrayRef},
+    compute::CastOptions,
+    error::Result,
+};
+use arrow_schema::{ArrowError, FieldRef};
+use parquet_variant::VariantPath;
+
+use crate::variant_array::ShreddingState;
+use crate::variant_get::output::instantiate_output_builder;
+use crate::VariantArray;
+
+mod output;
+
+/// Returns an array with the specified path extracted from the variant values.
+///
+/// The return array type depends on the `as_type` field of the options 
parameter
+/// 1. `as_type: None`: a VariantArray is returned. The values in this new 
VariantArray will point
+///    to the specified path.
+/// 2. `as_type: Some(<specific field>)`: an array of the specified type is 
returned.
+pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
+    let variant_array: &VariantArray = 
input.as_any().downcast_ref().ok_or_else(|| {
+        ArrowError::InvalidArgumentError(
+            "expected a VariantArray as the input for variant_get".to_owned(),
+        )
+    })?;
+
+    // Create the output writer based on the specified output options
+    let output_builder = instantiate_output_builder(options.clone())?;
+
+    // Dispatch based on the shredding state of the input variant array
+    match variant_array.shredding_state() {
+        ShreddingState::PartiallyShredded {
+            metadata,
+            value,
+            typed_value,
+        } => output_builder.partially_shredded(variant_array, metadata, value, 
typed_value),
+        ShreddingState::FullyShredded {
+            metadata,
+            typed_value,
+        } => output_builder.fully_shredded(variant_array, metadata, 
typed_value),
+        ShreddingState::Unshredded { metadata, value } => {
+            output_builder.unshredded(variant_array, metadata, value)
+        }
+    }
+}
+
+/// Controls the action of the variant_get kernel.
+#[derive(Debug, Clone, Default)]
+pub struct GetOptions<'a> {
+    /// What path to extract
+    pub path: VariantPath<'a>,
+    /// if `as_type` is None, the returned array will itself be a VariantArray.
+    ///
+    /// if `as_type` is `Some(type)` the field is returned as the specified 
type.
+    pub as_type: Option<FieldRef>,
+    /// Controls the casting behavior (e.g. error vs substituting null on cast 
error).
+    pub cast_options: CastOptions<'a>,
+}
+
+impl<'a> GetOptions<'a> {
+    /// Construct default options to get the specified path as a variant.
+    pub fn new() -> Self {
+        Default::default()
+    }
+
+    /// Construct options to get the specified path as a variant.
+    pub fn new_with_path(path: VariantPath<'a>) -> Self {

Review Comment:
   I guess path is optional (defaults to empty) because a leaf extract of a 
perfectly shredded variant doesn't need any internal pathing -- whoever 
constructs the builder just passes the leaf columns and the builder only needs 
to cast them appropriately.



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -186,13 +340,11 @@ impl Array for VariantArray {
     }
 
     fn slice(&self, offset: usize, length: usize) -> ArrayRef {
-        let slice = self.inner.slice(offset, length);
-        let met = self.metadata_ref.slice(offset, length);
-        let val = self.value_ref.slice(offset, length);
+        let inner = self.inner.slice(offset, length);
+        let shredding_state = self.shredding_state.slice(offset, length);
         Arc::new(Self {

Review Comment:
   Out of curiosity, would we actually expect a redundant `slice` call to be 
notably more expensive than `clone`? 
   Seems like they'd do the same thing?
   
   <details>
   
   For example, `BooleanArray::slice` is:
   ```rust
       pub fn slice(&self, offset: usize, length: usize) -> Self {
           Self {
               values: self.values.slice(offset, length),
               nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
           }
       }
   ```
   with `BooleanBuffer::slice`:
   ```rust
       pub fn slice(&self, offset: usize, len: usize) -> Self {
           assert!(
               offset.saturating_add(len) <= self.len,
               "the length + offset of the sliced BooleanBuffer cannot exceed 
the existing length"
           );
           Self {
               buffer: self.buffer.clone(),
               offset: self.offset + offset,
               len,
           }
       }
   ```
   
   Assuming the compiler inlines things as aggressively as it usually does, it 
seems like the net difference would be the bounds check and other offset 
arithmetic.
   
   </details>
   



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -93,35 +88,45 @@ impl VariantArray {
                 "Invalid VariantArray: requires StructArray as 
input".to_string(),
             ));
         };
-        // Ensure the StructArray has a metadata field of BinaryView
 
-        let Some(metadata_field) = VariantArray::find_metadata_field(inner) 
else {
+        // Note the specification allows for any order so we must search by 
name
+
+        // Ensure the StructArray has a metadata field of BinaryView
+        let Some(metadata_field) = inner.column_by_name("metadata") else {
             return Err(ArrowError::InvalidArgumentError(
                 "Invalid VariantArray: StructArray must contain a 'metadata' 
field".to_string(),
             ));
         };
-        if metadata_field.data_type() != &DataType::BinaryView {
+        let Some(metadata) = metadata_field.as_binary_view_opt() else {
             return Err(ArrowError::NotYetImplemented(format!(
                 "VariantArray 'metadata' field must be BinaryView, got {}",
                 metadata_field.data_type()
             )));
-        }
-        let Some(value_field) = VariantArray::find_value_field(inner) else {
-            return Err(ArrowError::InvalidArgumentError(
-                "Invalid VariantArray: StructArray must contain a 'value' 
field".to_string(),
-            ));
         };
-        if value_field.data_type() != &DataType::BinaryView {
-            return Err(ArrowError::NotYetImplemented(format!(
-                "VariantArray 'value' field must be BinaryView, got {}",
-                value_field.data_type()
-            )));
-        }
+
+        // Find the value field, if present
+        let value_field = inner.column_by_name("value");
+        let value = value_field
+            .map(|v| match v.as_binary_view_opt() {
+                Some(bv) => Ok(bv),
+                None => Err(ArrowError::NotYetImplemented(format!(
+                    "VariantArray 'value' field must be BinaryView, got {}",
+                    v.data_type()
+                ))),
+            })
+            .transpose()?;

Review Comment:
   Oops, I meant `ok_or_else`...
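
   i.e. something like this (a minimal drop-in for the `.map(...).transpose()` shown above; note there is no trailing semicolon inside the closure, so it actually returns the error):
   ```rust
   let value = value_field
       .map(|v| {
           v.as_binary_view_opt().ok_or_else(|| {
               ArrowError::NotYetImplemented(format!(
                   "VariantArray 'value' field must be BinaryView, got {}",
                   v.data_type()
               ))
           })
       })
       .transpose()?;
   ```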



##########
parquet-variant-compute/src/variant_get/mod.rs:
##########
@@ -0,0 +1,430 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use arrow::{
+    array::{Array, ArrayRef},
+    compute::CastOptions,
+    error::Result,
+};
+use arrow_schema::{ArrowError, FieldRef};
+use parquet_variant::VariantPath;
+
+use crate::variant_array::ShreddingState;
+use crate::variant_get::output::instantiate_output_builder;
+use crate::VariantArray;
+
+mod output;
+
+/// Returns an array with the specified path extracted from the variant values.
+///
+/// The return array type depends on the `as_type` field of the options 
parameter
+/// 1. `as_type: None`: a VariantArray is returned. The values in this new 
VariantArray will point
+///    to the specified path.
+/// 2. `as_type: Some(<specific field>)`: an array of the specified type is 
returned.
+pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
+    let variant_array: &VariantArray = 
input.as_any().downcast_ref().ok_or_else(|| {
+        ArrowError::InvalidArgumentError(
+            "expected a VariantArray as the input for variant_get".to_owned(),
+        )
+    })?;
+
+    // Create the output writer based on the specified output options
+    let output_builder = instantiate_output_builder(options.clone())?;
+
+    // Dispatch based on the shredding state of the input variant array
+    match variant_array.shredding_state() {
+        ShreddingState::PartiallyShredded {
+            metadata,
+            value,
+            typed_value,
+        } => output_builder.partially_shredded(variant_array, metadata, value, 
typed_value),
+        ShreddingState::FullyShredded {
+            metadata,
+            typed_value,
+        } => output_builder.fully_shredded(variant_array, metadata, 
typed_value),
+        ShreddingState::Unshredded { metadata, value } => {
+            output_builder.unshredded(variant_array, metadata, value)
+        }
+    }
+}
+
+/// Controls the action of the variant_get kernel.
+#[derive(Debug, Clone, Default)]
+pub struct GetOptions<'a> {
+    /// What path to extract
+    pub path: VariantPath<'a>,
+    /// if `as_type` is None, the returned array will itself be a VariantArray.
+    ///
+    /// if `as_type` is `Some(type)` the field is returned as the specified 
type.
+    pub as_type: Option<FieldRef>,

Review Comment:
   Oh... but variant is not a single type... every possible shredding (or lack 
thereof) is a different physical `DataType`. So by passing `None`, we request 
to keep whatever structure the underlying path already has; by passing 
`Some(... variant type...)` we're requesting to [un|re]shred as necessary to 
make it fit the new shredding schema.
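
   In code terms, a hedged sketch using only the names shown in this diff (`VariantPath` and `CastOptions` must implement `Default` for the `#[derive(Default)]` on `GetOptions` to compile, so the defaults below should exist):
   ```rust
   use arrow::array::ArrayRef;
   use arrow::compute::CastOptions;
   use arrow::error::Result;
   use parquet_variant::VariantPath;
   // GetOptions / variant_get as introduced in this module

   /// Keep whatever physical shredding the input already has.
   fn keep_existing_structure(input: &ArrayRef) -> Result<ArrayRef> {
       let options = GetOptions {
           path: VariantPath::default(), // empty path = the root
           as_type: None,                // no re/un-shredding requested
           cast_options: CastOptions::default(),
       };
       variant_get(input, options)
   }
   ```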



##########
parquet-variant-compute/src/variant_get/output/primitive.rs:
##########
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::variant_get::output::OutputBuilder;
+use crate::VariantArray;
+use arrow::error::Result;
+
+use arrow::array::{
+    Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, 
NullBufferBuilder,
+    PrimitiveArray,
+};
+use arrow::compute::{cast_with_options, CastOptions};
+use arrow::datatypes::Int32Type;
+use arrow_schema::{ArrowError, FieldRef};
+use parquet_variant::{Variant, VariantPath};
+use std::marker::PhantomData;
+use std::sync::Arc;
+
+/// Trait for Arrow primitive types that can be used in the output builder
+///
+/// This just exists to add a generic way to convert from Variant to the 
primitive type
+pub(super) trait ArrowPrimitiveVariant: ArrowPrimitiveType {
+    /// Try to extract the primitive value from a Variant, returning None if it
+    /// cannot be converted
+    ///
+    /// TODO: figure out how to handle coercion/casting
+    fn from_variant(variant: &Variant) -> Option<Self::Native>;
+}
+
+/// Outputs Primitive arrays
+pub(super) struct PrimitiveOutputBuilder<'a, T: ArrowPrimitiveVariant> {
+    /// What path to extract
+    path: VariantPath<'a>,
+    /// Returned output type
+    as_type: FieldRef,
+    /// Controls the casting behavior (e.g. error vs substituting null on cast 
error).
+    cast_options: CastOptions<'a>,
+    /// Phantom data for the primitive type
+    _phantom: PhantomData<T>,
+}
+
+impl<'a, T: ArrowPrimitiveVariant> PrimitiveOutputBuilder<'a, T> {
+    pub(super) fn new(
+        path: VariantPath<'a>,
+        as_type: FieldRef,
+        cast_options: CastOptions<'a>,
+    ) -> Self {
+        Self {
+            path,
+            as_type,
+            cast_options,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for 
PrimitiveOutputBuilder<'a, T> {
+    fn partially_shredded(
+        &self,
+        variant_array: &VariantArray,
+        _metadata: &BinaryViewArray,
+        _value_field: &BinaryViewArray,
+        typed_value: &ArrayRef,
+    ) -> arrow::error::Result<ArrayRef> {
+        // build up the output array element by element
+        let mut nulls = NullBufferBuilder::new(variant_array.len());
+        let mut values = Vec::with_capacity(variant_array.len());
+        let typed_value =
+            cast_with_options(typed_value, self.as_type.data_type(), 
&self.cast_options)?;
+        // downcast to the primitive array (e.g. Int32Array, Float64Array, etc)
+        let typed_value = typed_value.as_primitive::<T>();
+
+        for i in 0..variant_array.len() {
+            if variant_array.is_null(i) {
+                nulls.append_null();
+                values.push(T::default_value()); // not used, placeholder
+                continue;
+            }
+
+            // if the typed value is null, decode the variant and extract the 
value
+            if typed_value.is_null(i) {
+                // todo follow path

Review Comment:
   Why would there be a path at all for this primitive output builder, sorry? 
   Seems like that only matters for array and object builders?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

