Samyak2 commented on code in PR #8021:
URL: https://github.com/apache/arrow-rs/pull/8021#discussion_r2249924930


##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -79,12 +143,22 @@ impl VariantArray {
     /// # Errors:
     /// - If the `StructArray` does not contain the required fields
     ///
-    /// # Current support
-    /// This structure does not (yet) support the full Arrow Variant Array 
specification.
+    /// # Requirements of the `StructArray`
+    ///
+    /// 1. A required field named `metadata` which is binary, large_binary, or
+    ///    binary_view
     ///
-    /// Only `StructArrays` with `metadata` and `value` fields that are
-    /// [`BinaryViewArray`] are supported. Shredded values are not currently 
supported
-    /// nor are using types other than `BinaryViewArray`
+    /// 2. An optional field named `value` that is binary, large_binary, or
+    ///    binary_view
+    ///
+    /// 3. An optional field named `typed_value` which can be any primitive 
type
+    ///    or be a list, large_list, list_view or struct
+    ///
+    /// NOTE: It is also permissible for the metadata field to be
+    /// Dictionary-Encoded, preferably (but not required) with an index type of
+    /// int8.
+    ///
+    /// Currently, only  [`BinaryViewArray`] are supported.

Review Comment:
   Double space is intentional?
   ```suggestion
       /// Currently, only [`BinaryViewArray`] are supported.
   ```



##########
parquet-variant-compute/src/variant_get/output/primitive.rs:
##########
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::variant_get::output::OutputBuilder;
+use crate::VariantArray;
+use arrow::error::Result;
+
+use arrow::array::{
+    Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, 
NullBufferBuilder,
+    PrimitiveArray,
+};
+use arrow::compute::{cast_with_options, CastOptions};
+use arrow::datatypes::Int32Type;
+use arrow_schema::{ArrowError, FieldRef};
+use parquet_variant::{Variant, VariantPath};
+use std::marker::PhantomData;
+use std::sync::Arc;
+
+/// Trait for Arrow primitive types that can be used in the output builder
+///
+/// This just exists to add a generic way to convert from Variant to the 
primitive type
+pub(super) trait ArrowPrimitiveVariant: ArrowPrimitiveType {
+    /// Try to extract the primitive value from a Variant, returning None if it
+    /// cannot be converted
+    ///
+    /// TODO: figure out how to handle coercion/casting
+    fn from_variant(variant: &Variant) -> Option<Self::Native>;
+}
+
+/// Outputs Primitive arrays
+pub(super) struct PrimitiveOutputBuilder<'a, T: ArrowPrimitiveVariant> {
+    /// What path to extract
+    path: VariantPath<'a>,
+    /// Returned output type
+    as_type: FieldRef,
+    /// Controls the casting behavior (e.g. error vs substituting null on cast 
error).
+    cast_options: CastOptions<'a>,
+    /// Phantom data for the primitive type
+    _phantom: PhantomData<T>,
+}
+
+impl<'a, T: ArrowPrimitiveVariant> PrimitiveOutputBuilder<'a, T> {
+    pub(super) fn new(
+        path: VariantPath<'a>,
+        as_type: FieldRef,
+        cast_options: CastOptions<'a>,
+    ) -> Self {
+        Self {
+            path,
+            as_type,
+            cast_options,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for 
PrimitiveOutputBuilder<'a, T> {
+    fn partially_shredded(
+        &self,
+        variant_array: &VariantArray,
+        _metadata: &BinaryViewArray,
+        _value_field: &BinaryViewArray,
+        typed_value: &ArrayRef,
+    ) -> arrow::error::Result<ArrayRef> {
+        // build up the output array element by element
+        let mut nulls = NullBufferBuilder::new(variant_array.len());
+        let mut values = Vec::with_capacity(variant_array.len());
+        let typed_value =
+            cast_with_options(typed_value, self.as_type.data_type(), 
&self.cast_options)?;
+        // downcast to the primitive array (e.g. Int32Array, Float64Array, etc)
+        let typed_value = typed_value.as_primitive::<T>();
+
+        for i in 0..variant_array.len() {
+            if variant_array.is_null(i) {
+                nulls.append_null();
+                values.push(T::default_value()); // not used, placeholder
+                continue;
+            }
+
+            // if the typed value is null, decode the variant and extract the 
value
+            if typed_value.is_null(i) {
+                // todo follow path

Review Comment:
   I have one concern with this. I think the path-ing should be done 
independent of the output builder. Paths are only valid on objects/lists inside 
the variant. I don't see why every type's output builder needs to be concerned 
with the path.
   
   Even for shredded objects, we can bring the nested value/typed_value fields 
up and pass it along like a top-level shredded variant array.
   
   What do you think? Let me know if I missed something (I wasn't actively 
following the discussions around this).



##########
parquet-variant-compute/src/variant_get/mod.rs:
##########
@@ -0,0 +1,431 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use arrow::{
+    array::{Array, ArrayRef},
+    compute::CastOptions,
+    error::Result,
+};
+use arrow_schema::{ArrowError, FieldRef};
+use parquet_variant::VariantPath;
+
+use crate::variant_array::ShreddingState;
+use crate::variant_get::output::instantiate_output_builder;
+use crate::VariantArray;
+
+mod output;
+
+/// Returns an array with the specified path extracted from the variant values.
+///
+/// The return array type depends on the `as_type` field of the options 
parameter
+/// 1. `as_type: None`: a VariantArray is returned. The values in this new 
VariantArray will point
+///    to the specified path.
+/// 2. `as_type: Some(<specific field>)`: an array of the specified type is 
returned.
+pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
+    let variant_array: &VariantArray = 
input.as_any().downcast_ref().ok_or_else(|| {
+        ArrowError::InvalidArgumentError(
+            "expected a VariantArray as the input for variant_get".to_owned(),
+        )
+    })?;
+
+    // Create the output writer based on the specified output options
+    let output_builder = instantiate_output_builder(options.clone())?;
+
+    // Dispatch based on the shredding state of the input variant array
+    // TODO make this an enum on VariantArray (e.g ShreddingState)

Review Comment:
   This TODO seems to be done?



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -135,36 +238,87 @@ impl VariantArray {
         self.inner
     }
 
+    /// Return the shredding state of this `VariantArray`
+    pub fn shredding_state(&self) -> &ShreddingState {
+        &self.shredding_state
+    }
+
     /// Return the [`Variant`] instance stored at the given row
     ///
-    /// Panics if the index is out of bounds.
+    /// Consistently with other Arrow arrays types, this API requires you to
+    /// check for nulls first using [`Self::is_valid`].
+    ///
+    /// # Panics
+    /// * if the index is out of bounds
+    /// * if the array value is null
+    ///
+    /// If this is a shredded variant but has no value at the shredded 
location, it
+    /// will return [`Variant::Null`].
+    ///
+    ///
+    /// # Performance Note
+    ///
+    /// This is certainly not the most efficient way to access values in a
+    /// `VariantArray`, but it is useful for testing and debugging.
     ///
     /// Note: Does not do deep validation of the [`Variant`], so it is up to 
the
     /// caller to ensure that the metadata and value were constructed 
correctly.
     pub fn value(&self, index: usize) -> Variant {
-        let metadata = self.metadata_field().as_binary_view().value(index);
-        let value = self.value_field().as_binary_view().value(index);
-        Variant::new(metadata, value)
+        match &self.shredding_state {
+            ShreddingState::Unshredded { metadata, value } => {
+                Variant::new(metadata.value(index), value.value(index))
+            }
+            ShreddingState::FullyShredded {
+                metadata: _,
+                typed_value,
+            } => {
+                if typed_value.is_null(index) {
+                    Variant::Null
+                } else {
+                    typed_value_to_variant(typed_value, index)
+                }
+            }
+            ShreddingState::PartiallyShredded {
+                metadata,
+                value,
+                typed_value,
+            } => {
+                if typed_value.is_null(index) {
+                    Variant::new(metadata.value(index), value.value(index))
+                } else {
+                    typed_value_to_variant(typed_value, index)
+                }
+            }
+        }
     }
 
-    fn find_metadata_field(array: &StructArray) -> Option<ArrayRef> {
-        array.column_by_name("metadata").cloned()
+    /// Return a reference to the metadata field of the [`StructArray`]
+    pub fn metadata_field(&self) -> &BinaryViewArray {
+        self.shredding_state.metadata_field()
     }
 
-    fn find_value_field(array: &StructArray) -> Option<ArrayRef> {
-        array.column_by_name("value").cloned()
+    /// Return a reference to the value field of the `StructArray`
+    pub fn value_field(&self) -> Option<&BinaryViewArray> {
+        self.shredding_state.value_field()
     }
 
-    /// Return a reference to the metadata field of the [`StructArray`]
-    pub fn metadata_field(&self) -> &ArrayRef {
-        // spec says fields order is not guaranteed, so we search by name
-        &self.metadata_ref
+    /// Return a reference to the typed_value field of the `StructArray`, if 
present
+    pub fn typed_value_field(&self) -> Option<&ArrayRef> {
+        self.shredding_state.typed_value_field()
     }
+}
 
-    /// Return a reference to the value field of the `StructArray`
-    pub fn value_field(&self) -> &ArrayRef {
-        // spec says fields order is not guaranteed, so we search by name
-        &self.value_ref
+/// returns the non-null element at index as a Variant
+fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant {
+    match typed_value.data_type() {
+        DataType::Int32 => {
+            let typed_value = typed_value.as_primitive::<Int32Type>();
+            Variant::from(typed_value.value(index))
+        }
+        // todo other types here
+        _ => {
+            todo!(); // Unsupported typed_value type
+        }
     }
 }

Review Comment:
   This could be useful as a (public) function on its own. For example, this 
could be used to cast columns to variant like in `some_column::variant`



##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -93,35 +167,64 @@ impl VariantArray {
                 "Invalid VariantArray: requires StructArray as 
input".to_string(),
             ));
         };
-        // Ensure the StructArray has a metadata field of BinaryView
 
-        let Some(metadata_field) = VariantArray::find_metadata_field(inner) 
else {
+        // Note the specification allows for any order so we must search by 
name
+
+        // Ensure the StructArray has a metadata field of BinaryView
+        let Some(metadata_field) = inner.column_by_name("metadata") else {
             return Err(ArrowError::InvalidArgumentError(
                 "Invalid VariantArray: StructArray must contain a 'metadata' 
field".to_string(),
             ));
         };
-        if metadata_field.data_type() != &DataType::BinaryView {
+        let Some(metadata) = metadata_field.as_binary_view_opt() else {
             return Err(ArrowError::NotYetImplemented(format!(
                 "VariantArray 'metadata' field must be BinaryView, got {}",
                 metadata_field.data_type()
             )));
-        }
-        let Some(value_field) = VariantArray::find_value_field(inner) else {
-            return Err(ArrowError::InvalidArgumentError(
-                "Invalid VariantArray: StructArray must contain a 'value' 
field".to_string(),
-            ));
         };
-        if value_field.data_type() != &DataType::BinaryView {
-            return Err(ArrowError::NotYetImplemented(format!(
-                "VariantArray 'value' field must be BinaryView, got {}",
-                value_field.data_type()
-            )));
-        }
+
+        // Find the value field, if present
+        let value_field = inner.column_by_name("value");
+        let value = value_field
+            .map(|v| match v.as_binary_view_opt() {
+                Some(bv) => Ok(bv),
+                None => Err(ArrowError::NotYetImplemented(format!(
+                    "VariantArray 'value' field must be BinaryView, got {}",
+                    v.data_type()
+                ))),
+            })
+            .transpose()?;
+
+        // Find the typed_value field, if present
+        let typed_value = inner.column_by_name("typed_value");
+
+        // Note these clones are cheap, they just bump the ref count
+        let inner = inner.clone();
+        let metadata = metadata.clone();
+        let value = value.cloned();
+        let typed_value = typed_value.cloned();
+
+        let shredding_state = match (metadata, value, typed_value) {
+            (metadata, Some(value), Some(typed_value)) => 
ShreddingState::PartiallyShredded {
+                metadata,
+                value,
+                typed_value,
+            },
+            (metadata, Some(value), None) => ShreddingState::Unshredded { 
metadata, value },
+            (metadata, None, Some(typed_value)) => 
ShreddingState::FullyShredded {
+                metadata,
+                typed_value,
+            },
+            (_metadata_field, None, None) => {
+                return Err(ArrowError::InvalidArgumentError(String::from(
+                    "VariantArray has neither value nor typed_value field",
+                )));
+            }
+        };

Review Comment:
   Could this be a `try_new` constructor for `ShreddingState`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to