This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 1b18582bda [Variant] Add try_value/value for VariantArray (#8719)
1b18582bda is described below

commit 1b18582bda44deaf584720047e3446fbcca4a718
Author: Congxian Qiu <[email protected]>
AuthorDate: Thu Oct 30 19:01:26 2025 +0800

    [Variant] Add try_value/value for VariantArray (#8719)
    
    # Which issue does this PR close?
    
    - Closes #8672 .
    
    # What changes are included in this PR?
    
    - Add `try_value/value` function for `VariantArray`
    - Add test for `VariantArray::try_value`
    
    # Are these changes tested?
    
    Covered by existing tests and added new tests
    
    # Are there any user-facing changes?
    
    Yes, add a new function for `VariantArray::try_value`, and the
    `VariantArray::value` changed to panic from returning `Variant::Null` if
    there is some cast error.
---
 parquet-variant-compute/src/type_conversion.rs |  21 +++-
 parquet-variant-compute/src/variant_array.rs   | 157 +++++++++++++++++++------
 parquet-variant-compute/src/variant_get.rs     |  60 +++++++++-
 3 files changed, 200 insertions(+), 38 deletions(-)

diff --git a/parquet-variant-compute/src/type_conversion.rs 
b/parquet-variant-compute/src/type_conversion.rs
index d15664f5af..0716469f76 100644
--- a/parquet-variant-compute/src/type_conversion.rs
+++ b/parquet-variant-compute/src/type_conversion.rs
@@ -194,10 +194,10 @@ macro_rules! non_generic_conversion_single_value {
     ($array:expr, $cast_fn:expr, $index:expr) => {{
         let array = $array;
         if array.is_null($index) {
-            Variant::Null
+            Ok(Variant::Null)
         } else {
             let cast_value = $cast_fn(array.value($index));
-            Variant::from(cast_value)
+            Ok(Variant::from(cast_value))
         }
     }};
 }
@@ -217,6 +217,23 @@ macro_rules! generic_conversion_single_value {
 }
 pub(crate) use generic_conversion_single_value;
 
+macro_rules! generic_conversion_single_value_with_result {
+    ($t:ty, $method:ident, $cast_fn:expr, $input:expr, $index:expr) => {{
+        let arr = $input.$method::<$t>();
+        let v = arr.value($index);
+        match ($cast_fn)(v) {
+            Ok(var) => Ok(Variant::from(var)),
+            Err(e) => Err(ArrowError::CastError(format!(
+                "Cast failed at index {idx} (array type: {ty}): {e}",
+                idx = $index,
+                ty = <$t as ::arrow::datatypes::ArrowPrimitiveType>::DATA_TYPE
+            ))),
+        }
+    }};
+}
+
+pub(crate) use generic_conversion_single_value_with_result;
+
 /// Convert the value at a specific index in the given array into a `Variant`.
 macro_rules! primitive_conversion_single_value {
     ($t:ty, $input:expr, $index:expr) => {{
diff --git a/parquet-variant-compute/src/variant_array.rs 
b/parquet-variant-compute/src/variant_array.rs
index 40fd76b170..0a42b3ab2e 100644
--- a/parquet-variant-compute/src/variant_array.rs
+++ b/parquet-variant-compute/src/variant_array.rs
@@ -18,7 +18,10 @@
 //! [`VariantArray`] implementation
 
 use crate::VariantArrayBuilder;
-use crate::type_conversion::{generic_conversion_single_value, 
primitive_conversion_single_value};
+use crate::type_conversion::{
+    generic_conversion_single_value, 
generic_conversion_single_value_with_result,
+    primitive_conversion_single_value,
+};
 use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
 use arrow::buffer::NullBuffer;
 use arrow::compute::cast;
@@ -27,6 +30,7 @@ use arrow::datatypes::{
     Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, 
Time64MicrosecondType,
     TimestampMicrosecondType, TimestampNanosecondType,
 };
+use arrow::error::Result;
 use arrow_schema::extension::ExtensionType;
 use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
 use chrono::{DateTime, NaiveTime};
@@ -58,11 +62,11 @@ impl ExtensionType for VariantType {
         Some(String::new())
     }
 
-    fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata, 
ArrowError> {
+    fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata> 
{
         Ok("")
     }
 
-    fn supports_data_type(&self, data_type: &DataType) -> Result<(), 
ArrowError> {
+    fn supports_data_type(&self, data_type: &DataType) -> Result<()> {
         if matches!(data_type, DataType::Struct(_)) {
             Ok(())
         } else {
@@ -72,7 +76,7 @@ impl ExtensionType for VariantType {
         }
     }
 
-    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> 
Result<Self, ArrowError> {
+    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> 
Result<Self> {
         Self.supports_data_type(data_type)?;
         Ok(Self)
     }
@@ -249,7 +253,7 @@ impl VariantArray {
     /// int8.
     ///
     /// Currently, only [`BinaryViewArray`] are supported.
-    pub fn try_new(inner: &dyn Array) -> Result<Self, ArrowError> {
+    pub fn try_new(inner: &dyn Array) -> Result<Self> {
         // Workaround lack of support for Binary
         // https://github.com/apache/arrow-rs/issues/8387
         let inner = cast_to_binary_view_arrays(inner)?;
@@ -325,12 +329,32 @@ impl VariantArray {
 
     /// Return the [`Variant`] instance stored at the given row
     ///
-    /// Note: This method does not check for nulls and the value is arbitrary
-    /// (but still well-defined) if [`is_null`](Self::is_null) returns true 
for the index.
+    /// This is a convenience wrapper that calls [`VariantArray::try_value`] 
and unwraps the `Result`.
+    /// Use `try_value` if you need to handle conversion errors gracefully.
     ///
     /// # Panics
     /// * if the index is out of bounds
     /// * if the array value is null
+    /// * if `try_value` returns an error.
+    pub fn value(&self, index: usize) -> Variant<'_, '_> {
+        self.try_value(index).unwrap()
+    }
+
+    /// Return the [`Variant`] instance stored at the given row
+    ///
+    /// Note: This method does not check for nulls and the value is arbitrary
+    /// (but still well-defined) if [`is_null`](Self::is_null) returns true 
for the index.
+    ///
+    /// # Panics
+    ///
+    /// Panics if
+    /// * the index is out of bounds
+    /// * the array value is null
+    ///
+    /// # Errors
+    ///
+    /// Errors if
+    /// - the data in `typed_value` cannot be interpreted as a valid `Variant`
     ///
     /// If this is a shredded variant but has no value at the shredded 
location, it
     /// will return [`Variant::Null`].
@@ -343,7 +367,7 @@ impl VariantArray {
     ///
     /// Note: Does not do deep validation of the [`Variant`], so it is up to 
the
     /// caller to ensure that the metadata and value were constructed 
correctly.
-    pub fn value(&self, index: usize) -> Variant<'_, '_> {
+    pub fn try_value(&self, index: usize) -> Result<Variant<'_, '_>> {
         match (self.typed_value_field(), self.value_field()) {
             // Always prefer typed_value, if available
             (Some(typed_value), value) if typed_value.is_valid(index) => {
@@ -351,11 +375,11 @@ impl VariantArray {
             }
             // Otherwise fall back to value, if available
             (_, Some(value)) if value.is_valid(index) => {
-                Variant::new(self.metadata.value(index), value.value(index))
+                Ok(Variant::new(self.metadata.value(index), 
value.value(index)))
             }
             // It is technically invalid for neither value nor typed_value 
fields to be available,
             // but the spec specifically requires readers to return 
Variant::Null in this case.
-            _ => Variant::Null,
+            _ => Ok(Variant::Null),
         }
     }
 
@@ -603,7 +627,7 @@ impl ShreddedVariantFieldArray {
     ///    or be a list, large_list, list_view or struct
     ///
     /// Currently, only `value` columns of type [`BinaryViewArray`] are 
supported.
-    pub fn try_new(inner: &dyn Array) -> Result<Self, ArrowError> {
+    pub fn try_new(inner: &dyn Array) -> Result<Self> {
         let Some(inner_struct) = inner.as_struct_opt() else {
             return Err(ArrowError::InvalidArgumentError(
                 "Invalid ShreddedVariantFieldArray: requires StructArray as 
input".to_string(),
@@ -835,7 +859,7 @@ impl<'a> BorrowedShreddingState<'a> {
 impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
     type Error = ArrowError;
 
-    fn try_from(inner_struct: &'a StructArray) -> Result<Self, ArrowError> {
+    fn try_from(inner_struct: &'a StructArray) -> Result<Self> {
         // The `value` column need not exist, but if it does it must be a 
binary view.
         let value = if let Some(value_col) = 
inner_struct.column_by_name("value") {
             let Some(binary_view) = value_col.as_binary_view_opt() else {
@@ -856,7 +880,7 @@ impl<'a> TryFrom<&'a StructArray> for 
BorrowedShreddingState<'a> {
 impl TryFrom<&StructArray> for ShreddingState {
     type Error = ArrowError;
 
-    fn try_from(inner_struct: &StructArray) -> Result<Self, ArrowError> {
+    fn try_from(inner_struct: &StructArray) -> Result<Self> {
         Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
     }
 }
@@ -914,34 +938,34 @@ fn typed_value_to_variant<'a>(
     typed_value: &'a ArrayRef,
     value: Option<&BinaryViewArray>,
     index: usize,
-) -> Variant<'a, 'a> {
+) -> Result<Variant<'a, 'a>> {
     let data_type = typed_value.data_type();
     if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && 
v.is_valid(index)) {
         // Only a partially shredded struct is allowed to have values for both 
columns
         panic!("Invalid variant, conflicting value and typed_value");
     }
     match data_type {
-        DataType::Null => Variant::Null,
+        DataType::Null => Ok(Variant::Null),
         DataType::Boolean => {
             let boolean_array = typed_value.as_boolean();
             let value = boolean_array.value(index);
-            Variant::from(value)
+            Ok(Variant::from(value))
         }
         // 16-byte FixedSizeBinary alway corresponds to a UUID; all other 
sizes are illegal.
         DataType::FixedSizeBinary(16) => {
             let array = typed_value.as_fixed_size_binary();
             let value = array.value(index);
-            Uuid::from_slice(value).unwrap().into() // unwrap is safe: slice 
is always 16 bytes
+            Ok(Uuid::from_slice(value).unwrap().into()) // unwrap is safe: 
slice is always 16 bytes
         }
         DataType::BinaryView => {
             let array = typed_value.as_binary_view();
             let value = array.value(index);
-            Variant::from(value)
+            Ok(Variant::from(value))
         }
         DataType::Utf8 => {
             let array = typed_value.as_string::<i32>();
             let value = array.value(index);
-            Variant::from(value)
+            Ok(Variant::from(value))
         }
         DataType::Int8 => {
             primitive_conversion_single_value!(Int8Type, typed_value, index)
@@ -965,28 +989,28 @@ fn typed_value_to_variant<'a>(
             primitive_conversion_single_value!(Float64Type, typed_value, index)
         }
         DataType::Decimal32(_, s) => {
-            generic_conversion_single_value!(
+            generic_conversion_single_value_with_result!(
                 Decimal32Type,
                 as_primitive,
-                |v| VariantDecimal4::try_new(v, *s as 
u8).map_or(Variant::Null, Variant::from),
+                |v| VariantDecimal4::try_new(v, *s as u8),
                 typed_value,
                 index
             )
         }
         DataType::Decimal64(_, s) => {
-            generic_conversion_single_value!(
+            generic_conversion_single_value_with_result!(
                 Decimal64Type,
                 as_primitive,
-                |v| VariantDecimal8::try_new(v, *s as 
u8).map_or(Variant::Null, Variant::from),
+                |v| VariantDecimal8::try_new(v, *s as u8),
                 typed_value,
                 index
             )
         }
         DataType::Decimal128(_, s) => {
-            generic_conversion_single_value!(
+            generic_conversion_single_value_with_result!(
                 Decimal128Type,
                 as_primitive,
-                |v| VariantDecimal16::try_new(v, *s as 
u8).map_or(Variant::Null, Variant::from),
+                |v| VariantDecimal16::try_new(v, *s as u8),
                 typed_value,
                 index
             )
@@ -1001,14 +1025,14 @@ fn typed_value_to_variant<'a>(
             )
         }
         DataType::Time64(TimeUnit::Microsecond) => {
-            generic_conversion_single_value!(
+            generic_conversion_single_value_with_result!(
                 Time64MicrosecondType,
                 as_primitive,
                 |v| NaiveTime::from_num_seconds_from_midnight_opt(
                     (v / 1_000_000) as u32,
                     (v % 1_000_000) as u32 * 1000
                 )
-                .map_or(Variant::Null, Variant::from),
+                .ok_or_else(|| format!("Invalid microsecond from midnight: 
{}", v)),
                 typed_value,
                 index
             )
@@ -1060,7 +1084,7 @@ fn typed_value_to_variant<'a>(
                 "Unsupported typed_value type: {}",
                 typed_value.data_type()
             );
-            Variant::Null
+            Ok(Variant::Null)
         }
     }
 }
@@ -1075,7 +1099,7 @@ fn typed_value_to_variant<'a>(
 /// * `StructArray<metadata: BinaryView, value: BinaryView>`
 ///
 /// So cast them to get the right type.
-fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef, 
ArrowError> {
+fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef> {
     let new_type = canonicalize_and_verify_data_type(array.data_type())?;
     if let Cow::Borrowed(_) = new_type {
         if let Some(array) = array.as_struct_opt() {
@@ -1088,9 +1112,7 @@ fn cast_to_binary_view_arrays(array: &dyn Array) -> 
Result<ArrayRef, ArrowError>
 /// Recursively visits a data type, ensuring that it only contains data types 
that can legally
 /// appear in a (possibly shredded) variant array. It also replaces Binary 
fields with BinaryView,
 /// since that's what comes back from the parquet reader and what the variant 
code expects to find.
-fn canonicalize_and_verify_data_type(
-    data_type: &DataType,
-) -> Result<Cow<'_, DataType>, ArrowError> {
+fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_, 
DataType>> {
     use DataType::*;
 
     // helper macros
@@ -1188,7 +1210,7 @@ fn canonicalize_and_verify_data_type(
     Ok(new_data_type)
 }
 
-fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, 
Arc<Field>>, ArrowError> {
+fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, 
Arc<Field>>> {
     let Cow::Owned(new_data_type) = 
canonicalize_and_verify_data_type(field.data_type())? else {
         return Ok(Cow::Borrowed(field));
     };
@@ -1199,11 +1221,15 @@ fn canonicalize_and_verify_field(field: &Arc<Field>) -> 
Result<Cow<'_, Arc<Field
 #[cfg(test)]
 mod test {
     use crate::VariantArrayBuilder;
+    use std::str::FromStr;
 
     use super::*;
-    use arrow::array::{BinaryViewArray, Int32Array};
+    use arrow::array::{
+        BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array, 
Int32Array,
+        Time64MicrosecondArray,
+    };
     use arrow_schema::{Field, Fields};
-    use parquet_variant::ShortString;
+    use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, ShortString};
 
     #[test]
     fn invalid_not_a_struct_array() {
@@ -1535,4 +1561,65 @@ mod test {
             assert_ne!(v, v_sliced);
         }
     }
+
+    macro_rules! invalid_variant_array_test {
+        ($fn_name: ident, $invalid_typed_value: expr, $error_msg: literal) => {
+            #[test]
+            fn $fn_name() {
+                let metadata = 
BinaryViewArray::from_iter_values(std::iter::repeat_n(
+                    EMPTY_VARIANT_METADATA_BYTES,
+                    1,
+                ));
+                let invalid_typed_value = $invalid_typed_value;
+
+                let struct_array = StructArrayBuilder::new()
+                    .with_field("metadata", Arc::new(metadata), false)
+                    .with_field("typed_value", Arc::new(invalid_typed_value), 
true)
+                    .build();
+
+                let array: VariantArray = VariantArray::try_new(&struct_array)
+                    .expect("should create variant array")
+                    .into();
+
+                let result = array.try_value(0);
+                assert!(result.is_err());
+                let error = result.unwrap_err();
+                assert!(matches!(error, ArrowError::CastError(_)));
+
+                let expected: &str = $error_msg;
+                assert!(
+                    error.to_string().contains($error_msg),
+                    "error `{}` did not contain `{}`",
+                    error,
+                    expected
+                )
+            }
+        };
+    }
+
+    invalid_variant_array_test!(
+        test_variant_array_invalide_time,
+        Time64MicrosecondArray::from(vec![Some(86401000000)]),
+        "Cast error: Cast failed at index 0 (array type: Time64(µs)): Invalid 
microsecond from midnight: 86401000000"
+    );
+
+    invalid_variant_array_test!(
+        test_variant_array_invalid_decimal32,
+        Decimal32Array::from(vec![Some(1234567890)]),
+        "Cast error: Cast failed at index 0 (array type: Decimal32(9, 2)): 
Invalid argument error: 1234567890 is wider than max precision 9"
+    );
+
+    invalid_variant_array_test!(
+        test_variant_array_invalid_decimal64,
+        Decimal64Array::from(vec![Some(1234567890123456789)]),
+        "Cast error: Cast failed at index 0 (array type: Decimal64(18, 6)): 
Invalid argument error: 1234567890123456789 is wider than max precision 18"
+    );
+
+    invalid_variant_array_test!(
+        test_variant_array_invalid_decimal128,
+        Decimal128Array::from(vec![Some(
+            i128::from_str("123456789012345678901234567890123456789").unwrap()
+        ),]),
+        "Cast error: Cast failed at index 0 (array type: Decimal128(38, 10)): 
Invalid argument error: 123456789012345678901234567890123456789 is wider than 
max precision 38"
+    );
 }
diff --git a/parquet-variant-compute/src/variant_get.rs 
b/parquet-variant-compute/src/variant_get.rs
index 1061548933..38c6513961 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -142,8 +142,17 @@ fn shredded_get_path(
             for i in 0..target.len() {
                 if target.is_null(i) {
                     builder.append_null()?;
+                } else if !cast_options.safe {
+                    let value = target.try_value(i)?;
+                    builder.append_value(value)?;
                 } else {
-                    builder.append_value(target.value(i))?;
+                    let _ = match target.try_value(i) {
+                        Ok(v) => builder.append_value(v)?,
+                        Err(_) => {
+                            builder.append_null()?;
+                            false // add this to make match arms have the same 
return type
+                        }
+                    };
                 }
             }
             builder.finish()
@@ -3584,4 +3593,53 @@ mod test {
             "Failed to cast to Decimal256(precision=76, scale=39) from variant 
Decimal16"
         ));
     }
+
+    
perfectly_shredded_variant_array_fn!(perfectly_shredded_invalid_time_variant_array,
 || {
+        // 86401000000 is invalid for Time64Microsecond (max is 86400000000)
+        Time64MicrosecondArray::from(vec![
+            Some(86401000000),
+            Some(86401000000),
+            Some(86401000000),
+        ])
+    });
+
+    #[test]
+    fn test_variant_get_error_when_cast_failure_and_safe_false() {
+        let variant_array = perfectly_shredded_invalid_time_variant_array();
+
+        let field = Field::new("result", 
DataType::Time64(TimeUnit::Microsecond), true);
+        let cast_options = CastOptions {
+            safe: false, // Will error on cast failure
+            ..Default::default()
+        };
+        let options = GetOptions::new()
+            .with_as_type(Some(FieldRef::from(field)))
+            .with_cast_options(cast_options);
+        let err = variant_get(&variant_array, options).unwrap_err();
+        assert!(
+            err.to_string().contains(
+                "Cast error: Cast failed at index 0 (array type: Time64(µs)): 
Invalid microsecond from midnight: 86401000000"
+            )
+        );
+    }
+
+    #[test]
+    fn test_variant_get_return_null_when_cast_failure_and_safe_true() {
+        let variant_array = perfectly_shredded_invalid_time_variant_array();
+
+        let field = Field::new("result", 
DataType::Time64(TimeUnit::Microsecond), true);
+        let cast_options = CastOptions {
+            safe: true, // Will return null on cast failure
+            ..Default::default()
+        };
+        let options = GetOptions::new()
+            .with_as_type(Some(FieldRef::from(field)))
+            .with_cast_options(cast_options);
+        let result = variant_get(&variant_array, options).unwrap();
+        assert_eq!(3, result.len());
+
+        for i in 0..3 {
+            assert!(result.is_null(i));
+        }
+    }
 }

Reply via email to