This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 1b18582bda [Variant] Add try_value/value for VariantArray (#8719)
1b18582bda is described below
commit 1b18582bda44deaf584720047e3446fbcca4a718
Author: Congxian Qiu <[email protected]>
AuthorDate: Thu Oct 30 19:01:26 2025 +0800
[Variant] Add try_value/value for VariantArray (#8719)
# Which issue does this PR close?
- Closes #8672 .
# What changes are included in this PR?
- Add `try_value/value` function for `VariantArray`
- Add test for `VariantArray::try_value`
# Are these changes tested?
Covered by existing tests and added new tests
# Are there any user-facing changes?
Yes, add a new function for `VariantArray::try_value`, and the
`VariantArray::value` changed to panic from returning `Variant::Null` if
there is some cast error.
---
parquet-variant-compute/src/type_conversion.rs | 21 +++-
parquet-variant-compute/src/variant_array.rs | 157 +++++++++++++++++++------
parquet-variant-compute/src/variant_get.rs | 60 +++++++++-
3 files changed, 200 insertions(+), 38 deletions(-)
diff --git a/parquet-variant-compute/src/type_conversion.rs
b/parquet-variant-compute/src/type_conversion.rs
index d15664f5af..0716469f76 100644
--- a/parquet-variant-compute/src/type_conversion.rs
+++ b/parquet-variant-compute/src/type_conversion.rs
@@ -194,10 +194,10 @@ macro_rules! non_generic_conversion_single_value {
($array:expr, $cast_fn:expr, $index:expr) => {{
let array = $array;
if array.is_null($index) {
- Variant::Null
+ Ok(Variant::Null)
} else {
let cast_value = $cast_fn(array.value($index));
- Variant::from(cast_value)
+ Ok(Variant::from(cast_value))
}
}};
}
@@ -217,6 +217,23 @@ macro_rules! generic_conversion_single_value {
}
pub(crate) use generic_conversion_single_value;
+macro_rules! generic_conversion_single_value_with_result {
+ ($t:ty, $method:ident, $cast_fn:expr, $input:expr, $index:expr) => {{
+ let arr = $input.$method::<$t>();
+ let v = arr.value($index);
+ match ($cast_fn)(v) {
+ Ok(var) => Ok(Variant::from(var)),
+ Err(e) => Err(ArrowError::CastError(format!(
+ "Cast failed at index {idx} (array type: {ty}): {e}",
+ idx = $index,
+ ty = <$t as ::arrow::datatypes::ArrowPrimitiveType>::DATA_TYPE
+ ))),
+ }
+ }};
+}
+
+pub(crate) use generic_conversion_single_value_with_result;
+
/// Convert the value at a specific index in the given array into a `Variant`.
macro_rules! primitive_conversion_single_value {
($t:ty, $input:expr, $index:expr) => {{
diff --git a/parquet-variant-compute/src/variant_array.rs
b/parquet-variant-compute/src/variant_array.rs
index 40fd76b170..0a42b3ab2e 100644
--- a/parquet-variant-compute/src/variant_array.rs
+++ b/parquet-variant-compute/src/variant_array.rs
@@ -18,7 +18,10 @@
//! [`VariantArray`] implementation
use crate::VariantArrayBuilder;
-use crate::type_conversion::{generic_conversion_single_value,
primitive_conversion_single_value};
+use crate::type_conversion::{
+ generic_conversion_single_value,
generic_conversion_single_value_with_result,
+ primitive_conversion_single_value,
+};
use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
use arrow::buffer::NullBuffer;
use arrow::compute::cast;
@@ -27,6 +30,7 @@ use arrow::datatypes::{
Float64Type, Int8Type, Int16Type, Int32Type, Int64Type,
Time64MicrosecondType,
TimestampMicrosecondType, TimestampNanosecondType,
};
+use arrow::error::Result;
use arrow_schema::extension::ExtensionType;
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
use chrono::{DateTime, NaiveTime};
@@ -58,11 +62,11 @@ impl ExtensionType for VariantType {
Some(String::new())
}
- fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata,
ArrowError> {
+ fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata>
{
Ok("")
}
- fn supports_data_type(&self, data_type: &DataType) -> Result<(),
ArrowError> {
+ fn supports_data_type(&self, data_type: &DataType) -> Result<()> {
if matches!(data_type, DataType::Struct(_)) {
Ok(())
} else {
@@ -72,7 +76,7 @@ impl ExtensionType for VariantType {
}
}
- fn try_new(data_type: &DataType, _metadata: Self::Metadata) ->
Result<Self, ArrowError> {
+ fn try_new(data_type: &DataType, _metadata: Self::Metadata) ->
Result<Self> {
Self.supports_data_type(data_type)?;
Ok(Self)
}
@@ -249,7 +253,7 @@ impl VariantArray {
/// int8.
///
/// Currently, only [`BinaryViewArray`] are supported.
- pub fn try_new(inner: &dyn Array) -> Result<Self, ArrowError> {
+ pub fn try_new(inner: &dyn Array) -> Result<Self> {
// Workaround lack of support for Binary
// https://github.com/apache/arrow-rs/issues/8387
let inner = cast_to_binary_view_arrays(inner)?;
@@ -325,12 +329,32 @@ impl VariantArray {
/// Return the [`Variant`] instance stored at the given row
///
- /// Note: This method does not check for nulls and the value is arbitrary
- /// (but still well-defined) if [`is_null`](Self::is_null) returns true
for the index.
+ /// This is a convenience wrapper that calls [`VariantArray::try_value`]
and unwraps the `Result`.
+ /// Use `try_value` if you need to handle conversion errors gracefully.
///
/// # Panics
/// * if the index is out of bounds
/// * if the array value is null
+ /// * if `try_value` returns an error.
+ pub fn value(&self, index: usize) -> Variant<'_, '_> {
+ self.try_value(index).unwrap()
+ }
+
+ /// Return the [`Variant`] instance stored at the given row
+ ///
+ /// Note: This method does not check for nulls and the value is arbitrary
+ /// (but still well-defined) if [`is_null`](Self::is_null) returns true
for the index.
+ ///
+ /// # Panics
+ ///
+ /// Panics if
+ /// * the index is out of bounds
+ /// * the array value is null
+ ///
+ /// # Errors
+ ///
+ /// Errors if
+ /// - the data in `typed_value` cannot be interpreted as a valid `Variant`
///
/// If this is a shredded variant but has no value at the shredded
location, it
/// will return [`Variant::Null`].
@@ -343,7 +367,7 @@ impl VariantArray {
///
/// Note: Does not do deep validation of the [`Variant`], so it is up to
the
/// caller to ensure that the metadata and value were constructed
correctly.
- pub fn value(&self, index: usize) -> Variant<'_, '_> {
+ pub fn try_value(&self, index: usize) -> Result<Variant<'_, '_>> {
match (self.typed_value_field(), self.value_field()) {
// Always prefer typed_value, if available
(Some(typed_value), value) if typed_value.is_valid(index) => {
@@ -351,11 +375,11 @@ impl VariantArray {
}
// Otherwise fall back to value, if available
(_, Some(value)) if value.is_valid(index) => {
- Variant::new(self.metadata.value(index), value.value(index))
+ Ok(Variant::new(self.metadata.value(index),
value.value(index)))
}
// It is technically invalid for neither value nor typed_value
fields to be available,
// but the spec specifically requires readers to return
Variant::Null in this case.
- _ => Variant::Null,
+ _ => Ok(Variant::Null),
}
}
@@ -603,7 +627,7 @@ impl ShreddedVariantFieldArray {
/// or be a list, large_list, list_view or struct
///
/// Currently, only `value` columns of type [`BinaryViewArray`] are
supported.
- pub fn try_new(inner: &dyn Array) -> Result<Self, ArrowError> {
+ pub fn try_new(inner: &dyn Array) -> Result<Self> {
let Some(inner_struct) = inner.as_struct_opt() else {
return Err(ArrowError::InvalidArgumentError(
"Invalid ShreddedVariantFieldArray: requires StructArray as
input".to_string(),
@@ -835,7 +859,7 @@ impl<'a> BorrowedShreddingState<'a> {
impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
type Error = ArrowError;
- fn try_from(inner_struct: &'a StructArray) -> Result<Self, ArrowError> {
+ fn try_from(inner_struct: &'a StructArray) -> Result<Self> {
// The `value` column need not exist, but if it does it must be a
binary view.
let value = if let Some(value_col) =
inner_struct.column_by_name("value") {
let Some(binary_view) = value_col.as_binary_view_opt() else {
@@ -856,7 +880,7 @@ impl<'a> TryFrom<&'a StructArray> for
BorrowedShreddingState<'a> {
impl TryFrom<&StructArray> for ShreddingState {
type Error = ArrowError;
- fn try_from(inner_struct: &StructArray) -> Result<Self, ArrowError> {
+ fn try_from(inner_struct: &StructArray) -> Result<Self> {
Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
}
}
@@ -914,34 +938,34 @@ fn typed_value_to_variant<'a>(
typed_value: &'a ArrayRef,
value: Option<&BinaryViewArray>,
index: usize,
-) -> Variant<'a, 'a> {
+) -> Result<Variant<'a, 'a>> {
let data_type = typed_value.data_type();
if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) &&
v.is_valid(index)) {
// Only a partially shredded struct is allowed to have values for both
columns
panic!("Invalid variant, conflicting value and typed_value");
}
match data_type {
- DataType::Null => Variant::Null,
+ DataType::Null => Ok(Variant::Null),
DataType::Boolean => {
let boolean_array = typed_value.as_boolean();
let value = boolean_array.value(index);
- Variant::from(value)
+ Ok(Variant::from(value))
}
// 16-byte FixedSizeBinary alway corresponds to a UUID; all other
sizes are illegal.
DataType::FixedSizeBinary(16) => {
let array = typed_value.as_fixed_size_binary();
let value = array.value(index);
- Uuid::from_slice(value).unwrap().into() // unwrap is safe: slice
is always 16 bytes
+ Ok(Uuid::from_slice(value).unwrap().into()) // unwrap is safe:
slice is always 16 bytes
}
DataType::BinaryView => {
let array = typed_value.as_binary_view();
let value = array.value(index);
- Variant::from(value)
+ Ok(Variant::from(value))
}
DataType::Utf8 => {
let array = typed_value.as_string::<i32>();
let value = array.value(index);
- Variant::from(value)
+ Ok(Variant::from(value))
}
DataType::Int8 => {
primitive_conversion_single_value!(Int8Type, typed_value, index)
@@ -965,28 +989,28 @@ fn typed_value_to_variant<'a>(
primitive_conversion_single_value!(Float64Type, typed_value, index)
}
DataType::Decimal32(_, s) => {
- generic_conversion_single_value!(
+ generic_conversion_single_value_with_result!(
Decimal32Type,
as_primitive,
- |v| VariantDecimal4::try_new(v, *s as
u8).map_or(Variant::Null, Variant::from),
+ |v| VariantDecimal4::try_new(v, *s as u8),
typed_value,
index
)
}
DataType::Decimal64(_, s) => {
- generic_conversion_single_value!(
+ generic_conversion_single_value_with_result!(
Decimal64Type,
as_primitive,
- |v| VariantDecimal8::try_new(v, *s as
u8).map_or(Variant::Null, Variant::from),
+ |v| VariantDecimal8::try_new(v, *s as u8),
typed_value,
index
)
}
DataType::Decimal128(_, s) => {
- generic_conversion_single_value!(
+ generic_conversion_single_value_with_result!(
Decimal128Type,
as_primitive,
- |v| VariantDecimal16::try_new(v, *s as
u8).map_or(Variant::Null, Variant::from),
+ |v| VariantDecimal16::try_new(v, *s as u8),
typed_value,
index
)
@@ -1001,14 +1025,14 @@ fn typed_value_to_variant<'a>(
)
}
DataType::Time64(TimeUnit::Microsecond) => {
- generic_conversion_single_value!(
+ generic_conversion_single_value_with_result!(
Time64MicrosecondType,
as_primitive,
|v| NaiveTime::from_num_seconds_from_midnight_opt(
(v / 1_000_000) as u32,
(v % 1_000_000) as u32 * 1000
)
- .map_or(Variant::Null, Variant::from),
+ .ok_or_else(|| format!("Invalid microsecond from midnight:
{}", v)),
typed_value,
index
)
@@ -1060,7 +1084,7 @@ fn typed_value_to_variant<'a>(
"Unsupported typed_value type: {}",
typed_value.data_type()
);
- Variant::Null
+ Ok(Variant::Null)
}
}
}
@@ -1075,7 +1099,7 @@ fn typed_value_to_variant<'a>(
/// * `StructArray<metadata: BinaryView, value: BinaryView>`
///
/// So cast them to get the right type.
-fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef,
ArrowError> {
+fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef> {
let new_type = canonicalize_and_verify_data_type(array.data_type())?;
if let Cow::Borrowed(_) = new_type {
if let Some(array) = array.as_struct_opt() {
@@ -1088,9 +1112,7 @@ fn cast_to_binary_view_arrays(array: &dyn Array) ->
Result<ArrayRef, ArrowError>
/// Recursively visits a data type, ensuring that it only contains data types
that can legally
/// appear in a (possibly shredded) variant array. It also replaces Binary
fields with BinaryView,
/// since that's what comes back from the parquet reader and what the variant
code expects to find.
-fn canonicalize_and_verify_data_type(
- data_type: &DataType,
-) -> Result<Cow<'_, DataType>, ArrowError> {
+fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_,
DataType>> {
use DataType::*;
// helper macros
@@ -1188,7 +1210,7 @@ fn canonicalize_and_verify_data_type(
Ok(new_data_type)
}
-fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_,
Arc<Field>>, ArrowError> {
+fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_,
Arc<Field>>> {
let Cow::Owned(new_data_type) =
canonicalize_and_verify_data_type(field.data_type())? else {
return Ok(Cow::Borrowed(field));
};
@@ -1199,11 +1221,15 @@ fn canonicalize_and_verify_field(field: &Arc<Field>) ->
Result<Cow<'_, Arc<Field
#[cfg(test)]
mod test {
use crate::VariantArrayBuilder;
+ use std::str::FromStr;
use super::*;
- use arrow::array::{BinaryViewArray, Int32Array};
+ use arrow::array::{
+ BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array,
Int32Array,
+ Time64MicrosecondArray,
+ };
use arrow_schema::{Field, Fields};
- use parquet_variant::ShortString;
+ use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, ShortString};
#[test]
fn invalid_not_a_struct_array() {
@@ -1535,4 +1561,65 @@ mod test {
assert_ne!(v, v_sliced);
}
}
+
+ macro_rules! invalid_variant_array_test {
+ ($fn_name: ident, $invalid_typed_value: expr, $error_msg: literal) => {
+ #[test]
+ fn $fn_name() {
+ let metadata =
BinaryViewArray::from_iter_values(std::iter::repeat_n(
+ EMPTY_VARIANT_METADATA_BYTES,
+ 1,
+ ));
+ let invalid_typed_value = $invalid_typed_value;
+
+ let struct_array = StructArrayBuilder::new()
+ .with_field("metadata", Arc::new(metadata), false)
+ .with_field("typed_value", Arc::new(invalid_typed_value),
true)
+ .build();
+
+ let array: VariantArray = VariantArray::try_new(&struct_array)
+ .expect("should create variant array")
+ .into();
+
+ let result = array.try_value(0);
+ assert!(result.is_err());
+ let error = result.unwrap_err();
+ assert!(matches!(error, ArrowError::CastError(_)));
+
+ let expected: &str = $error_msg;
+ assert!(
+ error.to_string().contains($error_msg),
+ "error `{}` did not contain `{}`",
+ error,
+ expected
+ )
+ }
+ };
+ }
+
+ invalid_variant_array_test!(
+ test_variant_array_invalide_time,
+ Time64MicrosecondArray::from(vec![Some(86401000000)]),
+ "Cast error: Cast failed at index 0 (array type: Time64(µs)): Invalid
microsecond from midnight: 86401000000"
+ );
+
+ invalid_variant_array_test!(
+ test_variant_array_invalid_decimal32,
+ Decimal32Array::from(vec![Some(1234567890)]),
+ "Cast error: Cast failed at index 0 (array type: Decimal32(9, 2)):
Invalid argument error: 1234567890 is wider than max precision 9"
+ );
+
+ invalid_variant_array_test!(
+ test_variant_array_invalid_decimal64,
+ Decimal64Array::from(vec![Some(1234567890123456789)]),
+ "Cast error: Cast failed at index 0 (array type: Decimal64(18, 6)):
Invalid argument error: 1234567890123456789 is wider than max precision 18"
+ );
+
+ invalid_variant_array_test!(
+ test_variant_array_invalid_decimal128,
+ Decimal128Array::from(vec![Some(
+ i128::from_str("123456789012345678901234567890123456789").unwrap()
+ ),]),
+ "Cast error: Cast failed at index 0 (array type: Decimal128(38, 10)):
Invalid argument error: 123456789012345678901234567890123456789 is wider than
max precision 38"
+ );
}
diff --git a/parquet-variant-compute/src/variant_get.rs
b/parquet-variant-compute/src/variant_get.rs
index 1061548933..38c6513961 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -142,8 +142,17 @@ fn shredded_get_path(
for i in 0..target.len() {
if target.is_null(i) {
builder.append_null()?;
+ } else if !cast_options.safe {
+ let value = target.try_value(i)?;
+ builder.append_value(value)?;
} else {
- builder.append_value(target.value(i))?;
+ let _ = match target.try_value(i) {
+ Ok(v) => builder.append_value(v)?,
+ Err(_) => {
+ builder.append_null()?;
+ false // add this to make match arms have the same
return type
+ }
+ };
}
}
builder.finish()
@@ -3584,4 +3593,53 @@ mod test {
"Failed to cast to Decimal256(precision=76, scale=39) from variant
Decimal16"
));
}
+
+
perfectly_shredded_variant_array_fn!(perfectly_shredded_invalid_time_variant_array,
|| {
+ // 86401000000 is invalid for Time64Microsecond (max is 86400000000)
+ Time64MicrosecondArray::from(vec![
+ Some(86401000000),
+ Some(86401000000),
+ Some(86401000000),
+ ])
+ });
+
+ #[test]
+ fn test_variant_get_error_when_cast_failure_and_safe_false() {
+ let variant_array = perfectly_shredded_invalid_time_variant_array();
+
+ let field = Field::new("result",
DataType::Time64(TimeUnit::Microsecond), true);
+ let cast_options = CastOptions {
+ safe: false, // Will error on cast failure
+ ..Default::default()
+ };
+ let options = GetOptions::new()
+ .with_as_type(Some(FieldRef::from(field)))
+ .with_cast_options(cast_options);
+ let err = variant_get(&variant_array, options).unwrap_err();
+ assert!(
+ err.to_string().contains(
+ "Cast error: Cast failed at index 0 (array type: Time64(µs)):
Invalid microsecond from midnight: 86401000000"
+ )
+ );
+ }
+
+ #[test]
+ fn test_variant_get_return_null_when_cast_failure_and_safe_true() {
+ let variant_array = perfectly_shredded_invalid_time_variant_array();
+
+ let field = Field::new("result",
DataType::Time64(TimeUnit::Microsecond), true);
+ let cast_options = CastOptions {
+ safe: true, // Will return null on cast failure
+ ..Default::default()
+ };
+ let options = GetOptions::new()
+ .with_as_type(Some(FieldRef::from(field)))
+ .with_cast_options(cast_options);
+ let result = variant_get(&variant_array, options).unwrap();
+ assert_eq!(3, result.len());
+
+ for i in 0..3 {
+ assert!(result.is_null(i));
+ }
+ }
}