scovich commented on code in PR #8438:
URL: https://github.com/apache/arrow-rs/pull/8438#discussion_r2377285348
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -891,28 +864,116 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '
 ///
 /// So cast them to get the right type.
 fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
-    let new_type = rewrite_to_view_types(array.data_type());
-    cast(array, &new_type)
+    let new_type = canonicalize_and_verify_data_type(array.data_type())?;
+    cast(array, new_type.as_ref())
+}
+
+/// Validates whether a given arrow decimal is a valid variant decimal
+///
+/// NOTE: By a strict reading of the "decimal table" in the [shredding spec], each decimal type
+/// should have a lower bound on precision as well as an upper bound (i.e. Decimal16 with precision
+/// 5 is invalid because Decimal4 can cover it). But the variant shredding integration tests
+/// specifically expect such cases to succeed, so we only enforce the upper bound here.
+///
+/// [shredding spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
+fn is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool {
+    *p <= max_precision && (0..=*p as i8).contains(s)
 }
 /// replaces all instances of Binary with BinaryView in a DataType
-fn rewrite_to_view_types(data_type: &DataType) -> DataType {
-    match data_type {
-        DataType::Binary => DataType::BinaryView,
-        DataType::List(field) => DataType::List(rewrite_field_type(field)),
-        DataType::Struct(fields) => {
-            DataType::Struct(fields.iter().map(rewrite_field_type).collect())
-        }
-        _ => data_type.clone(),
+fn canonicalize_and_verify_data_type(
+    data_type: &DataType,
+) -> Result<Cow<'_, DataType>, ArrowError> {
+    use DataType::*;
+
+    // helper macros
+    macro_rules! fail {
+        () => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Illegal shredded value type: {data_type}"
+            )))
+        };
     }
+    macro_rules! borrow {
+        () => {
+            Cow::Borrowed(data_type)
+        };
+    }
+
+    let new_data_type = match data_type {
+        // Primitive arrow types that have a direct variant counterpart are allowed
+        Null | Boolean => borrow!(),
+        Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(),
+
+        // Unsigned integers and Float16 are not allowed at all
+        UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(),
+
+        // Most decimal types are allowed, with restrictions on precision and scale
+        Decimal32(p, s) if is_valid_variant_decimal(p, s, 9) => borrow!(),
+        Decimal64(p, s) if is_valid_variant_decimal(p, s, 18) => borrow!(),
+        Decimal128(p, s) if is_valid_variant_decimal(p, s, 38) => borrow!(),
+        Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => fail!(),
+
+        // Only microsecond and nanosecond timestamps are supported
+        Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => borrow!(),
+        Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(),
+
+        // Only 32-bit dates and 64-bit microsecond times are supported.
+        Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
+        Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
+
+        // Binary and string are allowed.
+        // NOTE: We force Binary to BinaryView because that's what the parquet reader returns.
+        Binary => Cow::Owned(DataType::BinaryView),
+        BinaryView | Utf8 => borrow!(),
+
+        // UUID maps to 16-byte fixed-size binary; no other width is allowed
+        FixedSizeBinary(16) => borrow!(),
+        FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
+
+        // We can _possibly_ support (some of) these some day?
+        LargeBinary | LargeUtf8 | Utf8View | ListView(_) | LargeList(_) | LargeListView(_) => {
+            fail!()
+        }
+
+        // Lists and structs are supported; maps and unions are not
+        List(field) => match canonicalize_and_verify_field(field)? {
+            Cow::Borrowed(_) => borrow!(),
+            Cow::Owned(new_field) => Cow::Owned(DataType::List(new_field)),
+        },
+        Struct(fields) => {
+            // Avoid allocation unless at least one field was rewritten
Review Comment:
Observation: The original code unconditionally built a new type from the ground up, which defeated the purpose of having `Fields` store `Arc<Field>` (rebuilding from scratch breaks sharing across long chains of nested types). So this code collects the set of fields changed by deeper layers, and only constructs a new struct if at least one field actually changed. In the common case where no fields changed, the hashmap is empty (no allocations) and we just return a borrowed version of the input data type.
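The hunk is truncated at the commented line, so for illustration here is a minimal, self-contained sketch of the pattern the comment describes, not the PR's actual code. `canonicalize_field` is a hypothetical stand-in for the PR's `canonicalize_and_verify_field`, whose full signature is not visible above:

```rust
use std::borrow::Cow;
use std::collections::HashMap;
use std::sync::Arc;

use arrow_schema::{ArrowError, DataType, FieldRef, Fields};

// Hypothetical stand-in for the PR's recursive per-field rewrite: returns
// Borrowed when nothing changed, Owned only when the type had to be rewritten.
fn canonicalize_field(field: &FieldRef) -> Result<Cow<'_, FieldRef>, ArrowError> {
    match field.data_type() {
        DataType::Binary => Ok(Cow::Owned(Arc::new(
            field.as_ref().clone().with_data_type(DataType::BinaryView),
        ))),
        _ => Ok(Cow::Borrowed(field)),
    }
}

// Rebuild a Struct type only if at least one child field actually changed.
fn canonicalize_struct<'a>(
    data_type: &'a DataType,
    fields: &Fields,
) -> Result<Cow<'a, DataType>, ArrowError> {
    // Collect only the fields that recursion actually rewrote.
    let mut changed: HashMap<usize, FieldRef> = HashMap::new();
    for (i, field) in fields.iter().enumerate() {
        if let Cow::Owned(new_field) = canonicalize_field(field)? {
            changed.insert(i, new_field);
        }
    }
    // Common case: the map stayed empty, so borrow the input unchanged.
    if changed.is_empty() {
        return Ok(Cow::Borrowed(data_type));
    }
    // Otherwise rebuild, cheaply cloning the Arc of every untouched child.
    let new_fields: Fields = fields
        .iter()
        .enumerate()
        .map(|(i, f)| changed.remove(&i).unwrap_or_else(|| Arc::clone(f)))
        .collect();
    Ok(Cow::Owned(DataType::Struct(new_fields)))
}
```

Keying the map by field index means the unchanged-input path allocates nothing, while the rebuild path reuses the `Arc` of every untouched child instead of deep-copying it.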
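On the decimal side, a few hedged examples of the bounds `is_valid_variant_decimal` enforces in the hunk above (upper precision caps of 9/18/38 and scale restricted to `0..=precision`; the values are chosen purely for illustration):

```rust
// Signature from the diff: is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool
assert!(is_valid_variant_decimal(&9, &2, 9));   // Decimal32: precision 9, scale 2 is accepted
assert!(!is_valid_variant_decimal(&10, &2, 9)); // precision 10 exceeds the Decimal32 cap
assert!(!is_valid_variant_decimal(&9, &-1, 9)); // negative scales are rejected
assert!(!is_valid_variant_decimal(&5, &6, 9));  // scale may not exceed precision
```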