This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new a30890e6c4 [Variant] Support `shred_variant` for Uuids (#8666)
a30890e6c4 is described below

commit a30890e6c4d7379c1de2d376488b1e724deefb3a
Author: Matthew Kim <[email protected]>
AuthorDate: Mon Nov 3 16:20:35 2025 -0500

    [Variant] Support `shred_variant` for Uuids (#8666)
    
    # Which issue does this PR close?
    
    - Closes https://github.com/apache/arrow-rs/issues/8665
    
    # Rationale for this change
    `shred_variant` currently panics when attempting to shred Variants
    containing values of the FixedSizeBinary(16) data type
---
 parquet-variant-compute/src/shred_variant.rs    | 251 ++++++++++++++++++++++-
 parquet-variant-compute/src/variant_to_arrow.rs | 252 +++++++++++++++---------
 2 files changed, 409 insertions(+), 94 deletions(-)

diff --git a/parquet-variant-compute/src/shred_variant.rs 
b/parquet-variant-compute/src/shred_variant.rs
index f8158b2211..4e6a72e87a 100644
--- a/parquet-variant-compute/src/shred_variant.rs
+++ b/parquet-variant-compute/src/shred_variant.rs
@@ -326,10 +326,11 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
 mod tests {
     use super::*;
     use crate::VariantArrayBuilder;
-    use arrow::array::{Array, Float64Array, Int64Array};
+    use arrow::array::{Array, FixedSizeBinaryArray, Float64Array, Int64Array};
     use arrow::datatypes::{DataType, Field, Fields};
     use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant, 
VariantBuilder};
     use std::sync::Arc;
+    use uuid::Uuid;
 
     #[test]
     fn test_already_shredded_input_error() {
@@ -369,6 +370,73 @@ mod tests {
         shred_variant(&input, &list_schema).expect_err("unsupported");
     }
 
+    #[test]
+    fn test_invalid_fixed_size_binary_shredding() {
+        let mock_uuid_1 = Uuid::new_v4();
+
+        let input = VariantArray::from_iter([Some(Variant::from(mock_uuid_1)), 
None]);
+
+        // shred_variant only supports FixedSizeBinary(16). Any other length 
will err.
+        let err = shred_variant(&input, 
&DataType::FixedSizeBinary(17)).unwrap_err();
+
+        assert_eq!(
+            err.to_string(),
+            "Invalid argument error: FixedSizeBinary(17) is not a valid 
variant shredding type. Only FixedSizeBinary(16) for UUID is supported."
+        );
+    }
+
+    #[test]
+    fn test_uuid_shredding() {
+        let mock_uuid_1 = Uuid::new_v4();
+        let mock_uuid_2 = Uuid::new_v4();
+
+        let input = VariantArray::from_iter([
+            Some(Variant::from(mock_uuid_1)),
+            None,
+            Some(Variant::from(false)),
+            Some(Variant::from(mock_uuid_2)),
+        ]);
+
+        let variant_array = shred_variant(&input, 
&DataType::FixedSizeBinary(16)).unwrap();
+
+        // // inspect the typed_value Field and make sure it contains the 
canonical Uuid extension type
+        // let typed_value_field = variant_array
+        //     .inner()
+        //     .fields()
+        //     .into_iter()
+        //     .find(|f| f.name() == "typed_value")
+        //     .unwrap();
+
+        // assert!(
+        //     typed_value_field
+        //         .try_extension_type::<extension::Uuid>()
+        //         .is_ok()
+        // );
+
+        // probe the downcasted typed_value array to make sure uuids are 
shredded correctly
+        let uuids = variant_array
+            .typed_value_field()
+            .unwrap()
+            .as_any()
+            .downcast_ref::<FixedSizeBinaryArray>()
+            .unwrap();
+
+        assert_eq!(uuids.len(), 4);
+
+        assert!(!uuids.is_null(0));
+
+        let got_uuid_1: &[u8] = uuids.value(0);
+        assert_eq!(got_uuid_1, mock_uuid_1.as_bytes());
+
+        assert!(uuids.is_null(1));
+        assert!(uuids.is_null(2));
+
+        assert!(!uuids.is_null(3));
+
+        let got_uuid_2: &[u8] = uuids.value(3);
+        assert_eq!(got_uuid_2, mock_uuid_2.as_bytes());
+    }
+
     #[test]
     fn test_primitive_shredding_comprehensive() {
         // Test mixed scenarios in a single array
@@ -869,6 +937,187 @@ mod tests {
         assert!(value_field3.is_null(0)); // fully shredded, no remaining 
fields
     }
 
+    #[test]
+    fn test_uuid_shredding_in_objects() {
+        let mock_uuid_1 = Uuid::new_v4();
+        let mock_uuid_2 = Uuid::new_v4();
+        let mock_uuid_3 = Uuid::new_v4();
+
+        let mut builder = VariantArrayBuilder::new(6);
+
+        // Row 0: Fully shredded object with both UUID fields
+        builder
+            .new_object()
+            .with_field("id", mock_uuid_1)
+            .with_field("session_id", mock_uuid_2)
+            .finish();
+
+        // Row 1: Partially shredded object - UUID fields plus extra field
+        builder
+            .new_object()
+            .with_field("id", mock_uuid_2)
+            .with_field("session_id", mock_uuid_3)
+            .with_field("name", "test_user")
+            .finish();
+
+        // Row 2: Missing UUID field (no session_id)
+        builder.new_object().with_field("id", mock_uuid_1).finish();
+
+        // Row 3: Type mismatch - id is UUID but session_id is a string
+        builder
+            .new_object()
+            .with_field("id", mock_uuid_3)
+            .with_field("session_id", "not-a-uuid")
+            .finish();
+
+        // Row 4: Object with non-UUID value in id field
+        builder
+            .new_object()
+            .with_field("id", 12345i64)
+            .with_field("session_id", mock_uuid_1)
+            .finish();
+
+        // Row 5: Null
+        builder.append_null();
+
+        let input = builder.build();
+
+        let fields = Fields::from(vec![
+            Field::new("id", DataType::FixedSizeBinary(16), true),
+            Field::new("session_id", DataType::FixedSizeBinary(16), true),
+        ]);
+        let target_schema = DataType::Struct(fields);
+
+        let result = shred_variant(&input, &target_schema).unwrap();
+
+        assert!(result.value_field().is_some());
+        assert!(result.typed_value_field().is_some());
+        assert_eq!(result.len(), 6);
+
+        let metadata = result.metadata_field();
+        let value = result.value_field().unwrap();
+        let typed_value = result
+            .typed_value_field()
+            .unwrap()
+            .as_any()
+            .downcast_ref::<arrow::array::StructArray>()
+            .unwrap();
+
+        // Extract id and session_id fields from typed_value struct
+        let id_field =
+            
ShreddedVariantFieldArray::try_new(typed_value.column_by_name("id").unwrap()).unwrap();
+        let session_id_field =
+            
ShreddedVariantFieldArray::try_new(typed_value.column_by_name("session_id").unwrap())
+                .unwrap();
+
+        let id_value = id_field
+            .value_field()
+            .unwrap()
+            .as_any()
+            .downcast_ref::<BinaryViewArray>()
+            .unwrap();
+        let id_typed_value = id_field
+            .typed_value_field()
+            .unwrap()
+            .as_any()
+            .downcast_ref::<FixedSizeBinaryArray>()
+            .unwrap();
+        let session_id_value = session_id_field
+            .value_field()
+            .unwrap()
+            .as_any()
+            .downcast_ref::<BinaryViewArray>()
+            .unwrap();
+        let session_id_typed_value = session_id_field
+            .typed_value_field()
+            .unwrap()
+            .as_any()
+            .downcast_ref::<FixedSizeBinaryArray>()
+            .unwrap();
+
+        // Row 0: Fully shredded - both UUID fields shred successfully
+        assert!(result.is_valid(0));
+
+        assert!(value.is_null(0)); // fully shredded, no remaining fields
+        assert!(id_value.is_null(0));
+        assert!(session_id_value.is_null(0));
+
+        assert!(typed_value.is_valid(0));
+        assert!(id_typed_value.is_valid(0));
+        assert!(session_id_typed_value.is_valid(0));
+
+        assert_eq!(id_typed_value.value(0), mock_uuid_1.as_bytes());
+        assert_eq!(session_id_typed_value.value(0), mock_uuid_2.as_bytes());
+
+        // Row 1: Partially shredded - value contains extra name field
+        assert!(result.is_valid(1));
+
+        assert!(value.is_valid(1)); // contains unshredded "name" field
+        assert!(typed_value.is_valid(1));
+
+        assert!(id_value.is_null(1));
+        assert!(id_typed_value.is_valid(1));
+        assert_eq!(id_typed_value.value(1), mock_uuid_2.as_bytes());
+
+        assert!(session_id_value.is_null(1));
+        assert!(session_id_typed_value.is_valid(1));
+        assert_eq!(session_id_typed_value.value(1), mock_uuid_3.as_bytes());
+
+        // Verify the value field contains the name field
+        let row_1_variant = Variant::new(metadata.value(1), value.value(1));
+        let Variant::Object(obj) = row_1_variant else {
+            panic!("Expected object");
+        };
+
+        assert_eq!(obj.get("name"), Some(Variant::from("test_user")));
+
+        // Row 2: Missing session_id field
+        assert!(result.is_valid(2));
+
+        assert!(value.is_null(2)); // fully shredded, no extra fields
+        assert!(typed_value.is_valid(2));
+
+        assert!(id_value.is_null(2));
+        assert!(id_typed_value.is_valid(2));
+        assert_eq!(id_typed_value.value(2), mock_uuid_1.as_bytes());
+
+        assert!(session_id_value.is_null(2));
+        assert!(session_id_typed_value.is_null(2)); // missing field
+
+        // Row 3: Type mismatch - session_id is a string, not UUID
+        assert!(result.is_valid(3));
+
+        assert!(value.is_null(3)); // no extra fields
+        assert!(typed_value.is_valid(3));
+
+        assert!(id_value.is_null(3));
+        assert!(id_typed_value.is_valid(3));
+        assert_eq!(id_typed_value.value(3), mock_uuid_3.as_bytes());
+
+        assert!(session_id_value.is_valid(3)); // type mismatch, stored in 
value
+        assert!(session_id_typed_value.is_null(3));
+        let session_id_variant = Variant::new(metadata.value(3), 
session_id_value.value(3));
+        assert_eq!(session_id_variant, Variant::from("not-a-uuid"));
+
+        // Row 4: Type mismatch - id is int64, not UUID
+        assert!(result.is_valid(4));
+
+        assert!(value.is_null(4)); // no extra fields
+        assert!(typed_value.is_valid(4));
+
+        assert!(id_value.is_valid(4)); // type mismatch, stored in value
+        assert!(id_typed_value.is_null(4));
+        let id_variant = Variant::new(metadata.value(4), id_value.value(4));
+        assert_eq!(id_variant, Variant::from(12345i64));
+
+        assert!(session_id_value.is_null(4));
+        assert!(session_id_typed_value.is_valid(4));
+        assert_eq!(session_id_typed_value.value(4), mock_uuid_1.as_bytes());
+
+        // Row 5: Null
+        assert!(result.is_null(5));
+    }
+
     #[test]
     fn test_spec_compliance() {
         let input = VariantArray::from_iter(vec![Variant::from(42i64), 
Variant::from("hello")]);
diff --git a/parquet-variant-compute/src/variant_to_arrow.rs 
b/parquet-variant-compute/src/variant_to_arrow.rs
index faf64c20d0..998de36d18 100644
--- a/parquet-variant-compute/src/variant_to_arrow.rs
+++ b/parquet-variant-compute/src/variant_to_arrow.rs
@@ -16,7 +16,8 @@
 // under the License.
 
 use arrow::array::{
-    ArrayRef, BinaryViewArray, BooleanBuilder, NullArray, NullBufferBuilder, 
PrimitiveBuilder,
+    ArrayRef, BinaryViewArray, BooleanBuilder, FixedSizeBinaryBuilder, 
NullArray,
+    NullBufferBuilder, PrimitiveBuilder,
 };
 use arrow::compute::{CastOptions, DecimalCast};
 use arrow::datatypes::{self, DataType, DecimalType};
@@ -60,6 +61,7 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> {
     TimestampNanoNtz(VariantToTimestampNtzArrowRowBuilder<'a, 
datatypes::TimestampNanosecondType>),
     Time(VariantToPrimitiveArrowRowBuilder<'a, 
datatypes::Time64MicrosecondType>),
     Date(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>),
+    Uuid(VariantToUuidArrowRowBuilder<'a>),
 }
 
 /// Builder for converting variant values into strongly typed Arrow arrays.
@@ -101,6 +103,7 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
             TimestampNanoNtz(b) => b.append_null(),
             Time(b) => b.append_null(),
             Date(b) => b.append_null(),
+            Uuid(b) => b.append_null(),
         }
     }
 
@@ -130,6 +133,7 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
             TimestampNanoNtz(b) => b.append_value(value),
             Time(b) => b.append_value(value),
             Date(b) => b.append_value(value),
+            Uuid(b) => b.append_value(value),
         }
     }
 
@@ -159,6 +163,7 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
             TimestampNanoNtz(b) => b.finish(),
             Time(b) => b.finish(),
             Date(b) => b.finish(),
+            Uuid(b) => b.finish(),
         }
     }
 }
@@ -200,98 +205,116 @@ pub(crate) fn 
make_primitive_variant_to_arrow_row_builder<'a>(
 ) -> Result<PrimitiveVariantToArrowRowBuilder<'a>> {
     use PrimitiveVariantToArrowRowBuilder::*;
 
-    let builder =
-        match data_type {
-            DataType::Null => 
Null(VariantToNullArrowRowBuilder::new(cast_options, capacity)),
-            DataType::Boolean => {
-                Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, 
capacity))
-            }
-            DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
-                cast_options,
-                capacity,
-            )),
-            DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
-                cast_options,
-                capacity,
-            )),
-            DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
-                cast_options,
-                capacity,
-            )),
-            DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
-                cast_options,
-                capacity,
-            )),
-            DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
-                cast_options,
-                capacity,
-            )),
-            DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
-                cast_options,
-                capacity,
-            )),
-            DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
-                cast_options,
-                capacity,
-            )),
-            DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
-                cast_options,
-                capacity,
-            )),
-            DataType::Float16 => 
Float16(VariantToPrimitiveArrowRowBuilder::new(
-                cast_options,
-                capacity,
-            )),
-            DataType::Float32 => 
Float32(VariantToPrimitiveArrowRowBuilder::new(
-                cast_options,
-                capacity,
-            )),
-            DataType::Float64 => 
Float64(VariantToPrimitiveArrowRowBuilder::new(
-                cast_options,
-                capacity,
-            )),
-            DataType::Decimal32(precision, scale) => Decimal32(
-                VariantToDecimalArrowRowBuilder::new(cast_options, capacity, 
*precision, *scale)?,
-            ),
-            DataType::Decimal64(precision, scale) => Decimal64(
-                VariantToDecimalArrowRowBuilder::new(cast_options, capacity, 
*precision, *scale)?,
-            ),
-            DataType::Decimal128(precision, scale) => Decimal128(
-                VariantToDecimalArrowRowBuilder::new(cast_options, capacity, 
*precision, *scale)?,
-            ),
-            DataType::Decimal256(precision, scale) => Decimal256(
-                VariantToDecimalArrowRowBuilder::new(cast_options, capacity, 
*precision, *scale)?,
-            ),
-            DataType::Timestamp(TimeUnit::Microsecond, None) => 
TimestampMicroNtz(
-                VariantToTimestampNtzArrowRowBuilder::new(cast_options, 
capacity),
-            ),
-            DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro(
-                VariantToTimestampArrowRowBuilder::new(cast_options, capacity, 
tz.clone()),
-            ),
-            DataType::Timestamp(TimeUnit::Nanosecond, None) => 
TimestampNanoNtz(
-                VariantToTimestampNtzArrowRowBuilder::new(cast_options, 
capacity),
-            ),
-            DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano(
-                VariantToTimestampArrowRowBuilder::new(cast_options, capacity, 
tz.clone()),
-            ),
-            DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new(
-                cast_options,
-                capacity,
-            )),
-            DataType::Time64(TimeUnit::Microsecond) => Time(
-                VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity),
-            ),
-            _ if data_type.is_primitive() => {
-                return Err(ArrowError::NotYetImplemented(format!(
-                    "Primitive data_type {data_type:?} not yet implemented"
-                )));
-            }
-            _ => {
-                return Err(ArrowError::InvalidArgumentError(format!(
-                    "Not a primitive type: {data_type:?}"
-                )));
-            }
-        };
+    let builder = match data_type {
+        DataType::Null => Null(VariantToNullArrowRowBuilder::new(cast_options, 
capacity)),
+        DataType::Boolean => 
Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity)),
+        DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
+            cast_options,
+            capacity,
+        )),
+        DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
+            cast_options,
+            capacity,
+        )),
+        DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
+            cast_options,
+            capacity,
+        )),
+        DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
+            cast_options,
+            capacity,
+        )),
+        DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
+            cast_options,
+            capacity,
+        )),
+        DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
+            cast_options,
+            capacity,
+        )),
+        DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
+            cast_options,
+            capacity,
+        )),
+        DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
+            cast_options,
+            capacity,
+        )),
+        DataType::Float16 => Float16(VariantToPrimitiveArrowRowBuilder::new(
+            cast_options,
+            capacity,
+        )),
+        DataType::Float32 => Float32(VariantToPrimitiveArrowRowBuilder::new(
+            cast_options,
+            capacity,
+        )),
+        DataType::Float64 => Float64(VariantToPrimitiveArrowRowBuilder::new(
+            cast_options,
+            capacity,
+        )),
+        DataType::Decimal32(precision, scale) => 
Decimal32(VariantToDecimalArrowRowBuilder::new(
+            cast_options,
+            capacity,
+            *precision,
+            *scale,
+        )?),
+        DataType::Decimal64(precision, scale) => 
Decimal64(VariantToDecimalArrowRowBuilder::new(
+            cast_options,
+            capacity,
+            *precision,
+            *scale,
+        )?),
+        DataType::Decimal128(precision, scale) => 
Decimal128(VariantToDecimalArrowRowBuilder::new(
+            cast_options,
+            capacity,
+            *precision,
+            *scale,
+        )?),
+        DataType::Decimal256(precision, scale) => 
Decimal256(VariantToDecimalArrowRowBuilder::new(
+            cast_options,
+            capacity,
+            *precision,
+            *scale,
+        )?),
+        DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicroNtz(
+            VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
+        ),
+        DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro(
+            VariantToTimestampArrowRowBuilder::new(cast_options, capacity, 
tz.clone()),
+        ),
+        DataType::Timestamp(TimeUnit::Nanosecond, None) => TimestampNanoNtz(
+            VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
+        ),
+        DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano(
+            VariantToTimestampArrowRowBuilder::new(cast_options, capacity, 
tz.clone()),
+        ),
+        DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new(
+            cast_options,
+            capacity,
+        )),
+        DataType::Time64(TimeUnit::Microsecond) => 
Time(VariantToPrimitiveArrowRowBuilder::new(
+            cast_options,
+            capacity,
+        )),
+        DataType::FixedSizeBinary(16) => {
+            Uuid(VariantToUuidArrowRowBuilder::new(cast_options, capacity))
+        }
+        DataType::FixedSizeBinary(size) => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "FixedSizeBinary({size}) is not a valid variant shredding 
type. Only FixedSizeBinary(16) for UUID is supported."
+            )));
+        }
+        _ if data_type.is_primitive() => {
+            return Err(ArrowError::NotYetImplemented(format!(
+                "Primitive data_type {data_type:?} not yet implemented"
+            )));
+        }
+        _ => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Not a primitive type: {data_type:?}"
+            )));
+        }
+    };
     Ok(builder)
 }
 
@@ -519,6 +542,49 @@ where
     }
 }
 
+/// Builder for converting variant values to FixedSizeBinary(16) for UUIDs
+pub(crate) struct VariantToUuidArrowRowBuilder<'a> {
+    builder: FixedSizeBinaryBuilder,
+    cast_options: &'a CastOptions<'a>,
+}
+
+impl<'a> VariantToUuidArrowRowBuilder<'a> {
+    fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+        Self {
+            builder: FixedSizeBinaryBuilder::with_capacity(capacity, 16),
+            cast_options,
+        }
+    }
+
+    fn append_null(&mut self) -> Result<()> {
+        self.builder.append_null();
+        Ok(())
+    }
+
+    fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
+        match value.as_uuid() {
+            Some(uuid) => {
+                self.builder
+                    .append_value(uuid.as_bytes())
+                    .map_err(|e| ArrowError::ExternalError(Box::new(e)))?;
+
+                Ok(true)
+            }
+            None if self.cast_options.safe => {
+                self.builder.append_null();
+                Ok(false)
+            }
+            None => Err(ArrowError::CastError(format!(
+                "Failed to extract UUID from variant {value:?}",
+            ))),
+        }
+    }
+
+    fn finish(mut self) -> Result<ArrayRef> {
+        Ok(Arc::new(self.builder.finish()))
+    }
+}
+
 /// Builder for creating VariantArray output (for path extraction without type 
conversion)
 pub(crate) struct VariantToBinaryVariantArrowRowBuilder {
     metadata: BinaryViewArray,

Reply via email to