This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new f248da3cc3 [VARIANT] Add support for DataType::Struct for cast_to_variant (#8090)
f248da3cc3 is described below

commit f248da3cc39161af436b6337b2ae836168d13abe
Author: Aditya Bhatnagar <[email protected]>
AuthorDate: Thu Aug 14 13:41:30 2025 -0400

    [VARIANT] Add support for DataType::Struct for cast_to_variant (#8090)
    
    # Which issue does this PR close?
    
    - Closes #8061
    
    # Rationale for this change
    
    Add support for DataType::Struct for cast_to_variant
    
    # What changes are included in this PR?
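    Adds a `DataType::Struct` arm to `cast_to_variant`. Each child column is
    converted to a variant array once (recursively, so nested structs work),
    and every non-null struct row is then emitted as a Variant object keyed by
    field name. Null struct rows become null entries, and null fields are
    omitted from the object rather than written as `Variant::Null`.
    Also adds unit tests covering simple structs, struct-level nulls, nested
    structs, and larger inputs.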
    
    # Are there any user-facing changes?
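    Yes. `cast_to_variant` is a user-facing API, and it now accepts
    `DataType::Struct` input instead of returning the
    "Unsupported data type for casting to Variant" error.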
    
    Props to @mprammer!!
---
 parquet-variant-compute/src/cast_to_variant.rs | 339 ++++++++++++++++++++++++-
 1 file changed, 334 insertions(+), 5 deletions(-)

diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs
index 343d387b24..2df53a501e 100644
--- a/parquet-variant-compute/src/cast_to_variant.rs
+++ b/parquet-variant-compute/src/cast_to_variant.rs
@@ -34,7 +34,9 @@ use arrow::temporal_conversions::{
 use arrow_schema::{ArrowError, DataType, TimeUnit};
 use chrono::{DateTime, NaiveDateTime, TimeZone, Utc};
 use half::f16;
-use parquet_variant::{Variant, VariantDecimal16, VariantDecimal4, 
VariantDecimal8};
+use parquet_variant::{
+    Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, 
VariantDecimal8,
+};
 
 /// Convert the input array of a specific primitive type to a `VariantArray`
 /// row by row
@@ -367,6 +369,51 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
         DataType::Utf8View => {
             cast_conversion_nongeneric!(as_string_view, |v| v, input, builder);
         }
+        DataType::Struct(_) => {
+            let struct_array = input.as_struct();
+
+            // Pre-convert all field arrays once for better performance
+            // This avoids converting the same field array multiple times
+            // Alternative approach: Use slicing per row: field_array.slice(i, 
1)
+            // However, pre-conversion is more efficient for typical use cases
+            let field_variant_arrays: Result<Vec<_>, _> = struct_array
+                .columns()
+                .iter()
+                .map(|field_array| cast_to_variant(field_array.as_ref()))
+                .collect();
+            let field_variant_arrays = field_variant_arrays?;
+
+            // Cache column names to avoid repeated calls
+            let column_names = struct_array.column_names();
+
+            for i in 0..struct_array.len() {
+                if struct_array.is_null(i) {
+                    builder.append_null();
+                    continue;
+                }
+
+                // Create a VariantBuilder for this struct instance
+                let mut variant_builder = VariantBuilder::new();
+                let mut object_builder = variant_builder.new_object();
+
+                // Iterate through all fields in the struct
+                for (field_idx, field_name) in column_names.iter().enumerate() 
{
+                    // Use pre-converted field variant arrays for better 
performance
+                    // Check nulls directly from the pre-converted arrays 
instead of accessing column again
+                    if !field_variant_arrays[field_idx].is_null(i) {
+                        let field_variant = 
field_variant_arrays[field_idx].value(i);
+                        object_builder.insert(field_name, field_variant);
+                    }
+                    // Note: we skip null fields rather than inserting 
Variant::Null
+                    // to match Arrow struct semantics where null fields are 
omitted
+                }
+
+                object_builder.finish()?;
+                let (metadata, value) = variant_builder.finish();
+                let variant = Variant::try_new(&metadata, &value)?;
+                builder.append_variant(variant);
+            }
+        }
         dt => {
             return Err(ArrowError::CastError(format!(
                 "Unsupported data type for casting to Variant: {dt:?}",
@@ -384,12 +431,14 @@ pub fn cast_to_variant(input: &dyn Array) -> Result<VariantArray, ArrowError> {
 mod tests {
     use super::*;
     use arrow::array::{
-        ArrayRef, BooleanArray, Decimal128Array, Decimal256Array, 
Decimal32Array, Decimal64Array,
-        FixedSizeBinaryBuilder, Float16Array, Float32Array, Float64Array, 
GenericByteBuilder,
-        GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array,
+        ArrayRef, BinaryArray, BooleanArray, Decimal128Array, Decimal256Array, 
Decimal32Array,
+        Decimal64Array, FixedSizeBinaryBuilder, Float16Array, Float32Array, 
Float64Array,
+        GenericByteBuilder, GenericByteViewBuilder, Int16Array, Int32Array, 
Int64Array, Int8Array,
         IntervalYearMonthArray, LargeStringArray, NullArray, StringArray, 
StringViewArray,
-        UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+        StructArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
     };
+    use arrow::buffer::NullBuffer;
+    use arrow_schema::{Field, Fields};
     use arrow_schema::{
         DECIMAL128_MAX_PRECISION, DECIMAL32_MAX_PRECISION, 
DECIMAL64_MAX_PRECISION,
     };
@@ -1286,6 +1335,286 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_cast_to_variant_struct() {
+        // Test a simple struct with two fields: id (int64) and age (int32)
+        let id_array = Int64Array::from(vec![Some(1001), Some(1002), None, 
Some(1003)]);
+        let age_array = Int32Array::from(vec![Some(25), Some(30), Some(35), 
None]);
+
+        let fields = Fields::from(vec![
+            Field::new("id", DataType::Int64, true),
+            Field::new("age", DataType::Int32, true),
+        ]);
+
+        let struct_array = StructArray::new(
+            fields,
+            vec![Arc::new(id_array), Arc::new(age_array)],
+            None, // no nulls at the struct level
+        );
+
+        let result = cast_to_variant(&struct_array).unwrap();
+        assert_eq!(result.len(), 4);
+
+        // Check first row: {"id": 1001, "age": 25}
+        let variant1 = result.value(0);
+        let obj1 = variant1.as_object().unwrap();
+        assert_eq!(obj1.get("id"), Some(Variant::from(1001i64)));
+        assert_eq!(obj1.get("age"), Some(Variant::from(25i32)));
+
+        // Check second row: {"id": 1002, "age": 30}
+        let variant2 = result.value(1);
+        let obj2 = variant2.as_object().unwrap();
+        assert_eq!(obj2.get("id"), Some(Variant::from(1002i64)));
+        assert_eq!(obj2.get("age"), Some(Variant::from(30i32)));
+
+        // Check third row: {"age": 35} (id is null, so omitted)
+        let variant3 = result.value(2);
+        let obj3 = variant3.as_object().unwrap();
+        assert_eq!(obj3.get("id"), None);
+        assert_eq!(obj3.get("age"), Some(Variant::from(35i32)));
+
+        // Check fourth row: {"id": 1003} (age is null, so omitted)
+        let variant4 = result.value(3);
+        let obj4 = variant4.as_object().unwrap();
+        assert_eq!(obj4.get("id"), Some(Variant::from(1003i64)));
+        assert_eq!(obj4.get("age"), None);
+    }
+
+    #[test]
+    fn test_cast_to_variant_struct_with_nulls() {
+        // Test struct with null values at the struct level
+        let id_array = Int64Array::from(vec![Some(1001), Some(1002)]);
+        let age_array = Int32Array::from(vec![Some(25), Some(30)]);
+
+        let fields = Fields::from(vec![
+            Field::new("id", DataType::Int64, false),
+            Field::new("age", DataType::Int32, false),
+        ]);
+
+        // Create null buffer to make second row null
+        let null_buffer = NullBuffer::from(vec![true, false]);
+
+        let struct_array = StructArray::new(
+            fields,
+            vec![Arc::new(id_array), Arc::new(age_array)],
+            Some(null_buffer),
+        );
+
+        let result = cast_to_variant(&struct_array).unwrap();
+        assert_eq!(result.len(), 2);
+
+        // Check first row: {"id": 1001, "age": 25}
+        assert!(!result.is_null(0));
+        let variant1 = result.value(0);
+        let obj1 = variant1.as_object().unwrap();
+        assert_eq!(obj1.get("id"), Some(Variant::from(1001i64)));
+        assert_eq!(obj1.get("age"), Some(Variant::from(25i32)));
+
+        // Check second row: null struct
+        assert!(result.is_null(1));
+    }
+
+    #[test]
+    fn test_cast_to_variant_struct_performance() {
+        // Test with a larger struct to demonstrate performance optimization
+        // This test ensures that field arrays are only converted once, not 
per row
+        let size = 1000;
+
+        let id_array = Int64Array::from((0..size).map(|i| Some(i as 
i64)).collect::<Vec<_>>());
+        let age_array = Int32Array::from(
+            (0..size)
+                .map(|i| Some((i % 100) as i32))
+                .collect::<Vec<_>>(),
+        );
+        let score_array =
+            Float64Array::from((0..size).map(|i| Some(i as f64 * 
0.1)).collect::<Vec<_>>());
+
+        let fields = Fields::from(vec![
+            Field::new("id", DataType::Int64, false),
+            Field::new("age", DataType::Int32, false),
+            Field::new("score", DataType::Float64, false),
+        ]);
+
+        let struct_array = StructArray::new(
+            fields,
+            vec![
+                Arc::new(id_array),
+                Arc::new(age_array),
+                Arc::new(score_array),
+            ],
+            None,
+        );
+
+        let result = cast_to_variant(&struct_array).unwrap();
+        assert_eq!(result.len(), size);
+
+        // Verify a few sample rows
+        let variant0 = result.value(0);
+        let obj0 = variant0.as_object().unwrap();
+        assert_eq!(obj0.get("id"), Some(Variant::from(0i64)));
+        assert_eq!(obj0.get("age"), Some(Variant::from(0i32)));
+        assert_eq!(obj0.get("score"), Some(Variant::from(0.0f64)));
+
+        let variant999 = result.value(999);
+        let obj999 = variant999.as_object().unwrap();
+        assert_eq!(obj999.get("id"), Some(Variant::from(999i64)));
+        assert_eq!(obj999.get("age"), Some(Variant::from(99i32))); // 999 % 
100 = 99
+        assert_eq!(obj999.get("score"), Some(Variant::from(99.9f64)));
+    }
+
+    #[test]
+    fn test_cast_to_variant_struct_performance_large() {
+        // Test with even larger struct and more fields to demonstrate 
optimization benefits
+        let size = 10000;
+        let num_fields = 10;
+
+        // Create arrays for many fields
+        let mut field_arrays: Vec<ArrayRef> = Vec::new();
+        let mut fields = Vec::new();
+
+        for field_idx in 0..num_fields {
+            match field_idx % 4 {
+                0 => {
+                    // Int64 fields
+                    let array = Int64Array::from(
+                        (0..size)
+                            .map(|i| Some(i as i64 + field_idx as i64))
+                            .collect::<Vec<_>>(),
+                    );
+                    field_arrays.push(Arc::new(array));
+                    fields.push(Field::new(
+                        format!("int_field_{}", field_idx),
+                        DataType::Int64,
+                        false,
+                    ));
+                }
+                1 => {
+                    // Int32 fields
+                    let array = Int32Array::from(
+                        (0..size)
+                            .map(|i| Some((i % 1000) as i32 + field_idx as 
i32))
+                            .collect::<Vec<_>>(),
+                    );
+                    field_arrays.push(Arc::new(array));
+                    fields.push(Field::new(
+                        format!("int32_field_{}", field_idx),
+                        DataType::Int32,
+                        false,
+                    ));
+                }
+                2 => {
+                    // Float64 fields
+                    let array = Float64Array::from(
+                        (0..size)
+                            .map(|i| Some(i as f64 * 0.1 + field_idx as f64))
+                            .collect::<Vec<_>>(),
+                    );
+                    field_arrays.push(Arc::new(array));
+                    fields.push(Field::new(
+                        format!("float_field_{}", field_idx),
+                        DataType::Float64,
+                        false,
+                    ));
+                }
+                _ => {
+                    // Binary fields
+                    let binary_data: Vec<Option<&[u8]>> = (0..size)
+                        .map(|i| {
+                            // Use static data to avoid lifetime issues in 
tests
+                            match i % 3 {
+                                0 => Some(b"test_data_0" as &[u8]),
+                                1 => Some(b"test_data_1" as &[u8]),
+                                _ => Some(b"test_data_2" as &[u8]),
+                            }
+                        })
+                        .collect();
+                    let array = BinaryArray::from(binary_data);
+                    field_arrays.push(Arc::new(array));
+                    fields.push(Field::new(
+                        format!("binary_field_{}", field_idx),
+                        DataType::Binary,
+                        false,
+                    ));
+                }
+            }
+        }
+
+        let struct_array = StructArray::new(Fields::from(fields), 
field_arrays, None);
+
+        let result = cast_to_variant(&struct_array).unwrap();
+        assert_eq!(result.len(), size);
+
+        // Verify a sample of rows
+        for sample_idx in [0, size / 4, size / 2, size - 1] {
+            let variant = result.value(sample_idx);
+            let obj = variant.as_object().unwrap();
+
+            // Should have all fields
+            assert_eq!(obj.len(), num_fields);
+
+            // Verify a few field values
+            if let Some(int_field_0) = obj.get("int_field_0") {
+                assert_eq!(int_field_0, Variant::from(sample_idx as i64));
+            }
+            if let Some(float_field_2) = obj.get("float_field_2") {
+                assert_eq!(float_field_2, Variant::from(sample_idx as f64 * 
0.1 + 2.0));
+            }
+        }
+    }
+
+    #[test]
+    fn test_cast_to_variant_nested_struct() {
+        // Test nested struct: person with location struct
+        let id_array = Int64Array::from(vec![Some(1001), Some(1002)]);
+        let x_array = Float64Array::from(vec![Some(40.7), Some(37.8)]);
+        let y_array = Float64Array::from(vec![Some(-74.0), Some(-122.4)]);
+
+        // Create location struct
+        let location_fields = Fields::from(vec![
+            Field::new("x", DataType::Float64, true),
+            Field::new("y", DataType::Float64, true),
+        ]);
+        let location_struct = StructArray::new(
+            location_fields.clone(),
+            vec![Arc::new(x_array), Arc::new(y_array)],
+            None,
+        );
+
+        // Create person struct containing location
+        let person_fields = Fields::from(vec![
+            Field::new("id", DataType::Int64, true),
+            Field::new("location", DataType::Struct(location_fields), true),
+        ]);
+        let person_struct = StructArray::new(
+            person_fields,
+            vec![Arc::new(id_array), Arc::new(location_struct)],
+            None,
+        );
+
+        let result = cast_to_variant(&person_struct).unwrap();
+        assert_eq!(result.len(), 2);
+
+        // Check first row
+        let variant1 = result.value(0);
+        let obj1 = variant1.as_object().unwrap();
+        assert_eq!(obj1.get("id"), Some(Variant::from(1001i64)));
+
+        let location_variant1 = obj1.get("location").unwrap();
+        let location_obj1 = location_variant1.as_object().unwrap();
+        assert_eq!(location_obj1.get("x"), Some(Variant::from(40.7f64)));
+        assert_eq!(location_obj1.get("y"), Some(Variant::from(-74.0f64)));
+
+        // Check second row
+        let variant2 = result.value(1);
+        let obj2 = variant2.as_object().unwrap();
+        assert_eq!(obj2.get("id"), Some(Variant::from(1002i64)));
+
+        let location_variant2 = obj2.get("location").unwrap();
+        let location_obj2 = location_variant2.as_object().unwrap();
+        assert_eq!(location_obj2.get("x"), Some(Variant::from(37.8f64)));
+        assert_eq!(location_obj2.get("y"), Some(Variant::from(-122.4f64)));
+    }
+
     /// Converts the given `Array` to a `VariantArray` and tests the conversion
     /// against the expected values. It also tests the handling of nulls by
     /// setting one element to null and verifying the output.

Reply via email to