alamb commented on code in PR #6218:
URL: https://github.com/apache/arrow-rs/pull/6218#discussion_r1715930797


##########
arrow-array/src/array/union_array.rs:
##########
@@ -296,7 +296,7 @@ impl UnionArray {
     }
 
     /// Returns whether the `UnionArray` is dense (or sparse if `false`).
-    fn is_dense(&self) -> bool {
+    pub fn is_dense(&self) -> bool {

Review Comment:
   👍 



##########
arrow-cast/src/cast/mod.rs:
##########
@@ -2442,9 +2559,37 @@ where
     Ok(Arc::new(byte_array_builder.finish()))
 }
 
+/// Get a field of the union as an array, this is equivalent to 
[`UnionArray::child`] for sparse unions,
+/// but builds an array of the right length for dense unions.
+///
+/// # Panics
+/// Panics if `type_id` is not present in the union.
+fn union_field_array(union_array: &UnionArray, type_id: i8) -> 
Result<Cow<ArrayRef>, ArrowError> {

Review Comment:
   Since an ArrayRef is already an `Arc` I suspect the extra `Cow` here doesn't 
matter much



##########
arrow-cast/src/cast/mod.rs:
##########
@@ -2417,9 +2518,37 @@ where
     Ok(Arc::new(byte_array_builder.finish()))
 }
 
+/// Get a field of the union as an array, this is equivalent to 
[`UnionArray::child`] for sparse unions,
+/// but builds an array of the right length for dense unions.
+///
+/// # Panics
+/// Panics if `type_id` is not present in the union.
+fn union_field_array(union_array: &UnionArray, type_id: i8) -> 
Result<Cow<ArrayRef>, ArrowError> {
+    let child = union_array.child(type_id);
+    match union_array.offsets() {
+        Some(offsets) => {
+            let data_type = child.data_type();
+            let sub_arrays: Vec<ArrayRef> = offsets
+                .iter()
+                .zip(union_array.type_ids().iter())
+                .map(|(offset, id)| {
+                    if id == &type_id {
+                        child.slice(*offset as usize, 1)
+                    } else {
+                        new_null_array(data_type, 1)

Review Comment:
   This will definitely be slow.
   
   I think the better API is this:  
https://docs.rs/arrow/latest/arrow/array/struct.MutableArrayData.html (which 
lets you copy chunks of data from different arrays)
   
   
   Though if what you are trying to do is to get one child array and use the 
nulls of the parent, you could potentially just replace the null buffer directly
   
   I can probably have some more specific suggestions once we sort out what the 
semantics of casting a union array are supposed to be (see my other comment)
   
   



##########
arrow-cast/src/cast/mod.rs:
##########
@@ -812,6 +836,115 @@ pub fn cast_with_options(
         (Map(_, ordered1), Map(_, ordered2)) if ordered1 == ordered2 => {
             cast_map_values(array.as_map(), to_type, cast_options, 
ordered1.to_owned())
         }
+
+        // structs
+        (Struct(_), Struct(to_fields)) => {
+            let array = array.as_struct();
+            let fields = array
+                .columns()
+                .iter()
+                .zip(to_fields.iter())
+                .map(|(l, field)| cast_with_options(l, field.data_type(), 
cast_options))
+                .collect::<Result<Vec<ArrayRef>, ArrowError>>()?;
+            let array = StructArray::try_new(to_fields.clone(), fields, 
array.nulls().cloned())?;
+            Ok(Arc::new(array) as ArrayRef)
+        }
+        (Struct(_), _) => Err(ArrowError::CastError(
+            "Cannot cast from struct to other types except struct".to_string(),
+        )),
+        (_, Struct(_)) => Err(ArrowError::CastError(
+            "Cannot cast to struct from other types except struct".to_string(),
+        )),
+
+        // unions
+        // we might be able to support this, but it's complex

Review Comment:
   👍  -- it is also not well specified if there are multiple field types that 
could be the target 
   
   For example, what would `Union(Int32, Int64) --> Union(Float64, Decimal128)` 
do?  Would it cast all the integers to Float64 Or to Decimal128? Or something 
else 🤔 



##########
arrow-cast/src/cast/mod.rs:
##########
@@ -167,6 +174,26 @@ pub fn can_cast_types(from_type: &DataType, to_type: 
&DataType) -> bool {
                     can_cast_types(from_key.data_type(), to_key.data_type()) 
&& can_cast_types(from_value.data_type(), to_value.data_type()),
                 _ => false
             },
+
+        (Struct(from_fields), Struct(to_fields)) => {
+            from_fields.len() == to_fields.len() &&
+                from_fields.iter().zip(to_fields.iter()).all(|(f1, f2)| {
+                    // Assume that nullability between two structs are 
compatible, if not,
+                    // cast kernel will return error.
+                    can_cast_types(f1.data_type(), f2.data_type())
+                })
+        }
+        (Struct(_), _) => false,
+        (_, Struct(_)) => false,
+
+        (Union(_, _), Union(_, _)) => false,
+        (Union(from_fields, _), _) => {
+            from_fields.iter().any(|(_, f)| can_cast_types(f.data_type(), 
to_type))

Review Comment:
   Shouldn't this verify we can cast *ALL* the possible union field types to 
the target type , not just one of the fields?



##########
arrow-cast/src/cast/mod.rs:
##########
@@ -9524,4 +9669,192 @@ mod tests {
             "Cast non-nullable to non-nullable struct field returning null 
should fail",
         );
     }
+
+    fn union_fields() -> UnionFields {
+        [
+            (0, Arc::new(Field::new("A", DataType::Int32, true))),
+            (1, Arc::new(Field::new("B", DataType::Float64, true))),
+            (2, Arc::new(Field::new("C", DataType::Utf8, true))),
+        ]
+        .into_iter()
+        .collect()
+    }
+
+    fn as_int_vec<T: ArrowPrimitiveType>(array: &ArrayRef) -> 
Vec<Option<T::Native>> {
+        array
+            .as_any()
+            .downcast_ref::<PrimitiveArray<T>>()
+            .unwrap()
+            .iter()
+            .collect()
+    }
+
+    fn as_str_vec(array: &ArrayRef) -> Vec<Option<&str>> {
+        array
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap()
+            .iter()
+            .collect()
+    }
+
+    #[test]
+    fn sparse_union_cast() {
+        // union of [{A=1}, {A=}, {B=3.2}, {C="a"}]
+        let int_array = Int32Array::from(vec![Some(1), None, None, None]);
+        let float_array = Float64Array::from(vec![None, None, Some(3.2), 
None]);
+        let str_array = StringArray::from(vec![None, None, None, Some("a")]);
+        let type_ids = [0, 0, 1, 2].into_iter().collect::<ScalarBuffer<i8>>();
+
+        let children = vec![
+            Arc::new(int_array) as Arc<dyn Array>,
+            Arc::new(float_array),
+            Arc::new(str_array),
+        ];
+
+        let union_array = UnionArray::try_new(union_fields(), type_ids, None, 
children).unwrap();
+
+        assert!(can_cast_types(union_array.data_type(), &DataType::Int64));
+        assert!(can_cast_types(&DataType::Int64, union_array.data_type()));
+
+        assert!(can_cast_types(union_array.data_type(), &DataType::Float64));
+        assert!(can_cast_types(&DataType::Float64, union_array.data_type()));
+
+        assert!(can_cast_types(union_array.data_type(), &DataType::Utf8));
+        assert!(can_cast_types(&DataType::Utf8, union_array.data_type()));
+
+        assert!(!can_cast_types(
+            union_array.data_type(),
+            &DataType::Duration(TimeUnit::Second)
+        ));
+        // Duration to Utf8 is allowed
+        assert!(can_cast_types(
+            &DataType::Duration(TimeUnit::Second),
+            union_array.data_type()
+        ));
+
+        let cast_array = cast(&union_array, &DataType::Int64).unwrap();
+        let as_int64 = as_int_vec::<Int64Type>(&cast_array);
+        assert_eq!(as_int64, vec![Some(1), None, None, None]);

Review Comment:
   What are the expected semantics of casting a UnionArray to another type?
   
   What this test seems to verify is some sort of TAKE semantics, where casting 
from `UnionArray` to `Int64` basically picks the Union variant that is the 
closest and takes
   
   So it does
   ```
   cast(union of [{A=1}, {A=}, {B=3.2}], Int64) --> [1, NULL, NULL]
   ```
   
   I sort of expected that the result of casting a UnionArray would be to cast 
all the elements individually to the target type. So in this example
   
   ```
   cast (union of [{A=1}, {A=}, {B=3.2}]) --> [1, NULL, 3]
   ```
   
   Where the `3` results from casting `3.2` to `3`
   
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to