alamb commented on code in PR #6218:
URL: https://github.com/apache/arrow-rs/pull/6218#discussion_r1715930797
##########
arrow-array/src/array/union_array.rs:
##########
@@ -296,7 +296,7 @@ impl UnionArray {
}
/// Returns whether the `UnionArray` is dense (or sparse if `false`).
- fn is_dense(&self) -> bool {
+ pub fn is_dense(&self) -> bool {
Review Comment:
👍
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -2442,9 +2559,37 @@ where
Ok(Arc::new(byte_array_builder.finish()))
}
+/// Get a field of the union as an array, this is equivalent to
[`UnionArray::child`] for sparse unions,
+/// but builds an array of the right length for dense unions.
+///
+/// # Panics
+/// Panics if `type_id` is not present in the union.
+fn union_field_array(union_array: &UnionArray, type_id: i8) ->
Result<Cow<ArrayRef>, ArrowError> {
Review Comment:
Since an ArrayRef is already an `Arc` I suspect the extra `Cow` here doesn't
matter much
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -2417,9 +2518,37 @@ where
Ok(Arc::new(byte_array_builder.finish()))
}
+/// Get a field of the union as an array, this is equivalent to
[`UnionArray::child`] for sparse unions,
+/// but builds an array of the right length for dense unions.
+///
+/// # Panics
+/// Panics if `type_id` is not present in the union.
+fn union_field_array(union_array: &UnionArray, type_id: i8) ->
Result<Cow<ArrayRef>, ArrowError> {
+ let child = union_array.child(type_id);
+ match union_array.offsets() {
+ Some(offsets) => {
+ let data_type = child.data_type();
+ let sub_arrays: Vec<ArrayRef> = offsets
+ .iter()
+ .zip(union_array.type_ids().iter())
+ .map(|(offset, id)| {
+ if id == &type_id {
+ child.slice(*offset as usize, 1)
+ } else {
+ new_null_array(data_type, 1)
Review Comment:
This will definitely be slow.
I think the better API is this:
https://docs.rs/arrow/latest/arrow/array/struct.MutableArrayData.html (which
lets you copy chunks of data from different arrays)
Though if what you are trying to do is to get one child array and use the
nulls of the parent, you could potentially just replace the null buffer directly
I can probably have some more specific suggestions once we sort out what the
semantics of casting a union array are supposed to be (see my other comment)
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -812,6 +836,115 @@ pub fn cast_with_options(
(Map(_, ordered1), Map(_, ordered2)) if ordered1 == ordered2 => {
cast_map_values(array.as_map(), to_type, cast_options,
ordered1.to_owned())
}
+
+ // structs
+ (Struct(_), Struct(to_fields)) => {
+ let array = array.as_struct();
+ let fields = array
+ .columns()
+ .iter()
+ .zip(to_fields.iter())
+ .map(|(l, field)| cast_with_options(l, field.data_type(),
cast_options))
+ .collect::<Result<Vec<ArrayRef>, ArrowError>>()?;
+ let array = StructArray::try_new(to_fields.clone(), fields,
array.nulls().cloned())?;
+ Ok(Arc::new(array) as ArrayRef)
+ }
+ (Struct(_), _) => Err(ArrowError::CastError(
+ "Cannot cast from struct to other types except struct".to_string(),
+ )),
+ (_, Struct(_)) => Err(ArrowError::CastError(
+ "Cannot cast to struct from other types except struct".to_string(),
+ )),
+
+ // unions
+ // we might be able to support this, but it's complex
Review Comment:
👍 -- it is also not well specified if there are multiple field types that
could be the target
For example, what would `Union(Int32, Int64) --> Union(Float64, Decimal128)`
do? Would it cast all the integers to Float64 Or to Decimal128? Or something
else 🤔
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -167,6 +174,26 @@ pub fn can_cast_types(from_type: &DataType, to_type:
&DataType) -> bool {
can_cast_types(from_key.data_type(), to_key.data_type())
&& can_cast_types(from_value.data_type(), to_value.data_type()),
_ => false
},
+
+ (Struct(from_fields), Struct(to_fields)) => {
+ from_fields.len() == to_fields.len() &&
+ from_fields.iter().zip(to_fields.iter()).all(|(f1, f2)| {
+ // Assume that nullability between two structs are
compatible, if not,
+ // cast kernel will return error.
+ can_cast_types(f1.data_type(), f2.data_type())
+ })
+ }
+ (Struct(_), _) => false,
+ (_, Struct(_)) => false,
+
+ (Union(_, _), Union(_, _)) => false,
+ (Union(from_fields, _), _) => {
+ from_fields.iter().any(|(_, f)| can_cast_types(f.data_type(),
to_type))
Review Comment:
Shouldn't this verify we can cast *ALL* the possible union field types to
the target type , not just one of the fields?
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -9524,4 +9669,192 @@ mod tests {
"Cast non-nullable to non-nullable struct field returning null
should fail",
);
}
+
+ fn union_fields() -> UnionFields {
+ [
+ (0, Arc::new(Field::new("A", DataType::Int32, true))),
+ (1, Arc::new(Field::new("B", DataType::Float64, true))),
+ (2, Arc::new(Field::new("C", DataType::Utf8, true))),
+ ]
+ .into_iter()
+ .collect()
+ }
+
+ fn as_int_vec<T: ArrowPrimitiveType>(array: &ArrayRef) ->
Vec<Option<T::Native>> {
+ array
+ .as_any()
+ .downcast_ref::<PrimitiveArray<T>>()
+ .unwrap()
+ .iter()
+ .collect()
+ }
+
+ fn as_str_vec(array: &ArrayRef) -> Vec<Option<&str>> {
+ array
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .unwrap()
+ .iter()
+ .collect()
+ }
+
+ #[test]
+ fn sparse_union_cast() {
+ // union of [{A=1}, {A=}, {B=3.2}, {C="a"}]
+ let int_array = Int32Array::from(vec![Some(1), None, None, None]);
+ let float_array = Float64Array::from(vec![None, None, Some(3.2),
None]);
+ let str_array = StringArray::from(vec![None, None, None, Some("a")]);
+ let type_ids = [0, 0, 1, 2].into_iter().collect::<ScalarBuffer<i8>>();
+
+ let children = vec![
+ Arc::new(int_array) as Arc<dyn Array>,
+ Arc::new(float_array),
+ Arc::new(str_array),
+ ];
+
+ let union_array = UnionArray::try_new(union_fields(), type_ids, None,
children).unwrap();
+
+ assert!(can_cast_types(union_array.data_type(), &DataType::Int64));
+ assert!(can_cast_types(&DataType::Int64, union_array.data_type()));
+
+ assert!(can_cast_types(union_array.data_type(), &DataType::Float64));
+ assert!(can_cast_types(&DataType::Float64, union_array.data_type()));
+
+ assert!(can_cast_types(union_array.data_type(), &DataType::Utf8));
+ assert!(can_cast_types(&DataType::Utf8, union_array.data_type()));
+
+ assert!(!can_cast_types(
+ union_array.data_type(),
+ &DataType::Duration(TimeUnit::Second)
+ ));
+ // Duration to Utf8 is allowed
+ assert!(can_cast_types(
+ &DataType::Duration(TimeUnit::Second),
+ union_array.data_type()
+ ));
+
+ let cast_array = cast(&union_array, &DataType::Int64).unwrap();
+ let as_int64 = as_int_vec::<Int64Type>(&cast_array);
+ assert_eq!(as_int64, vec![Some(1), None, None, None]);
Review Comment:
What are the expected semantics of casting a UnionArray to another type?
What this test seems to verify is some sort of TAKE semantics, where casting
from `UnionArray` to `Int64` basically picks the Union variant that is the
closest and takes
So it does
```
cast(union of [{A=1}, {A=}, {B=3.2}], Int64) --> [1, NULL, NULL]
```
I sort of expected that the result of casting a UnionArray would be to cast
all the elements individually to the target type. So in this example
```
cast (union of [{A=1}, {A=}, {B=3.2}]) --> [1, NULL, 3]
```
Where the `3` results from casting `3.2` to `3`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]