This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 9413cd3ffdc Add the ability for Maps to cast to another case where the 
field names are different (#5703)
9413cd3ffdc is described below

commit 9413cd3ffdccdc529e44b7aa9d77c9565f7ecaca
Author: Michael Maletich <[email protected]>
AuthorDate: Sat Jun 22 06:19:52 2024 -0500

    Add the ability for Maps to cast to another case where the field names are 
different (#5703)
    
    * Add the ability for Maps to cast to another case where the field names 
are different.
    
    Arrow Maps have field names for the elements of the fields; the field names 
are allowed to be any value and do not affect the type of the data.
    
    This allows a Map where the field names are key_value, key, value to be 
cast to a Map where the field names are entries, keys, values.
    
    This can be helpful in merging record batches that may have come from 
different sources.  This also makes maps behave similarly to lists, which also 
have a field to distinguish their elements.
    
    * Apply suggestions from code review
    
    Co-authored-by: Andrew Lamb <[email protected]>
    
    * Feedback from code review
    
    - simplify map casting logic to reuse the entries
    - Added unit tests for negative cases
    - Use MapBuilder to make the intended type clearer.
    
    * fix formatting
    
    * Lint and format
    
    * correctly set the null fields
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-cast/src/cast/map.rs |  74 ++++++++++++++
 arrow-cast/src/cast/mod.rs | 245 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 319 insertions(+)

diff --git a/arrow-cast/src/cast/map.rs b/arrow-cast/src/cast/map.rs
new file mode 100644
index 00000000000..d62a9519b7b
--- /dev/null
+++ b/arrow-cast/src/cast/map.rs
@@ -0,0 +1,74 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::cast::*;
+
+/// Helper function that casts a map's keys and values to the target map type.
+pub(crate) fn cast_map_values(
+    from: &MapArray,
+    to_data_type: &DataType,
+    cast_options: &CastOptions,
+    to_ordered: bool,
+) -> Result<ArrayRef, ArrowError> {
+    let entries_field = if let DataType::Map(entries_field, _) = to_data_type {
+        entries_field
+    } else {
+        return Err(ArrowError::CastError(
+            "Internal Error: to_data_type is not a map type.".to_string(),
+        ));
+    };
+
+    let key_field = key_field(entries_field).ok_or(ArrowError::CastError(
+        "map is missing key field".to_string(),
+    ))?;
+    let value_field = value_field(entries_field).ok_or(ArrowError::CastError(
+        "map is missing value field".to_string(),
+    ))?;
+
+    let key_array = cast_with_options(from.keys(), key_field.data_type(), 
cast_options)?;
+    let value_array = cast_with_options(from.values(), 
value_field.data_type(), cast_options)?;
+
+    Ok(Arc::new(MapArray::new(
+        entries_field.clone(),
+        from.offsets().clone(),
+        StructArray::new(
+            Fields::from(vec![key_field, value_field]),
+            vec![key_array, value_array],
+            from.entries().nulls().cloned(),
+        ),
+        from.nulls().cloned(),
+        to_ordered,
+    )))
+}
+
+/// Gets the key field (the first child) from the entries of a map.  Returns None 
if the entries field is not a struct or the struct has no fields.
+pub(crate) fn key_field(entries_field: &FieldRef) -> Option<FieldRef> {
+    if let DataType::Struct(fields) = entries_field.data_type() {
+        fields.first().cloned()
+    } else {
+        None
+    }
+}
+
+/// Gets the value field (the second child) from the entries of a map.  Returns 
None if the entries field is not a struct or has fewer than two fields.
+pub(crate) fn value_field(entries_field: &FieldRef) -> Option<FieldRef> {
+    if let DataType::Struct(fields) = entries_field.data_type() {
+        fields.get(1).cloned()
+    } else {
+        None
+    }
+}
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 55f2ed72836..7a6e1a31bb4 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -40,10 +40,12 @@
 mod decimal;
 mod dictionary;
 mod list;
+mod map;
 mod string;
 use crate::cast::decimal::*;
 use crate::cast::dictionary::*;
 use crate::cast::list::*;
+use crate::cast::map::*;
 use crate::cast::string::*;
 
 use arrow_buffer::IntervalMonthDayNano;
@@ -159,6 +161,12 @@ pub fn can_cast_types(from_type: &DataType, to_type: 
&DataType) -> bool {
             can_cast_types(from_type, list_to.data_type())},
         (FixedSizeList(list_from,size), _) if *size == 1 => {
             can_cast_types(list_from.data_type(), to_type)},
+        (Map(from_entries,ordered_from), Map(to_entries, ordered_to)) if 
ordered_from == ordered_to =>
+            match (key_field(from_entries), key_field(to_entries), 
value_field(from_entries), value_field(to_entries)) {
+                (Some(from_key), Some(to_key), Some(from_value), 
Some(to_value)) =>
+                    can_cast_types(from_key.data_type(), to_key.data_type()) 
&& can_cast_types(from_value.data_type(), to_value.data_type()),
+                _ => false
+            },
         // cast one decimal type to another decimal type
         (Decimal128(_, _), Decimal128(_, _)) => true,
         (Decimal256(_, _), Decimal256(_, _)) => true,
@@ -802,6 +810,9 @@ pub fn cast_with_options(
         (FixedSizeList(_, size), _) if *size == 1 => {
             cast_single_element_fixed_size_list_to_values(array, to_type, 
cast_options)
         }
+        (Map(_, ordered1), Map(_, ordered2)) if ordered1 == ordered2 => {
+            cast_map_values(array.as_map(), to_type, cast_options, 
ordered1.to_owned())
+        }
         (Decimal128(_, s1), Decimal128(p2, s2)) => {
             cast_decimal_to_decimal_same_type::<Decimal128Type>(
                 array.as_primitive(),
@@ -7361,6 +7372,240 @@ mod tests {
         FixedSizeListArray::from(list_data)
     }
 
+    #[test]
+    fn test_cast_map_dont_allow_change_of_order() {
+        let string_builder = StringBuilder::new();
+        let value_builder = StringBuilder::new();
+        let mut builder = MapBuilder::new(
+            Some(MapFieldNames {
+                entry: "entries".to_string(),
+                key: "key".to_string(),
+                value: "value".to_string(),
+            }),
+            string_builder,
+            value_builder,
+        );
+
+        builder.keys().append_value("0");
+        builder.values().append_value("test_val_1");
+        builder.append(true).unwrap();
+        builder.keys().append_value("1");
+        builder.values().append_value("test_val_2");
+        builder.append(true).unwrap();
+
+        // map builder returns unsorted map by default
+        let array = builder.finish();
+
+        let new_ordered = true;
+        let new_type = DataType::Map(
+            Arc::new(Field::new(
+                "entries",
+                DataType::Struct(
+                    vec![
+                        Field::new("key", DataType::Utf8, false),
+                        Field::new("value", DataType::Utf8, false),
+                    ]
+                    .into(),
+                ),
+                false,
+            )),
+            new_ordered,
+        );
+
+        let new_array_result = cast(&array, &new_type.clone());
+        assert!(!can_cast_types(array.data_type(), &new_type));
+        assert!(
+            matches!(new_array_result, Err(ArrowError::CastError(t)) if t == 
r#"Casting from Map(Field { name: "entries", data_type: Struct([Field { name: 
"key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }, Field { name: "value", data_type: Utf8, nullable: true, 
dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 
0, dict_is_ordered: false, metadata: {} }, false) to Map(Field { name: 
"entries", data_type: Struct([Field { [...]
+        );
+    }
+
+    #[test]
+    fn test_cast_map_dont_allow_when_container_cant_cast() {
+        let string_builder = StringBuilder::new();
+        let value_builder = IntervalDayTimeArray::builder(2);
+        let mut builder = MapBuilder::new(
+            Some(MapFieldNames {
+                entry: "entries".to_string(),
+                key: "key".to_string(),
+                value: "value".to_string(),
+            }),
+            string_builder,
+            value_builder,
+        );
+
+        builder.keys().append_value("0");
+        builder.values().append_value(IntervalDayTime::new(1, 1));
+        builder.append(true).unwrap();
+        builder.keys().append_value("1");
+        builder.values().append_value(IntervalDayTime::new(2, 2));
+        builder.append(true).unwrap();
+
+        // map builder returns unsorted map by default
+        let array = builder.finish();
+
+        let new_ordered = true;
+        let new_type = DataType::Map(
+            Arc::new(Field::new(
+                "entries",
+                DataType::Struct(
+                    vec![
+                        Field::new("key", DataType::Utf8, false),
+                        Field::new("value", 
DataType::Duration(TimeUnit::Second), false),
+                    ]
+                    .into(),
+                ),
+                false,
+            )),
+            new_ordered,
+        );
+
+        let new_array_result = cast(&array, &new_type.clone());
+        assert!(!can_cast_types(array.data_type(), &new_type));
+        assert!(
+            matches!(new_array_result, Err(ArrowError::CastError(t)) if t == 
r#"Casting from Map(Field { name: "entries", data_type: Struct([Field { name: 
"key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }, Field { name: "value", data_type: Interval(DayTime), nullable: 
true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }, false) to Map(Field { name: 
"entries", data_type: St [...]
+        );
+    }
+
+    #[test]
+    fn test_cast_map_field_names() {
+        let string_builder = StringBuilder::new();
+        let value_builder = StringBuilder::new();
+        let mut builder = MapBuilder::new(
+            Some(MapFieldNames {
+                entry: "entries".to_string(),
+                key: "key".to_string(),
+                value: "value".to_string(),
+            }),
+            string_builder,
+            value_builder,
+        );
+
+        builder.keys().append_value("0");
+        builder.values().append_value("test_val_1");
+        builder.append(true).unwrap();
+        builder.keys().append_value("1");
+        builder.values().append_value("test_val_2");
+        builder.append(true).unwrap();
+        builder.append(false).unwrap();
+
+        let array = builder.finish();
+
+        let new_type = DataType::Map(
+            Arc::new(Field::new(
+                "entries_new",
+                DataType::Struct(
+                    vec![
+                        Field::new("key_new", DataType::Utf8, false),
+                        Field::new("value_values", DataType::Utf8, false),
+                    ]
+                    .into(),
+                ),
+                false,
+            )),
+            false,
+        );
+
+        assert_ne!(new_type, array.data_type().clone());
+
+        let new_array = cast(&array, &new_type.clone()).unwrap();
+        assert_eq!(new_type, new_array.data_type().clone());
+        let map_array = new_array.as_map();
+
+        assert_ne!(new_type, array.data_type().clone());
+        assert_eq!(new_type, map_array.data_type().clone());
+
+        let key_string = map_array
+            .keys()
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap()
+            .into_iter()
+            .flatten()
+            .collect::<Vec<_>>();
+        assert_eq!(&key_string, &vec!["0", "1"]);
+
+        let values_string_array = cast(map_array.values(), 
&DataType::Utf8).unwrap();
+        let values_string = values_string_array
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap()
+            .into_iter()
+            .flatten()
+            .collect::<Vec<_>>();
+        assert_eq!(&values_string, &vec!["test_val_1", "test_val_2"]);
+
+        assert_eq!(
+            map_array.nulls(),
+            Some(&NullBuffer::from(vec![true, true, false]))
+        );
+    }
+
+    #[test]
+    fn test_cast_map_contained_values() {
+        let string_builder = StringBuilder::new();
+        let value_builder = Int8Builder::new();
+        let mut builder = MapBuilder::new(
+            Some(MapFieldNames {
+                entry: "entries".to_string(),
+                key: "key".to_string(),
+                value: "value".to_string(),
+            }),
+            string_builder,
+            value_builder,
+        );
+
+        builder.keys().append_value("0");
+        builder.values().append_value(44);
+        builder.append(true).unwrap();
+        builder.keys().append_value("1");
+        builder.values().append_value(22);
+        builder.append(true).unwrap();
+
+        let array = builder.finish();
+
+        let new_type = DataType::Map(
+            Arc::new(Field::new(
+                "entries",
+                DataType::Struct(
+                    vec![
+                        Field::new("key", DataType::Utf8, false),
+                        Field::new("value", DataType::Utf8, false),
+                    ]
+                    .into(),
+                ),
+                false,
+            )),
+            false,
+        );
+
+        let new_array = cast(&array, &new_type.clone()).unwrap();
+        assert_eq!(new_type, new_array.data_type().clone());
+        let map_array = new_array.as_map();
+
+        assert_ne!(new_type, array.data_type().clone());
+        assert_eq!(new_type, map_array.data_type().clone());
+
+        let key_string = map_array
+            .keys()
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap()
+            .into_iter()
+            .flatten()
+            .collect::<Vec<_>>();
+        assert_eq!(&key_string, &vec!["0", "1"]);
+
+        let values_string_array = cast(map_array.values(), 
&DataType::Utf8).unwrap();
+        let values_string = values_string_array
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap()
+            .into_iter()
+            .flatten()
+            .collect::<Vec<_>>();
+        assert_eq!(&values_string, &vec!["44", "22"]);
+    }
+
     #[test]
     fn test_utf8_cast_offsets() {
         // test if offset of the array is taken into account during cast

Reply via email to