This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 57eeb266af arrow-cast: Add ability to cast plain struct to dictionary (#10039)
57eeb266af is described below

commit 57eeb266af09f7fee47b6d4265ed2bdff6746929
Author: Frederic Branczyk <[email protected]>
AuthorDate: Tue Jun 2 14:55:53 2026 +0000

    arrow-cast: Add ability to cast plain struct to dictionary (#10039)
    
    # Which issue does this PR close?
    
    - Closes #10038
    
    # What changes are included in this PR?
    
    A naive implementation of casting plain structs to dictionaries that
    does not perform any deduplication: each row gets its own identity key.
    
    # Are these changes tested?
    
    Unit tests added.
    
    # Are there any user-facing changes?
    
    No, just a new feature.
    
    @alamb @Jefffrey
---
 arrow-cast/src/cast/dictionary.rs |  39 ++++++++++++
 arrow-cast/src/cast/mod.rs        | 126 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 165 insertions(+)

diff --git a/arrow-cast/src/cast/dictionary.rs b/arrow-cast/src/cast/dictionary.rs
index 601f50a4d0..83aa691482 100644
--- a/arrow-cast/src/cast/dictionary.rs
+++ b/arrow-cast/src/cast/dictionary.rs
@@ -315,12 +315,51 @@ pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
         FixedSizeBinary(byte_size) => {
             pack_byte_to_fixed_size_dictionary::<K>(array, cast_options, byte_size)
         }
+        Struct(_) => pack_struct_to_dictionary::<K>(array, dict_value_type, cast_options),
         _ => Err(ArrowError::CastError(format!(
             "Unsupported output type for dictionary packing: {dict_value_type}"
         ))),
     }
 }
 
+/// Wrap a struct-valued array as a `DictionaryArray<K, Struct>` with identity
+/// keys `[0, 1, ..., len-1]`. Unlike the primitive / byte packers above, no
+/// deduplication is performed, since struct values have no general hash/equality
+/// builder in arrow-rs.
+///
+/// Each child field of the source is recursively cast to the matching field of
+/// `dict_value_type` via `cast_with_options` before keys are emitted. If that
+/// cast fails the whole pack fails (the same contract as the primitive packers
+/// above); a valid row whose index does not fit in `K` yields a `CastError`.
+fn pack_struct_to_dictionary<K: ArrowDictionaryKeyType>(
+    array: &dyn Array,
+    dict_value_type: &DataType,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
+    let len = cast_values.len();
+
+    // Identity keys `[0, 1, ..., len-1]`, with null entries wherever the
+    // source row is null so the dictionary's logical null mask matches.
+    let mut builder = PrimitiveBuilder::<K>::with_capacity(len);
+    for i in 0..len {
+        if cast_values.is_null(i) {
+            builder.append_null();
+        } else {
+            let key = K::Native::from_usize(i).ok_or_else(|| {
+                ArrowError::CastError(format!(
+                    "Cannot fit {len} dictionary keys in {:?}",
+                    K::DATA_TYPE,
+                ))
+            })?;
+            builder.append_value(key);
+        }
+    }
+    let keys = builder.finish();
+
+    Ok(Arc::new(DictionaryArray::<K>::try_new(keys, cast_values)?))
+}
+
 // Packs the data from the primitive array of type <V> to a
 // DictionaryArray with keys of type K and values of value_type V
 pub(crate) fn pack_numeric_to_dictionary<K, V>(
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 0367a54121..4d67703ea6 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -6278,6 +6278,132 @@ mod tests {
         assert_ne!(keys.value(0), keys.value(1));
     }
 
+    #[test]
+    fn test_cast_struct_array_to_dict_struct() {
+        // Cast a StructArray into Dictionary<UInt32, Struct{…}>. The dictionary
+        // value type's child fields may differ from the source's (here:
+        // Utf8 source → Utf8View child for `name`), so the per-field cast
+        // must run before identity keys are emitted. This is the "as long as
+        // the struct can be cast to the dict value" contract.
+        let names = StringArray::from(vec![Some("alpha"), None, Some("gamma")]);
+        let ids = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
+        let source = StructArray::from(vec![
+            (
+                Arc::new(Field::new("name", DataType::Utf8, true)),
+                Arc::new(names) as ArrayRef,
+            ),
+            (
+                Arc::new(Field::new("id", DataType::Int32, false)),
+                Arc::new(ids) as ArrayRef,
+            ),
+        ]);
+
+        let target_value_type = DataType::Struct(
+            vec![
+                Field::new("name", DataType::Utf8View, true),
+                Field::new("id", DataType::Int64, false),
+            ]
+            .into(),
+        );
+        let cast_type = DataType::Dictionary(
+            Box::new(DataType::UInt32),
+            Box::new(target_value_type.clone()),
+        );
+        assert!(can_cast_types(source.data_type(), &cast_type));
+
+        let cast_array = cast(&source, &cast_type).unwrap();
+        assert_eq!(cast_array.data_type(), &cast_type);
+        assert_eq!(cast_array.len(), 3);
+
+        let dict = cast_array.as_dictionary::<UInt32Type>();
+        assert_eq!(dict.values().data_type(), &target_value_type);
+        // No dedup is performed for struct values — one row, one key.
+        assert_eq!(dict.values().len(), 3);
+
+        // Source row 1 was a `Utf8`-null in the `name` field but the whole
+        // struct row was valid (StructArray::from above takes per-field
+        // nulls only). The dictionary's logical null mask therefore mirrors
+        // the source struct's row-level null mask — all rows valid here.
+        let keys = dict.keys();
+        assert_eq!(keys.values(), &[0u32, 1, 2]);
+        assert_eq!(keys.null_count(), 0);
+
+        let struct_values = dict.values().as_struct();
+        let names_out = struct_values
+            .column_by_name("name")
+            .unwrap()
+            .as_string_view();
+        assert_eq!(names_out.value(0), "alpha");
+        assert!(names_out.is_null(1));
+        assert_eq!(names_out.value(2), "gamma");
+        let ids_out = struct_values
+            .column_by_name("id")
+            .unwrap()
+            .as_primitive::<Int64Type>();
+        assert_eq!(ids_out.values(), &[1i64, 2, 3]);
+    }
+
+    #[test]
+    fn test_cast_struct_array_to_dict_struct_row_nulls() {
+        // Row-level nulls on the source struct must surface as null keys on
+        // the dictionary, since the dictionary's logical null mask is
+        // determined by the keys.
+        let names = StringArray::from(vec![Some("alpha"), Some("beta"), Some("gamma")]);
+        let ids = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
+        let source = StructArray::try_new(
+            vec![
+                Field::new("name", DataType::Utf8, true),
+                Field::new("id", DataType::Int32, false),
+            ]
+            .into(),
+            vec![Arc::new(names) as ArrayRef, Arc::new(ids) as ArrayRef],
+            Some(NullBuffer::from(vec![true, false, true])),
+        )
+        .unwrap();
+
+        let target_value_type = DataType::Struct(
+            vec![
+                Field::new("name", DataType::Utf8, true),
+                Field::new("id", DataType::Int32, false),
+            ]
+            .into(),
+        );
+        let cast_type =
+            DataType::Dictionary(Box::new(DataType::UInt32), Box::new(target_value_type));
+
+        let cast_array = cast(&source, &cast_type).unwrap();
+        let dict = cast_array.as_dictionary::<UInt32Type>();
+        assert_eq!(dict.len(), 3);
+        let keys = dict.keys();
+        assert!(!keys.is_null(0));
+        assert!(keys.is_null(1));
+        assert!(!keys.is_null(2));
+    }
+
+    #[test]
+    fn test_cast_struct_array_to_dict_struct_key_overflow() {
+        // Source has 300 rows but the dictionary key type is UInt8 (max 255).
+        // We must return a CastError instead of silently truncating.
+        let n = 300;
+        let names = StringArray::from((0..n).map(|i| Some(format!("v{i}"))).collect::<Vec<_>>());
+        let source = StructArray::from(vec![(
+            Arc::new(Field::new("name", DataType::Utf8, true)),
+            Arc::new(names) as ArrayRef,
+        )]);
+
+        let cast_type = DataType::Dictionary(
+            Box::new(DataType::UInt8),
+            Box::new(DataType::Struct(
+                vec![Field::new("name", DataType::Utf8, true)].into(),
+            )),
+        );
+        let err = cast(&source, &cast_type).unwrap_err().to_string();
+        assert!(
+            err.contains("Cannot fit") && err.contains("dictionary keys"),
+            "expected key-overflow error, got: {err}"
+        );
+    }
+
     #[test]
     fn test_cast_empty_string_array_to_dict_utf8_view() {
         let array = StringArray::from(Vec::<Option<&str>>::new());

Reply via email to