This is an automated email from the ASF dual-hosted git repository.

jeffreyvo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 08dcc0b8c5 fix: cast Binary/String dictionary to view (#8912)
08dcc0b8c5 is described below

commit 08dcc0b8c552e81d612b53bc0ab2a0007bfe20a1
Author: Jeffrey Vo <[email protected]>
AuthorDate: Fri Dec 12 19:49:53 2025 +0900

    fix: cast Binary/String dictionary to view (#8912)
    
    # Which issue does this PR close?
    
    - Closes #8841
    
    # Rationale for this change
    
    Be able to successfully cast from Dictionary type to View types.
    
    # What changes are included in this PR?
    
    Add checks on which array types can use the fast path that was
    previously erroring.
    
    Also do a little refactoring in surrounding code.
    
    # Are these changes tested?
    
    Added new tests.
    
    # Are there any user-facing changes?
    
    No.
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-cast/src/cast/dictionary.rs | 187 +++++++++++++++++---------------------
 arrow-cast/src/cast/mod.rs        |  55 +++++++++++
 2 files changed, 137 insertions(+), 105 deletions(-)

diff --git a/arrow-cast/src/cast/dictionary.rs 
b/arrow-cast/src/cast/dictionary.rs
index c213ac2662..64d77236cc 100644
--- a/arrow-cast/src/cast/dictionary.rs
+++ b/arrow-cast/src/cast/dictionary.rs
@@ -28,111 +28,92 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
 ) -> Result<ArrayRef, ArrowError> {
     use DataType::*;
 
-    match to_type {
-        Dictionary(to_index_type, to_value_type) => {
-            let dict_array = array
-                .as_any()
-                .downcast_ref::<DictionaryArray<K>>()
-                .ok_or_else(|| {
-                    ArrowError::ComputeError(
-                        "Internal Error: Cannot cast dictionary to 
DictionaryArray of expected type".to_string(),
-                    )
-                })?;
+    let array = array.as_dictionary::<K>();
+    let from_child_type = array.values().data_type();
+    match (from_child_type, to_type) {
+        (_, Dictionary(to_index_type, to_value_type)) => {
+            dictionary_to_dictionary_cast(array, to_index_type, to_value_type, 
cast_options)
+        }
+        // `unpack_dictionary` can handle Utf8View/BinaryView types, but 
incurs unnecessary data
+        // copy of the value buffer. Fast path which avoids copying underlying 
values buffer.
+        // TODO: handle LargeUtf8/LargeBinary -> View (need to check offsets 
can fit)
+        // TODO: handle cross types (String -> BinaryView, Binary -> 
StringView)
+        //       (need to validate utf8?)
+        (Utf8, Utf8View) => view_from_dict_values::<K, Utf8Type, 
StringViewType>(
+            array.keys(),
+            array.values().as_string::<i32>(),
+        ),
+        (Binary, BinaryView) => view_from_dict_values::<K, BinaryType, 
BinaryViewType>(
+            array.keys(),
+            array.values().as_binary::<i32>(),
+        ),
+        _ => unpack_dictionary(array, to_type, cast_options),
+    }
+}
 
-            let keys_array: ArrayRef =
-                
Arc::new(PrimitiveArray::<K>::from(dict_array.keys().to_data()));
-            let values_array = dict_array.values();
-            let cast_keys = cast_with_options(&keys_array, to_index_type, 
cast_options)?;
-            let cast_values = cast_with_options(values_array, to_value_type, 
cast_options)?;
+fn dictionary_to_dictionary_cast<K: ArrowDictionaryKeyType>(
+    array: &DictionaryArray<K>,
+    to_index_type: &DataType,
+    to_value_type: &DataType,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    use DataType::*;
 
-            // Failure to cast keys (because they don't fit in the
-            // target type) results in NULL values;
-            if cast_keys.null_count() > keys_array.null_count() {
-                return Err(ArrowError::ComputeError(format!(
-                    "Could not convert {} dictionary indexes from {:?} to 
{:?}",
-                    cast_keys.null_count() - keys_array.null_count(),
-                    keys_array.data_type(),
-                    to_index_type
-                )));
-            }
+    let keys_array: ArrayRef = 
Arc::new(PrimitiveArray::<K>::from(array.keys().to_data()));
+    let values_array = array.values();
+    let cast_keys = cast_with_options(&keys_array, to_index_type, 
cast_options)?;
+    let cast_values = cast_with_options(values_array, to_value_type, 
cast_options)?;
 
-            let data = cast_keys.into_data();
-            let builder = data
-                .into_builder()
-                .data_type(to_type.clone())
-                .child_data(vec![cast_values.into_data()]);
+    // Failure to cast keys (because they don't fit in the
+    // target type) results in NULL values;
+    if cast_keys.null_count() > keys_array.null_count() {
+        return Err(ArrowError::ComputeError(format!(
+            "Could not convert {} dictionary indexes from {:?} to {:?}",
+            cast_keys.null_count() - keys_array.null_count(),
+            keys_array.data_type(),
+            to_index_type
+        )));
+    }
 
-            // Safety
-            // Cast keys are still valid
-            let data = unsafe { builder.build_unchecked() };
+    let data = cast_keys.into_data();
+    let builder = data
+        .into_builder()
+        .data_type(Dictionary(
+            Box::new(to_index_type.clone()),
+            Box::new(to_value_type.clone()),
+        ))
+        .child_data(vec![cast_values.into_data()]);
 
-            // create the appropriate array type
-            let new_array: ArrayRef = match **to_index_type {
-                Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
-                Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
-                Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
-                Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
-                UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
-                UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
-                UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
-                UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
-                _ => {
-                    return Err(ArrowError::CastError(format!(
-                        "Unsupported type {to_index_type} for dictionary index"
-                    )));
-                }
-            };
+    // Safety
+    // Cast keys are still valid
+    let data = unsafe { builder.build_unchecked() };
 
-            Ok(new_array)
+    // create the appropriate array type
+    let new_array: ArrayRef = match to_index_type {
+        Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
+        Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
+        Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
+        Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
+        UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
+        UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
+        UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
+        UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
+        _ => {
+            return Err(ArrowError::CastError(format!(
+                "Unsupported type {to_index_type} for dictionary index"
+            )));
         }
-        Utf8View => {
-            // `unpack_dictionary` can handle Utf8View/BinaryView types, but 
incurs unnecessary data copy of the value buffer.
-            // we handle it here to avoid the copy.
-            let dict_array = array
-                .as_dictionary::<K>()
-                .downcast_dict::<StringArray>()
-                .ok_or_else(|| {
-                    ArrowError::ComputeError(
-                        "Internal Error: Cannot cast Utf8View to StringArray 
of expected type"
-                            .to_string(),
-                    )
-                })?;
+    };
 
-            let string_view = view_from_dict_values::<K, StringViewType, 
GenericStringType<i32>>(
-                dict_array.values(),
-                dict_array.keys(),
-            )?;
-            Ok(Arc::new(string_view))
-        }
-        BinaryView => {
-            // `unpack_dictionary` can handle Utf8View/BinaryView types, but 
incurs unnecessary data copy of the value buffer.
-            // we handle it here to avoid the copy.
-            let dict_array = array
-                .as_dictionary::<K>()
-                .downcast_dict::<BinaryArray>()
-                .ok_or_else(|| {
-                    ArrowError::ComputeError(
-                        "Internal Error: Cannot cast BinaryView to BinaryArray 
of expected type"
-                            .to_string(),
-                    )
-                })?;
-
-            let binary_view = view_from_dict_values::<K, BinaryViewType, 
BinaryType>(
-                dict_array.values(),
-                dict_array.keys(),
-            )?;
-            Ok(Arc::new(binary_view))
-        }
-        _ => unpack_dictionary::<K>(array, to_type, cast_options),
-    }
+    Ok(new_array)
 }
 
-fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: 
ByteArrayType>(
-    array: &GenericByteArray<V>,
+fn view_from_dict_values<K: ArrowDictionaryKeyType, V: ByteArrayType, T: 
ByteViewType>(
     keys: &PrimitiveArray<K>,
-) -> Result<GenericByteViewArray<T>, ArrowError> {
-    let value_buffer = array.values();
-    let value_offsets = array.value_offsets();
+    values: &GenericByteArray<V>,
+) -> Result<ArrayRef, ArrowError> {
+    let value_buffer = values.values();
+    let value_offsets = values.value_offsets();
     let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
     builder.append_block(value_buffer.clone());
     for i in keys.iter() {
@@ -157,21 +138,17 @@ fn view_from_dict_values<K: ArrowDictionaryKeyType, T: 
ByteViewType, V: ByteArra
             }
         }
     }
-    Ok(builder.finish())
+    Ok(Arc::new(builder.finish()))
 }
 
-// Unpack a dictionary where the keys are of type <K> into a flattened array 
of type to_type
-pub(crate) fn unpack_dictionary<K>(
-    array: &dyn Array,
+// Unpack a dictionary into a flattened array of type to_type
+pub(crate) fn unpack_dictionary<K: ArrowDictionaryKeyType>(
+    array: &DictionaryArray<K>,
     to_type: &DataType,
     cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError>
-where
-    K: ArrowDictionaryKeyType,
-{
-    let dict_array = array.as_dictionary::<K>();
-    let cast_dict_values = cast_with_options(dict_array.values(), to_type, 
cast_options)?;
-    take(cast_dict_values.as_ref(), dict_array.keys(), None)
+) -> Result<ArrayRef, ArrowError> {
+    let cast_dict_values = cast_with_options(array.values(), to_type, 
cast_options)?;
+    take(cast_dict_values.as_ref(), array.keys(), None)
 }
 
 /// Pack a data type into a dictionary array passing the values through a 
primitive array
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index f61c335938..b0a25488e8 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -12168,4 +12168,59 @@ mod tests {
         // 03:12:44 = 3*3600 + 12*60 + 44 = 10800 + 720 + 44 = 11564
         assert_eq!(cast_array.value(0), 11564);
     }
+    #[test]
+    fn test_string_dicts_to_binary_view() {
+        let expected = BinaryViewArray::from_iter(vec![
+            VIEW_TEST_DATA[1],
+            VIEW_TEST_DATA[0],
+            None,
+            VIEW_TEST_DATA[3],
+            None,
+            VIEW_TEST_DATA[1],
+            VIEW_TEST_DATA[4],
+        ]);
+
+        let values_arrays: [ArrayRef; _] = [
+            Arc::new(StringArray::from_iter(VIEW_TEST_DATA)),
+            Arc::new(StringViewArray::from_iter(VIEW_TEST_DATA)),
+            Arc::new(LargeStringArray::from_iter(VIEW_TEST_DATA)),
+        ];
+        for values in values_arrays {
+            let keys =
+                Int8Array::from_iter([Some(1), Some(0), None, Some(3), None, 
Some(1), Some(4)]);
+            let string_dict_array =
+                DictionaryArray::<Int8Type>::try_new(keys, 
Arc::new(values)).unwrap();
+
+            let casted = cast(&string_dict_array, 
&DataType::BinaryView).unwrap();
+            assert_eq!(casted.as_ref(), &expected);
+        }
+    }
+
+    #[test]
+    fn test_binary_dicts_to_string_view() {
+        let expected = StringViewArray::from_iter(vec![
+            VIEW_TEST_DATA[1],
+            VIEW_TEST_DATA[0],
+            None,
+            VIEW_TEST_DATA[3],
+            None,
+            VIEW_TEST_DATA[1],
+            VIEW_TEST_DATA[4],
+        ]);
+
+        let values_arrays: [ArrayRef; _] = [
+            Arc::new(BinaryArray::from_iter(VIEW_TEST_DATA)),
+            Arc::new(BinaryViewArray::from_iter(VIEW_TEST_DATA)),
+            Arc::new(LargeBinaryArray::from_iter(VIEW_TEST_DATA)),
+        ];
+        for values in values_arrays {
+            let keys =
+                Int8Array::from_iter([Some(1), Some(0), None, Some(3), None, 
Some(1), Some(4)]);
+            let string_dict_array =
+                DictionaryArray::<Int8Type>::try_new(keys, 
Arc::new(values)).unwrap();
+
+            let casted = cast(&string_dict_array, 
&DataType::Utf8View).unwrap();
+            assert_eq!(casted.as_ref(), &expected);
+        }
+    }
 }

Reply via email to