This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 8752e01be64 Improve performance of casting `StringView`/`BinaryView` to `DictionaryArray` (#5872)
8752e01be64 is described below

commit 8752e01be642bce205984e16b44e06078413dc68
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Thu Jun 13 10:03:03 2024 -0400

    Improve performance of casting `StringView`/`BinaryView` to `DictionaryArray` (#5872)
    
    * zero-copy dict to view
    
    * view to dict
    
    * refactor to use try_append_view
    
    * unchecked view
    
    * make fmt happy
    
    * update test
    
    * add comments
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-cast/src/cast/dictionary.rs | 84 +++++++++++++++++++++++++++++++++++++--
 arrow-cast/src/cast/mod.rs        | 36 +++++++++++------
 2 files changed, 104 insertions(+), 16 deletions(-)

diff --git a/arrow-cast/src/cast/dictionary.rs b/arrow-cast/src/cast/dictionary.rs
index d929277a4da..ee2021d15b6 100644
--- a/arrow-cast/src/cast/dictionary.rs
+++ b/arrow-cast/src/cast/dictionary.rs
@@ -188,10 +188,34 @@ pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
         Decimal256(_, _) => {
             pack_numeric_to_dictionary::<K, Decimal256Type>(array, dict_value_type, cast_options)
         }
-        Utf8 => pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options),
-        LargeUtf8 => pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options),
-        Binary => pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options),
-        LargeBinary => pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options),
+        Utf8 => {
+            // If the input is a view type, we can avoid casting (thus copying) the data
+            if array.data_type() == &DataType::Utf8View {
+                return string_view_to_dictionary::<K, i32>(array);
+            }
+            pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
+        }
+        LargeUtf8 => {
+            // If the input is a view type, we can avoid casting (thus copying) the data
+            if array.data_type() == &DataType::Utf8View {
+                return string_view_to_dictionary::<K, i64>(array);
+            }
+            pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
+        }
+        Binary => {
+            // If the input is a view type, we can avoid casting (thus copying) the data
+            if array.data_type() == &DataType::BinaryView {
+                return binary_view_to_dictionary::<K, i32>(array);
+            }
+            pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
+        }
+        LargeBinary => {
+            // If the input is a view type, we can avoid casting (thus copying) the data
+            if array.data_type() == &DataType::BinaryView {
+                return binary_view_to_dictionary::<K, i64>(array);
+            }
+            pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
+        }
         _ => Err(ArrowError::CastError(format!(
             "Unsupported output type for dictionary packing: {dict_value_type:?}"
         ))),
@@ -226,6 +250,58 @@ where
     Ok(Arc::new(b.finish()))
 }
 
+pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
+    array: &dyn Array,
+) -> Result<ArrayRef, ArrowError>
+where
+    K: ArrowDictionaryKeyType,
+{
+    let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
+        array.len(),
+        1024,
+        1024,
+    );
+    let string_view = array.as_any().downcast_ref::<StringViewArray>().unwrap();
+    for v in string_view.iter() {
+        match v {
+            Some(v) => {
+                b.append(v)?;
+            }
+            None => {
+                b.append_null();
+            }
+        }
+    }
+
+    Ok(Arc::new(b.finish()))
+}
+
+pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
+    array: &dyn Array,
+) -> Result<ArrayRef, ArrowError>
+where
+    K: ArrowDictionaryKeyType,
+{
+    let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
+        array.len(),
+        1024,
+        1024,
+    );
+    let binary_view = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
+    for v in binary_view.iter() {
+        match v {
+            Some(v) => {
+                b.append(v)?;
+            }
+            None => {
+                b.append_null();
+            }
+        }
+    }
+
+    Ok(Arc::new(b.finish()))
+}
+
 // Packs the data as a GenericByteDictionaryBuilder, if possible, with the
 // key types of K
 pub(crate) fn pack_byte_to_dictionary<K, T>(
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index e073e34cb6e..354c31af695 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -5205,10 +5205,10 @@ mod tests {
 
     const VIEW_TEST_DATA: [Option<&str>; 5] = [
         Some("hello"),
-        Some("world"),
+        Some("repeated"),
         None,
         Some("large payload over 12 bytes"),
-        Some("lulu"),
+        Some("repeated"),
     ];
 
     fn _test_string_to_view<O>()
@@ -5291,6 +5291,26 @@ mod tests {
         assert_eq!(casted_binary_array.as_ref(), &binary_view_array);
     }
 
+    #[test]
+    fn test_view_to_dict() {
+        let string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA);
+        let string_dict_array: DictionaryArray<Int8Type> = VIEW_TEST_DATA.into_iter().collect();
+        let casted_type = string_dict_array.data_type();
+        let casted_dict_array = cast(&string_view_array, casted_type).unwrap();
+        assert_eq!(casted_dict_array.data_type(), casted_type);
+        assert_eq!(casted_dict_array.as_ref(), &string_dict_array);
+
+        let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
+        let binary_dict_array = string_dict_array.downcast_dict::<StringArray>().unwrap();
+        let binary_buffer = cast(&binary_dict_array.values(), &DataType::Binary).unwrap();
+        let binary_dict_array =
+            DictionaryArray::<Int8Type>::new(binary_dict_array.keys().clone(), binary_buffer);
+        let casted_type = binary_dict_array.data_type();
+        let casted_binary_array = cast(&binary_view_array, casted_type).unwrap();
+        assert_eq!(casted_binary_array.data_type(), casted_type);
+        assert_eq!(casted_binary_array.as_ref(), &binary_dict_array);
+    }
+
     #[test]
     fn test_view_to_string() {
         _test_view_to_string::<i32>();
@@ -5330,23 +5350,15 @@ mod tests {
     where
         O: OffsetSizeTrait,
     {
-        let data: Vec<Option<&[u8]>> = vec![
-            Some(b"hello"),
-            Some(b"world"),
-            None,
-            Some(b"large payload over 12 bytes"),
-            Some(b"lulu"),
-        ];
-
         let view_array = {
             let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers.
-            for s in data.iter() {
+            for s in VIEW_TEST_DATA.iter() {
                 builder.append_option(*s);
             }
             builder.finish()
         };
 
-        let expected_binary_array = GenericBinaryArray::<O>::from(data);
+        let expected_binary_array = GenericBinaryArray::<O>::from_iter(VIEW_TEST_DATA);
         let expected_type = expected_binary_array.data_type();
 
         assert!(can_cast_types(view_array.data_type(), expected_type));

Reply via email to