This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 77271c4984b Improve performance of casting `DictionaryArray` to 
`StringViewArray` (#5871)
77271c4984b is described below

commit 77271c4984b3917e0d3d3bcf26215d1ab70f29f5
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Thu Jun 13 07:46:26 2024 -0400

    Improve performance of casting `DictionaryArray` to `StringViewArray` 
(#5871)
    
    * zero-copy dict to view
    
    * refactor to use try_append_view
    
    * unchecked view
    
    * make fmt happy
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 .../src/builder/generic_bytes_view_builder.rs      | 47 ++++++++----
 arrow-cast/src/cast/dictionary.rs                  | 59 +++++++++++++++
 arrow-cast/src/cast/mod.rs                         | 84 +++++++++++++---------
 3 files changed, 143 insertions(+), 47 deletions(-)

diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs 
b/arrow-array/src/builder/generic_bytes_view_builder.rs
index e7f13a68288..6ec34bf5a91 100644
--- a/arrow-array/src/builder/generic_bytes_view_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -116,6 +116,36 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
         offset as u32
     }
 
+    /// Append a view of the given `block`, `offset` and `length`
+    ///
+    /// # Safety
+    /// (1) The block must have been added using [`Self::append_block`]
+    /// (2) The range `offset..offset+length` must be within the bounds of the 
block
+    /// (3) The data in the block must be valid of type `T`
+    pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, 
len: u32) {
+        let b = self.completed.get_unchecked(block as usize);
+        let start = offset as usize;
+        let end = start.saturating_add(len as usize);
+        let b = b.get_unchecked(start..end);
+
+        if len <= 12 {
+            let mut view_buffer = [0; 16];
+            view_buffer[0..4].copy_from_slice(&len.to_le_bytes());
+            view_buffer[4..4 + b.len()].copy_from_slice(b);
+            self.views_builder.append(u128::from_le_bytes(view_buffer));
+        } else {
+            let view = ByteView {
+                length: len,
+                prefix: u32::from_le_bytes(b[0..4].try_into().unwrap()),
+                buffer_index: block,
+                offset,
+            };
+            self.views_builder.append(view.into());
+        }
+
+        self.null_buffer_builder.append_non_null();
+    }
+
     /// Try to append a view of the given `block`, `offset` and `length`
     ///
     /// See [`Self::append_block`]
@@ -139,22 +169,9 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
             ));
         }
 
-        if len <= 12 {
-            let mut view_buffer = [0; 16];
-            view_buffer[0..4].copy_from_slice(&len.to_le_bytes());
-            view_buffer[4..4 + b.len()].copy_from_slice(b);
-            self.views_builder.append(u128::from_le_bytes(view_buffer));
-        } else {
-            let view = ByteView {
-                length: len,
-                prefix: u32::from_le_bytes(b[0..4].try_into().unwrap()),
-                buffer_index: block,
-                offset,
-            };
-            self.views_builder.append(view.into());
+        unsafe {
+            self.append_view_unchecked(block, offset, len);
         }
-
-        self.null_buffer_builder.append_non_null();
         Ok(())
     }
 
diff --git a/arrow-cast/src/cast/dictionary.rs 
b/arrow-cast/src/cast/dictionary.rs
index 244e101f1d8..d929277a4da 100644
--- a/arrow-cast/src/cast/dictionary.rs
+++ b/arrow-cast/src/cast/dictionary.rs
@@ -85,10 +85,69 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
 
             Ok(new_array)
         }
+        Utf8View => {
+            // `unpack_dictionary` can handle Utf8View/BinaryView types, but 
incurs unnecessary data copy of the value buffer.
+            // we handle it here to avoid the copy.
+            let dict_array = array
+                .as_dictionary::<K>()
+                .downcast_dict::<StringArray>()
+                .unwrap();
+
+            let string_view = view_from_dict_values::<K, StringViewType, 
GenericStringType<i32>>(
+                dict_array.values(),
+                dict_array.keys(),
+            );
+            Ok(Arc::new(string_view))
+        }
+        BinaryView => {
+            // `unpack_dictionary` can handle Utf8View/BinaryView types, but 
incurs unnecessary data copy of the value buffer.
+            // we handle it here to avoid the copy.
+            let dict_array = array
+                .as_dictionary::<K>()
+                .downcast_dict::<BinaryArray>()
+                .unwrap();
+
+            let binary_view = view_from_dict_values::<K, BinaryViewType, 
BinaryType>(
+                dict_array.values(),
+                dict_array.keys(),
+            );
+            Ok(Arc::new(binary_view))
+        }
         _ => unpack_dictionary::<K>(array, to_type, cast_options),
     }
 }
 
+fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: 
ByteArrayType>(
+    array: &GenericByteArray<V>,
+    keys: &PrimitiveArray<K>,
+) -> GenericByteViewArray<T> {
+    let value_buffer = array.values();
+    let value_offsets = array.value_offsets();
+    let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
+    builder.append_block(value_buffer.clone());
+    for i in keys.iter() {
+        match i {
+            Some(v) => {
+                let idx = v.to_usize().unwrap();
+
+                // Safety
+                // (1) The index is within bounds as they are offsets
+                // (2) The append_view is safe
+                unsafe {
+                    let offset = value_offsets.get_unchecked(idx).as_usize();
+                    let end = value_offsets.get_unchecked(idx + 1).as_usize();
+                    let length = end - offset;
+                    builder.append_view_unchecked(0, offset as u32, length as 
u32)
+                }
+            }
+            None => {
+                builder.append_null();
+            }
+        }
+    }
+    builder.finish()
+}
+
 // Unpack a dictionary where the keys are of type <K> into a flattened array 
of type to_type
 pub(crate) fn unpack_dictionary<K>(
     array: &dyn Array,
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index ae2240bd940..e073e34cb6e 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -5203,19 +5203,19 @@ mod tests {
         _test_string_to_view::<i64>();
     }
 
+    const VIEW_TEST_DATA: [Option<&str>; 5] = [
+        Some("hello"),
+        Some("world"),
+        None,
+        Some("large payload over 12 bytes"),
+        Some("lulu"),
+    ];
+
     fn _test_string_to_view<O>()
     where
         O: OffsetSizeTrait,
     {
-        let data = vec![
-            Some("hello"),
-            Some("world"),
-            None,
-            Some("large payload over 12 bytes"),
-            Some("lulu"),
-        ];
-
-        let string_array = GenericStringArray::<O>::from(data.clone());
+        let string_array = GenericStringArray::<O>::from_iter(VIEW_TEST_DATA);
 
         assert!(can_cast_types(
             string_array.data_type(),
@@ -5225,7 +5225,7 @@ mod tests {
         let string_view_array = cast(&string_array, 
&DataType::Utf8View).unwrap();
         assert_eq!(string_view_array.data_type(), &DataType::Utf8View);
 
-        let expect_string_view_array = StringViewArray::from(data);
+        let expect_string_view_array = 
StringViewArray::from_iter(VIEW_TEST_DATA);
         assert_eq!(string_view_array.as_ref(), &expect_string_view_array);
     }
 
@@ -5239,15 +5239,7 @@ mod tests {
     where
         O: OffsetSizeTrait,
     {
-        let data: Vec<Option<&[u8]>> = vec![
-            Some(b"hello"),
-            Some(b"world"),
-            None,
-            Some(b"large payload over 12 bytes"),
-            Some(b"lulu"),
-        ];
-
-        let binary_array = GenericBinaryArray::<O>::from(data.clone());
+        let binary_array = GenericBinaryArray::<O>::from_iter(VIEW_TEST_DATA);
 
         assert!(can_cast_types(
             binary_array.data_type(),
@@ -5257,10 +5249,48 @@ mod tests {
         let binary_view_array = cast(&binary_array, 
&DataType::BinaryView).unwrap();
         assert_eq!(binary_view_array.data_type(), &DataType::BinaryView);
 
-        let expect_binary_view_array = BinaryViewArray::from(data);
+        let expect_binary_view_array = 
BinaryViewArray::from_iter(VIEW_TEST_DATA);
         assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array);
     }
 
+    #[test]
+    fn test_dict_to_view() {
+        let values = StringArray::from_iter(VIEW_TEST_DATA);
+        let keys = Int8Array::from_iter([Some(1), Some(0), None, Some(3), 
None, Some(1), Some(4)]);
+        let string_dict_array =
+            DictionaryArray::<Int8Type>::try_new(keys, 
Arc::new(values)).unwrap();
+        let typed_dict = 
string_dict_array.downcast_dict::<StringArray>().unwrap();
+
+        let string_view_array = {
+            let mut builder = StringViewBuilder::new().with_block_size(8); // 
multiple buffers.
+            for v in typed_dict.into_iter() {
+                builder.append_option(v);
+            }
+            builder.finish()
+        };
+        let expected_string_array_type = string_view_array.data_type();
+        let casted_string_array = cast(&string_dict_array, 
expected_string_array_type).unwrap();
+        assert_eq!(casted_string_array.data_type(), 
expected_string_array_type);
+        assert_eq!(casted_string_array.as_ref(), &string_view_array);
+
+        let binary_buffer = cast(&typed_dict.values(), 
&DataType::Binary).unwrap();
+        let binary_dict_array =
+            DictionaryArray::<Int8Type>::new(typed_dict.keys().clone(), 
binary_buffer);
+        let typed_binary_dict = 
binary_dict_array.downcast_dict::<BinaryArray>().unwrap();
+
+        let binary_view_array = {
+            let mut builder = BinaryViewBuilder::new().with_block_size(8); // 
multiple buffers.
+            for v in typed_binary_dict.into_iter() {
+                builder.append_option(v);
+            }
+            builder.finish()
+        };
+        let expected_binary_array_type = binary_view_array.data_type();
+        let casted_binary_array = cast(&binary_dict_array, 
expected_binary_array_type).unwrap();
+        assert_eq!(casted_binary_array.data_type(), 
expected_binary_array_type);
+        assert_eq!(casted_binary_array.as_ref(), &binary_view_array);
+    }
+
     #[test]
     fn test_view_to_string() {
         _test_view_to_string::<i32>();
@@ -5271,24 +5301,15 @@ mod tests {
     where
         O: OffsetSizeTrait,
     {
-        let data: Vec<Option<&str>> = vec![
-            Some("hello"),
-            Some("world"),
-            None,
-            Some("large payload over 12 bytes"),
-            Some("lulu"),
-        ];
-
         let view_array = {
-            // ["hello", "world", null, "large payload over 12 bytes", "lulu"]
             let mut builder = StringViewBuilder::new().with_block_size(8); // 
multiple buffers.
-            for s in data.iter() {
+            for s in VIEW_TEST_DATA.iter() {
                 builder.append_option(*s);
             }
             builder.finish()
         };
 
-        let expected_string_array = GenericStringArray::<O>::from(data);
+        let expected_string_array = 
GenericStringArray::<O>::from_iter(VIEW_TEST_DATA);
         let expected_type = expected_string_array.data_type();
 
         assert!(can_cast_types(view_array.data_type(), expected_type));
@@ -5318,7 +5339,6 @@ mod tests {
         ];
 
         let view_array = {
-            // ["hello", "world", null, "large payload over 12 bytes", "lulu"]
             let mut builder = BinaryViewBuilder::new().with_block_size(8); // 
multiple buffers.
             for s in data.iter() {
                 builder.append_option(*s);

Reply via email to