This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 77271c4984b Improve performance of casting `DictionaryArray` to
`StringViewArray` (#5871)
77271c4984b is described below
commit 77271c4984b3917e0d3d3bcf26215d1ab70f29f5
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Thu Jun 13 07:46:26 2024 -0400
Improve performance of casting `DictionaryArray` to `StringViewArray`
(#5871)
* zero-copy dict to view
* refactor to use try_append_view
* unchecked view
* make fmt happy
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
.../src/builder/generic_bytes_view_builder.rs | 47 ++++++++----
arrow-cast/src/cast/dictionary.rs | 59 +++++++++++++++
arrow-cast/src/cast/mod.rs | 84 +++++++++++++---------
3 files changed, 143 insertions(+), 47 deletions(-)
diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs
b/arrow-array/src/builder/generic_bytes_view_builder.rs
index e7f13a68288..6ec34bf5a91 100644
--- a/arrow-array/src/builder/generic_bytes_view_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -116,6 +116,36 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
offset as u32
}
+ /// Append a view of the given `block`, `offset` and `length`
+ ///
+ /// # Safety
+ /// (1) The block must have been added using [`Self::append_block`]
+ /// (2) The range `offset..offset+length` must be within the bounds of the
block
+ /// (3) The data in the block must be valid of type `T`
+ pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32,
len: u32) {
+ let b = self.completed.get_unchecked(block as usize);
+ let start = offset as usize;
+ let end = start.saturating_add(len as usize);
+ let b = b.get_unchecked(start..end);
+
+ if len <= 12 {
+ let mut view_buffer = [0; 16];
+ view_buffer[0..4].copy_from_slice(&len.to_le_bytes());
+ view_buffer[4..4 + b.len()].copy_from_slice(b);
+ self.views_builder.append(u128::from_le_bytes(view_buffer));
+ } else {
+ let view = ByteView {
+ length: len,
+ prefix: u32::from_le_bytes(b[0..4].try_into().unwrap()),
+ buffer_index: block,
+ offset,
+ };
+ self.views_builder.append(view.into());
+ }
+
+ self.null_buffer_builder.append_non_null();
+ }
+
/// Try to append a view of the given `block`, `offset` and `length`
///
/// See [`Self::append_block`]
@@ -139,22 +169,9 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
));
}
- if len <= 12 {
- let mut view_buffer = [0; 16];
- view_buffer[0..4].copy_from_slice(&len.to_le_bytes());
- view_buffer[4..4 + b.len()].copy_from_slice(b);
- self.views_builder.append(u128::from_le_bytes(view_buffer));
- } else {
- let view = ByteView {
- length: len,
- prefix: u32::from_le_bytes(b[0..4].try_into().unwrap()),
- buffer_index: block,
- offset,
- };
- self.views_builder.append(view.into());
+ unsafe {
+ self.append_view_unchecked(block, offset, len);
}
-
- self.null_buffer_builder.append_non_null();
Ok(())
}
diff --git a/arrow-cast/src/cast/dictionary.rs
b/arrow-cast/src/cast/dictionary.rs
index 244e101f1d8..d929277a4da 100644
--- a/arrow-cast/src/cast/dictionary.rs
+++ b/arrow-cast/src/cast/dictionary.rs
@@ -85,10 +85,69 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
Ok(new_array)
}
+ Utf8View => {
+ // `unpack_dictionary` can handle Utf8View/BinaryView types, but
incurs unnecessary data copy of the value buffer.
+ // we handle it here to avoid the copy.
+ let dict_array = array
+ .as_dictionary::<K>()
+ .downcast_dict::<StringArray>()
+ .unwrap();
+
+ let string_view = view_from_dict_values::<K, StringViewType,
GenericStringType<i32>>(
+ dict_array.values(),
+ dict_array.keys(),
+ );
+ Ok(Arc::new(string_view))
+ }
+ BinaryView => {
+ // `unpack_dictionary` can handle Utf8View/BinaryView types, but
incurs unnecessary data copy of the value buffer.
+ // we handle it here to avoid the copy.
+ let dict_array = array
+ .as_dictionary::<K>()
+ .downcast_dict::<BinaryArray>()
+ .unwrap();
+
+ let binary_view = view_from_dict_values::<K, BinaryViewType,
BinaryType>(
+ dict_array.values(),
+ dict_array.keys(),
+ );
+ Ok(Arc::new(binary_view))
+ }
_ => unpack_dictionary::<K>(array, to_type, cast_options),
}
}
+fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V:
ByteArrayType>(
+ array: &GenericByteArray<V>,
+ keys: &PrimitiveArray<K>,
+) -> GenericByteViewArray<T> {
+ let value_buffer = array.values();
+ let value_offsets = array.value_offsets();
+ let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
+ builder.append_block(value_buffer.clone());
+ for i in keys.iter() {
+ match i {
+ Some(v) => {
+ let idx = v.to_usize().unwrap();
+
+ // Safety
+ // (1) The index is within bounds as they are offsets
+ // (2) The append_view is safe
+ unsafe {
+ let offset = value_offsets.get_unchecked(idx).as_usize();
+ let end = value_offsets.get_unchecked(idx + 1).as_usize();
+ let length = end - offset;
+ builder.append_view_unchecked(0, offset as u32, length as
u32)
+ }
+ }
+ None => {
+ builder.append_null();
+ }
+ }
+ }
+ builder.finish()
+}
+
// Unpack a dictionary where the keys are of type <K> into a flattened array
of type to_type
pub(crate) fn unpack_dictionary<K>(
array: &dyn Array,
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index ae2240bd940..e073e34cb6e 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -5203,19 +5203,19 @@ mod tests {
_test_string_to_view::<i64>();
}
+ const VIEW_TEST_DATA: [Option<&str>; 5] = [
+ Some("hello"),
+ Some("world"),
+ None,
+ Some("large payload over 12 bytes"),
+ Some("lulu"),
+ ];
+
fn _test_string_to_view<O>()
where
O: OffsetSizeTrait,
{
- let data = vec![
- Some("hello"),
- Some("world"),
- None,
- Some("large payload over 12 bytes"),
- Some("lulu"),
- ];
-
- let string_array = GenericStringArray::<O>::from(data.clone());
+ let string_array = GenericStringArray::<O>::from_iter(VIEW_TEST_DATA);
assert!(can_cast_types(
string_array.data_type(),
@@ -5225,7 +5225,7 @@ mod tests {
let string_view_array = cast(&string_array,
&DataType::Utf8View).unwrap();
assert_eq!(string_view_array.data_type(), &DataType::Utf8View);
- let expect_string_view_array = StringViewArray::from(data);
+ let expect_string_view_array =
StringViewArray::from_iter(VIEW_TEST_DATA);
assert_eq!(string_view_array.as_ref(), &expect_string_view_array);
}
@@ -5239,15 +5239,7 @@ mod tests {
where
O: OffsetSizeTrait,
{
- let data: Vec<Option<&[u8]>> = vec![
- Some(b"hello"),
- Some(b"world"),
- None,
- Some(b"large payload over 12 bytes"),
- Some(b"lulu"),
- ];
-
- let binary_array = GenericBinaryArray::<O>::from(data.clone());
+ let binary_array = GenericBinaryArray::<O>::from_iter(VIEW_TEST_DATA);
assert!(can_cast_types(
binary_array.data_type(),
@@ -5257,10 +5249,48 @@ mod tests {
let binary_view_array = cast(&binary_array,
&DataType::BinaryView).unwrap();
assert_eq!(binary_view_array.data_type(), &DataType::BinaryView);
- let expect_binary_view_array = BinaryViewArray::from(data);
+ let expect_binary_view_array =
BinaryViewArray::from_iter(VIEW_TEST_DATA);
assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array);
}
+ #[test]
+ fn test_dict_to_view() {
+ let values = StringArray::from_iter(VIEW_TEST_DATA);
+ let keys = Int8Array::from_iter([Some(1), Some(0), None, Some(3),
None, Some(1), Some(4)]);
+ let string_dict_array =
+ DictionaryArray::<Int8Type>::try_new(keys,
Arc::new(values)).unwrap();
+ let typed_dict =
string_dict_array.downcast_dict::<StringArray>().unwrap();
+
+ let string_view_array = {
+ let mut builder = StringViewBuilder::new().with_block_size(8); //
multiple buffers.
+ for v in typed_dict.into_iter() {
+ builder.append_option(v);
+ }
+ builder.finish()
+ };
+ let expected_string_array_type = string_view_array.data_type();
+ let casted_string_array = cast(&string_dict_array,
expected_string_array_type).unwrap();
+ assert_eq!(casted_string_array.data_type(),
expected_string_array_type);
+ assert_eq!(casted_string_array.as_ref(), &string_view_array);
+
+ let binary_buffer = cast(&typed_dict.values(),
&DataType::Binary).unwrap();
+ let binary_dict_array =
+ DictionaryArray::<Int8Type>::new(typed_dict.keys().clone(),
binary_buffer);
+ let typed_binary_dict =
binary_dict_array.downcast_dict::<BinaryArray>().unwrap();
+
+ let binary_view_array = {
+ let mut builder = BinaryViewBuilder::new().with_block_size(8); //
multiple buffers.
+ for v in typed_binary_dict.into_iter() {
+ builder.append_option(v);
+ }
+ builder.finish()
+ };
+ let expected_binary_array_type = binary_view_array.data_type();
+ let casted_binary_array = cast(&binary_dict_array,
expected_binary_array_type).unwrap();
+ assert_eq!(casted_binary_array.data_type(),
expected_binary_array_type);
+ assert_eq!(casted_binary_array.as_ref(), &binary_view_array);
+ }
+
#[test]
fn test_view_to_string() {
_test_view_to_string::<i32>();
@@ -5271,24 +5301,15 @@ mod tests {
where
O: OffsetSizeTrait,
{
- let data: Vec<Option<&str>> = vec![
- Some("hello"),
- Some("world"),
- None,
- Some("large payload over 12 bytes"),
- Some("lulu"),
- ];
-
let view_array = {
- // ["hello", "world", null, "large payload over 12 bytes", "lulu"]
let mut builder = StringViewBuilder::new().with_block_size(8); //
multiple buffers.
- for s in data.iter() {
+ for s in VIEW_TEST_DATA.iter() {
builder.append_option(*s);
}
builder.finish()
};
- let expected_string_array = GenericStringArray::<O>::from(data);
+ let expected_string_array =
GenericStringArray::<O>::from_iter(VIEW_TEST_DATA);
let expected_type = expected_string_array.data_type();
assert!(can_cast_types(view_array.data_type(), expected_type));
@@ -5318,7 +5339,6 @@ mod tests {
];
let view_array = {
- // ["hello", "world", null, "large payload over 12 bytes", "lulu"]
let mut builder = BinaryViewBuilder::new().with_block_size(8); //
multiple buffers.
for s in data.iter() {
builder.append_option(*s);