This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 8752e01be64 Improve performance of casting `StringView`/`BinaryView` to `DictionaryArray` (#5872)
8752e01be64 is described below
commit 8752e01be642bce205984e16b44e06078413dc68
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Thu Jun 13 10:03:03 2024 -0400
Improve performance of casting `StringView`/`BinaryView` to `DictionaryArray` (#5872)
* zero-copy dict to view
* view to dict
* refactor to use try_append_view
* unchecked view
* make fmt happy
* update test
* add comments
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-cast/src/cast/dictionary.rs | 84 +++++++++++++++++++++++++++++++++++++--
arrow-cast/src/cast/mod.rs | 36 +++++++++++------
2 files changed, 104 insertions(+), 16 deletions(-)
diff --git a/arrow-cast/src/cast/dictionary.rs b/arrow-cast/src/cast/dictionary.rs
index d929277a4da..ee2021d15b6 100644
--- a/arrow-cast/src/cast/dictionary.rs
+++ b/arrow-cast/src/cast/dictionary.rs
@@ -188,10 +188,34 @@ pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
Decimal256(_, _) => {
pack_numeric_to_dictionary::<K, Decimal256Type>(array,
dict_value_type, cast_options)
}
- Utf8 => pack_byte_to_dictionary::<K, GenericStringType<i32>>(array,
cast_options),
- LargeUtf8 => pack_byte_to_dictionary::<K,
GenericStringType<i64>>(array, cast_options),
- Binary => pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array,
cast_options),
- LargeBinary => pack_byte_to_dictionary::<K,
GenericBinaryType<i64>>(array, cast_options),
+ Utf8 => {
+ // If the input is a view type, we can avoid casting (thus copying) the data
+ if array.data_type() == &DataType::Utf8View {
+ return string_view_to_dictionary::<K, i32>(array);
+ }
+ pack_byte_to_dictionary::<K, GenericStringType<i32>>(array,
cast_options)
+ }
+ LargeUtf8 => {
+ // If the input is a view type, we can avoid casting (thus copying) the data
+ if array.data_type() == &DataType::Utf8View {
+ return string_view_to_dictionary::<K, i64>(array);
+ }
+ pack_byte_to_dictionary::<K, GenericStringType<i64>>(array,
cast_options)
+ }
+ Binary => {
+ // If the input is a view type, we can avoid casting (thus copying) the data
+ if array.data_type() == &DataType::BinaryView {
+ return binary_view_to_dictionary::<K, i32>(array);
+ }
+ pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array,
cast_options)
+ }
+ LargeBinary => {
+ // If the input is a view type, we can avoid casting (thus copying) the data
+ if array.data_type() == &DataType::BinaryView {
+ return binary_view_to_dictionary::<K, i64>(array);
+ }
+ pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array,
cast_options)
+ }
_ => Err(ArrowError::CastError(format!(
"Unsupported output type for dictionary packing: {dict_value_type:?}"
))),
@@ -226,6 +250,58 @@ where
Ok(Arc::new(b.finish()))
}
+pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
+ array: &dyn Array,
+) -> Result<ArrayRef, ArrowError>
+where
+ K: ArrowDictionaryKeyType,
+{
+ let mut b = GenericByteDictionaryBuilder::<K,
GenericStringType<O>>::with_capacity(
+ array.len(),
+ 1024,
+ 1024,
+ );
+ let string_view =
array.as_any().downcast_ref::<StringViewArray>().unwrap();
+ for v in string_view.iter() {
+ match v {
+ Some(v) => {
+ b.append(v)?;
+ }
+ None => {
+ b.append_null();
+ }
+ }
+ }
+
+ Ok(Arc::new(b.finish()))
+}
+
+pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
+ array: &dyn Array,
+) -> Result<ArrayRef, ArrowError>
+where
+ K: ArrowDictionaryKeyType,
+{
+ let mut b = GenericByteDictionaryBuilder::<K,
GenericBinaryType<O>>::with_capacity(
+ array.len(),
+ 1024,
+ 1024,
+ );
+ let binary_view =
array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
+ for v in binary_view.iter() {
+ match v {
+ Some(v) => {
+ b.append(v)?;
+ }
+ None => {
+ b.append_null();
+ }
+ }
+ }
+
+ Ok(Arc::new(b.finish()))
+}
+
// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
// key types of K
pub(crate) fn pack_byte_to_dictionary<K, T>(
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index e073e34cb6e..354c31af695 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -5205,10 +5205,10 @@ mod tests {
const VIEW_TEST_DATA: [Option<&str>; 5] = [
Some("hello"),
- Some("world"),
+ Some("repeated"),
None,
Some("large payload over 12 bytes"),
- Some("lulu"),
+ Some("repeated"),
];
fn _test_string_to_view<O>()
@@ -5291,6 +5291,26 @@ mod tests {
assert_eq!(casted_binary_array.as_ref(), &binary_view_array);
}
+ #[test]
+ fn test_view_to_dict() {
+ let string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA);
+ let string_dict_array: DictionaryArray<Int8Type> =
VIEW_TEST_DATA.into_iter().collect();
+ let casted_type = string_dict_array.data_type();
+ let casted_dict_array = cast(&string_view_array, casted_type).unwrap();
+ assert_eq!(casted_dict_array.data_type(), casted_type);
+ assert_eq!(casted_dict_array.as_ref(), &string_dict_array);
+
+ let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
+ let binary_dict_array =
string_dict_array.downcast_dict::<StringArray>().unwrap();
+ let binary_buffer = cast(&binary_dict_array.values(),
&DataType::Binary).unwrap();
+ let binary_dict_array =
+ DictionaryArray::<Int8Type>::new(binary_dict_array.keys().clone(),
binary_buffer);
+ let casted_type = binary_dict_array.data_type();
+ let casted_binary_array = cast(&binary_view_array,
casted_type).unwrap();
+ assert_eq!(casted_binary_array.data_type(), casted_type);
+ assert_eq!(casted_binary_array.as_ref(), &binary_dict_array);
+ }
+
#[test]
fn test_view_to_string() {
_test_view_to_string::<i32>();
@@ -5330,23 +5350,15 @@ mod tests {
where
O: OffsetSizeTrait,
{
- let data: Vec<Option<&[u8]>> = vec![
- Some(b"hello"),
- Some(b"world"),
- None,
- Some(b"large payload over 12 bytes"),
- Some(b"lulu"),
- ];
-
let view_array = {
let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers.
- for s in data.iter() {
+ for s in VIEW_TEST_DATA.iter() {
builder.append_option(*s);
}
builder.finish()
};
- let expected_binary_array = GenericBinaryArray::<O>::from(data);
+ let expected_binary_array =
GenericBinaryArray::<O>::from_iter(VIEW_TEST_DATA);
let expected_type = expected_binary_array.data_type();
assert!(can_cast_types(view_array.data_type(), expected_type));