alamb commented on code in PR #5871:
URL: https://github.com/apache/arrow-rs/pull/5871#discussion_r1635424761
##########
arrow-cast/src/cast/dictionary.rs:
##########
@@ -85,10 +85,65 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
Ok(new_array)
}
+ Utf8View => {
+ // `unpack_dictionary` can handle Utf8View/BinaryView types, but
incurs unnecessary data copy of the value buffer.
+ // we handle it here to avoid the copy.
+ let dict_array = array
+ .as_dictionary::<K>()
+ .downcast_dict::<StringArray>()
+ .unwrap();
+
+ let string_view = view_from_dict_values::<K, StringViewType,
GenericStringType<i32>>(
+ dict_array.values(),
+ dict_array.keys(),
+ );
+ Ok(Arc::new(string_view))
+ }
+ BinaryView => {
+ // `unpack_dictionary` can handle Utf8View/BinaryView types, but
incurs unnecessary data copy of the value buffer.
+ // we handle it here to avoid the copy.
+ let dict_array = array
+ .as_dictionary::<K>()
+ .downcast_dict::<BinaryArray>()
+ .unwrap();
+
+ let binary_view = view_from_dict_values::<K, BinaryViewType,
BinaryType>(
+ dict_array.values(),
+ dict_array.keys(),
+ );
+ Ok(Arc::new(binary_view))
+ }
_ => unpack_dictionary::<K>(array, to_type, cast_options),
}
}
+fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V:
ByteArrayType>(
+ array: &GenericByteArray<V>,
+ keys: &PrimitiveArray<K>,
+) -> GenericByteViewArray<T> {
+ let value_buffer = array.values();
+ let value_offsets = array.value_offsets();
+ let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
+ builder.append_block(value_buffer.clone());
+ for i in keys.iter() {
Review Comment:
Another potential optimization is a separate loop if there are no nulls in
keys (so we can avoid the branch)
Another potental idea is to use `value_offsets.windows(2)` as an iterator to
avoid the bounds checks in `value_offsets`
However, I think we should merge this basic PR in as is, and then add a
bencmark and optimize this kenrnel as a follow on PR (if we care). I can file a
ticket if @tustvold agrees
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -5257,10 +5249,48 @@ mod tests {
let binary_view_array = cast(&binary_array,
&DataType::BinaryView).unwrap();
assert_eq!(binary_view_array.data_type(), &DataType::BinaryView);
- let expect_binary_view_array = BinaryViewArray::from(data);
+ let expect_binary_view_array =
BinaryViewArray::from_iter(VIEW_TEST_DATA);
assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array);
}
+ #[test]
+ fn test_dict_to_view() {
+ let values = StringArray::from_iter(VIEW_TEST_DATA);
+ let keys = Int8Array::from_iter([Some(1), Some(0), None, Some(3),
None, Some(1), Some(4)]);
+ let string_dict_array =
+ DictionaryArray::<Int8Type>::try_new(keys,
Arc::new(values)).unwrap();
+ let typed_dict =
string_dict_array.downcast_dict::<StringArray>().unwrap();
+
+ let string_view_array = {
+ let mut builder = StringViewBuilder::new().with_block_size(8); //
multiple buffers.
Review Comment:
👍
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]