This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new a61f1dc8ea1 Support casting `StringArray`/`BinaryArray` -->
`StringView` / `BinaryView` (#5686)
a61f1dc8ea1 is described below
commit a61f1dc8ea132add731e4426e11d341a1de5ca92
Author: RinChanNOW <[email protected]>
AuthorDate: Sat Apr 27 02:06:10 2024 +0800
Support casting `StringArray`/`BinaryArray` --> `StringView` / `BinaryView`
(#5686)
* Support casting from byte array to byte view array.
* Use new_unchecked.
* Add safety justification comment
* Fix comments :facepalm
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-array/src/array/byte_view_array.rs | 12 +++
arrow-cast/src/cast/mod.rs | 129 ++++++++++++++++++++++++++++++-
2 files changed, 139 insertions(+), 2 deletions(-)
diff --git a/arrow-array/src/array/byte_view_array.rs
b/arrow-array/src/array/byte_view_array.rs
index e2839b19e5f..79f2d47587a 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -428,6 +428,18 @@ impl BinaryViewArray {
}
}
+impl From<Vec<&[u8]>> for BinaryViewArray {
+ fn from(v: Vec<&[u8]>) -> Self {
+ Self::from_iter_values(v)
+ }
+}
+
+impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
+ fn from(v: Vec<Option<&[u8]>>) -> Self {
+ v.into_iter().collect()
+ }
+}
+
/// A [`GenericByteViewArray`] that stores utf8 data
///
/// # Example
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 8b7579c4cfc..36072760ed0 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -46,6 +46,8 @@ use crate::cast::dictionary::*;
use crate::cast::list::*;
use crate::cast::string::*;
+use arrow_buffer::ScalarBuffer;
+use arrow_data::ByteView;
use chrono::{NaiveTime, Offset, TimeZone, Utc};
use std::cmp::Ordering;
use std::sync::Arc;
@@ -119,6 +121,8 @@ pub fn can_cast_types(from_type: &DataType, to_type:
&DataType) -> bool {
| Utf8
| LargeBinary
| LargeUtf8
+ | BinaryView
+ | Utf8View
| List(_)
| LargeList(_)
| FixedSizeList(_, _)
@@ -192,8 +196,8 @@ pub fn can_cast_types(from_type: &DataType, to_type:
&DataType) -> bool {
DataType::is_integer(to_type) || DataType::is_floating(to_type) ||
to_type == &Utf8 || to_type == &LargeUtf8
}
- (Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true,
- (LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true,
+ (Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_) |
BinaryView) => true,
+ (LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_) |
BinaryView) => true,
(FixedSizeBinary(_), Binary | LargeBinary) => true,
(
Utf8 | LargeUtf8,
@@ -213,6 +217,7 @@ pub fn can_cast_types(from_type: &DataType, to_type:
&DataType) -> bool {
| Timestamp(Nanosecond, _)
| Interval(_),
) => true,
+ (Utf8 | LargeUtf8, Utf8View) => true,
(Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
(_, Utf8 | LargeUtf8) => from_type.is_primitive(),
@@ -611,6 +616,8 @@ pub fn cast_with_options(
| Utf8
| LargeBinary
| LargeUtf8
+ | BinaryView
+ | Utf8View
| List(_)
| LargeList(_)
| FixedSizeList(_, _)
@@ -1120,6 +1127,7 @@ pub fn cast_with_options(
let binary =
BinaryArray::from(array.as_string::<i32>().clone());
cast_byte_container::<BinaryType, LargeBinaryType>(&binary)
}
+ Utf8View => cast_byte_to_view::<Utf8Type, StringViewType>(array),
LargeUtf8 => cast_byte_container::<Utf8Type, LargeUtf8Type>(array),
Time32(TimeUnit::Second) => parse_string::<Time32SecondType,
i32>(array, cast_options),
Time32(TimeUnit::Millisecond) => {
@@ -1179,6 +1187,7 @@ pub fn cast_with_options(
LargeBinary => Ok(Arc::new(LargeBinaryArray::from(
array.as_string::<i64>().clone(),
))),
+ Utf8View => cast_byte_to_view::<LargeUtf8Type,
StringViewType>(array),
Time32(TimeUnit::Second) => parse_string::<Time32SecondType,
i64>(array, cast_options),
Time32(TimeUnit::Millisecond) => {
parse_string::<Time32MillisecondType, i64>(array, cast_options)
@@ -1226,6 +1235,7 @@ pub fn cast_with_options(
FixedSizeBinary(size) => {
cast_binary_to_fixed_size_binary::<i32>(array, *size,
cast_options)
}
+ BinaryView => cast_byte_to_view::<BinaryType,
BinaryViewType>(array),
_ => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
@@ -1240,6 +1250,7 @@ pub fn cast_with_options(
FixedSizeBinary(size) => {
cast_binary_to_fixed_size_binary::<i64>(array, *size,
cast_options)
}
+ BinaryView => cast_byte_to_view::<LargeBinaryType,
BinaryViewType>(array),
_ => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
@@ -2238,6 +2249,56 @@ where
Ok(Arc::new(GenericByteArray::<TO>::from(array_data)))
}
+/// Helper function to cast from one `ByteArrayType` array to `ByteViewType`
array.
+fn cast_byte_to_view<FROM, V>(array: &dyn Array) -> Result<ArrayRef,
ArrowError>
+where
+ FROM: ByteArrayType,
+ FROM::Offset: OffsetSizeTrait + ToPrimitive,
+ V: ByteViewType,
+{
+ let data = array.to_data();
+ assert_eq!(data.data_type(), &FROM::DATA_TYPE);
+
+ let len = array.len();
+ let str_values_buf = data.buffers()[1].clone();
+ let offsets = data.buffers()[0].typed_data::<FROM::Offset>();
+
+ let mut views_builder = BufferBuilder::<u128>::new(len);
+ for w in offsets.windows(2) {
+ let offset = w[0].to_u32().unwrap();
+ let end = w[1].to_u32().unwrap();
+ let value_buf = &str_values_buf[offset as usize..end as usize];
+ let length = end - offset;
+
+ if length <= 12 {
+ let mut view_buffer = [0; 16];
+ view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
+ view_buffer[4..4 + value_buf.len()].copy_from_slice(value_buf);
+ views_builder.append(u128::from_le_bytes(view_buffer));
+ } else {
+ let view = ByteView {
+ length,
+ prefix:
u32::from_le_bytes(value_buf[0..4].try_into().unwrap()),
+ buffer_index: 0,
+ offset,
+ };
+ views_builder.append(view.into());
+ }
+ }
+
+ assert_eq!(views_builder.len(), len);
+
+ // Safety: the input was a valid array so it valid UTF8 (if string). And
+ // all offsets were valid and we created the views correctly
+ Ok(Arc::new(unsafe {
+ GenericByteViewArray::<V>::new_unchecked(
+ ScalarBuffer::new(views_builder.finish(), 0, len),
+ vec![str_values_buf],
+ data.nulls().cloned(),
+ )
+ }))
+}
+
#[cfg(test)]
mod tests {
use arrow_buffer::{Buffer, NullBuffer};
@@ -5044,6 +5105,70 @@ mod tests {
}
}
+ #[test]
+ fn test_string_to_view() {
+ _test_string_to_view::<i32>();
+ _test_string_to_view::<i64>();
+ }
+
+ fn _test_string_to_view<O>()
+ where
+ O: OffsetSizeTrait,
+ {
+ let data = vec![
+ Some("hello"),
+ Some("world"),
+ None,
+ Some("large payload over 12 bytes"),
+ Some("lulu"),
+ ];
+
+ let string_array = GenericStringArray::<O>::from(data.clone());
+
+ assert!(can_cast_types(
+ string_array.data_type(),
+ &DataType::Utf8View
+ ));
+
+ let string_view_array = cast(&string_array,
&DataType::Utf8View).unwrap();
+ assert_eq!(string_view_array.data_type(), &DataType::Utf8View);
+
+ let expect_string_view_array = StringViewArray::from(data);
+ assert_eq!(string_view_array.as_ref(), &expect_string_view_array);
+ }
+
+ #[test]
+ fn test_bianry_to_view() {
+ _test_binary_to_view::<i32>();
+ _test_binary_to_view::<i64>();
+ }
+
+ fn _test_binary_to_view<O>()
+ where
+ O: OffsetSizeTrait,
+ {
+ let data: Vec<Option<&[u8]>> = vec![
+ Some(b"hello"),
+ Some(b"world"),
+ None,
+ Some(b"large payload over 12 bytes"),
+ Some(b"lulu"),
+ ];
+
+ let binary_array = GenericBinaryArray::<O>::from(data.clone());
+
+ assert!(can_cast_types(
+ binary_array.data_type(),
+ &DataType::BinaryView
+ ));
+
+ let binary_view_array = cast(&binary_array,
&DataType::BinaryView).unwrap();
+ assert_eq!(binary_view_array.data_type(), &DataType::BinaryView);
+
+ let expect_binary_view_array = BinaryViewArray::from(data);
+ assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array);
+ }
+
#[test]
fn test_cast_from_f64() {
let f64_values: Vec<f64> = vec![