This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 0c3a24d2a42 Implement arrow-row encoding/decoding for view types
(#5922)
0c3a24d2a42 is described below
commit 0c3a24d2a42dfac7bf56bf2c87374463efee722e
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Sun Jun 23 19:30:59 2024 -0700
Implement arrow-row encoding/decoding for view types (#5922)
* implement arrow-row encoding/decoding for view types
* add doc comments, better error msg, more test coverage
* ensure no performance regression
* update perf
* fix bug
* make fmt happy
* Update arrow-array/src/array/byte_view_array.rs
Co-authored-by: Raphael Taylor-Davies
<[email protected]>
* update
* update comments
* move cmp around
* move things around and remove inline hint
* Update arrow-array/src/array/byte_view_array.rs
Co-authored-by: Andrew Lamb <[email protected]>
* Update arrow-ord/src/cmp.rs
Co-authored-by: Andrew Lamb <[email protected]>
* return error instead of panic
* remove unnecessary func
---------
Co-authored-by: Andrew Lamb <[email protected]>
Co-authored-by: Raphael Taylor-Davies
<[email protected]>
---
arrow-array/src/array/byte_view_array.rs | 54 ++++++++++-
arrow-cast/src/cast/mod.rs | 40 +-------
arrow-ord/src/cmp.rs | 158 ++++++++++++++++---------------
arrow-ord/src/ord.rs | 17 ++++
arrow-row/src/lib.rs | 54 ++++++++++-
arrow-row/src/variable.rs | 27 ++++++
arrow/benches/comparison_kernels.rs | 20 ++++
7 files changed, 250 insertions(+), 120 deletions(-)
diff --git a/arrow-array/src/array/byte_view_array.rs
b/arrow-array/src/array/byte_view_array.rs
index 187f5b8e6f9..f31bc1c785b 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -16,19 +16,22 @@
// under the License.
use crate::array::print_long_array;
-use crate::builder::GenericByteViewBuilder;
+use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
use crate::iterator::ArrayIter;
use crate::types::bytes::ByteArrayNativeType;
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
-use crate::{Array, ArrayAccessor, ArrayRef, Scalar};
-use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
+use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait,
Scalar};
+use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
use arrow_schema::{ArrowError, DataType};
+use num::ToPrimitive;
use std::any::Any;
use std::fmt::Debug;
use std::marker::PhantomData;
use std::sync::Arc;
+use super::ByteArrayType;
+
/// [Variable-size Binary View Layout]: An array of variable length bytes view
arrays.
///
/// Different than [`crate::GenericByteArray`] as it stores both an offset and
length
@@ -429,6 +432,51 @@ impl<T: ByteViewType + ?Sized> From<ArrayData> for
GenericByteViewArray<T> {
}
}
+/// Convert a [`GenericByteArray`] to a [`GenericByteViewArray`], reusing the
values buffer when possible:
+/// If the offsets are all less than `u32::MAX`, then we directly build the view
array on top of the existing buffer.
+impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
+where
+ FROM: ByteArrayType,
+ FROM::Offset: OffsetSizeTrait + ToPrimitive,
+ V: ByteViewType<Native = FROM::Native>,
+{
+ fn from(byte_array: &GenericByteArray<FROM>) -> Self {
+ let offsets = byte_array.offsets();
+
+ let can_reuse_buffer = match offsets.last() {
+ Some(offset) => offset.as_usize() < u32::MAX as usize,
+ None => true,
+ };
+
+ if can_reuse_buffer {
+ let len = byte_array.len();
+ let mut views_builder =
GenericByteViewBuilder::<V>::with_capacity(len);
+ let str_values_buf = byte_array.values().clone();
+ let block = views_builder.append_block(str_values_buf);
+ for (i, w) in offsets.windows(2).enumerate() {
+ let offset = w[0].as_usize();
+ let end = w[1].as_usize();
+ let length = end - offset;
+
+ if byte_array.is_null(i) {
+ views_builder.append_null();
+ } else {
+ // Safety: the input was a valid array so it is valid UTF8
(if string). And
+ // all offsets were valid
+ unsafe {
+ views_builder.append_view_unchecked(block, offset as
u32, length as u32)
+ }
+ }
+ }
+ assert_eq!(views_builder.len(), len);
+ views_builder.finish()
+ } else {
+ // TODO: the first u32::MAX can still be reused
+ GenericByteViewArray::<V>::from_iter(byte_array.iter())
+ }
+ }
+}
+
impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
fn from(mut array: GenericByteViewArray<T>) -> Self {
let len = array.len();
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 7a6e1a31bb4..e5ab304bb6f 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -1230,7 +1230,7 @@ pub fn cast_with_options(
let binary =
BinaryArray::from(array.as_string::<i32>().clone());
cast_byte_container::<BinaryType, LargeBinaryType>(&binary)
}
- Utf8View => cast_byte_to_view::<Utf8Type, StringViewType>(array),
+ Utf8View =>
Ok(Arc::new(StringViewArray::from(array.as_string::<i32>()))),
LargeUtf8 => cast_byte_container::<Utf8Type, LargeUtf8Type>(array),
Time32(TimeUnit::Second) => parse_string::<Time32SecondType,
i32>(array, cast_options),
Time32(TimeUnit::Millisecond) => {
@@ -1290,7 +1290,7 @@ pub fn cast_with_options(
LargeBinary => Ok(Arc::new(LargeBinaryArray::from(
array.as_string::<i64>().clone(),
))),
- Utf8View => cast_byte_to_view::<LargeUtf8Type,
StringViewType>(array),
+ Utf8View =>
Ok(Arc::new(StringViewArray::from(array.as_string::<i64>()))),
Time32(TimeUnit::Second) => parse_string::<Time32SecondType,
i64>(array, cast_options),
Time32(TimeUnit::Millisecond) => {
parse_string::<Time32MillisecondType, i64>(array, cast_options)
@@ -1338,7 +1338,7 @@ pub fn cast_with_options(
FixedSizeBinary(size) => {
cast_binary_to_fixed_size_binary::<i32>(array, *size,
cast_options)
}
- BinaryView => cast_byte_to_view::<BinaryType,
BinaryViewType>(array),
+ BinaryView =>
Ok(Arc::new(BinaryViewArray::from(array.as_binary::<i32>()))),
_ => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
@@ -1353,7 +1353,7 @@ pub fn cast_with_options(
FixedSizeBinary(size) => {
cast_binary_to_fixed_size_binary::<i64>(array, *size,
cast_options)
}
- BinaryView => cast_byte_to_view::<LargeBinaryType,
BinaryViewType>(array),
+ BinaryView =>
Ok(Arc::new(BinaryViewArray::from(array.as_binary::<i64>()))),
_ => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
@@ -2345,38 +2345,6 @@ where
Ok(Arc::new(GenericByteArray::<TO>::from(array_data)))
}
-/// Helper function to cast from one `ByteArrayType` array to `ByteViewType`
array.
-fn cast_byte_to_view<FROM, V>(array: &dyn Array) -> Result<ArrayRef,
ArrowError>
-where
- FROM: ByteArrayType,
- FROM::Offset: OffsetSizeTrait + ToPrimitive,
- V: ByteViewType,
-{
- let byte_array: &GenericByteArray<FROM> = array.as_bytes();
- let len = array.len();
- let str_values_buf = byte_array.values().clone();
- let offsets = byte_array.offsets();
-
- let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
- let block = views_builder.append_block(str_values_buf);
- for (i, w) in offsets.windows(2).enumerate() {
- let offset = w[0].to_u32().unwrap();
- let end = w[1].to_u32().unwrap();
- let length = end - offset;
-
- if byte_array.is_null(i) {
- views_builder.append_null();
- } else {
- // Safety: the input was a valid array so it valid UTF8 (if
string). And
- // all offsets were valid and we created the views correctly
- unsafe { views_builder.append_view_unchecked(block, offset,
length) }
- }
- }
-
- assert_eq!(views_builder.len(), len);
- Ok(Arc::new(views_builder.finish()))
-}
-
/// Helper function to cast from one `ByteViewType` array to `ByteArrayType`
array.
fn cast_view_to_byte<FROM, TO>(array: &dyn Array) -> Result<ArrayRef,
ArrowError>
where
diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs
index c300f995283..18f77a9275c 100644
--- a/arrow-ord/src/cmp.rs
+++ b/arrow-ord/src/cmp.rs
@@ -540,98 +540,32 @@ impl<'a, T: ByteArrayType> ArrayOrd for &'a
GenericByteArray<T> {
}
}
-/// Comparing two ByteView types are non-trivial.
-/// It takes a bit of patience to understand why we don't just compare two
&[u8] directly.
-///
-/// ByteView types give us the following two advantages, and we need to be
careful not to lose them:
-/// (1) For string/byte smaller than 12 bytes, the entire data is inlined in
the view.
-/// Meaning that reading one array element requires only one memory access
-/// (two memory access required for StringArray, one for offset buffer,
the other for value buffer).
-///
-/// (2) For string/byte larger than 12 bytes, we can still be faster than (for
certain operations) StringArray/ByteArray,
-/// thanks to the inlined 4 bytes.
-/// Consider equality check:
-/// If the first four bytes of the two strings are different, we can
return false immediately (with just one memory access).
-/// If we are unlucky and the first four bytes are the same, we need to
fallback to compare two full strings.
impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray<T> {
- /// Item.0 is the array, Item.1 is the index into the array.
- /// Why don't we just store Item.0[Item.1] as the item?
- /// - Because if we do so, we materialize the entire string (i.e., make
multiple memory accesses), which might be unnecessary.
- /// - Most of the time (eq, ord), we only need to look at the first 4
bytes to know the answer,
- /// e.g., if the inlined 4 bytes are different, we can directly return
unequal without looking at the full string.
+ /// This is the item type for the GenericByteViewArray::compare
+ /// Item.0 is the array, Item.1 is the index
type Item = (&'a GenericByteViewArray<T>, usize);
- /// # Equality check flow
- /// (1) if both string are smaller than 12 bytes, we can directly compare
the data inlined to the view.
- /// (2) if any of the string is larger than 12 bytes, we need to compare
the full string.
- /// (2.1) if the inlined 4 bytes are different, we can return false
immediately.
- /// (2.2) o.w., we need to compare the full string.
- ///
- /// # Safety
- /// (1) Indexing. The Self::Item.1 encodes the index value, which is
already checked in `value` function,
- /// so it is safe to index into the views.
- /// (2) Slice data from view. We know the bytes 4-8 are inlined data (per
spec), so it is safe to slice from the view.
fn is_eq(l: Self::Item, r: Self::Item) -> bool {
+ // # Safety
+ // The index is within bounds as it is checked in value()
let l_view = unsafe { l.0.views().get_unchecked(l.1) };
let l_len = *l_view as u32;
let r_view = unsafe { r.0.views().get_unchecked(r.1) };
let r_len = *r_view as u32;
-
+ // This is a fast path for equality check.
+ // We don't need to look at the actual bytes to determine if they are
equal.
if l_len != r_len {
return false;
}
- if l_len <= 12 {
- let l_data = unsafe {
GenericByteViewArray::<T>::inline_value(l_view, l_len as usize) };
- let r_data = unsafe {
GenericByteViewArray::<T>::inline_value(r_view, r_len as usize) };
- l_data == r_data
- } else {
- let l_inlined_data = unsafe {
GenericByteViewArray::<T>::inline_value(l_view, 4) };
- let r_inlined_data = unsafe {
GenericByteViewArray::<T>::inline_value(r_view, 4) };
- if l_inlined_data != r_inlined_data {
- return false;
- }
-
- let l_full_data: &[u8] = unsafe {
l.0.value_unchecked(l.1).as_ref() };
- let r_full_data: &[u8] = unsafe {
r.0.value_unchecked(r.1).as_ref() };
- l_full_data == r_full_data
- }
+ unsafe { compare_byte_view_unchecked(l.0, l.1, r.0, r.1).is_eq() }
}
- /// # Ordering check flow
- /// (1) if both string are smaller than 12 bytes, we can directly compare
the data inlined to the view.
- /// (2) if any of the string is larger than 12 bytes, we need to compare
the full string.
- /// (2.1) if the inlined 4 bytes are different, we can return the
result immediately.
- /// (2.2) o.w., we need to compare the full string.
- ///
- /// # Safety
- /// (1) Indexing. The Self::Item.1 encodes the index value, which is
already checked in `value` function,
- /// so it is safe to index into the views.
- /// (2) Slice data from view. We know the bytes 4-8 are inlined data (per
spec), so it is safe to slice from the view.
fn is_lt(l: Self::Item, r: Self::Item) -> bool {
- let l_view = l.0.views().get(l.1).unwrap();
- let l_len = *l_view as u32;
-
- let r_view = r.0.views().get(r.1).unwrap();
- let r_len = *r_view as u32;
-
- if l_len <= 12 && r_len <= 12 {
- let l_data = unsafe {
GenericByteViewArray::<T>::inline_value(l_view, l_len as usize) };
- let r_data = unsafe {
GenericByteViewArray::<T>::inline_value(r_view, r_len as usize) };
- return l_data < r_data;
- }
- // one of the string is larger than 12 bytes,
- // we then try to compare the inlined data first
- let l_inlined_data = unsafe {
GenericByteViewArray::<T>::inline_value(l_view, 4) };
- let r_inlined_data = unsafe {
GenericByteViewArray::<T>::inline_value(r_view, 4) };
- if r_inlined_data != l_inlined_data {
- return l_inlined_data < r_inlined_data;
- }
- // unfortunately, we need to compare the full data
- let l_full_data: &[u8] = unsafe { l.0.value_unchecked(l.1).as_ref() };
- let r_full_data: &[u8] = unsafe { r.0.value_unchecked(r.1).as_ref() };
- l_full_data < r_full_data
+ // # Safety
+ // The index is within bounds as it is checked in value()
+ unsafe { compare_byte_view_unchecked(l.0, l.1, r.0, r.1).is_lt() }
}
fn len(&self) -> usize {
@@ -663,6 +597,78 @@ impl<'a> ArrayOrd for &'a FixedSizeBinaryArray {
}
}
+/// Compares two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
+pub fn compare_byte_view<T: ByteViewType>(
+ left: &GenericByteViewArray<T>,
+ left_idx: usize,
+ right: &GenericByteViewArray<T>,
+ right_idx: usize,
+) -> std::cmp::Ordering {
+ assert!(left_idx < left.len());
+ assert!(right_idx < right.len());
+ unsafe { compare_byte_view_unchecked(left, left_idx, right, right_idx) }
+}
+
+/// Comparing two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
+///
+/// Comparing two ByteView types is non-trivial.
+/// It takes a bit of patience to understand why we don't just compare two
&[u8] directly.
+///
+/// ByteView types give us the following two advantages, and we need to be
careful not to lose them:
+/// (1) For strings/bytes of at most 12 bytes, the entire data is inlined in
the view.
+/// Meaning that reading one array element requires only one memory access
+/// (two memory accesses are required for StringArray: one for the offset buffer,
the other for the value buffer).
+///
+/// (2) For string/byte larger than 12 bytes, we can still be faster than (for
certain operations) StringArray/ByteArray,
+/// thanks to the inlined 4 bytes.
+/// Consider equality check:
+/// If the first four bytes of the two strings are different, we can
return false immediately (with just one memory access).
+///
+/// If we directly compare two &[u8], we materialize the entire string (i.e.,
make multiple memory accesses), which might be unnecessary.
+/// - Most of the time (eq, ord), we only need to look at the first 4 bytes to
know the answer,
+/// e.g., if the inlined 4 bytes are different, we can directly return
unequal without looking at the full string.
+///
+/// # Order check flow
+/// (1) if both strings are at most 12 bytes long, we can directly compare the
data inlined in the view.
+/// (2) if either string is longer than 12 bytes, we need to compare the
full string.
+/// (2.1) if the inlined 4 bytes are different, we can return the result
immediately.
+/// (2.2) o.w., we need to compare the full string.
+///
+/// # Safety
+/// The left/right_idx must be within range of each array
+pub unsafe fn compare_byte_view_unchecked<T: ByteViewType>(
+ left: &GenericByteViewArray<T>,
+ left_idx: usize,
+ right: &GenericByteViewArray<T>,
+ right_idx: usize,
+) -> std::cmp::Ordering {
+ let l_view = left.views().get_unchecked(left_idx);
+ let l_len = *l_view as u32;
+
+ let r_view = right.views().get_unchecked(right_idx);
+ let r_len = *r_view as u32;
+
+ if l_len <= 12 && r_len <= 12 {
+ let l_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view,
l_len as usize) };
+ let r_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view,
r_len as usize) };
+ return l_data.cmp(r_data);
+ }
+
+ // one of the string is larger than 12 bytes,
+ // we then try to compare the inlined data first
+ let l_inlined_data = unsafe {
GenericByteViewArray::<T>::inline_value(l_view, 4) };
+ let r_inlined_data = unsafe {
GenericByteViewArray::<T>::inline_value(r_view, 4) };
+ if r_inlined_data != l_inlined_data {
+ return l_inlined_data.cmp(r_inlined_data);
+ }
+
+ // unfortunately, we need to compare the full data
+ let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref()
};
+ let r_full_data: &[u8] = unsafe {
right.value_unchecked(right_idx).as_ref() };
+
+ l_full_data.cmp(r_full_data)
+}
+
#[cfg(test)]
mod tests {
use std::sync::Arc;
diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs
index 3825e5ec66f..6430c8f0e40 100644
--- a/arrow-ord/src/ord.rs
+++ b/arrow-ord/src/ord.rs
@@ -135,6 +135,21 @@ fn compare_bytes<T: ByteArrayType>(
})
}
+fn compare_byte_view<T: ByteViewType>(
+ left: &dyn Array,
+ right: &dyn Array,
+ opts: SortOptions,
+) -> DynComparator {
+ let left = left.as_byte_view::<T>();
+ let right = right.as_byte_view::<T>();
+
+ let l = left.clone();
+ let r = right.clone();
+ compare(left, right, opts, move |i, j| {
+ crate::cmp::compare_byte_view(&l, i, &r, j)
+ })
+}
+
fn compare_dict<K: ArrowDictionaryKeyType>(
left: &dyn Array,
right: &dyn Array,
@@ -342,8 +357,10 @@ pub fn make_comparator(
(Boolean, Boolean) => Ok(compare_boolean(left, right, opts)),
(Utf8, Utf8) => Ok(compare_bytes::<Utf8Type>(left, right, opts)),
(LargeUtf8, LargeUtf8) => Ok(compare_bytes::<LargeUtf8Type>(left,
right, opts)),
+ (Utf8View, Utf8View) => Ok(compare_byte_view::<StringViewType>(left,
right, opts)),
(Binary, Binary) => Ok(compare_bytes::<BinaryType>(left, right, opts)),
(LargeBinary, LargeBinary) =>
Ok(compare_bytes::<LargeBinaryType>(left, right, opts)),
+ (BinaryView, BinaryView) =>
Ok(compare_byte_view::<BinaryViewType>(left, right, opts)),
(FixedSizeBinary(_), FixedSizeBinary(_)) => {
let left = left.as_fixed_size_binary();
let right = right.as_fixed_size_binary();
diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
index 8e1285493b0..5dce771c85a 100644
--- a/arrow-row/src/lib.rs
+++ b/arrow-row/src/lib.rs
@@ -135,6 +135,7 @@ use arrow_array::*;
use arrow_buffer::ArrowNativeType;
use arrow_data::ArrayDataBuilder;
use arrow_schema::*;
+use variable::{decode_binary_view, decode_string_view};
use crate::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive};
use crate::variable::{decode_binary, decode_string};
@@ -1079,6 +1080,9 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder])
-> Vec<usize> {
.iter()
.zip(lengths.iter_mut())
.for_each(|(slice, length)| *length +=
variable::encoded_len(slice)),
+ DataType::BinaryView =>
array.as_binary_view().iter().zip(lengths.iter_mut()).for_each(|(slice,
length)| {
+ *length += variable::encoded_len(slice)
+ }),
DataType::Utf8 => array.as_string::<i32>()
.iter()
.zip(lengths.iter_mut())
@@ -1091,11 +1095,14 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder])
-> Vec<usize> {
.for_each(|(slice, length)| {
*length += variable::encoded_len(slice.map(|x|
x.as_bytes()))
}),
+ DataType::Utf8View =>
array.as_string_view().iter().zip(lengths.iter_mut()).for_each(|(slice,
length)| {
+ *length += variable::encoded_len(slice.map(|x|
x.as_bytes()))
+ }),
DataType::FixedSizeBinary(len) => {
let len = len.to_usize().unwrap();
lengths.iter_mut().for_each(|x| *x += 1 + len)
}
- _ => unreachable!(),
+ _ => unimplemented!("unsupported data type: {}",
array.data_type()),
}
}
Encoder::Dictionary(values, null) => {
@@ -1152,6 +1159,9 @@ fn encode_column(
DataType::Binary => {
variable::encode(data, offsets,
as_generic_binary_array::<i32>(column).iter(), opts)
}
+ DataType::BinaryView => {
+ variable::encode(data, offsets,
column.as_binary_view().iter(), opts)
+ }
DataType::LargeBinary => {
variable::encode(data, offsets,
as_generic_binary_array::<i64>(column).iter(), opts)
}
@@ -1167,11 +1177,16 @@ fn encode_column(
.map(|x| x.map(|x| x.as_bytes())),
opts,
),
+ DataType::Utf8View => variable::encode(
+ data, offsets,
+ column.as_string_view().iter().map(|x| x.map(|x|
x.as_bytes())),
+ opts,
+ ),
DataType::FixedSizeBinary(_) => {
let array = column.as_any().downcast_ref().unwrap();
fixed::encode_fixed_size_binary(data, offsets, array, opts)
}
- _ => unreachable!(),
+ _ => unimplemented!("unsupported data type: {}",
column.data_type()),
}
}
Encoder::Dictionary(values, nulls) => {
@@ -1255,11 +1270,12 @@ unsafe fn decode_column(
DataType::Boolean => Arc::new(decode_bool(rows, options)),
DataType::Binary => Arc::new(decode_binary::<i32>(rows,
options)),
DataType::LargeBinary => Arc::new(decode_binary::<i64>(rows,
options)),
+ DataType::BinaryView => Arc::new(decode_binary_view(rows,
options)),
DataType::FixedSizeBinary(size) =>
Arc::new(decode_fixed_size_binary(rows, size, options)),
DataType::Utf8 => Arc::new(decode_string::<i32>(rows, options,
validate_utf8)),
DataType::LargeUtf8 => Arc::new(decode_string::<i64>(rows,
options, validate_utf8)),
- DataType::Dictionary(_, _) => todo!(),
- _ => unreachable!()
+ DataType::Utf8View => Arc::new(decode_string_view(rows,
options, validate_utf8)),
+ _ => return
Err(ArrowError::NotYetImplemented(format!("unsupported data type: {}",
data_type)))
}
}
Codec::Dictionary(converter, _) => {
@@ -2047,6 +2063,32 @@ mod tests {
.collect()
}
+ fn generate_string_view(len: usize, valid_percent: f64) -> StringViewArray
{
+ let mut rng = thread_rng();
+ (0..len)
+ .map(|_| {
+ rng.gen_bool(valid_percent).then(|| {
+ let len = rng.gen_range(0..100);
+ let bytes = (0..len).map(|_|
rng.gen_range(0..128)).collect();
+ String::from_utf8(bytes).unwrap()
+ })
+ })
+ .collect()
+ }
+
+ fn generate_byte_view(len: usize, valid_percent: f64) -> BinaryViewArray {
+ let mut rng = thread_rng();
+ (0..len)
+ .map(|_| {
+ rng.gen_bool(valid_percent).then(|| {
+ let len = rng.gen_range(0..100);
+ let bytes: Vec<_> = (0..len).map(|_|
rng.gen_range(0..128)).collect();
+ bytes
+ })
+ })
+ .collect()
+ }
+
fn generate_dictionary<K>(
values: ArrayRef,
len: usize,
@@ -2127,7 +2169,7 @@ mod tests {
fn generate_column(len: usize) -> ArrayRef {
let mut rng = thread_rng();
- match rng.gen_range(0..14) {
+ match rng.gen_range(0..16) {
0 => Arc::new(generate_primitive_array::<Int32Type>(len, 0.8)),
1 => Arc::new(generate_primitive_array::<UInt32Type>(len, 0.8)),
2 => Arc::new(generate_primitive_array::<Int64Type>(len, 0.8)),
@@ -2161,6 +2203,8 @@ mod tests {
13 => Arc::new(generate_list(len, 0.8, |values_len| {
Arc::new(generate_struct(values_len, 0.8))
})),
+ 14 => Arc::new(generate_string_view(len, 0.8)),
+ 15 => Arc::new(generate_byte_view(len, 0.8)),
_ => unreachable!(),
}
}
diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs
index 45068baf2a3..c5aa7d8ac32 100644
--- a/arrow-row/src/variable.rs
+++ b/arrow-row/src/variable.rs
@@ -268,3 +268,30 @@ pub unsafe fn decode_string<I: OffsetSizeTrait>(
// Row data must have come from a valid UTF-8 array
GenericStringArray::from(builder.build_unchecked())
}
+
+/// Decodes a binary view array from `rows` with the provided `options`
+pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) ->
BinaryViewArray {
+ let decoded: GenericBinaryArray<i64> = decode_binary(rows, options);
+
+ // Better performance might be to directly build the binary view instead
of building to BinaryArray and then casting
+ // I suspect that the overhead is not a big deal.
+ // If it is, we can reimplement the `decode_binary_view` function to
directly build the BinaryViewArray
+ BinaryViewArray::from(&decoded)
+}
+
+/// Decodes a string view array from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// The row must contain valid UTF-8 data
+pub unsafe fn decode_string_view(
+ rows: &mut [&[u8]],
+ options: SortOptions,
+ validate_utf8: bool,
+) -> StringViewArray {
+ let decoded: GenericStringArray<i64> = decode_string(rows, options,
validate_utf8);
+ // Better performance might be to directly build the string view instead
of building to StringArray and then casting
+ // I suspect that the overhead is not a big deal.
+ // If it is, we can reimplement the `decode_string_view` function to
directly build the StringViewArray
+ StringViewArray::from(&decoded)
+}
diff --git a/arrow/benches/comparison_kernels.rs
b/arrow/benches/comparison_kernels.rs
index 360d4865924..e5432c70ee4 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -171,6 +171,26 @@ fn add_benchmark(c: &mut Criterion) {
})
});
+ c.bench_function("lt scalar StringViewArray", |b| {
+ b.iter(|| {
+ lt(
+ &Scalar::new(StringViewArray::from_iter_values(["xxxx"])),
+ &string_view_left,
+ )
+ .unwrap()
+ })
+ });
+
+ c.bench_function("lt scalar StringArray", |b| {
+ b.iter(|| {
+ lt(
+ &Scalar::new(StringArray::from_iter_values(["xxxx"])),
+ &string_left,
+ )
+ .unwrap()
+ })
+ });
+
c.bench_function("eq scalar StringViewArray", |b| {
b.iter(|| {
eq(