(arrow-rs) branch master updated: Implement arrow-row encoding/decoding for view types (#5922)

alamb Sun, 23 Jun 2024 19:32:09 -0700

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/master by this push:
     new 0c3a24d2a42 Implement arrow-row encoding/decoding for view types 
(#5922)
0c3a24d2a42 is described below

commit 0c3a24d2a42dfac7bf56bf2c87374463efee722e
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Sun Jun 23 19:30:59 2024 -0700

    Implement arrow-row encoding/decoding for view types (#5922)
    
    * implement arrow-row encoding/decoding for view types
    
    * add doc comments, better error msg, more test coverage
    
    * ensure no performance regression
    
    * update perf
    
    * fix bug
    
    * make fmt happy
    
    * Update arrow-array/src/array/byte_view_array.rs
    
    Co-authored-by: Raphael Taylor-Davies 
<[email protected]>
    
    * update
    
    * update comments
    
    * move cmp around
    
    * move things around and remove inline hint
    
    * Update arrow-array/src/array/byte_view_array.rs
    
    Co-authored-by: Andrew Lamb <[email protected]>
    
    * Update arrow-ord/src/cmp.rs
    
    Co-authored-by: Andrew Lamb <[email protected]>
    
    * return error instead of panic
    
    * remove unnecessary func
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
    Co-authored-by: Raphael Taylor-Davies 
<[email protected]>
---
 arrow-array/src/array/byte_view_array.rs |  54 ++++++++++-
 arrow-cast/src/cast/mod.rs               |  40 +-------
 arrow-ord/src/cmp.rs                     | 158 ++++++++++++++++---------------
 arrow-ord/src/ord.rs                     |  17 ++++
 arrow-row/src/lib.rs                     |  54 ++++++++++-
 arrow-row/src/variable.rs                |  27 ++++++
 arrow/benches/comparison_kernels.rs      |  20 ++++
 7 files changed, 250 insertions(+), 120 deletions(-)

diff --git a/arrow-array/src/array/byte_view_array.rs 
b/arrow-array/src/array/byte_view_array.rs
index 187f5b8e6f9..f31bc1c785b 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -16,19 +16,22 @@
 // under the License.
 
 use crate::array::print_long_array;
-use crate::builder::GenericByteViewBuilder;
+use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
 use crate::iterator::ArrayIter;
 use crate::types::bytes::ByteArrayNativeType;
 use crate::types::{BinaryViewType, ByteViewType, StringViewType};
-use crate::{Array, ArrayAccessor, ArrayRef, Scalar};
-use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
+use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, 
Scalar};
+use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
 use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
 use arrow_schema::{ArrowError, DataType};
+use num::ToPrimitive;
 use std::any::Any;
 use std::fmt::Debug;
 use std::marker::PhantomData;
 use std::sync::Arc;
 
+use super::ByteArrayType;
+
 /// [Variable-size Binary View Layout]: An array of variable length bytes view 
arrays.
 ///
 /// Different than [`crate::GenericByteArray`] as it stores both an offset and 
length
@@ -429,6 +432,51 @@ impl<T: ByteViewType + ?Sized> From<ArrayData> for 
GenericByteViewArray<T> {
     }
 }
 
+/// Convert a [`GenericByteArray`] to a [`GenericByteViewArray`] but in a 
smart way:
+/// If the offsets are all less than u32::MAX, then we directly build the view 
array on top of existing buffer.
+impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
+where
+    FROM: ByteArrayType,
+    FROM::Offset: OffsetSizeTrait + ToPrimitive,
+    V: ByteViewType<Native = FROM::Native>,
+{
+    fn from(byte_array: &GenericByteArray<FROM>) -> Self {
+        let offsets = byte_array.offsets();
+
+        let can_reuse_buffer = match offsets.last() {
+            Some(offset) => offset.as_usize() < u32::MAX as usize,
+            None => true,
+        };
+
+        if can_reuse_buffer {
+            let len = byte_array.len();
+            let mut views_builder = 
GenericByteViewBuilder::<V>::with_capacity(len);
+            let str_values_buf = byte_array.values().clone();
+            let block = views_builder.append_block(str_values_buf);
+            for (i, w) in offsets.windows(2).enumerate() {
+                let offset = w[0].as_usize();
+                let end = w[1].as_usize();
+                let length = end - offset;
+
+                if byte_array.is_null(i) {
+                    views_builder.append_null();
+                } else {
+                    // Safety: the input was a valid array so it valid UTF8 
(if string). And
+                    // all offsets were valid
+                    unsafe {
+                        views_builder.append_view_unchecked(block, offset as 
u32, length as u32)
+                    }
+                }
+            }
+            assert_eq!(views_builder.len(), len);
+            views_builder.finish()
+        } else {
+            // TODO: the first u32::MAX can still be reused
+            GenericByteViewArray::<V>::from_iter(byte_array.iter())
+        }
+    }
+}
+
 impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
     fn from(mut array: GenericByteViewArray<T>) -> Self {
         let len = array.len();
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 7a6e1a31bb4..e5ab304bb6f 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -1230,7 +1230,7 @@ pub fn cast_with_options(
                 let binary = 
BinaryArray::from(array.as_string::<i32>().clone());
                 cast_byte_container::<BinaryType, LargeBinaryType>(&binary)
             }
-            Utf8View => cast_byte_to_view::<Utf8Type, StringViewType>(array),
+            Utf8View => 
Ok(Arc::new(StringViewArray::from(array.as_string::<i32>()))),
             LargeUtf8 => cast_byte_container::<Utf8Type, LargeUtf8Type>(array),
             Time32(TimeUnit::Second) => parse_string::<Time32SecondType, 
i32>(array, cast_options),
             Time32(TimeUnit::Millisecond) => {
@@ -1290,7 +1290,7 @@ pub fn cast_with_options(
             LargeBinary => Ok(Arc::new(LargeBinaryArray::from(
                 array.as_string::<i64>().clone(),
             ))),
-            Utf8View => cast_byte_to_view::<LargeUtf8Type, 
StringViewType>(array),
+            Utf8View => 
Ok(Arc::new(StringViewArray::from(array.as_string::<i64>()))),
             Time32(TimeUnit::Second) => parse_string::<Time32SecondType, 
i64>(array, cast_options),
             Time32(TimeUnit::Millisecond) => {
                 parse_string::<Time32MillisecondType, i64>(array, cast_options)
@@ -1338,7 +1338,7 @@ pub fn cast_with_options(
             FixedSizeBinary(size) => {
                 cast_binary_to_fixed_size_binary::<i32>(array, *size, 
cast_options)
             }
-            BinaryView => cast_byte_to_view::<BinaryType, 
BinaryViewType>(array),
+            BinaryView => 
Ok(Arc::new(BinaryViewArray::from(array.as_binary::<i32>()))),
             _ => Err(ArrowError::CastError(format!(
                 "Casting from {from_type:?} to {to_type:?} not supported",
             ))),
@@ -1353,7 +1353,7 @@ pub fn cast_with_options(
             FixedSizeBinary(size) => {
                 cast_binary_to_fixed_size_binary::<i64>(array, *size, 
cast_options)
             }
-            BinaryView => cast_byte_to_view::<LargeBinaryType, 
BinaryViewType>(array),
+            BinaryView => 
Ok(Arc::new(BinaryViewArray::from(array.as_binary::<i64>()))),
             _ => Err(ArrowError::CastError(format!(
                 "Casting from {from_type:?} to {to_type:?} not supported",
             ))),
@@ -2345,38 +2345,6 @@ where
     Ok(Arc::new(GenericByteArray::<TO>::from(array_data)))
 }
 
-/// Helper function to cast from one `ByteArrayType` array to `ByteViewType` 
array.
-fn cast_byte_to_view<FROM, V>(array: &dyn Array) -> Result<ArrayRef, 
ArrowError>
-where
-    FROM: ByteArrayType,
-    FROM::Offset: OffsetSizeTrait + ToPrimitive,
-    V: ByteViewType,
-{
-    let byte_array: &GenericByteArray<FROM> = array.as_bytes();
-    let len = array.len();
-    let str_values_buf = byte_array.values().clone();
-    let offsets = byte_array.offsets();
-
-    let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
-    let block = views_builder.append_block(str_values_buf);
-    for (i, w) in offsets.windows(2).enumerate() {
-        let offset = w[0].to_u32().unwrap();
-        let end = w[1].to_u32().unwrap();
-        let length = end - offset;
-
-        if byte_array.is_null(i) {
-            views_builder.append_null();
-        } else {
-            // Safety: the input was a valid array so it valid UTF8 (if 
string). And
-            // all offsets were valid and we created the views correctly
-            unsafe { views_builder.append_view_unchecked(block, offset, 
length) }
-        }
-    }
-
-    assert_eq!(views_builder.len(), len);
-    Ok(Arc::new(views_builder.finish()))
-}
-
 /// Helper function to cast from one `ByteViewType` array to `ByteArrayType` 
array.
 fn cast_view_to_byte<FROM, TO>(array: &dyn Array) -> Result<ArrayRef, 
ArrowError>
 where
diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs
index c300f995283..18f77a9275c 100644
--- a/arrow-ord/src/cmp.rs
+++ b/arrow-ord/src/cmp.rs
@@ -540,98 +540,32 @@ impl<'a, T: ByteArrayType> ArrayOrd for &'a 
GenericByteArray<T> {
     }
 }
 
-/// Comparing two ByteView types are non-trivial.
-/// It takes a bit of patience to understand why we don't just compare two 
&[u8] directly.
-///
-/// ByteView types give us the following two advantages, and we need to be 
careful not to lose them:
-/// (1) For string/byte smaller than 12 bytes, the entire data is inlined in 
the view.
-///     Meaning that reading one array element requires only one memory access
-///     (two memory access required for StringArray, one for offset buffer, 
the other for value buffer).
-///
-/// (2) For string/byte larger than 12 bytes, we can still be faster than (for 
certain operations) StringArray/ByteArray,
-///     thanks to the inlined 4 bytes.
-///     Consider equality check:
-///     If the first four bytes of the two strings are different, we can 
return false immediately (with just one memory access).
-///     If we are unlucky and the first four bytes are the same, we need to 
fallback to compare two full strings.
 impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray<T> {
-    /// Item.0 is the array, Item.1 is the index into the array.
-    /// Why don't we just store Item.0[Item.1] as the item?
-    ///  - Because if we do so, we materialize the entire string (i.e., make 
multiple memory accesses), which might be unnecessary.
-    ///  - Most of the time (eq, ord), we only need to look at the first 4 
bytes to know the answer,
-    ///     e.g., if the inlined 4 bytes are different, we can directly return 
unequal without looking at the full string.
+    /// This is the item type for the GenericByteViewArray::compare
+    /// Item.0 is the array, Item.1 is the index
     type Item = (&'a GenericByteViewArray<T>, usize);
 
-    /// # Equality check flow
-    /// (1) if both string are smaller than 12 bytes, we can directly compare 
the data inlined to the view.
-    /// (2) if any of the string is larger than 12 bytes, we need to compare 
the full string.
-    ///     (2.1) if the inlined 4 bytes are different, we can return false 
immediately.
-    ///     (2.2) o.w., we need to compare the full string.
-    ///
-    /// # Safety
-    /// (1) Indexing. The Self::Item.1 encodes the index value, which is 
already checked in `value` function,
-    ///               so it is safe to index into the views.
-    /// (2) Slice data from view. We know the bytes 4-8 are inlined data (per 
spec), so it is safe to slice from the view.
     fn is_eq(l: Self::Item, r: Self::Item) -> bool {
+        // # Safety
+        // The index is within bounds as it is checked in value()
         let l_view = unsafe { l.0.views().get_unchecked(l.1) };
         let l_len = *l_view as u32;
 
         let r_view = unsafe { r.0.views().get_unchecked(r.1) };
         let r_len = *r_view as u32;
-
+        // This is a fast path for equality check.
+        // We don't need to look at the actual bytes to determine if they are 
equal.
         if l_len != r_len {
             return false;
         }
 
-        if l_len <= 12 {
-            let l_data = unsafe { 
GenericByteViewArray::<T>::inline_value(l_view, l_len as usize) };
-            let r_data = unsafe { 
GenericByteViewArray::<T>::inline_value(r_view, r_len as usize) };
-            l_data == r_data
-        } else {
-            let l_inlined_data = unsafe { 
GenericByteViewArray::<T>::inline_value(l_view, 4) };
-            let r_inlined_data = unsafe { 
GenericByteViewArray::<T>::inline_value(r_view, 4) };
-            if l_inlined_data != r_inlined_data {
-                return false;
-            }
-
-            let l_full_data: &[u8] = unsafe { 
l.0.value_unchecked(l.1).as_ref() };
-            let r_full_data: &[u8] = unsafe { 
r.0.value_unchecked(r.1).as_ref() };
-            l_full_data == r_full_data
-        }
+        unsafe { compare_byte_view_unchecked(l.0, l.1, r.0, r.1).is_eq() }
     }
 
-    /// # Ordering check flow
-    /// (1) if both string are smaller than 12 bytes, we can directly compare 
the data inlined to the view.
-    /// (2) if any of the string is larger than 12 bytes, we need to compare 
the full string.
-    ///     (2.1) if the inlined 4 bytes are different, we can return the 
result immediately.
-    ///     (2.2) o.w., we need to compare the full string.
-    ///
-    /// # Safety
-    /// (1) Indexing. The Self::Item.1 encodes the index value, which is 
already checked in `value` function,
-    ///              so it is safe to index into the views.
-    /// (2) Slice data from view. We know the bytes 4-8 are inlined data (per 
spec), so it is safe to slice from the view.
     fn is_lt(l: Self::Item, r: Self::Item) -> bool {
-        let l_view = l.0.views().get(l.1).unwrap();
-        let l_len = *l_view as u32;
-
-        let r_view = r.0.views().get(r.1).unwrap();
-        let r_len = *r_view as u32;
-
-        if l_len <= 12 && r_len <= 12 {
-            let l_data = unsafe { 
GenericByteViewArray::<T>::inline_value(l_view, l_len as usize) };
-            let r_data = unsafe { 
GenericByteViewArray::<T>::inline_value(r_view, r_len as usize) };
-            return l_data < r_data;
-        }
-        // one of the string is larger than 12 bytes,
-        // we then try to compare the inlined data first
-        let l_inlined_data = unsafe { 
GenericByteViewArray::<T>::inline_value(l_view, 4) };
-        let r_inlined_data = unsafe { 
GenericByteViewArray::<T>::inline_value(r_view, 4) };
-        if r_inlined_data != l_inlined_data {
-            return l_inlined_data < r_inlined_data;
-        }
-        // unfortunately, we need to compare the full data
-        let l_full_data: &[u8] = unsafe { l.0.value_unchecked(l.1).as_ref() };
-        let r_full_data: &[u8] = unsafe { r.0.value_unchecked(r.1).as_ref() };
-        l_full_data < r_full_data
+        // # Safety
+        // The index is within bounds as it is checked in value()
+        unsafe { compare_byte_view_unchecked(l.0, l.1, r.0, r.1).is_lt() }
     }
 
     fn len(&self) -> usize {
@@ -663,6 +597,78 @@ impl<'a> ArrayOrd for &'a FixedSizeBinaryArray {
     }
 }
 
+/// Compares two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
+pub fn compare_byte_view<T: ByteViewType>(
+    left: &GenericByteViewArray<T>,
+    left_idx: usize,
+    right: &GenericByteViewArray<T>,
+    right_idx: usize,
+) -> std::cmp::Ordering {
+    assert!(left_idx < left.len());
+    assert!(right_idx < right.len());
+    unsafe { compare_byte_view_unchecked(left, left_idx, right, right_idx) }
+}
+
+/// Comparing two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
+///
+/// Comparing two ByteView types are non-trivial.
+/// It takes a bit of patience to understand why we don't just compare two 
&[u8] directly.
+///
+/// ByteView types give us the following two advantages, and we need to be 
careful not to lose them:
+/// (1) For string/byte smaller than 12 bytes, the entire data is inlined in 
the view.
+///     Meaning that reading one array element requires only one memory access
+///     (two memory access required for StringArray, one for offset buffer, 
the other for value buffer).
+///
+/// (2) For string/byte larger than 12 bytes, we can still be faster than (for 
certain operations) StringArray/ByteArray,
+///     thanks to the inlined 4 bytes.
+///     Consider equality check:
+///     If the first four bytes of the two strings are different, we can 
return false immediately (with just one memory access).
+///
+/// If we directly compare two &[u8], we materialize the entire string (i.e., 
make multiple memory accesses), which might be unnecessary.
+/// - Most of the time (eq, ord), we only need to look at the first 4 bytes to 
know the answer,
+///   e.g., if the inlined 4 bytes are different, we can directly return 
unequal without looking at the full string.
+///
+/// # Order check flow
+/// (1) if both string are smaller than 12 bytes, we can directly compare the 
data inlined to the view.
+/// (2) if any of the string is larger than 12 bytes, we need to compare the 
full string.
+///     (2.1) if the inlined 4 bytes are different, we can return the result 
immediately.
+///     (2.2) o.w., we need to compare the full string.
+///
+/// # Safety
+/// The left/right_idx must within range of each array
+pub unsafe fn compare_byte_view_unchecked<T: ByteViewType>(
+    left: &GenericByteViewArray<T>,
+    left_idx: usize,
+    right: &GenericByteViewArray<T>,
+    right_idx: usize,
+) -> std::cmp::Ordering {
+    let l_view = left.views().get_unchecked(left_idx);
+    let l_len = *l_view as u32;
+
+    let r_view = right.views().get_unchecked(right_idx);
+    let r_len = *r_view as u32;
+
+    if l_len <= 12 && r_len <= 12 {
+        let l_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, 
l_len as usize) };
+        let r_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, 
r_len as usize) };
+        return l_data.cmp(r_data);
+    }
+
+    // one of the string is larger than 12 bytes,
+    // we then try to compare the inlined data first
+    let l_inlined_data = unsafe { 
GenericByteViewArray::<T>::inline_value(l_view, 4) };
+    let r_inlined_data = unsafe { 
GenericByteViewArray::<T>::inline_value(r_view, 4) };
+    if r_inlined_data != l_inlined_data {
+        return l_inlined_data.cmp(r_inlined_data);
+    }
+
+    // unfortunately, we need to compare the full data
+    let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() 
};
+    let r_full_data: &[u8] = unsafe { 
right.value_unchecked(right_idx).as_ref() };
+
+    l_full_data.cmp(r_full_data)
+}
+
 #[cfg(test)]
 mod tests {
     use std::sync::Arc;
diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs
index 3825e5ec66f..6430c8f0e40 100644
--- a/arrow-ord/src/ord.rs
+++ b/arrow-ord/src/ord.rs
@@ -135,6 +135,21 @@ fn compare_bytes<T: ByteArrayType>(
     })
 }
 
+fn compare_byte_view<T: ByteViewType>(
+    left: &dyn Array,
+    right: &dyn Array,
+    opts: SortOptions,
+) -> DynComparator {
+    let left = left.as_byte_view::<T>();
+    let right = right.as_byte_view::<T>();
+
+    let l = left.clone();
+    let r = right.clone();
+    compare(left, right, opts, move |i, j| {
+        crate::cmp::compare_byte_view(&l, i, &r, j)
+    })
+}
+
 fn compare_dict<K: ArrowDictionaryKeyType>(
     left: &dyn Array,
     right: &dyn Array,
@@ -342,8 +357,10 @@ pub fn make_comparator(
         (Boolean, Boolean) => Ok(compare_boolean(left, right, opts)),
         (Utf8, Utf8) => Ok(compare_bytes::<Utf8Type>(left, right, opts)),
         (LargeUtf8, LargeUtf8) => Ok(compare_bytes::<LargeUtf8Type>(left, 
right, opts)),
+        (Utf8View, Utf8View) => Ok(compare_byte_view::<StringViewType>(left, 
right, opts)),
         (Binary, Binary) => Ok(compare_bytes::<BinaryType>(left, right, opts)),
         (LargeBinary, LargeBinary) => 
Ok(compare_bytes::<LargeBinaryType>(left, right, opts)),
+        (BinaryView, BinaryView) => 
Ok(compare_byte_view::<BinaryViewType>(left, right, opts)),
         (FixedSizeBinary(_), FixedSizeBinary(_)) => {
             let left = left.as_fixed_size_binary();
             let right = right.as_fixed_size_binary();
diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
index 8e1285493b0..5dce771c85a 100644
--- a/arrow-row/src/lib.rs
+++ b/arrow-row/src/lib.rs
@@ -135,6 +135,7 @@ use arrow_array::*;
 use arrow_buffer::ArrowNativeType;
 use arrow_data::ArrayDataBuilder;
 use arrow_schema::*;
+use variable::{decode_binary_view, decode_string_view};
 
 use crate::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive};
 use crate::variable::{decode_binary, decode_string};
@@ -1079,6 +1080,9 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) 
-> Vec<usize> {
                         .iter()
                         .zip(lengths.iter_mut())
                         .for_each(|(slice, length)| *length += 
variable::encoded_len(slice)),
+                    DataType::BinaryView => 
array.as_binary_view().iter().zip(lengths.iter_mut()).for_each(|(slice, 
length)| {
+                            *length += variable::encoded_len(slice)
+                        }),
                     DataType::Utf8 => array.as_string::<i32>()
                         .iter()
                         .zip(lengths.iter_mut())
@@ -1091,11 +1095,14 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) 
-> Vec<usize> {
                         .for_each(|(slice, length)| {
                             *length += variable::encoded_len(slice.map(|x| 
x.as_bytes()))
                         }),
+                    DataType::Utf8View => 
array.as_string_view().iter().zip(lengths.iter_mut()).for_each(|(slice, 
length)| {
+                        *length += variable::encoded_len(slice.map(|x| 
x.as_bytes()))
+                    }),
                     DataType::FixedSizeBinary(len) => {
                         let len = len.to_usize().unwrap();
                         lengths.iter_mut().for_each(|x| *x += 1 + len)
                     }
-                    _ => unreachable!(),
+                    _ => unimplemented!("unsupported data type: {}", 
array.data_type()),
                 }
             }
             Encoder::Dictionary(values, null) => {
@@ -1152,6 +1159,9 @@ fn encode_column(
                 DataType::Binary => {
                     variable::encode(data, offsets, 
as_generic_binary_array::<i32>(column).iter(), opts)
                 }
+                DataType::BinaryView => {
+                    variable::encode(data, offsets, 
column.as_binary_view().iter(), opts)
+                }
                 DataType::LargeBinary => {
                     variable::encode(data, offsets, 
as_generic_binary_array::<i64>(column).iter(), opts)
                 }
@@ -1167,11 +1177,16 @@ fn encode_column(
                         .map(|x| x.map(|x| x.as_bytes())),
                     opts,
                 ),
+                DataType::Utf8View => variable::encode(
+                    data, offsets,
+                    column.as_string_view().iter().map(|x| x.map(|x| 
x.as_bytes())),
+                    opts,
+                ),
                 DataType::FixedSizeBinary(_) => {
                     let array = column.as_any().downcast_ref().unwrap();
                     fixed::encode_fixed_size_binary(data, offsets, array, opts)
                 }
-                _ => unreachable!(),
+                _ => unimplemented!("unsupported data type: {}", 
column.data_type()),
             }
         }
         Encoder::Dictionary(values, nulls) => {
@@ -1255,11 +1270,12 @@ unsafe fn decode_column(
                 DataType::Boolean => Arc::new(decode_bool(rows, options)),
                 DataType::Binary => Arc::new(decode_binary::<i32>(rows, 
options)),
                 DataType::LargeBinary => Arc::new(decode_binary::<i64>(rows, 
options)),
+                DataType::BinaryView => Arc::new(decode_binary_view(rows, 
options)),
                 DataType::FixedSizeBinary(size) => 
Arc::new(decode_fixed_size_binary(rows, size, options)),
                 DataType::Utf8 => Arc::new(decode_string::<i32>(rows, options, 
validate_utf8)),
                 DataType::LargeUtf8 => Arc::new(decode_string::<i64>(rows, 
options, validate_utf8)),
-                DataType::Dictionary(_, _) => todo!(),
-                _ => unreachable!()
+                DataType::Utf8View => Arc::new(decode_string_view(rows, 
options, validate_utf8)),
+                _ => return 
Err(ArrowError::NotYetImplemented(format!("unsupported data type: {}", 
data_type)))
             }
         }
         Codec::Dictionary(converter, _) => {
@@ -2047,6 +2063,32 @@ mod tests {
             .collect()
     }
 
+    fn generate_string_view(len: usize, valid_percent: f64) -> StringViewArray 
{
+        let mut rng = thread_rng();
+        (0..len)
+            .map(|_| {
+                rng.gen_bool(valid_percent).then(|| {
+                    let len = rng.gen_range(0..100);
+                    let bytes = (0..len).map(|_| 
rng.gen_range(0..128)).collect();
+                    String::from_utf8(bytes).unwrap()
+                })
+            })
+            .collect()
+    }
+
+    fn generate_byte_view(len: usize, valid_percent: f64) -> BinaryViewArray {
+        let mut rng = thread_rng();
+        (0..len)
+            .map(|_| {
+                rng.gen_bool(valid_percent).then(|| {
+                    let len = rng.gen_range(0..100);
+                    let bytes: Vec<_> = (0..len).map(|_| 
rng.gen_range(0..128)).collect();
+                    bytes
+                })
+            })
+            .collect()
+    }
+
     fn generate_dictionary<K>(
         values: ArrayRef,
         len: usize,
@@ -2127,7 +2169,7 @@ mod tests {
 
     fn generate_column(len: usize) -> ArrayRef {
         let mut rng = thread_rng();
-        match rng.gen_range(0..14) {
+        match rng.gen_range(0..16) {
             0 => Arc::new(generate_primitive_array::<Int32Type>(len, 0.8)),
             1 => Arc::new(generate_primitive_array::<UInt32Type>(len, 0.8)),
             2 => Arc::new(generate_primitive_array::<Int64Type>(len, 0.8)),
@@ -2161,6 +2203,8 @@ mod tests {
             13 => Arc::new(generate_list(len, 0.8, |values_len| {
                 Arc::new(generate_struct(values_len, 0.8))
             })),
+            14 => Arc::new(generate_string_view(len, 0.8)),
+            15 => Arc::new(generate_byte_view(len, 0.8)),
             _ => unreachable!(),
         }
     }
diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs
index 45068baf2a3..c5aa7d8ac32 100644
--- a/arrow-row/src/variable.rs
+++ b/arrow-row/src/variable.rs
@@ -268,3 +268,30 @@ pub unsafe fn decode_string<I: OffsetSizeTrait>(
     // Row data must have come from a valid UTF-8 array
     GenericStringArray::from(builder.build_unchecked())
 }
+
+/// Decodes a binary view array from `rows` with the provided `options`
+pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> 
BinaryViewArray {
+    let decoded: GenericBinaryArray<i64> = decode_binary(rows, options);
+
+    // Better performance might be to directly build the binary view instead 
of building to BinaryArray and then casting
+    // I suspect that the overhead is not a big deal.
+    // If it is, we can reimplement the `decode_binary_view` function to 
directly build the StringViewArray
+    BinaryViewArray::from(&decoded)
+}
+
+/// Decodes a string view array from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// The row must contain valid UTF-8 data
+pub unsafe fn decode_string_view(
+    rows: &mut [&[u8]],
+    options: SortOptions,
+    validate_utf8: bool,
+) -> StringViewArray {
+    let decoded: GenericStringArray<i64> = decode_string(rows, options, 
validate_utf8);
+    // Better performance might be to directly build the string view instead 
of building to StringArray and then casting
+    // I suspect that the overhead is not a big deal.
+    // If it is, we can reimplement the `decode_string_view` function to 
directly build the StringViewArray
+    StringViewArray::from(&decoded)
+}
diff --git a/arrow/benches/comparison_kernels.rs 
b/arrow/benches/comparison_kernels.rs
index 360d4865924..e5432c70ee4 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -171,6 +171,26 @@ fn add_benchmark(c: &mut Criterion) {
         })
     });
 
+    c.bench_function("lt scalar StringViewArray", |b| {
+        b.iter(|| {
+            lt(
+                &Scalar::new(StringViewArray::from_iter_values(["xxxx"])),
+                &string_view_left,
+            )
+            .unwrap()
+        })
+    });
+
+    c.bench_function("lt scalar StringArray", |b| {
+        b.iter(|| {
+            lt(
+                &Scalar::new(StringArray::from_iter_values(["xxxx"])),
+                &string_left,
+            )
+            .unwrap()
+        })
+    });
+
     c.bench_function("eq scalar StringViewArray", |b| {
         b.iter(|| {
             eq(

(arrow-rs) branch master updated: Implement arrow-row encoding/decoding for view types (#5922)

Reply via email to