This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 074bcb5793 Directly decode String/BinaryView types from arrow-row format (#6044)
074bcb5793 is described below

commit 074bcb5793e466c54ef7d81e9675a69aa16d2f6c
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Mon Jul 15 07:11:36 2024 -0400

    Directly decode String/BinaryView types from arrow-row format (#6044)
    
    * add string view bench
    
    * check in new impl
    
    * add utf8
    
    * quick utf8 validation
    
    * Update arrow-row/src/variable.rs
    
    Co-authored-by: Andrew Lamb <[email protected]>
    
    * address comments
    
    * update
    
    * Revert "address comments"
    
    This reverts commit e2656c94dd5ff4fb2f486278feb346d44a7f5436.
    
    * addr comments
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-row/src/variable.rs   | 83 +++++++++++++++++++++++++++++++++++++--------
 arrow/benches/row_format.rs | 14 +++++++-
 2 files changed, 81 insertions(+), 16 deletions(-)

diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs
index c5aa7d8ac3..4d4bcddc08 100644
--- a/arrow-row/src/variable.rs
+++ b/arrow-row/src/variable.rs
@@ -22,6 +22,7 @@ use arrow_buffer::bit_util::ceil;
 use arrow_buffer::MutableBuffer;
 use arrow_data::ArrayDataBuilder;
 use arrow_schema::{DataType, SortOptions};
+use builder::make_view;
 
 /// The block size of the variable length encoding
 pub const BLOCK_SIZE: usize = 32;
@@ -152,6 +153,8 @@ fn encode_blocks<const SIZE: usize>(out: &mut [u8], val: &[u8]) -> usize {
     end_offset
 }
 
+/// Decodes a single block of data
+/// The `f` function accepts a slice of the decoded data, it may be called multiple times
 pub fn decode_blocks(row: &[u8], options: SortOptions, mut f: impl FnMut(&[u8])) -> usize {
     let (non_empty_sentinel, continuation) = match options.descending {
         true => (!NON_EMPTY_SENTINEL, !BLOCK_CONTINUATION),
@@ -243,6 +246,69 @@ pub fn decode_binary<I: OffsetSizeTrait>(
     unsafe { GenericBinaryArray::from(builder.build_unchecked()) }
 }
 
+fn decode_binary_view_inner(
+    rows: &mut [&[u8]],
+    options: SortOptions,
+    check_utf8: bool,
+) -> BinaryViewArray {
+    let len = rows.len();
+
+    let mut null_count = 0;
+
+    let nulls = MutableBuffer::collect_bool(len, |x| {
+        let valid = rows[x][0] != null_sentinel(options);
+        null_count += !valid as usize;
+        valid
+    });
+
+    let values_capacity: usize = rows.iter().map(|row| decoded_len(row, options)).sum();
+    let mut values = MutableBuffer::new(values_capacity);
+    let mut views = BufferBuilder::<u128>::new(len);
+
+    for row in rows {
+        let start_offset = values.len();
+        let offset = decode_blocks(row, options, |b| values.extend_from_slice(b));
+        if row[0] == null_sentinel(options) {
+            debug_assert_eq!(offset, 1);
+            debug_assert_eq!(start_offset, values.len());
+            views.append(0);
+        } else {
+            // Safety: we just appended the data to the end of the buffer
+            let val = unsafe { values.get_unchecked_mut(start_offset..) };
+
+            if options.descending {
+                val.iter_mut().for_each(|o| *o = !*o);
+            }
+
+            let view = make_view(val, 0, start_offset as u32);
+            views.append(view);
+        }
+        *row = &row[offset..];
+    }
+
+    if check_utf8 {
+        // the values contains all data, no matter if it is short or long
+        // we can validate utf8 in one go.
+        std::str::from_utf8(values.as_slice()).unwrap();
+    }
+
+    let builder = ArrayDataBuilder::new(DataType::BinaryView)
+        .len(len)
+        .null_count(null_count)
+        .null_bit_buffer(Some(nulls.into()))
+        .add_buffer(views.finish())
+        .add_buffer(values.into());
+
+    // SAFETY:
+    // Valid by construction above
+    unsafe { BinaryViewArray::from(builder.build_unchecked()) }
+}
+
+/// Decodes a binary view array from `rows` with the provided `options`
+pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray {
+    decode_binary_view_inner(rows, options, false)
+}
+
 /// Decodes a string array from `rows` with the provided `options`
 ///
 /// # Safety
@@ -269,16 +335,6 @@ pub unsafe fn decode_string<I: OffsetSizeTrait>(
     GenericStringArray::from(builder.build_unchecked())
 }
 
-/// Decodes a binary view array from `rows` with the provided `options`
-pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray {
-    let decoded: GenericBinaryArray<i64> = decode_binary(rows, options);
-
-    // Better performance might be to directly build the binary view instead of building to BinaryArray and then casting
-    // I suspect that the overhead is not a big deal.
-    // If it is, we can reimplement the `decode_binary_view` function to directly build the StringViewArray
-    BinaryViewArray::from(&decoded)
-}
-
 /// Decodes a string view array from `rows` with the provided `options`
 ///
 /// # Safety
@@ -289,9 +345,6 @@ pub unsafe fn decode_string_view(
     options: SortOptions,
     validate_utf8: bool,
 ) -> StringViewArray {
-    let decoded: GenericStringArray<i64> = decode_string(rows, options, validate_utf8);
-    // Better performance might be to directly build the string view instead of building to StringArray and then casting
-    // I suspect that the overhead is not a big deal.
-    // If it is, we can reimplement the `decode_string_view` function to directly build the StringViewArray
-    StringViewArray::from(&decoded)
+    let view = decode_binary_view_inner(rows, options, validate_utf8);
+    view.to_string_view_unchecked()
 }
diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs
index b5298cbe36..0fb63b5b32 100644
--- a/arrow/benches/row_format.rs
+++ b/arrow/benches/row_format.rs
@@ -24,7 +24,7 @@ use arrow::datatypes::{Int64Type, UInt64Type};
 use arrow::row::{RowConverter, SortField};
 use arrow::util::bench_util::{
     create_boolean_array, create_dict_from_values, create_primitive_array,
-    create_string_array_with_len, create_string_dict_array,
+    create_string_array_with_len, create_string_dict_array, create_string_view_array_with_len,
 };
 use arrow_array::types::Int32Type;
 use arrow_array::Array;
@@ -87,6 +87,18 @@ fn row_bench(c: &mut Criterion) {
     let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 100)) as ArrayRef];
     do_bench(c, "4096 string(100, 0.5)", cols);
 
+    let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 10, false)) as ArrayRef];
+    do_bench(c, "4096 string view(10, 0)", cols);
+
+    let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef];
+    do_bench(c, "4096 string view(30, 0)", cols);
+
+    let cols = vec![Arc::new(create_string_view_array_with_len(40960, 0., 100, false)) as ArrayRef];
+    do_bench(c, "40960 string view(100, 0)", cols);
+
+    let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0.5, 100, false)) as ArrayRef];
+    do_bench(c, "4096 string view(100, 0.5)", cols);
+
     let cols = vec![Arc::new(create_string_dict_array::<Int32Type>(4096, 0., 10)) as ArrayRef];
     do_bench(c, "4096 string_dictionary(10, 0)", cols);
 

Reply via email to