This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 074bcb5793 Directly decode String/BinaryView types from arrow-row format (#6044)
074bcb5793 is described below
commit 074bcb5793e466c54ef7d81e9675a69aa16d2f6c
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Mon Jul 15 07:11:36 2024 -0400
Directly decode String/BinaryView types from arrow-row format (#6044)
* add string view bench
* check in new impl
* add utf8
* quick utf8 validation
* Update arrow-row/src/variable.rs
Co-authored-by: Andrew Lamb <[email protected]>
* address comments
* update
* Revert "address comments"
This reverts commit e2656c94dd5ff4fb2f486278feb346d44a7f5436.
* addr comments
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-row/src/variable.rs | 83 +++++++++++++++++++++++++++++++++++++--------
arrow/benches/row_format.rs | 14 +++++++-
2 files changed, 81 insertions(+), 16 deletions(-)
diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs
index c5aa7d8ac3..4d4bcddc08 100644
--- a/arrow-row/src/variable.rs
+++ b/arrow-row/src/variable.rs
@@ -22,6 +22,7 @@ use arrow_buffer::bit_util::ceil;
use arrow_buffer::MutableBuffer;
use arrow_data::ArrayDataBuilder;
use arrow_schema::{DataType, SortOptions};
+use builder::make_view;
/// The block size of the variable length encoding
pub const BLOCK_SIZE: usize = 32;
@@ -152,6 +153,8 @@ fn encode_blocks<const SIZE: usize>(out: &mut [u8], val: &[u8]) -> usize {
end_offset
}
+/// Decodes a single block of data
+/// The `f` function accepts a slice of the decoded data, it may be called multiple times
pub fn decode_blocks(row: &[u8], options: SortOptions, mut f: impl FnMut(&[u8])) -> usize {
let (non_empty_sentinel, continuation) = match options.descending {
true => (!NON_EMPTY_SENTINEL, !BLOCK_CONTINUATION),
@@ -243,6 +246,69 @@ pub fn decode_binary<I: OffsetSizeTrait>(
unsafe { GenericBinaryArray::from(builder.build_unchecked()) }
}
+fn decode_binary_view_inner(
+ rows: &mut [&[u8]],
+ options: SortOptions,
+ check_utf8: bool,
+) -> BinaryViewArray {
+ let len = rows.len();
+
+ let mut null_count = 0;
+
+ let nulls = MutableBuffer::collect_bool(len, |x| {
+ let valid = rows[x][0] != null_sentinel(options);
+ null_count += !valid as usize;
+ valid
+ });
+
+ let values_capacity: usize = rows.iter().map(|row| decoded_len(row, options)).sum();
+ let mut values = MutableBuffer::new(values_capacity);
+ let mut views = BufferBuilder::<u128>::new(len);
+
+ for row in rows {
+ let start_offset = values.len();
+ let offset = decode_blocks(row, options, |b| values.extend_from_slice(b));
+ if row[0] == null_sentinel(options) {
+ debug_assert_eq!(offset, 1);
+ debug_assert_eq!(start_offset, values.len());
+ views.append(0);
+ } else {
+ // Safety: we just appended the data to the end of the buffer
+ let val = unsafe { values.get_unchecked_mut(start_offset..) };
+
+ if options.descending {
+ val.iter_mut().for_each(|o| *o = !*o);
+ }
+
+ let view = make_view(val, 0, start_offset as u32);
+ views.append(view);
+ }
+ *row = &row[offset..];
+ }
+
+ if check_utf8 {
+ // the values contains all data, no matter if it is short or long
+ // we can validate utf8 in one go.
+ std::str::from_utf8(values.as_slice()).unwrap();
+ }
+
+ let builder = ArrayDataBuilder::new(DataType::BinaryView)
+ .len(len)
+ .null_count(null_count)
+ .null_bit_buffer(Some(nulls.into()))
+ .add_buffer(views.finish())
+ .add_buffer(values.into());
+
+ // SAFETY:
+ // Valid by construction above
+ unsafe { BinaryViewArray::from(builder.build_unchecked()) }
+}
+
+/// Decodes a binary view array from `rows` with the provided `options`
+pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray {
+ decode_binary_view_inner(rows, options, false)
+}
+
/// Decodes a string array from `rows` with the provided `options`
///
/// # Safety
@@ -269,16 +335,6 @@ pub unsafe fn decode_string<I: OffsetSizeTrait>(
GenericStringArray::from(builder.build_unchecked())
}
-/// Decodes a binary view array from `rows` with the provided `options`
-pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray {
- let decoded: GenericBinaryArray<i64> = decode_binary(rows, options);
-
- // Better performance might be to directly build the binary view instead of building to BinaryArray and then casting
- // I suspect that the overhead is not a big deal.
- // If it is, we can reimplement the `decode_binary_view` function to directly build the StringViewArray
- BinaryViewArray::from(&decoded)
-}
-
/// Decodes a string view array from `rows` with the provided `options`
///
/// # Safety
@@ -289,9 +345,6 @@ pub unsafe fn decode_string_view(
options: SortOptions,
validate_utf8: bool,
) -> StringViewArray {
- let decoded: GenericStringArray<i64> = decode_string(rows, options, validate_utf8);
- // Better performance might be to directly build the string view instead of building to StringArray and then casting
- // I suspect that the overhead is not a big deal.
- // If it is, we can reimplement the `decode_string_view` function to directly build the StringViewArray
- StringViewArray::from(&decoded)
+ let view = decode_binary_view_inner(rows, options, validate_utf8);
+ view.to_string_view_unchecked()
}
diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs
index b5298cbe36..0fb63b5b32 100644
--- a/arrow/benches/row_format.rs
+++ b/arrow/benches/row_format.rs
@@ -24,7 +24,7 @@ use arrow::datatypes::{Int64Type, UInt64Type};
use arrow::row::{RowConverter, SortField};
use arrow::util::bench_util::{
create_boolean_array, create_dict_from_values, create_primitive_array,
- create_string_array_with_len, create_string_dict_array,
+ create_string_array_with_len, create_string_dict_array, create_string_view_array_with_len,
};
use arrow_array::types::Int32Type;
use arrow_array::Array;
@@ -87,6 +87,18 @@ fn row_bench(c: &mut Criterion) {
let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 100)) as ArrayRef];
do_bench(c, "4096 string(100, 0.5)", cols);
+ let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 10, false)) as ArrayRef];
+ do_bench(c, "4096 string view(10, 0)", cols);
+
+ let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef];
+ do_bench(c, "4096 string view(30, 0)", cols);
+
+ let cols = vec![Arc::new(create_string_view_array_with_len(40960, 0., 100, false)) as ArrayRef];
+ do_bench(c, "40960 string view(100, 0)", cols);
+
+ let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0.5, 100, false)) as ArrayRef];
+ do_bench(c, "4096 string view(100, 0.5)", cols);
+
let cols = vec![Arc::new(create_string_dict_array::<Int32Type>(4096, 0., 10)) as ArrayRef];
do_bench(c, "4096 string_dictionary(10, 0)", cols);