This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 79575aa343 perf: zero-copy path in `RowConverter::from_binary` (#8686)
79575aa343 is described below
commit 79575aa343c2a2aa4bf226554ec4e4f7ff1cb37e
Author: Mikhail Zabaluev <[email protected]>
AuthorDate: Fri Oct 24 18:28:03 2025 +0300
perf: zero-copy path in `RowConverter::from_binary` (#8686)
# Which issue does this PR close?
- Closes #8685.
# What changes are included in this PR?
In the implementation of `RowConverter::from_binary`, the `BinaryArray`
is broken into parts and an attempt is made to convert the data buffer
into a `Vec` at no copying cost with `Buffer::into_vec`. Only if this
fails is the data copied into a newly allocated `Vec`.
# Are these changes tested?
Passes existing tests using `RowConverter::from_binary`, which all
convert a non-shared buffer taking advantage of the optimization.
Another test is added to cover the copying path.
# Are there any user-facing changes?
No
---
arrow-row/src/lib.rs | 21 +++++++++++++++++++--
1 file changed, 19 insertions(+), 2 deletions(-)
diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
index db7758047c..5f690e9a67 100644
--- a/arrow-row/src/lib.rs
+++ b/arrow-row/src/lib.rs
@@ -913,9 +913,13 @@ impl RowConverter {
0,
"can't construct Rows instance from array with nulls"
);
+ let (offsets, values, _) = array.into_parts();
+ let offsets = offsets.iter().map(|&i| i.as_usize()).collect();
+ // Try zero-copy, if it does not succeed, fall back to copying the values.
+ let buffer = values.into_vec().unwrap_or_else(|values| values.to_vec());
Rows {
- buffer: array.values().to_vec(),
- offsets: array.offsets().iter().map(|&i| i.as_usize()).collect(),
+ buffer,
+ offsets,
config: RowConfig {
fields: Arc::clone(&self.fields),
validate_utf8: true,
@@ -2474,6 +2478,19 @@ mod tests {
assert!(rows.row(3) < rows.row(0));
}
+ #[test]
+ fn test_from_binary_shared_buffer() {
+ let converter = RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap();
+ let array = Arc::new(BinaryArray::from_iter_values([&[0xFF]])) as _;
+ let rows = converter.convert_columns(&[array]).unwrap();
+ let binary_rows = rows.try_into_binary().expect("known-small rows");
+ let _binary_rows_shared_buffer = binary_rows.clone();
+
+ let parsed = converter.from_binary(binary_rows);
+
+ converter.convert_rows(parsed.iter()).unwrap();
+ }
+
#[test]
#[should_panic(expected = "Encountered non UTF-8 data")]
fn test_invalid_utf8() {