This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new f538991 Fewer ByteArray allocations when writing binary columns (#820)
f538991 is described below
commit f538991fc9deaa2804e8c701a9a25f1e42d818d2
Author: Wakahisa <[email protected]>
AuthorDate: Mon Oct 11 21:59:11 2021 +0200
Fewer ByteArray allocations when writing binary columns (#820)
* split benchmarks of primitive arrays
* add list benches
* Allocate one ByteArray per row group write
* enumerate
---
parquet/src/arrow/arrow_writer.rs | 28 +++++++++++++++++++---------
1 file changed, 19 insertions(+), 9 deletions(-)
diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs
index 7728cd4..29bb54f 100644
--- a/parquet/src/arrow/arrow_writer.rs
+++ b/parquet/src/arrow/arrow_writer.rs
@@ -461,15 +461,25 @@ fn write_leaf(
macro_rules! def_get_binary_array_fn {
($name:ident, $ty:ty) => {
fn $name(array: &$ty) -> Vec<ByteArray> {
- let mut values = Vec::with_capacity(array.len() - array.null_count());
- for i in 0..array.len() {
- if array.is_valid(i) {
- let bytes: Vec<u8> = array.value(i).into();
- let bytes = ByteArray::from(bytes);
- values.push(bytes);
- }
- }
- values
+ let mut byte_array = ByteArray::new();
+ let ptr = crate::memory::ByteBufferPtr::new(
+ unsafe { array.value_data().typed_data::<u8>() }.to_vec(),
+ );
+ byte_array.set_data(ptr);
+ array
+ .value_offsets()
+ .windows(2)
+ .enumerate()
+ .filter_map(|(i, offsets)| {
+ if array.is_valid(i) {
+ let start = offsets[0] as usize;
+ let len = offsets[1] as usize - start;
+ Some(byte_array.slice(start, len))
+ } else {
+ None
+ }
+ })
+ .collect()
}
};
}