This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new f538991  Fewer ByteArray allocations when writing binary columns (#820)
f538991 is described below

commit f538991fc9deaa2804e8c701a9a25f1e42d818d2
Author: Wakahisa <[email protected]>
AuthorDate: Mon Oct 11 21:59:11 2021 +0200

    Fewer ByteArray allocations when writing binary columns (#820)
    
    * split benchmarks of primitive arrays
    
    * add list benches
    
    * Allocate one ByteArray per row group write
    
    * enumerate
---
 parquet/src/arrow/arrow_writer.rs | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs
index 7728cd4..29bb54f 100644
--- a/parquet/src/arrow/arrow_writer.rs
+++ b/parquet/src/arrow/arrow_writer.rs
@@ -461,15 +461,25 @@ fn write_leaf(
 macro_rules! def_get_binary_array_fn {
     ($name:ident, $ty:ty) => {
         fn $name(array: &$ty) -> Vec<ByteArray> {
-            let mut values = Vec::with_capacity(array.len() - array.null_count());
-            for i in 0..array.len() {
-                if array.is_valid(i) {
-                    let bytes: Vec<u8> = array.value(i).into();
-                    let bytes = ByteArray::from(bytes);
-                    values.push(bytes);
-                }
-            }
-            values
+            let mut byte_array = ByteArray::new();
+            let ptr = crate::memory::ByteBufferPtr::new(
+                unsafe { array.value_data().typed_data::<u8>() }.to_vec(),
+            );
+            byte_array.set_data(ptr);
+            array
+                .value_offsets()
+                .windows(2)
+                .enumerate()
+                .filter_map(|(i, offsets)| {
+                    if array.is_valid(i) {
+                        let start = offsets[0] as usize;
+                        let len = offsets[1] as usize - start;
+                        Some(byte_array.slice(start, len))
+                    } else {
+                        None
+                    }
+                })
+                .collect()
         }
     };
 }

Reply via email to