This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new e9bf8aa6b Speed up `filter_bytes` (#6699)
e9bf8aa6b is described below

commit e9bf8aa6bf67ec192fce1a6f3e7ab604c9689fef
Author: Daniël Heres <[email protected]>
AuthorDate: Thu Nov 7 23:51:41 2024 +0100

    Speed up `filter_bytes` (#6699)
    
    * Use vec
    
    * Use extend, fix capacity
---
 arrow-data/src/transform/variable_size.rs |  2 +-
 arrow-select/src/filter.rs                | 21 +++++++++++----------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/arrow-data/src/transform/variable_size.rs b/arrow-data/src/transform/variable_size.rs
index fa1592d97..ec0174bf8 100644
--- a/arrow-data/src/transform/variable_size.rs
+++ b/arrow-data/src/transform/variable_size.rs
@@ -34,7 +34,7 @@ fn extend_offset_values<T: ArrowNativeType + AsPrimitive<usize>>(
     len: usize,
 ) {
     let start_values = offsets[start].as_();
-    let end_values = offsets[start + len].as_();
+    let end_values: usize = offsets[start + len].as_();
     let new_values = &values[start_values..end_values];
     buffer.extend_from_slice(new_values);
 }
diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs
index d96dad2f1..451b04485 100644
--- a/arrow-select/src/filter.rs
+++ b/arrow-select/src/filter.rs
@@ -582,7 +582,6 @@ fn filter_native<T: ArrowNativeType>(values: &[T], predicate: &FilterPredicate)
         }
         IterationStrategy::Indices(indices) => {
             let iter = indices.iter().map(|x| values[*x]);
-
             // SAFETY: `Vec::iter` is trusted length
             unsafe { MutableBuffer::from_trusted_len_iter(iter) }
         }
@@ -618,8 +617,8 @@ where
 struct FilterBytes<'a, OffsetSize> {
     src_offsets: &'a [OffsetSize],
     src_values: &'a [u8],
-    dst_offsets: MutableBuffer,
-    dst_values: MutableBuffer,
+    dst_offsets: Vec<OffsetSize>,
+    dst_values: Vec<u8>,
     cur_offset: OffsetSize,
 }
 
@@ -631,10 +630,10 @@ where
     where
         T: ByteArrayType<Offset = OffsetSize>,
     {
-        let num_offsets_bytes = (capacity + 1) * std::mem::size_of::<OffsetSize>();
-        let mut dst_offsets = MutableBuffer::new(num_offsets_bytes);
-        let dst_values = MutableBuffer::new(0);
+        let dst_values = Vec::new();
+        let mut dst_offsets: Vec<OffsetSize> = Vec::with_capacity(capacity + 1);
         let cur_offset = OffsetSize::from_usize(0).unwrap();
+
         dst_offsets.push(cur_offset);
 
         Self {
@@ -664,13 +663,15 @@ where
 
     /// Extends the in-progress array by the indexes in the provided iterator
     fn extend_idx(&mut self, iter: impl Iterator<Item = usize>) {
-        for idx in iter {
-            let (start, end, len) = self.get_value_range(idx);
+        self.dst_offsets.extend(iter.map(|idx| {
+            let start = self.src_offsets[idx].as_usize();
+            let end = self.src_offsets[idx + 1].as_usize();
+            let len = OffsetSize::from_usize(end - start).expect("illegal offset range");
             self.cur_offset += len;
-            self.dst_offsets.push(self.cur_offset);
             self.dst_values
                 .extend_from_slice(&self.src_values[start..end]);
-        }
+            self.cur_offset
+        }));
     }
 
     /// Extends the in-progress array by the ranges in the provided iterator

Reply via email to