This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new e9bf8aa6b Speed up `filter_bytes` (#6699)
e9bf8aa6b is described below
commit e9bf8aa6bf67ec192fce1a6f3e7ab604c9689fef
Author: Daniël Heres <[email protected]>
AuthorDate: Thu Nov 7 23:51:41 2024 +0100
Speed up `filter_bytes` (#6699)
* Use vec
* Use extend, fix capacity
---
arrow-data/src/transform/variable_size.rs | 2 +-
arrow-select/src/filter.rs | 21 +++++++++++----------
2 files changed, 12 insertions(+), 11 deletions(-)
diff --git a/arrow-data/src/transform/variable_size.rs b/arrow-data/src/transform/variable_size.rs
index fa1592d97..ec0174bf8 100644
--- a/arrow-data/src/transform/variable_size.rs
+++ b/arrow-data/src/transform/variable_size.rs
@@ -34,7 +34,7 @@ fn extend_offset_values<T: ArrowNativeType + AsPrimitive<usize>>(
len: usize,
) {
let start_values = offsets[start].as_();
- let end_values = offsets[start + len].as_();
+ let end_values: usize = offsets[start + len].as_();
let new_values = &values[start_values..end_values];
buffer.extend_from_slice(new_values);
}
diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs
index d96dad2f1..451b04485 100644
--- a/arrow-select/src/filter.rs
+++ b/arrow-select/src/filter.rs
@@ -582,7 +582,6 @@ fn filter_native<T: ArrowNativeType>(values: &[T], predicate: &FilterPredicate)
}
IterationStrategy::Indices(indices) => {
let iter = indices.iter().map(|x| values[*x]);
-
// SAFETY: `Vec::iter` is trusted length
unsafe { MutableBuffer::from_trusted_len_iter(iter) }
}
@@ -618,8 +617,8 @@ where
struct FilterBytes<'a, OffsetSize> {
src_offsets: &'a [OffsetSize],
src_values: &'a [u8],
- dst_offsets: MutableBuffer,
- dst_values: MutableBuffer,
+ dst_offsets: Vec<OffsetSize>,
+ dst_values: Vec<u8>,
cur_offset: OffsetSize,
}
@@ -631,10 +630,10 @@ where
where
T: ByteArrayType<Offset = OffsetSize>,
{
- let num_offsets_bytes = (capacity + 1) * std::mem::size_of::<OffsetSize>();
- let mut dst_offsets = MutableBuffer::new(num_offsets_bytes);
- let dst_values = MutableBuffer::new(0);
+ let dst_values = Vec::new();
+ let mut dst_offsets: Vec<OffsetSize> = Vec::with_capacity(capacity + 1);
let cur_offset = OffsetSize::from_usize(0).unwrap();
+
dst_offsets.push(cur_offset);
Self {
@@ -664,13 +663,15 @@ where
/// Extends the in-progress array by the indexes in the provided iterator
fn extend_idx(&mut self, iter: impl Iterator<Item = usize>) {
- for idx in iter {
- let (start, end, len) = self.get_value_range(idx);
+ self.dst_offsets.extend(iter.map(|idx| {
+ let start = self.src_offsets[idx].as_usize();
+ let end = self.src_offsets[idx + 1].as_usize();
+ let len = OffsetSize::from_usize(end - start).expect("illegal offset range");
self.cur_offset += len;
- self.dst_offsets.push(self.cur_offset);
self.dst_values
.extend_from_slice(&self.src_values[start..end]);
- }
+ self.cur_offset
+ }));
}
/// Extends the in-progress array by the ranges in the provided iterator