Dandandan commented on code in PR #9391:
URL: https://github.com/apache/arrow-rs/pull/9391#discussion_r2795072876
##########
arrow-select/src/take.rs:
##########
@@ -608,6 +616,141 @@ fn take_byte_view<T: ByteViewType, IndexType:
ArrowPrimitiveType>(
})
}
+/// `take` implementation for byte view arrays that compacts string data into
+/// new buffers rather than sharing the original buffers.
+///
+/// This fuses the gather (take) with string compaction in a single pass,
+/// producing an output array whose buffers contain only the referenced data.
+/// This is beneficial when `take` selects a small fraction of the source array,
+/// as it avoids keeping the original large buffers alive.
+///
+/// The output uses multiple buffers if a single buffer would exceed `u32::MAX`
+/// bytes, ensuring `ByteView::offset` never overflows.
+///
+/// # Safety contract
+/// Callers must ensure that all non-null indices are within bounds of
+/// `array` (i.e. `< array.len()`). This is guaranteed when called via
+/// `take()` with `check_bounds` enabled, or when the caller otherwise
+/// validates indices. Out-of-bounds indices will cause a panic (indexing
+/// `src_views`) or UB (via `get_unchecked` on `src_buffers`).
+#[inline(never)]
+fn take_byte_view_compact<T: ByteViewType, IndexType: ArrowPrimitiveType>(
+ array: &GenericByteViewArray<T>,
+ indices: &PrimitiveArray<IndexType>,
+) -> Result<GenericByteViewArray<T>, ArrowError> {
+ let src_views = array.views();
+ let src_buffers = array.data_buffers();
+ let index_nulls = indices.nulls();
+
+ // Phase 1: Calculate total non-inlined string bytes to pre-allocate.
+ // This avoids reallocations during the copy phase. We only read the u128
+ // view descriptors here, not the actual string data.
+ let mut total_bytes: usize = 0;
+ for (i, idx) in indices.values().iter().enumerate() {
+ // SAFETY: i < indices.len(), which equals index_nulls.len() by Arrow invariant
+ if index_nulls.is_some_and(|n| unsafe { !n.inner().value_unchecked(i)
}) {
+ continue;
+ }
+ let raw_view = src_views[idx.as_usize()];
+ let len = raw_view as u32;
+ if len > MAX_INLINE_VIEW_LEN {
+ total_bytes += len as usize;
+ }
+ }
+
+ // Phase 2: Build output views and compact string data.
+ // We cap each buffer at u32::MAX bytes to avoid offset overflow.
+ let mut new_views: Vec<u128> = Vec::with_capacity(indices.len());
+ let mut completed_buffers: Vec<Buffer> = Vec::new();
+ let initial_cap = total_bytes.min(u32::MAX as usize);
+ let mut current_buffer: Vec<u8> = Vec::with_capacity(initial_cap);
+ let mut written_bytes: usize = 0;
+
+ for (i, idx) in indices.values().iter().enumerate() {
Review Comment:
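This loop could use `collect` (e.g. building `new_views` from an iterator `map` over the indices) instead of an explicit `for` loop.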
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]