rluvaton commented on code in PR #8951:
URL: https://github.com/apache/arrow-rs/pull/8951#discussion_r2702355920
##########
arrow-select/src/coalesce.rs:
##########
@@ -238,10 +242,100 @@ impl BatchCoalescer {
batch: RecordBatch,
filter: &BooleanArray,
) -> Result<(), ArrowError> {
- // TODO: optimize this to avoid materializing (copying the results
- // of filter to a new batch)
- let filtered_batch = filter_record_batch(&batch, filter)?;
- self.push_batch(filtered_batch)
+ // We only support primitive types for now; fall back to filter_record_batch for
other types
+ // Also, skip the optimization when the filter is not very selective
+
+ // Build an optimized filter predicate that chooses the best iteration
strategy
+ let is_optimize_beneficial =
is_optimize_beneficial_record_batch(&batch);
+ let selected_count = filter.true_count();
+ let num_rows = batch.num_rows();
+
+ // Fast path: skip if no rows selected
+ if selected_count == 0 {
+ return Ok(());
+ }
+
+ // Fast path: if all rows selected, just push the batch
+ if selected_count == num_rows {
+ return self.push_batch(batch);
+ }
+
+ let selectivity = Some(selected_count as f64 / num_rows as f64);
+ let (_schema, arrays, _num_rows) = batch.into_parts();
+
+ // Setup input arrays as sources
+ assert_eq!(arrays.len(), self.in_progress_arrays.len());
+ self.in_progress_arrays
+ .iter_mut()
+ .zip(&arrays)
+ .for_each(|(in_progress, array)| {
+ in_progress.set_source(Some(Arc::clone(array)), selectivity);
+ });
+
+ // Choose iteration strategy based on the optimized predicate
+ self.copy_from_filter(filter, is_optimize_beneficial, selected_count)?;
+
+ // Clear sources to allow memory to be freed
+ for in_progress in self.in_progress_arrays.iter_mut() {
+ in_progress.set_source(None, None);
+ }
+
+ Ok(())
+ }
+
+ /// Helper to copy rows at the given indices, handling batch boundaries
efficiently
+ ///
+ /// This method batches the index iteration to avoid per-row batch
boundary checks.
+ fn copy_from_filter(
+ &mut self,
+ filter: &BooleanArray,
Review Comment:
Why take a `BooleanArray` rather than a `BooleanBuffer`? Do nulls have any
meaning here?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]