alamb commented on code in PR #7513: URL: https://github.com/apache/arrow-rs/pull/7513#discussion_r2113849972
########## arrow-select/src/filter.rs: ########## @@ -337,7 +367,94 @@ impl FilterPredicate { } } -fn filter_array(values: &dyn Array, predicate: &FilterPredicate) -> Result<ArrayRef, ArrowError> { +/// Appends the nulls from array for any filtered rows to the `null_buffer_builder` +/// +/// Panics if the strategy is +/// [`IterationStrategy::All`] or [`IterationStrategy::None`], which must be handled by the caller +fn append_filtered_nulls( + null_buffer_builder: &mut NullBufferBuilder, + array: &dyn Array, + predicate: &FilterPredicate, +) { + assert!(!matches!( + predicate.strategy, + IterationStrategy::All | IterationStrategy::None + )); + + let Some(nulls) = array.nulls() else { + // No nulls in the source array, so it anything selected by filter will be non null as well + null_buffer_builder.append_n_non_nulls(predicate.count()); + return; + }; + + // Filter the packed bitmask `buffer`, with `predicate` starting at bit offset `offset` + // TODO: maybe this could be improved to avoid materializing the temporary buffer + let nulls = filter_bits(nulls.inner(), predicate); + let nulls = NullBuffer::from(BooleanBuffer::new(nulls, 0, predicate.count)); + + null_buffer_builder.append_buffer(&nulls); +} + +impl<T: ArrowPrimitiveType> ArrayBuilderExtFilter for PrimitiveBuilder<T> { + fn append_filtered( + &mut self, + array: &dyn Array, + predicate: &FilterPredicate, + ) -> Result<(), ArrowError> { + // todo reserve space for the new values + // self.reserve(predicate.count); + + let array = array.as_primitive::<T>(); + let values = array.values(); + + assert!(values.len() >= predicate.filter.len()); + + let (values_builder, null_buffer_builder) = self.inner_mut(); + match &predicate.strategy { + IterationStrategy::SlicesIterator => { + append_filtered_nulls(null_buffer_builder, array, predicate); + for (start, end) in SlicesIterator::new(&predicate.filter) { + values_builder.append_slice(&values[start..end]); + } + } + IterationStrategy::Slices(slices) => { + append_filtered_nulls(null_buffer_builder, array, predicate); + for (start, end) in slices { + values_builder.append_slice(&values[*start..*end]); + } + } + IterationStrategy::IndexIterator => { + append_filtered_nulls(null_buffer_builder, array, predicate); + let iter = + IndexIterator::new(&predicate.filter, predicate.count).map(|x| values[x]); + // SAFETY: IndexIterator is trusted length + unsafe { values_builder.append_trusted_len_iter(iter) } + } + IterationStrategy::Indices(indices) => { + append_filtered_nulls(null_buffer_builder, array, predicate); + let iter = indices.iter().map(|x| values[*x]); + for v in iter { Review Comment: Good call -- I can find a way to make it faster (direct access to the underlying buffer) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org