This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 7f3d3aef30 perf: override `count`, `nth`, `nth_back`, `last` and `max`
for BitIterator (#8696)
7f3d3aef30 is described below
commit 7f3d3aef30613b039939efec8780ab4eaa8cb98d
Author: Raz Luvaton <[email protected]>
AuthorDate: Mon Oct 27 20:21:28 2025 +0200
perf: override `count`, `nth`, `nth_back`, `last` and `max` for BitIterator
(#8696)
# Which issue does this PR close?
N/A
# Rationale for this change
overriding this function improve performance over the fallback
implementation
# What changes are included in this PR?
Override implementation of:
- `count` which is not optimized away even when `ExactSizeIterator` is
implemented
- `nth` to avoid calling `next` `n + 1` times (which is also used when
doing `.skip`)
- `nth_back`
- `last`
- `max`
# Are these changes tested?
Yes, I've added a lot of tests
# Are there any user-facing changes?
Nope
---
arrow-buffer/src/util/bit_iterator.rs | 516 ++++++++++++++++++++++++++++++++++
1 file changed, 516 insertions(+)
diff --git a/arrow-buffer/src/util/bit_iterator.rs
b/arrow-buffer/src/util/bit_iterator.rs
index c7f6f94fb8..0aa94a5d4d 100644
--- a/arrow-buffer/src/util/bit_iterator.rs
+++ b/arrow-buffer/src/util/bit_iterator.rs
@@ -23,6 +23,7 @@ use crate::bit_util::{ceil, get_bit_raw};
/// Iterator over the bits within a packed bitmask
///
/// To efficiently iterate over just the set bits see [`BitIndexIterator`] and
[`BitSliceIterator`]
+#[derive(Clone)]
pub struct BitIterator<'a> {
buffer: &'a [u8],
current_offset: usize,
@@ -71,6 +72,71 @@ impl Iterator for BitIterator<'_> {
let remaining_bits = self.end_offset - self.current_offset;
(remaining_bits, Some(remaining_bits))
}
+
+ fn count(self) -> usize
+ where
+ Self: Sized,
+ {
+ self.len()
+ }
+
+ fn nth(&mut self, n: usize) -> Option<Self::Item> {
+ // Check if we can advance to the desired offset.
+ // When n is 0 it means we want the next() value
+ // and when n is 1 we want the next().next() value
+ // so adding n to the current offset and not n - 1
+ match self.current_offset.checked_add(n) {
+ // Yes, and still within bounds
+ Some(new_offset) if new_offset < self.end_offset => {
+ self.current_offset = new_offset;
+ }
+
+ // Either overflow or would exceed end_offset
+ _ => {
+ self.current_offset = self.end_offset;
+ return None;
+ }
+ }
+
+ self.next()
+ }
+
+ fn last(mut self) -> Option<Self::Item> {
+ // If already at the end, return None
+ if self.current_offset == self.end_offset {
+ return None;
+ }
+
+ // Go to the one before the last bit
+ self.current_offset = self.end_offset - 1;
+
+ // Return the last bit
+ self.next()
+ }
+
+ fn max(self) -> Option<Self::Item>
+ where
+ Self: Sized,
+ Self::Item: Ord,
+ {
+ if self.current_offset == self.end_offset {
+ return None;
+ }
+
+ // true is greater than false so we only need to check if there's any
true bit
+ let mut bit_index_iter = BitIndexIterator::new(
+ self.buffer,
+ self.current_offset,
+ self.end_offset - self.current_offset,
+ );
+
+ if bit_index_iter.next().is_some() {
+ return Some(true);
+ }
+
+ // We know the iterator is not empty and there are no set bits so
false is the max
+ Some(false)
+ }
}
impl ExactSizeIterator for BitIterator<'_> {}
@@ -86,6 +152,27 @@ impl DoubleEndedIterator for BitIterator<'_> {
let v = unsafe { get_bit_raw(self.buffer.as_ptr(), self.end_offset) };
Some(v)
}
+
+ fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
+ // Check if we can advance to the desired offset.
+ // When n is 0 it means we want the next_back() value
+ // and when n is 1 we want the next_back().next_back() value
+ // so subtracting n to the current offset and not n - 1
+ match self.end_offset.checked_sub(n) {
+ // Yes, and still within bounds
+ Some(new_offset) if self.current_offset < new_offset => {
+ self.end_offset = new_offset;
+ }
+
+ // Either underflow or would exceed current_offset
+ _ => {
+ self.current_offset = self.end_offset;
+ return None;
+ }
+ }
+
+ self.next_back()
+ }
}
/// Iterator of contiguous ranges of set bits within a provided packed bitmask
@@ -327,6 +414,12 @@ pub fn try_for_each_valid_idx<E, F: FnMut(usize) ->
Result<(), E>>(
#[cfg(test)]
mod tests {
use super::*;
+ use crate::BooleanBuffer;
+ use rand::rngs::StdRng;
+ use rand::{Rng, SeedableRng};
+ use std::fmt::Debug;
+ use std::iter::Copied;
+ use std::slice::Iter;
#[test]
fn test_bit_iterator_size_hint() {
@@ -486,4 +579,427 @@ mod tests {
.collect();
assert_eq!(result, expected);
}
+
+ trait SharedBetweenBitIteratorAndSliceIter:
+ ExactSizeIterator<Item = bool> + DoubleEndedIterator<Item = bool>
+ {
+ }
+ impl<T: ?Sized + ExactSizeIterator<Item = bool> + DoubleEndedIterator<Item
= bool>>
+ SharedBetweenBitIteratorAndSliceIter for T
+ {
+ }
+
+ fn get_bit_iterator_cases() -> impl Iterator<Item = (BooleanBuffer,
Vec<bool>)> {
+ let mut rng = StdRng::seed_from_u64(42);
+
+ [0, 1, 6, 8, 100, 164]
+ .map(|len| {
+ let source = (0..len).map(|_|
rng.random_bool(0.5)).collect::<Vec<_>>();
+
+ (BooleanBuffer::from(source.as_slice()), source)
+ })
+ .into_iter()
+ }
+
+ fn setup_and_assert(
+ setup_iters: impl Fn(&mut dyn SharedBetweenBitIteratorAndSliceIter),
+ assert_fn: impl Fn(BitIterator, Copied<Iter<bool>>),
+ ) {
+ for (boolean_buffer, source) in get_bit_iterator_cases() {
+ // Not using `boolean_buffer.iter()` in case the implementation
change to not call BitIterator internally
+ // in which case the test would not test what it intends to test
+ let mut actual = BitIterator::new(boolean_buffer.values(), 0,
boolean_buffer.len());
+ let mut expected = source.iter().copied();
+
+ setup_iters(&mut actual);
+ setup_iters(&mut expected);
+
+ assert_fn(actual, expected);
+ }
+ }
+
+ /// Trait representing an operation on a BitIterator
+ /// that can be compared against a slice iterator
+ trait BitIteratorOp {
+ /// What the operation returns (e.g. Option<bool> for last/max, usize
for count, etc)
+ type Output: PartialEq + Debug;
+
+ /// The name of the operation, used for error messages
+ const NAME: &'static str;
+
+ /// Get the value of the operation for the provided iterator
+ /// This will be either a BitIterator or a slice iterator to make sure
they produce the same result
+ fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(iter: T) ->
Self::Output;
+ }
+
+ /// Helper function that will assert that the provided operation
+ /// produces the same result for both BitIterator and slice iterator
+ /// under various consumption patterns (e.g. some calls to
next/next_back/consume_all/etc)
+ fn assert_bit_iterator_cases<O: BitIteratorOp>() {
+ setup_and_assert(
+ |_iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {},
+ |actual, expected| {
+ let current_iterator_values: Vec<bool> =
expected.clone().collect();
+ assert_eq!(
+ O::get_value(actual),
+ O::get_value(expected),
+ "Failed on op {} for new iter (left actual, right
expected) ({current_iterator_values:?})",
+ O::NAME
+ );
+ },
+ );
+
+ setup_and_assert(
+ |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+ iter.next();
+ },
+ |actual, expected| {
+ let current_iterator_values: Vec<bool> =
expected.clone().collect();
+
+ assert_eq!(
+ O::get_value(actual),
+ O::get_value(expected),
+ "Failed on op {} for new iter after consuming 1 element
from the start (left actual, right expected) ({current_iterator_values:?})",
+ O::NAME
+ );
+ },
+ );
+
+ setup_and_assert(
+ |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+ iter.next_back();
+ },
+ |actual, expected| {
+ let current_iterator_values: Vec<bool> =
expected.clone().collect();
+
+ assert_eq!(
+ O::get_value(actual),
+ O::get_value(expected),
+ "Failed on op {} for new iter after consuming 1 element
from the end (left actual, right expected) ({current_iterator_values:?})",
+ O::NAME
+ );
+ },
+ );
+
+ setup_and_assert(
+ |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+ iter.next();
+ iter.next_back();
+ },
+ |actual, expected| {
+ let current_iterator_values: Vec<bool> =
expected.clone().collect();
+
+ assert_eq!(
+ O::get_value(actual),
+ O::get_value(expected),
+ "Failed on op {} for new iter after consuming 1 element
from start and end (left actual, right expected) ({current_iterator_values:?})",
+ O::NAME
+ );
+ },
+ );
+
+ setup_and_assert(
+ |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+ while iter.len() > 1 {
+ iter.next();
+ }
+ },
+ |actual, expected| {
+ let current_iterator_values: Vec<bool> =
expected.clone().collect();
+
+ assert_eq!(
+ O::get_value(actual),
+ O::get_value(expected),
+ "Failed on op {} for new iter after consuming all from the
start but 1 (left actual, right expected) ({current_iterator_values:?})",
+ O::NAME
+ );
+ },
+ );
+
+ setup_and_assert(
+ |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+ while iter.len() > 1 {
+ iter.next_back();
+ }
+ },
+ |actual, expected| {
+ let current_iterator_values: Vec<bool> =
expected.clone().collect();
+
+ assert_eq!(
+ O::get_value(actual),
+ O::get_value(expected),
+ "Failed on op {} for new iter after consuming all from the
end but 1 (left actual, right expected) ({current_iterator_values:?})",
+ O::NAME
+ );
+ },
+ );
+
+ setup_and_assert(
+ |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+ while iter.next().is_some() {}
+ },
+ |actual, expected| {
+ let current_iterator_values: Vec<bool> =
expected.clone().collect();
+
+ assert_eq!(
+ O::get_value(actual),
+ O::get_value(expected),
+ "Failed on op {} for new iter after consuming all from the
start (left actual, right expected) ({current_iterator_values:?})",
+ O::NAME
+ );
+ },
+ );
+
+ setup_and_assert(
+ |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+ while iter.next_back().is_some() {}
+ },
+ |actual, expected| {
+ let current_iterator_values: Vec<bool> =
expected.clone().collect();
+
+ assert_eq!(
+ O::get_value(actual),
+ O::get_value(expected),
+ "Failed on op {} for new iter after consuming all from the
end (left actual, right expected) ({current_iterator_values:?})",
+ O::NAME
+ );
+ },
+ );
+ }
+
+ #[test]
+ fn assert_bit_iterator_count() {
+ struct CountOp;
+
+ impl BitIteratorOp for CountOp {
+ type Output = usize;
+ const NAME: &'static str = "count";
+
+ fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(iter: T) ->
Self::Output {
+ iter.count()
+ }
+ }
+
+ assert_bit_iterator_cases::<CountOp>()
+ }
+
+ #[test]
+ fn assert_bit_iterator_last() {
+ struct LastOp;
+
+ impl BitIteratorOp for LastOp {
+ type Output = Option<bool>;
+ const NAME: &'static str = "last";
+
+ fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(iter: T) ->
Self::Output {
+ iter.last()
+ }
+ }
+
+ assert_bit_iterator_cases::<LastOp>()
+ }
+
+ #[test]
+ fn assert_bit_iterator_max() {
+ struct MaxOp;
+
+ impl BitIteratorOp for MaxOp {
+ type Output = Option<bool>;
+ const NAME: &'static str = "max";
+
+ fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(iter: T) ->
Self::Output {
+ iter.max()
+ }
+ }
+
+ assert_bit_iterator_cases::<MaxOp>()
+ }
+
+ #[test]
+ fn assert_bit_iterator_nth_0() {
+ struct NthOp<const BACK: bool>;
+
+ impl<const BACK: bool> BitIteratorOp for NthOp<BACK> {
+ type Output = Option<bool>;
+ const NAME: &'static str = if BACK { "nth_back(0)" } else {
"nth(0)" };
+
+ fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(mut iter: T)
-> Self::Output {
+ if BACK { iter.nth_back(0) } else { iter.nth(0) }
+ }
+ }
+
+ assert_bit_iterator_cases::<NthOp<false>>();
+ assert_bit_iterator_cases::<NthOp<true>>();
+ }
+
+ #[test]
+ fn assert_bit_iterator_nth_1() {
+ struct NthOp<const BACK: bool>;
+
+ impl<const BACK: bool> BitIteratorOp for NthOp<BACK> {
+ type Output = Option<bool>;
+ const NAME: &'static str = if BACK { "nth_back(1)" } else {
"nth(1)" };
+
+ fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(mut iter: T)
-> Self::Output {
+ if BACK { iter.nth_back(1) } else { iter.nth(1) }
+ }
+ }
+
+ assert_bit_iterator_cases::<NthOp<false>>();
+ assert_bit_iterator_cases::<NthOp<true>>();
+ }
+
+ #[test]
+ fn assert_bit_iterator_nth_after_end() {
+ struct NthOp<const BACK: bool>;
+
+ impl<const BACK: bool> BitIteratorOp for NthOp<BACK> {
+ type Output = Option<bool>;
+ const NAME: &'static str = if BACK {
+ "nth_back(iter.len() + 1)"
+ } else {
+ "nth(iter.len() + 1)"
+ };
+
+ fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(mut iter: T)
-> Self::Output {
+ if BACK {
+ iter.nth_back(iter.len() + 1)
+ } else {
+ iter.nth(iter.len() + 1)
+ }
+ }
+ }
+
+ assert_bit_iterator_cases::<NthOp<false>>();
+ assert_bit_iterator_cases::<NthOp<true>>();
+ }
+
+ #[test]
+ fn assert_bit_iterator_nth_len() {
+ struct NthOp<const BACK: bool>;
+
+ impl<const BACK: bool> BitIteratorOp for NthOp<BACK> {
+ type Output = Option<bool>;
+ const NAME: &'static str = if BACK {
+ "nth_back(iter.len())"
+ } else {
+ "nth(iter.len())"
+ };
+
+ fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(mut iter: T)
-> Self::Output {
+ if BACK {
+ iter.nth_back(iter.len())
+ } else {
+ iter.nth(iter.len())
+ }
+ }
+ }
+
+ assert_bit_iterator_cases::<NthOp<false>>();
+ assert_bit_iterator_cases::<NthOp<true>>();
+ }
+
+ #[test]
+ fn assert_bit_iterator_nth_last() {
+ struct NthOp<const BACK: bool>;
+
+ impl<const BACK: bool> BitIteratorOp for NthOp<BACK> {
+ type Output = Option<bool>;
+ const NAME: &'static str = if BACK {
+ "nth_back(iter.len().saturating_sub(1))"
+ } else {
+ "nth(iter.len().saturating_sub(1))"
+ };
+
+ fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(mut iter: T)
-> Self::Output {
+ if BACK {
+ iter.nth_back(iter.len().saturating_sub(1))
+ } else {
+ iter.nth(iter.len().saturating_sub(1))
+ }
+ }
+ }
+
+ assert_bit_iterator_cases::<NthOp<false>>();
+ assert_bit_iterator_cases::<NthOp<true>>();
+ }
+
+ #[test]
+ fn assert_bit_iterator_nth_and_reuse() {
+ setup_and_assert(
+ |_| {},
+ |actual, expected| {
+ {
+ let mut actual = actual.clone();
+ let mut expected = expected.clone();
+ for _ in 0..expected.len() {
+ #[allow(clippy::iter_nth_zero)]
+ let actual_val = actual.nth(0);
+ #[allow(clippy::iter_nth_zero)]
+ let expected_val = expected.nth(0);
+ assert_eq!(actual_val, expected_val, "Failed on
nth(0)");
+ }
+ }
+
+ {
+ let mut actual = actual.clone();
+ let mut expected = expected.clone();
+ for _ in 0..expected.len() {
+ let actual_val = actual.nth(1);
+ let expected_val = expected.nth(1);
+ assert_eq!(actual_val, expected_val, "Failed on
nth(1)");
+ }
+ }
+
+ {
+ let mut actual = actual.clone();
+ let mut expected = expected.clone();
+ for _ in 0..expected.len() {
+ let actual_val = actual.nth(2);
+ let expected_val = expected.nth(2);
+ assert_eq!(actual_val, expected_val, "Failed on
nth(2)");
+ }
+ }
+ },
+ );
+ }
+
+ #[test]
+ fn assert_bit_iterator_nth_back_and_reuse() {
+ setup_and_assert(
+ |_| {},
+ |actual, expected| {
+ {
+ let mut actual = actual.clone();
+ let mut expected = expected.clone();
+ for _ in 0..expected.len() {
+ #[allow(clippy::iter_nth_zero)]
+ let actual_val = actual.nth_back(0);
+ let expected_val = expected.nth_back(0);
+ assert_eq!(actual_val, expected_val, "Failed on
nth_back(0)");
+ }
+ }
+
+ {
+ let mut actual = actual.clone();
+ let mut expected = expected.clone();
+ for _ in 0..expected.len() {
+ let actual_val = actual.nth_back(1);
+ let expected_val = expected.nth_back(1);
+ assert_eq!(actual_val, expected_val, "Failed on
nth_back(1)");
+ }
+ }
+
+ {
+ let mut actual = actual.clone();
+ let mut expected = expected.clone();
+ for _ in 0..expected.len() {
+ let actual_val = actual.nth_back(2);
+ let expected_val = expected.nth_back(2);
+ assert_eq!(actual_val, expected_val, "Failed on
nth_back(2)");
+ }
+ }
+ },
+ );
+ }
}