This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 7f3d3aef30 perf: override `count`, `nth`, `nth_back`, `last` and `max` 
for BitIterator (#8696)
7f3d3aef30 is described below

commit 7f3d3aef30613b039939efec8780ab4eaa8cb98d
Author: Raz Luvaton <[email protected]>
AuthorDate: Mon Oct 27 20:21:28 2025 +0200

    perf: override `count`, `nth`, `nth_back`, `last` and `max` for BitIterator 
(#8696)
    
    # Which issue does this PR close?
    
    N/A
    
    # Rationale for this change
    
    overriding this function improve performance over the fallback
    implementation
    
    # What changes are included in this PR?
    
    Override implementation of:
    - `count` which is not optimized away even when `ExactSizeIterator` is
    implemented
    - `nth` to avoid calling `next` `n + 1` times (which is also used when
    doing `.skip`)
    - `nth_back`
    - `last`
    - `max`
    
    # Are these changes tested?
    
    Yes, I've added a lot of tests
    
    # Are there any user-facing changes?
    
    Nope
---
 arrow-buffer/src/util/bit_iterator.rs | 516 ++++++++++++++++++++++++++++++++++
 1 file changed, 516 insertions(+)

diff --git a/arrow-buffer/src/util/bit_iterator.rs 
b/arrow-buffer/src/util/bit_iterator.rs
index c7f6f94fb8..0aa94a5d4d 100644
--- a/arrow-buffer/src/util/bit_iterator.rs
+++ b/arrow-buffer/src/util/bit_iterator.rs
@@ -23,6 +23,7 @@ use crate::bit_util::{ceil, get_bit_raw};
 /// Iterator over the bits within a packed bitmask
 ///
 /// To efficiently iterate over just the set bits see [`BitIndexIterator`] and 
[`BitSliceIterator`]
+#[derive(Clone)]
 pub struct BitIterator<'a> {
     buffer: &'a [u8],
     current_offset: usize,
@@ -71,6 +72,71 @@ impl Iterator for BitIterator<'_> {
         let remaining_bits = self.end_offset - self.current_offset;
         (remaining_bits, Some(remaining_bits))
     }
+
+    fn count(self) -> usize
+    where
+        Self: Sized,
+    {
+        self.len()
+    }
+
+    fn nth(&mut self, n: usize) -> Option<Self::Item> {
+        // Check if we can advance to the desired offset.
+        // When n is 0 it means we want the next() value
+        // and when n is 1 we want the next().next() value
+        // so adding n to the current offset and not n - 1
+        match self.current_offset.checked_add(n) {
+            // Yes, and still within bounds
+            Some(new_offset) if new_offset < self.end_offset => {
+                self.current_offset = new_offset;
+            }
+
+            // Either overflow or would exceed end_offset
+            _ => {
+                self.current_offset = self.end_offset;
+                return None;
+            }
+        }
+
+        self.next()
+    }
+
+    fn last(mut self) -> Option<Self::Item> {
+        // If already at the end, return None
+        if self.current_offset == self.end_offset {
+            return None;
+        }
+
+        // Go to the one before the last bit
+        self.current_offset = self.end_offset - 1;
+
+        // Return the last bit
+        self.next()
+    }
+
+    fn max(self) -> Option<Self::Item>
+    where
+        Self: Sized,
+        Self::Item: Ord,
+    {
+        if self.current_offset == self.end_offset {
+            return None;
+        }
+
+        // true is greater than false so we only need to check if there's any 
true bit
+        let mut bit_index_iter = BitIndexIterator::new(
+            self.buffer,
+            self.current_offset,
+            self.end_offset - self.current_offset,
+        );
+
+        if bit_index_iter.next().is_some() {
+            return Some(true);
+        }
+
+        // We know the iterator is not empty and there are no set bits so 
false is the max
+        Some(false)
+    }
 }
 
 impl ExactSizeIterator for BitIterator<'_> {}
@@ -86,6 +152,27 @@ impl DoubleEndedIterator for BitIterator<'_> {
         let v = unsafe { get_bit_raw(self.buffer.as_ptr(), self.end_offset) };
         Some(v)
     }
+
+    fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
+        // Check if we can advance to the desired offset.
+        // When n is 0 it means we want the next_back() value
+        // and when n is 1 we want the next_back().next_back() value
+        // so subtracting n to the current offset and not n - 1
+        match self.end_offset.checked_sub(n) {
+            // Yes, and still within bounds
+            Some(new_offset) if self.current_offset < new_offset => {
+                self.end_offset = new_offset;
+            }
+
+            // Either underflow or would exceed current_offset
+            _ => {
+                self.current_offset = self.end_offset;
+                return None;
+            }
+        }
+
+        self.next_back()
+    }
 }
 
 /// Iterator of contiguous ranges of set bits within a provided packed bitmask
@@ -327,6 +414,12 @@ pub fn try_for_each_valid_idx<E, F: FnMut(usize) -> 
Result<(), E>>(
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::BooleanBuffer;
+    use rand::rngs::StdRng;
+    use rand::{Rng, SeedableRng};
+    use std::fmt::Debug;
+    use std::iter::Copied;
+    use std::slice::Iter;
 
     #[test]
     fn test_bit_iterator_size_hint() {
@@ -486,4 +579,427 @@ mod tests {
             .collect();
         assert_eq!(result, expected);
     }
+
+    trait SharedBetweenBitIteratorAndSliceIter:
+        ExactSizeIterator<Item = bool> + DoubleEndedIterator<Item = bool>
+    {
+    }
+    impl<T: ?Sized + ExactSizeIterator<Item = bool> + DoubleEndedIterator<Item 
= bool>>
+        SharedBetweenBitIteratorAndSliceIter for T
+    {
+    }
+
+    fn get_bit_iterator_cases() -> impl Iterator<Item = (BooleanBuffer, 
Vec<bool>)> {
+        let mut rng = StdRng::seed_from_u64(42);
+
+        [0, 1, 6, 8, 100, 164]
+            .map(|len| {
+                let source = (0..len).map(|_| 
rng.random_bool(0.5)).collect::<Vec<_>>();
+
+                (BooleanBuffer::from(source.as_slice()), source)
+            })
+            .into_iter()
+    }
+
+    fn setup_and_assert(
+        setup_iters: impl Fn(&mut dyn SharedBetweenBitIteratorAndSliceIter),
+        assert_fn: impl Fn(BitIterator, Copied<Iter<bool>>),
+    ) {
+        for (boolean_buffer, source) in get_bit_iterator_cases() {
+            // Not using `boolean_buffer.iter()` in case the implementation 
change to not call BitIterator internally
+            // in which case the test would not test what it intends to test
+            let mut actual = BitIterator::new(boolean_buffer.values(), 0, 
boolean_buffer.len());
+            let mut expected = source.iter().copied();
+
+            setup_iters(&mut actual);
+            setup_iters(&mut expected);
+
+            assert_fn(actual, expected);
+        }
+    }
+
+    /// Trait representing an operation on a BitIterator
+    /// that can be compared against a slice iterator
+    trait BitIteratorOp {
+        /// What the operation returns (e.g. Option<bool> for last/max, usize 
for count, etc)
+        type Output: PartialEq + Debug;
+
+        /// The name of the operation, used for error messages
+        const NAME: &'static str;
+
+        /// Get the value of the operation for the provided iterator
+        /// This will be either a BitIterator or a slice iterator to make sure 
they produce the same result
+        fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(iter: T) -> 
Self::Output;
+    }
+
+    /// Helper function that will assert that the provided operation
+    /// produces the same result for both BitIterator and slice iterator
+    /// under various consumption patterns (e.g. some calls to 
next/next_back/consume_all/etc)
+    fn assert_bit_iterator_cases<O: BitIteratorOp>() {
+        setup_and_assert(
+            |_iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {},
+            |actual, expected| {
+                let current_iterator_values: Vec<bool> = 
expected.clone().collect();
+                assert_eq!(
+                    O::get_value(actual),
+                    O::get_value(expected),
+                    "Failed on op {} for new iter (left actual, right 
expected) ({current_iterator_values:?})",
+                    O::NAME
+                );
+            },
+        );
+
+        setup_and_assert(
+            |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+                iter.next();
+            },
+            |actual, expected| {
+                let current_iterator_values: Vec<bool> = 
expected.clone().collect();
+
+                assert_eq!(
+                    O::get_value(actual),
+                    O::get_value(expected),
+                    "Failed on op {} for new iter after consuming 1 element 
from the start (left actual, right expected) ({current_iterator_values:?})",
+                    O::NAME
+                );
+            },
+        );
+
+        setup_and_assert(
+            |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+                iter.next_back();
+            },
+            |actual, expected| {
+                let current_iterator_values: Vec<bool> = 
expected.clone().collect();
+
+                assert_eq!(
+                    O::get_value(actual),
+                    O::get_value(expected),
+                    "Failed on op {} for new iter after consuming 1 element 
from the end (left actual, right expected) ({current_iterator_values:?})",
+                    O::NAME
+                );
+            },
+        );
+
+        setup_and_assert(
+            |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+                iter.next();
+                iter.next_back();
+            },
+            |actual, expected| {
+                let current_iterator_values: Vec<bool> = 
expected.clone().collect();
+
+                assert_eq!(
+                    O::get_value(actual),
+                    O::get_value(expected),
+                    "Failed on op {} for new iter after consuming 1 element 
from start and end (left actual, right expected) ({current_iterator_values:?})",
+                    O::NAME
+                );
+            },
+        );
+
+        setup_and_assert(
+            |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+                while iter.len() > 1 {
+                    iter.next();
+                }
+            },
+            |actual, expected| {
+                let current_iterator_values: Vec<bool> = 
expected.clone().collect();
+
+                assert_eq!(
+                    O::get_value(actual),
+                    O::get_value(expected),
+                    "Failed on op {} for new iter after consuming all from the 
start but 1 (left actual, right expected) ({current_iterator_values:?})",
+                    O::NAME
+                );
+            },
+        );
+
+        setup_and_assert(
+            |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+                while iter.len() > 1 {
+                    iter.next_back();
+                }
+            },
+            |actual, expected| {
+                let current_iterator_values: Vec<bool> = 
expected.clone().collect();
+
+                assert_eq!(
+                    O::get_value(actual),
+                    O::get_value(expected),
+                    "Failed on op {} for new iter after consuming all from the 
end but 1 (left actual, right expected) ({current_iterator_values:?})",
+                    O::NAME
+                );
+            },
+        );
+
+        setup_and_assert(
+            |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+                while iter.next().is_some() {}
+            },
+            |actual, expected| {
+                let current_iterator_values: Vec<bool> = 
expected.clone().collect();
+
+                assert_eq!(
+                    O::get_value(actual),
+                    O::get_value(expected),
+                    "Failed on op {} for new iter after consuming all from the 
start (left actual, right expected) ({current_iterator_values:?})",
+                    O::NAME
+                );
+            },
+        );
+
+        setup_and_assert(
+            |iter: &mut dyn SharedBetweenBitIteratorAndSliceIter| {
+                while iter.next_back().is_some() {}
+            },
+            |actual, expected| {
+                let current_iterator_values: Vec<bool> = 
expected.clone().collect();
+
+                assert_eq!(
+                    O::get_value(actual),
+                    O::get_value(expected),
+                    "Failed on op {} for new iter after consuming all from the 
end (left actual, right expected) ({current_iterator_values:?})",
+                    O::NAME
+                );
+            },
+        );
+    }
+
+    #[test]
+    fn assert_bit_iterator_count() {
+        struct CountOp;
+
+        impl BitIteratorOp for CountOp {
+            type Output = usize;
+            const NAME: &'static str = "count";
+
+            fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(iter: T) -> 
Self::Output {
+                iter.count()
+            }
+        }
+
+        assert_bit_iterator_cases::<CountOp>()
+    }
+
+    #[test]
+    fn assert_bit_iterator_last() {
+        struct LastOp;
+
+        impl BitIteratorOp for LastOp {
+            type Output = Option<bool>;
+            const NAME: &'static str = "last";
+
+            fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(iter: T) -> 
Self::Output {
+                iter.last()
+            }
+        }
+
+        assert_bit_iterator_cases::<LastOp>()
+    }
+
+    #[test]
+    fn assert_bit_iterator_max() {
+        struct MaxOp;
+
+        impl BitIteratorOp for MaxOp {
+            type Output = Option<bool>;
+            const NAME: &'static str = "max";
+
+            fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(iter: T) -> 
Self::Output {
+                iter.max()
+            }
+        }
+
+        assert_bit_iterator_cases::<MaxOp>()
+    }
+
+    #[test]
+    fn assert_bit_iterator_nth_0() {
+        struct NthOp<const BACK: bool>;
+
+        impl<const BACK: bool> BitIteratorOp for NthOp<BACK> {
+            type Output = Option<bool>;
+            const NAME: &'static str = if BACK { "nth_back(0)" } else { 
"nth(0)" };
+
+            fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(mut iter: T) 
-> Self::Output {
+                if BACK { iter.nth_back(0) } else { iter.nth(0) }
+            }
+        }
+
+        assert_bit_iterator_cases::<NthOp<false>>();
+        assert_bit_iterator_cases::<NthOp<true>>();
+    }
+
+    #[test]
+    fn assert_bit_iterator_nth_1() {
+        struct NthOp<const BACK: bool>;
+
+        impl<const BACK: bool> BitIteratorOp for NthOp<BACK> {
+            type Output = Option<bool>;
+            const NAME: &'static str = if BACK { "nth_back(1)" } else { 
"nth(1)" };
+
+            fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(mut iter: T) 
-> Self::Output {
+                if BACK { iter.nth_back(1) } else { iter.nth(1) }
+            }
+        }
+
+        assert_bit_iterator_cases::<NthOp<false>>();
+        assert_bit_iterator_cases::<NthOp<true>>();
+    }
+
+    #[test]
+    fn assert_bit_iterator_nth_after_end() {
+        struct NthOp<const BACK: bool>;
+
+        impl<const BACK: bool> BitIteratorOp for NthOp<BACK> {
+            type Output = Option<bool>;
+            const NAME: &'static str = if BACK {
+                "nth_back(iter.len() + 1)"
+            } else {
+                "nth(iter.len() + 1)"
+            };
+
+            fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(mut iter: T) 
-> Self::Output {
+                if BACK {
+                    iter.nth_back(iter.len() + 1)
+                } else {
+                    iter.nth(iter.len() + 1)
+                }
+            }
+        }
+
+        assert_bit_iterator_cases::<NthOp<false>>();
+        assert_bit_iterator_cases::<NthOp<true>>();
+    }
+
+    #[test]
+    fn assert_bit_iterator_nth_len() {
+        struct NthOp<const BACK: bool>;
+
+        impl<const BACK: bool> BitIteratorOp for NthOp<BACK> {
+            type Output = Option<bool>;
+            const NAME: &'static str = if BACK {
+                "nth_back(iter.len())"
+            } else {
+                "nth(iter.len())"
+            };
+
+            fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(mut iter: T) 
-> Self::Output {
+                if BACK {
+                    iter.nth_back(iter.len())
+                } else {
+                    iter.nth(iter.len())
+                }
+            }
+        }
+
+        assert_bit_iterator_cases::<NthOp<false>>();
+        assert_bit_iterator_cases::<NthOp<true>>();
+    }
+
+    #[test]
+    fn assert_bit_iterator_nth_last() {
+        struct NthOp<const BACK: bool>;
+
+        impl<const BACK: bool> BitIteratorOp for NthOp<BACK> {
+            type Output = Option<bool>;
+            const NAME: &'static str = if BACK {
+                "nth_back(iter.len().saturating_sub(1))"
+            } else {
+                "nth(iter.len().saturating_sub(1))"
+            };
+
+            fn get_value<T: SharedBetweenBitIteratorAndSliceIter>(mut iter: T) 
-> Self::Output {
+                if BACK {
+                    iter.nth_back(iter.len().saturating_sub(1))
+                } else {
+                    iter.nth(iter.len().saturating_sub(1))
+                }
+            }
+        }
+
+        assert_bit_iterator_cases::<NthOp<false>>();
+        assert_bit_iterator_cases::<NthOp<true>>();
+    }
+
+    #[test]
+    fn assert_bit_iterator_nth_and_reuse() {
+        setup_and_assert(
+            |_| {},
+            |actual, expected| {
+                {
+                    let mut actual = actual.clone();
+                    let mut expected = expected.clone();
+                    for _ in 0..expected.len() {
+                        #[allow(clippy::iter_nth_zero)]
+                        let actual_val = actual.nth(0);
+                        #[allow(clippy::iter_nth_zero)]
+                        let expected_val = expected.nth(0);
+                        assert_eq!(actual_val, expected_val, "Failed on 
nth(0)");
+                    }
+                }
+
+                {
+                    let mut actual = actual.clone();
+                    let mut expected = expected.clone();
+                    for _ in 0..expected.len() {
+                        let actual_val = actual.nth(1);
+                        let expected_val = expected.nth(1);
+                        assert_eq!(actual_val, expected_val, "Failed on 
nth(1)");
+                    }
+                }
+
+                {
+                    let mut actual = actual.clone();
+                    let mut expected = expected.clone();
+                    for _ in 0..expected.len() {
+                        let actual_val = actual.nth(2);
+                        let expected_val = expected.nth(2);
+                        assert_eq!(actual_val, expected_val, "Failed on 
nth(2)");
+                    }
+                }
+            },
+        );
+    }
+
+    #[test]
+    fn assert_bit_iterator_nth_back_and_reuse() {
+        setup_and_assert(
+            |_| {},
+            |actual, expected| {
+                {
+                    let mut actual = actual.clone();
+                    let mut expected = expected.clone();
+                    for _ in 0..expected.len() {
+                        #[allow(clippy::iter_nth_zero)]
+                        let actual_val = actual.nth_back(0);
+                        let expected_val = expected.nth_back(0);
+                        assert_eq!(actual_val, expected_val, "Failed on 
nth_back(0)");
+                    }
+                }
+
+                {
+                    let mut actual = actual.clone();
+                    let mut expected = expected.clone();
+                    for _ in 0..expected.len() {
+                        let actual_val = actual.nth_back(1);
+                        let expected_val = expected.nth_back(1);
+                        assert_eq!(actual_val, expected_val, "Failed on 
nth_back(1)");
+                    }
+                }
+
+                {
+                    let mut actual = actual.clone();
+                    let mut expected = expected.clone();
+                    for _ in 0..expected.len() {
+                        let actual_val = actual.nth_back(2);
+                        let expected_val = expected.nth_back(2);
+                        assert_eq!(actual_val, expected_val, "Failed on 
nth_back(2)");
+                    }
+                }
+            },
+        );
+    }
 }

Reply via email to