jorgecarleitao commented on a change in pull request #1248:
URL: https://github.com/apache/arrow-rs/pull/1248#discussion_r800191885



##########
File path: arrow/src/compute/kernels/filter.rs
##########
@@ -119,17 +155,83 @@ impl<'a> Iterator for SlicesIterator<'a> {
     }
 }
 
+/// An iterator of `usize` whose index in [`BooleanArray`] is true
+///
+/// This provides the best performance on all but the least selective 
predicates (which keep most
+/// / all rows), where the benefits of copying large runs instead favours 
[`SlicesIterator`]
+struct IndexIterator<'a> {
+    current_chunk: u64,
+    chunk_offset: i64,
+    remaining: usize,
+    iter: UnalignedBitChunkIterator<'a>,
+}
+
+impl<'a> IndexIterator<'a> {
+    fn new(filter: &'a BooleanArray, len: usize) -> Self {
+        assert_eq!(filter.null_count(), 0);
+        let data = filter.data();
+        let chunks =
+            UnalignedBitChunk::new(&data.buffers()[0], data.offset(), 
data.len());
+        let mut iter = chunks.iter();
+
+        let current_chunk = iter.next().unwrap_or(0);
+        let chunk_offset = -(chunks.lead_padding() as i64);
+
+        Self {
+            current_chunk,
+            chunk_offset,
+            remaining: len,
+            iter,
+        }
+    }
+}
+
+impl<'a> Iterator for IndexIterator<'a> {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        while self.remaining != 0 {
+            if self.current_chunk != 0 {
+                let bit_pos = self.current_chunk.trailing_zeros();
+                self.current_chunk ^= 1 << bit_pos;
+                self.remaining -= 1;
+                return Some((self.chunk_offset + bit_pos as i64) as usize);
+            }
+
+            // Must panic if exhausted early as trusted length iterator
+            self.current_chunk = self.iter.next().expect("IndexIterator 
exhausted early");
+            self.chunk_offset += 64;
+        }
+        None
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (self.remaining, Some(self.remaining))
+    }
+}
+
+/// Counts the number of set bits in `filter`
 fn filter_count(filter: &BooleanArray) -> usize {

Review comment:
       not sure why this was/here here, but isn't this equal to `filter.len() - 
filter.null_count()`?
   
   EDIT: ah, maybe this is not true because null_count does not take the 
offsets into account?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to