haohuaijin commented on code in PR #10141:
URL: https://github.com/apache/arrow-rs/pull/10141#discussion_r3465724642


##########
parquet/src/arrow/arrow_reader/selection.rs:
##########
@@ -135,12 +143,439 @@ impl RowSelector {
 /// * Consecutive [`RowSelector`]s alternate skipping or selecting rows
 ///
 /// [`PageIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData
-#[derive(Debug, Clone, Default, Eq, PartialEq)]
+#[derive(Default, Clone)]
 pub struct RowSelection {
-    selectors: Vec<RowSelector>,
+    inner: RowSelectionInner,
+}
+
+/// Internal storage for [`RowSelection`].
+#[derive(Debug, Clone)]
+pub(crate) enum RowSelectionInner {
+    Selectors(Vec<RowSelector>),
+    Mask(BooleanBuffer),
+}
+
+impl Default for RowSelectionInner {
+    fn default() -> Self {
+        Self::Selectors(Vec::new())
+    }
+}
+
+impl std::fmt::Debug for RowSelection {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match &self.inner {
+            RowSelectionInner::Selectors(s) => f
+                .debug_struct("RowSelection")
+                .field("selectors", s)
+                .finish(),
+            RowSelectionInner::Mask(m) => f
+                .debug_struct("RowSelection")
+                .field("mask_len", &m.len())
+                .finish_non_exhaustive(),
+        }
+    }
+}
+
+impl PartialEq for RowSelection {
+    fn eq(&self, other: &Self) -> bool {
+        match (&self.inner, &other.inner) {
+            (RowSelectionInner::Selectors(a), RowSelectionInner::Selectors(b)) 
=> a == b,
+            (RowSelectionInner::Mask(a), RowSelectionInner::Mask(b)) => a == b,
+            (RowSelectionInner::Mask(mask), 
RowSelectionInner::Selectors(selectors))
+            | (RowSelectionInner::Selectors(selectors), 
RowSelectionInner::Mask(mask)) => {
+                if selectors
+                    .iter()
+                    .try_fold(0usize, |acc, selector| 
acc.checked_add(selector.row_count))
+                    != Some(mask.len())
+                {
+                    return false;
+                }
+
+                let mut slices = mask.set_slices().peekable();
+                let mut cursor = 0usize;
+
+                for selector in selectors {
+                    let end = cursor + selector.row_count;
+
+                    if selector.skip {
+                        if slices.peek().is_some_and(|(start, _)| *start < 
end) {
+                            return false;
+                        }
+                    } else {
+                        match slices.next() {
+                            Some((start, slice_end)) if start == cursor && 
slice_end == end => {}
+                            _ => return false,
+                        }
+                    }
+
+                    cursor = end;
+                }
+
+                slices.next().is_none()
+            }
+        }
+    }
+}
+
+impl Eq for RowSelection {}
+
+/// Iterator over the [`RowSelector`]s of a [`RowSelection`].
+///
+/// Yields owned [`RowSelector`] values; mask-backed selections stream the
+/// run-length form via [`BitSliceIterator`] without allocation.
+#[derive(Debug)]
+pub enum RowSelectionIter<'a> {
+    Selectors(std::iter::Copied<std::slice::Iter<'a, RowSelector>>),
+    Mask(MaskRunIter<'a>),
+}
+
+impl Iterator for RowSelectionIter<'_> {
+    type Item = RowSelector;
+
+    #[inline]
+    fn next(&mut self) -> Option<RowSelector> {
+        match self {
+            Self::Selectors(it) => it.next(),
+            Self::Mask(it) => it.next(),
+        }
+    }
+}
+
+/// Streaming RLE view of a [`BooleanBuffer`].
+#[derive(Debug)]
+pub struct MaskRunIter<'a> {
+    slices: BitSliceIterator<'a>,
+    cursor: usize,
+    total: usize,
+    pending: Option<RowSelector>,
+    finished: bool,
+}
+
+impl<'a> MaskRunIter<'a> {
+    fn new(mask: &'a BooleanBuffer) -> Self {
+        Self {
+            slices: mask.set_slices(),
+            cursor: 0,
+            total: mask.len(),
+            pending: None,
+            finished: false,
+        }
+    }
+}
+
+impl Iterator for MaskRunIter<'_> {
+    type Item = RowSelector;
+
+    fn next(&mut self) -> Option<RowSelector> {

Review Comment:
   I updated `RowSelection::iter()` to keep returning `Iterator<Item = 
&RowSelector>`.
   - For selector-backed selections this still borrows from the internal 
`Vec<RowSelector>`. 
   - For mask-backed selections I added a lazy selector cache, because the 
returned `&RowSelector` values need stable storage inside `RowSelection`; 
otherwise they would point into a temporary `Vec`.
   
   The internal hot paths still avoid this cache and use the `BooleanBuffer` or 
`MaskRunIter` directly. 
   
   
   the struct is below
   
   ```rust
   
   pub struct RowSelection {
       inner: RowSelectionInner,
   }
   
   pub(crate) enum RowSelectionInner {
       Selectors(Vec<RowSelector>),
       Mask(Box<MaskSelection>),
   }
   
   pub(crate) struct MaskSelection {
       mask: BooleanBuffer,
       selectors: OnceLock<Vec<RowSelector>>,
   }
   ```
   
   I am not fully sure this is the best approach, so suggestions and feedback 
are welcome.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to