This is an automated email from the ASF dual-hosted git repository.

mneumann pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new b711f23a1 feat(parquet): add union method to RowSelection (#6308)
b711f23a1 is described below

commit b711f23a136e0b094a70a4aafb020d4bb9f60619
Author: Scott Donnelly <[email protected]>
AuthorDate: Tue Aug 27 10:10:19 2024 +0100

    feat(parquet): add union method to RowSelection (#6308)
    
    Complements the existing RowSelection::intersection method.
    Useful for OR-ing row selections together, in contrast to
    intersection's use when AND-ing selections.
---
 parquet/src/arrow/arrow_reader/selection.rs | 132 ++++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)

diff --git a/parquet/src/arrow/arrow_reader/selection.rs 
b/parquet/src/arrow/arrow_reader/selection.rs
index 0287e5b42..ce3fbbf4f 100644
--- a/parquet/src/arrow/arrow_reader/selection.rs
+++ b/parquet/src/arrow/arrow_reader/selection.rs
@@ -343,6 +343,16 @@ impl RowSelection {
         intersect_row_selections(&self.selectors, &other.selectors)
     }
 
+    /// Compute the union of two [`RowSelection`] (rows selected by either input)
+    /// For example (`Y` = row selected, `N` = row skipped):
+    /// self:      NNYYYYNNYYNYN
+    /// other:     NYNNNNNNN
+    ///
+    /// returned:  NYYYYYNNYYNYN
+    pub fn union(&self, other: &Self) -> Self {
+        union_row_selections(&self.selectors, &other.selectors)
+    }
+
     /// Returns `true` if this [`RowSelection`] selects any rows
     pub fn selects_any(&self) -> bool {
         self.selectors.iter().any(|x| !x.skip)
@@ -536,6 +546,92 @@ fn intersect_row_selections(left: &[RowSelector], right: 
&[RowSelector]) -> RowS
     iter.collect()
 }
 
+/// Combine two lists of [`RowSelector`] and return the union of them
+/// For example (`Y` = row selected, `N` = row skipped):
+/// left:      NNYYYYNNYYNYN
+/// right:     NYNNNNNNY
+///
+/// returned:  NYYYYYNNYYNYN
+///
+/// This is the implementation behind `RowSelection::union`
+fn union_row_selections(left: &[RowSelector], right: &[RowSelector]) -> 
RowSelection {
+    let mut l_iter = left.iter().copied().peekable();
+    let mut r_iter = right.iter().copied().peekable();
+
+    let iter = std::iter::from_fn(move || {
+        loop {
+            let l = l_iter.peek_mut();
+            let r = r_iter.peek_mut();
+
+            match (l, r) {
+                (Some(a), _) if a.row_count == 0 => { // left run fully consumed: drop it and retry
+                    l_iter.next().unwrap();
+                }
+                (_, Some(b)) if b.row_count == 0 => { // right run fully consumed: drop it and retry
+                    r_iter.next().unwrap();
+                }
+                (Some(l), Some(r)) => {
+                    return match (l.skip, r.skip) {
+                        // Both skip: emit a skip for the shorter run, shrink the longer one
+                        (true, true) => {
+                            if l.row_count < r.row_count {
+                                let skip = l.row_count;
+                                r.row_count -= l.row_count;
+                                l_iter.next();
+                                Some(RowSelector::skip(skip))
+                            } else {
+                                let skip = r.row_count;
+                                l.row_count -= skip;
+                                r_iter.next();
+                                Some(RowSelector::skip(skip))
+                            }
+                        }
+                        // Left selects, right skips: emit a select for the overlapping rows
+                        (false, true) => {
+                            if l.row_count < r.row_count {
+                                r.row_count -= l.row_count;
+                                l_iter.next()
+                            } else {
+                                let r_row_count = r.row_count;
+                                l.row_count -= r_row_count;
+                                r_iter.next();
+                                Some(RowSelector::select(r_row_count))
+                            }
+                        }
+                        // Left skips, right selects: emit a select for the overlapping rows
+                        (true, false) => {
+                            if l.row_count < r.row_count {
+                                let l_row_count = l.row_count;
+                                r.row_count -= l_row_count;
+                                l_iter.next();
+                                Some(RowSelector::select(l_row_count))
+                            } else {
+                                l.row_count -= r.row_count;
+                                r_iter.next()
+                            }
+                        }
+                        // Both select: emit the shorter select, shrink the longer one
+                        _ => {
+                            if l.row_count < r.row_count {
+                                r.row_count -= l.row_count;
+                                l_iter.next()
+                            } else {
+                                l.row_count -= r.row_count;
+                                r_iter.next()
+                            }
+                        }
+                    };
+                }
+                (Some(_), None) => return l_iter.next(), // right exhausted: emit rest of left as-is
+                (None, Some(_)) => return r_iter.next(), // left exhausted: emit rest of right as-is
+                (None, None) => return None,
+            }
+        }
+    });
+
+    iter.collect()
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -1213,4 +1309,40 @@ mod tests {
             ]
         );
     }
+
+    #[test]
+    fn test_union() {
+        let selection = RowSelection::from(vec![RowSelector::select(1048576)]);
+        let result = selection.union(&selection); // union with itself must be unchanged
+        assert_eq!(result, selection);
+
+        // NYNYY (one letter per 10 rows)
+        let a = RowSelection::from(vec![
+            RowSelector::skip(10),
+            RowSelector::select(10),
+            RowSelector::skip(10),
+            RowSelector::select(20),
+        ]);
+
+        // NNYYNYN (one letter per 10 rows)
+        let b = RowSelection::from(vec![
+            RowSelector::skip(20),
+            RowSelector::select(20),
+            RowSelector::skip(10),
+            RowSelector::select(10),
+            RowSelector::skip(10),
+        ]);
+
+        let result = a.union(&b);
+
+        // NYYYYYN (`b` is 20 rows longer than `a`; its tail is passed through)
+        assert_eq!(
+            result.iter().collect::<Vec<_>>(),
+            vec![
+                &RowSelector::skip(10),
+                &RowSelector::select(50),
+                &RowSelector::skip(10),
+            ]
+        );
+    }
 }

Reply via email to