alamb commented on code in PR #5041:
URL: https://github.com/apache/arrow-rs/pull/5041#discussion_r1383355333


##########
parquet/src/arrow/arrow_reader/selection.rs:
##########
@@ -443,7 +417,36 @@ impl RowSelection {
 
 impl From<Vec<RowSelector>> for RowSelection {
     fn from(selectors: Vec<RowSelector>) -> Self {
-        Self::from_selectors_and_combine(selectors.as_slice())
+        selectors.into_iter().collect()
+    }
+}
+
+impl FromIterator<RowSelector> for RowSelection {
+    fn from_iter<T: IntoIterator<Item = RowSelector>>(iter: T) -> Self {
+        let iter = iter.into_iter();
+
+        // Capacity before filter
+        let mut selectors = Vec::with_capacity(iter.size_hint().0);
+
+        let mut filtered = iter.filter(|x| x.row_count != 0);
+        if let Some(x) = filtered.next() {
+            selectors.push(x);
+        }
+
+        for s in filtered {
+            if s.row_count == 0 {
+                continue;
+            }
+
+            let last = selectors.last_mut().unwrap();

Review Comment:
   ```suggestion
               // Combine adjacent skip/non-skip
               let last = selectors.last_mut().unwrap();
   ```



##########
parquet/src/arrow/arrow_reader/selection.rs:
##########
@@ -118,10 +123,13 @@ impl RowSelection {
         let mut last_end = 0;
         for range in ranges {
             let len = range.end - range.start;
+            if len == 0 {

Review Comment:
   Is this the actual bug fix? It also looks like `from_selectors_and_combine` 
didn't remove zero length `RowSelector`s



##########
parquet/src/arrow/arrow_reader/selection.rs:
##########
@@ -64,25 +64,30 @@ impl RowSelector {
 /// use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
 ///
 /// let selectors = vec![
-///   RowSelector { row_count: 5, skip: true },
-///   RowSelector { row_count: 5, skip: false },
-///   RowSelector { row_count: 5, skip: false },
-///   RowSelector { row_count: 5, skip: true },
+///     RowSelector::skip(5),
+///     RowSelector::select(5),
+///     RowSelector::select(5),
+///     RowSelector::skip(5),
 /// ];
 ///
 /// // Creating a selection will combine adjacent selectors
 /// let selection: RowSelection = selectors.into();
 ///
 /// let expected = vec![
-///   RowSelector { row_count: 5, skip: true },
-///   RowSelector { row_count: 10, skip: false },
-///   RowSelector { row_count: 5, skip: true },
+///     RowSelector::skip(5),
+///     RowSelector::select(10),
+///     RowSelector::skip(5),
 /// ];
 ///
 /// let actual: Vec<RowSelector> = selection.into();
 /// assert_eq!(actual, expected);
 /// ```
 ///
+/// A [`RowSelection`] maintains the following invariants:
+///
+/// * It contains no [`RowSelector`] of 0 rows
+/// * Consecutive [`RowSelector`] alternate whether they skip or select rows

Review Comment:
   ```suggestion
   /// * Consecutive [`RowSelector`]s alternate skipping or selecting rows.
   ```



##########
parquet/src/arrow/arrow_reader/selection.rs:
##########
@@ -465,64 +468,58 @@ impl From<RowSelection> for VecDeque<RowSelector> {
 /// other:     NYNNNNNNY
 ///
 /// returned:  NNNNNNNNYYNYN
-fn intersect_row_selections(left: &[RowSelector], right: &[RowSelector]) -> 
Vec<RowSelector> {
-    let mut res = Vec::with_capacity(left.len());
+fn intersect_row_selections(left: &[RowSelector], right: &[RowSelector]) -> 
RowSelection {
     let mut l_iter = left.iter().copied().peekable();
     let mut r_iter = right.iter().copied().peekable();
 
-    while let (Some(a), Some(b)) = (l_iter.peek_mut(), r_iter.peek_mut()) {
-        if a.row_count == 0 {
-            l_iter.next().unwrap();
-            continue;
-        }
-        if b.row_count == 0 {
-            r_iter.next().unwrap();
-            continue;
-        }
-        match (a.skip, b.skip) {
-            // Keep both ranges
-            (false, false) => {
-                if a.row_count < b.row_count {
-                    res.push(RowSelector::select(a.row_count));
-                    b.row_count -= a.row_count;
+    let iter = std::iter::from_fn(move || {
+        loop {
+            let l = l_iter.peek_mut();
+            let r = r_iter.peek_mut();
+
+            match (l, r) {
+                (Some(a), _) if a.row_count == 0 => {
                     l_iter.next().unwrap();
-                } else {
-                    res.push(RowSelector::select(b.row_count));
-                    a.row_count -= b.row_count;
-                    r_iter.next().unwrap();
                 }
-            }
-            // skip at least one
-            _ => {
-                if a.row_count < b.row_count {
-                    res.push(RowSelector::skip(a.row_count));
-                    b.row_count -= a.row_count;
-                    l_iter.next().unwrap();
-                } else {
-                    res.push(RowSelector::skip(b.row_count));
-                    a.row_count -= b.row_count;
+                (_, Some(b)) if b.row_count == 0 => {
                     r_iter.next().unwrap();
                 }
+                (Some(a), Some(b)) => {
+                    return match (a.skip, b.skip) {
+                        // Keep both ranges
+                        (false, false) => {
+                            if a.row_count < b.row_count {
+                                b.row_count -= a.row_count;
+                                l_iter.next()
+                            } else {
+                                a.row_count -= b.row_count;
+                                r_iter.next()
+                            }
+                        }
+                        // skip at least one
+                        _ => {
+                            if a.row_count < b.row_count {
+                                let skip = a.row_count;
+                                b.row_count -= a.row_count;
+                                l_iter.next();
+                                Some(RowSelector::skip(skip))
+                            } else {
+                                let skip = b.row_count;
+                                a.row_count -= skip;
+                                r_iter.next();
+                                Some(RowSelector::skip(skip))
+                            }
+                        }
+                    };
+                }
+                (Some(_), None) => return l_iter.next(),
+                (None, Some(_)) => return r_iter.next(),
+                (None, None) => return None,
             }
         }
-    }
-
-    if l_iter.peek().is_some() {
-        res.extend(l_iter);
-    }
-    if r_iter.peek().is_some() {
-        res.extend(r_iter);
-    }
-    res
-}
+    });

Review Comment:
   I reviewed this logic carefully -- it is very cool



##########
parquet/src/arrow/arrow_reader/selection.rs:
##########
@@ -465,64 +468,58 @@ impl From<RowSelection> for VecDeque<RowSelector> {
 /// other:     NYNNNNNNY
 ///
 /// returned:  NNNNNNNNYYNYN
-fn intersect_row_selections(left: &[RowSelector], right: &[RowSelector]) -> 
Vec<RowSelector> {
-    let mut res = Vec::with_capacity(left.len());
+fn intersect_row_selections(left: &[RowSelector], right: &[RowSelector]) -> 
RowSelection {
     let mut l_iter = left.iter().copied().peekable();
     let mut r_iter = right.iter().copied().peekable();
 
-    while let (Some(a), Some(b)) = (l_iter.peek_mut(), r_iter.peek_mut()) {
-        if a.row_count == 0 {
-            l_iter.next().unwrap();
-            continue;
-        }
-        if b.row_count == 0 {
-            r_iter.next().unwrap();
-            continue;
-        }
-        match (a.skip, b.skip) {
-            // Keep both ranges
-            (false, false) => {
-                if a.row_count < b.row_count {
-                    res.push(RowSelector::select(a.row_count));
-                    b.row_count -= a.row_count;
+    let iter = std::iter::from_fn(move || {
+        loop {
+            let l = l_iter.peek_mut();
+            let r = r_iter.peek_mut();
+
+            match (l, r) {
+                (Some(a), _) if a.row_count == 0 => {
                     l_iter.next().unwrap();
-                } else {
-                    res.push(RowSelector::select(b.row_count));
-                    a.row_count -= b.row_count;
-                    r_iter.next().unwrap();
                 }
-            }
-            // skip at least one
-            _ => {
-                if a.row_count < b.row_count {
-                    res.push(RowSelector::skip(a.row_count));
-                    b.row_count -= a.row_count;
-                    l_iter.next().unwrap();
-                } else {
-                    res.push(RowSelector::skip(b.row_count));
-                    a.row_count -= b.row_count;
+                (_, Some(b)) if b.row_count == 0 => {
                     r_iter.next().unwrap();
                 }
+                (Some(a), Some(b)) => {

Review Comment:
   I found the switch from `l,r` to `a,b` confusing -- can we use one 
consistently throughout this function? Either would be better than a mix



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to