Re: [PR] Refactor parquet row group pruning into a struct (use new statistics API, part 1) [datafusion]

via GitHub Thu, 23 May 2024 10:20:55 -0700


alamb commented on code in PR #10607:
URL: https://github.com/apache/datafusion/pull/10607#discussion_r1612063262



##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -38,42 +38,100 @@ use crate::physical_optimizer::pruning::{PruningPredicate, 
PruningStatistics};
 
 use super::ParquetFileMetrics;
 
-/// Prune row groups based on statistics
+/// Tracks which RowGroups within a parquet file should be scanned.
 ///
-/// Returns a vector of indexes into `groups` which should be scanned.
-///
-/// If an index is NOT present in the returned Vec it means the
-/// predicate filtered all the row group.
-///
-/// If an index IS present in the returned Vec it means the predicate
-/// did not filter out that row group.
-///
-/// Note: This method currently ignores ColumnOrder
-/// <https://github.com/apache/datafusion/issues/8335>
-pub(crate) fn prune_row_groups_by_statistics(
-    arrow_schema: &Schema,
-    parquet_schema: &SchemaDescriptor,
-    groups: &[RowGroupMetaData],
-    range: Option<FileRange>,
-    predicate: Option<&PruningPredicate>,
-    metrics: &ParquetFileMetrics,
-) -> Vec<usize> {
-    let mut filtered = Vec::with_capacity(groups.len());
-    for (idx, metadata) in groups.iter().enumerate() {
-        if let Some(range) = &range {
-            // figure out where the first dictionary page (or first data page 
are)
+/// This struct encapsulates the various types of pruning that can be applied 
to
+/// a set of row groups within a parquet file, progressively narrowing down the
+/// set of row groups that should be scanned.
+#[derive(Debug, PartialEq)]
+pub struct RowGroupSet {
+    /// `row_groups[i]` is true if the i-th row group should be scanned
+    row_groups: Vec<bool>,
+}
+
+impl RowGroupSet {
+    /// Create a new `RowGroupSet` with all row groups set to true (will be 
scanned)
+    pub fn new(num_row_groups: usize) -> Self {
+        Self {
+            row_groups: vec![true; num_row_groups],
+        }
+    }
+
+    /// Set the i-th row group to false (should not be scanned)
+    pub fn do_not_scan(&mut self, idx: usize) {
+        self.row_groups[idx] = false;
+    }
+
+    /// Return true if the i-th row group should be scanned
+    fn should_scan(&self, idx: usize) -> bool {
+        self.row_groups[idx]
+    }
+
+    /// Return the total number of row groups (not the total number to be 
scanned)
+    pub fn len(&self) -> usize {
+        self.row_groups.len()
+    }
+
+    /// Return true if there are no row groups
+    pub fn is_empty(&self) -> bool {
+        self.row_groups.is_empty()
+    }
+
+    /// Return an iterator over the row group indexes that should be scanned
+    pub fn iter(&self) -> impl Iterator<Item = usize> + '_ {
+        self.row_groups
+            .iter()
+            .enumerate()
+            .filter_map(|(idx, &b)| if b { Some(idx) } else { None })
+    }
+
+    /// Return a `Vec` of row group indices that should be scanned
+    pub fn indexes(&self) -> Vec<usize> {
+        self.iter().collect()
+    }
+
+    /// Prune remaining row groups to only those  within the specified range.
+    ///
+    /// Updates this set to mark row groups that should not be scanned
+    pub fn prune_by_range(&mut self, groups: &[RowGroupMetaData], range: 
&FileRange) {

Review Comment:
   Here is a PR to add some asserts: 
https://github.com/apache/datafusion/pull/10641



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Refactor parquet row group pruning into a struct (use new statistics API, part 1) [datafusion]

Reply via email to