alamb commented on code in PR #10607: URL: https://github.com/apache/datafusion/pull/10607#discussion_r1608821292
########## datafusion/core/src/datasource/physical_plan/parquet/mod.rs: ########## @@ -556,32 +557,36 @@ impl FileOpener for ParquetOpener { }; }; - // Row group pruning by statistics: attempt to skip entire row_groups - // using metadata on the row groups + // Determine which row groups to actually read. The idea is to skip + // as many row groups as possible based on the metadata and query let file_metadata = builder.metadata().clone(); let predicate = pruning_predicate.as_ref().map(|p| p.as_ref()); - let mut row_groups = row_groups::prune_row_groups_by_statistics( - &file_schema, - builder.parquet_schema(), - file_metadata.row_groups(), - file_range, - predicate, - &file_metrics, - ); + let rg_metadata = file_metadata.row_groups(); + // track which row groups to actually read + let mut row_groups = RowGroupSet::new(rg_metadata.len()); Review Comment: The whole point of this PR is to move the set of RowGroups that are still under consideration into a `RowGroupSet` rather than a `Vec<usize>` This makes it clearer what is going on in my opinion, as well as sets up up for vectorized pruning evaluation in the next PR ########## datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs: ########## @@ -1196,14 +1223,60 @@ mod tests { .await } + // What row groups are expected to be left after pruning Review Comment: I was finding `vec![0]` hard to understand what was being tested, so I made this structure to encapsulate it ########## datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs: ########## @@ -38,42 +38,94 @@ use crate::physical_optimizer::pruning::{PruningPredicate, PruningStatistics}; use super::ParquetFileMetrics; -/// Prune row groups based on statistics +/// Tracks which RowGroupsw within a parquet file should be scanned. /// -/// Returns a vector of indexes into `groups` which should be scanned. -/// -/// If an index is NOT present in the returned Vec it means the -/// predicate filtered all the row group. -/// -/// If an index IS present in the returned Vec it means the predicate -/// did not filter out that row group. -/// -/// Note: This method currently ignores ColumnOrder -/// <https://github.com/apache/datafusion/issues/8335> -pub(crate) fn prune_row_groups_by_statistics( - arrow_schema: &Schema, - parquet_schema: &SchemaDescriptor, - groups: &[RowGroupMetaData], - range: Option<FileRange>, - predicate: Option<&PruningPredicate>, - metrics: &ParquetFileMetrics, -) -> Vec<usize> { - let mut filtered = Vec::with_capacity(groups.len()); - for (idx, metadata) in groups.iter().enumerate() { - if let Some(range) = &range { - // figure out where the first dictionary page (or first data page are) +/// This struct encapsulates the various types of pruning that can be applied to +/// a set of row groups within a parquet file. +#[derive(Debug, PartialEq)] +pub(crate) struct RowGroupSet { + /// row_groups[i] is true if the i-th row group should be scanned + row_groups: Vec<bool>, +} + +impl RowGroupSet { + /// Create a new RowGroupSet with all row groups set to true (will be scanned) + pub fn new(num_row_groups: usize) -> Self { + Self { + row_groups: vec![true; num_row_groups], + } + } + + /// Set the i-th row group to false (should not be scanned) + pub fn do_not_scan(&mut self, idx: usize) { + self.row_groups[idx] = false; + } + + /// return true if the i-th row group should be scanned + fn should_scan(&self, idx: usize) -> bool { + self.row_groups[idx] + } + + pub fn len(&self) -> usize { + self.row_groups.len() + } + + pub fn is_empty(&self) -> bool { + self.row_groups.is_empty() + } + + /// Return an iterator over the row group indexes that should be scanned + pub fn iter(&self) -> impl Iterator<Item = usize> + '_ { + self.row_groups + .iter() + .enumerate() + .filter_map(|(idx, &b)| if b { Some(idx) } else { None }) + } + + /// Return a vector with the row group indexes that should be scanned + pub fn indexes(&self) -> Vec<usize> { + self.iter().collect() + } + + /// Prune remaining row groups so that only those row groups within the + /// specified range are scanned. + /// + /// Updates this set to mark row groups that should not be scanned + pub fn prune_by_range(&mut self, groups: &[RowGroupMetaData], range: &FileRange) { + for (idx, metadata) in groups.iter().enumerate() { Review Comment: I moved the range filtering into its own function to make it clearer what is going on (it has nothing to do with statistics pruning) ########## datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs: ########## @@ -439,17 +483,15 @@ mod tests { ); let metrics = parquet_file_metrics(); - assert_eq!( - prune_row_groups_by_statistics( - &schema, - &schema_descr, - &[rgm1, rgm2], - None, - Some(&pruning_predicate), - &metrics - ), - vec![1] + let mut row_groups = RowGroupSet::new(2); Review Comment: I updated the tests to use `RowGroupSet` which I think actually makes them easier to read -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org