advancedxy commented on code in PR #10607: URL: https://github.com/apache/datafusion/pull/10607#discussion_r1612607959
########## datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs: ########## @@ -38,42 +38,100 @@ use crate::physical_optimizer::pruning::{PruningPredicate, PruningStatistics}; use super::ParquetFileMetrics; -/// Prune row groups based on statistics +/// Tracks which RowGroups within a parquet file should be scanned. /// -/// Returns a vector of indexes into `groups` which should be scanned. -/// -/// If an index is NOT present in the returned Vec it means the -/// predicate filtered all the row group. -/// -/// If an index IS present in the returned Vec it means the predicate -/// did not filter out that row group. -/// -/// Note: This method currently ignores ColumnOrder -/// <https://github.com/apache/datafusion/issues/8335> -pub(crate) fn prune_row_groups_by_statistics( - arrow_schema: &Schema, - parquet_schema: &SchemaDescriptor, - groups: &[RowGroupMetaData], - range: Option<FileRange>, - predicate: Option<&PruningPredicate>, - metrics: &ParquetFileMetrics, -) -> Vec<usize> { - let mut filtered = Vec::with_capacity(groups.len()); - for (idx, metadata) in groups.iter().enumerate() { - if let Some(range) = &range { - // figure out where the first dictionary page (or first data page are) +/// This struct encapsulates the various types of pruning that can be applied to +/// a set of row groups within a parquet file, progressively narrowing down the +/// set of row groups that should be scanned. +#[derive(Debug, PartialEq)] +pub struct RowGroupSet { + /// `row_groups[i]` is true if the i-th row group should be scanned + row_groups: Vec<bool>, +} + +impl RowGroupSet { + /// Create a new `RowGroupSet` with all row groups set to true (will be scanned) + pub fn new(num_row_groups: usize) -> Self { + Self { + row_groups: vec![true; num_row_groups], + } + } + + /// Set the i-th row group to false (should not be scanned) + pub fn do_not_scan(&mut self, idx: usize) { + self.row_groups[idx] = false; + } + + /// Return true if the i-th row group should be scanned + fn should_scan(&self, idx: usize) -> bool { + self.row_groups[idx] + } + + /// Return the total number of row groups (not the total number to be scanned) + pub fn len(&self) -> usize { + self.row_groups.len() + } + + /// Return true if there are no row groups + pub fn is_empty(&self) -> bool { + self.row_groups.is_empty() + } + + /// Return an iterator over the row group indexes that should be scanned + pub fn iter(&self) -> impl Iterator<Item = usize> + '_ { + self.row_groups + .iter() + .enumerate() + .filter_map(|(idx, &b)| if b { Some(idx) } else { None }) + } + + /// Return a `Vec` of row group indices that should be scanned + pub fn indexes(&self) -> Vec<usize> { + self.iter().collect() + } + + /// Prune remaining row groups to only those within the specified range. + /// + /// Updates this set to mark row groups that should not be scanned + pub fn prune_by_range(&mut self, groups: &[RowGroupMetaData], range: &FileRange) { Review Comment: > Yes this is a good idea. I will do so in the next PR Thanks for the quick update. I agree we can at least add check first. > I actually tried this in an earlier version of this code. One challenge I found is that prune_by_bloom_filter needs to have the builder (as it may need to potentially fetch bloom filters) which also has the metadata. So if the RowGroupSet had the metadata then there was an opportunity for inconsistency (between the metadata in the RowGroupSet and the builder) > > I do think there might be an opportunity for improvement here though Then, I think we can leave it as it is for now. We can revise the impl later if some issues popped up. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org