This is an automated email from the ASF dual-hosted git repository. github-merge-queue[bot] pushed a commit to branch gh-readonly-queue/main/pr-22085-3c5361881a52f0fc673983cc0eee213723e12890 in repository https://gitbox.apache.org/repos/asf/datafusion.git
commit b76970b4f979111eb84337c778685c9df0f9cea9 Author: Nuno Faria <[email protected]> AuthorDate: Tue May 12 13:08:48 2026 +0100 minor: Track Parquet rows and pages matched when the page index is skipped (#22085) ## Which issue does this PR close? - Follow up to #22031. ## Rationale for this change If page index pruning is attempted but fails, report that all rows/pages have matched, instead of returning 0 (see https://github.com/apache/datafusion/pull/22031#discussion_r3193411270). ## What changes are included in this PR? - Track the number of matched rows and pages independently of the result of the predicate pruning. ## Are these changes tested? Existing tests. I'm not sure how to generate Parquet files that trigger this directly. ## Are there any user-facing changes? No. cc: @adriangb @2010YOUY01 --------- Co-authored-by: Daniël Heres <[email protected]> Co-authored-by: xudong.w <[email protected]> Co-authored-by: Kumar Ujjawal <[email protected]> --- datafusion/datasource-parquet/src/page_filter.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/datafusion/datasource-parquet/src/page_filter.rs b/datafusion/datasource-parquet/src/page_filter.rs index 419cf4f428..bc6551c977 100644 --- a/datafusion/datasource-parquet/src/page_filter.rs +++ b/datafusion/datasource-parquet/src/page_filter.rs @@ -199,9 +199,16 @@ impl PagePruningAccessPlanFilter { for row_group_index in row_group_indexes { // The selection for this particular row group let mut overall_selection = None; - let mut total_pages_in_group = 0; + + let total_pages_in_group = + parquet_metadata.offset_index().map_or(0, |offset_index| { + offset_index[row_group_index] + .first() + .map_or(0, |column| column.page_locations.len()) + }); // stores the indexes of the matched pages - let mut matched_pages_in_group: Option<HashSet<usize>> = None; + let mut matched_pages_in_group: HashSet<usize> = + HashSet::from_iter(0..total_pages_in_group); for predicate in 
page_index_predicates { let column = predicate @@ -245,19 +252,13 @@ impl PagePruningAccessPlanFilter { predicate.predicate_expr(), ); - total_pages_in_group = pages.len(); let matched_pages_indexes: HashSet<_> = pages .into_iter() .enumerate() .filter(|x| x.1) .map(|x| x.0) .collect(); - if let Some(ref mut m) = matched_pages_in_group { - // only keep pages that also matched in the previous predicate(s) - m.retain(|x| matched_pages_indexes.contains(x)); - } else { - matched_pages_in_group = Some(matched_pages_indexes); - } + matched_pages_in_group.retain(|x| matched_pages_indexes.contains(x)); overall_selection = update_selection(overall_selection, selection); @@ -293,9 +294,12 @@ impl PagePruningAccessPlanFilter { skipping all {rows_skipped} rows in row group {row_group_index}" ); } + } else { + total_select += + parquet_metadata.row_group(row_group_index).num_rows() as usize; } - let pages_matched = matched_pages_in_group.map_or(0, |m| m.len()); + let pages_matched = matched_pages_in_group.len(); total_pages_select += pages_matched; total_pages_skip += total_pages_in_group - pages_matched; } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
