This is an automated email from the ASF dual-hosted git repository.
github-merge-queue[bot] pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new b76970b4f9 minor: Track Parquet rows and pages matched when the page
index is skipped (#22085)
b76970b4f9 is described below
commit b76970b4f979111eb84337c778685c9df0f9cea9
Author: Nuno Faria <[email protected]>
AuthorDate: Tue May 12 13:08:48 2026 +0100
minor: Track Parquet rows and pages matched when the page index is skipped
(#22085)
## Which issue does this PR close?
- Follow up to #22031.
## Rationale for this change
If page index pruning is attempted but fails, report all rows/pages as
matched, instead of reporting 0 matched (see
https://github.com/apache/datafusion/pull/22031#discussion_r3193411270).
## What changes are included in this PR?
- Track the number of matched rows and pages independently of the result
of the predicate pruning.
## Are these changes tested?
Existing tests. I'm not sure how to generate Parquet files that trigger
this directly.
## Are there any user-facing changes?
No.
cc: @adriangb @2010YOUY01
---------
Co-authored-by: Daniël Heres <[email protected]>
Co-authored-by: xudong.w <[email protected]>
Co-authored-by: Kumar Ujjawal <[email protected]>
---
datafusion/datasource-parquet/src/page_filter.rs | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/datafusion/datasource-parquet/src/page_filter.rs
b/datafusion/datasource-parquet/src/page_filter.rs
index 419cf4f428..bc6551c977 100644
--- a/datafusion/datasource-parquet/src/page_filter.rs
+++ b/datafusion/datasource-parquet/src/page_filter.rs
@@ -199,9 +199,16 @@ impl PagePruningAccessPlanFilter {
for row_group_index in row_group_indexes {
// The selection for this particular row group
let mut overall_selection = None;
- let mut total_pages_in_group = 0;
+
+ let total_pages_in_group =
+ parquet_metadata.offset_index().map_or(0, |offset_index| {
+ offset_index[row_group_index]
+ .first()
+ .map_or(0, |column| column.page_locations.len())
+ });
// stores the indexes of the matched pages
- let mut matched_pages_in_group: Option<HashSet<usize>> = None;
+ let mut matched_pages_in_group: HashSet<usize> =
+ HashSet::from_iter(0..total_pages_in_group);
for predicate in page_index_predicates {
let column = predicate
@@ -245,19 +252,13 @@ impl PagePruningAccessPlanFilter {
predicate.predicate_expr(),
);
- total_pages_in_group = pages.len();
let matched_pages_indexes: HashSet<_> = pages
.into_iter()
.enumerate()
.filter(|x| x.1)
.map(|x| x.0)
.collect();
- if let Some(ref mut m) = matched_pages_in_group {
- // only keep pages that also matched in the previous
predicate(s)
- m.retain(|x| matched_pages_indexes.contains(x));
- } else {
- matched_pages_in_group = Some(matched_pages_indexes);
- }
+ matched_pages_in_group.retain(|x|
matched_pages_indexes.contains(x));
overall_selection = update_selection(overall_selection,
selection);
@@ -293,9 +294,12 @@ impl PagePruningAccessPlanFilter {
skipping all {rows_skipped} rows in row group
{row_group_index}"
);
}
+ } else {
+ total_select +=
+ parquet_metadata.row_group(row_group_index).num_rows() as
usize;
}
- let pages_matched = matched_pages_in_group.map_or(0, |m| m.len());
+ let pages_matched = matched_pages_in_group.len();
total_pages_select += pages_matched;
total_pages_skip += total_pages_in_group - pages_matched;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]