This is an automated email from the ASF dual-hosted git repository.

github-merge-queue[bot] pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new b76970b4f9 minor: Track Parquet rows and pages matched when the page index is skipped (#22085)
b76970b4f9 is described below

commit b76970b4f979111eb84337c778685c9df0f9cea9
Author: Nuno Faria <[email protected]>
AuthorDate: Tue May 12 13:08:48 2026 +0100

    minor: Track Parquet rows and pages matched when the page index is skipped (#22085)
    
    ## Which issue does this PR close?
    
    - Follow up to #22031.
    
    ## Rationale for this change
    
    If page index pruning is attempted but fails, report that all rows/pages
    have matched, instead of returning 0 (see
    https://github.com/apache/datafusion/pull/22031#discussion_r3193411270).
    
    
    ## What changes are included in this PR?
    
    - Track the number of matched rows and pages independently of the result
    of the predicate pruning.
    
    ## Are these changes tested?
    
    Existing tests. I'm not sure how to generate Parquet files that trigger
    this directly.
    
    ## Are there any user-facing changes?
    
    No.
    
    cc: @adriangb @2010YOUY01
    
    ---------
    
    Co-authored-by: Daniël Heres <[email protected]>
    Co-authored-by: xudong.w <[email protected]>
    Co-authored-by: Kumar Ujjawal <[email protected]>
---
 datafusion/datasource-parquet/src/page_filter.rs | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/datafusion/datasource-parquet/src/page_filter.rs b/datafusion/datasource-parquet/src/page_filter.rs
index 419cf4f428..bc6551c977 100644
--- a/datafusion/datasource-parquet/src/page_filter.rs
+++ b/datafusion/datasource-parquet/src/page_filter.rs
@@ -199,9 +199,16 @@ impl PagePruningAccessPlanFilter {
         for row_group_index in row_group_indexes {
             // The selection for this particular row group
             let mut overall_selection = None;
-            let mut total_pages_in_group = 0;
+
+            let total_pages_in_group =
+                parquet_metadata.offset_index().map_or(0, |offset_index| {
+                    offset_index[row_group_index]
+                        .first()
+                        .map_or(0, |column| column.page_locations.len())
+                });
             // stores the indexes of the matched pages
-            let mut matched_pages_in_group: Option<HashSet<usize>> = None;
+            let mut matched_pages_in_group: HashSet<usize> =
+                HashSet::from_iter(0..total_pages_in_group);
 
             for predicate in page_index_predicates {
                 let column = predicate
@@ -245,19 +252,13 @@ impl PagePruningAccessPlanFilter {
                     predicate.predicate_expr(),
                 );
 
-                total_pages_in_group = pages.len();
                 let matched_pages_indexes: HashSet<_> = pages
                     .into_iter()
                     .enumerate()
                     .filter(|x| x.1)
                     .map(|x| x.0)
                     .collect();
-                if let Some(ref mut m) = matched_pages_in_group {
-                    // only keep pages that also matched in the previous predicate(s)
-                    m.retain(|x| matched_pages_indexes.contains(x));
-                } else {
-                    matched_pages_in_group = Some(matched_pages_indexes);
-                }
+                matched_pages_in_group.retain(|x| matched_pages_indexes.contains(x));
 
                 overall_selection = update_selection(overall_selection, selection);
 
@@ -293,9 +294,12 @@ impl PagePruningAccessPlanFilter {
                         skipping all {rows_skipped} rows in row group {row_group_index}"
                     );
                 }
+            } else {
+                total_select +=
+                    parquet_metadata.row_group(row_group_index).num_rows() as usize;
             }
 
-            let pages_matched = matched_pages_in_group.map_or(0, |m| m.len());
+            let pages_matched = matched_pages_in_group.len();
             total_pages_select += pages_matched;
             total_pages_skip += total_pages_in_group - pages_matched;
         }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to