Ted-Jiang commented on code in PR #11483:
URL: https://github.com/apache/datafusion/pull/11483#discussion_r1680621877


##########
datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs:
##########
@@ -266,105 +287,60 @@ fn update_selection(
     }
 }
 
-/// Returns the column index in the row parquet schema for the single
-/// column of a single column pruning predicate.
-///
-/// For example, give the predicate `y > 5`
+/// Returns a [`RowSelection`] for the rows in this row group to scan.
 ///
-/// And columns in the RowGroupMetadata like `['x', 'y', 'z']` will
-/// return 1.
+/// This Row Selection is formed from the page index and the predicate skips 
row
+/// ranges that can be ruled out based on the predicate.
 ///
-/// Returns `None` if the column is not found, or if there are no
-/// required columns, which is the case for predicate like `abs(i) =
-/// 1` which are rewritten to `lit(true)`
-///
-/// Panics:
-///
-/// If the predicate contains more than one column reference (assumes
-/// that `extract_page_index_push_down_predicates` only returns
-/// predicate with one col)
-fn find_column_index(
-    predicate: &PruningPredicate,
-    arrow_schema: &Schema,
-    parquet_schema: &SchemaDescriptor,
-) -> Option<usize> {
-    let mut found_required_column: Option<&Column> = None;
-
-    for required_column_details in predicate.required_columns().iter() {
-        let column = &required_column_details.0;
-        if let Some(found_required_column) = found_required_column.as_ref() {
-            // make sure it is the same name we have seen previously
-            assert_eq!(
-                column.name(),
-                found_required_column.name(),
-                "Unexpected multi column predicate"
-            );
-        } else {
-            found_required_column = Some(column);
-        }
-    }
-
-    let Some(column) = found_required_column.as_ref() else {
-        trace!("No column references in pruning predicate");
-        return None;
-    };
-
-    parquet_column(parquet_schema, arrow_schema, column.name()).map(|x| x.0)
-}
-
-/// Returns a `RowSelection` for the pages in this RowGroup if any
-/// rows can be pruned based on the page index
+/// Returns `None` if there is an error evaluating the predicate or the 
required
+/// page information is not present.
 fn prune_pages_in_one_row_group(
-    group: &RowGroupMetaData,
-    predicate: &PruningPredicate,
-    col_offset_indexes: Option<&Vec<PageLocation>>,
-    col_page_indexes: Option<&Index>,
-    col_desc: &ColumnDescriptor,
+    row_group_index: usize,
+    pruning_predicate: &PruningPredicate,
+    converter: StatisticsConverter<'_>,
+    parquet_metadata: &ParquetMetaData,
     metrics: &ParquetFileMetrics,
 ) -> Option<RowSelection> {
-    let num_rows = group.num_rows() as usize;
-    let (Some(col_offset_indexes), Some(col_page_indexes)) =
-        (col_offset_indexes, col_page_indexes)
-    else {
-        return None;
-    };
-
-    let target_type = parquet_to_arrow_decimal_type(col_desc);
-    let pruning_stats = PagesPruningStatistics {
-        col_page_indexes,
-        col_offset_indexes,
-        target_type: &target_type,
-        num_rows_in_row_group: group.num_rows(),
-    };
+    let pruning_stats =
+        PagesPruningStatistics::try_new(row_group_index, converter, 
parquet_metadata)?;
 
-    let values = match predicate.prune(&pruning_stats) {
+    // Each element in values is a boolean indicating whether the page may have
+    // values that match the predicate (true) or could not possibly have values
+    // that match the predicate (false).
+    let values = match pruning_predicate.prune(&pruning_stats) {
         Ok(values) => values,
         Err(e) => {
-            // stats filter array could not be built
-            // return a result which will not filter out any pages
             debug!("Error evaluating page index predicate values {e}");
             metrics.predicate_evaluation_errors.add(1);
             return None;
         }
     };
 
+    // Convert the information of which pages to skip into a RowSelection
+    // that describes the ranges of rows to skip.
+    let Some(page_row_counts) = pruning_stats.page_row_counts() else {
+        debug!(
+            "Can not determine page row counts for row group 
{row_group_index}, skipping"
+        );

Review Comment:
   I think here need add `metrics.predicate_evaluation_errors.add(1);`
   as above `Returns None  if there is an error evaluating the predicate`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to