alamb commented on code in PR #4713:
URL: https://github.com/apache/arrow-datafusion/pull/4713#discussion_r1056316886
##########
datafusion/core/src/physical_plan/file_format/parquet.rs:
##########
@@ -126,14 +127,18 @@ impl ParquetExec {
}
}
})
- .and_then(|pruning_predicate| {
- // If the pruning predicate can't prune anything, don't try
- if pruning_predicate.allways_true() {
+ .filter(|p| !p.allways_true());
+
+ let page_pruning_predicate =
predicate.as_ref().and_then(|predicate_expr| {
+ match PagePruningPredicate::try_new(predicate_expr,
file_schema.clone()) {
+ Ok(pruning_predicate) => Some(Arc::new(pruning_predicate)),
+ Err(e) => {
+ debug!("Could not create page pruning predicate for: {}",
e);
Review Comment:
I think it would be worth including the `predicate_expr` in this message too
```suggestion
debug!("Could not create page pruning predicate for '{}': {}", predicate_expr, e);
```
##########
datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs:
##########
@@ -99,94 +100,114 @@ use super::metrics::ParquetFileMetrics;
///
/// So we can entirely skip rows 0->199 and 250->299 as we know they
/// can not contain rows that match the predicate.
-pub(crate) fn build_page_filter(
- pruning_predicate: Option<&PruningPredicate>,
- schema: SchemaRef,
- row_groups: &[usize],
- file_metadata: &ParquetMetaData,
- file_metrics: &ParquetFileMetrics,
-) -> Result<Option<RowSelection>> {
- // scoped timer updates on drop
- let _timer_guard = file_metrics.page_index_eval_time.timer();
- let page_index_predicates =
- extract_page_index_push_down_predicates(pruning_predicate, schema)?;
+#[derive(Debug)]
+pub(crate) struct PagePruningPredicate {
+ predicates: Vec<PruningPredicate>,
+}
- if page_index_predicates.is_empty() {
- return Ok(None);
+impl PagePruningPredicate {
+ /// Create a new [`PagePruningPredicate`]
+ pub fn try_new(expr: &Expr, schema: SchemaRef) -> Result<Self> {
+ let predicates = split_conjunction(expr)
+ .into_iter()
+ .filter_map(|predicate| match predicate.to_columns() {
+ Ok(columns) if columns.len() == 1 => {
+ match PruningPredicate::try_new(predicate.clone(),
schema.clone()) {
+ Ok(p) if !p.allways_true() => Some(Ok(p)),
Review Comment:
Why don't they refer to a single column? Isn't this branch protected by a
match guard `if columns.len() == 1`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]