adriangb commented on code in PR #16424: URL: https://github.com/apache/datafusion/pull/16424#discussion_r2150409636
########## datafusion/datasource-parquet/src/opener.rs: ########## @@ -524,6 +512,91 @@ fn should_enable_page_index( .unwrap_or(false) } +/// Prune based on partition values and file-level statistics. +pub struct FilePruner { + predicate: Arc<dyn PhysicalExpr>, + pruning_schema: Arc<Schema>, + file: PartitionedFile, + partition_fields: Vec<FieldRef>, + predicate_creation_errors: Count, +} + +impl FilePruner { + pub fn new_opt( + predicate: Arc<dyn PhysicalExpr>, + logical_file_schema: &SchemaRef, + partition_fields: Vec<FieldRef>, + file: PartitionedFile, + predicate_creation_errors: Count, + ) -> Result<Option<Self>> { + // If there is not dynamic predicate, we don't need to prune + if !is_dynamic_physical_expr(Arc::clone(&predicate))? { + return Ok(None); + } + // Build a pruning schema that combines the file fields and partition fields. + // Partition fileds are always at the end. + let pruning_schema = Arc::new( + Schema::new( + logical_file_schema + .fields() + .iter() + .cloned() + .chain(partition_fields.iter().cloned()) + .collect_vec(), + ) + .with_metadata(logical_file_schema.metadata().clone()), + ); + Ok(Some(Self { + predicate, + pruning_schema, + file, + partition_fields, + predicate_creation_errors, + })) + } + + pub fn should_prune(&self) -> Result<bool> { + let pruning_predicate = build_pruning_predicate( + Arc::clone(&self.predicate), + &self.pruning_schema, + &self.predicate_creation_errors, + ); Review Comment: Another option would be to add a `generation` to dynamic filters which gets bumped up by 1 every time they get updated. Then it would be super cheap to check if a filter has been updated. But we'd have to come up with APIs for that, put it on `PhysicalExpr` (what happens if there are multiple child dynamic filters with different generations...?), etc. 
It seems to me that any perf tradeoff here only affects some cases with dynamic filters, so it should be okay to proceed as is for now and treat that as a later optimization. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org