adriangb commented on code in PR #15568: URL: https://github.com/apache/datafusion/pull/15568#discussion_r2038768921
########## datafusion/physical-expr/src/expressions/dynamic_filters.rs: ########## @@ -0,0 +1,380 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + any::Any, + fmt::Display, + hash::Hash, + sync::{Arc, RwLock}, +}; + +use crate::PhysicalExpr; +use arrow::datatypes::{DataType, Schema}; +use datafusion_common::{ + tree_node::{Transformed, TransformedResult, TreeNode}, + Result, +}; +use datafusion_expr::ColumnarValue; +use datafusion_physical_expr_common::physical_expr::{DynEq, DynHash}; + +/// A dynamic [`PhysicalExpr`] that can be updated by anyone with a reference to it. +#[derive(Debug)] +pub struct DynamicFilterPhysicalExpr { + /// The original children of this PhysicalExpr, if any. + /// This is necessary because the dynamic filter may be initialized with a placeholder (e.g. `lit(true)`) + /// and later remapped to the actual expressions that are being filtered. + /// But we need to know the children (e.g. columns referenced in the expression) ahead of time to evaluate the expression correctly. + children: Vec<Arc<dyn PhysicalExpr>>, + /// If any of the children were remapped / modified (e.g. 
to adjust for projections) we need to keep track of the new children + /// so that when we update `current()` in subsequent iterations we can re-apply the replacements. + remapped_children: Option<Vec<Arc<dyn PhysicalExpr>>>, + /// The source of dynamic filters. + inner: Arc<RwLock<Arc<dyn PhysicalExpr>>>, + /// For testing purposes track the data type and nullability to make sure they don't change. + /// If they do, there's a bug in the implementation. + /// But this can have overhead in production, so it's only included in our tests. + data_type: Arc<RwLock<Option<DataType>>>, + nullable: Arc<RwLock<Option<bool>>>, +} + +impl Hash for DynamicFilterPhysicalExpr { + fn hash<H: std::hash::Hasher>(&self, state: &mut H) { + let inner = self.current().expect("Failed to get current expression"); + inner.dyn_hash(state); + self.children.dyn_hash(state); + self.remapped_children.dyn_hash(state); + } +} + +impl PartialEq for DynamicFilterPhysicalExpr { + fn eq(&self, other: &Self) -> bool { + let inner = self.current().expect("Failed to get current expression"); + let our_children = self.remapped_children.as_ref().unwrap_or(&self.children); + let other_children = other.remapped_children.as_ref().unwrap_or(&other.children); + let other = other.current().expect("Failed to get current expression"); + inner.dyn_eq(other.as_any()) && our_children == other_children + } +} + +impl Eq for DynamicFilterPhysicalExpr {} + +impl Display for DynamicFilterPhysicalExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let inner = self.current().expect("Failed to get current expression"); + write!(f, "DynamicFilterPhysicalExpr [ {} ]", inner) + } +} + +impl DynamicFilterPhysicalExpr { + /// Create a new [`DynamicFilterPhysicalExpr`] + /// from an initial expression and a list of children. + /// The list of children is provided separately because + /// the initial expression may not have the same children. 
+ /// For example, if the initial expression is just `true` + /// it will not reference any columns, but we may know that + /// we are going to replace this expression with a real one + /// that does reference certain columns. + /// In this case you **must** pass in the columns that will be + /// used in the final expression as children to this function + /// since DataFusion is generally not compatible with dynamic + /// *children* in expressions. + /// + /// To determine the children you can: + /// + /// - Use [`collect_columns`] to collect the columns from the expression. + /// - Use existing information, such as the sort columns in a `SortExec`. + /// + /// Generally the important bit is that the *leaf children that reference columns + /// do not change* since those will be used to determine what columns need to be read or projected + /// when evaluating the expression. + /// + /// [`collect_columns`]: crate::utils::collect_columns + #[allow(dead_code)] // Only used in tests for now + pub fn new( + children: Vec<Arc<dyn PhysicalExpr>>, + inner: Arc<dyn PhysicalExpr>, + ) -> Self { + Self { + children, + remapped_children: None, // Initially no remapped children + inner: Arc::new(RwLock::new(inner)), + data_type: Arc::new(RwLock::new(None)), Review Comment: Okay, will look into that in a follow-up PR. My thought was that it's pretty cheap to initialize them. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org