jayzhan211 commented on code in PR #15568: URL: https://github.com/apache/datafusion/pull/15568#discussion_r2038636475
########## datafusion/physical-expr/src/expressions/dynamic_filters.rs: ########## @@ -0,0 +1,380 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + any::Any, + fmt::Display, + hash::Hash, + sync::{Arc, RwLock}, +}; + +use crate::PhysicalExpr; +use arrow::datatypes::{DataType, Schema}; +use datafusion_common::{ + tree_node::{Transformed, TransformedResult, TreeNode}, + Result, +}; +use datafusion_expr::ColumnarValue; +use datafusion_physical_expr_common::physical_expr::{DynEq, DynHash}; + +/// A dynamic [`PhysicalExpr`] that can be updated by anyone with a reference to it. +#[derive(Debug)] +pub struct DynamicFilterPhysicalExpr { + /// The original children of this PhysicalExpr, if any. + /// This is necessary because the dynamic filter may be initialized with a placeholder (e.g. `lit(true)`) + /// and later remapped to the actual expressions that are being filtered. + /// But we need to know the children (e.g. columns referenced in the expression) ahead of time to evaluate the expression correctly. + children: Vec<Arc<dyn PhysicalExpr>>, + /// If any of the children were remapped / modified (e.g. 
to adjust for projections) we need to keep track of the new children + /// so that when we update `current()` in subsequent iterations we can re-apply the replacements. + remapped_children: Option<Vec<Arc<dyn PhysicalExpr>>>, + /// The source of dynamic filters. + inner: Arc<RwLock<Arc<dyn PhysicalExpr>>>, + /// For testing purposes track the data type and nullability to make sure they don't change. + /// If they do, there's a bug in the implementation. + /// But this can have overhead in production, so it's only included in our tests. + data_type: Arc<RwLock<Option<DataType>>>, + nullable: Arc<RwLock<Option<bool>>>, +} + +impl Hash for DynamicFilterPhysicalExpr { + fn hash<H: std::hash::Hasher>(&self, state: &mut H) { + let inner = self.current().expect("Failed to get current expression"); + inner.dyn_hash(state); + self.children.dyn_hash(state); + self.remapped_children.dyn_hash(state); + } +} + +impl PartialEq for DynamicFilterPhysicalExpr { + fn eq(&self, other: &Self) -> bool { + let inner = self.current().expect("Failed to get current expression"); + let our_children = self.remapped_children.as_ref().unwrap_or(&self.children); + let other_children = other.remapped_children.as_ref().unwrap_or(&other.children); + let other = other.current().expect("Failed to get current expression"); + inner.dyn_eq(other.as_any()) && our_children == other_children + } +} + +impl Eq for DynamicFilterPhysicalExpr {} + +impl Display for DynamicFilterPhysicalExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let inner = self.current().expect("Failed to get current expression"); + write!(f, "DynamicFilterPhysicalExpr [ {} ]", inner) + } +} + +impl DynamicFilterPhysicalExpr { + /// Create a new [`DynamicFilterPhysicalExpr`] + /// from an initial expression and a list of children. + /// The list of children is provided separately because + /// the initial expression may not have the same children. 
+ /// For example, if the initial expression is just `true` + /// it will not reference any columns, but we may know that + /// we are going to replace this expression with a real one + /// that does reference certain columns. + /// In this case you **must** pass in the columns that will be + /// used in the final expression as children to this function + /// since DataFusion is generally not compatible with dynamic + /// *children* in expressions. + /// + /// To determine the children you can: + /// + /// - Use [`collect_columns`] to collect the columns from the expression. + /// - Use existing information, such as the sort columns in a `SortExec`. + /// + /// Generally the important bit is that the *leaf children that reference columns + /// do not change* since those will be used to determine what columns need to read or projected + /// when evaluating the expression. + /// + /// [`collect_columns`]: crate::utils::collect_columns + #[allow(dead_code)] // Only used in tests for now + pub fn new( + children: Vec<Arc<dyn PhysicalExpr>>, + inner: Arc<dyn PhysicalExpr>, + ) -> Self { + Self { + children, + remapped_children: None, // Initially no remapped children + inner: Arc::new(RwLock::new(inner)), + data_type: Arc::new(RwLock::new(None)), + nullable: Arc::new(RwLock::new(None)), + } + } + + /// Get the current expression. + /// This will return the current expression with any children + /// remapped to match calls to [`PhysicalExpr::with_new_children`]. + pub fn current(&self) -> Result<Arc<dyn PhysicalExpr>> { + let current = self + .inner + .read() + .map_err(|_| { + datafusion_common::DataFusionError::Execution( + "Failed to acquire read lock for inner".to_string(), + ) + })? + .clone(); + if let Some(remapped_children) = &self.remapped_children { + // Remap children to the current children + // of the expression. 
+ current + .transform_up(|expr| { + // Check if this is any of our original children + if let Some(pos) = self + .children + .iter() + .position(|c| c.as_ref() == expr.as_ref()) + { + // If so, remap it to the current children + // of the expression. + let new_child = Arc::clone(&remapped_children[pos]); + Ok(Transformed::yes(new_child)) + } else { + // Otherwise, just return the expression + Ok(Transformed::no(expr)) + } + }) + .data() + } else { + Ok(current) + } + } + + /// Update the current expression. + /// Any children of this expression must be a subset of the original children + /// passed to the constructor. + /// This should be called e.g.: + /// - When we've computed the probe side's hash table in a HashJoinExec + /// - After every batch is processed if we update the TopK heap in a SortExec using a TopK approach. + #[allow(dead_code)] // Only used in tests for now + pub fn update(&self, new_expr: Arc<dyn PhysicalExpr>) -> Result<()> { + let mut current = self.inner.write().map_err(|_| { + datafusion_common::DataFusionError::Execution( + "Failed to acquire write lock for inner".to_string(), + ) + })?; + *current = new_expr; + Ok(()) + } +} + +impl PhysicalExpr for DynamicFilterPhysicalExpr { + fn as_any(&self) -> &dyn Any { + self + } + + fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> { + self.remapped_children + .as_ref() + .unwrap_or(&self.children) + .iter() + .collect() + } + + fn with_new_children( + self: Arc<Self>, + children: Vec<Arc<dyn PhysicalExpr>>, + ) -> Result<Arc<dyn PhysicalExpr>> { + Ok(Arc::new(Self { + children: self.children.clone(), + remapped_children: Some(children), Review Comment: > Why do they need to be updated here? Children get remapped dynamically when you call current(). What if we are doing plan rewrite and want a completely different `children` in `DynamicFilterPhysicalExpr`? 
```rust impl<T: ConcreteTreeNode> TreeNode for T { fn apply_children<'n, F: FnMut(&'n Self) -> Result<TreeNodeRecursion>>( &'n self, f: F, ) -> Result<TreeNodeRecursion> { self.children().iter().apply_until_stop(f) } fn map_children<F: FnMut(Self) -> Result<Transformed<Self>>>( self, f: F, ) -> Result<Transformed<Self>> { let (new_self, children) = self.take_children(); if !children.is_empty() { let new_children = children.into_iter().map_until_stop_and_collect(f)?; // Propagate up `new_children.transformed` and `new_children.tnr` along with // the node containing transformed children. new_children.map_data(|new_children| new_self.with_new_children(new_children)) } else { Ok(Transformed::no(new_self)) } } } ``` `with_new_children` is the function that gets called when you do a plan rewrite, but in your code `children` isn't changed; the new children are stored in `remapped_children` instead. IMO this looks like you are relying on `with_new_children` to satisfy a requirement specific to `DynamicFilterPhysicalExpr`, and it doesn't actually have the ability to update the expression with the new one. ```rust fn with_new_children( self: Arc<Self>, children: Vec<Arc<dyn PhysicalExpr>>, ) -> Result<Arc<dyn PhysicalExpr>> { Ok(Arc::new(Self { children: self.children.clone(), remapped_children: Some(children), inner: Arc::clone(&self.inner), data_type: Arc::clone(&self.data_type), nullable: Arc::clone(&self.nullable), })) } ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org