adriangb commented on code in PR #15568:
URL: https://github.com/apache/datafusion/pull/15568#discussion_r2038776161


##########
datafusion/physical-expr/src/expressions/dynamic_filters.rs:
##########
@@ -0,0 +1,380 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{
+    any::Any,
+    fmt::Display,
+    hash::Hash,
+    sync::{Arc, RwLock},
+};
+
+use crate::PhysicalExpr;
+use arrow::datatypes::{DataType, Schema};
+use datafusion_common::{
+    tree_node::{Transformed, TransformedResult, TreeNode},
+    Result,
+};
+use datafusion_expr::ColumnarValue;
+use datafusion_physical_expr_common::physical_expr::{DynEq, DynHash};
+
+/// A dynamic [`PhysicalExpr`] that can be updated by anyone with a reference to it.
+///
+/// The expression currently in effect is stored behind an `Arc<RwLock<..>>`, which
+/// is what allows `update` to replace it through a shared `&self` reference.
+#[derive(Debug)]
+pub struct DynamicFilterPhysicalExpr {
+    /// The original children of this PhysicalExpr, if any.
+    /// This is necessary because the dynamic filter may be initialized with a
+    /// placeholder (e.g. `lit(true)`) and later remapped to the actual expressions
+    /// that are being filtered.
+    /// But we need to know the children (e.g. columns referenced in the expression)
+    /// ahead of time to evaluate the expression correctly.
+    children: Vec<Arc<dyn PhysicalExpr>>,
+    /// If any of the children were remapped / modified (e.g. to adjust for projections)
+    /// we need to keep track of the new children so that when we update `current()`
+    /// in subsequent iterations we can re-apply the replacements.
+    /// `None` until `with_new_children` has recorded a replacement list.
+    remapped_children: Option<Vec<Arc<dyn PhysicalExpr>>>,
+    /// The source of dynamic filters: the expression currently in effect,
+    /// read by `current()` and replaced by `update()`.
+    inner: Arc<RwLock<Arc<dyn PhysicalExpr>>>,
+    /// For testing purposes track the data type and nullability to make sure they
+    /// don't change.
+    /// If they do, there's a bug in the implementation.
+    /// But this can have overhead in production, so it's only included in our tests.
+    /// NOTE(review): this field is declared without a `#[cfg(test)]` gate here;
+    /// confirm where the test-only restriction is actually enforced.
+    data_type: Arc<RwLock<Option<DataType>>>,
+    /// Tracked nullability; same purpose and caveat as `data_type` above.
+    nullable: Arc<RwLock<Option<bool>>>,
+}
+
+impl Hash for DynamicFilterPhysicalExpr {
+    /// Hashes the current (remapped) expression together with the children that
+    /// are currently in effect.
+    ///
+    /// Only the effective children (`remapped_children` when present, otherwise
+    /// `children`) are fed to the hasher, because those are exactly what the
+    /// `PartialEq` impl compares. Hashing both raw fields could give two values
+    /// that compare equal different hashes, which breaks the `Hash`/`Eq` contract.
+    ///
+    /// # Panics
+    ///
+    /// Panics if [`DynamicFilterPhysicalExpr::current`] returns an error
+    /// (e.g. the inner lock could not be read).
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        let inner = self.current().expect("Failed to get current expression");
+        let effective_children =
+            self.remapped_children.as_ref().unwrap_or(&self.children);
+        inner.dyn_hash(state);
+        effective_children.dyn_hash(state);
+    }
+}
+
+impl PartialEq for DynamicFilterPhysicalExpr {
+    /// Two dynamic filters are equal when their current (remapped) expressions
+    /// are equal and their effective children (`remapped_children` when present,
+    /// otherwise `children`) are equal.
+    ///
+    /// # Panics
+    ///
+    /// Panics if [`DynamicFilterPhysicalExpr::current`] returns an error for
+    /// either side (e.g. the inner lock could not be read).
+    fn eq(&self, other: &Self) -> bool {
+        let inner = self.current().expect("Failed to get current expression");
+        let our_children =
+            self.remapped_children.as_ref().unwrap_or(&self.children);
+        let other_children =
+            other.remapped_children.as_ref().unwrap_or(&other.children);
+        let other = other.current().expect("Failed to get current expression");
+        // Compare the underlying trait objects (the same form `current` uses for
+        // its child lookup) instead of calling `dyn_eq` on the `Arc`, which can
+        // resolve to the `Arc` wrapper's own blanket `DynEq` impl and then fail
+        // the downcast against the concrete expression.
+        inner.as_ref() == other.as_ref() && our_children == other_children
+    }
+}
+
+impl Eq for DynamicFilterPhysicalExpr {}
+
+impl Display for DynamicFilterPhysicalExpr {
+    /// Renders the filter as `DynamicFilterPhysicalExpr [ <current expression> ]`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if [`DynamicFilterPhysicalExpr::current`] returns an error.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let current_expr = self.current().expect("Failed to get current expression");
+        f.write_fmt(format_args!(
+            "DynamicFilterPhysicalExpr [ {} ]",
+            current_expr
+        ))
+    }
+}
+
+impl DynamicFilterPhysicalExpr {
+    /// Create a new [`DynamicFilterPhysicalExpr`]
+    /// from an initial expression and a list of children.
+    /// The list of children is provided separately because
+    /// the initial expression may not have the same children.
+    /// For example, if the initial expression is just `true`
+    /// it will not reference any columns, but we may know that
+    /// we are going to replace this expression with a real one
+    /// that does reference certain columns.
+    /// In this case you **must** pass in the columns that will be
+    /// used in the final expression as children to this function
+    /// since DataFusion is generally not compatible with dynamic
+    /// *children* in expressions.
+    ///
+    /// To determine the children you can:
+    ///
+    /// - Use [`collect_columns`] to collect the columns from the expression.
+    /// - Use existing information, such as the sort columns in a `SortExec`.
+    ///
+    /// Generally the important bit is that the *leaf children that reference
+    /// columns do not change* since those will be used to determine what columns
+    /// need to be read or projected when evaluating the expression.
+    ///
+    /// [`collect_columns`]: crate::utils::collect_columns
+    #[allow(dead_code)] // Only used in tests for now
+    pub fn new(
+        children: Vec<Arc<dyn PhysicalExpr>>,
+        inner: Arc<dyn PhysicalExpr>,
+    ) -> Self {
+        Self {
+            children,
+            remapped_children: None, // Initially no remapped children
+            inner: Arc::new(RwLock::new(inner)),
+            data_type: Arc::new(RwLock::new(None)),
+            nullable: Arc::new(RwLock::new(None)),
+        }
+    }
+
+    /// Get the current expression.
+    /// This will return the current expression with any children
+    /// remapped to match calls to [`PhysicalExpr::with_new_children`].
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the read lock on the inner expression cannot be
+    /// acquired, or if an original child is matched for which the remapped
+    /// children list has no entry at the same position.
+    pub fn current(&self) -> Result<Arc<dyn PhysicalExpr>> {
+        let current = {
+            let guard = self.inner.read().map_err(|_| {
+                datafusion_common::DataFusionError::Execution(
+                    "Failed to acquire read lock for inner".to_string(),
+                )
+            })?;
+            // Clone the `Arc` and drop the guard at the end of this scope so the
+            // lock is not held while the expression tree is rewritten below.
+            Arc::clone(&*guard)
+        };
+        if let Some(remapped_children) = &self.remapped_children {
+            // Remap children to the current children
+            // of the expression.
+            current
+                .transform_up(|expr| {
+                    // Check if this is any of our original children
+                    if let Some(pos) = self
+                        .children
+                        .iter()
+                        .position(|c| c.as_ref() == expr.as_ref())
+                    {
+                        // If so, remap it to the child at the same position in
+                        // the remapped list. Use a checked lookup: a remapped
+                        // list shorter than the original children would make
+                        // plain indexing panic.
+                        let new_child = remapped_children.get(pos).ok_or_else(|| {
+                            datafusion_common::DataFusionError::Internal(format!(
+                                "DynamicFilterPhysicalExpr has {} original children but only {} remapped children",
+                                self.children.len(),
+                                remapped_children.len()
+                            ))
+                        })?;
+                        Ok(Transformed::yes(Arc::clone(new_child)))
+                    } else {
+                        // Otherwise, just return the expression
+                        Ok(Transformed::no(expr))
+                    }
+                })
+                .data()
+        } else {
+            Ok(current)
+        }
+    }
+
+    /// Update the current expression.
+    /// Any children of this expression must be a subset of the original children
+    /// passed to the constructor.
+    /// This should be called e.g.:
+    /// - When the hash table of a `HashJoinExec` has been computed.
+    /// - After every batch is processed if we update the TopK heap in a
+    ///   `SortExec` using a TopK approach.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the write lock on the inner expression cannot be
+    /// acquired.
+    #[allow(dead_code)] // Only used in tests for now
+    pub fn update(&self, new_expr: Arc<dyn PhysicalExpr>) -> Result<()> {
+        let mut current = self.inner.write().map_err(|_| {
+            datafusion_common::DataFusionError::Execution(
+                "Failed to acquire write lock for inner".to_string(),
+            )
+        })?;
+        *current = new_expr;
+        Ok(())
+    }
+}
+
+impl PhysicalExpr for DynamicFilterPhysicalExpr {
+    /// Exposes this expression as a `&dyn Any` reference.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// Returns references to the children currently in effect: the remapped
+    /// children when a remapping has been recorded, otherwise the original ones.
+    fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
+        match &self.remapped_children {
+            Some(remapped) => remapped.iter().collect(),
+            None => self.children.iter().collect(),
+        }
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn PhysicalExpr>>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        Ok(Arc::new(Self {
+            children: self.children.clone(),
+            remapped_children: Some(children),

Review Comment:
   > DynamicFilterPhysicalExpr(Vec<Arc<dyn PhysicalExpr>>) is enough
   
   Again, I'm sorry if I'm missing something, but how would this work to share a 
single updatable reference across multiple copies of the PhysicalExpr (e.g. 
because `with_new_children` was called)?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to