This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new e566329707 Improve Canonicalize API (#8983)
e566329707 is described below

commit e566329707e910ce6f8a32822dbafef993a6faaa
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Feb 1 12:05:36 2024 -0500

    Improve Canonicalize API (#8983)
    
    * Improve Canonicalize API
    
    * Update datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
    
    Co-authored-by: Junhao Liu <[email protected]>
    
    * Simplify the API
    
    * fix comment
    
    * add more flavor in comments
    
    ---------
    
    Co-authored-by: Junhao Liu <[email protected]>
---
 .../src/simplify_expressions/expr_simplifier.rs    | 73 +++++++++++++++++++---
 .../src/simplify_expressions/simplify_exprs.rs     | 55 ++++++++--------
 2 files changed, 91 insertions(+), 37 deletions(-)

diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs 
b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 1c12289491..30140101df 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -56,6 +56,9 @@ pub struct ExprSimplifier<S> {
     /// Guarantees about the values of columns. This is provided by the user
     /// in [ExprSimplifier::with_guarantees()].
     guarantees: Vec<(Expr, NullableInterval)>,
+    /// Should expressions be canonicalized before simplification? Defaults to
+    /// true
+    canonicalize: bool,
 }
 
 pub const THRESHOLD_INLINE_INLIST: usize = 3;
@@ -70,6 +73,7 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
         Self {
             info,
             guarantees: vec![],
+            canonicalize: true,
         }
     }
 
@@ -137,6 +141,12 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
         let mut inlist_simplifier = InListSimplifier::new();
         let mut guarantee_rewriter = GuaranteeRewriter::new(&self.guarantees);
 
+        let expr = if self.canonicalize {
+            expr.rewrite(&mut Canonicalizer::new())?
+        } else {
+            expr
+        };
+
         // TODO iterate until no changes are made during rewrite
         // (evaluating constants can enable new simplifications and
         // simplifications can enable new constant evaluation)
@@ -151,10 +161,6 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
             .rewrite(&mut simplifier)
     }
 
-    pub fn canonicalize(&self, expr: Expr) -> Result<Expr> {
-        let mut canonicalizer = Canonicalizer::new();
-        expr.rewrite(&mut canonicalizer)
-    }
     /// Apply type coercion to an [`Expr`] so that it can be
     /// evaluated as a 
[`PhysicalExpr`](datafusion_physical_expr::PhysicalExpr).
     ///
@@ -229,6 +235,60 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
         self.guarantees = guarantees;
         self
     }
+
+    /// Should [`Canonicalizer`] be applied before simplification?
+    ///
+    /// If true (the default), the expression will be rewritten to canonical
+    /// form before simplification. This is useful to ensure that the 
simplifier
+    /// can apply all possible simplifications.
+    ///
+    /// Some expressions, such as those in some Joins, can not be canonicalized
+    /// without changing their meaning. In these cases, canonicalization should
+    /// be disabled.
+    ///
+    /// ```rust
+    /// use arrow::datatypes::{DataType, Field, Schema};
+    /// use datafusion_expr::{col, lit, Expr};
+    /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval};
+    /// use datafusion_common::{Result, ScalarValue, ToDFSchema};
+    /// use datafusion_physical_expr::execution_props::ExecutionProps;
+    /// use datafusion_optimizer::simplify_expressions::{
+    ///     ExprSimplifier, SimplifyContext};
+    ///
+    /// let schema = Schema::new(vec![
+    ///   Field::new("a", DataType::Int64, false),
+    ///   Field::new("b", DataType::Int64, false),
+    ///   Field::new("c", DataType::Int64, false),
+    ///   ])
+    ///   .to_dfschema_ref().unwrap();
+    ///
+    /// // Create the simplifier
+    /// let props = ExecutionProps::new();
+    /// let context = SimplifyContext::new(&props)
+    ///    .with_schema(schema);
+    /// let simplifier = ExprSimplifier::new(context);
+    ///
+    /// // Expression: a = c AND 1 = b
+    /// let expr = col("a").eq(col("c")).and(lit(1).eq(col("b")));
+    ///
+    /// // With canonicalization, the expression is rewritten to canonical form
+    /// // (though it is no simpler in this case):
+    /// let canonical = simplifier.simplify(expr.clone()).unwrap();
+    /// // Expression has been rewritten to: (c = a AND b = 1)
+    /// assert_eq!(canonical, col("c").eq(col("a")).and(col("b").eq(lit(1))));
+    ///
+    /// // If canonicalization is disabled, the expression is not changed
+    /// let non_canonicalized = simplifier
+    ///   .with_canonicalize(false)
+    ///   .simplify(expr.clone())
+    ///   .unwrap();
+    ///
+    /// assert_eq!(non_canonicalized, expr);
+    /// ```
+    pub fn with_canonicalize(mut self, canonicalize: bool) -> Self {
+        self.canonicalize = canonicalize;
+        self
+    }
 }
 
 /// Canonicalize any BinaryExprs that are not in canonical form
@@ -236,7 +296,7 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
 /// `<literal> <op> <col>` is rewritten to `<col> <op> <literal>`
 ///
 /// `<col1> <op> <col2>` is rewritten so that the name of `col1` sorts higher
-/// than `col2` (`b > a` would be canonicalized to `a < b`)
+/// than `col2` (`a > b` would be canonicalized to `b < a`)
 struct Canonicalizer {}
 
 impl Canonicalizer {
@@ -2889,8 +2949,7 @@ mod tests {
         let simplifier = ExprSimplifier::new(
             SimplifyContext::new(&execution_props).with_schema(schema),
         );
-        let cano = simplifier.canonicalize(expr)?;
-        simplifier.simplify(cano)
+        simplifier.simplify(expr)
     }
 
     fn simplify(expr: Expr) -> Expr {
diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs 
b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
index d68474dcde..f36cd8f838 100644
--- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
+++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
@@ -85,44 +85,39 @@ impl SimplifyExpressions {
         };
         let info = SimplifyContext::new(execution_props).with_schema(schema);
 
-        let simplifier = ExprSimplifier::new(info);
-
         let new_inputs = plan
             .inputs()
             .iter()
             .map(|input| Self::optimize_internal(input, execution_props))
             .collect::<Result<Vec<_>>>()?;
 
-        let expr = match plan {
-            // Canonicalize step won't reorder expressions in a Join on clause.
-            // The left and right expressions in a Join on clause are not 
commutative,
-            // since the order of the columns must match the order of the 
children.
-            LogicalPlan::Join(_) => {
-                plan.expressions()
-                    .into_iter()
-                    .map(|e| {
-                        // TODO: unify with `rewrite_preserving_name`
-                        let original_name = e.name_for_alias()?;
-                        let new_e = simplifier.simplify(e)?;
-                        new_e.alias_if_changed(original_name)
-                    })
-                    .collect::<Result<Vec<_>>>()?
-            }
-            _ => {
-                plan.expressions()
-                    .into_iter()
-                    .map(|e| {
-                        // TODO: unify with `rewrite_preserving_name`
-                        let original_name = e.name_for_alias()?;
-                        let cano_e = simplifier.canonicalize(e)?;
-                        let new_e = simplifier.simplify(cano_e)?;
-                        new_e.alias_if_changed(original_name)
-                    })
-                    .collect::<Result<Vec<_>>>()?
-            }
+        let simplifier = ExprSimplifier::new(info);
+
+        // The left and right expressions in a Join on clause are not
+        // commutative, for reasons that are not entirely clear. Thus, do not
+        // reorder expressions in Join while simplifying.
+        //
+        // This is likely related to the fact that order of the columns must
+        // match the order of the children. see
+        // https://github.com/apache/arrow-datafusion/pull/8780 for more 
details
+        let simplifier = if let LogicalPlan::Join(_) = plan {
+            simplifier.with_canonicalize(false)
+        } else {
+            simplifier
         };
 
-        plan.with_new_exprs(expr, new_inputs)
+        let exprs = plan
+            .expressions()
+            .into_iter()
+            .map(|e| {
+                // TODO: unify with `rewrite_preserving_name`
+                let original_name = e.name_for_alias()?;
+                let new_e = simplifier.simplify(e)?;
+                new_e.alias_if_changed(original_name)
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        plan.with_new_exprs(exprs, new_inputs)
     }
 }
 

Reply via email to