This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 61c38b711 Change public simplify API and add a public coerce API 
(#3758)
61c38b711 is described below

commit 61c38b7114e802f9f289bf5364a031395f5799a6
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Oct 12 06:54:34 2022 -0400

    Change public simplify API and add a public coerce API (#3758)
    
    * Public coerce and simplify API
    
    * Add ticket reference
---
 datafusion/core/src/logical_plan/mod.rs          |   2 +-
 datafusion/core/tests/simplification.rs          |   8 +-
 datafusion/expr/src/logical_plan/plan.rs         |   2 +-
 datafusion/expr/src/type_coercion/functions.rs   |   2 +-
 datafusion/optimizer/src/expr_simplifier.rs      | 213 +++++++++++++++++++++--
 datafusion/optimizer/src/simplify_expressions.rs | 115 ++----------
 datafusion/optimizer/src/type_coercion.rs        |   2 +-
 7 files changed, 222 insertions(+), 122 deletions(-)

diff --git a/datafusion/core/src/logical_plan/mod.rs 
b/datafusion/core/src/logical_plan/mod.rs
index 4aabd5c3a..e09f39ac1 100644
--- a/datafusion/core/src/logical_plan/mod.rs
+++ b/datafusion/core/src/logical_plan/mod.rs
@@ -55,4 +55,4 @@ pub use datafusion_expr::{
     to_timestamp_micros, to_timestamp_millis, to_timestamp_seconds, translate, 
trim,
     trunc, unalias, upper, when, Expr, ExprSchemable, Literal, Operator,
 };
-pub use datafusion_optimizer::expr_simplifier::{ExprSimplifiable, 
SimplifyInfo};
+pub use datafusion_optimizer::expr_simplifier::SimplifyInfo;
diff --git a/datafusion/core/tests/simplification.rs 
b/datafusion/core/tests/simplification.rs
index fe5f5e254..9cccd1aab 100644
--- a/datafusion/core/tests/simplification.rs
+++ b/datafusion/core/tests/simplification.rs
@@ -19,13 +19,13 @@
 
 use arrow::datatypes::{DataType, Field, Schema};
 use datafusion::logical_plan::ExprSchemable;
-use datafusion::logical_plan::ExprSimplifiable;
 use datafusion::{
     error::Result,
     execution::context::ExecutionProps,
     logical_plan::{DFSchema, Expr, SimplifyInfo},
     prelude::*,
 };
+use datafusion_optimizer::expr_simplifier::ExprSimplifier;
 
 /// In order to simplify expressions, DataFusion must have information
 /// about the expressions.
@@ -90,7 +90,8 @@ fn basic() {
     // optimize form `a < 5` automatically
     let expr = col("a").lt(lit(2i32) + lit(3i32));
 
-    let simplified = expr.simplify(&info).unwrap();
+    let simplifier = ExprSimplifier::new(info);
+    let simplified = simplifier.simplify(expr).unwrap();
     assert_eq!(simplified, col("a").lt(lit(5i32)));
 }
 
@@ -103,6 +104,7 @@ fn fold_and_simplify() {
 
     // Since datafusion applies both simplification *and* rewriting
     // some expressions can be entirely simplified
-    let simplified = expr.simplify(&info).unwrap();
+    let simplifier = ExprSimplifier::new(info);
+    let simplified = simplifier.simplify(expr).unwrap();
     assert_eq!(simplified, lit(true))
 }
diff --git a/datafusion/expr/src/logical_plan/plan.rs 
b/datafusion/expr/src/logical_plan/plan.rs
index fd8280217..738e842a2 100644
--- a/datafusion/expr/src/logical_plan/plan.rs
+++ b/datafusion/expr/src/logical_plan/plan.rs
@@ -1359,7 +1359,7 @@ impl Aggregate {
     /// building the schema again when the schema is already known.
     ///
     /// This method should only be called when you are absolutely sure that 
the schema being
-    /// provided is correct for the aggregate. If in doubt, call [try_new] 
instead.
+    /// provided is correct for the aggregate. If in doubt, call 
[try_new](Self::try_new) instead.
     pub fn try_new_with_schema(
         input: Arc<LogicalPlan>,
         group_expr: Vec<Expr>,
diff --git a/datafusion/expr/src/type_coercion/functions.rs 
b/datafusion/expr/src/type_coercion/functions.rs
index 667451a5f..c20a93b99 100644
--- a/datafusion/expr/src/type_coercion/functions.rs
+++ b/datafusion/expr/src/type_coercion/functions.rs
@@ -26,7 +26,7 @@ use datafusion_common::{DataFusionError, Result};
 /// each argument must be coerced to match `signature`.
 ///
 /// For more details on coercion in general, please see the
-/// [`type_coercion`](datafusion::expr::type_coercion) module.
+/// [`type_coercion`](datafusion_expr::type_coercion) module.
 pub fn data_types(
     current_types: &[DataType],
     signature: &Signature,
diff --git a/datafusion/optimizer/src/expr_simplifier.rs 
b/datafusion/optimizer/src/expr_simplifier.rs
index 7cf5f02c6..b75e86081 100644
--- a/datafusion/optimizer/src/expr_simplifier.rs
+++ b/datafusion/optimizer/src/expr_simplifier.rs
@@ -15,17 +15,24 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Expression simplifier
+//! Expression simplification API
 
-use crate::simplify_expressions::{ConstEvaluator, Simplifier};
-use datafusion_common::Result;
-use datafusion_expr::{expr_rewriter::ExprRewritable, Expr};
+use crate::{
+    simplify_expressions::{ConstEvaluator, Simplifier},
+    type_coercion::TypeCoercionRewriter,
+};
+use arrow::datatypes::DataType;
+use datafusion_common::{DFSchemaRef, DataFusionError, Result};
+use datafusion_expr::{expr_rewriter::ExprRewritable, Expr, ExprSchemable};
 use datafusion_physical_expr::execution_props::ExecutionProps;
 
 #[allow(rustdoc::private_intra_doc_links)]
 /// The information necessary to apply algebraic simplification to an
-/// [Expr]. See 
[SimplifyContext](crate::optimizer::simplify_expressions::SimplifyContext)
-/// for one implementation
+/// [Expr]. See [SimplifyContext] for one concrete implementation.
+///
+/// This trait exists so that other systems can plug schema
+/// information in without having to create `DFSchema` objects. If you
+/// have a [`DFSchemaRef`] you can use [`SimplifyContext`]
 pub trait SimplifyInfo {
     /// returns true if this Expr has boolean type
     fn is_boolean_type(&self, expr: &Expr) -> Result<bool>;
@@ -37,28 +44,41 @@ pub trait SimplifyInfo {
     fn execution_props(&self) -> &ExecutionProps;
 }
 
-/// trait for types that can be simplified
-pub trait ExprSimplifiable: Sized {
-    /// simplify this trait object using the given SimplifyInfo
-    fn simplify<S: SimplifyInfo>(self, info: &S) -> Result<Self>;
+/// This structure handles API for expression simplification
+pub struct ExprSimplifier<S> {
+    info: S,
 }
 
-impl ExprSimplifiable for Expr {
+impl<S: SimplifyInfo> ExprSimplifier<S> {
+    /// Create a new `ExprSimplifier` with the given `info`. See
+    /// [`simplify`](Self::simplify) for an example.
+    pub fn new(info: S) -> Self {
+        Self { info }
+    }
+
     /// Simplifies this [`Expr`]`s as much as possible, evaluating
-    /// constants and applying algebraic simplifications
+    /// constants and applying algebraic simplifications.
+    ///
+    /// The types of the expression must match what operators expect,
+    /// or else an error may occur trying to evaluate. See
+    /// [`coerce`](Self::coerce) for a function to help.
     ///
     /// # Example:
+    ///
     /// `b > 2 AND b > 2`
+    ///
     /// can be written to
+    ///
     /// `b > 2`
     ///
     /// ```
     /// use datafusion_expr::{col, lit, Expr};
     /// use datafusion_common::Result;
     /// use datafusion_physical_expr::execution_props::ExecutionProps;
-    /// use datafusion_optimizer::expr_simplifier::{SimplifyInfo, 
ExprSimplifiable};
+    /// use datafusion_optimizer::expr_simplifier::{ExprSimplifier, 
SimplifyInfo};
     ///
     /// /// Simple implementation that provides `Simplifier` the information 
it needs
+    /// /// See SimplifyContext for a structure that does this.
     /// #[derive(Default)]
     /// struct Info {
     ///   execution_props: ExecutionProps,
@@ -76,6 +96,9 @@ impl ExprSimplifiable for Expr {
     ///   }
     /// }
     ///
+    /// // Create the simplifier
+    /// let simplifier = ExprSimplifier::new(Info::default());
+    ///
     /// // b < 2
     /// let b_lt_2 = col("b").gt(lit(2));
     ///
@@ -83,17 +106,171 @@ impl ExprSimplifiable for Expr {
     /// let expr = b_lt_2.clone().or(b_lt_2.clone());
     ///
     /// // (b < 2) OR (b < 2) --> (b < 2)
-    /// let expr = expr.simplify(&Info::default()).unwrap();
+    /// let expr = simplifier.simplify(expr).unwrap();
     /// assert_eq!(expr, b_lt_2);
     /// ```
-    fn simplify<S: SimplifyInfo>(self, info: &S) -> Result<Self> {
-        let mut rewriter = Simplifier::new(info);
-        let mut const_evaluator = 
ConstEvaluator::try_new(info.execution_props())?;
+    pub fn simplify(&self, expr: Expr) -> Result<Expr> {
+        let mut rewriter = Simplifier::new(&self.info);
+        let mut const_evaluator = 
ConstEvaluator::try_new(self.info.execution_props())?;
 
         // TODO iterate until no changes are made during rewrite
         // (evaluating constants can enable new simplifications and
         // simplifications can enable new constant evaluation)
         // https://github.com/apache/arrow-datafusion/issues/1160
-        self.rewrite(&mut const_evaluator)?.rewrite(&mut rewriter)
+        expr.rewrite(&mut const_evaluator)?.rewrite(&mut rewriter)
+    }
+
+    /// Apply type coercion to an [`Expr`] so that it can be
+    /// evaluated as a 
[`PhysicalExpr`](datafusion_physical_expr::PhysicalExpr).
+    ///
+    /// See the [type coercion module](datafusion_expr::type_coercion)
+    /// documentation for more details on type coercion
+    ///
+    // Would be nice if this API could use the SimplifyInfo
+    // rather than creating an DFSchemaRef coerces rather than doing
+    // it manually.
+    // https://github.com/apache/arrow-datafusion/issues/3793
+    pub fn coerce(&self, expr: Expr, schema: DFSchemaRef) -> Result<Expr> {
+        let mut expr_rewrite = TypeCoercionRewriter { schema };
+
+        expr.rewrite(&mut expr_rewrite)
+    }
+}
+
+/// Provides simplification information based on DFSchema and
+/// [`ExecutionProps`]. This is the default implementation used by DataFusion
+///
+/// For example:
+/// ```
+/// use arrow::datatypes::{Schema, Field, DataType};
+/// use datafusion_expr::{col, lit};
+/// use datafusion_common::{DataFusionError, ToDFSchema};
+/// use datafusion_physical_expr::execution_props::ExecutionProps;
+/// use datafusion_optimizer::expr_simplifier::{SimplifyContext, 
ExprSimplifier};
+///
+/// // Create the schema
+/// let schema = Schema::new(vec![
+///     Field::new("i", DataType::Int64, false),
+///   ])
+///   .to_dfschema_ref().unwrap();
+///
+/// // Create the simplifier
+/// let props = ExecutionProps::new();
+/// let context = SimplifyContext::new(&props)
+///    .with_schema(schema);
+/// let simplifier = ExprSimplifier::new(context);
+///
+/// // Use the simplifier
+///
+/// // b < 2 or (1 > 3)
+/// let expr = col("b").lt(lit(2)).or(lit(1).gt(lit(3)));
+///
+/// // b < 2
+/// let simplified = simplifier.simplify(expr).unwrap();
+/// assert_eq!(simplified, col("b").lt(lit(2)));
+/// ```
+pub struct SimplifyContext<'a> {
+    schemas: Vec<DFSchemaRef>,
+    props: &'a ExecutionProps,
+}
+
+impl<'a> SimplifyContext<'a> {
+    /// Create a new SimplifyContext
+    pub fn new(props: &'a ExecutionProps) -> Self {
+        Self {
+            schemas: vec![],
+            props,
+        }
+    }
+
+    /// Register a [`DFSchemaRef`] with this context
+    pub fn with_schema(mut self, schema: DFSchemaRef) -> Self {
+        self.schemas.push(schema);
+        self
+    }
+}
+
+impl<'a> SimplifyInfo for SimplifyContext<'a> {
+    /// returns true if this Expr has boolean type
+    fn is_boolean_type(&self, expr: &Expr) -> Result<bool> {
+        for schema in &self.schemas {
+            if let Ok(DataType::Boolean) = expr.get_type(schema) {
+                return Ok(true);
+            }
+        }
+
+        Ok(false)
+    }
+    /// Returns true if expr is nullable
+    fn nullable(&self, expr: &Expr) -> Result<bool> {
+        self.schemas
+            .iter()
+            .find_map(|schema| {
+                // expr may be from another input, so ignore errors
+                // by converting to None to keep trying
+                expr.nullable(schema.as_ref()).ok()
+            })
+            .ok_or_else(|| {
+                // This means we weren't able to compute `Expr::nullable` with
+                // *any* input schemas, signalling a problem
+                DataFusionError::Internal(format!(
+                    "Could not find columns in '{}' during simplify",
+                    expr
+                ))
+            })
+    }
+
+    fn execution_props(&self) -> &ExecutionProps {
+        self.props
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::{Field, Schema};
+    use datafusion_common::ToDFSchema;
+    use datafusion_expr::{col, lit};
+
+    #[test]
+    fn api_basic() {
+        let props = ExecutionProps::new();
+        let simplifier =
+            
ExprSimplifier::new(SimplifyContext::new(&props).with_schema(test_schema()));
+
+        let expr = lit(1) + lit(2);
+        let expected = lit(3);
+        assert_eq!(expected, simplifier.simplify(expr).unwrap());
+    }
+
+    #[test]
+    fn basic_coercion() {
+        let schema = test_schema();
+        let props = ExecutionProps::new();
+        let simplifier =
+            
ExprSimplifier::new(SimplifyContext::new(&props).with_schema(schema.clone()));
+
+        // Note expr type is int32 (not int64)
+        // (1i64 + 2i32) < i
+        let expr = (lit(1i64) + lit(2i32)).lt(col("i"));
+        // should fully simplify to 3 < i (though i has been coerced to i64)
+        let expected = lit(3i64).lt(col("i"));
+
+        // Would be nice if this API could use the SimplifyInfo
+        // rather than creating an DFSchemaRef coerces rather than doing
+        // it manually.
+        // https://github.com/apache/arrow-datafusion/issues/3793
+        let expr = simplifier.coerce(expr, schema).unwrap();
+
+        assert_eq!(expected, simplifier.simplify(expr).unwrap());
+    }
+
+    fn test_schema() -> DFSchemaRef {
+        Schema::new(vec![
+            Field::new("i", DataType::Int64, false),
+            Field::new("b", DataType::Boolean, true),
+        ])
+        .to_dfschema_ref()
+        .unwrap()
     }
 }
diff --git a/datafusion/optimizer/src/simplify_expressions.rs 
b/datafusion/optimizer/src/simplify_expressions.rs
index 95c640b2a..233b0e497 100644
--- a/datafusion/optimizer/src/simplify_expressions.rs
+++ b/datafusion/optimizer/src/simplify_expressions.rs
@@ -15,73 +15,25 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Simplify expressions optimizer rule
+//! Simplify expressions optimizer rule and implementation
 
-use crate::expr_simplifier::ExprSimplifiable;
+use crate::expr_simplifier::{ExprSimplifier, SimplifyContext};
 use crate::{expr_simplifier::SimplifyInfo, OptimizerConfig, OptimizerRule};
 use arrow::array::new_null_array;
 use arrow::datatypes::{DataType, Field, Schema};
 use arrow::error::ArrowError;
 use arrow::record_batch::RecordBatch;
-use datafusion_common::{DFSchema, DFSchemaRef, DataFusionError, Result, 
ScalarValue};
+use datafusion_common::{DFSchema, DataFusionError, Result, ScalarValue};
 use datafusion_expr::{
     expr_fn::{and, or},
     expr_rewriter::{ExprRewritable, ExprRewriter, RewriteRecursion},
     lit,
     logical_plan::LogicalPlan,
     utils::from_plan,
-    BuiltinScalarFunction, ColumnarValue, Expr, ExprSchemable, Operator, 
Volatility,
+    BuiltinScalarFunction, ColumnarValue, Expr, Operator, Volatility,
 };
 use datafusion_physical_expr::{create_physical_expr, 
execution_props::ExecutionProps};
 
-/// Provides simplification information based on schema and properties
-pub(crate) struct SimplifyContext<'a, 'b> {
-    schemas: Vec<&'a DFSchemaRef>,
-    props: &'b ExecutionProps,
-}
-
-impl<'a, 'b> SimplifyContext<'a, 'b> {
-    /// Create a new SimplifyContext
-    pub fn new(schemas: Vec<&'a DFSchemaRef>, props: &'b ExecutionProps) -> 
Self {
-        Self { schemas, props }
-    }
-}
-
-impl<'a, 'b> SimplifyInfo for SimplifyContext<'a, 'b> {
-    /// returns true if this Expr has boolean type
-    fn is_boolean_type(&self, expr: &Expr) -> Result<bool> {
-        for schema in &self.schemas {
-            if let Ok(DataType::Boolean) = expr.get_type(schema) {
-                return Ok(true);
-            }
-        }
-
-        Ok(false)
-    }
-    /// Returns true if expr is nullable
-    fn nullable(&self, expr: &Expr) -> Result<bool> {
-        self.schemas
-            .iter()
-            .find_map(|schema| {
-                // expr may be from another input, so ignore errors
-                // by converting to None to keep trying
-                expr.nullable(schema.as_ref()).ok()
-            })
-            .ok_or_else(|| {
-                // This means we weren't able to compute `Expr::nullable` with
-                // *any* input schemas, signalling a problem
-                DataFusionError::Internal(format!(
-                    "Could not find columns in '{}' during simplify",
-                    expr
-                ))
-            })
-    }
-
-    fn execution_props(&self) -> &ExecutionProps {
-        self.props
-    }
-}
-
 /// Optimizer Pass that simplifies [`LogicalPlan`]s by rewriting
 /// [`Expr`]`s evaluating constants and applying algebraic
 /// simplifications
@@ -292,7 +244,14 @@ impl SimplifyExpressions {
         // projected columns. With just the projected schema, it's not 
possible to infer types for
         // expressions that references non-projected columns within the same 
project plan or its
         // children plans.
-        let info = SimplifyContext::new(plan.all_schemas(), execution_props);
+        let info = plan
+            .all_schemas()
+            .into_iter()
+            .fold(SimplifyContext::new(execution_props), |context, schema| {
+                context.with_schema(schema.clone())
+            });
+
+        let simplifier = ExprSimplifier::new(info);
 
         let new_inputs = plan
             .inputs()
@@ -309,7 +268,7 @@ impl SimplifyExpressions {
                 let name = &e.name();
 
                 // Apply the actual simplification logic
-                let new_e = e.simplify(&info)?;
+                let new_e = simplifier.simplify(e)?;
 
                 let new_name = &new_e.name();
 
@@ -991,30 +950,12 @@ macro_rules! assert_contains {
     };
 }
 
-/// Apply simplification and constant propagation to ([Expr]).
-///
-/// # Arguments
-///
-/// * `expr` - The logical expression
-/// * `schema` - The DataFusion schema for the expr, used to resolve `Column` 
references
-///                      to qualified or unqualified fields by name.
-/// * `props` - The Arrow schema for the input, used for determining 
expression data types
-///                    when performing type coercion.
-pub fn simplify_expr(
-    expr: Expr,
-    schema: &DFSchemaRef,
-    props: &ExecutionProps,
-) -> Result<Expr> {
-    let info = SimplifyContext::new(vec![schema], props);
-    expr.simplify(&info)
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
     use arrow::array::{ArrayRef, Int32Array};
     use chrono::{DateTime, TimeZone, Utc};
-    use datafusion_common::{DFField, ToDFSchema};
+    use datafusion_common::{DFField, DFSchemaRef};
     use datafusion_expr::expr::Case;
     use datafusion_expr::logical_plan::table_scan;
     use datafusion_expr::{
@@ -1639,8 +1580,10 @@ mod tests {
     fn simplify(expr: Expr) -> Expr {
         let schema = expr_test_schema();
         let execution_props = ExecutionProps::new();
-        let info = SimplifyContext::new(vec![&schema], &execution_props);
-        expr.simplify(&info).unwrap()
+        let simplifier = ExprSimplifier::new(
+            SimplifyContext::new(&execution_props).with_schema(schema),
+        );
+        simplifier.simplify(expr).unwrap()
     }
 
     fn expr_test_schema() -> DFSchemaRef {
@@ -2634,26 +2577,4 @@ mod tests {
 
         assert_optimized_plan_eq(&plan, expected);
     }
-
-    #[test]
-    fn simplify_expr_api_test() {
-        let schema = Schema::new(vec![Field::new("x", DataType::Int32, false)])
-            .to_dfschema_ref()
-            .unwrap();
-        let props = ExecutionProps::new();
-
-        // x + (1 + 3) -> x + 4
-        {
-            let expr = col("x") + (lit(1) + lit(3));
-            let simplifed_expr = simplify_expr(expr, &schema, &props).unwrap();
-            assert_eq!(simplifed_expr, col("x") + lit(4));
-        }
-
-        // x * 1 -> x
-        {
-            let expr = col("x") * lit(1);
-            let simplifed_expr = simplify_expr(expr, &schema, &props).unwrap();
-            assert_eq!(simplifed_expr, col("x"));
-        }
-    }
 }
diff --git a/datafusion/optimizer/src/type_coercion.rs 
b/datafusion/optimizer/src/type_coercion.rs
index 89d5d660b..be4d70265 100644
--- a/datafusion/optimizer/src/type_coercion.rs
+++ b/datafusion/optimizer/src/type_coercion.rs
@@ -121,7 +121,7 @@ fn optimize_internal(
     from_plan(plan, &new_expr, &new_inputs)
 }
 
-struct TypeCoercionRewriter {
+pub(crate) struct TypeCoercionRewriter {
     pub(crate) schema: DFSchemaRef,
 }
 

Reply via email to