alamb commented on a change in pull request #792:
URL: https://github.com/apache/arrow-datafusion/pull/792#discussion_r680194021



##########
File path: datafusion/src/execution/context.rs
##########
@@ -691,6 +693,7 @@ impl Default for ExecutionConfig {
                 Arc::new(SimplifyExpressions::new()),
                 Arc::new(HashBuildProbeOrder::new()),
                 Arc::new(LimitPushDown::new()),
+                // Arc::new(CommonSubexprEliminate::new()),

Review comment:
       I suggest running this CSE immediately after `ConstantFolding` as it may 
simplify expressions in a way that helps other passes (e.g. make join 
conditions clearer, or simplify filter expressions so they can be pushed down)

##########
File path: datafusion/src/optimizer/common_subexpr_eliminate.rs
##########
@@ -0,0 +1,517 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Eliminate common sub-expression.
+
+use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
+
+use arrow::datatypes::DataType;
+
+use crate::error::Result;
+use crate::execution::context::ExecutionProps;
+use crate::logical_plan::{
+    col, DFField, DFSchema, Expr, ExprRewriter, ExpressionVisitor, LogicalPlan,
+    Recursion, RewriteRecursion,
+};
+use crate::optimizer::optimizer::OptimizerRule;
+
+/// A map from expression's identifier to tuple including
+/// - the expression itself (cloned)
+/// - counter
+/// - DataType of this expression.
+type ExprSet = HashMap<Identifier, (Expr, usize, DataType)>;
+
+/// Identifier type. Current implementation use describe of a expression (type 
String) as
+/// Identifier.
+///
+/// A Identifier should (ideally) be able to "hash", "accumulate", "equal" and 
"have no
+/// collision (as low as possible)"
+///
+/// Since a identifier is likely to be copied many times, it is better that a 
identifier
+/// is small or "copy". otherwise some kinds of reference count is needed. 
String description
+/// here is not such a good choose.
+type Identifier = String;
+/// Perform Common Sub-expression Elimination optimization.
+///
+/// Currently only common sub-expressions within one logical plan will
+/// be eliminated.
+pub struct CommonSubexprEliminate {}
+
+impl OptimizerRule for CommonSubexprEliminate {
+    fn optimize(
+        &self,
+        plan: &LogicalPlan,
+        execution_props: &ExecutionProps,
+    ) -> Result<LogicalPlan> {
+        optimize(plan, execution_props)
+    }
+
+    fn name(&self) -> &str {
+        "common_sub_expression_eliminate"
+    }
+}
+
+impl CommonSubexprEliminate {
+    #[allow(missing_docs)]
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+fn optimize(plan: &LogicalPlan, execution_props: &ExecutionProps) -> 
Result<LogicalPlan> {
+    let mut expr_set = ExprSet::new();
+    let mut affected_id = HashSet::new();
+
+    match plan {
+        LogicalPlan::Projection {
+            expr,
+            input,
+            schema: _,
+        } => {
+            let mut arrays = vec![];
+            for e in expr {
+                let data_type = e.get_type(input.schema())?;
+                let mut id_array = vec![];
+                expr_to_identifier(e, &mut expr_set, &mut id_array, 
data_type)?;
+                arrays.push(id_array);
+            }
+
+            return optimize(input, execution_props);

Review comment:
       I may be missing something but doesn't this basically remove the 
`LogicalPlan::Projection` step (it just returns the optimized input)

##########
File path: datafusion/src/logical_plan/expr.rs
##########
@@ -981,15 +983,25 @@ pub trait ExpressionVisitor: Sized {
     }
 }
 
+/// Controls how the [ExprRewriter] recursion should proceed.

Review comment:
       👍  this is a nice change




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to