alamb commented on code in PR #11683: URL: https://github.com/apache/datafusion/pull/11683#discussion_r1709709933
########## datafusion/optimizer/src/common_subexpr_eliminate.rs: ########## @@ -227,69 +227,65 @@ impl CommonSubexprEliminate { .into_iter() .zip(arrays.iter()) .map(|(expr, id_array)| { - let replaced = replace_common_expr( + replace_common_expr( expr, id_array, expr_stats, common_exprs, alias_generator, - )?; - // remember if this expression was actually replaced - transformed |= replaced.transformed; - Ok(replaced.data) + ) }) .collect::<Result<Vec<_>>>() }) .collect::<Result<Vec<_>>>() - .map(|rewritten_exprs_list| { - // propagate back transformed information - Transformed::new_transformed(rewritten_exprs_list, transformed) - }) } - /// Rewrites the expression in `exprs_list` with common sub-expressions - /// replaced with a new column and adds a ProjectionExec on top of `input` - /// which computes any replaced common sub-expressions. + /// Extracts common sub-expressions and rewrites `exprs_list`. /// /// Returns a tuple of: /// 1. The rewritten expressions Review Comment: ```suggestion /// 1. The (potentially) rewritten expressions ``` ########## datafusion/optimizer/src/common_subexpr_eliminate.rs: ########## @@ -688,7 +702,10 @@ impl OptimizerRule for CommonSubexprEliminate { } fn apply_order(&self) -> Option<ApplyOrder> { - Some(ApplyOrder::TopDown) + // This rule handles recursion itself in a `ApplyOrder::TopDown` like manner. Review Comment: 💯 ########## datafusion/optimizer/src/common_subexpr_eliminate.rs: ########## @@ -227,69 +227,65 @@ impl CommonSubexprEliminate { .into_iter() .zip(arrays.iter()) .map(|(expr, id_array)| { - let replaced = replace_common_expr( + replace_common_expr( expr, id_array, expr_stats, common_exprs, alias_generator, - )?; - // remember if this expression was actually replaced - transformed |= replaced.transformed; - Ok(replaced.data) + ) }) .collect::<Result<Vec<_>>>() }) .collect::<Result<Vec<_>>>() - .map(|rewritten_exprs_list| { - // propagate back transformed information - Transformed::new_transformed(rewritten_exprs_list, transformed) - }) } - /// Rewrites the expression in `exprs_list` with common sub-expressions - /// replaced with a new column and adds a ProjectionExec on top of `input` - /// which computes any replaced common sub-expressions. + /// Extracts common sub-expressions and rewrites `exprs_list`. /// /// Returns a tuple of: /// 1. The rewritten expressions - /// 2. A `LogicalPlan::Projection` with input of `input` that computes any - /// common sub-expressions that were used - fn rewrite_expr( + /// 2. An optional tuple that contains the extracted common sub-expressions and the + /// original `exprs_list`. + fn find_common_exprs( Review Comment: This is a very nice refactor / rename -- calling it `find_common_exprs` is 💯 and I think makes it much easier to follow ########## datafusion/optimizer/src/common_subexpr_eliminate.rs: ########## @@ -454,136 +435,169 @@ impl CommonSubexprEliminate { group_expr, aggr_expr, input, - schema: orig_schema, + schema, .. } = aggregate; - // track transformed information - let mut transformed = false; - - let name_perserver = NamePreserver::new_for_projection(); - let saved_names = aggr_expr - .iter() - .map(|expr| name_perserver.save(expr)) - .collect::<Result<Vec<_>>>()?; - - let mut expr_stats = ExprStats::new(); - // rewrite inputs - let (group_found_common, group_arrays) = - self.to_arrays(&group_expr, &mut expr_stats, ExprMask::Normal)?; - let (aggr_found_common, aggr_arrays) = - self.to_arrays(&aggr_expr, &mut expr_stats, ExprMask::Normal)?; - let (new_aggr_expr, new_group_expr, new_input) = - if group_found_common || aggr_found_common { - // rewrite both group exprs and aggr_expr - let rewritten = self.rewrite_expr( - // Must clone as Identifiers use references to original expressions so - // we have to keep the original expressions intact. - vec![group_expr.clone(), aggr_expr.clone()], - vec![group_arrays, aggr_arrays], - unwrap_arc(input), - &expr_stats, - config, - )?; - assert!(rewritten.transformed); - transformed |= rewritten.transformed; - let (mut new_expr, new_input) = rewritten.data; - - // note the reversed pop order. - let new_aggr_expr = pop_expr(&mut new_expr)?; - let new_group_expr = pop_expr(&mut new_expr)?; - - (new_aggr_expr, new_group_expr, Arc::new(new_input)) - } else { - (aggr_expr, group_expr, input) - }; - - // create potential projection on top - let mut expr_stats = ExprStats::new(); - let (aggr_found_common, aggr_arrays) = self.to_arrays( - &new_aggr_expr, - &mut expr_stats, - ExprMask::NormalAndAggregates, - )?; - if aggr_found_common { - let mut common_exprs = CommonExprs::new(); - let mut rewritten_exprs = self.rewrite_exprs_list( - // Must clone as Identifiers use references to original expressions so we - // have to keep the original expressions intact. - vec![new_aggr_expr.clone()], - vec![aggr_arrays], - &expr_stats, - &mut common_exprs, - &config.alias_generator(), - )?; - assert!(rewritten_exprs.transformed); - let rewritten = pop_expr(&mut rewritten_exprs.data)?; - - assert!(!common_exprs.is_empty()); - let mut agg_exprs = common_exprs - .into_values() - .map(|(expr, expr_alias)| expr.alias(expr_alias)) - .collect::<Vec<_>>(); - - let new_input_schema = Arc::clone(new_input.schema()); - let mut proj_exprs = vec![]; - for expr in &new_group_expr { - extract_expressions(expr, &new_input_schema, &mut proj_exprs)? - } - for (expr_rewritten, expr_orig) in rewritten.into_iter().zip(new_aggr_expr) { - if expr_rewritten == expr_orig { - if let Expr::Alias(Alias { expr, name, .. }) = expr_rewritten { - agg_exprs.push(expr.alias(&name)); - proj_exprs.push(Expr::Column(Column::from_name(name))); - } else { - let expr_alias = config.alias_generator().next(CSE_PREFIX); - let (qualifier, field) = - expr_rewritten.to_field(&new_input_schema)?; - let out_name = qualified_name(qualifier.as_ref(), field.name()); - - agg_exprs.push(expr_rewritten.alias(&expr_alias)); - proj_exprs.push( - Expr::Column(Column::from_name(expr_alias)).alias(out_name), - ); + let input = unwrap_arc(input); + // Extract common sub-expressions from the aggregate and grouping expressions. Review Comment: this new structure is very nice ########## datafusion/optimizer/src/common_subexpr_eliminate.rs: ########## @@ -227,69 +227,65 @@ impl CommonSubexprEliminate { .into_iter() .zip(arrays.iter()) .map(|(expr, id_array)| { - let replaced = replace_common_expr( + replace_common_expr( expr, id_array, expr_stats, common_exprs, alias_generator, - )?; - // remember if this expression was actually replaced - transformed |= replaced.transformed; - Ok(replaced.data) + ) }) .collect::<Result<Vec<_>>>() }) .collect::<Result<Vec<_>>>() - .map(|rewritten_exprs_list| { - // propagate back transformed information - Transformed::new_transformed(rewritten_exprs_list, transformed) - }) } - /// Rewrites the expression in `exprs_list` with common sub-expressions - /// replaced with a new column and adds a ProjectionExec on top of `input` - /// which computes any replaced common sub-expressions. + /// Extracts common sub-expressions and rewrites `exprs_list`. /// /// Returns a tuple of: /// 1. The rewritten expressions - /// 2. A `LogicalPlan::Projection` with input of `input` that computes any - /// common sub-expressions that were used - fn rewrite_expr( + /// 2. An optional tuple that contains the extracted common sub-expressions and the + /// original `exprs_list`. + fn find_common_exprs( &self, exprs_list: Vec<Vec<Expr>>, - arrays_list: Vec<Vec<IdArray>>, - input: LogicalPlan, - expr_stats: &ExprStats, config: &dyn OptimizerConfig, - ) -> Result<Transformed<(Vec<Vec<Expr>>, LogicalPlan)>> { - let mut transformed = false; - let mut common_exprs = CommonExprs::new(); - - let rewrite_exprs = self.rewrite_exprs_list( - exprs_list, - arrays_list, - expr_stats, - &mut common_exprs, - &config.alias_generator(), - )?; - transformed |= rewrite_exprs.transformed; + expr_mask: ExprMask, + ) -> Result<Transformed<(Vec<Vec<Expr>>, FindCommonExprResult)>> { Review Comment: I found reasoning about this return type `(Vec<Vec<Expr>>, FindCommonExprResult)` hard (I couldn't keep track in my head of the three potential fields that were returned) I made a PR here with a proposal to move it into a named `enum`: https://github.com/peter-toth/arrow-datafusion/pull/4# (I can also make that a PR to main if we would prefer to merge this PR as is) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org