peter-toth commented on code in PR #10396: URL: https://github.com/apache/datafusion/pull/10396#discussion_r1591490350
########## datafusion/optimizer/src/common_subexpr_eliminate.rs: ########## @@ -658,55 +682,89 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> { type Node = Expr; fn f_down(&mut self, expr: &Expr) -> Result<TreeNodeRecursion> { - // related to https://github.com/apache/datafusion/issues/8814 + // related to https://github.com/apache/arrow-datafusion/issues/8814 // If the expr contain volatile expression or is a short-circuit expression, skip it. + // TODO: propagate is_volatile state bottom-up + consider non-volatile sub-expressions for CSE + // TODO: consider surely executed children of "short circuited"s for CSE if expr.short_circuits() || expr.is_volatile()? { - self.visit_stack - .push(VisitRecord::JumpMark(self.node_count)); - return Ok(TreeNodeRecursion::Jump); // go to f_up + self.visit_stack.push(VisitRecord::JumpMark); + + return Ok(TreeNodeRecursion::Jump); } + self.id_array.push((0, "".to_string())); self.visit_stack - .push(VisitRecord::EnterMark(self.node_count)); - self.node_count += 1; + .push(VisitRecord::EnterMark(self.down_index)); + self.down_index += 1; Ok(TreeNodeRecursion::Continue) } fn f_up(&mut self, expr: &Expr) -> Result<TreeNodeRecursion> { - let (_idx, sub_expr_identifier) = self.pop_enter_mark(); - - // skip exprs should not be recognize. - if self.expr_mask.ignores(expr) { - let curr_expr_identifier = ExprSet::expr_identifier(expr); - self.visit_stack - .push(VisitRecord::ExprItem(curr_expr_identifier)); + let Some((down_index, sub_expr_id)) = self.pop_enter_mark() else { return Ok(TreeNodeRecursion::Continue); - } - let curr_expr_identifier = ExprSet::expr_identifier(expr); - let alias_symbol = format!("{curr_expr_identifier}{sub_expr_identifier}"); + }; - self.visit_stack - .push(VisitRecord::ExprItem(alias_symbol.clone())); + let expr_id = expr_identifier(expr, sub_expr_id); + + self.id_array[down_index].0 = self.up_index; + if !self.expr_mask.ignores(expr) { + self.id_array[down_index].1.clone_from(&expr_id); - let data_type = expr.get_type(&self.input_schema)?; + // TODO: can we capture the data type in the second traversal only for + // replaced expressions? + let data_type = expr.get_type(&self.input_schema)?; + let (count, _) = self + .expr_stats + .entry(expr_id.clone()) + .or_insert((0, data_type)); + *count += 1; + } + self.visit_stack.push(VisitRecord::ExprItem(expr_id)); + self.up_index += 1; - self.expr_set - .entry(curr_expr_identifier) - .or_insert_with(|| (expr.clone(), 0, data_type, alias_symbol)) Review Comment: this clone is also eliminated -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org