alamb commented on code in PR #10333: URL: https://github.com/apache/datafusion/pull/10333#discussion_r1591315724
########## datafusion/optimizer/src/common_subexpr_eliminate.rs: ########## @@ -592,109 +591,51 @@ impl ExprMask { /// Go through an expression tree and generate identifiers for each subexpression. /// -/// An identifier contains information of the expression itself and its sub-expression. -/// This visitor implementation use a stack `visit_stack` to track traversal, which -/// lets us know when a sub-tree's visiting is finished. When `pre_visit` is called -/// (traversing to a new node), an `EnterMark` and an `ExprItem` will be pushed into stack. -/// And try to pop out a `EnterMark` on leaving a node (`f_up()`). All `ExprItem` -/// before the first `EnterMark` is considered to be sub-tree of the leaving node. -/// -/// This visitor also records identifier in `id_array`. Makes the following traverse -/// pass can get the identifier of a node without recalculate it. We assign each node -/// in the expr tree a series number, start from 1, maintained by `series_number`. -/// Series number represents the order we left (`f_up()`) a node. Has the property -/// that child node's series number always smaller than parent's. While `id_array` is -/// organized in the order we enter (`f_down()`) a node. `node_count` helps us to -/// get the index of `id_array` for each node. +/// This visitor also assigns an alias symbol to each expression, to be used if the +/// expression is getting eliminated, and tracks the frequency and datatype of each expression. /// /// `Expr` without sub-expr (column, literal etc.) will not have identifier /// because they should not be recognized as common sub-expr. struct ExprIdentifierVisitor<'a> { - // param + // expr set to be populated by the visitor expr_set: &'a mut ExprSet, /// input schema for the node that we're optimizing, so we can determine the correct datatype /// for each subexpression input_schema: DFSchemaRef, - // inner states - visit_stack: Vec<VisitRecord>, - /// increased in fn_down, start from 0. - node_count: usize, /// which expression should be skipped? expr_mask: ExprMask, } -/// Record item that used when traversing a expression tree. -enum VisitRecord { - /// `usize` is the monotone increasing series number assigned in pre_visit(). - /// Starts from 0. Is used to index the identifier array `id_array` in post_visit(). - EnterMark(usize), - /// the node's children were skipped => jump to f_up on same node - JumpMark(usize), - /// Accumulated identifier of sub expression. - ExprItem(Identifier), -} - -impl ExprIdentifierVisitor<'_> { - /// Find the first `EnterMark` in the stack, and accumulates every `ExprItem` - /// before it. - fn pop_enter_mark(&mut self) -> (usize, Identifier) { - let mut desc = String::new(); - - while let Some(item) = self.visit_stack.pop() { - match item { - VisitRecord::EnterMark(idx) | VisitRecord::JumpMark(idx) => { - return (idx, desc); - } - VisitRecord::ExprItem(id) => { - desc.push_str(&id); - } - } - } - unreachable!("Enter mark should paired with node number"); - } -} - impl TreeNodeVisitor for ExprIdentifierVisitor<'_> { type Node = Expr; fn f_down(&mut self, expr: &Expr) -> Result<TreeNodeRecursion> { // related to https://github.com/apache/datafusion/issues/8814 // If the expr contain volatile expression or is a short-circuit expression, skip it. if expr.short_circuits() || expr.is_volatile()? { - self.visit_stack - .push(VisitRecord::JumpMark(self.node_count)); return Ok(TreeNodeRecursion::Jump); // go to f_up } - self.visit_stack - .push(VisitRecord::EnterMark(self.node_count)); - self.node_count += 1; - Ok(TreeNodeRecursion::Continue) } fn f_up(&mut self, expr: &Expr) -> Result<TreeNodeRecursion> { - let (_idx, sub_expr_identifier) = self.pop_enter_mark(); - // skip exprs should not be recognize. if self.expr_mask.ignores(expr) { - let curr_expr_identifier = ExprSet::expr_identifier(expr); - self.visit_stack - .push(VisitRecord::ExprItem(curr_expr_identifier)); return Ok(TreeNodeRecursion::Continue); } - let curr_expr_identifier = ExprSet::expr_identifier(expr); - let alias_symbol = format!("{curr_expr_identifier}{sub_expr_identifier}"); - self.visit_stack - .push(VisitRecord::ExprItem(alias_symbol.clone())); + let curr_expr_identifier = ExprSet::expr_identifier(expr); let data_type = expr.get_type(&self.input_schema)?; + let alias_symbol = format!("#{{{curr_expr_identifier}}}"); Review Comment: A small nit might be to pull this into a function that can be documented (so the code that creates identifiers is easier to find) Perhaps something like ```rust let alias_symbol = ExprSet::alias_symbol(expr) ``` 🤔 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org