alamb commented on code in PR #9037:
URL: https://github.com/apache/arrow-datafusion/pull/9037#discussion_r1470188018
##########
datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs:
##########
@@ -17,13 +17,177 @@
//! This module implements a rule that simplifies the values for `InList`s
+use std::borrow::Cow;
use std::collections::HashSet;
use datafusion_common::tree_node::TreeNodeRewriter;
-use datafusion_common::Result;
-use datafusion_expr::expr::InList;
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::expr::{InList, InSubquery};
use datafusion_expr::{lit, BinaryExpr, Expr, Operator};
+use super::utils::{is_null, lit_bool_null};
+use super::THRESHOLD_INLINE_INLIST;
+
+pub(super) struct ShortenInListSimplifier {}
+
+impl ShortenInListSimplifier {
+ pub(super) fn new() -> Self {
+ Self {}
+ }
+}
+
+impl TreeNodeRewriter for ShortenInListSimplifier {
+ type N = Expr;
+
+ fn mutate(&mut self, expr: Expr) -> Result<Expr> {
+ // if expr is a single column reference:
+ // expr IN (A, B, ...) --> (expr = A) OR (expr = B) OR (expr = C)
+ if let Expr::InList(InList {
+ expr,
+ list,
+ negated,
+ }) = expr.clone()
+ {
+ if !list.is_empty()
+ && (
+ // For lists with only 1 value we allow more complex
expressions to be simplified
+ // e.g SUBSTR(c1, 2, 3) IN ('1') -> SUBSTR(c1, 2, 3) = '1'
+ // for more than one we avoid repeating this potentially
expensive
+ // expressions
+ list.len() == 1
+ || list.len() <= THRESHOLD_INLINE_INLIST
+ && expr.try_into_col().is_ok()
+ )
+ {
+ let first_val = list[0].clone();
+ if negated {
+ return Ok(list.into_iter().skip(1).fold(
+ (*expr.clone()).not_eq(first_val),
+ |acc, y| {
+ // Note that `A and B and C and D` is a left-deep
tree structure
+ // as such we want to maintain this structure as
much as possible
+ // to avoid reordering the expression during each
optimization
+ // pass.
+ //
+ // Left-deep tree structure for `A and B and C and
D`:
+ // ```
+ // &
+ // / \
+ // & D
+ // / \
+ // & C
+ // / \
+ // A B
+ // ```
+ //
+ // The code below maintain the left-deep tree
structure.
+ acc.and((*expr.clone()).not_eq(y))
+ },
+ ));
+ } else {
+ return Ok(list.into_iter().skip(1).fold(
+ (*expr.clone()).eq(first_val),
+ |acc, y| {
+ // Same reasoning as above
+ acc.or((*expr.clone()).eq(y))
+ },
+ ));
+ }
+ }
+ }
+
+ Ok(expr)
+ }
+}
+
+pub(super) struct InListSimplifier {}
+
+impl InListSimplifier {
+ pub(super) fn new() -> Self {
+ Self {}
+ }
+}
+
+impl TreeNodeRewriter for InListSimplifier {
+ type N = Expr;
+
+ fn mutate(&mut self, expr: Expr) -> Result<Expr> {
Review Comment:
I wonder what the the reason to have multiple different simplifiers? As
written now, each `TreeNodeRewriter` makes a different pass over the expression
tree. I think we could make a single pass.
For example, could we have a single function in this module like `fn
simplify(expr: Expr) -> Result<Expr>```
That was called as part of `Expr::Simplifier::mutate`? I think that could
make the code simpler as well as more performant
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]