Ted-Jiang commented on code in PR #3903:
URL: https://github.com/apache/arrow-datafusion/pull/3903#discussion_r1006365981


##########
datafusion/optimizer/src/utils.rs:
##########
@@ -99,27 +99,176 @@ fn split_conjunction_impl<'a>(expr: &'a Expr, mut exprs: 
Vec<&'a Expr>) -> Vec<&
 /// assert_eq!(split_conjunction_owned(expr), split);
 /// ```
 pub fn split_conjunction_owned(expr: Expr) -> Vec<Expr> {
-    split_conjunction_owned_impl(expr, vec![])
+    split_binary_owned(expr, Operator::And)
 }
 
-fn split_conjunction_owned_impl(expr: Expr, mut exprs: Vec<Expr>) -> Vec<Expr> 
{
+/// Splits an owned binary operator tree [`Expr`] such as `A <OP> B <OP> C` => 
`[A, B, C]`
+///
+/// This is often used to "split" expressions such as `col1 = 5
+/// AND col2 = 10` into [`col1 = 5`, `col2 = 10`];
+///
+/// # Example
+/// ```
+/// # use datafusion_expr::{col, lit, Operator};
+/// # use datafusion_optimizer::utils::split_binary_owned;
+/// # use std::ops::Add;
+/// // a=1 + b=2
+/// let expr = col("a").eq(lit(1)).add(col("b").eq(lit(2)));
+///
+/// // [a=1, b=2]
+/// let split = vec![
+///   col("a").eq(lit(1)),
+///   col("b").eq(lit(2)),
+/// ];
+///
+/// // use split_binary_owned to split them
+/// assert_eq!(split_binary_owned(expr, Operator::Plus), split);
+/// ```
+pub fn split_binary_owned(expr: Expr, op: Operator) -> Vec<Expr> {
+    split_binary_owned_impl(expr, op, vec![])
+}
+
+fn split_binary_owned_impl(
+    expr: Expr,
+    operator: Operator,
+    mut exprs: Vec<Expr>,
+) -> Vec<Expr> {
     match expr {
-        Expr::BinaryExpr(BinaryExpr {
-            right,
-            op: Operator::And,
-            left,
-        }) => {
-            let exprs = split_conjunction_owned_impl(*left, exprs);
-            split_conjunction_owned_impl(*right, exprs)
+        Expr::BinaryExpr(BinaryExpr { right, op, left }) if op == operator => {
+            let exprs = split_binary_owned_impl(*left, operator, exprs);
+            split_binary_owned_impl(*right, operator, exprs)
         }
-        Expr::Alias(expr, _) => split_conjunction_owned_impl(*expr, exprs),
+        Expr::Alias(expr, _) => split_binary_owned_impl(*expr, operator, 
exprs),
         other => {
             exprs.push(other);
             exprs
         }
     }
 }
 
+/// Splits an binary operator tree [`Expr`] such as `A <OP> B <OP> C` => `[A, 
B, C]`
+///
+/// See [`split_binary_owned`] for more details and an example.
+pub fn split_binary(expr: &Expr, op: Operator) -> Vec<&Expr> {
+    split_binary_impl(expr, op, vec![])
+}
+
+fn split_binary_impl<'a>(
+    expr: &'a Expr,
+    operator: Operator,
+    mut exprs: Vec<&'a Expr>,
+) -> Vec<&'a Expr> {
+    match expr {
+        Expr::BinaryExpr(BinaryExpr { right, op, left }) if *op == operator => 
{
+            let exprs = split_binary_impl(left, operator, exprs);
+            split_binary_impl(right, operator, exprs)
+        }
+        Expr::Alias(expr, _) => split_binary_impl(expr, operator, exprs),
+        other => {
+            exprs.push(other);
+            exprs
+        }
+    }
+}
+
+/// Given a list of lists of [`Expr`]s, returns a list of lists of
+/// [`Expr`]s of expressions where there is one expression from each
+/// from each of the input expressions
+///
+/// For example, given the input `[[a, b], [c], [d, e]]` returns
+/// `[a, c, d], [a, c, e], [b, c, d], [b, c, e]]`.
+fn permutations(mut exprs: VecDeque<Vec<&Expr>>) -> Vec<Vec<&Expr>> {
+    let first = if let Some(first) = exprs.pop_front() {
+        first
+    } else {
+        return vec![];
+    };
+
+    // base case:
+    if exprs.is_empty() {
+        first.into_iter().map(|e| vec![e]).collect()
+    } else {
+        first
+            .into_iter()
+            .flat_map(|expr| {
+                permutations(exprs.clone())
+                    .into_iter()
+                    .map(|expr_list| {
+                        // Create [expr, ...] for each permutation
+                        std::iter::once(expr)
+                            .chain(expr_list.into_iter())
+                            .collect::<Vec<&Expr>>()
+                    })
+                    .collect::<Vec<Vec<&Expr>>>()
+            })
+            .collect()
+    }
+}
+
+const MAX_CNF_REWRITE_CONJUNCTS: usize = 10;
+
+/// Tries to convert an expression to conjunctive normal form (CNF).
+///
+/// Does not convert the expression if the total number of conjuncts
+/// (exprs ANDed together) would exceed [`MAX_CNF_REWRITE_CONJUNCTS`].
+///
+/// The following expression is in CNF:
+///  `(a OR b) AND (c OR d)`
+///
+/// The following is not in CNF:
+///  `(a AND b) OR c`.
+///
+/// But could be rewrite to a CNF expression:
+///  `(a OR c) AND (b OR c)`.
+///
+///
+/// # Example
+/// ```
+/// # use datafusion_expr::{col, lit};
+/// # use datafusion_optimizer::utils::cnf_rewrite;
+/// // (a=1 AND b=2)OR c = 3
+/// let expr1 = col("a").eq(lit(1)).and(col("b").eq(lit(2)));
+/// let expr2 = col("c").eq(lit(3));
+/// let expr = expr1.or(expr2);
+///
+///  //(a=1 or c=3)AND(b=2 or c=3)
+/// let expr1 = col("a").eq(lit(1)).or(col("c").eq(lit(3)));
+/// let expr2 = col("b").eq(lit(2)).or(col("c").eq(lit(3)));
+/// let expect = expr1.and(expr2);
+/// assert_eq!(expect, cnf_rewrite(expr));
+/// ```
+pub fn cnf_rewrite(expr: Expr) -> Expr {
+    // Find all exprs joined by OR
+    let disjuncts = split_binary(&expr, Operator::Or);
+
+    // For each expr, split now on AND
+    // A OR B OR C --> split each A, B and C
+    let disjunct_conjuncts: VecDeque<Vec<&Expr>> = disjuncts
+        .into_iter()
+        .map(|e| split_binary(e, Operator::And))
+        .collect::<VecDeque<_>>();
+
+    // Decide if we want to distribute the clauses. Heuristic is
+    // chosen to avoid creating huge predicates
+    let num_conjuncts = disjunct_conjuncts

Review Comment:
   Nice check!



##########
datafusion/optimizer/src/utils.rs:
##########
@@ -99,27 +99,176 @@ fn split_conjunction_impl<'a>(expr: &'a Expr, mut exprs: 
Vec<&'a Expr>) -> Vec<&
 /// assert_eq!(split_conjunction_owned(expr), split);
 /// ```
 pub fn split_conjunction_owned(expr: Expr) -> Vec<Expr> {
-    split_conjunction_owned_impl(expr, vec![])
+    split_binary_owned(expr, Operator::And)
 }
 
-fn split_conjunction_owned_impl(expr: Expr, mut exprs: Vec<Expr>) -> Vec<Expr> 
{
+/// Splits an owned binary operator tree [`Expr`] such as `A <OP> B <OP> C` => 
`[A, B, C]`
+///
+/// This is often used to "split" expressions such as `col1 = 5
+/// AND col2 = 10` into [`col1 = 5`, `col2 = 10`];
+///
+/// # Example
+/// ```
+/// # use datafusion_expr::{col, lit, Operator};
+/// # use datafusion_optimizer::utils::split_binary_owned;
+/// # use std::ops::Add;
+/// // a=1 + b=2
+/// let expr = col("a").eq(lit(1)).add(col("b").eq(lit(2)));
+///
+/// // [a=1, b=2]
+/// let split = vec![
+///   col("a").eq(lit(1)),
+///   col("b").eq(lit(2)),
+/// ];
+///
+/// // use split_binary_owned to split them
+/// assert_eq!(split_binary_owned(expr, Operator::Plus), split);
+/// ```
+pub fn split_binary_owned(expr: Expr, op: Operator) -> Vec<Expr> {
+    split_binary_owned_impl(expr, op, vec![])
+}
+
+fn split_binary_owned_impl(
+    expr: Expr,
+    operator: Operator,
+    mut exprs: Vec<Expr>,
+) -> Vec<Expr> {
     match expr {
-        Expr::BinaryExpr(BinaryExpr {
-            right,
-            op: Operator::And,
-            left,
-        }) => {
-            let exprs = split_conjunction_owned_impl(*left, exprs);
-            split_conjunction_owned_impl(*right, exprs)
+        Expr::BinaryExpr(BinaryExpr { right, op, left }) if op == operator => {
+            let exprs = split_binary_owned_impl(*left, operator, exprs);
+            split_binary_owned_impl(*right, operator, exprs)
         }
-        Expr::Alias(expr, _) => split_conjunction_owned_impl(*expr, exprs),
+        Expr::Alias(expr, _) => split_binary_owned_impl(*expr, operator, 
exprs),
         other => {
             exprs.push(other);
             exprs
         }
     }
 }
 
+/// Splits an binary operator tree [`Expr`] such as `A <OP> B <OP> C` => `[A, 
B, C]`
+///
+/// See [`split_binary_owned`] for more details and an example.
+pub fn split_binary(expr: &Expr, op: Operator) -> Vec<&Expr> {
+    split_binary_impl(expr, op, vec![])
+}
+
+fn split_binary_impl<'a>(
+    expr: &'a Expr,
+    operator: Operator,
+    mut exprs: Vec<&'a Expr>,
+) -> Vec<&'a Expr> {
+    match expr {
+        Expr::BinaryExpr(BinaryExpr { right, op, left }) if *op == operator => 
{
+            let exprs = split_binary_impl(left, operator, exprs);
+            split_binary_impl(right, operator, exprs)
+        }
+        Expr::Alias(expr, _) => split_binary_impl(expr, operator, exprs),
+        other => {
+            exprs.push(other);
+            exprs
+        }
+    }
+}
+
+/// Given a list of lists of [`Expr`]s, returns a list of lists of
+/// [`Expr`]s of expressions where there is one expression from each
+/// from each of the input expressions
+///
+/// For example, given the input `[[a, b], [c], [d, e]]` returns
+/// `[a, c, d], [a, c, e], [b, c, d], [b, c, e]]`.
+fn permutations(mut exprs: VecDeque<Vec<&Expr>>) -> Vec<Vec<&Expr>> {
+    let first = if let Some(first) = exprs.pop_front() {
+        first
+    } else {
+        return vec![];
+    };
+
+    // base case:
+    if exprs.is_empty() {
+        first.into_iter().map(|e| vec![e]).collect()
+    } else {
+        first
+            .iter()
+            .flat_map(|expr| {
+                permutations(exprs.clone())
+                    .into_iter()
+                    .map(|expr_list| {
+                        // Create [expr, ...] for each permutation
+                        std::iter::once(expr.clone())
+                            .chain(expr_list.into_iter())
+                            .collect::<Vec<&Expr>>()
+                    })
+                    .collect::<Vec<Vec<&Expr>>>()
+            })
+            .collect()
+    }
+}
+
+const MAX_CNF_REWRITE_CONJUNCTS: usize = 10;
+
+/// Tries to convert an expression to conjunctive normal form (CNF).
+///
+/// Does not convert the expression if the total number of conjuncts
+/// (exprs ANDed together) would exceed [`MAX_CNF_REWRITE_CONJUNCTS`].
+///
+/// The following expression is in CNF:
+///  `(a OR b) AND (c OR d)`
+///
+/// The following is not in CNF:
+///  `(a AND b) OR c`.
+///
+/// But could be rewrite to a CNF expression:
+///  `(a OR c) AND (b OR c)`.
+///
+///
+/// # Example
+/// ```
+/// # use datafusion_expr::{col, lit};
+/// # use datafusion_optimizer::utils::cnf_rewrite;
+/// // (a=1 AND b=2)OR c = 3
+/// let expr1 = col("a").eq(lit(1)).and(col("b").eq(lit(2)));
+/// let expr2 = col("c").eq(lit(3));
+/// let expr = expr1.or(expr2);
+///
+///  //(a=1 or c=3)AND(b=2 or c=3)
+/// let expr1 = col("a").eq(lit(1)).or(col("c").eq(lit(3)));
+/// let expr2 = col("b").eq(lit(2)).or(col("c").eq(lit(3)));
+/// let expect = expr1.and(expr2);
+/// assert_eq!(expect, cnf_rewrite(expr));
+/// ```
+pub fn cnf_rewrite(expr: Expr) -> Expr {
+    // Find all exprs joined by OR
+    let disjuncts = split_binary(&expr, Operator::Or);
+
+    // For each expr, split now on AND
+    // A OR B OR C --> split each A, B and C
+    let disjunct_conjuncts: VecDeque<Vec<&Expr>> = disjuncts
+        .into_iter()
+        .map(|e| split_binary(e, Operator::And))
+        .collect::<VecDeque<_>>();
+
+    // Decide if we want to distribute the clauses. Heuristic is
+    // chosen to avoid creating huge predicates
+    let num_conjuncts = disjunct_conjuncts
+        .iter()
+        .fold(1usize, |sz, exprs| sz.saturating_mul(exprs.len()));
+
+    if disjunct_conjuncts.iter().any(|exprs| exprs.len() > 1)
+        && num_conjuncts < MAX_CNF_REWRITE_CONJUNCTS
+    {
+        let or_clauses = permutations(disjunct_conjuncts)
+            .into_iter()
+            // form the OR clauses( A OR B OR C ..)
+            .map(|exprs| disjunction(exprs.into_iter().cloned()).unwrap());
+        conjunction(or_clauses).unwrap()
+    }
+    // otherwise return the original expression
+    else {
+        expr
+    }
+}

Review Comment:
   Yes, without using the rewrite trait seems more clearly 👍



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to