alamb commented on code in PR #3903:
URL: https://github.com/apache/arrow-datafusion/pull/3903#discussion_r1006934965


##########
datafusion/optimizer/src/utils.rs:
##########
@@ -99,27 +99,176 @@ fn split_conjunction_impl<'a>(expr: &'a Expr, mut exprs: 
Vec<&'a Expr>) -> Vec<&
 /// assert_eq!(split_conjunction_owned(expr), split);
 /// ```
 pub fn split_conjunction_owned(expr: Expr) -> Vec<Expr> {
-    split_conjunction_owned_impl(expr, vec![])
+    split_binary_owned(expr, Operator::And)
 }
 
-fn split_conjunction_owned_impl(expr: Expr, mut exprs: Vec<Expr>) -> Vec<Expr> 
{
+/// Splits an owned binary operator tree [`Expr`] such as `A <OP> B <OP> C` => 
`[A, B, C]`
+///
+/// This is often used to "split" expressions such as `col1 = 5
+/// AND col2 = 10` into [`col1 = 5`, `col2 = 10`];
+///
+/// # Example
+/// ```
+/// # use datafusion_expr::{col, lit, Operator};
+/// # use datafusion_optimizer::utils::split_binary_owned;
+/// # use std::ops::Add;
+/// // a=1 + b=2
+/// let expr = col("a").eq(lit(1)).add(col("b").eq(lit(2)));
+///
+/// // [a=1, b=2]
+/// let split = vec![
+///   col("a").eq(lit(1)),
+///   col("b").eq(lit(2)),
+/// ];
+///
+/// // use split_binary_owned to split them
+/// assert_eq!(split_binary_owned(expr, Operator::Plus), split);
+/// ```
+pub fn split_binary_owned(expr: Expr, op: Operator) -> Vec<Expr> {
+    split_binary_owned_impl(expr, op, vec![])
+}
+
+fn split_binary_owned_impl(
+    expr: Expr,
+    operator: Operator,
+    mut exprs: Vec<Expr>,
+) -> Vec<Expr> {
     match expr {
-        Expr::BinaryExpr(BinaryExpr {
-            right,
-            op: Operator::And,
-            left,
-        }) => {
-            let exprs = split_conjunction_owned_impl(*left, exprs);
-            split_conjunction_owned_impl(*right, exprs)
+        Expr::BinaryExpr(BinaryExpr { right, op, left }) if op == operator => {
+            let exprs = split_binary_owned_impl(*left, operator, exprs);
+            split_binary_owned_impl(*right, operator, exprs)
         }
-        Expr::Alias(expr, _) => split_conjunction_owned_impl(*expr, exprs),
+        Expr::Alias(expr, _) => split_binary_owned_impl(*expr, operator, 
exprs),
         other => {
             exprs.push(other);
             exprs
         }
     }
 }
 
+/// Splits an binary operator tree [`Expr`] such as `A <OP> B <OP> C` => `[A, 
B, C]`
+///
+/// See [`split_binary_owned`] for more details and an example.
+pub fn split_binary(expr: &Expr, op: Operator) -> Vec<&Expr> {
+    split_binary_impl(expr, op, vec![])
+}
+
+fn split_binary_impl<'a>(
+    expr: &'a Expr,
+    operator: Operator,
+    mut exprs: Vec<&'a Expr>,
+) -> Vec<&'a Expr> {
+    match expr {
+        Expr::BinaryExpr(BinaryExpr { right, op, left }) if *op == operator => 
{
+            let exprs = split_binary_impl(left, operator, exprs);
+            split_binary_impl(right, operator, exprs)
+        }
+        Expr::Alias(expr, _) => split_binary_impl(expr, operator, exprs),
+        other => {
+            exprs.push(other);
+            exprs
+        }
+    }
+}
+
+/// Given a list of lists of [`Expr`]s, returns a list of lists of
+/// [`Expr`]s of expressions where there is one expression from each
+/// from each of the input expressions
+///
+/// For example, given the input `[[a, b], [c], [d, e]]` returns
+/// `[a, c, d], [a, c, e], [b, c, d], [b, c, e]]`.
+fn permutations(mut exprs: VecDeque<Vec<&Expr>>) -> Vec<Vec<&Expr>> {
+    let first = if let Some(first) = exprs.pop_front() {
+        first
+    } else {
+        return vec![];
+    };
+
+    // base case:
+    if exprs.is_empty() {
+        first.into_iter().map(|e| vec![e]).collect()
+    } else {
+        first
+            .into_iter()
+            .flat_map(|expr| {
+                permutations(exprs.clone())
+                    .into_iter()
+                    .map(|expr_list| {
+                        // Create [expr, ...] for each permutation
+                        std::iter::once(expr)
+                            .chain(expr_list.into_iter())
+                            .collect::<Vec<&Expr>>()
+                    })
+                    .collect::<Vec<Vec<&Expr>>>()
+            })
+            .collect()
+    }
+}
+
+const MAX_CNF_REWRITE_CONJUNCTS: usize = 10;
+
+/// Tries to convert an expression to conjunctive normal form (CNF).
+///
+/// Does not convert the expression if the total number of conjuncts
+/// (exprs ANDed together) would exceed [`MAX_CNF_REWRITE_CONJUNCTS`].
+///
+/// The following expression is in CNF:
+///  `(a OR b) AND (c OR d)`
+///
+/// The following is not in CNF:
+///  `(a AND b) OR c`.
+///
+/// But could be rewrite to a CNF expression:
+///  `(a OR c) AND (b OR c)`.
+///
+///
+/// # Example
+/// ```
+/// # use datafusion_expr::{col, lit};
+/// # use datafusion_optimizer::utils::cnf_rewrite;
+/// // (a=1 AND b=2)OR c = 3
+/// let expr1 = col("a").eq(lit(1)).and(col("b").eq(lit(2)));
+/// let expr2 = col("c").eq(lit(3));
+/// let expr = expr1.or(expr2);
+///
+///  //(a=1 or c=3)AND(b=2 or c=3)
+/// let expr1 = col("a").eq(lit(1)).or(col("c").eq(lit(3)));
+/// let expr2 = col("b").eq(lit(2)).or(col("c").eq(lit(3)));
+/// let expect = expr1.and(expr2);
+/// assert_eq!(expect, cnf_rewrite(expr));
+/// ```
+pub fn cnf_rewrite(expr: Expr) -> Expr {
+    // Find all exprs joined by OR
+    let disjuncts = split_binary(&expr, Operator::Or);
+
+    // For each expr, split now on AND
+    // A OR B OR C --> split each A, B and C
+    let disjunct_conjuncts: VecDeque<Vec<&Expr>> = disjuncts
+        .into_iter()
+        .map(|e| split_binary(e, Operator::And))
+        .collect::<VecDeque<_>>();
+
+    // Decide if we want to distribute the clauses. Heuristic is
+    // chosen to avoid creating huge predicates
+    let num_conjuncts = disjunct_conjuncts
+        .iter()
+        .fold(1usize, |sz, exprs| sz.saturating_mul(exprs.len()));
+
+    if disjunct_conjuncts.iter().any(|exprs| exprs.len() > 1)
+        && num_conjuncts < MAX_CNF_REWRITE_CONJUNCTS
+    {
+        let or_clauses = permutations(disjunct_conjuncts)
+            .into_iter()
+            // form the OR clauses( A OR B OR C ..)
+            .map(|exprs| disjunction(exprs.into_iter().cloned()).unwrap());

Review Comment:
   Yes that is my analysis too -- this code will clone exprs only if it 
rewrites the expression -- if not, it will not clone anything



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to