[arrow-datafusion] branch master updated: Pushdown single column predicates from ON join clauses (#3578)

dheres Sat, 15 Oct 2022 06:47:09 -0700

This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git



The following commit(s) were added to refs/heads/master by this push:
     new e02376ddc Pushdown  single column predicates from ON join clauses (#3578)
e02376ddc is described below

commit e02376ddc431a818e1f19a5bb16fe45307a512e8
Author: AssHero <[email protected]>
AuthorDate: Sat Oct 15 21:46:53 2022 +0800

    Pushdown  single column predicates from ON join clauses (#3578)
    
    * extract OR clause for join
    
    * add more comments
    
    * add some comments
    
    * Update TPCH plans
    
    * Update test plan
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 benchmarks/expected-plans/q19.txt            |   6 +-
 benchmarks/expected-plans/q7.txt             |   8 +-
 benchmarks/src/bin/tpch.rs                   |   5 +-
 datafusion/core/tests/sql/joins.rs           |   3 +-
 datafusion/core/tests/sql/predicates.rs      |   8 +
 datafusion/optimizer/src/filter_push_down.rs | 223 ++++++++++++++++++++++++++-
 6 files changed, 243 insertions(+), 10 deletions(-)

diff --git a/benchmarks/expected-plans/q19.txt b/benchmarks/expected-plans/q19.txt
index 902893ea9..cbf4f08b3 100644
--- a/benchmarks/expected-plans/q19.txt
+++ b/benchmarks/expected-plans/q19.txt
@@ -3,7 +3,7 @@ Projection: SUM(lineitem.l_extendedprice * Int64(1) - 
lineitem.l_discount) AS re
     Projection: lineitem.l_shipinstruct = Utf8("DELIVER IN PERSON") AS 
lineitem.l_shipinstruct = Utf8("DELIVER IN PERSON")Utf8("DELIVER IN 
PERSON")lineitem.l_shipinstruct, lineitem.l_shipmode IN ([Utf8("AIR"), 
Utf8("AIR REG")]) AS lineitem.l_shipmode IN ([Utf8("AIR"), Utf8("AIR 
REG")])Utf8("AIR REG")Utf8("AIR")lineitem.l_shipmode, part.p_size >= Int32(1) 
AS part.p_size >= Int32(1)Int32(1)part.p_size, lineitem.l_quantity, 
lineitem.l_extendedprice, lineitem.l_discount, part.p_brand, part.p [...]
       Filter: part.p_brand = Utf8("Brand#12") AND part.p_container IN 
([Utf8("SM CASE"), Utf8("SM BOX"), Utf8("SM PACK"), Utf8("SM PKG")]) AND 
lineitem.l_quantity >= Decimal128(Some(100),15,2) AND lineitem.l_quantity <= 
Decimal128(Some(1100),15,2) AND part.p_size <= Int32(5) OR part.p_brand = 
Utf8("Brand#23") AND part.p_container IN ([Utf8("MED BAG"), Utf8("MED BOX"), 
Utf8("MED PKG"), Utf8("MED PACK")]) AND lineitem.l_quantity >= 
Decimal128(Some(1000),15,2) AND lineitem.l_quantity <= Dec [...]
         Inner Join: lineitem.l_partkey = part.p_partkey
-          Filter: lineitem.l_shipmode IN ([Utf8("AIR"), Utf8("AIR REG")]) AND 
lineitem.l_shipinstruct = Utf8("DELIVER IN PERSON")
+          Filter: lineitem.l_shipmode IN ([Utf8("AIR"), Utf8("AIR REG")]) AND 
lineitem.l_shipinstruct = Utf8("DELIVER IN PERSON") AND lineitem.l_quantity >= 
Decimal128(Some(100),15,2) AND lineitem.l_quantity <= 
Decimal128(Some(1100),15,2) OR lineitem.l_quantity >= 
Decimal128(Some(1000),15,2) AND lineitem.l_quantity <= 
Decimal128(Some(2000),15,2) OR lineitem.l_quantity >= 
Decimal128(Some(2000),15,2) AND lineitem.l_quantity <= 
Decimal128(Some(3000),15,2)
             TableScan: lineitem projection=[l_partkey, l_quantity, 
l_extendedprice, l_discount, l_shipinstruct, l_shipmode]
-          Filter: part.p_size >= Int32(1)
-            TableScan: part projection=[p_partkey, p_brand, p_size, 
p_container]
\ No newline at end of file
+          Filter: part.p_size >= Int32(1) AND part.p_brand = Utf8("Brand#12") 
AND part.p_container IN ([Utf8("SM CASE"), Utf8("SM BOX"), Utf8("SM PACK"), 
Utf8("SM PKG")]) AND part.p_size <= Int32(5) OR part.p_brand = Utf8("Brand#23") 
AND part.p_container IN ([Utf8("MED BAG"), Utf8("MED BOX"), Utf8("MED PKG"), 
Utf8("MED PACK")]) AND part.p_size <= Int32(10) OR part.p_brand = 
Utf8("Brand#34") AND part.p_container IN ([Utf8("LG CASE"), Utf8("LG BOX"), 
Utf8("LG PACK"), Utf8("LG PKG")]) AND p [...]
+            TableScan: part projection=[p_partkey, p_brand, p_size, 
p_container]
diff --git a/benchmarks/expected-plans/q7.txt b/benchmarks/expected-plans/q7.txt
index 4a2866a42..a1d1806f9 100644
--- a/benchmarks/expected-plans/q7.txt
+++ b/benchmarks/expected-plans/q7.txt
@@ -14,7 +14,9 @@ Sort: shipping.supp_nation ASC NULLS LAST, 
shipping.cust_nation ASC NULLS LAST,
                         TableScan: lineitem projection=[l_orderkey, l_suppkey, 
l_extendedprice, l_discount, l_shipdate]
                     TableScan: orders projection=[o_orderkey, o_custkey]
                   TableScan: customer projection=[c_custkey, c_nationkey]
-                SubqueryAlias: n1
+                Filter: n1.n_name = Utf8("FRANCE") OR n1.n_name = 
Utf8("GERMANY")
+                  SubqueryAlias: n1
+                    TableScan: nation projection=[n_nationkey, n_name]
+              Filter: n2.n_name = Utf8("GERMANY") OR n2.n_name = Utf8("FRANCE")
+                SubqueryAlias: n2
                   TableScan: nation projection=[n_nationkey, n_name]
-              SubqueryAlias: n2
-                TableScan: nation projection=[n_nationkey, n_name]
\ No newline at end of file
diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs
index 7930cb73c..d1aa2c8eb 100644
--- a/benchmarks/src/bin/tpch.rs
+++ b/benchmarks/src/bin/tpch.rs
@@ -792,7 +792,10 @@ mod tests {
         for path in &possibilities {
             let path = Path::new(&path);
             if let Ok(expected) = read_text_file(path) {
-                assert_eq!(expected, actual);
+                assert_eq!(expected, actual,
+                           // generate output that is easier to copy/paste/update
+                           "\n\nMismatch of expected content in: {:?}\nExpected:\n\n{}\n\nActual:\n\n{}\n\n",
+                           path, expected, actual);
                 found = true;
                 break;
             }
diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs
index b5b59b1b6..2ff4947b3 100644
--- a/datafusion/core/tests/sql/joins.rs
+++ b/datafusion/core/tests/sql/joins.rs
@@ -1474,7 +1474,8 @@ async fn reduce_left_join_2() -> Result<()> {
         "    Filter: CAST(t2.t2_int AS Int64) < Int64(10) OR CAST(t1.t1_int AS 
Int64) > Int64(2) AND t2.t2_name != Utf8(\"w\") [t1_id:UInt32;N, 
t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, 
t2_int:UInt32;N]",
         "      Inner Join: t1.t1_id = t2.t2_id [t1_id:UInt32;N, 
t1_name:Utf8;N, t1_int:UInt32;N, t2_id:UInt32;N, t2_name:Utf8;N, 
t2_int:UInt32;N]",
         "        TableScan: t1 projection=[t1_id, t1_name, t1_int] 
[t1_id:UInt32;N, t1_name:Utf8;N, t1_int:UInt32;N]",
-        "        TableScan: t2 projection=[t2_id, t2_name, t2_int] 
[t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
+        "        Filter: CAST(t2.t2_int AS Int64) < Int64(10) OR t2.t2_name != 
Utf8(\"w\") [t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
+        "          TableScan: t2 projection=[t2_id, t2_name, t2_int] 
[t2_id:UInt32;N, t2_name:Utf8;N, t2_int:UInt32;N]",
     ];
     let formatted = plan.display_indent_schema().to_string();
     let actual: Vec<&str> = formatted.trim().lines().collect();
diff --git a/datafusion/core/tests/sql/predicates.rs b/datafusion/core/tests/sql/predicates.rs
index bb4391c4f..07e016a27 100644
--- a/datafusion/core/tests/sql/predicates.rs
+++ b/datafusion/core/tests/sql/predicates.rs
@@ -468,6 +468,14 @@ async fn multiple_or_predicates() -> Result<()> {
     // factored out and appear only once in the following plan
     let expected = vec![
         "Explain [plan_type:Utf8, plan:Utf8]",
+        "  Projection: #lineitem.l_partkey [l_partkey:Int64]",
+        "    Projection: #part.p_size >= Int32(1) AS #part.p_size >= 
Int32(1)Int32(1)#part.p_size, #lineitem.l_partkey, #lineitem.l_quantity, 
#part.p_brand, #part.p_size [#part.p_size >= 
Int32(1)Int32(1)#part.p_size:Boolean;N, l_partkey:Int64, 
l_quantity:Decimal128(15, 2), p_brand:Utf8, p_size:Int32]",
+        "      Filter: #part.p_brand = Utf8(\"Brand#12\") AND 
#lineitem.l_quantity >= Decimal128(Some(100),15,2) AND #lineitem.l_quantity <= 
Decimal128(Some(1100),15,2) AND #part.p_size <= Int32(5) OR #part.p_brand = 
Utf8(\"Brand#23\") AND #lineitem.l_quantity >= Decimal128(Some(1000),15,2) AND 
#lineitem.l_quantity <= Decimal128(Some(2000),15,2) AND #part.p_size <= 
Int32(10) OR #part.p_brand = Utf8(\"Brand#34\") AND #lineitem.l_quantity >= 
Decimal128(Some(2000),15,2) AND #lineitem.l_quan [...]
+        "        Inner Join: #lineitem.l_partkey = #part.p_partkey 
[l_partkey:Int64, l_quantity:Decimal128(15, 2), p_partkey:Int64, p_brand:Utf8, 
p_size:Int32]",
+        "          Filter: #lineitem.l_quantity >= Decimal128(Some(100),15,2) 
AND #lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR #lineitem.l_quantity 
>= Decimal128(Some(1000),15,2) AND #lineitem.l_quantity <= 
Decimal128(Some(2000),15,2) OR #lineitem.l_quantity >= 
Decimal128(Some(2000),15,2) AND #lineitem.l_quantity <= 
Decimal128(Some(3000),15,2) [l_partkey:Int64, l_quantity:Decimal128(15, 2)]",
+        "            TableScan: lineitem projection=[l_partkey, l_quantity], 
partial_filters=[#lineitem.l_quantity >= Decimal128(Some(100),15,2) AND 
#lineitem.l_quantity <= Decimal128(Some(1100),15,2) OR #lineitem.l_quantity >= 
Decimal128(Some(1000),15,2) AND #lineitem.l_quantity <= 
Decimal128(Some(2000),15,2) OR #lineitem.l_quantity >= 
Decimal128(Some(2000),15,2) AND #lineitem.l_quantity <= 
Decimal128(Some(3000),15,2)] [l_partkey:Int64, l_quantity:Decimal128(15, 2)]",
+        "          Filter: #part.p_size >= Int32(1) AND #part.p_brand = 
Utf8(\"Brand#12\") AND #part.p_size <= Int32(5) OR #part.p_brand = 
Utf8(\"Brand#23\") AND #part.p_size <= Int32(10) OR #part.p_brand = 
Utf8(\"Brand#34\") AND #part.p_size <= Int32(15) [p_partkey:Int64, 
p_brand:Utf8, p_size:Int32]",
+        "            TableScan: part projection=[p_partkey, p_brand, p_size], 
partial_filters=[#part.p_size >= Int32(1), #part.p_brand = Utf8(\"Brand#12\") 
AND #part.p_size <= Int32(5) OR #part.p_brand = Utf8(\"Brand#23\") AND 
#part.p_size <= Int32(10) OR #part.p_brand = Utf8(\"Brand#34\") AND 
#part.p_size <= Int32(15)] [p_partkey:Int64, p_brand:Utf8, p_size:Int32]",
         "  Projection: lineitem.l_partkey [l_partkey:Int64]",
         "    Filter: part.p_brand = Utf8(\"Brand#12\") AND lineitem.l_quantity 
>= Decimal128(Some(100),15,2) AND lineitem.l_quantity <= 
Decimal128(Some(1100),15,2) AND CAST(part.p_size AS Int64) BETWEEN Int64(1) AND 
Int64(5) OR part.p_brand = Utf8(\"Brand#23\") AND lineitem.l_quantity >= 
Decimal128(Some(1000),15,2) AND lineitem.l_quantity <= 
Decimal128(Some(2000),15,2) AND CAST(part.p_size AS Int64) BETWEEN Int64(1) AND 
Int64(10) OR part.p_brand = Utf8(\"Brand#34\") AND lineitem.l_quanti [...]
         "      Inner Join: lineitem.l_partkey = part.p_partkey 
[l_partkey:Int64, l_quantity:Decimal128(15, 2), p_partkey:Int64, p_brand:Utf8, 
p_size:Int32]",
diff --git a/datafusion/optimizer/src/filter_push_down.rs b/datafusion/optimizer/src/filter_push_down.rs
index 4d720eb22..08ba71cda 100644
--- a/datafusion/optimizer/src/filter_push_down.rs
+++ b/datafusion/optimizer/src/filter_push_down.rs
@@ -17,14 +17,15 @@
 use crate::{utils, OptimizerConfig, OptimizerRule};
 use datafusion_common::{Column, DFSchema, DataFusionError, Result};
 use datafusion_expr::{
-    col,
+    and, col,
     expr_rewriter::{replace_col, ExprRewritable, ExprRewriter},
     logical_plan::{
         Aggregate, CrossJoin, Join, JoinType, Limit, LogicalPlan, Projection, TableScan,
         Union,
     },
+    or,
     utils::{expr_to_columns, exprlist_to_columns, from_plan},
-    Expr, TableProviderFilterPushDown,
+    Expr, Operator, TableProviderFilterPushDown,
 };
 use std::collections::{HashMap, HashSet};
 use std::iter::once;
@@ -247,6 +248,156 @@ fn get_pushable_join_predicates<'a>(
         .unzip()
 }
 
+// examine OR clause to see if any useful clauses can be extracted and push down.
+// extract at least one qual from each sub clauses of OR clause, then form the quals
+// to new OR clause as predicate.
+//
+// Filter: (a = c and a < 20) or (b = d and b > 10)
+//     join/crossjoin:
+//          TableScan: projection=[a, b]
+//          TableScan: projection=[c, d]
+//
+// is optimized to
+//
+// Filter: (a = c and a < 20) or (b = d and b > 10)
+//     join/crossjoin:
+//          Filter: (a < 20) or (b > 10)
+//              TableScan: projection=[a, b]
+//          TableScan: projection=[c, d]
+//
+// In general, predicates of this form:
+//
+// (A AND B) OR (C AND D)
+//
+// will be transformed to
+//
+// ((A AND B) OR (C AND D)) AND (A OR C)
+//
+// OR
+//
+// ((A AND B) OR (C AND D)) AND ((A AND B) OR C)
+//
+// OR
+//
+// do nothing.
+//
+fn extract_or_clauses_for_join(
+    filters: &[&Expr],
+    schema: &DFSchema,
+    preserved: bool,
+) -> (Vec<Expr>, Vec<HashSet<Column>>) {
+    if !preserved {
+        return (vec![], vec![]);
+    }
+
+    let schema_columns = schema
+        .fields()
+        .iter()
+        .flat_map(|f| {
+            [
+                f.qualified_column(),
+                // we need to push down filter using unqualified column as well
+                f.unqualified_column(),
+            ]
+        })
+        .collect::<HashSet<_>>();
+
+    let mut exprs = vec![];
+    let mut expr_columns = vec![];
+    for expr in filters.iter() {
+        if let Expr::BinaryExpr {
+            left,
+            op: Operator::Or,
+            right,
+        } = expr
+        {
+            let left_expr = extract_or_clause(left.as_ref(), &schema_columns);
+            let right_expr = extract_or_clause(right.as_ref(), &schema_columns);
+
+            // If nothing can be extracted from any sub clauses, do nothing for this OR clause.
+            if let (Some(left_expr), Some(right_expr)) = (left_expr, right_expr) {
+                let predicate = or(left_expr, right_expr);
+                let mut columns: HashSet<Column> = HashSet::new();
+                expr_to_columns(&predicate, &mut columns).ok().unwrap();
+
+                exprs.push(predicate);
+                expr_columns.push(columns);
+            }
+        }
+    }
+
+    // new formed OR clauses and their column references
+    (exprs, expr_columns)
+}
+
+// extract qual from OR sub-clause.
+//
+// A qual is extracted if it only contains set of column references in schema_columns.
+//
+// For AND clause, we extract from both sub-clauses, then make new AND clause by extracted
+// clauses if both extracted; Otherwise, use the extracted clause from any sub-clauses or None.
+//
+// For OR clause, we extract from both sub-clauses, then make new OR clause by extracted clauses if both extracted;
+// Otherwise, return None.
+//
+// For other clause, apply the rule above to extract clause.
+fn extract_or_clause(expr: &Expr, schema_columns: &HashSet<Column>) -> Option<Expr> {
+    let mut predicate = None;
+
+    match expr {
+        Expr::BinaryExpr {
+            left: l_expr,
+            op: Operator::Or,
+            right: r_expr,
+        } => {
+            let l_expr = extract_or_clause(l_expr, schema_columns);
+            let r_expr = extract_or_clause(r_expr, schema_columns);
+
+            if let (Some(l_expr), Some(r_expr)) = (l_expr, r_expr) {
+                predicate = Some(or(l_expr, r_expr));
+            }
+        }
+        Expr::BinaryExpr {
+            left: l_expr,
+            op: Operator::And,
+            right: r_expr,
+        } => {
+            let l_expr = extract_or_clause(l_expr, schema_columns);
+            let r_expr = extract_or_clause(r_expr, schema_columns);
+
+            match (l_expr, r_expr) {
+                (Some(l_expr), Some(r_expr)) => {
+                    predicate = Some(and(l_expr, r_expr));
+                }
+                (Some(l_expr), None) => {
+                    predicate = Some(l_expr);
+                }
+                (None, Some(r_expr)) => {
+                    predicate = Some(r_expr);
+                }
+                (None, None) => {
+                    predicate = None;
+                }
+            }
+        }
+        _ => {
+            let mut columns: HashSet<Column> = HashSet::new();
+            expr_to_columns(expr, &mut columns).ok().unwrap();
+
+            if schema_columns
+                .intersection(&columns)
+                .collect::<HashSet<_>>()
+                .len()
+                == columns.len()
+            {
+                predicate = Some(expr.clone());
+            }
+        }
+    }
+
+    predicate
+}
+
 fn optimize_join(
     mut state: State,
     plan: &LogicalPlan,
@@ -285,17 +436,54 @@ fn optimize_join(
         (on_to_left, on_to_right, on_to_keep)
     };
 
+    // Extract from OR clause, generate new predicates for both side of join if possible.
+    // We only track the unpushable predicates above.
+    let or_to_left =
+        extract_or_clauses_for_join(&to_keep.0, left.schema(), left_preserved);
+    let or_to_right =
+        extract_or_clauses_for_join(&to_keep.0, right.schema(), right_preserved);
+    let on_or_to_left = extract_or_clauses_for_join(
+        &on_to_keep.iter().collect::<Vec<_>>(),
+        left.schema(),
+        left_preserved,
+    );
+    let on_or_to_right = extract_or_clauses_for_join(
+        &on_to_keep.iter().collect::<Vec<_>>(),
+        right.schema(),
+        right_preserved,
+    );
+
     // Build new filter states using pushable predicates
     // from current optimizer states and from ON clause.
     // Then recursively call optimization for both join inputs
     let mut left_state = State { filters: vec![] };
     left_state.append_predicates(to_left);
     left_state.append_predicates(on_to_left);
+    or_to_left
+        .0
+        .into_iter()
+        .zip(or_to_left.1)
+        .for_each(|(expr, cols)| left_state.filters.push((expr, cols)));
+    on_or_to_left
+        .0
+        .into_iter()
+        .zip(on_or_to_left.1)
+        .for_each(|(expr, cols)| left_state.filters.push((expr, cols)));
     let left = optimize(left, left_state)?;
 
     let mut right_state = State { filters: vec![] };
     right_state.append_predicates(to_right);
     right_state.append_predicates(on_to_right);
+    or_to_right
+        .0
+        .into_iter()
+        .zip(or_to_right.1)
+        .for_each(|(expr, cols)| right_state.filters.push((expr, cols)));
+    on_or_to_right
+        .0
+        .into_iter()
+        .zip(on_or_to_right.1)
+        .for_each(|(expr, cols)| right_state.filters.push((expr, cols)));
     let right = optimize(right, right_state)?;
 
     // Create a new Join with the new `left` and `right`
@@ -2134,4 +2322,35 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_crossjoin_with_or_clause() -> Result<()> {
+        // select * from test,test1 where (test.a = test1.a and test.b > 1) or (test.b = test1.b and test.c < 10);
+        let table_scan = test_table_scan()?;
+        let left = LogicalPlanBuilder::from(table_scan)
+            .project(vec![col("a"), col("b"), col("c")])?
+            .build()?;
+        let right_table_scan = test_table_scan_with_name("test1")?;
+        let right = LogicalPlanBuilder::from(right_table_scan)
+            .project(vec![col("a").alias("d"), col("a").alias("e")])?
+            .build()?;
+        let filter = or(
+            and(col("a").eq(col("d")), col("b").gt(lit(1u32))),
+            and(col("b").eq(col("e")), col("c").lt(lit(10u32))),
+        );
+        let plan = LogicalPlanBuilder::from(left)
+            .cross_join(&right)?
+            .filter(filter)?
+            .build()?;
+
+        let expected = "Filter: test.a = d AND test.b > UInt32(1) OR test.b = e AND test.c < UInt32(10)\
+                        \n  CrossJoin:\
+                        \n    Projection: test.a, test.b, test.c\
+                        \n      Filter: test.b > UInt32(1) OR test.c < UInt32(10)\
+                        \n        TableScan: test\
+                        \n    Projection: test1.a AS d, test1.a AS e\
+                        \n      TableScan: test1";
+        assert_optimized_plan_eq(&plan, expected);
+        Ok(())
+    }
 }

[arrow-datafusion] branch master updated: Pushdown single column predicates from ON join clauses (#3578)

Reply via email to