[impala] 01/02: IMPALA-11536: fix invalid predicates propagate for outer join simplification

stigahuang Sun, 23 Apr 2023 01:41:11 -0700

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit 4f1d8d4d39bda9a1d91db0fc2e57fe4634ecadd0
Author: hexianqing <[email protected]>
AuthorDate: Mon Oct 10 16:20:11 2022 +0800

    IMPALA-11536: fix invalid predicates propagate for outer join simplification
    
    When set ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION = true, the planner
    will simplify outer joins if the WHERE clause contains at least one
    null rejecting condition and then remove the outer-joined tuple id
    from the map of GlobalState#outerJoinedTupleIds.
    However, there may be false removals for right join simplification or
    full join simplification. This may lead to incorrect results since it
    is incorrect to propagate a non null-rejecting predicate into a plan
    subtree that is on the nullable side of an outer join.
    GlobalState#outerJoinedTupleIds indicates whether a table is on the
    nullable side of an outer join.
    
    E.g.
    SELECT COUNT(*)
    FROM functional.nullrows t1
      FULL JOIN functional.nullrows t2 ON t1.id = t2.id
      FULL JOIN functional.nullrows t3 ON coalesce(t1.id, t2.id) = t3.id
    WHERE t1.group_str = 'a'
      AND coalesce(t2.group_str, 'f') = 'f'
    The predicate coalesce(t2.group_str, 'f') = 'f' will propagate into t2
    if we remove t2 from GlobalState#outerJoinedTupleIds.
    
    Testing:
    - Add new plan tests in outer-to-inner-joins.test
    - Add new query tests to verify the correctness on transformation
    
    Change-Id: I6565c5bff0d2f24f30118ba47a2583383e83fff7
    Reviewed-on: http://gerrit.cloudera.org:8080/19116
    Reviewed-by: Qifan Chen <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../java/org/apache/impala/analysis/Analyzer.java  |  76 ++++++--
 .../queries/PlannerTest/outer-to-inner-joins.test  | 199 +++++++++++++++++++++
 .../queries/QueryTest/outer-to-inner-joins.test    |  49 +++++
 3 files changed, 307 insertions(+), 17 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java 
b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
index 8073fedfc..d99ad68fe 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
@@ -4091,8 +4091,8 @@ public class Analyzer {
    * and register in globalState_. We use this to simplify outer joins of 
inline view.
    * eg: t1, (select t3.id c from t2 left join t3 on t1.id = t2.id) t4 where 
t1.id = t4.c
    */
-  private void getNonnullableOjTidsFromIjOnClause(TableRef tblRef,
-      Set<TupleId> nonnullableTids) {
+  private void getNullRejectingOjTidsFromIjOnClause(TableRef tblRef,
+      Set<TupleId> nullRejectingTids) {
     List<Expr> onConjuncts = new ArrayList<>();
     for (Map.Entry<ExprId, TableRef> entry : 
globalState_.ijClauseByConjunct.entrySet()) {
       if (entry.getValue() == tblRef) {
@@ -4112,7 +4112,7 @@ public class Analyzer {
           List<TupleId> ids = new ArrayList<>();
           e.getIds(ids, null);
           for (TupleId id : ids) {
-            if (isOuterJoined(id)) nonnullableTids.add(id);
+            if (isOuterJoined(id)) nullRejectingTids.add(id);
           }
         }
       } catch (InternalException ex) {
@@ -4129,10 +4129,10 @@ public class Analyzer {
    */
   private boolean simplifyOuterJoinsByIjOnClause(List<TableRef> tableRefs,
       TableRef ijTableRef) {
-    Set<TupleId> nonnullableTidSet = new HashSet<>();
-    getNonnullableOjTidsFromIjOnClause(ijTableRef, nonnullableTidSet);
-    if (!nonnullableTidSet.isEmpty()) {
-      return simplifyOuterJoins(tableRefs, nonnullableTidSet);
+    Set<TupleId> nullRejectingTidSet = new HashSet<>();
+    getNullRejectingOjTidsFromIjOnClause(ijTableRef, nullRejectingTidSet);
+    if (!nullRejectingTidSet.isEmpty()) {
+      return simplifyOuterJoins(tableRefs, nullRejectingTidSet);
     }
     return false;
   }
@@ -4153,7 +4153,7 @@ public class Analyzer {
    * least one null rejecting condition on the inner table.
    */
   public boolean simplifyOuterJoins(List<TableRef> tableRefs,
-      Set<TupleId> nonnullableTids) {
+      Set<TupleId> nullRejectingTids) {
     boolean isSimplified = false;
     List<TableRef> processedTblRefs = new ArrayList<>();
     for (TableRef tableRef : tableRefs) {
@@ -4167,7 +4167,7 @@ public class Analyzer {
         }
         case LEFT_OUTER_JOIN: {
           TupleId id = tableRef.getId();
-          if (nonnullableTids.contains(id) || 
hasNullRejectingConjucts(id.asList())) {
+          if (nullRejectingTids.contains(id) || 
hasNullRejectingConjucts(id.asList())) {
             tableRef.setJoinOp(JoinOperator.INNER_JOIN);
             removeOuterJoinedTupleIds(id.asList());
             ojToIjOnClauseConjucts(tableRef);
@@ -4179,9 +4179,12 @@ public class Analyzer {
         }
         case RIGHT_OUTER_JOIN: {
           List<TupleId> ids = tableRef.getLeftTblRef().getAllTableRefIds();
-          if (TupleId.intersect(ids, nonnullableTids) || 
hasNullRejectingConjucts(ids)) {
+          // find out all null-rejecting TupleIds in 'ids'
+          boolean hasNullRejectingTid = gatherNullRejectingTids(ids, 
nullRejectingTids);
+          if (hasNullRejectingTid || TupleId.intersect(ids, nullRejectingTids) 
||
+              hasNullRejectingConjucts(ids)) {
             tableRef.setJoinOp(JoinOperator.INNER_JOIN);
-            removeOuterJoinedTupleIds(ids);
+            removeOuterJoinedTupleIds(new 
ArrayList<TupleId>(nullRejectingTids));
             ojToIjOnClauseConjucts(tableRef);
             reRegisterIsNotEmptyPredicates(tableRef);
             simplifyOuterJoinsByIjOnClause(processedTblRefs, tableRef);
@@ -4191,23 +4194,26 @@ public class Analyzer {
         }
         case FULL_OUTER_JOIN: {
           List<TupleId> ids = tableRef.getLeftTblRef().getAllTableRefIds();
-          if (TupleId.intersect(ids, nonnullableTids) || 
hasNullRejectingConjucts(ids)) {
+          // find out all null-rejecting TupleIds in 'ids'
+          boolean hasNullRejectingTid = gatherNullRejectingTids(ids, 
nullRejectingTids);
+          if (hasNullRejectingTid || TupleId.intersect(ids, nullRejectingTids) 
||
+              hasNullRejectingConjucts(ids)) {
             removeFullOuterJoinedTupleIdsAndConjuncts(ids);
             
removeFullOuterJoinedTupleIdsAndConjuncts(tableRef.getId().asList());
-            if (nonnullableTids.contains(tableRef.getId()) ||
+            if (nullRejectingTids.contains(tableRef.getId()) ||
                 hasNullRejectingConjucts(tableRef.getId().asList())) {
               tableRef.setJoinOp(JoinOperator.INNER_JOIN);
-              removeOuterJoinedTupleIds(ids);
-              removeOuterJoinedTupleIds(tableRef.getId().asList());
+              nullRejectingTids.add(tableRef.getId());
+              removeOuterJoinedTupleIds(new 
ArrayList<TupleId>(nullRejectingTids));
               ojToIjOnClauseConjucts(tableRef);
               reRegisterIsNotEmptyPredicates(tableRef);
               simplifyOuterJoinsByIjOnClause(processedTblRefs, tableRef);
             } else {
               tableRef.setJoinOp(JoinOperator.LEFT_OUTER_JOIN);
-              removeOuterJoinedTupleIds(ids);
+              removeOuterJoinedTupleIds(new 
ArrayList<TupleId>(nullRejectingTids));
             }
             isSimplified = true;
-          } else if (nonnullableTids.contains(tableRef.getId()) ||
+          } else if (nullRejectingTids.contains(tableRef.getId()) ||
               hasNullRejectingConjucts(tableRef.getId().asList())) {
             tableRef.setJoinOp(JoinOperator.RIGHT_OUTER_JOIN);
             removeOuterJoinedTupleIds(tableRef.getId().asList());
@@ -4222,4 +4228,40 @@ public class Analyzer {
     }
     return isSimplified;
   }
+
+   /**
+   * Get the tuple ids that satisfy null-rejecting from the where or having 
onjuncts.
+   * Return true if has null rejecting tid in tupleIds.
+   */
+  private boolean gatherNullRejectingTids(List<TupleId> tupleIds,
+      Set<TupleId> nullRejectingTids) {
+    boolean hasNullRejectingTid = false;
+    for (TupleId id : tupleIds) {
+      List<Expr> conjuncts = getTableConjuncts(id);
+      for (Expr e : conjuncts) {
+        // Skip not null-rejecting conjunct
+        if (isNullableConjunct(e, tupleIds)) continue;
+
+        try {
+          // Check whether 'e' evaluates to true when all its referenced slots 
are NULL,
+          // The false result indicates that 'e' is null-rejecting conjunct.
+          if (!isTrueWithNullSlots(e)) {
+            if (LOG.isTraceEnabled()) {
+              LOG.trace("Tuple " + id + " has null rejecting conjunct: "
+                  + e.debugString());
+            }
+            nullRejectingTids.add(id);
+            hasNullRejectingTid = true;
+            break;
+          }
+        } catch (InternalException ex) {
+          // Expr evaluation failed in the backend. Skip 'e' since we cannot
+          // determine whether it is null-rejecting conjunct.
+          LOG.warn("Fail to verify " + e.toSql() + " being null-rejecting 
because of the"
+              + " backend evaluation failure", ex);
+        }
+      }
+    }
+    return hasNullRejectingTid;
+  }
 }
diff --git 
a/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test
 
b/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test
index a1714e9f4..eba8cdb8d 100644
--- 
a/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test
+++ 
b/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test
@@ -1022,4 +1022,203 @@ PLAN-ROOT SINK
    HDFS partitions=11/11 files=11 size=814.73KB
    runtime filters: RF000 -> t2.tinyint_col
    row-size=5B cardinality=11.00K
+====
+# IMPALA-11536: Convert a full join to a left join, only non-nullable tuple 
ids of the left
+# tuple ids can be removed from outerJoinedTupleIds. For this test case, the 
'testtbl' can
+# not be removed and we can not propagate 'coalesce(testtbl.zip, 0) = 0' into 
'testtbl'
+SELECT COALESCE(jointbl.test_id, testtbl.id, dimtbl.id) AS id, 
test_zip,testtbl.zip
+FROM functional.jointbl
+FULL OUTER JOIN
+functional.testtbl
+ON jointbl.test_id = testtbl.id
+FULL OUTER JOIN
+functional.dimtbl
+ON coalesce(jointbl.test_id, testtbl.id) = dimtbl.id
+WHERE
+jointbl.test_zip = 94611 and coalesce(testtbl.zip, 0) = 0;
+---- PLAN
+PLAN-ROOT SINK
+|
+04:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: coalesce(jointbl.test_id, testtbl.id) = dimtbl.id
+|  other predicates: coalesce(testtbl.zip, 0) = 0
+|  row-size=32B cardinality=6
+|
+|--02:SCAN HDFS [functional.dimtbl]
+|     HDFS partitions=1/1 files=1 size=171B
+|     row-size=8B cardinality=10
+|
+03:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: jointbl.test_id = testtbl.id
+|  row-size=24B cardinality=6
+|
+|--01:SCAN HDFS [functional.testtbl]
+|     HDFS partitions=1/1 files=0 size=0B
+|     row-size=12B cardinality=0
+|
+00:SCAN HDFS [functional.jointbl]
+   HDFS partitions=1/1 files=1 size=433B
+   predicates: jointbl.test_zip = 94611
+   row-size=12B cardinality=6
+====
+# IMPALA-11536: test all non-nullable tuple ids are removed from 
outerJoinedTupleIds
+# and the predicates propagate correctly
+SELECT COALESCE(jointbl.test_id, testtbl.id, dimtbl.id) AS id, 
test_zip,testtbl.zip
+FROM functional.jointbl
+FULL OUTER JOIN
+functional.testtbl
+ON jointbl.test_id = testtbl.id
+FULL OUTER JOIN
+functional.dimtbl
+ON coalesce(jointbl.test_id, testtbl.id) = dimtbl.id
+WHERE
+jointbl.test_zip = 94611 and testtbl.zip = 1;
+---- PLAN
+PLAN-ROOT SINK
+|
+04:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: coalesce(jointbl.test_id, testtbl.id) = dimtbl.id
+|  row-size=32B cardinality=6
+|
+|--02:SCAN HDFS [functional.dimtbl]
+|     HDFS partitions=1/1 files=1 size=171B
+|     row-size=8B cardinality=10
+|
+03:HASH JOIN [INNER JOIN]
+|  hash predicates: jointbl.test_id = testtbl.id
+|  runtime filters: RF000 <- testtbl.id
+|  row-size=24B cardinality=6
+|
+|--01:SCAN HDFS [functional.testtbl]
+|     HDFS partitions=1/1 files=0 size=0B
+|     predicates: testtbl.zip = 1
+|     row-size=12B cardinality=0
+|
+00:SCAN HDFS [functional.jointbl]
+   HDFS partitions=1/1 files=1 size=433B
+   predicates: jointbl.test_zip = 94611
+   runtime filters: RF000 -> jointbl.test_id
+   row-size=12B cardinality=6
+====
+# IMPALA-11536: Convert a full join to an inner join, only non-nullable tuple 
ids of the left
+# tuple ids can be removed from outerJoinedTupleIds. For this test case, the 
'jointbl' can
+# not be removed and we can not propagate 'coalesce(jointbl.test_zip, 0) = 0' 
into 'jointbl'
+SELECT COALESCE(jointbl.test_id, testtbl.id, dimtbl.id) AS id, 
test_zip,testtbl.zip
+FROM functional.jointbl
+FULL OUTER JOIN
+functional.testtbl
+ON jointbl.test_id = testtbl.id
+FULL OUTER JOIN
+functional.dimtbl
+ON coalesce(jointbl.test_id, testtbl.id) = dimtbl.id
+WHERE
+testtbl.zip = 94611 and coalesce(jointbl.test_zip, 0) = 0 and dimtbl.zip=0;
+---- PLAN
+PLAN-ROOT SINK
+|
+04:HASH JOIN [INNER JOIN]
+|  hash predicates: coalesce(jointbl.test_id, testtbl.id) = dimtbl.id
+|  other predicates: coalesce(jointbl.test_zip, 0) = 0
+|  runtime filters: RF000 <- dimtbl.id
+|  row-size=36B cardinality=19
+|
+|--02:SCAN HDFS [functional.dimtbl]
+|     HDFS partitions=1/1 files=1 size=171B
+|     predicates: dimtbl.zip = 0
+|     row-size=12B cardinality=2
+|
+03:HASH JOIN [RIGHT OUTER JOIN]
+|  hash predicates: jointbl.test_id = testtbl.id
+|  runtime filters: RF002 <- testtbl.id
+|  row-size=24B cardinality=19
+|
+|--01:SCAN HDFS [functional.testtbl]
+|     HDFS partitions=1/1 files=0 size=0B
+|     predicates: testtbl.zip = 94611
+|     row-size=12B cardinality=0
+|
+00:SCAN HDFS [functional.jointbl]
+   HDFS partitions=1/1 files=1 size=433B
+   runtime filters: RF000 -> coalesce(functional.jointbl.test_id, 
functional.jointbl.test_id), RF002 -> jointbl.test_id
+   row-size=12B cardinality=19
+====
+# IMPALA-11536: Convert a right join to an inner join, only non-nullable tuple 
ids of the left
+# tuple ids can be removed from outerJoinedTupleIds. For this test case, the 
'jointbl' can
+# not be removed and we can not propagate 'coalesce(jointbl.test_zip, 0) = 0' 
into 'jointbl'
+SELECT COALESCE(jointbl.test_id, testtbl.id, dimtbl.id) AS id, 
test_zip,testtbl.zip
+FROM functional.jointbl
+RIGHT OUTER JOIN
+functional.testtbl
+ON jointbl.test_id = testtbl.id
+RIGHT OUTER JOIN
+functional.dimtbl
+ON coalesce(jointbl.test_id, testtbl.id) = dimtbl.id
+WHERE
+testtbl.zip = 94611 and coalesce(jointbl.test_zip, 0) = 0;
+---- PLAN
+PLAN-ROOT SINK
+|
+04:HASH JOIN [INNER JOIN]
+|  hash predicates: coalesce(jointbl.test_id, testtbl.id) = dimtbl.id
+|  other predicates: coalesce(jointbl.test_zip, 0) = 0
+|  runtime filters: RF000 <- dimtbl.id
+|  row-size=32B cardinality=19
+|
+|--02:SCAN HDFS [functional.dimtbl]
+|     HDFS partitions=1/1 files=1 size=171B
+|     row-size=8B cardinality=10
+|
+03:HASH JOIN [RIGHT OUTER JOIN]
+|  hash predicates: jointbl.test_id = testtbl.id
+|  runtime filters: RF002 <- testtbl.id
+|  row-size=24B cardinality=19
+|
+|--01:SCAN HDFS [functional.testtbl]
+|     HDFS partitions=1/1 files=0 size=0B
+|     predicates: testtbl.zip = 94611
+|     row-size=12B cardinality=0
+|
+00:SCAN HDFS [functional.jointbl]
+   HDFS partitions=1/1 files=1 size=433B
+   runtime filters: RF000 -> coalesce(functional.jointbl.test_id, 
functional.jointbl.test_id), RF002 -> jointbl.test_id
+   row-size=12B cardinality=19
+====
+# IMPALA-11536: we can not propagate 'coalesce(jointbl.test_zip, 0) = 0' into 
'jointbl'
+# because the 'jointbl' is nullable
+SELECT COALESCE(jointbl.test_id, testtbl.id, dimtbl.id) AS id, 
test_zip,testtbl.zip
+FROM functional.jointbl
+FULL OUTER JOIN
+functional.testtbl
+ON jointbl.test_id = testtbl.id
+FULL OUTER JOIN
+functional.dimtbl
+ON coalesce(jointbl.test_id, testtbl.id) = dimtbl.id
+WHERE
+testtbl.zip = 94611 and coalesce(jointbl.test_zip, 0) = 0;
+---- PLAN
+PLAN-ROOT SINK
+|
+04:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: coalesce(jointbl.test_id, testtbl.id) = dimtbl.id
+|  other predicates: coalesce(jointbl.test_zip, 0) = 0
+|  row-size=32B cardinality=19
+|
+|--02:SCAN HDFS [functional.dimtbl]
+|     HDFS partitions=1/1 files=1 size=171B
+|     row-size=8B cardinality=10
+|
+03:HASH JOIN [RIGHT OUTER JOIN]
+|  hash predicates: jointbl.test_id = testtbl.id
+|  runtime filters: RF000 <- testtbl.id
+|  row-size=24B cardinality=19
+|
+|--01:SCAN HDFS [functional.testtbl]
+|     HDFS partitions=1/1 files=0 size=0B
+|     predicates: testtbl.zip = 94611
+|     row-size=12B cardinality=0
+|
+00:SCAN HDFS [functional.jointbl]
+   HDFS partitions=1/1 files=1 size=433B
+   runtime filters: RF000 -> jointbl.test_id
+   row-size=12B cardinality=19
 ====
\ No newline at end of file
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test
 
b/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test
index f7d2933ca..d81303e71 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test
@@ -282,4 +282,53 @@ END
 12000
 ---- TYPES
 bigint
+====
+---- QUERY
+SELECT COUNT(*)
+FROM functional.nullrows t1
+  FULL JOIN functional.nullrows t2 ON t1.id = t2.id
+  FULL JOIN functional.nullrows t3 ON coalesce(t1.id, t2.id) = t3.id
+WHERE t1.group_str = 'a'
+  AND coalesce(t2.group_str, 'f') = 'f'
+---- RESULTS
+0
+---- TYPES
+bigint
+====
+---- QUERY
+SELECT COUNT(*)
+FROM functional.nullrows t1
+  FULL JOIN functional.nullrows t2 ON t1.id = t2.id
+  FULL JOIN functional.nullrows t3 ON coalesce(t1.id, t2.id) = t3.id
+WHERE t2.group_str = 'a'
+  AND coalesce(t1.group_str, 'f') = 'f'
+  AND t3.group_str = 'a'
+---- RESULTS
+0
+---- TYPES
+bigint
+====
+---- QUERY
+SELECT COUNT(*)
+FROM functional.nullrows t1
+  RIGHT JOIN functional.nullrows t2 ON t1.id = t2.id
+  RIGHT JOIN functional.nullrows t3 ON coalesce(t1.id, t2.id) = t3.id
+WHERE t2.group_str = 'a'
+  AND coalesce(t1.group_str, 'f') = 'f'
+---- RESULTS
+0
+---- TYPES
+bigint
+====
+---- QUERY
+SELECT COUNT(*)
+FROM functional.nullrows t1
+  FULL JOIN functional.nullrows t2 ON t1.id = t2.id
+  FULL JOIN functional.nullrows t3 ON coalesce(t1.id, t2.id) = t3.id
+WHERE t2.group_str = 'a'
+  AND coalesce(t1.group_str, 'f') = 'f'
+---- RESULTS
+0
+---- TYPES
+bigint
 ====
\ No newline at end of file

[impala] 01/02: IMPALA-11536: fix invalid predicates propagate for outer join simplification

Reply via email to