This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 4f1d8d4d39bda9a1d91db0fc2e57fe4634ecadd0 Author: hexianqing <[email protected]> AuthorDate: Mon Oct 10 16:20:11 2022 +0800 IMPALA-11536: fix invalid predicates propagate for outer join simplification When set ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION = true, the planner will simplify outer joins if the WHERE clause contains at least one null rejecting condition and then remove the outer-joined tuple id from the map of GlobalState#outerJoinedTupleIds. However, there may be false removals for right join simplification or full join simplification. This may lead to incorrect results since it is incorrect to propagate a non null-rejecting predicate into a plan subtree that is on the nullable side of an outer join. GlobalState#outerJoinedTupleIds indicates whether a table is on the nullable side of an outer join. E.g. SELECT COUNT(*) FROM functional.nullrows t1 FULL JOIN functional.nullrows t2 ON t1.id = t2.id FULL JOIN functional.nullrows t3 ON coalesce(t1.id, t2.id) = t3.id WHERE t1.group_str = 'a' AND coalesce(t2.group_str, 'f') = 'f' The predicate coalesce(t2.group_str, 'f') = 'f' will propagate into t2 if we remove t2 from GlobalState#outerJoinedTupleIds. Testing: - Add new plan tests in outer-to-inner-joins.test - Add new query tests to verify the correctness on transformation Change-Id: I6565c5bff0d2f24f30118ba47a2583383e83fff7 Reviewed-on: http://gerrit.cloudera.org:8080/19116 Reviewed-by: Qifan Chen <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- .../java/org/apache/impala/analysis/Analyzer.java | 76 ++++++-- .../queries/PlannerTest/outer-to-inner-joins.test | 199 +++++++++++++++++++++ .../queries/QueryTest/outer-to-inner-joins.test | 49 +++++ 3 files changed, 307 insertions(+), 17 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java index 8073fedfc..d99ad68fe 100644 --- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java +++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java @@ -4091,8 +4091,8 @@ public class Analyzer { * and register in globalState_. We use this to simplify outer joins of inline view. * eg: t1, (select t3.id c from t2 left join t3 on t1.id = t2.id) t4 where t1.id = t4.c */ - private void getNonnullableOjTidsFromIjOnClause(TableRef tblRef, - Set<TupleId> nonnullableTids) { + private void getNullRejectingOjTidsFromIjOnClause(TableRef tblRef, + Set<TupleId> nullRejectingTids) { List<Expr> onConjuncts = new ArrayList<>(); for (Map.Entry<ExprId, TableRef> entry : globalState_.ijClauseByConjunct.entrySet()) { if (entry.getValue() == tblRef) { @@ -4112,7 +4112,7 @@ public class Analyzer { List<TupleId> ids = new ArrayList<>(); e.getIds(ids, null); for (TupleId id : ids) { - if (isOuterJoined(id)) nonnullableTids.add(id); + if (isOuterJoined(id)) nullRejectingTids.add(id); } } } catch (InternalException ex) { @@ -4129,10 +4129,10 @@ public class Analyzer { */ private boolean simplifyOuterJoinsByIjOnClause(List<TableRef> tableRefs, TableRef ijTableRef) { - Set<TupleId> nonnullableTidSet = new HashSet<>(); - getNonnullableOjTidsFromIjOnClause(ijTableRef, nonnullableTidSet); - if (!nonnullableTidSet.isEmpty()) { - return simplifyOuterJoins(tableRefs, nonnullableTidSet); + Set<TupleId> nullRejectingTidSet = new HashSet<>(); + getNullRejectingOjTidsFromIjOnClause(ijTableRef, nullRejectingTidSet); + if (!nullRejectingTidSet.isEmpty()) { + return simplifyOuterJoins(tableRefs, nullRejectingTidSet); } return false; } @@ -4153,7 +4153,7 @@ public class Analyzer { * least one null rejecting condition on the inner table. */ public boolean simplifyOuterJoins(List<TableRef> tableRefs, - Set<TupleId> nonnullableTids) { + Set<TupleId> nullRejectingTids) { boolean isSimplified = false; List<TableRef> processedTblRefs = new ArrayList<>(); for (TableRef tableRef : tableRefs) { @@ -4167,7 +4167,7 @@ public class Analyzer { } case LEFT_OUTER_JOIN: { TupleId id = tableRef.getId(); - if (nonnullableTids.contains(id) || hasNullRejectingConjucts(id.asList())) { + if (nullRejectingTids.contains(id) || hasNullRejectingConjucts(id.asList())) { tableRef.setJoinOp(JoinOperator.INNER_JOIN); removeOuterJoinedTupleIds(id.asList()); ojToIjOnClauseConjucts(tableRef); @@ -4179,9 +4179,12 @@ public class Analyzer { } case RIGHT_OUTER_JOIN: { List<TupleId> ids = tableRef.getLeftTblRef().getAllTableRefIds(); - if (TupleId.intersect(ids, nonnullableTids) || hasNullRejectingConjucts(ids)) { + // find out all null-rejecting TupleIds in 'ids' + boolean hasNullRejectingTid = gatherNullRejectingTids(ids, nullRejectingTids); + if (hasNullRejectingTid || TupleId.intersect(ids, nullRejectingTids) || + hasNullRejectingConjucts(ids)) { tableRef.setJoinOp(JoinOperator.INNER_JOIN); - removeOuterJoinedTupleIds(ids); + removeOuterJoinedTupleIds(new ArrayList<TupleId>(nullRejectingTids)); ojToIjOnClauseConjucts(tableRef); reRegisterIsNotEmptyPredicates(tableRef); simplifyOuterJoinsByIjOnClause(processedTblRefs, tableRef); @@ -4191,23 +4194,26 @@ public class Analyzer { } case FULL_OUTER_JOIN: { List<TupleId> ids = tableRef.getLeftTblRef().getAllTableRefIds(); - if (TupleId.intersect(ids, nonnullableTids) || hasNullRejectingConjucts(ids)) { + // find out all null-rejecting TupleIds in 'ids' + boolean hasNullRejectingTid = gatherNullRejectingTids(ids, nullRejectingTids); + if (hasNullRejectingTid || TupleId.intersect(ids, nullRejectingTids) || + hasNullRejectingConjucts(ids)) { removeFullOuterJoinedTupleIdsAndConjuncts(ids); removeFullOuterJoinedTupleIdsAndConjuncts(tableRef.getId().asList()); - if (nonnullableTids.contains(tableRef.getId()) || + if (nullRejectingTids.contains(tableRef.getId()) || hasNullRejectingConjucts(tableRef.getId().asList())) { tableRef.setJoinOp(JoinOperator.INNER_JOIN); - removeOuterJoinedTupleIds(ids); - removeOuterJoinedTupleIds(tableRef.getId().asList()); + nullRejectingTids.add(tableRef.getId()); + removeOuterJoinedTupleIds(new ArrayList<TupleId>(nullRejectingTids)); ojToIjOnClauseConjucts(tableRef); reRegisterIsNotEmptyPredicates(tableRef); simplifyOuterJoinsByIjOnClause(processedTblRefs, tableRef); } else { tableRef.setJoinOp(JoinOperator.LEFT_OUTER_JOIN); - removeOuterJoinedTupleIds(ids); + removeOuterJoinedTupleIds(new ArrayList<TupleId>(nullRejectingTids)); } isSimplified = true; - } else if (nonnullableTids.contains(tableRef.getId()) || + } else if (nullRejectingTids.contains(tableRef.getId()) || hasNullRejectingConjucts(tableRef.getId().asList())) { tableRef.setJoinOp(JoinOperator.RIGHT_OUTER_JOIN); removeOuterJoinedTupleIds(tableRef.getId().asList()); @@ -4222,4 +4228,40 @@ public class Analyzer { } return isSimplified; } + + /** + * Get the tuple ids that satisfy null-rejecting from the where or having onjuncts. + * Return true if has null rejecting tid in tupleIds. + */ + private boolean gatherNullRejectingTids(List<TupleId> tupleIds, + Set<TupleId> nullRejectingTids) { + boolean hasNullRejectingTid = false; + for (TupleId id : tupleIds) { + List<Expr> conjuncts = getTableConjuncts(id); + for (Expr e : conjuncts) { + // Skip not null-rejecting conjunct + if (isNullableConjunct(e, tupleIds)) continue; + + try { + // Check whether 'e' evaluates to true when all its referenced slots are NULL, + // The false result indicates that 'e' is null-rejecting conjunct. + if (!isTrueWithNullSlots(e)) { + if (LOG.isTraceEnabled()) { + LOG.trace("Tuple " + id + " has null rejecting conjunct: " + + e.debugString()); + } + nullRejectingTids.add(id); + hasNullRejectingTid = true; + break; + } + } catch (InternalException ex) { + // Expr evaluation failed in the backend. Skip 'e' since we cannot + // determine whether it is null-rejecting conjunct. + LOG.warn("Fail to verify " + e.toSql() + " being null-rejecting because of the" + + " backend evaluation failure", ex); + } + } + } + return hasNullRejectingTid; + } } diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test b/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test index a1714e9f4..eba8cdb8d 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/outer-to-inner-joins.test @@ -1022,4 +1022,203 @@ PLAN-ROOT SINK HDFS partitions=11/11 files=11 size=814.73KB runtime filters: RF000 -> t2.tinyint_col row-size=5B cardinality=11.00K +==== +# IMPALA-11536: Convert a full join to a left join, only non-nullable tuple ids of the left +# tuple ids can be removed from outerJoinedTupleIds. For this test case, the 'testtbl' can +# not be removed and we can not propagate 'coalesce(testtbl.zip, 0) = 0' into 'testtbl' +SELECT COALESCE(jointbl.test_id, testtbl.id, dimtbl.id) AS id, test_zip,testtbl.zip +FROM functional.jointbl +FULL OUTER JOIN +functional.testtbl +ON jointbl.test_id = testtbl.id +FULL OUTER JOIN +functional.dimtbl +ON coalesce(jointbl.test_id, testtbl.id) = dimtbl.id +WHERE +jointbl.test_zip = 94611 and coalesce(testtbl.zip, 0) = 0; +---- PLAN +PLAN-ROOT SINK +| +04:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: coalesce(jointbl.test_id, testtbl.id) = dimtbl.id +| other predicates: coalesce(testtbl.zip, 0) = 0 +| row-size=32B cardinality=6 +| +|--02:SCAN HDFS [functional.dimtbl] +| HDFS partitions=1/1 files=1 size=171B +| row-size=8B cardinality=10 +| +03:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: jointbl.test_id = testtbl.id +| row-size=24B cardinality=6 +| +|--01:SCAN HDFS [functional.testtbl] +| HDFS partitions=1/1 files=0 size=0B +| row-size=12B cardinality=0 +| +00:SCAN HDFS [functional.jointbl] + HDFS partitions=1/1 files=1 size=433B + predicates: jointbl.test_zip = 94611 + row-size=12B cardinality=6 +==== +# IMPALA-11536: test all non-nullable tuple ids are removed from outerJoinedTupleIds +# and the predicates propagate correctly +SELECT COALESCE(jointbl.test_id, testtbl.id, dimtbl.id) AS id, test_zip,testtbl.zip +FROM functional.jointbl +FULL OUTER JOIN +functional.testtbl +ON jointbl.test_id = testtbl.id +FULL OUTER JOIN +functional.dimtbl +ON coalesce(jointbl.test_id, testtbl.id) = dimtbl.id +WHERE +jointbl.test_zip = 94611 and testtbl.zip = 1; +---- PLAN +PLAN-ROOT SINK +| +04:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: coalesce(jointbl.test_id, testtbl.id) = dimtbl.id +| row-size=32B cardinality=6 +| +|--02:SCAN HDFS [functional.dimtbl] +| HDFS partitions=1/1 files=1 size=171B +| row-size=8B cardinality=10 +| +03:HASH JOIN [INNER JOIN] +| hash predicates: jointbl.test_id = testtbl.id +| runtime filters: RF000 <- testtbl.id +| row-size=24B cardinality=6 +| +|--01:SCAN HDFS [functional.testtbl] +| HDFS partitions=1/1 files=0 size=0B +| predicates: testtbl.zip = 1 +| row-size=12B cardinality=0 +| +00:SCAN HDFS [functional.jointbl] + HDFS partitions=1/1 files=1 size=433B + predicates: jointbl.test_zip = 94611 + runtime filters: RF000 -> jointbl.test_id + row-size=12B cardinality=6 +==== +# IMPALA-11536: Convert a full join to an inner join, only non-nullable tuple ids of the left +# tuple ids can be removed from outerJoinedTupleIds. For this test case, the 'jointbl' can +# not be removed and we can not propagate 'coalesce(jointbl.test_zip, 0) = 0' into 'jointbl' +SELECT COALESCE(jointbl.test_id, testtbl.id, dimtbl.id) AS id, test_zip,testtbl.zip +FROM functional.jointbl +FULL OUTER JOIN +functional.testtbl +ON jointbl.test_id = testtbl.id +FULL OUTER JOIN +functional.dimtbl +ON coalesce(jointbl.test_id, testtbl.id) = dimtbl.id +WHERE +testtbl.zip = 94611 and coalesce(jointbl.test_zip, 0) = 0 and dimtbl.zip=0; +---- PLAN +PLAN-ROOT SINK +| +04:HASH JOIN [INNER JOIN] +| hash predicates: coalesce(jointbl.test_id, testtbl.id) = dimtbl.id +| other predicates: coalesce(jointbl.test_zip, 0) = 0 +| runtime filters: RF000 <- dimtbl.id +| row-size=36B cardinality=19 +| +|--02:SCAN HDFS [functional.dimtbl] +| HDFS partitions=1/1 files=1 size=171B +| predicates: dimtbl.zip = 0 +| row-size=12B cardinality=2 +| +03:HASH JOIN [RIGHT OUTER JOIN] +| hash predicates: jointbl.test_id = testtbl.id +| runtime filters: RF002 <- testtbl.id +| row-size=24B cardinality=19 +| +|--01:SCAN HDFS [functional.testtbl] +| HDFS partitions=1/1 files=0 size=0B +| predicates: testtbl.zip = 94611 +| row-size=12B cardinality=0 +| +00:SCAN HDFS [functional.jointbl] + HDFS partitions=1/1 files=1 size=433B + runtime filters: RF000 -> coalesce(functional.jointbl.test_id, functional.jointbl.test_id), RF002 -> jointbl.test_id + row-size=12B cardinality=19 +==== +# IMPALA-11536: Convert a right join to an inner join, only non-nullable tuple ids of the left +# tuple ids can be removed from outerJoinedTupleIds. For this test case, the 'jointbl' can +# not be removed and we can not propagate 'coalesce(jointbl.test_zip, 0) = 0' into 'jointbl' +SELECT COALESCE(jointbl.test_id, testtbl.id, dimtbl.id) AS id, test_zip,testtbl.zip +FROM functional.jointbl +RIGHT OUTER JOIN +functional.testtbl +ON jointbl.test_id = testtbl.id +RIGHT OUTER JOIN +functional.dimtbl +ON coalesce(jointbl.test_id, testtbl.id) = dimtbl.id +WHERE +testtbl.zip = 94611 and coalesce(jointbl.test_zip, 0) = 0; +---- PLAN +PLAN-ROOT SINK +| +04:HASH JOIN [INNER JOIN] +| hash predicates: coalesce(jointbl.test_id, testtbl.id) = dimtbl.id +| other predicates: coalesce(jointbl.test_zip, 0) = 0 +| runtime filters: RF000 <- dimtbl.id +| row-size=32B cardinality=19 +| +|--02:SCAN HDFS [functional.dimtbl] +| HDFS partitions=1/1 files=1 size=171B +| row-size=8B cardinality=10 +| +03:HASH JOIN [RIGHT OUTER JOIN] +| hash predicates: jointbl.test_id = testtbl.id +| runtime filters: RF002 <- testtbl.id +| row-size=24B cardinality=19 +| +|--01:SCAN HDFS [functional.testtbl] +| HDFS partitions=1/1 files=0 size=0B +| predicates: testtbl.zip = 94611 +| row-size=12B cardinality=0 +| +00:SCAN HDFS [functional.jointbl] + HDFS partitions=1/1 files=1 size=433B + runtime filters: RF000 -> coalesce(functional.jointbl.test_id, functional.jointbl.test_id), RF002 -> jointbl.test_id + row-size=12B cardinality=19 +==== +# IMPALA-11536: we can not propagate 'coalesce(jointbl.test_zip, 0) = 0' into 'jointbl' +# because the 'jointbl' is nullable +SELECT COALESCE(jointbl.test_id, testtbl.id, dimtbl.id) AS id, test_zip,testtbl.zip +FROM functional.jointbl +FULL OUTER JOIN +functional.testtbl +ON jointbl.test_id = testtbl.id +FULL OUTER JOIN +functional.dimtbl +ON coalesce(jointbl.test_id, testtbl.id) = dimtbl.id +WHERE +testtbl.zip = 94611 and coalesce(jointbl.test_zip, 0) = 0; +---- PLAN +PLAN-ROOT SINK +| +04:HASH JOIN [LEFT OUTER JOIN] +| hash predicates: coalesce(jointbl.test_id, testtbl.id) = dimtbl.id +| other predicates: coalesce(jointbl.test_zip, 0) = 0 +| row-size=32B cardinality=19 +| +|--02:SCAN HDFS [functional.dimtbl] +| HDFS partitions=1/1 files=1 size=171B +| row-size=8B cardinality=10 +| +03:HASH JOIN [RIGHT OUTER JOIN] +| hash predicates: jointbl.test_id = testtbl.id +| runtime filters: RF000 <- testtbl.id +| row-size=24B cardinality=19 +| +|--01:SCAN HDFS [functional.testtbl] +| HDFS partitions=1/1 files=0 size=0B +| predicates: testtbl.zip = 94611 +| row-size=12B cardinality=0 +| +00:SCAN HDFS [functional.jointbl] + HDFS partitions=1/1 files=1 size=433B + runtime filters: RF000 -> jointbl.test_id + row-size=12B cardinality=19 ==== \ No newline at end of file diff --git a/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test b/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test index f7d2933ca..d81303e71 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test +++ b/testdata/workloads/functional-query/queries/QueryTest/outer-to-inner-joins.test @@ -282,4 +282,53 @@ END 12000 ---- TYPES bigint +==== +---- QUERY +SELECT COUNT(*) +FROM functional.nullrows t1 + FULL JOIN functional.nullrows t2 ON t1.id = t2.id + FULL JOIN functional.nullrows t3 ON coalesce(t1.id, t2.id) = t3.id +WHERE t1.group_str = 'a' + AND coalesce(t2.group_str, 'f') = 'f' +---- RESULTS +0 +---- TYPES +bigint +==== +---- QUERY +SELECT COUNT(*) +FROM functional.nullrows t1 + FULL JOIN functional.nullrows t2 ON t1.id = t2.id + FULL JOIN functional.nullrows t3 ON coalesce(t1.id, t2.id) = t3.id +WHERE t2.group_str = 'a' + AND coalesce(t1.group_str, 'f') = 'f' + AND t3.group_str = 'a' +---- RESULTS +0 +---- TYPES +bigint +==== +---- QUERY +SELECT COUNT(*) +FROM functional.nullrows t1 + RIGHT JOIN functional.nullrows t2 ON t1.id = t2.id + RIGHT JOIN functional.nullrows t3 ON coalesce(t1.id, t2.id) = t3.id +WHERE t2.group_str = 'a' + AND coalesce(t1.group_str, 'f') = 'f' +---- RESULTS +0 +---- TYPES +bigint +==== +---- QUERY +SELECT COUNT(*) +FROM functional.nullrows t1 + FULL JOIN functional.nullrows t2 ON t1.id = t2.id + FULL JOIN functional.nullrows t3 ON coalesce(t1.id, t2.id) = t3.id +WHERE t2.group_str = 'a' + AND coalesce(t1.group_str, 'f') = 'f' +---- RESULTS +0 +---- TYPES +bigint ==== \ No newline at end of file
