This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new df5c406 IMPALA-9162: Do not apply inferred predicate to outer joins
df5c406 is described below
commit df5c4061456abb947cec8add81b361b60c5d3ad8
Author: Aman Sinha <[email protected]>
AuthorDate: Sat Nov 30 13:20:19 2019 -0500
IMPALA-9162: Do not apply inferred predicate to outer joins
When the planner migrates predicates to inline views, it also creates
equivalent predicates based on the value transfer graph which is built
by transitive relationships among join conditions. These newly inferred
predicates are typically placed as 'other predicates' of an inner or
outer join.
However, for outer joins, this has the effect of adding extra predicates
in the WHERE clause, which is incorrect since it may filter NULL values.
Since the original query did not have null filtering conditions in
the WHERE clause, we should not add new ones. In this fix we do the
following: during the migration of conjuncts to inline views, analyze
the predicate of type A <op> B and if it is an inferred predicate AND
either the left or right slots reference the output tuple of an outer
join, the inferred predicate is ignored.
Note that simple queries with a combination of inner and outer joins may
not reproduce the problem. Due to the nature of predicate inferencing,
some combination of subqueries, inner joins, and outer joins is needed. For
the query pattern, please see the example in the JIRA.
Tests:
- Added plan tests with left and full outer joins to inline-view.test
- One baseline plan in inline-view.test had to be updated
- Manually ran a few queries in impala-shell to verify result
correctness by checking that NULL values are being produced for outer
joins.
- Ran regression tests on jenkins
Change-Id: Ie9521bd768c4b333069c34d5c1e11b10ea535827
Reviewed-on: http://gerrit.cloudera.org:8080/14813
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
.../apache/impala/planner/SingleNodePlanner.java | 38 +++++++++
.../queries/PlannerTest/inline-view.test | 93 +++++++++++++++++++++-
2 files changed, 130 insertions(+), 1 deletion(-)
diff --git a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
index eeb6ef9..b61830d 100644
--- a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
+++ b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
@@ -24,6 +24,7 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
+import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
@@ -1242,6 +1243,9 @@ public class SingleNodePlanner {
List<Expr> viewPredicates =
Expr.substituteList(preds, inlineViewRef.getSmap(), analyzer, false);
+ // perform any post-processing of the predicates before registering
+ removeDisqualifyingInferredPreds(inlineViewRef.getAnalyzer(),
viewPredicates);
+
// Unset the On-clause flag of the migrated conjuncts because the migrated
conjuncts
// apply to the post-join/agg/analytic result of the inline view.
for (Expr e: viewPredicates) e.setIsOnClauseConjunct(false);
@@ -1255,6 +1259,40 @@ public class SingleNodePlanner {
}
/**
+ * Analyze the predicates in the context of the inline view for certain disqualifying
+ * conditions and remove such predicates from the input list. One such condition is
+ * that the predicate is an inferred predicate AND either its left or right SlotRef
+ * references the output of an outer join. Note that although such predicates
+ * may have been detected at the time of creating the value transfer graph
+ * (in the Analyzer), we do this check here anyway as a safety net in case any such
+ * predicate 'fell through' to this stage.
+ *
+ * 'preds' is modified in place: an inferred BinaryPredicate is removed when the
+ * tuple of either slot returned by getEqSlots() is reported as outer joined by
+ * 'analyzer', and a warning is logged for each removal. Inferred predicates for
+ * which getEqSlots() returns null, as well as all other expressions, are left in
+ * the list untouched.
+ */
+ private void removeDisqualifyingInferredPreds(Analyzer analyzer, List<Expr>
preds) {
+ ListIterator<Expr> iter = preds.listIterator();
+ while (iter.hasNext()) {
+ Expr e = iter.next();
+ if (e instanceof BinaryPredicate && ((BinaryPredicate)e).isInferred()) {
+ BinaryPredicate p = (BinaryPredicate)e;
+ Pair<SlotId, SlotId> slots = p.getEqSlots();
+ if (slots == null) continue;
+ TupleId leftParent = analyzer.getTupleId(slots.first);
+ TupleId rightParent = analyzer.getTupleId(slots.second);
+ // check if either the left parent or right parent is an outer joined
tuple
+ // Note: strictly, we may be ok to check only for the null producing
+ // side but we are being conservative here to check both sides. With
+ // additional testing we could potentially relax this.
+ if (analyzer.isOuterJoined(leftParent) ||
+ analyzer.isOuterJoined(rightParent)) {
+ iter.remove();
+ LOG.warn("Removed inferred predicate " + p.toSql() + " from the list
of " +
+ "predicates considered for inline view because either the
left " +
+ "or right side is derived from an outer join output.");
+ }
+ }
+ }
+ }
+
+ /**
* Checks if conjuncts can be migrated into an inline view.
*/
private boolean canMigrateConjuncts(InlineViewRef inlineViewRef) {
diff --git
a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
index 872821d..4d2ba6d 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
@@ -1410,7 +1410,7 @@ PLAN-ROOT SINK
|
04:HASH JOIN [FULL OUTER JOIN]
| hash predicates: a.id = c.id
-| other predicates: a.id < b.id, a.id = b.id
+| other predicates: a.id < b.id
| row-size=12B cardinality=7.40K
|
|--02:SCAN HDFS [functional.alltypessmall c]
@@ -2220,3 +2220,94 @@ PLAN-ROOT SINK
HDFS partitions=4/4 files=4 size=460B
row-size=4B cardinality=8
====
+# IMPALA-9162: Should not add extra predicates to the WHERE part of a left
outer join
+select x.* from (select v1.c2, v1.max_c2 from functional.alltypessmall t
+ left join (
+ with iv1 AS (SELECT smallint_col c1, bigint_col c2
+ FROM functional.alltypessmall
+ group by c1, c2),
+ iv2 AS (SELECT smallint_col r_c1, max(bigint_col) max_c2
+ FROM functional.alltypessmall
+ group by r_c1)
+ select iv1.c2, iv2.max_c2 FROM iv1, iv2 where iv1.c2 = iv2.max_c2) as v1
+ on t.smallint_col=v1.c2) as x;
+---- PLAN
+PLAN-ROOT SINK
+|
+06:HASH JOIN [RIGHT OUTER JOIN]
+| hash predicates: bigint_col = t.smallint_col
+| runtime filters: RF000 <- t.smallint_col
+| row-size=22B cardinality=81
+|
+|--00:SCAN HDFS [functional.alltypessmall t]
+| HDFS partitions=4/4 files=4 size=6.32KB
+| row-size=2B cardinality=81
+|
+05:HASH JOIN [INNER JOIN]
+| hash predicates: bigint_col = max(bigint_col)
+| runtime filters: RF002 <- max(bigint_col)
+| row-size=20B cardinality=81
+|
+|--04:AGGREGATE [FINALIZE]
+| | output: max(bigint_col)
+| | group by: smallint_col
+| | row-size=10B cardinality=81
+| |
+| 03:SCAN HDFS [functional.alltypessmall]
+| HDFS partitions=4/4 files=4 size=6.32KB
+| row-size=10B cardinality=81
+|
+02:AGGREGATE [FINALIZE]
+| group by: smallint_col, bigint_col
+| row-size=10B cardinality=81
+|
+01:SCAN HDFS [functional.alltypessmall]
+ HDFS partitions=4/4 files=4 size=6.32KB
+ runtime filters: RF000 -> functional.alltypessmall.bigint_col, RF002 ->
functional.alltypessmall.bigint_col
+ row-size=10B cardinality=81
+====
+# IMPALA-9162: Should not add extra predicates to the WHERE part of a full
outer join
+select x.* from (select v1.c2, v1.max_c2 from functional.alltypessmall t
+ full outer join (
+ with iv1 AS (SELECT smallint_col c1, bigint_col c2
+ FROM functional.alltypessmall
+ group by c1, c2),
+ iv2 AS (SELECT smallint_col r_c1, max(bigint_col) max_c2
+ FROM functional.alltypessmall
+ group by r_c1)
+ select iv1.c2, iv2.max_c2 FROM iv1, iv2 where iv1.c2 = iv2.max_c2) as v1
+ on t.smallint_col=v1.c2) as x;
+---- PLAN
+PLAN-ROOT SINK
+|
+06:HASH JOIN [FULL OUTER JOIN]
+| hash predicates: bigint_col = t.smallint_col
+| row-size=22B cardinality=81
+|
+|--00:SCAN HDFS [functional.alltypessmall t]
+| HDFS partitions=4/4 files=4 size=6.32KB
+| row-size=2B cardinality=81
+|
+05:HASH JOIN [INNER JOIN]
+| hash predicates: bigint_col = max(bigint_col)
+| runtime filters: RF000 <- max(bigint_col)
+| row-size=20B cardinality=81
+|
+|--04:AGGREGATE [FINALIZE]
+| | output: max(bigint_col)
+| | group by: smallint_col
+| | row-size=10B cardinality=81
+| |
+| 03:SCAN HDFS [functional.alltypessmall]
+| HDFS partitions=4/4 files=4 size=6.32KB
+| row-size=10B cardinality=81
+|
+02:AGGREGATE [FINALIZE]
+| group by: smallint_col, bigint_col
+| row-size=10B cardinality=81
+|
+01:SCAN HDFS [functional.alltypessmall]
+ HDFS partitions=4/4 files=4 size=6.32KB
+ runtime filters: RF000 -> functional.alltypessmall.bigint_col
+ row-size=10B cardinality=81
+====