[impala] 02/03: IMPALA-10182: Don't add inferred identity predicates to SELECT node

tarmstrong Tue, 05 Jan 2021 15:40:36 -0800

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit 49680559b0da843fbb6ff949d52c9d43f98364b1
Author: Aman Sinha <[email protected]>
AuthorDate: Sun Jan 3 15:58:57 2021 -0800

    IMPALA-10182: Don't add inferred identity predicates to SELECT node
    
    For an inferred equality predicate of the form c1 = c2, if both sides
    refer to the same underlying tuple and slot, it is an identity
    predicate that should not be evaluated by the SELECT node, since it
    would incorrectly eliminate NULL rows. This patch fixes the behavior.
    
    Testing:
     - Added planner tests with base table and with outer join
     - Added runtime tests with base table and with outer join
     - Added planner test for IMPALA-9694 (same root cause)
     - Ran PlannerTest .. no other plans changed
    
    Change-Id: I924044f582652dbc50085851cc639f3dee1cd1f4
    Reviewed-on: http://gerrit.cloudera.org:8080/16917
    Reviewed-by: Aman Sinha <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../apache/impala/planner/SingleNodePlanner.java   |  18 +++-
 .../queries/PlannerTest/inline-view.test           | 113 +++++++++++++++++++++
 .../queries/QueryTest/inline-view.test             |  36 +++++++
 3 files changed, 166 insertions(+), 1 deletion(-)

diff --git a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
index bb00470..2a85d9e 100644
--- a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
+++ b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
@@ -467,8 +467,24 @@ public class SingleNodePlanner {
       analyzer.createEquivConjuncts(tid, conjuncts);
     }
     if (conjuncts.isEmpty()) return root;
+
+    List<Expr> finalConjuncts = new ArrayList<>();
+    // Check if this is an inferred identity predicate, i.e. for c1 = c2 both
+    // sides are pointing to the same source slot. In such cases it is wrong
+    // to add the predicate to the SELECT node because it will incorrectly
+    // eliminate rows with NULL values.
+    for (Expr e : conjuncts) {
+      if (e instanceof BinaryPredicate && ((BinaryPredicate) e).isInferred()) {
+        SlotDescriptor lhs = ((BinaryPredicate) e).getChild(0).findSrcScanSlot();
+        SlotDescriptor rhs = ((BinaryPredicate) e).getChild(1).findSrcScanSlot();
+        if (lhs != null && rhs != null && lhs.equals(rhs)) continue;
+      }
+      finalConjuncts.add(e);
+    }
+    if (finalConjuncts.isEmpty()) return root;
+
     // evaluate conjuncts in SelectNode
-    SelectNode selectNode = new SelectNode(ctx_.getNextNodeId(), root, conjuncts);
+    SelectNode selectNode = new SelectNode(ctx_.getNextNodeId(), root, finalConjuncts);
     // init() marks conjuncts as assigned
     selectNode.init(analyzer);
     Preconditions.checkState(selectNode.hasValidStats());
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
index 25f7ea7..0d083e3 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
@@ -2484,3 +2484,116 @@ PLAN-ROOT SINK
    HDFS partitions=4/4 files=4 size=6.32KB
    row-size=89B cardinality=100
 ====
+# IMPALA-10182: Nulls get eliminated with union-all for duplicate columns
+select c1, c2 from (select tinyint_col c1, tinyint_col c2
+  from functional.alltypesagg group by 1, 2) t1
+  group by 1, 2
+ union all
+select c1, c2 from (select tinyint_col c1, tinyint_col c2
+  from functional.alltypesagg group by 1, 2) t1
+  group by 1, 2;
+---- PLAN
+PLAN-ROOT SINK
+|
+00:UNION
+|  pass-through-operands: all
+|  row-size=2B cardinality=18
+|
+|--06:AGGREGATE [FINALIZE]
+|  |  group by: tinyint_col, tinyint_col
+|  |  row-size=2B cardinality=9
+|  |
+|  05:AGGREGATE [FINALIZE]
+|  |  group by: tinyint_col
+|  |  row-size=1B cardinality=9
+|  |
+|  04:SCAN HDFS [functional.alltypesagg]
+|     HDFS partitions=11/11 files=11 size=814.73KB
+|     row-size=1B cardinality=11.00K
+|
+03:AGGREGATE [FINALIZE]
+|  group by: tinyint_col, tinyint_col
+|  row-size=2B cardinality=9
+|
+02:AGGREGATE [FINALIZE]
+|  group by: tinyint_col
+|  row-size=1B cardinality=9
+|
+01:SCAN HDFS [functional.alltypesagg]
+   HDFS partitions=11/11 files=11 size=814.73KB
+   row-size=1B cardinality=11.00K
+====
+# IMPALA-10182: Nulls get eliminated with union-all for duplicate columns
+# Introduce nulls from the null producing side of left outer join
+with dt1 as (select t2.int_col y from functional.alltypessmall t1
+  left outer join functional.alltypestiny t2 on t1.int_col = t2.int_col)
+select c1, c2 from (select dt1.y c1, dt1.y c2 from dt1 group by 1, 2) t1
+  group by 1, 2
+ union all
+select c1, c2 from (select dt1.y c1, dt1.y c2 from dt1 group by 1, 2) t1
+  group by 1, 2;
+---- PLAN
+PLAN-ROOT SINK
+|
+00:UNION
+|  pass-through-operands: all
+|  row-size=8B cardinality=4
+|
+|--10:AGGREGATE [FINALIZE]
+|  |  group by: dt1.y, dt1.y
+|  |  row-size=8B cardinality=2
+|  |
+|  09:AGGREGATE [FINALIZE]
+|  |  group by: t2.int_col
+|  |  row-size=4B cardinality=2
+|  |
+|  08:HASH JOIN [LEFT OUTER JOIN]
+|  |  hash predicates: t1.int_col = t2.int_col
+|  |  row-size=8B cardinality=100
+|  |
+|  |--07:SCAN HDFS [functional.alltypestiny t2]
+|  |     HDFS partitions=4/4 files=4 size=460B
+|  |     row-size=4B cardinality=8
+|  |
+|  06:SCAN HDFS [functional.alltypessmall t1]
+|     HDFS partitions=4/4 files=4 size=6.32KB
+|     row-size=4B cardinality=100
+|
+05:AGGREGATE [FINALIZE]
+|  group by: dt1.y, dt1.y
+|  row-size=8B cardinality=2
+|
+04:AGGREGATE [FINALIZE]
+|  group by: t2.int_col
+|  row-size=4B cardinality=2
+|
+03:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: t1.int_col = t2.int_col
+|  row-size=8B cardinality=100
+|
+|--02:SCAN HDFS [functional.alltypestiny t2]
+|     HDFS partitions=4/4 files=4 size=460B
+|     row-size=4B cardinality=8
+|
+01:SCAN HDFS [functional.alltypessmall t1]
+   HDFS partitions=4/4 files=4 size=6.32KB
+   row-size=4B cardinality=100
+====
+# IMPALA-9694 IllegalStateException was being thrown
+# (related to the same root cause as IMPALA-10182 above)
+# Note that the analytic function does not show up in the final plan.
+# It is optimized out since the caller is only computing count(*)
+select count(*) from (select
+  lead(timestamp_col) over (partition by int_col order by timestamp_col) c1,
+  int_col c2, int_col c3 from functional.alltypesagg) v;
+---- PLAN
+PLAN-ROOT SINK
+|
+01:AGGREGATE [FINALIZE]
+|  output: count(*)
+|  row-size=8B cardinality=1
+|
+00:SCAN HDFS [functional.alltypesagg]
+   HDFS partitions=11/11 files=11 size=814.73KB
+   row-size=0B cardinality=11.00K
+====
\ No newline at end of file
diff --git a/testdata/workloads/functional-query/queries/QueryTest/inline-view.test b/testdata/workloads/functional-query/queries/QueryTest/inline-view.test
index 284d3ea..299db84 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/inline-view.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/inline-view.test
@@ -555,3 +555,39 @@ order by 1 limit 5
 ---- TYPES
 INT,INT,TINYINT
 ====
+---- QUERY
+# IMPALA-10182: Nulls get eliminated with union-all for duplicate columns
+select count(*) from (
+select c1, c2 from (select tinyint_col c1, tinyint_col c2
+  from alltypesagg group by 1, 2) t1
+  group by 1, 2
+ union all
+select c1, c2 from (select tinyint_col c1, tinyint_col c2
+  from alltypesagg group by 1, 2) t1
+  group by 1, 2) tt;
+---- RESULTS
+20
+---- TYPES
+BIGINT
+====
+---- QUERY
+# IMPALA-10182: Nulls get eliminated with union-all for duplicate columns
+# Introduce nulls from the null producing side of left outer join
+with dt1 as (select t2.int_col y from alltypessmall t1
+  left outer join alltypestiny t2 on t1.int_col = t2.int_col)
+select * from (
+select c1, c2 from (select dt1.y c1, dt1.y c2 from dt1 group by 1, 2) t1
+  group by 1, 2
+ union all
+select c1, c2 from (select dt1.y c1, dt1.y c2 from dt1 group by 1, 2) t1
+  group by 1, 2) tt order by 1, 2;
+---- RESULTS
+0,0
+0,0
+1,1
+1,1
+NULL,NULL
+NULL,NULL
+---- TYPES
+INT, INT
+====
\ No newline at end of file

[impala] 02/03: IMPALA-10182: Don't add inferred identity predicates to SELECT node

Reply via email to