This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit c1aac4b3a4fb616763cedc59648cfde6e8f5ec70
Author: Daniel Becker <[email protected]>
AuthorDate: Tue Apr 1 10:18:51 2025 +0200

    IMPALA-13873: Missing equivalence conjunct in aggregation node with inline 
views
    
    Some queries involving plain (distinct) UNIONs miss conjuncts, leading
    to incorrect results:
    
    Example:
      WITH u1 AS (select 10 a, 10 b),
      t AS (select a, b, min(b) over (partition by a) min_b from u1 UNION
      select 10, 10, 20)
      select t.* from t where t.b = t.min_b;
    
    Expected result:
      +----+----+-------+
      | a  | b  | min_b |
      +----+----+-------+
      | 10 | 10 | 10    |
      +----+----+-------+
    
    Actual result:
      +----+----+-------+
      | a  | b  | min_b |
      +----+----+-------+
      | 10 | 10 | 10    |
      | 10 | 20 | 10    |
      +----+----+-------+
    
    This is caused by MultiAggregateInfo assuming that conjuncts bound by
    grouping slots that are produced by SlotRef grouping expressions are
    already evaluated below the AggregationNode. However, this is not true
    in all cases: with UNIONs, there may be conjuncts that are unassigned
    below the AggregationNode.
    
    This may happen if a conjunct cannot be pushed into all operands of a
    UNION, because the source tuples in the operands do not contain all of
    the slots referenced by the predicate. In the example above, it happens
    in the first operand:
      select a, b, min(b) over (partition by a) min_b from u1
    The source tuple, 'u1', contains only two slots ('a' and 'b'), but does
    not contain a slot corresponding to 'min(b)' - therefore the predicate
    't.b = t.min_b' is not bound by the tuple of 'u1'. In theory, the
    predicate could still be evaluated directly after materialising the
    tuple with 'min(b)', still inside the UNION operand, but Impala
    currently does not work that way.
    
    In these cases, the conjuncts need to be evaluated in the
    AggregationNode (possibly in addition to some of the UNION operands).
    
    This change fixes this problem by introducing a method in
    MultiAggregateInfo: 'setConjunctsToKeep()', where the caller can pass a
    list of conjuncts that will not be eliminated. This is called during the
    planning of the UNION if there are unassigned conjuncts remaining.
    
    Testing:
     - Added a PlannerTest and an EE test for the case where a conjunct
       was previously incorrectly removed from the AggregationNode.
     - Existing tests cover the case when conjuncts can be safely removed
       from an AggregationNode above a UnionNode because the conjuncts are
       pushed into all union operands, see for example
       
https://github.com/apache/impala/blob/6f2d9a2/testdata/workloads/functional-planner/queries/PlannerTest/union.test#L3914
    
    Change-Id: I67a59cd96d83181ce249fd6ca141906f549a09b3
    Reviewed-on: http://gerrit.cloudera.org:8080/22746
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../apache/impala/analysis/MultiAggregateInfo.java |  24 ++++
 .../apache/impala/planner/SingleNodePlanner.java   |   9 +-
 .../queries/PlannerTest/union.test                 | 125 +++++++++++++++++++++
 .../queries/QueryTest/aggregation.test             |  45 ++++++++
 4 files changed, 202 insertions(+), 1 deletion(-)

diff --git 
a/fe/src/main/java/org/apache/impala/analysis/MultiAggregateInfo.java 
b/fe/src/main/java/org/apache/impala/analysis/MultiAggregateInfo.java
index 24c12fda9..9530b7aae 100644
--- a/fe/src/main/java/org/apache/impala/analysis/MultiAggregateInfo.java
+++ b/fe/src/main/java/org/apache/impala/analysis/MultiAggregateInfo.java
@@ -133,6 +133,11 @@ public class MultiAggregateInfo {
   // Result of substituting 'groupingExprs_' with the output smap of the 
AggregationNode.
   private List<Expr> substGroupingExprs_;
 
+  // These conjuncts are kept even if they would otherwise be removed as 
redundant. This
+  // can be used for example when a conjunct cannot be pushed into all 
operands of a UNION
+  // node below the AggregationNode, so it needs to be evaluated by the 
AggregationNode.
+  private List<Expr> conjunctsToKeep_;
+
   // Results of analyze():
 
   // Aggregation classes and their AggregateInfos. If there is a class with 
non-distinct
@@ -220,6 +225,10 @@ public class MultiAggregateInfo {
     return new MultiAggregateInfo(distinctAggInfo);
   }
 
+  public void setConjunctsToKeep(List<Expr> conjuncts) {
+    conjunctsToKeep_ = conjuncts;
+  }
+
   public void analyze(Analyzer analyzer) throws AnalysisException {
     if (isAnalyzed_) return;
 
@@ -1024,7 +1033,22 @@ public class MultiAggregateInfo {
     if (markAssigned) analyzer.markConjunctsAssigned(unassignedConjuncts);
     result.addAll(bindingPredicates);
     result.addAll(unassignedConjuncts);
+
+    // Select those conjuncts in 'result' that we need to keep.
+    List<Expr> origConjunctsToKeep = new ArrayList<>();
+    if (conjunctsToKeep_ != null) {
+      for (Expr conj : result) {
+        if (conjunctsToKeep_.contains(conj)) {
+          origConjunctsToKeep.add(conj);
+        }
+      }
+    }
+
     analyzer.createEquivConjuncts(tid, result, groupBySids);
+
+    // Add back conjuncts that we need to keep but may have been removed.
+    if (origConjunctsToKeep != null) result.addAll(origConjunctsToKeep);
+
     Expr.removeDuplicates(result);
     return result;
   }
diff --git a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java 
b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
index 921cdd8cf..07333895e 100644
--- a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
+++ b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
@@ -2346,8 +2346,15 @@ public class SingleNodePlanner implements 
SingleNodePlannerIntf {
     if (unionStmt.hasUnionDistinctOps()) {
       result = createUnionPlan(
           analyzer, unionStmt, unionStmt.getUnionDistinctOperands(), null);
+
+      MultiAggregateInfo unionDistinctAggInfo = unionStmt.getDistinctAggInfo();
+
+      // Make sure MultiAggregateInfo does not remove conjuncts that are 
unassigned at
+      // this point as redundant.
+      
unionDistinctAggInfo.setConjunctsToKeep(analyzer.getUnassignedConjuncts(result));
+
       result = new AggregationNode(
-          ctx_.getNextNodeId(), result, unionStmt.getDistinctAggInfo(), 
AggPhase.FIRST);
+          ctx_.getNextNodeId(), result, unionDistinctAggInfo, AggPhase.FIRST);
       result.init(analyzer);
     }
     // create ALL tree
diff --git 
a/testdata/workloads/functional-planner/queries/PlannerTest/union.test 
b/testdata/workloads/functional-planner/queries/PlannerTest/union.test
index e004dfc25..fc5ed39a0 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/union.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/union.test
@@ -4634,3 +4634,128 @@ Per-Host Resources: mem-estimate=80.00MB 
mem-reservation=128.00KB thread-reserva
    tuple-ids=0 row-size=4B cardinality=11.00K
    in pipelines: 00(GETNEXT)
 ====
+# Regression test for IMPALA-13873: the conjunct 'b = min_b' in the 
aggregation node
+# should not be discarded.
+WITH u1 AS (select 10 a, 10 b),
+t AS (select a, b, min(b) over (partition by a) min_b from u1 UNION select 10, 
10, 20)
+select t.* from t where t.b = t.min_b;
+---- QUERYOPTIONS
+explain_level=2
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=20.00MB mem-reservation=11.94MB 
thread-reservation=1
+PLAN-ROOT SINK
+|  output exprs: a, b, min_b
+|  mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB 
thread-reservation=0
+|
+04:AGGREGATE [FINALIZE]
+|  group by: a, b, min_b
+|  having: b = min_b
+|  mem-estimate=10.00MB mem-reservation=1.94MB spill-buffer=64.00KB 
thread-reservation=0
+|  tuple-ids=2 row-size=3B cardinality=1
+|  in pipelines: 04(GETNEXT), 02(OPEN)
+|
+00:UNION
+|  constant-operands=1
+|  mem-estimate=0B mem-reservation=0B thread-reservation=0
+|  tuple-ids=2 row-size=3B cardinality=2
+|  in pipelines: 02(GETNEXT)
+|
+03:ANALYTIC
+|  functions: min(b)
+|  partition by: a
+|  mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB 
thread-reservation=0
+|  tuple-ids=5,4 row-size=3B cardinality=1
+|  in pipelines: 02(GETNEXT)
+|
+02:SORT
+|  order by: a ASC NULLS LAST
+|  mem-estimate=6.00MB mem-reservation=6.00MB spill-buffer=2.00MB 
thread-reservation=0
+|  tuple-ids=5 row-size=2B cardinality=1
+|  in pipelines: 02(GETNEXT)
+|
+01:UNION
+   constant-operands=1
+   mem-estimate=0B mem-reservation=0B thread-reservation=0
+   tuple-ids=0 row-size=2B cardinality=1
+   in pipelines: <none>
+====
+# Regression test for IMPALA-13873: the conjunct 'b = min_b' in the 
aggregation node
+# should not be discarded.
+# Querying from tables, not inline expressions.
+WITH t AS (
+  select d2 a, d3 b, min(d3) over (partition by d2) min_b from 
functional.decimal_tbl
+  UNION
+  select smallint_col, int_col, bigint_col from functional.alltypestiny
+)
+select t.* from t where t.b = t.min_b;
+---- QUERYOPTIONS
+explain_level=1
+---- PLAN
+PLAN-ROOT SINK
+|
+05:AGGREGATE [FINALIZE]
+|  group by: a, b, min_b
+|  having: b = min_b
+|  row-size=40B cardinality=1
+|
+00:UNION
+|  row-size=40B cardinality=4
+|
+|--04:SCAN HDFS [functional.alltypestiny]
+|     HDFS partitions=4/4 files=4 size=460B
+|     predicates: functional.alltypestiny.int_col = 
functional.alltypestiny.bigint_col
+|     row-size=14B cardinality=1
+|
+03:ANALYTIC
+|  functions: min(d3)
+|  partition by: d2
+|  row-size=40B cardinality=3
+|
+02:SORT
+|  order by: d2 ASC NULLS LAST
+|  row-size=24B cardinality=3
+|
+01:SCAN HDFS [functional.decimal_tbl]
+   HDFS partitions=1/1 files=1 size=195B
+   row-size=24B cardinality=3
+====
+# Regression test for IMPALA-13873: the conjunct 'b = min_b' in the 
aggregation node
+# should not be discarded.
+# The predicate cannot be pushed down to any UNION operand.
+WITH u1 AS (select tinyint_col, id from functional.alltypestiny),
+t AS (
+  select tinyint_col, id, min(id) over (partition by tinyint_col) min_id from 
u1
+  UNION
+  select tinyint_col, id, id+100 as min_id from u1)
+select t.* from t where t.id = t.min_id;
+---- QUERYOPTIONS
+explain_level=1
+---- PLAN
+PLAN-ROOT SINK
+|
+05:AGGREGATE [FINALIZE]
+|  group by: tinyint_col, id, min_id
+|  having: id = min_id
+|  row-size=13B cardinality=2
+|
+00:UNION
+|  row-size=13B cardinality=16
+|
+|--04:SCAN HDFS [functional.alltypestiny]
+|     HDFS partitions=4/4 files=4 size=460B
+|     row-size=5B cardinality=8
+|
+03:ANALYTIC
+|  functions: min(id)
+|  partition by: tinyint_col
+|  row-size=9B cardinality=8
+|
+02:SORT
+|  order by: tinyint_col ASC NULLS LAST
+|  row-size=5B cardinality=8
+|
+01:SCAN HDFS [functional.alltypestiny]
+   HDFS partitions=4/4 files=4 size=460B
+   row-size=5B cardinality=8
+====
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test 
b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
index cb20e8c6d..8839edba0 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
@@ -2911,3 +2911,48 @@ select s_store_sk, regr_count(s_number_employees, 
s_floor_space) over (partition
 ---- TYPES
 int,bigint
 ====
+---- QUERY
+# Regression test for IMPALA-13873: the conjunct 'b = min_b' in the 
aggregation node
+# should not be discarded.
+WITH u1 AS (select 10 a, 10 b),
+t AS (select a, b, min(b) over (partition by a) min_b from u1 UNION select 10, 
10, 20)
+select t.* from t where t.b = t.min_b;
+---- RESULTS
+10,10,10
+---- TYPES
+TINYINT,TINYINT,TINYINT
+====
+---- QUERY
+# Regression test for IMPALA-13873: the conjunct 'b = min_b' in the 
aggregation node
+# should not be discarded.
+# Querying from tables, not inline expressions.
+WITH t AS (
+  select d2 a, d3 b, min(d3) over (partition by d2) min_b from 
functional.decimal_tbl
+  UNION
+  select smallint_col, int_col, bigint_col from functional.alltypestiny
+)
+select t.* from t where t.b = t.min_b;
+---- RESULTS
+2222,1.2345678900,1.2345678900
+0,0.0000000000,0.0000000000
+333,123.4567890000,123.4567890000
+111,12.3456789000,12.3456789000
+---- TYPES
+decimal,decimal,decimal
+====
+---- QUERY
+# Regression test for IMPALA-13873: the conjunct 'b = min_b' in the 
aggregation node
+# should not be discarded.
+# The predicate cannot be pushed down to any UNION operand.
+WITH u1 AS (select tinyint_col, id from functional.alltypestiny),
+t AS (
+  select tinyint_col, id, min(id) over (partition by tinyint_col) min_id from 
u1
+  UNION
+  select tinyint_col, id, id+100 as min_id from u1)
+select t.* from t where t.id = t.min_id;
+---- RESULTS
+1,1,1
+0,0,0
+---- TYPES
+tinyint,int,bigint
+====

Reply via email to