(hive) branch master updated: HIVE-28254: CBO (Calcite Return Path): Multiple DISTINCT leads to wrong results (Shohei Okumiya, reviewed by Krisztian Kasa)

krisztiankasa Tue, 04 Jun 2024 00:30:21 -0700

This is an automated email from the ASF dual-hosted git repository.

krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git



The following commit(s) were added to refs/heads/master by this push:
     new 4bc36eb40eb HIVE-28254: CBO (Calcite Return Path): Multiple DISTINCT 
leads to wrong results (Shohei Okumiya, reviewed by Krisztian Kasa)
4bc36eb40eb is described below

commit 4bc36eb40ebd16fb0b79f19ec4471549cb42a177
Author: okumin <[email protected]>
AuthorDate: Tue Jun 4 16:29:28 2024 +0900

    HIVE-28254: CBO (Calcite Return Path): Multiple DISTINCT leads to wrong 
results (Shohei Okumiya, reviewed by Krisztian Kasa)
---
 .../translator/opconventer/HiveGBOpConvUtil.java   |  73 +++++++++----
 .../cbo_rp_groupby3_noskew_multi_distinct.q        |  22 ++++
 .../cbo_rp_groupby3_noskew_multi_distinct.q.out    | 115 +++++++++++++++++++++
 .../test/results/clientpositive/llap/count.q.out   |   2 +-
 4 files changed, 190 insertions(+), 22 deletions(-)

diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveGBOpConvUtil.java
 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveGBOpConvUtil.java
index a215dec1148..5142f4f41be 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveGBOpConvUtil.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveGBOpConvUtil.java
@@ -1028,32 +1028,65 @@ final class HiveGBOpConvUtil {
     List<ExprNodeDesc> reduceValues = rs.getConf().getValueCols();
     ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
     int udafColStartPosInOriginalGB = gbInfo.gbKeys.size();
-    // the positions in rsColInfoLst are as follows
-    // --grpkey--,--distkey--,--values--
-    // but distUDAF may be before/after some non-distUDAF,
-    // i.e., their positions can be mixed.
-    // so for all UDAF we first check to see if it is groupby key, if not is 
it distinct key
-    // if not it should be value
-    Map<Integer, List<ExprNodeDesc>> indexToParameter = new TreeMap<>();
+
+    final List<List<ColumnInfo>> paramColInfoTable = new 
ArrayList<>(gbInfo.udafAttrs.size());
+    final List<List<String>> distinctColumnNameTable = new 
ArrayList<>(gbInfo.udafAttrs.size());
+    final Map<ColumnInfo, String> distinctColumnMapping = new HashMap<>();
     for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
-      UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
-      ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
+      final UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
+      final List<ColumnInfo> paramColInfo = new 
ArrayList<>(udafAttr.udafParams.size());
+      final List<String> distinctColNames = new 
ArrayList<>(udafAttr.udafParams.size());
 
-      ColumnInfo rsUDAFParamColInfo;
-      ExprNodeDesc udafParam;
-      ExprNodeDesc constantPropDistinctUDAFParam;
       for (int j = 0; j < udafAttr.udafParams.size(); j++) {
-        int argPos = getColInfoPos(udafAttr.udafParams.get(j), gbInfo);
-        rsUDAFParamColInfo = rsColInfoLst.get(argPos);
-        String rsUDAFParamName = rsUDAFParamColInfo.getInternalName();
+        final int argPos = getColInfoPos(udafAttr.udafParams.get(j), gbInfo);
+        final ColumnInfo rsUDAFParamColInfo = rsColInfoLst.get(argPos);
+        paramColInfo.add(rsUDAFParamColInfo);
 
+        final String distinctColumnName;
         if (udafAttr.isDistinctUDAF && lastReduceKeyColName != null) {
-          rsUDAFParamName = Utilities.ReduceField.KEY.name() + "." + 
lastReduceKeyColName
+          distinctColumnName = Utilities.ReduceField.KEY.name() + "." + 
lastReduceKeyColName
                   + ":" + numDistinctUDFs + "." + 
SemanticAnalyzer.getColumnInternalName(j);
+          distinctColumnMapping.putIfAbsent(rsUDAFParamColInfo, 
distinctColumnName);
+        } else {
+          distinctColumnName = null;
+        }
+        distinctColNames.add(distinctColumnName);
+      }
+
+      paramColInfoTable.add(paramColInfo);
+      distinctColumnNameTable.add(distinctColNames);
+
+      if(udafAttr.isDistinctUDAF) {
+        numDistinctUDFs++;
+      }
+    }
+
+    // the positions in rsColInfoLst are as follows
+    // --grpkey--,--distkey--,--values--
+    // but distUDAF may be before/after some non-distUDAF,
+    // i.e., their positions can be mixed.
+    // so for all UDAF we first check to see if it is groupby key, if not is 
it distinct key
+    // if not it should be value
+    final Map<Integer, List<ExprNodeDesc>> indexToParameter = new TreeMap<>();
+    for (int i = 0; i < paramColInfoTable.size(); i++) {
+      final ArrayList<ExprNodeDesc> aggParameters = new ArrayList<>();
+
+      for (int j = 0; j < paramColInfoTable.get(i).size(); j++) {
+        final ColumnInfo rsUDAFParamColInfo = paramColInfoTable.get(i).get(j);
+
+        final String rsUDAFParamName;
+        if (distinctColumnNameTable.get(i).get(j) != null) {
+          rsUDAFParamName = distinctColumnNameTable.get(i).get(j);
+        } else if (distinctColumnMapping.containsKey(rsUDAFParamColInfo)) {
+          // This UDAF is not labeled with DISTINCT, but it refers to a 
DISTINCT key.
+          // The original internal name could be already obsolete as any 
DISTINCT keys are renamed.
+          rsUDAFParamName = distinctColumnMapping.get(rsUDAFParamColInfo);
+        } else {
+          rsUDAFParamName = rsUDAFParamColInfo.getInternalName();
         }
-        udafParam = new ExprNodeColumnDesc(rsUDAFParamColInfo.getType(), 
rsUDAFParamName,
+        ExprNodeDesc udafParam = new 
ExprNodeColumnDesc(rsUDAFParamColInfo.getType(), rsUDAFParamName,
                 rsUDAFParamColInfo.getTabAlias(), 
rsUDAFParamColInfo.getIsVirtualCol());
-        constantPropDistinctUDAFParam = SemanticAnalyzer
+        final ExprNodeDesc constantPropDistinctUDAFParam = SemanticAnalyzer
                 
.isConstantParameterInAggregationParameters(rsUDAFParamColInfo.getInternalName(),
                         reduceValues);
         if (constantPropDistinctUDAFParam != null) {
@@ -1062,10 +1095,8 @@ final class HiveGBOpConvUtil {
         aggParameters.add(udafParam);
       }
       indexToParameter.put(i, aggParameters);
-      if(udafAttr.isDistinctUDAF) {
-        numDistinctUDFs++;
-      }
     }
+
     for(Map.Entry<Integer, List<ExprNodeDesc>> e : 
indexToParameter.entrySet()){
       UDAFAttrs udafAttr = gbInfo.udafAttrs.get(e.getKey());
       Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(gbMode, 
udafAttr.isDistinctUDAF);
diff --git 
a/ql/src/test/queries/clientpositive/cbo_rp_groupby3_noskew_multi_distinct.q 
b/ql/src/test/queries/clientpositive/cbo_rp_groupby3_noskew_multi_distinct.q
index ebd08bb37bc..91edff656c6 100644
--- a/ql/src/test/queries/clientpositive/cbo_rp_groupby3_noskew_multi_distinct.q
+++ b/ql/src/test/queries/clientpositive/cbo_rp_groupby3_noskew_multi_distinct.q
@@ -38,3 +38,25 @@ INSERT OVERWRITE TABLE dest1_n123 SELECT
 
 SELECT dest1_n123.* FROM dest1_n123;
 
+
+CREATE TABLE test (col1 INT, col2 INT);
+INSERT INTO test VALUES (1, 100), (2, 200), (2, 200), (3, 300);
+
+EXPLAIN
+SELECT
+  SUM(DISTINCT col1),
+  COUNT(DISTINCT col1),
+  SUM(col2), -- This has to refer to the key for `SUM(DISTINCT col2)`
+  MAX(DISTINCT col1),
+  SUM(DISTINCT col2),
+  MIN(DISTINCT col1)
+FROM test;
+
+SELECT
+  SUM(DISTINCT col1),
+  COUNT(DISTINCT col1),
+  SUM(col2), -- This has to refer to the key for `SUM(DISTINCT col2)`
+  MAX(DISTINCT col1),
+  SUM(DISTINCT col2),
+  MIN(DISTINCT col1)
+FROM test;
diff --git 
a/ql/src/test/results/clientpositive/llap/cbo_rp_groupby3_noskew_multi_distinct.q.out
 
b/ql/src/test/results/clientpositive/llap/cbo_rp_groupby3_noskew_multi_distinct.q.out
index fa5b4662726..91ba8aea706 100644
--- 
a/ql/src/test/results/clientpositive/llap/cbo_rp_groupby3_noskew_multi_distinct.q.out
+++ 
b/ql/src/test/results/clientpositive/llap/cbo_rp_groupby3_noskew_multi_distinct.q.out
@@ -185,3 +185,118 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@dest1_n123
 #### A masked pattern was here ####
 130091.0       260.182 256.10355987055016      98.0    0.0     
142.9268095075238       143.06995106518906      20428.072876000002      
20469.010897795593      79136.0 309.0
+PREHOOK: query: CREATE TABLE test (col1 INT, col2 INT)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test
+POSTHOOK: query: CREATE TABLE test (col1 INT, col2 INT)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test
+PREHOOK: query: INSERT INTO test VALUES (1, 100), (2, 200), (2, 200), (3, 300)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test
+POSTHOOK: query: INSERT INTO test VALUES (1, 100), (2, 200), (2, 200), (3, 300)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test
+POSTHOOK: Lineage: test.col1 SCRIPT []
+POSTHOOK: Lineage: test.col2 SCRIPT []
+PREHOOK: query: EXPLAIN
+SELECT
+  SUM(DISTINCT col1),
+  COUNT(DISTINCT col1),
+  SUM(col2), -- This has to refer to the key for `SUM(DISTINCT col2)`
+  MAX(DISTINCT col1),
+  SUM(DISTINCT col2),
+  MIN(DISTINCT col1)
+FROM test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN
+SELECT
+  SUM(DISTINCT col1),
+  COUNT(DISTINCT col1),
+  SUM(col2), -- This has to refer to the key for `SUM(DISTINCT col2)`
+  MAX(DISTINCT col1),
+  SUM(DISTINCT col2),
+  MIN(DISTINCT col1)
+FROM test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: test
+                  Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Select Operator
+                    expressions: col1 (type: int), col2 (type: int)
+                    outputColumnNames: col1, col2
+                    Statistics: Num rows: 4 Data size: 32 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: col1 (type: int), col2 (type: int)
+                      null sort order: zz
+                      sort order: ++
+                      Statistics: Num rows: 4 Data size: 32 Basic stats: 
COMPLETE Column stats: COMPLETE
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: sum(DISTINCT KEY._col0:0._col0), count(DISTINCT 
KEY._col0:1._col0), sum(KEY._col0:3._col0), max(DISTINCT KEY._col0:2._col0), 
sum(DISTINCT KEY._col0:3._col0), min(DISTINCT KEY._col0:4._col0)
+                mode: complete
+                outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5
+                Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE 
Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  table:
+                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT
+  SUM(DISTINCT col1),
+  COUNT(DISTINCT col1),
+  SUM(col2), -- This has to refer to the key for `SUM(DISTINCT col2)`
+  MAX(DISTINCT col1),
+  SUM(DISTINCT col2),
+  MIN(DISTINCT col1)
+FROM test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT
+  SUM(DISTINCT col1),
+  COUNT(DISTINCT col1),
+  SUM(col2), -- This has to refer to the key for `SUM(DISTINCT col2)`
+  MAX(DISTINCT col1),
+  SUM(DISTINCT col2),
+  MIN(DISTINCT col1)
+FROM test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+#### A masked pattern was here ####
+6      3       800     3       600     1
diff --git a/ql/src/test/results/clientpositive/llap/count.q.out 
b/ql/src/test/results/clientpositive/llap/count.q.out
index 0de58e42849..a30e406179e 100644
--- a/ql/src/test/results/clientpositive/llap/count.q.out
+++ b/ql/src/test/results/clientpositive/llap/count.q.out
@@ -811,7 +811,7 @@ STAGE PLANS:
             Execution mode: llap
             Reduce Operator Tree:
               Group By Operator
-                aggregations: count(DISTINCT KEY._col1:0._col0), 
count(DISTINCT KEY._col1:1._col0), sum(VALUE._col0), sum(VALUE._col1), 
sum(VALUE._col2), sum(KEY._col1:0._col0), sum(KEY._col1:1._col0), 
sum(KEY._col0), sum(DISTINCT KEY._col1:2._col0), sum(DISTINCT KEY._col1:3._col0)
+                aggregations: count(DISTINCT KEY._col1:0._col0), 
count(DISTINCT KEY._col1:1._col0), sum(VALUE._col0), sum(VALUE._col1), 
sum(VALUE._col2), sum(KEY._col1:0._col0), sum(KEY._col1:1._col0), 
sum(KEY._col1:2._col0), sum(DISTINCT KEY._col1:2._col0), sum(DISTINCT 
KEY._col1:3._col0)
                 keys: KEY._col0 (type: int)
                 mode: complete
                 outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5, $f6, $f7, 
$f8, $f9, $f10

(hive) branch master updated: HIVE-28254: CBO (Calcite Return Path): Multiple DISTINCT leads to wrong results (Shohei Okumiya, reviewed by Krisztian Kasa)

Reply via email to