This is an automated email from the ASF dual-hosted git repository.
krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 4bc36eb40eb HIVE-28254: CBO (Calcite Return Path): Multiple DISTINCT
leads to wrong results (Shohei Okumiya, reviewed by Krisztian Kasa)
4bc36eb40eb is described below
commit 4bc36eb40ebd16fb0b79f19ec4471549cb42a177
Author: okumin <[email protected]>
AuthorDate: Tue Jun 4 16:29:28 2024 +0900
HIVE-28254: CBO (Calcite Return Path): Multiple DISTINCT leads to wrong
results (Shohei Okumiya, reviewed by Krisztian Kasa)
---
.../translator/opconventer/HiveGBOpConvUtil.java | 73 +++++++++----
.../cbo_rp_groupby3_noskew_multi_distinct.q | 22 ++++
.../cbo_rp_groupby3_noskew_multi_distinct.q.out | 115 +++++++++++++++++++++
.../test/results/clientpositive/llap/count.q.out | 2 +-
4 files changed, 190 insertions(+), 22 deletions(-)
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveGBOpConvUtil.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveGBOpConvUtil.java
index a215dec1148..5142f4f41be 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveGBOpConvUtil.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveGBOpConvUtil.java
@@ -1028,32 +1028,65 @@ final class HiveGBOpConvUtil {
List<ExprNodeDesc> reduceValues = rs.getConf().getValueCols();
ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
int udafColStartPosInOriginalGB = gbInfo.gbKeys.size();
- // the positions in rsColInfoLst are as follows
- // --grpkey--,--distkey--,--values--
- // but distUDAF may be before/after some non-distUDAF,
- // i.e., their positions can be mixed.
- // so for all UDAF we first check to see if it is groupby key, if not is
it distinct key
- // if not it should be value
- Map<Integer, List<ExprNodeDesc>> indexToParameter = new TreeMap<>();
+
+ final List<List<ColumnInfo>> paramColInfoTable = new
ArrayList<>(gbInfo.udafAttrs.size());
+ final List<List<String>> distinctColumnNameTable = new
ArrayList<>(gbInfo.udafAttrs.size());
+ final Map<ColumnInfo, String> distinctColumnMapping = new HashMap<>();
for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
- UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
- ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
+ final UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
+ final List<ColumnInfo> paramColInfo = new
ArrayList<>(udafAttr.udafParams.size());
+ final List<String> distinctColNames = new
ArrayList<>(udafAttr.udafParams.size());
- ColumnInfo rsUDAFParamColInfo;
- ExprNodeDesc udafParam;
- ExprNodeDesc constantPropDistinctUDAFParam;
for (int j = 0; j < udafAttr.udafParams.size(); j++) {
- int argPos = getColInfoPos(udafAttr.udafParams.get(j), gbInfo);
- rsUDAFParamColInfo = rsColInfoLst.get(argPos);
- String rsUDAFParamName = rsUDAFParamColInfo.getInternalName();
+ final int argPos = getColInfoPos(udafAttr.udafParams.get(j), gbInfo);
+ final ColumnInfo rsUDAFParamColInfo = rsColInfoLst.get(argPos);
+ paramColInfo.add(rsUDAFParamColInfo);
+ final String distinctColumnName;
if (udafAttr.isDistinctUDAF && lastReduceKeyColName != null) {
- rsUDAFParamName = Utilities.ReduceField.KEY.name() + "." +
lastReduceKeyColName
+ distinctColumnName = Utilities.ReduceField.KEY.name() + "." +
lastReduceKeyColName
+ ":" + numDistinctUDFs + "." +
SemanticAnalyzer.getColumnInternalName(j);
+ distinctColumnMapping.putIfAbsent(rsUDAFParamColInfo,
distinctColumnName);
+ } else {
+ distinctColumnName = null;
+ }
+ distinctColNames.add(distinctColumnName);
+ }
+
+ paramColInfoTable.add(paramColInfo);
+ distinctColumnNameTable.add(distinctColNames);
+
+ if(udafAttr.isDistinctUDAF) {
+ numDistinctUDFs++;
+ }
+ }
+
+ // the positions in rsColInfoLst are as follows
+ // --grpkey--,--distkey--,--values--
+ // but distUDAF may be before/after some non-distUDAF,
+ // i.e., their positions can be mixed.
+ // so for all UDAF we first check to see if it is groupby key, if not is
it distinct key
+ // if not it should be value
+ final Map<Integer, List<ExprNodeDesc>> indexToParameter = new TreeMap<>();
+ for (int i = 0; i < paramColInfoTable.size(); i++) {
+ final ArrayList<ExprNodeDesc> aggParameters = new ArrayList<>();
+
+ for (int j = 0; j < paramColInfoTable.get(i).size(); j++) {
+ final ColumnInfo rsUDAFParamColInfo = paramColInfoTable.get(i).get(j);
+
+ final String rsUDAFParamName;
+ if (distinctColumnNameTable.get(i).get(j) != null) {
+ rsUDAFParamName = distinctColumnNameTable.get(i).get(j);
+ } else if (distinctColumnMapping.containsKey(rsUDAFParamColInfo)) {
+ // This UDAF is not labeled with DISTINCT, but it refers to a
DISTINCT key.
+ // The original internal name could be already obsolete as any
DISTINCT keys are renamed.
+ rsUDAFParamName = distinctColumnMapping.get(rsUDAFParamColInfo);
+ } else {
+ rsUDAFParamName = rsUDAFParamColInfo.getInternalName();
}
- udafParam = new ExprNodeColumnDesc(rsUDAFParamColInfo.getType(),
rsUDAFParamName,
+ ExprNodeDesc udafParam = new
ExprNodeColumnDesc(rsUDAFParamColInfo.getType(), rsUDAFParamName,
rsUDAFParamColInfo.getTabAlias(),
rsUDAFParamColInfo.getIsVirtualCol());
- constantPropDistinctUDAFParam = SemanticAnalyzer
+ final ExprNodeDesc constantPropDistinctUDAFParam = SemanticAnalyzer
.isConstantParameterInAggregationParameters(rsUDAFParamColInfo.getInternalName(),
reduceValues);
if (constantPropDistinctUDAFParam != null) {
@@ -1062,10 +1095,8 @@ final class HiveGBOpConvUtil {
aggParameters.add(udafParam);
}
indexToParameter.put(i, aggParameters);
- if(udafAttr.isDistinctUDAF) {
- numDistinctUDFs++;
- }
}
+
for(Map.Entry<Integer, List<ExprNodeDesc>> e :
indexToParameter.entrySet()){
UDAFAttrs udafAttr = gbInfo.udafAttrs.get(e.getKey());
Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(gbMode,
udafAttr.isDistinctUDAF);
diff --git
a/ql/src/test/queries/clientpositive/cbo_rp_groupby3_noskew_multi_distinct.q
b/ql/src/test/queries/clientpositive/cbo_rp_groupby3_noskew_multi_distinct.q
index ebd08bb37bc..91edff656c6 100644
--- a/ql/src/test/queries/clientpositive/cbo_rp_groupby3_noskew_multi_distinct.q
+++ b/ql/src/test/queries/clientpositive/cbo_rp_groupby3_noskew_multi_distinct.q
@@ -38,3 +38,25 @@ INSERT OVERWRITE TABLE dest1_n123 SELECT
SELECT dest1_n123.* FROM dest1_n123;
+
+CREATE TABLE test (col1 INT, col2 INT);
+INSERT INTO test VALUES (1, 100), (2, 200), (2, 200), (3, 300);
+
+EXPLAIN
+SELECT
+ SUM(DISTINCT col1),
+ COUNT(DISTINCT col1),
+ SUM(col2), -- This has to refer to the key for `SUM(DISTINCT col2)`
+ MAX(DISTINCT col1),
+ SUM(DISTINCT col2),
+ MIN(DISTINCT col1)
+FROM test;
+
+SELECT
+ SUM(DISTINCT col1),
+ COUNT(DISTINCT col1),
+ SUM(col2), -- This has to refer to the key for `SUM(DISTINCT col2)`
+ MAX(DISTINCT col1),
+ SUM(DISTINCT col2),
+ MIN(DISTINCT col1)
+FROM test;
diff --git
a/ql/src/test/results/clientpositive/llap/cbo_rp_groupby3_noskew_multi_distinct.q.out
b/ql/src/test/results/clientpositive/llap/cbo_rp_groupby3_noskew_multi_distinct.q.out
index fa5b4662726..91ba8aea706 100644
---
a/ql/src/test/results/clientpositive/llap/cbo_rp_groupby3_noskew_multi_distinct.q.out
+++
b/ql/src/test/results/clientpositive/llap/cbo_rp_groupby3_noskew_multi_distinct.q.out
@@ -185,3 +185,118 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@dest1_n123
#### A masked pattern was here ####
130091.0 260.182 256.10355987055016 98.0 0.0
142.9268095075238 143.06995106518906 20428.072876000002
20469.010897795593 79136.0 309.0
+PREHOOK: query: CREATE TABLE test (col1 INT, col2 INT)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test
+POSTHOOK: query: CREATE TABLE test (col1 INT, col2 INT)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test
+PREHOOK: query: INSERT INTO test VALUES (1, 100), (2, 200), (2, 200), (3, 300)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test
+POSTHOOK: query: INSERT INTO test VALUES (1, 100), (2, 200), (2, 200), (3, 300)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test
+POSTHOOK: Lineage: test.col1 SCRIPT []
+POSTHOOK: Lineage: test.col2 SCRIPT []
+PREHOOK: query: EXPLAIN
+SELECT
+ SUM(DISTINCT col1),
+ COUNT(DISTINCT col1),
+ SUM(col2), -- This has to refer to the key for `SUM(DISTINCT col2)`
+ MAX(DISTINCT col1),
+ SUM(DISTINCT col2),
+ MIN(DISTINCT col1)
+FROM test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN
+SELECT
+ SUM(DISTINCT col1),
+ COUNT(DISTINCT col1),
+ SUM(col2), -- This has to refer to the key for `SUM(DISTINCT col2)`
+ MAX(DISTINCT col1),
+ SUM(DISTINCT col2),
+ MIN(DISTINCT col1)
+FROM test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: test
+ Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: col1 (type: int), col2 (type: int)
+ outputColumnNames: col1, col2
+ Statistics: Num rows: 4 Data size: 32 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: col1 (type: int), col2 (type: int)
+ null sort order: zz
+ sort order: ++
+ Statistics: Num rows: 4 Data size: 32 Basic stats:
COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(DISTINCT KEY._col0:0._col0), count(DISTINCT
KEY._col0:1._col0), sum(KEY._col0:3._col0), max(DISTINCT KEY._col0:2._col0),
sum(DISTINCT KEY._col0:3._col0), min(DISTINCT KEY._col0:4._col0)
+ mode: complete
+ outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5
+ Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE
Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT
+ SUM(DISTINCT col1),
+ COUNT(DISTINCT col1),
+ SUM(col2), -- This has to refer to the key for `SUM(DISTINCT col2)`
+ MAX(DISTINCT col1),
+ SUM(DISTINCT col2),
+ MIN(DISTINCT col1)
+FROM test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT
+ SUM(DISTINCT col1),
+ COUNT(DISTINCT col1),
+ SUM(col2), -- This has to refer to the key for `SUM(DISTINCT col2)`
+ MAX(DISTINCT col1),
+ SUM(DISTINCT col2),
+ MIN(DISTINCT col1)
+FROM test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+#### A masked pattern was here ####
+6 3 800 3 600 1
diff --git a/ql/src/test/results/clientpositive/llap/count.q.out
b/ql/src/test/results/clientpositive/llap/count.q.out
index 0de58e42849..a30e406179e 100644
--- a/ql/src/test/results/clientpositive/llap/count.q.out
+++ b/ql/src/test/results/clientpositive/llap/count.q.out
@@ -811,7 +811,7 @@ STAGE PLANS:
Execution mode: llap
Reduce Operator Tree:
Group By Operator
- aggregations: count(DISTINCT KEY._col1:0._col0),
count(DISTINCT KEY._col1:1._col0), sum(VALUE._col0), sum(VALUE._col1),
sum(VALUE._col2), sum(KEY._col1:0._col0), sum(KEY._col1:1._col0),
sum(KEY._col0), sum(DISTINCT KEY._col1:2._col0), sum(DISTINCT KEY._col1:3._col0)
+ aggregations: count(DISTINCT KEY._col1:0._col0),
count(DISTINCT KEY._col1:1._col0), sum(VALUE._col0), sum(VALUE._col1),
sum(VALUE._col2), sum(KEY._col1:0._col0), sum(KEY._col1:1._col0),
sum(KEY._col1:2._col0), sum(DISTINCT KEY._col1:2._col0), sum(DISTINCT
KEY._col1:3._col0)
keys: KEY._col0 (type: int)
mode: complete
outputColumnNames: $f0, $f1, $f2, $f3, $f4, $f5, $f6, $f7,
$f8, $f9, $f10