Re: [PR] HIVE-28254: CBO (Calcite Return Path): Multiple DISTINCT leads to wrong results [hive]

via GitHub Mon, 03 Jun 2024 04:32:19 -0700


kasakrisz commented on code in PR #5245:
URL: https://github.com/apache/hive/pull/5245#discussion_r1624106511



##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveGBOpConvUtil.java:
##########
@@ -1028,32 +1028,65 @@ private static OpAttr genReduceSideGB1NoMapGB(OpAttr 
inputOpAf, GBInfo gbInfo,
     List<ExprNodeDesc> reduceValues = rs.getConf().getValueCols();
     ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
     int udafColStartPosInOriginalGB = gbInfo.gbKeys.size();
-    // the positions in rsColInfoLst are as follows
-    // --grpkey--,--distkey--,--values--
-    // but distUDAF may be before/after some non-distUDAF,
-    // i.e., their positions can be mixed.
-    // so for all UDAF we first check to see if it is groupby key, if not is 
it distinct key
-    // if not it should be value
-    Map<Integer, List<ExprNodeDesc>> indexToParameter = new TreeMap<>();
+
+    final List<List<ColumnInfo>> paramColInfoTable = new 
ArrayList<>(gbInfo.udafAttrs.size());
+    final List<List<String>> distinctColumnNameTable = new 
ArrayList<>(gbInfo.udafAttrs.size());
+    final Map<ColumnInfo, String> distinctColumnMapping = new HashMap<>();
     for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
-      UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
-      ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
+      final UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
+      final List<ColumnInfo> paramColInfo = new 
ArrayList<>(udafAttr.udafParams.size());
+      final List<String> distinctColNames = new 
ArrayList<>(udafAttr.udafParams.size());
 
-      ColumnInfo rsUDAFParamColInfo;
-      ExprNodeDesc udafParam;
-      ExprNodeDesc constantPropDistinctUDAFParam;
       for (int j = 0; j < udafAttr.udafParams.size(); j++) {
-        int argPos = getColInfoPos(udafAttr.udafParams.get(j), gbInfo);
-        rsUDAFParamColInfo = rsColInfoLst.get(argPos);
-        String rsUDAFParamName = rsUDAFParamColInfo.getInternalName();
+        final int argPos = getColInfoPos(udafAttr.udafParams.get(j), gbInfo);
+        final ColumnInfo rsUDAFParamColInfo = rsColInfoLst.get(argPos);
+        paramColInfo.add(rsUDAFParamColInfo);
 
+        final String distinctColumnName;
         if (udafAttr.isDistinctUDAF && lastReduceKeyColName != null) {
-          rsUDAFParamName = Utilities.ReduceField.KEY.name() + "." + 
lastReduceKeyColName
+          distinctColumnName = Utilities.ReduceField.KEY.name() + "." + 
lastReduceKeyColName
                   + ":" + numDistinctUDFs + "." + 
SemanticAnalyzer.getColumnInternalName(j);
+          distinctColumnMapping.putIfAbsent(rsUDAFParamColInfo, 
distinctColumnName);
+        } else {
+          distinctColumnName = null;
+        }
+        distinctColNames.add(distinctColumnName);
+      }
+
+      paramColInfoTable.add(paramColInfo);
+      distinctColumnNameTable.add(distinctColNames);
+
+      if(udafAttr.isDistinctUDAF) {
+        numDistinctUDFs++;
+      }
+    }
+
+    // the positions in rsColInfoLst are as follows
+    // --grpkey--,--distkey--,--values--
+    // but distUDAF may be before/after some non-distUDAF,
+    // i.e., their positions can be mixed.
+    // so for all UDAF we first check to see if it is groupby key, if not is 
it distinct key
+    // if not it should be value
+    final Map<Integer, List<ExprNodeDesc>> indexToParameter = new TreeMap<>();
+    for (int i = 0; i < paramColInfoTable.size(); i++) {
+      final ArrayList<ExprNodeDesc> aggParameters = new ArrayList<>();
+
+      for (int j = 0; j < paramColInfoTable.get(i).size(); j++) {
+        final ColumnInfo rsUDAFParamColInfo = paramColInfoTable.get(i).get(j);
+
+        final String rsUDAFParamName;
+        if (distinctColumnNameTable.get(i).get(j) != null) {
+          rsUDAFParamName = distinctColumnNameTable.get(i).get(j);
+        } else if (distinctColumnMapping.get(rsUDAFParamColInfo) != null) {

Review Comment:
   nit. distinctColumnMapping.containsKey(rsUDAFParamColInfo)



##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/opconventer/HiveGBOpConvUtil.java:
##########
@@ -1028,32 +1028,65 @@ private static OpAttr genReduceSideGB1NoMapGB(OpAttr 
inputOpAf, GBInfo gbInfo,
     List<ExprNodeDesc> reduceValues = rs.getConf().getValueCols();
     ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
     int udafColStartPosInOriginalGB = gbInfo.gbKeys.size();
-    // the positions in rsColInfoLst are as follows
-    // --grpkey--,--distkey--,--values--
-    // but distUDAF may be before/after some non-distUDAF,
-    // i.e., their positions can be mixed.
-    // so for all UDAF we first check to see if it is groupby key, if not is 
it distinct key
-    // if not it should be value
-    Map<Integer, List<ExprNodeDesc>> indexToParameter = new TreeMap<>();
+
+    final List<List<ColumnInfo>> paramColInfoTable = new 
ArrayList<>(gbInfo.udafAttrs.size());
+    final List<List<String>> distinctColumnNameTable = new 
ArrayList<>(gbInfo.udafAttrs.size());
+    final Map<ColumnInfo, String> distinctColumnMapping = new HashMap<>();
     for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
-      UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
-      ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
+      final UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
+      final List<ColumnInfo> paramColInfo = new 
ArrayList<>(udafAttr.udafParams.size());
+      final List<String> distinctColNames = new 
ArrayList<>(udafAttr.udafParams.size());
 
-      ColumnInfo rsUDAFParamColInfo;
-      ExprNodeDesc udafParam;
-      ExprNodeDesc constantPropDistinctUDAFParam;
       for (int j = 0; j < udafAttr.udafParams.size(); j++) {
-        int argPos = getColInfoPos(udafAttr.udafParams.get(j), gbInfo);
-        rsUDAFParamColInfo = rsColInfoLst.get(argPos);
-        String rsUDAFParamName = rsUDAFParamColInfo.getInternalName();
+        final int argPos = getColInfoPos(udafAttr.udafParams.get(j), gbInfo);
+        final ColumnInfo rsUDAFParamColInfo = rsColInfoLst.get(argPos);
+        paramColInfo.add(rsUDAFParamColInfo);
 
+        final String distinctColumnName;
         if (udafAttr.isDistinctUDAF && lastReduceKeyColName != null) {
-          rsUDAFParamName = Utilities.ReduceField.KEY.name() + "." + 
lastReduceKeyColName
+          distinctColumnName = Utilities.ReduceField.KEY.name() + "." + 
lastReduceKeyColName
                   + ":" + numDistinctUDFs + "." + 
SemanticAnalyzer.getColumnInternalName(j);
+          distinctColumnMapping.putIfAbsent(rsUDAFParamColInfo, 
distinctColumnName);
+        } else {
+          distinctColumnName = null;
+        }
+        distinctColNames.add(distinctColumnName);
+      }
+
+      paramColInfoTable.add(paramColInfo);
+      distinctColumnNameTable.add(distinctColNames);
+
+      if(udafAttr.isDistinctUDAF) {
+        numDistinctUDFs++;
+      }
+    }
+
+    // the positions in rsColInfoLst are as follows
+    // --grpkey--,--distkey--,--values--
+    // but distUDAF may be before/after some non-distUDAF,
+    // i.e., their positions can be mixed.
+    // so for all UDAF we first check to see if it is groupby key, if not is 
it distinct key
+    // if not it should be value
+    final Map<Integer, List<ExprNodeDesc>> indexToParameter = new TreeMap<>();
+    for (int i = 0; i < paramColInfoTable.size(); i++) {
+      final ArrayList<ExprNodeDesc> aggParameters = new ArrayList<>();
+
+      for (int j = 0; j < paramColInfoTable.get(i).size(); j++) {
+        final ColumnInfo rsUDAFParamColInfo = paramColInfoTable.get(i).get(j);
+
+        final String rsUDAFParamName;
+        if (distinctColumnNameTable.get(i).get(j) != null) {
+          rsUDAFParamName = distinctColumnNameTable.get(i).get(j);
+        } else if (distinctColumnMapping.get(rsUDAFParamColInfo) != null) {
+          // This UDAF is not labeled with DISTINCT, but it refers to a 
DISTINCT key.
+          // The original internal name is already obsolete as any DISTINCT 
keys are renamed.
+          rsUDAFParamName = distinctColumnMapping.get(rsUDAFParamColInfo);
+        } else {
+          rsUDAFParamName = rsUDAFParamColInfo.getInternalName();

Review Comment:
   Thanks for explanation.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] HIVE-28254: CBO (Calcite Return Path): Multiple DISTINCT leads to wrong results [hive]

Reply via email to