Repository: hive Updated Branches: refs/heads/master 860ba06fb -> 4f90a7156
HIVE-9534: incorrect result set for query that projects a windowed aggregate (Aihua Xu, reviewed by Yongzhi Chen) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/4f90a715 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/4f90a715 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/4f90a715 Branch: refs/heads/master Commit: 4f90a71565d47f4b9d8ce267ace206d6a1f199fa Parents: 860ba06 Author: Aihua Xu <aihu...@apache.org> Authored: Thu Feb 4 10:57:16 2016 -0500 Committer: Aihua Xu <aihu...@apache.org> Committed: Fri Feb 12 15:21:31 2016 -0500 ---------------------------------------------------------------------- .../calcite/functions/CanAggregateDistinct.java | 27 +++++++++++++++++ .../functions/HiveSqlCountAggFunction.java | 3 +- .../functions/HiveSqlSumAggFunction.java | 4 +-- .../translator/SqlFunctionConverter.java | 29 ++++++++++-------- .../hive/ql/udf/generic/GenericUDAFAverage.java | 31 ++++++++++++++++++-- .../queries/clientpositive/windowing_distinct.q | 8 +++++ .../clientpositive/windowing_distinct.q.out | 26 ++++++++++++++++ 7 files changed, 111 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/4f90a715/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/CanAggregateDistinct.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/CanAggregateDistinct.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/CanAggregateDistinct.java new file mode 100644 index 0000000..c24f3c0 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/CanAggregateDistinct.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer.calcite.functions; + +/** + * This is the UDAF interface to support DISTINCT function. + * + */ +public interface CanAggregateDistinct { + boolean isDistinct(); +} http://git-wip-us.apache.org/repos/asf/hive/blob/4f90a715/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/HiveSqlCountAggFunction.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/HiveSqlCountAggFunction.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/HiveSqlCountAggFunction.java index 58191e5..bc48707 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/HiveSqlCountAggFunction.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/HiveSqlCountAggFunction.java @@ -30,7 +30,7 @@ import org.apache.calcite.sql.type.SqlReturnTypeInference; import org.apache.calcite.sql.type.SqlTypeName; import org.apache.calcite.util.ImmutableIntList; -public class HiveSqlCountAggFunction extends SqlAggFunction { +public class HiveSqlCountAggFunction extends SqlAggFunction implements CanAggregateDistinct { final boolean isDistinct; final SqlReturnTypeInference returnTypeInference; @@ -52,6 +52,7 @@ public class HiveSqlCountAggFunction extends SqlAggFunction { this.operandTypeInference = operandTypeInference; } + @Override public boolean isDistinct() { return isDistinct; } http://git-wip-us.apache.org/repos/asf/hive/blob/4f90a715/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/HiveSqlSumAggFunction.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/HiveSqlSumAggFunction.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/HiveSqlSumAggFunction.java index 498cd0e..dc286a2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/HiveSqlSumAggFunction.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/functions/HiveSqlSumAggFunction.java @@ -46,7 +46,7 @@ import com.google.common.collect.ImmutableList; * <code>long</code>, <code>float</code>, <code>double</code>), and the result * is the same type. */ -public class HiveSqlSumAggFunction extends SqlAggFunction { +public class HiveSqlSumAggFunction extends SqlAggFunction implements CanAggregateDistinct{ final boolean isDistinct; final SqlReturnTypeInference returnTypeInference; final SqlOperandTypeInference operandTypeInference; @@ -70,7 +70,7 @@ public class HiveSqlSumAggFunction extends SqlAggFunction { } //~ Methods ---------------------------------------------------------------- - + @Override public boolean isDistinct() { return isDistinct; } http://git-wip-us.apache.org/repos/asf/hive/blob/4f90a715/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java index 19aa414..b4c6e05 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java @@ -45,6 +45,7 @@ import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException; import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException.UnsupportedFeature; +import org.apache.hadoop.hive.ql.optimizer.calcite.functions.CanAggregateDistinct; import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveSqlCountAggFunction; import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveSqlMinMaxAggFunction; import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveSqlSumAggFunction; @@ -217,24 +218,20 @@ public class SqlFunctionConverter { } else if (op.kind == SqlKind.PLUS_PREFIX) { node = (ASTNode) ParseDriver.adaptor.create(HiveParser.PLUS, "PLUS"); } else { - // Handle 'COUNT' function for the case of COUNT(*) and COUNT(DISTINCT) - if (op instanceof HiveSqlCountAggFunction) { + // Handle COUNT/SUM/AVG function for the case of COUNT(*) and COUNT(DISTINCT) + if (op instanceof HiveSqlCountAggFunction || + op instanceof HiveSqlSumAggFunction || + (op instanceof CalciteUDAF && op.getName().equalsIgnoreCase(SqlStdOperatorTable.AVG.getName()))) { if (children.size() == 0) { node = (ASTNode) ParseDriver.adaptor.create(HiveParser.TOK_FUNCTIONSTAR, "TOK_FUNCTIONSTAR"); } else { - HiveSqlCountAggFunction countFunction = (HiveSqlCountAggFunction)op; - if (countFunction.isDistinct()) { + CanAggregateDistinct distinctFunction = (CanAggregateDistinct) op; + if (distinctFunction.isDistinct()) { node = (ASTNode) ParseDriver.adaptor.create(HiveParser.TOK_FUNCTIONDI, "TOK_FUNCTIONDI"); } } - } else if (op instanceof HiveSqlSumAggFunction) { // case SUM(DISTINCT) - HiveSqlSumAggFunction sumFunction = (HiveSqlSumAggFunction) op; - if (sumFunction.isDistinct()) { - node = (ASTNode) ParseDriver.adaptor.create(HiveParser.TOK_FUNCTIONDI, - "TOK_FUNCTIONDI"); - } } node.addChild((ASTNode) ParseDriver.adaptor.create(HiveParser.Identifier, op.getName())); } @@ -364,11 +361,18 @@ public class SqlFunctionConverter { } // UDAF is assumed to be deterministic - public static class CalciteUDAF extends SqlAggFunction { - public CalciteUDAF(String opName, SqlReturnTypeInference returnTypeInference, + public static class CalciteUDAF extends SqlAggFunction implements CanAggregateDistinct { + private boolean isDistinct; + public CalciteUDAF(boolean isDistinct, String opName, SqlReturnTypeInference returnTypeInference, SqlOperandTypeInference operandTypeInference, SqlOperandTypeChecker operandTypeChecker) { super(opName, SqlKind.OTHER_FUNCTION, returnTypeInference, operandTypeInference, operandTypeChecker, SqlFunctionCategory.USER_DEFINED_FUNCTION); + this.isDistinct = isDistinct; + } + + @Override + public boolean isDistinct() { + return isDistinct; } } @@ -466,6 +470,7 @@ public class SqlFunctionConverter { break; default: calciteAggFn = new CalciteUDAF( + isDistinct, udfInfo.udfName, udfInfo.returnTypeInference, udfInfo.operandTypeInference, http://git-wip-us.apache.org/repos/asf/hive/blob/4f90a715/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java index cd2449f..3c1ce26 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java @@ -26,8 +26,6 @@ import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.SemanticException; -import org.apache.hadoop.hive.ql.parse.WindowingSpec.BoundarySpec; -import org.apache.hadoop.hive.ql.plan.ptf.BoundaryDef; import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer; import org.apache.hadoop.hive.ql.util.JavaDataModel; @@ -35,9 +33,11 @@ import org.apache.hadoop.hive.serde2.io.DoubleWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; @@ -95,6 +95,19 @@ public class GenericUDAFAverage extends AbstractGenericUDAFResolver { } } + @Override + public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo paramInfo) + throws SemanticException { + if (paramInfo.isAllColumns()) { + throw new SemanticException( + "The specified syntax for UDAF invocation is invalid."); + } + + AbstractGenericUDAFAverageEvaluator eval = + (AbstractGenericUDAFAverageEvaluator) getEvaluator(paramInfo.getParameters()); + eval.avgDistinct = paramInfo.isDistinct(); + return eval; + } public static class GenericUDAFAverageEvaluatorDouble extends AbstractGenericUDAFAverageEvaluator<Double> { @@ -102,6 +115,7 @@ public class GenericUDAFAverage extends AbstractGenericUDAFResolver { public void doReset(AverageAggregationBuffer<Double> aggregation) throws HiveException { aggregation.count = 0; aggregation.sum = new Double(0); + aggregation.previousValue = null; } @Override @@ -319,15 +333,18 @@ public class GenericUDAFAverage extends AbstractGenericUDAFResolver { } private static class AverageAggregationBuffer<TYPE> implements AggregationBuffer { + private Object previousValue; private long count; private TYPE sum; }; @SuppressWarnings("unchecked") public static abstract class AbstractGenericUDAFAverageEvaluator<TYPE> extends GenericUDAFEvaluator { + protected boolean avgDistinct; // For PARTIAL1 and COMPLETE protected transient PrimitiveObjectInspector inputOI; + protected transient ObjectInspector copiedOI; // For PARTIAL2 and FINAL private transient StructObjectInspector soi; private transient StructField countField; @@ -359,6 +376,8 @@ public class GenericUDAFAverage extends AbstractGenericUDAFResolver { // init input if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) { inputOI = (PrimitiveObjectInspector) parameters[0]; + copiedOI = ObjectInspectorUtils.getStandardObjectInspector(inputOI, + ObjectInspectorCopyOption.JAVA); } else { soi = (StructObjectInspector) parameters[0]; countField = soi.getStructFieldRef("count"); @@ -412,6 +431,14 @@ public class GenericUDAFAverage extends AbstractGenericUDAFResolver { if (parameter != null) { AverageAggregationBuffer<TYPE> averageAggregation = (AverageAggregationBuffer<TYPE>) aggregation; try { + // Skip the same value if avgDistinct is true + if (this.avgDistinct && + ObjectInspectorUtils.compare(parameter, inputOI, averageAggregation.previousValue, copiedOI) == 0) { + return; + } + averageAggregation.previousValue = ObjectInspectorUtils.copyToStandardObject( + parameter, inputOI, ObjectInspectorCopyOption.JAVA); + doIterate(averageAggregation, inputOI, parameter); } catch (NumberFormatException e) { if (!warned) { http://git-wip-us.apache.org/repos/asf/hive/blob/4f90a715/ql/src/test/queries/clientpositive/windowing_distinct.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/windowing_distinct.q b/ql/src/test/queries/clientpositive/windowing_distinct.q index 9f6ddfd..bb192a7 100644 --- a/ql/src/test/queries/clientpositive/windowing_distinct.q +++ b/ql/src/test/queries/clientpositive/windowing_distinct.q @@ -36,3 +36,11 @@ SELECT SUM(DISTINCT t) OVER (PARTITION BY index), SUM(DISTINCT ts) OVER (PARTITION BY index), SUM(DISTINCT dec) OVER (PARTITION BY index) FROM windowing_distinct; + +SELECT AVG(DISTINCT t) OVER (PARTITION BY index), + AVG(DISTINCT d) OVER (PARTITION BY index), + AVG(DISTINCT s) OVER (PARTITION BY index), + AVG(DISTINCT concat('Mr.', s)) OVER (PARTITION BY index), + AVG(DISTINCT ts) OVER (PARTITION BY index), + AVG(DISTINCT dec) OVER (PARTITION BY index) +FROM windowing_distinct; http://git-wip-us.apache.org/repos/asf/hive/blob/4f90a715/ql/src/test/results/clientpositive/windowing_distinct.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/windowing_distinct.q.out b/ql/src/test/results/clientpositive/windowing_distinct.q.out index 0858f0f..074a594 100644 --- a/ql/src/test/results/clientpositive/windowing_distinct.q.out +++ b/ql/src/test/results/clientpositive/windowing_distinct.q.out @@ -102,3 +102,29 @@ POSTHOOK: Input: default@windowing_distinct 235 77.42 0.0 0.0 2.724315837406612E9 69 235 77.42 0.0 0.0 2.724315837406612E9 69 235 77.42 0.0 0.0 2.724315837406612E9 69 +PREHOOK: query: SELECT AVG(DISTINCT t) OVER (PARTITION BY index), + AVG(DISTINCT d) OVER (PARTITION BY index), + AVG(DISTINCT s) OVER (PARTITION BY index), + AVG(DISTINCT concat('Mr.', s)) OVER (PARTITION BY index), + AVG(DISTINCT ts) OVER (PARTITION BY index), + AVG(DISTINCT dec) OVER (PARTITION BY index) +FROM windowing_distinct +PREHOOK: type: QUERY +PREHOOK: Input: default@windowing_distinct +#### A masked pattern was here #### +POSTHOOK: query: SELECT AVG(DISTINCT t) OVER (PARTITION BY index), + AVG(DISTINCT d) OVER (PARTITION BY index), + AVG(DISTINCT s) OVER (PARTITION BY index), + AVG(DISTINCT concat('Mr.', s)) OVER (PARTITION BY index), + AVG(DISTINCT ts) OVER (PARTITION BY index), + AVG(DISTINCT dec) OVER (PARTITION BY index) +FROM windowing_distinct +POSTHOOK: type: QUERY +POSTHOOK: Input: default@windowing_distinct +#### A masked pattern was here #### +27.0 28.315 NULL NULL 1.362157918703148E9 28.5000 +27.0 28.315 NULL NULL 1.362157918703148E9 28.5000 +27.0 28.315 NULL NULL 1.362157918703148E9 28.5000 +117.5 38.71 NULL NULL 1.362157918703306E9 34.5000 +117.5 38.71 NULL NULL 1.362157918703306E9 34.5000 +117.5 38.71 NULL NULL 1.362157918703306E9 34.5000