Repository: hive
Updated Branches:
  refs/heads/branch-3 ef71812bf -> 7ed820f26
HIVE-19586 : Optimize Count(distinct X) pushdown based on the storage capabilities (Slim Bouguerra via Ashutosh Chauhan)

Signed-off-by: Ashutosh Chauhan <hashut...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/7ed820f2
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/7ed820f2
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/7ed820f2

Branch: refs/heads/branch-3
Commit: 7ed820f2602761e7df1962127e55ca0a7d39f19c
Parents: ef71812
Author: Slim Bouguerra <slim.bougue...@gmail.com>
Authored: Thu May 17 16:40:00 2018 -0700
Committer: Ashutosh Chauhan <hashut...@apache.org>
Committed: Sun Jun 3 09:15:40 2018 -0700

----------------------------------------------------------------------
 .../optimizer/calcite/rules/HiveDruidRules.java | 240 ++++++++-
 .../hadoop/hive/ql/parse/CalcitePlanner.java    | 104 ++--
 .../clientpositive/druidmini_expressions.q      |  30 ++
 .../druid/druidmini_expressions.q.out           | 521 +++++++++++++++++++
 4 files changed, 832 insertions(+), 63 deletions(-)
----------------------------------------------------------------------
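For context, the rule added below rewrites an aggregate that contains exactly one COUNT(DISTINCT) alongside ordinary aggregates into two aggregation phases, so that the inner phase can be pushed to Druid as a single groupBy query. A minimal HiveQL sketch of the idea, reusing the table and columns from the new druidmini_expressions.q tests (the hand-written two-phase form is only illustrative; the actual rewrite is performed on the Calcite plan by the new rule):

  -- Query shape targeted by the rule: one DISTINCT aggregate next to ordinary aggregates.
  SELECT count(DISTINCT cstring2), sum(cdouble)
  FROM druid_table_n0
  GROUP BY `__time`, cstring1;

  -- Roughly the two-phase plan the rule builds, written by hand: the DISTINCT column
  -- joins the inner GROUP BY (aggregate B, the part Druid can answer), and the outer
  -- aggregate A counts the surviving rows and sums the partial sums.
  SELECT count(cstring2), sum(partial_sum)
  FROM (
    SELECT `__time`, cstring1, cstring2, sum(cdouble) AS partial_sum
    FROM druid_table_n0
    GROUP BY `__time`, cstring1, cstring2
  ) t
  GROUP BY `__time`, cstring1;

The alias partial_sum and the subquery alias t are only for illustration; the rule itself operates on Calcite AggregateCall objects, not on SQL text.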
http://git-wip-us.apache.org/repos/asf/hive/blob/7ed820f2/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveDruidRules.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveDruidRules.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveDruidRules.java
index 7d2b9dc..a914210 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveDruidRules.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveDruidRules.java
@@ -17,40 +17,57 @@
  */
 package org.apache.hadoop.hive.ql.optimizer.calcite.rules;
 
-import org.apache.calcite.adapter.druid.DruidRules.DruidHavingFilterRule;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import org.apache.calcite.adapter.druid.DruidQuery;
 import org.apache.calcite.adapter.druid.DruidRules.DruidAggregateFilterTransposeRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidAggregateProjectRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidAggregateRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidFilterAggregateTransposeRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidFilterProjectTransposeRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidFilterRule;
+import org.apache.calcite.adapter.druid.DruidRules.DruidHavingFilterRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidPostAggregationProjectRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidProjectFilterTransposeRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidProjectRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidProjectSortTransposeRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidSortProjectTransposeRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidSortRule;
+import org.apache.calcite.plan.RelOptRule;
+import org.apache.calcite.plan.RelOptRuleCall;
+import org.apache.calcite.rel.core.Aggregate;
+import org.apache.calcite.rel.core.AggregateCall;
+import org.apache.calcite.sql.SqlKind;
+import org.apache.calcite.sql.fun.SqlSumEmptyIsZeroAggFunction;
+import org.apache.calcite.tools.RelBuilder;
+import org.apache.calcite.tools.RelBuilderFactory;
+import org.apache.calcite.util.ImmutableBitSet;
+import org.apache.calcite.util.Pair;
 import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
 /**
  * Druid rules with Hive builder factory.
  */
 public class HiveDruidRules {
-  public static final DruidFilterRule FILTER =
-      new DruidFilterRule(HiveRelFactories.HIVE_BUILDER);
+  public static final DruidFilterRule FILTER = new DruidFilterRule(HiveRelFactories.HIVE_BUILDER);
 
-  public static final DruidProjectRule PROJECT =
-      new DruidProjectRule(HiveRelFactories.HIVE_BUILDER);
+  public static final DruidProjectRule PROJECT = new DruidProjectRule(HiveRelFactories.HIVE_BUILDER);
 
-  public static final DruidAggregateRule AGGREGATE =
-      new DruidAggregateRule(HiveRelFactories.HIVE_BUILDER);
+  public static final DruidAggregateRule AGGREGATE = new DruidAggregateRule(HiveRelFactories.HIVE_BUILDER);
 
   public static final DruidAggregateProjectRule AGGREGATE_PROJECT =
       new DruidAggregateProjectRule(HiveRelFactories.HIVE_BUILDER);
 
-  public static final DruidSortRule SORT =
-      new DruidSortRule(HiveRelFactories.HIVE_BUILDER);
+  public static final DruidSortRule SORT = new DruidSortRule(HiveRelFactories.HIVE_BUILDER);
 
   public static final DruidSortProjectTransposeRule SORT_PROJECT_TRANSPOSE =
       new DruidSortProjectTransposeRule(HiveRelFactories.HIVE_BUILDER);
@@ -75,4 +92,209 @@ public class HiveDruidRules {
 
   public static final DruidHavingFilterRule HAVING_FILTER_RULE =
       new DruidHavingFilterRule(HiveRelFactories.HIVE_BUILDER);
+
+  public static final AggregateExpandDistinctAggregatesDruidRule EXPAND_SINGLE_DISTINCT_AGGREGATES_DRUID_RULE =
+      new AggregateExpandDistinctAggregatesDruidRule(HiveRelFactories.HIVE_BUILDER);
+
+  /**
+   * This is a simplified version of {@link org.apache.calcite.rel.rules.AggregateExpandDistinctAggregatesRule}.
+   * The goal of this simplified version is to help push a single count distinct down as multi-phase aggregates.
+   * This is an acceptable interim solution until we actually support grouping sets push-down to Druid.
+   * We limit it to one distinct count to avoid an expensive cross join and running into
+   * https://issues.apache.org/jira/browse/HIVE-19601.
+   */
+  public static class AggregateExpandDistinctAggregatesDruidRule extends RelOptRule {
+
+    public AggregateExpandDistinctAggregatesDruidRule(RelBuilderFactory relBuilderFactory) {
+      super(operand(Aggregate.class, operand(DruidQuery.class, none())), relBuilderFactory, null);
+    }
+
+    @Override public void onMatch(RelOptRuleCall call) {
+      Aggregate aggregate = call.rel(0);
+      if (!aggregate.containsDistinctCall()) {
+        return;
+      }
+      final long numCountDistinct = aggregate.getAggCallList()
+          .stream()
+          .filter(aggregateCall -> aggregateCall.getAggregation().getKind().equals(SqlKind.COUNT)
+              && aggregateCall.isDistinct())
+          .count();
+      if (numCountDistinct != 1) {
+        return;
+      }
+
+      // Find all of the agg expressions. We use a LinkedHashSet to ensure determinism.
+      int nonDistinctAggCallCount = 0; // find all aggregate calls without distinct
+      int filterCount = 0;
+      int unsupportedNonDistinctAggCallCount = 0;
+      final Set<Pair<List<Integer>, Integer>> argLists = new LinkedHashSet<>();
+      for (AggregateCall aggCall : aggregate.getAggCallList()) {
+        if (aggCall.filterArg >= 0) {
+          ++filterCount;
+        }
+        if (!aggCall.isDistinct()) {
+          ++nonDistinctAggCallCount;
+          final SqlKind aggCallKind = aggCall.getAggregation().getKind();
+          // We only support COUNT/SUM/MIN/MAX for the "single" count distinct optimization
+          switch (aggCallKind) {
+          case COUNT:
+          case SUM:
+          case SUM0:
+          case MIN:
+          case MAX:
+            break;
+          default:
+            ++unsupportedNonDistinctAggCallCount;
+          }
+        } else {
+          argLists.add(Pair.of(aggCall.getArgList(), aggCall.filterArg));
+        }
+      }
+      // If there is only one distinct aggregate and one or more non-distinct aggregates,
+      // we can generate multi-phase aggregates.
+      if (numCountDistinct == 1 // one distinct aggregate
+          && filterCount == 0 // no filter
+          && unsupportedNonDistinctAggCallCount == 0 // only sum/min/max/count among non-distinct aggregates
+          && nonDistinctAggCallCount > 0) { // one or more non-distinct aggregates
+        final RelBuilder relBuilder = call.builder();
+        convertSingletonDistinct(relBuilder, aggregate, argLists);
+        call.transformTo(relBuilder.build());
+        return;
+      }
+    }
+
+    /**
+     * Converts an aggregate with one distinct aggregate and one or more
+     * non-distinct aggregates to multi-phase aggregates (see the reference example
+     * below).
+     *
+     * @param relBuilder Contains the input relational expression
+     * @param aggregate Original aggregate
+     * @param argLists Arguments and filters to the distinct aggregate function
+     */
+    private RelBuilder convertSingletonDistinct(RelBuilder relBuilder,
+        Aggregate aggregate, Set<Pair<List<Integer>, Integer>> argLists) {
+
+      // In this case, we are assuming that there is a single distinct function.
+      // So make sure that argLists is of size one.
+      Preconditions.checkArgument(argLists.size() == 1);
+
+      // For example,
+      //   SELECT deptno, COUNT(*), SUM(bonus), MIN(DISTINCT sal)
+      //   FROM emp
+      //   GROUP BY deptno
+      //
+      // becomes
+      //
+      //   SELECT deptno, SUM(cnt), SUM(bonus), MIN(sal)
+      //   FROM (
+      //     SELECT deptno, COUNT(*) as cnt, SUM(bonus), sal
+      //     FROM EMP
+      //     GROUP BY deptno, sal)  // Aggregate B
+      //   GROUP BY deptno          // Aggregate A
+      relBuilder.push(aggregate.getInput());
+
+      final List<AggregateCall> originalAggCalls = aggregate.getAggCallList();
+      final ImmutableBitSet originalGroupSet = aggregate.getGroupSet();
+
+      // Add the distinct aggregate column(s) to the group-by columns,
+      // if not already a part of the group-by.
+      final SortedSet<Integer> bottomGroupSet = new TreeSet<>();
+      bottomGroupSet.addAll(aggregate.getGroupSet().asList());
+      for (AggregateCall aggCall : originalAggCalls) {
+        if (aggCall.isDistinct()) {
+          bottomGroupSet.addAll(aggCall.getArgList());
+          break; // since we only have a single distinct call
+        }
+      }
+
+      // Generate the intermediate aggregate B, the one on the bottom that converts
+      // the distinct call to a group-by call.
+      // The bottom aggregate is the same as the original aggregate, except that
+      // it has converted the DISTINCT aggregate to a group-by clause.
+      final List<AggregateCall> bottomAggregateCalls = new ArrayList<>();
+      for (AggregateCall aggCall : originalAggCalls) {
+        // Project the column corresponding to the distinct aggregate. Project
+        // as-is all the non-distinct aggregates.
+        if (!aggCall.isDistinct()) {
+          final AggregateCall newCall =
+              AggregateCall.create(aggCall.getAggregation(), false,
+                  aggCall.isApproximate(), aggCall.getArgList(), -1,
+                  ImmutableBitSet.of(bottomGroupSet).cardinality(),
+                  relBuilder.peek(), null, aggCall.name);
+          bottomAggregateCalls.add(newCall);
+        }
+      }
+      // Generate the aggregate B (see the reference example above).
+      relBuilder.push(
+          aggregate.copy(
+              aggregate.getTraitSet(), relBuilder.build(),
+              false, ImmutableBitSet.of(bottomGroupSet), null, bottomAggregateCalls));
+
+      // Add aggregate A (see the reference example above), the top aggregate,
+      // to handle the rest of the aggregation that the bottom aggregate hasn't handled.
+      final List<AggregateCall> topAggregateCalls = Lists.newArrayList();
+      // Use the remapped arguments for the (non)distinct aggregate calls.
+      int nonDistinctAggCallProcessedSoFar = 0;
+      for (AggregateCall aggCall : originalAggCalls) {
+        final AggregateCall newCall;
+        if (aggCall.isDistinct()) {
+          List<Integer> newArgList = new ArrayList<>();
+          for (int arg : aggCall.getArgList()) {
+            newArgList.add(bottomGroupSet.headSet(arg).size());
+          }
+          newCall =
+              AggregateCall.create(aggCall.getAggregation(),
+                  false,
+                  aggCall.isApproximate(),
+                  newArgList,
+                  -1,
+                  originalGroupSet.cardinality(),
+                  relBuilder.peek(),
+                  aggCall.getType(),
+                  aggCall.name);
+        } else {
+          // If aggregate B had a COUNT aggregate call, the corresponding aggregate at
+          // aggregate A must be SUM, since summing the per-group partial counts yields
+          // the overall count. For other aggregates, it remains the same.
+          final List<Integer> newArgs =
+              Lists.newArrayList(bottomGroupSet.size() + nonDistinctAggCallProcessedSoFar);
+          if (aggCall.getAggregation().getKind() == SqlKind.COUNT) {
+            newCall =
+                AggregateCall.create(new SqlSumEmptyIsZeroAggFunction(), false,
+                    aggCall.isApproximate(), newArgs, -1,
+                    originalGroupSet.cardinality(), relBuilder.peek(),
+                    aggCall.getType(), aggCall.getName());
+          } else {
+            newCall =
+                AggregateCall.create(aggCall.getAggregation(), false,
+                    aggCall.isApproximate(), newArgs, -1,
+                    originalGroupSet.cardinality(),
+                    relBuilder.peek(), aggCall.getType(), aggCall.name);
+          }
+          nonDistinctAggCallProcessedSoFar++;
+        }
+
+        topAggregateCalls.add(newCall);
+      }
+
+      // Populate the group-by keys with the remapped arguments for aggregate A.
+      // The top group set is basically an identity (the first X fields of aggregate B's
+      // output), minus the distinct aggCall's input.
+      final Set<Integer> topGroupSet = new HashSet<>();
+      int groupSetToAdd = 0;
+      for (int bottomGroup : bottomGroupSet) {
+        if (originalGroupSet.get(bottomGroup)) {
+          topGroupSet.add(groupSetToAdd);
+        }
+        groupSetToAdd++;
+      }
+      relBuilder.push(
+          aggregate.copy(aggregate.getTraitSet(),
+              relBuilder.build(), aggregate.indicator,
+              ImmutableBitSet.of(topGroupSet), null, topAggregateCalls));
+      return relBuilder;
+    }
+  }
 }
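For contrast, a shape the new rule intentionally skips (onMatch returns early when it sees more than one distinct count), again written against the test table; expanding several DISTINCT aggregates would require the cross-join-based rewrite this rule is designed to avoid (HIVE-19601):

  EXPLAIN SELECT count(DISTINCT cstring1), count(DISTINCT cstring2), sum(cdouble)
  FROM druid_table_n0
  GROUP BY `__time`;

Such queries simply fall back to the pre-existing planning path.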
http://git-wip-us.apache.org/repos/asf/hive/blob/7ed820f2/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
index 0bc9d23..d939110 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
@@ -17,34 +17,14 @@
  */
 package org.apache.hadoop.hive.ql.parse;
 
-import java.io.IOException;
-import java.lang.reflect.Field;
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.UndeclaredThrowableException;
-import java.math.BigDecimal;
-import java.util.AbstractMap.SimpleEntry;
-import java.util.ArrayDeque;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.BitSet;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Deque;
-import java.util.EnumSet;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Set;
-import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import javax.sql.DataSource;
+import com.google.common.base.Function;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableList.Builder;
+import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multimap;
 import org.antlr.runtime.ClassicToken;
 import org.antlr.runtime.CommonToken;
 import org.antlr.runtime.tree.Tree;
@@ -177,7 +157,6 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveExcept;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveIntersect;
-import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.jdbc.HiveJdbcConverter;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveRelNode;
@@ -186,6 +165,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortLimit;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableFunctionScan;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.jdbc.HiveJdbcConverter;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.jdbc.JdbcHiveTableScan;
 import
org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateJoinTransposeRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateProjectMergeRule; @@ -234,22 +214,19 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSubQueryRemoveRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveUnionMergeRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveUnionPullUpConstantsRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveWindowingFixRule; - +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCAbstractSplitFilterRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCAggregationPushDownRule; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCExtractJoinFilterRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCFilterJoinRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCFilterPushDownRule; -import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCExtractJoinFilterRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCJoinPushDownRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCProjectPushDownRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCSortPushDownRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCUnionPushDownRule; -import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCAbstractSplitFilterRule; - import org.apache.hadoop.hive.ql.optimizer.calcite.rules.views.HiveAggregateIncrementalRewritingRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.views.HiveMaterializedViewRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.views.HiveNoAggregateIncrementalRewritingRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.views.MaterializedViewRewritingRelVisitor; - import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTBuilder; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter; @@ -295,13 +272,32 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.joda.time.Interval; -import com.google.common.base.Function; -import com.google.common.collect.ArrayListMultimap; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableList.Builder; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Lists; -import com.google.common.collect.Multimap; +import javax.sql.DataSource; +import java.io.IOException; +import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.UndeclaredThrowableException; +import java.math.BigDecimal; +import java.util.AbstractMap.SimpleEntry; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.Collection; +import java.util.Collections; +import java.util.Deque; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; public class CalcitePlanner extends SemanticAnalyzer { @@ -1833,20 +1829,20 @@ public 
class CalcitePlanner extends SemanticAnalyzer { // 11. Apply Druid transformation rules perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER); calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null, - HepMatchOrder.BOTTOM_UP, - HiveDruidRules.FILTER, - HiveDruidRules.PROJECT_FILTER_TRANSPOSE, - HiveDruidRules.AGGREGATE_FILTER_TRANSPOSE, - HiveDruidRules.AGGREGATE_PROJECT, - HiveDruidRules.PROJECT, - HiveDruidRules.AGGREGATE, - HiveDruidRules.POST_AGGREGATION_PROJECT, - HiveDruidRules.FILTER_AGGREGATE_TRANSPOSE, - HiveDruidRules.FILTER_PROJECT_TRANSPOSE, - HiveDruidRules.HAVING_FILTER_RULE, - HiveDruidRules.SORT_PROJECT_TRANSPOSE, - HiveDruidRules.SORT, - HiveDruidRules.PROJECT_SORT_TRANSPOSE + HepMatchOrder.BOTTOM_UP, + HiveDruidRules.FILTER, HiveDruidRules.PROJECT_FILTER_TRANSPOSE, + HiveDruidRules.AGGREGATE_FILTER_TRANSPOSE, + HiveDruidRules.AGGREGATE_PROJECT, + HiveDruidRules.PROJECT, + HiveDruidRules.EXPAND_SINGLE_DISTINCT_AGGREGATES_DRUID_RULE, + HiveDruidRules.AGGREGATE, + HiveDruidRules.POST_AGGREGATION_PROJECT, + HiveDruidRules.FILTER_AGGREGATE_TRANSPOSE, + HiveDruidRules.FILTER_PROJECT_TRANSPOSE, + HiveDruidRules.HAVING_FILTER_RULE, + HiveDruidRules.SORT_PROJECT_TRANSPOSE, + HiveDruidRules.SORT, + HiveDruidRules.PROJECT_SORT_TRANSPOSE ); perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Druid transformation rules"); http://git-wip-us.apache.org/repos/asf/hive/blob/7ed820f2/ql/src/test/queries/clientpositive/druidmini_expressions.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/druidmini_expressions.q b/ql/src/test/queries/clientpositive/druidmini_expressions.q index 722936b..72e6256 100644 --- a/ql/src/test/queries/clientpositive/druidmini_expressions.q +++ b/ql/src/test/queries/clientpositive/druidmini_expressions.q @@ -51,4 +51,34 @@ EXPLAIN SELECT cstring1 || '_'|| cstring2, substring(cstring2, 2, 3) as concat , explain extended select count(*) from (select `__time` from druid_table_n0 limit 1) as src ; +SELECT `__time` +FROM druid_table_n0 +WHERE (`__time` BETWEEN '1968-01-01 00:00:00' AND '1970-01-01 00:00:00') + OR (`__time` BETWEEN '1968-02-01 00:00:00' AND '1970-04-01 00:00:00') ORDER BY `__time` ASC LIMIT 10; + +-- COUNT DISTINCT TESTS +-- AS PART OF https://issues.apache.org/jira/browse/HIVE-19586 + +EXPLAIN select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` ; + +EXPLAIN select count(distinct cdouble), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` ; + +EXPLAIN select count(distinct cstring2), sum(2 * cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` ; + +EXPLAIN select count(distinct cstring2 || '_'|| cstring1), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` ; + +EXPLAIN select count(DISTINCT cstring2) FROM druid_table_n0 ; +EXPLAIN select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 ; +EXPLAIN select count(distinct cstring2 || '_'|| cstring1), sum(cdouble), min(cint) FROM druid_table_n0; + +select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 GROUP BY floor_year(`__time`) ; + +select count(distinct cstring2), sum(2 * cdouble) FROM druid_table_n0 GROUP BY floor_year(`__time`) ; + +select count(DISTINCT cstring2) FROM druid_table_n0 ; + +select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 ; + +select count(distinct cstring2 || '_'|| cstring1), sum(cdouble), min(cint) FROM 
druid_table_n0; + DROP TABLE druid_table_n0; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/7ed820f2/ql/src/test/results/clientpositive/druid/druidmini_expressions.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/druid/druidmini_expressions.q.out b/ql/src/test/results/clientpositive/druid/druidmini_expressions.q.out index d51c5af..30bc5d0 100644 --- a/ql/src/test/results/clientpositive/druid/druidmini_expressions.q.out +++ b/ql/src/test/results/clientpositive/druid/druidmini_expressions.q.out @@ -368,6 +368,527 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: SELECT `__time` +FROM druid_table_n0 +WHERE (`__time` BETWEEN '1968-01-01 00:00:00' AND '1970-01-01 00:00:00') + OR (`__time` BETWEEN '1968-02-01 00:00:00' AND '1970-04-01 00:00:00') ORDER BY `__time` ASC LIMIT 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@druid_table_n0 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT `__time` +FROM druid_table_n0 +WHERE (`__time` BETWEEN '1968-01-01 00:00:00' AND '1970-01-01 00:00:00') + OR (`__time` BETWEEN '1968-02-01 00:00:00' AND '1970-04-01 00:00:00') ORDER BY `__time` ASC LIMIT 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@druid_table_n0 +POSTHOOK: Output: hdfs://### HDFS PATH ### +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +PREHOOK: query: EXPLAIN select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames extract,cstring1,cstring2,$f3 + druid.fieldTypes timestamp with local time zone,string,string,double + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"extraction","dimension":"__time","outputName":"extract","extractionFn":{"type":"timeFormat","format":"yyyy-MM-dd'T'HH:mm:ss.SSS'Z'","timeZone":"US/Pacific"}},{"type":"default","dimension":"cstring1","outputName":"cstring1","outputType":"STRING"},{"type":"default","dimension":"cstring2","outputName":"cstring2","outputType":"STRING"}],"limitSpec":{"type":"default"},"aggregations":[{"type":"doubleSum","name":"$f3","fieldName":"cdouble"}],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: extract (type: timestamp with local time zone), cstring1 (type: string), cstring2 (type: string), $f3 (type: double) + outputColumnNames: extract, cstring1, cstring2, $f3 + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(cstring2), sum($f3) + keys: extract (type: timestamp 
with local time zone), cstring1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: timestamp with local time zone), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: timestamp with local time zone), _col1 (type: string) + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: bigint), _col3 (type: double) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), sum(VALUE._col1) + keys: KEY._col0 (type: timestamp with local time zone), KEY._col1 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: bigint), _col3 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN select count(distinct cdouble), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(distinct cdouble), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames extract,cstring1,cdouble,$f3 + druid.fieldTypes timestamp with local time zone,string,double,double + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"extraction","dimension":"__time","outputName":"extract","extractionFn":{"type":"timeFormat","format":"yyyy-MM-dd'T'HH:mm:ss.SSS'Z'","timeZone":"US/Pacific"}},{"type":"default","dimension":"cstring1","outputName":"cstring1","outputType":"STRING"},{"type":"default","dimension":"cdouble","outputName":"cdouble","outputType":"DOUBLE"}],"limitSpec":{"type":"default"},"aggregations":[{"type":"doubleSum","name":"$f3","fieldName":"cdouble"}],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy + Statistics: Num rows: 9173 Data size: 2091840 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: extract (type: timestamp with local time zone), cstring1 (type: string), cdouble (type: double), $f3 (type: double) + outputColumnNames: extract, cstring1, cdouble, $f3 + Statistics: Num rows: 9173 Data size: 2091840 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(cdouble), sum($f3) + keys: extract (type: timestamp with local time zone), cstring1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 9173 Data 
size: 2091840 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: timestamp with local time zone), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: timestamp with local time zone), _col1 (type: string) + Statistics: Num rows: 9173 Data size: 2091840 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: bigint), _col3 (type: double) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), sum(VALUE._col1) + keys: KEY._col0 (type: timestamp with local time zone), KEY._col1 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 4586 Data size: 1045805 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: bigint), _col3 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4586 Data size: 1045805 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 4586 Data size: 1045805 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN select count(distinct cstring2), sum(2 * cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(distinct cstring2), sum(2 * cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames extract,cstring1,cstring2,$f3 + druid.fieldTypes timestamp with local time zone,string,string,double + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"extraction","dimension":"__time","outputName":"extract","extractionFn":{"type":"timeFormat","format":"yyyy-MM-dd'T'HH:mm:ss.SSS'Z'","timeZone":"US/Pacific"}},{"type":"default","dimension":"cstring1","outputName":"cstring1","outputType":"STRING"},{"type":"default","dimension":"cstring2","outputName":"cstring2","outputType":"STRING"}],"limitSpec":{"type":"default"},"aggregations":[{"type":"doubleSum","name":"$f3","expression":"(CAST(2, 'DOUBLE') * \"cdouble\")"}],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: extract (type: timestamp with local time zone), cstring1 (type: string), cstring2 (type: string), $f3 (type: double) + outputColumnNames: extract, cstring1, cstring2, $f3 + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(cstring2), sum($f3) + keys: extract (type: timestamp with local time zone), cstring1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: 
_col0 (type: timestamp with local time zone), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: timestamp with local time zone), _col1 (type: string) + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: bigint), _col3 (type: double) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), sum(VALUE._col1) + keys: KEY._col0 (type: timestamp with local time zone), KEY._col1 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: bigint), _col3 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN select count(distinct cstring2 || '_'|| cstring1), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(distinct cstring2 || '_'|| cstring1), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames extract,cstring1,vc,$f3 + druid.fieldTypes timestamp with local time zone,string,string,double + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"extraction","dimension":"__time","outputName":"extract","extractionFn":{"type":"timeFormat","format":"yyyy-MM-dd'T'HH:mm:ss.SSS'Z'","timeZone":"US/Pacific"}},{"type":"default","dimension":"cstring1","outputName":"cstring1","outputType":"STRING"},{"type":"default","dimension":"vc","outputName":"vc","outputType":"STRING"}],"virtualColumns":[{"type":"expression","name":"vc","expression":"concat(concat(\"cstring2\",'_'),\"cstring1\")","outputType":"STRING"}],"limitSpec":{"type":"default"},"aggregations":[{"type":"doubleSum","name":"$f3","fieldName":"cdouble"}],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: extract (type: timestamp with local time zone), cstring1 (type: string), vc (type: string), $f3 (type: double) + outputColumnNames: extract, cstring1, vc, $f3 + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(vc), sum($f3) + keys: extract (type: timestamp with local time zone), cstring1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: 
_col0 (type: timestamp with local time zone), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: timestamp with local time zone), _col1 (type: string) + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: bigint), _col3 (type: double) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), sum(VALUE._col1) + keys: KEY._col0 (type: timestamp with local time zone), KEY._col1 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: bigint), _col3 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN select count(DISTINCT cstring2) FROM druid_table_n0 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(DISTINCT cstring2) FROM druid_table_n0 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames cstring2 + druid.fieldTypes string + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"default","dimension":"cstring2","outputName":"cstring2","outputType":"STRING"}],"limitSpec":{"type":"default"},"aggregations":[],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy + Statistics: Num rows: 9173 Data size: 1603744 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cstring2 (type: string) + outputColumnNames: cstring2 + Statistics: Num rows: 9173 Data size: 1603744 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(cstring2) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN select count(DISTINCT cstring2), sum(cdouble) 
FROM druid_table_n0 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames cstring2,$f1 + druid.fieldTypes string,double + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"default","dimension":"cstring2","outputName":"cstring2","outputType":"STRING"}],"limitSpec":{"type":"default"},"aggregations":[{"type":"doubleSum","name":"$f1","fieldName":"cdouble"}],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy + Statistics: Num rows: 9173 Data size: 1673472 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cstring2 (type: string), $f1 (type: double) + outputColumnNames: cstring2, $f1 + Statistics: Num rows: 9173 Data size: 1673472 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(cstring2), sum($f1) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 208 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 208 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: double) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), sum(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 208 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 208 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN select count(distinct cstring2 || '_'|| cstring1), sum(cdouble), min(cint) FROM druid_table_n0 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(distinct cstring2 || '_'|| cstring1), sum(cdouble), min(cint) FROM druid_table_n0 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames vc,$f1,$f2 + druid.fieldTypes string,double,int + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"default","dimension":"vc","outputName":"vc","outputType":"STRING"}],"virtualColumns":[{"type":"expression","name":"vc","expression":"concat(concat(\"cstring2\",'_'),\"cstring1\")","outputType":"STRING"}],"limitSpec":{"type":"default"},"aggregations":[{"type":"doubleSum","name":"$f1","fieldName":"cdouble"},{"type":"longMin","name":"$f2","fieldName":"cint"}],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy 
+ Statistics: Num rows: 9173 Data size: 1708336 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: vc (type: string), $f1 (type: double), $f2 (type: int) + outputColumnNames: vc, $f1, $f2 + Statistics: Num rows: 9173 Data size: 1708336 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(vc), sum($f1), min($f2) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 216 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 216 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: double), _col2 (type: int) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), sum(VALUE._col1), min(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 216 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 216 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 GROUP BY floor_year(`__time`) +PREHOOK: type: QUERY +PREHOOK: Input: default@druid_table_n0 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 GROUP BY floor_year(`__time`) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@druid_table_n0 +POSTHOOK: Output: hdfs://### HDFS PATH ### +6078 2.7308662809692383E7 +PREHOOK: query: select count(distinct cstring2), sum(2 * cdouble) FROM druid_table_n0 GROUP BY floor_year(`__time`) +PREHOOK: type: QUERY +PREHOOK: Input: default@druid_table_n0 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(distinct cstring2), sum(2 * cdouble) FROM druid_table_n0 GROUP BY floor_year(`__time`) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@druid_table_n0 +POSTHOOK: Output: hdfs://### HDFS PATH ### +6078 5.4617325619384766E7 +PREHOOK: query: select count(DISTINCT cstring2) FROM druid_table_n0 +PREHOOK: type: QUERY +PREHOOK: Input: default@druid_table_n0 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(DISTINCT cstring2) FROM druid_table_n0 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@druid_table_n0 +POSTHOOK: Output: hdfs://### HDFS PATH ### +6078 +PREHOOK: query: select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 +PREHOOK: type: QUERY +PREHOOK: Input: default@druid_table_n0 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@druid_table_n0 +POSTHOOK: Output: hdfs://### HDFS PATH ### +6078 2.7308662809692383E7 +PREHOOK: query: select count(distinct cstring2 || '_'|| cstring1), sum(cdouble), min(cint) FROM druid_table_n0 +PREHOOK: type: QUERY +PREHOOK: Input: default@druid_table_n0 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(distinct cstring2 || '_'|| cstring1), sum(cdouble), min(cint) FROM druid_table_n0 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@druid_table_n0 +POSTHOOK: Output: hdfs://### HDFS PATH ### +6095 
2.7308662809692383E7 -1073279343 PREHOOK: query: DROP TABLE druid_table_n0 PREHOOK: type: DROPTABLE PREHOOK: Input: default@druid_table_n0