Repository: hive
Updated Branches:
  refs/heads/branch-3 ef71812bf -> 7ed820f26
HIVE-19586 : Optimize Count(distinct X) pushdown based on the storage capabilities (Slim Bouguerra via Ashutosh Chauhan)

Signed-off-by: Ashutosh Chauhan <hashut...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/7ed820f2
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/7ed820f2
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/7ed820f2

Branch: refs/heads/branch-3
Commit: 7ed820f2602761e7df1962127e55ca0a7d39f19c
Parents: ef71812
Author: Slim Bouguerra <slim.bougue...@gmail.com>
Authored: Thu May 17 16:40:00 2018 -0700
Committer: Ashutosh Chauhan <hashut...@apache.org>
Committed: Sun Jun 3 09:15:40 2018 -0700

----------------------------------------------------------------------
 .../optimizer/calcite/rules/HiveDruidRules.java | 240 ++++++++-
 .../hadoop/hive/ql/parse/CalcitePlanner.java    | 104 ++--
 .../clientpositive/druidmini_expressions.q      |  30 ++
 .../druid/druidmini_expressions.q.out           | 521 +++++++++++++++++++
 4 files changed, 832 insertions(+), 63 deletions(-)
----------------------------------------------------------------------
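For context, the rule added below rewrites an aggregate that contains exactly one COUNT(DISTINCT) alongside ordinary aggregates into two aggregation phases, so that the inner phase can be pushed to Druid as a single groupBy query. A minimal HiveQL sketch of the idea, reusing the table and columns from the new druidmini_expressions.q tests (the hand-written two-phase form is only illustrative; the actual rewrite is performed on the Calcite plan by the new rule):

  -- Query shape targeted by the rule: one DISTINCT aggregate next to ordinary aggregates.
  SELECT count(DISTINCT cstring2), sum(cdouble)
  FROM druid_table_n0
  GROUP BY `__time`, cstring1;

  -- Roughly the two-phase plan the rule builds, written by hand: the DISTINCT column
  -- joins the inner GROUP BY (aggregate B, the part Druid can answer), and the outer
  -- aggregate A counts the surviving rows and sums the partial sums.
  SELECT count(cstring2), sum(partial_sum)
  FROM (
    SELECT `__time`, cstring1, cstring2, sum(cdouble) AS partial_sum
    FROM druid_table_n0
    GROUP BY `__time`, cstring1, cstring2
  ) t
  GROUP BY `__time`, cstring1;

The alias partial_sum and the subquery alias t are only for illustration; the rule itself operates on Calcite AggregateCall objects, not on SQL text.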
http://git-wip-us.apache.org/repos/asf/hive/blob/7ed820f2/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveDruidRules.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveDruidRules.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveDruidRules.java
index 7d2b9dc..a914210 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveDruidRules.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveDruidRules.java
@@ -17,40 +17,57 @@
  */
 package org.apache.hadoop.hive.ql.optimizer.calcite.rules;
 
-import org.apache.calcite.adapter.druid.DruidRules.DruidHavingFilterRule;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import org.apache.calcite.adapter.druid.DruidQuery;
 import org.apache.calcite.adapter.druid.DruidRules.DruidAggregateFilterTransposeRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidAggregateProjectRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidAggregateRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidFilterAggregateTransposeRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidFilterProjectTransposeRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidFilterRule;
+import org.apache.calcite.adapter.druid.DruidRules.DruidHavingFilterRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidPostAggregationProjectRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidProjectFilterTransposeRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidProjectRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidProjectSortTransposeRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidSortProjectTransposeRule;
 import org.apache.calcite.adapter.druid.DruidRules.DruidSortRule;
+import org.apache.calcite.plan.RelOptRule;
+import org.apache.calcite.plan.RelOptRuleCall;
+import org.apache.calcite.rel.core.Aggregate;
+import org.apache.calcite.rel.core.AggregateCall;
+import org.apache.calcite.sql.SqlKind;
+import org.apache.calcite.sql.fun.SqlSumEmptyIsZeroAggFunction;
+import org.apache.calcite.tools.RelBuilder;
+import org.apache.calcite.tools.RelBuilderFactory;
+import org.apache.calcite.util.ImmutableBitSet;
+import org.apache.calcite.util.Pair;
 import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
 /**
  * Druid rules with Hive builder factory.
  */
 public class HiveDruidRules {
-  public static final DruidFilterRule FILTER =
-      new DruidFilterRule(HiveRelFactories.HIVE_BUILDER);
+  public static final DruidFilterRule FILTER = new DruidFilterRule(HiveRelFactories.HIVE_BUILDER);
 
-  public static final DruidProjectRule PROJECT =
-      new DruidProjectRule(HiveRelFactories.HIVE_BUILDER);
+  public static final DruidProjectRule PROJECT = new DruidProjectRule(HiveRelFactories.HIVE_BUILDER);
 
-  public static final DruidAggregateRule AGGREGATE =
-      new DruidAggregateRule(HiveRelFactories.HIVE_BUILDER);
+  public static final DruidAggregateRule AGGREGATE = new DruidAggregateRule(HiveRelFactories.HIVE_BUILDER);
 
   public static final DruidAggregateProjectRule AGGREGATE_PROJECT =
       new DruidAggregateProjectRule(HiveRelFactories.HIVE_BUILDER);
 
-  public static final DruidSortRule SORT =
-      new DruidSortRule(HiveRelFactories.HIVE_BUILDER);
+  public static final DruidSortRule SORT = new DruidSortRule(HiveRelFactories.HIVE_BUILDER);
 
   public static final DruidSortProjectTransposeRule SORT_PROJECT_TRANSPOSE =
       new DruidSortProjectTransposeRule(HiveRelFactories.HIVE_BUILDER);
@@ -75,4 +92,209 @@ public class HiveDruidRules {
 
   public static final DruidHavingFilterRule HAVING_FILTER_RULE =
       new DruidHavingFilterRule(HiveRelFactories.HIVE_BUILDER);
+
+  public static final AggregateExpandDistinctAggregatesDruidRule EXPAND_SINGLE_DISTINCT_AGGREGATES_DRUID_RULE =
+      new AggregateExpandDistinctAggregatesDruidRule(HiveRelFactories.HIVE_BUILDER);
+
+  /**
+   * This is a simplified version of {@link org.apache.calcite.rel.rules.AggregateExpandDistinctAggregatesRule}.
+   * The goal of this simplified version is to help push a single count distinct down as multi-phase aggregates.
+   * This is an acceptable interim solution until we actually support grouping sets push-down to Druid.
+   * We limit it to one distinct count to avoid an expensive cross join and running into
+   * https://issues.apache.org/jira/browse/HIVE-19601.
+   */
+  public static class AggregateExpandDistinctAggregatesDruidRule extends RelOptRule {
+
+    public AggregateExpandDistinctAggregatesDruidRule(RelBuilderFactory relBuilderFactory) {
+      super(operand(Aggregate.class, operand(DruidQuery.class, none())), relBuilderFactory, null);
+    }
+
+    @Override public void onMatch(RelOptRuleCall call) {
+      Aggregate aggregate = call.rel(0);
+      if (!aggregate.containsDistinctCall()) {
+        return;
+      }
+      final long numCountDistinct = aggregate.getAggCallList()
+          .stream()
+          .filter(aggregateCall -> aggregateCall.getAggregation().getKind().equals(SqlKind.COUNT)
+              && aggregateCall.isDistinct())
+          .count();
+      if (numCountDistinct != 1) {
+        return;
+      }
+
+      // Find all of the agg expressions. We use a LinkedHashSet to ensure determinism.
+      int nonDistinctAggCallCount = 0; // find all aggregate calls without distinct
+      int filterCount = 0;
+      int unsupportedNonDistinctAggCallCount = 0;
+      final Set<Pair<List<Integer>, Integer>> argLists = new LinkedHashSet<>();
+      for (AggregateCall aggCall : aggregate.getAggCallList()) {
+        if (aggCall.filterArg >= 0) {
+          ++filterCount;
+        }
+        if (!aggCall.isDistinct()) {
+          ++nonDistinctAggCallCount;
+          final SqlKind aggCallKind = aggCall.getAggregation().getKind();
+          // We only support COUNT/SUM/MIN/MAX for the "single" count distinct optimization
+          switch (aggCallKind) {
+          case COUNT:
+          case SUM:
+          case SUM0:
+          case MIN:
+          case MAX:
+            break;
+          default:
+            ++unsupportedNonDistinctAggCallCount;
+          }
+        } else {
+          argLists.add(Pair.of(aggCall.getArgList(), aggCall.filterArg));
+        }
+      }
+      // If there is only one distinct aggregate and one or more non-distinct aggregates,
+      // we can generate multi-phase aggregates.
+      if (numCountDistinct == 1 // one distinct aggregate
+          && filterCount == 0 // no filter
+          && unsupportedNonDistinctAggCallCount == 0 // only sum/min/max/count among non-distinct aggregates
+          && nonDistinctAggCallCount > 0) { // one or more non-distinct aggregates
+        final RelBuilder relBuilder = call.builder();
+        convertSingletonDistinct(relBuilder, aggregate, argLists);
+        call.transformTo(relBuilder.build());
+        return;
+      }
+    }
+
+    /**
+     * Converts an aggregate with one distinct aggregate and one or more
+     * non-distinct aggregates to multi-phase aggregates (see the reference example
+     * below).
+     *
+     * @param relBuilder Contains the input relational expression
+     * @param aggregate Original aggregate
+     * @param argLists Arguments and filters to the distinct aggregate function
+     */
+    private RelBuilder convertSingletonDistinct(RelBuilder relBuilder,
+        Aggregate aggregate, Set<Pair<List<Integer>, Integer>> argLists) {
+
+      // In this case, we are assuming that there is a single distinct function.
+      // So make sure that argLists is of size one.
+      Preconditions.checkArgument(argLists.size() == 1);
+
+      // For example,
+      //   SELECT deptno, COUNT(*), SUM(bonus), MIN(DISTINCT sal)
+      //   FROM emp
+      //   GROUP BY deptno
+      //
+      // becomes
+      //
+      //   SELECT deptno, SUM(cnt), SUM(bonus), MIN(sal)
+      //   FROM (
+      //     SELECT deptno, COUNT(*) as cnt, SUM(bonus), sal
+      //     FROM EMP
+      //     GROUP BY deptno, sal)  // Aggregate B
+      //   GROUP BY deptno          // Aggregate A
+      relBuilder.push(aggregate.getInput());
+
+      final List<AggregateCall> originalAggCalls = aggregate.getAggCallList();
+      final ImmutableBitSet originalGroupSet = aggregate.getGroupSet();
+
+      // Add the distinct aggregate column(s) to the group-by columns,
+      // if not already a part of the group-by.
+      final SortedSet<Integer> bottomGroupSet = new TreeSet<>();
+      bottomGroupSet.addAll(aggregate.getGroupSet().asList());
+      for (AggregateCall aggCall : originalAggCalls) {
+        if (aggCall.isDistinct()) {
+          bottomGroupSet.addAll(aggCall.getArgList());
+          break; // since we only have a single distinct call
+        }
+      }
+
+      // Generate the intermediate aggregate B, the one on the bottom that converts
+      // the distinct call to a group-by call.
+      // The bottom aggregate is the same as the original aggregate, except that
+      // it has converted the DISTINCT aggregate to a group-by clause.
+      final List<AggregateCall> bottomAggregateCalls = new ArrayList<>();
+      for (AggregateCall aggCall : originalAggCalls) {
+        // Project the column corresponding to the distinct aggregate. Project
+        // as-is all the non-distinct aggregates.
+        if (!aggCall.isDistinct()) {
+          final AggregateCall newCall =
+              AggregateCall.create(aggCall.getAggregation(), false,
+                  aggCall.isApproximate(), aggCall.getArgList(), -1,
+                  ImmutableBitSet.of(bottomGroupSet).cardinality(),
+                  relBuilder.peek(), null, aggCall.name);
+          bottomAggregateCalls.add(newCall);
+        }
+      }
+      // Generate the aggregate B (see the reference example above).
+      relBuilder.push(
+          aggregate.copy(
+              aggregate.getTraitSet(), relBuilder.build(),
+              false, ImmutableBitSet.of(bottomGroupSet), null, bottomAggregateCalls));
+
+      // Add aggregate A (see the reference example above), the top aggregate,
+      // to handle the rest of the aggregation that the bottom aggregate hasn't handled.
+      final List<AggregateCall> topAggregateCalls = Lists.newArrayList();
+      // Use the remapped arguments for the (non)distinct aggregate calls.
+      int nonDistinctAggCallProcessedSoFar = 0;
+      for (AggregateCall aggCall : originalAggCalls) {
+        final AggregateCall newCall;
+        if (aggCall.isDistinct()) {
+          List<Integer> newArgList = new ArrayList<>();
+          for (int arg : aggCall.getArgList()) {
+            newArgList.add(bottomGroupSet.headSet(arg).size());
+          }
+          newCall =
+              AggregateCall.create(aggCall.getAggregation(),
+                  false,
+                  aggCall.isApproximate(),
+                  newArgList,
+                  -1,
+                  originalGroupSet.cardinality(),
+                  relBuilder.peek(),
+                  aggCall.getType(),
+                  aggCall.name);
+        } else {
+          // If aggregate B had a COUNT aggregate call, the corresponding aggregate at
+          // aggregate A must be SUM, since summing the per-group partial counts yields
+          // the overall count. For other aggregates, it remains the same.
+          final List<Integer> newArgs =
+              Lists.newArrayList(bottomGroupSet.size() + nonDistinctAggCallProcessedSoFar);
+          if (aggCall.getAggregation().getKind() == SqlKind.COUNT) {
+            newCall =
+                AggregateCall.create(new SqlSumEmptyIsZeroAggFunction(), false,
+                    aggCall.isApproximate(), newArgs, -1,
+                    originalGroupSet.cardinality(), relBuilder.peek(),
+                    aggCall.getType(), aggCall.getName());
+          } else {
+            newCall =
+                AggregateCall.create(aggCall.getAggregation(), false,
+                    aggCall.isApproximate(), newArgs, -1,
+                    originalGroupSet.cardinality(),
+                    relBuilder.peek(), aggCall.getType(), aggCall.name);
+          }
+          nonDistinctAggCallProcessedSoFar++;
+        }
+
+        topAggregateCalls.add(newCall);
+      }
+
+      // Populate the group-by keys with the remapped arguments for aggregate A.
+      // The top group set is basically an identity (the first X fields of aggregate B's
+      // output), minus the distinct aggCall's input.
+      final Set<Integer> topGroupSet = new HashSet<>();
+      int groupSetToAdd = 0;
+      for (int bottomGroup : bottomGroupSet) {
+        if (originalGroupSet.get(bottomGroup)) {
+          topGroupSet.add(groupSetToAdd);
+        }
+        groupSetToAdd++;
+      }
+      relBuilder.push(
+          aggregate.copy(aggregate.getTraitSet(),
+              relBuilder.build(), aggregate.indicator,
+              ImmutableBitSet.of(topGroupSet), null, topAggregateCalls));
+      return relBuilder;
+    }
+  }
 }
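For contrast, a shape the new rule intentionally skips (onMatch returns early when it sees more than one distinct count), again written against the test table; expanding several DISTINCT aggregates would require the cross-join-based rewrite this rule is designed to avoid (HIVE-19601):

  EXPLAIN SELECT count(DISTINCT cstring1), count(DISTINCT cstring2), sum(cdouble)
  FROM druid_table_n0
  GROUP BY `__time`;

Such queries simply fall back to the pre-existing planning path.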
http://git-wip-us.apache.org/repos/asf/hive/blob/7ed820f2/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
index 0bc9d23..d939110 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
@@ -17,34 +17,14 @@
  */
 package org.apache.hadoop.hive.ql.parse;
 
-import java.io.IOException;
-import java.lang.reflect.Field;
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.UndeclaredThrowableException;
-import java.math.BigDecimal;
-import java.util.AbstractMap.SimpleEntry;
-import java.util.ArrayDeque;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.BitSet;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Deque;
-import java.util.EnumSet;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Set;
-import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import javax.sql.DataSource;
+import com.google.common.base.Function;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableList.Builder;
+import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multimap;
 import org.antlr.runtime.ClassicToken;
 import org.antlr.runtime.CommonToken;
 import org.antlr.runtime.tree.Tree;
@@ -177,7 +157,6 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveExcept;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveIntersect;
-import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.jdbc.HiveJdbcConverter;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveRelNode;
@@ -186,6 +165,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortLimit;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableFunctionScan;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.jdbc.HiveJdbcConverter;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.jdbc.JdbcHiveTableScan;
 import
org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateJoinTransposeRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateProjectMergeRule; @@ -234,22 +214,19 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSubQueryRemoveRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveUnionMergeRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveUnionPullUpConstantsRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveWindowingFixRule; - +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCAbstractSplitFilterRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCAggregationPushDownRule; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCExtractJoinFilterRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCFilterJoinRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCFilterPushDownRule; -import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCExtractJoinFilterRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCJoinPushDownRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCProjectPushDownRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCSortPushDownRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCUnionPushDownRule; -import org.apache.hadoop.hive.ql.optimizer.calcite.rules.jdbc.JDBCAbstractSplitFilterRule; - import org.apache.hadoop.hive.ql.optimizer.calcite.rules.views.HiveAggregateIncrementalRewritingRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.views.HiveMaterializedViewRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.views.HiveNoAggregateIncrementalRewritingRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.views.MaterializedViewRewritingRelVisitor; - import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTBuilder; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter; import org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter; @@ -295,13 +272,32 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.joda.time.Interval; -import com.google.common.base.Function; -import com.google.common.collect.ArrayListMultimap; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableList.Builder; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Lists; -import com.google.common.collect.Multimap; +import javax.sql.DataSource; +import java.io.IOException; +import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.UndeclaredThrowableException; +import java.math.BigDecimal; +import java.util.AbstractMap.SimpleEntry; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.Collection; +import java.util.Collections; +import java.util.Deque; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; public class CalcitePlanner extends SemanticAnalyzer { @@ -1833,20 +1829,20 @@ public 
class CalcitePlanner extends SemanticAnalyzer { // 11. Apply Druid transformation rules perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER); calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null, - HepMatchOrder.BOTTOM_UP, - HiveDruidRules.FILTER, - HiveDruidRules.PROJECT_FILTER_TRANSPOSE, - HiveDruidRules.AGGREGATE_FILTER_TRANSPOSE, - HiveDruidRules.AGGREGATE_PROJECT, - HiveDruidRules.PROJECT, - HiveDruidRules.AGGREGATE, - HiveDruidRules.POST_AGGREGATION_PROJECT, - HiveDruidRules.FILTER_AGGREGATE_TRANSPOSE, - HiveDruidRules.FILTER_PROJECT_TRANSPOSE, - HiveDruidRules.HAVING_FILTER_RULE, - HiveDruidRules.SORT_PROJECT_TRANSPOSE, - HiveDruidRules.SORT, - HiveDruidRules.PROJECT_SORT_TRANSPOSE + HepMatchOrder.BOTTOM_UP, + HiveDruidRules.FILTER, HiveDruidRules.PROJECT_FILTER_TRANSPOSE, + HiveDruidRules.AGGREGATE_FILTER_TRANSPOSE, + HiveDruidRules.AGGREGATE_PROJECT, + HiveDruidRules.PROJECT, + HiveDruidRules.EXPAND_SINGLE_DISTINCT_AGGREGATES_DRUID_RULE, + HiveDruidRules.AGGREGATE, + HiveDruidRules.POST_AGGREGATION_PROJECT, + HiveDruidRules.FILTER_AGGREGATE_TRANSPOSE, + HiveDruidRules.FILTER_PROJECT_TRANSPOSE, + HiveDruidRules.HAVING_FILTER_RULE, + HiveDruidRules.SORT_PROJECT_TRANSPOSE, + HiveDruidRules.SORT, + HiveDruidRules.PROJECT_SORT_TRANSPOSE ); perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Druid transformation rules"); http://git-wip-us.apache.org/repos/asf/hive/blob/7ed820f2/ql/src/test/queries/clientpositive/druidmini_expressions.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/druidmini_expressions.q b/ql/src/test/queries/clientpositive/druidmini_expressions.q index 722936b..72e6256 100644 --- a/ql/src/test/queries/clientpositive/druidmini_expressions.q +++ b/ql/src/test/queries/clientpositive/druidmini_expressions.q @@ -51,4 +51,34 @@ EXPLAIN SELECT cstring1 || '_'|| cstring2, substring(cstring2, 2, 3) as concat , explain extended select count(*) from (select `__time` from druid_table_n0 limit 1) as src ; +SELECT `__time` +FROM druid_table_n0 +WHERE (`__time` BETWEEN '1968-01-01 00:00:00' AND '1970-01-01 00:00:00') + OR (`__time` BETWEEN '1968-02-01 00:00:00' AND '1970-04-01 00:00:00') ORDER BY `__time` ASC LIMIT 10; + +-- COUNT DISTINCT TESTS +-- AS PART OF https://issues.apache.org/jira/browse/HIVE-19586 + +EXPLAIN select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` ; + +EXPLAIN select count(distinct cdouble), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` ; + +EXPLAIN select count(distinct cstring2), sum(2 * cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` ; + +EXPLAIN select count(distinct cstring2 || '_'|| cstring1), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` ; + +EXPLAIN select count(DISTINCT cstring2) FROM druid_table_n0 ; +EXPLAIN select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 ; +EXPLAIN select count(distinct cstring2 || '_'|| cstring1), sum(cdouble), min(cint) FROM druid_table_n0; + +select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 GROUP BY floor_year(`__time`) ; + +select count(distinct cstring2), sum(2 * cdouble) FROM druid_table_n0 GROUP BY floor_year(`__time`) ; + +select count(DISTINCT cstring2) FROM druid_table_n0 ; + +select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 ; + +select count(distinct cstring2 || '_'|| cstring1), sum(cdouble), min(cint) FROM 
druid_table_n0; + DROP TABLE druid_table_n0; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/7ed820f2/ql/src/test/results/clientpositive/druid/druidmini_expressions.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/druid/druidmini_expressions.q.out b/ql/src/test/results/clientpositive/druid/druidmini_expressions.q.out index d51c5af..30bc5d0 100644 --- a/ql/src/test/results/clientpositive/druid/druidmini_expressions.q.out +++ b/ql/src/test/results/clientpositive/druid/druidmini_expressions.q.out @@ -368,6 +368,527 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: SELECT `__time` +FROM druid_table_n0 +WHERE (`__time` BETWEEN '1968-01-01 00:00:00' AND '1970-01-01 00:00:00') + OR (`__time` BETWEEN '1968-02-01 00:00:00' AND '1970-04-01 00:00:00') ORDER BY `__time` ASC LIMIT 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@druid_table_n0 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT `__time` +FROM druid_table_n0 +WHERE (`__time` BETWEEN '1968-01-01 00:00:00' AND '1970-01-01 00:00:00') + OR (`__time` BETWEEN '1968-02-01 00:00:00' AND '1970-04-01 00:00:00') ORDER BY `__time` ASC LIMIT 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@druid_table_n0 +POSTHOOK: Output: hdfs://### HDFS PATH ### +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +1969-12-31 15:59:00.0 US/Pacific +PREHOOK: query: EXPLAIN select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames extract,cstring1,cstring2,$f3 + druid.fieldTypes timestamp with local time zone,string,string,double + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"extraction","dimension":"__time","outputName":"extract","extractionFn":{"type":"timeFormat","format":"yyyy-MM-dd'T'HH:mm:ss.SSS'Z'","timeZone":"US/Pacific"}},{"type":"default","dimension":"cstring1","outputName":"cstring1","outputType":"STRING"},{"type":"default","dimension":"cstring2","outputName":"cstring2","outputType":"STRING"}],"limitSpec":{"type":"default"},"aggregations":[{"type":"doubleSum","name":"$f3","fieldName":"cdouble"}],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: extract (type: timestamp with local time zone), cstring1 (type: string), cstring2 (type: string), $f3 (type: double) + outputColumnNames: extract, cstring1, cstring2, $f3 + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(cstring2), sum($f3) + keys: extract (type: timestamp 
with local time zone), cstring1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: timestamp with local time zone), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: timestamp with local time zone), _col1 (type: string) + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: bigint), _col3 (type: double) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), sum(VALUE._col1) + keys: KEY._col0 (type: timestamp with local time zone), KEY._col1 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: bigint), _col3 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN select count(distinct cdouble), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(distinct cdouble), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames extract,cstring1,cdouble,$f3 + druid.fieldTypes timestamp with local time zone,string,double,double + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"extraction","dimension":"__time","outputName":"extract","extractionFn":{"type":"timeFormat","format":"yyyy-MM-dd'T'HH:mm:ss.SSS'Z'","timeZone":"US/Pacific"}},{"type":"default","dimension":"cstring1","outputName":"cstring1","outputType":"STRING"},{"type":"default","dimension":"cdouble","outputName":"cdouble","outputType":"DOUBLE"}],"limitSpec":{"type":"default"},"aggregations":[{"type":"doubleSum","name":"$f3","fieldName":"cdouble"}],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy + Statistics: Num rows: 9173 Data size: 2091840 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: extract (type: timestamp with local time zone), cstring1 (type: string), cdouble (type: double), $f3 (type: double) + outputColumnNames: extract, cstring1, cdouble, $f3 + Statistics: Num rows: 9173 Data size: 2091840 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(cdouble), sum($f3) + keys: extract (type: timestamp with local time zone), cstring1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 9173 Data 
size: 2091840 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: timestamp with local time zone), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: timestamp with local time zone), _col1 (type: string) + Statistics: Num rows: 9173 Data size: 2091840 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: bigint), _col3 (type: double) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), sum(VALUE._col1) + keys: KEY._col0 (type: timestamp with local time zone), KEY._col1 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 4586 Data size: 1045805 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: bigint), _col3 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4586 Data size: 1045805 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 4586 Data size: 1045805 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN select count(distinct cstring2), sum(2 * cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(distinct cstring2), sum(2 * cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames extract,cstring1,cstring2,$f3 + druid.fieldTypes timestamp with local time zone,string,string,double + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"extraction","dimension":"__time","outputName":"extract","extractionFn":{"type":"timeFormat","format":"yyyy-MM-dd'T'HH:mm:ss.SSS'Z'","timeZone":"US/Pacific"}},{"type":"default","dimension":"cstring1","outputName":"cstring1","outputType":"STRING"},{"type":"default","dimension":"cstring2","outputName":"cstring2","outputType":"STRING"}],"limitSpec":{"type":"default"},"aggregations":[{"type":"doubleSum","name":"$f3","expression":"(CAST(2, 'DOUBLE') * \"cdouble\")"}],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: extract (type: timestamp with local time zone), cstring1 (type: string), cstring2 (type: string), $f3 (type: double) + outputColumnNames: extract, cstring1, cstring2, $f3 + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(cstring2), sum($f3) + keys: extract (type: timestamp with local time zone), cstring1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: 
_col0 (type: timestamp with local time zone), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: timestamp with local time zone), _col1 (type: string) + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: bigint), _col3 (type: double) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), sum(VALUE._col1) + keys: KEY._col0 (type: timestamp with local time zone), KEY._col1 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: bigint), _col3 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN select count(distinct cstring2 || '_'|| cstring1), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(distinct cstring2 || '_'|| cstring1), sum(cdouble) FROM druid_table_n0 GROUP BY `__time`, `cstring1` +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames extract,cstring1,vc,$f3 + druid.fieldTypes timestamp with local time zone,string,string,double + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"extraction","dimension":"__time","outputName":"extract","extractionFn":{"type":"timeFormat","format":"yyyy-MM-dd'T'HH:mm:ss.SSS'Z'","timeZone":"US/Pacific"}},{"type":"default","dimension":"cstring1","outputName":"cstring1","outputType":"STRING"},{"type":"default","dimension":"vc","outputName":"vc","outputType":"STRING"}],"virtualColumns":[{"type":"expression","name":"vc","expression":"concat(concat(\"cstring2\",'_'),\"cstring1\")","outputType":"STRING"}],"limitSpec":{"type":"default"},"aggregations":[{"type":"doubleSum","name":"$f3","fieldName":"cdouble"}],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: extract (type: timestamp with local time zone), cstring1 (type: string), vc (type: string), $f3 (type: double) + outputColumnNames: extract, cstring1, vc, $f3 + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(vc), sum($f3) + keys: extract (type: timestamp with local time zone), cstring1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: 
_col0 (type: timestamp with local time zone), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: timestamp with local time zone), _col1 (type: string) + Statistics: Num rows: 9173 Data size: 3625856 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: bigint), _col3 (type: double) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), sum(VALUE._col1) + keys: KEY._col0 (type: timestamp with local time zone), KEY._col1 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col2 (type: bigint), _col3 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 4586 Data size: 1812730 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN select count(DISTINCT cstring2) FROM druid_table_n0 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(DISTINCT cstring2) FROM druid_table_n0 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames cstring2 + druid.fieldTypes string + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"default","dimension":"cstring2","outputName":"cstring2","outputType":"STRING"}],"limitSpec":{"type":"default"},"aggregations":[],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy + Statistics: Num rows: 9173 Data size: 1603744 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cstring2 (type: string) + outputColumnNames: cstring2 + Statistics: Num rows: 9173 Data size: 1603744 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(cstring2) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN select count(DISTINCT cstring2), sum(cdouble) 
FROM druid_table_n0 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames cstring2,$f1 + druid.fieldTypes string,double + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"default","dimension":"cstring2","outputName":"cstring2","outputType":"STRING"}],"limitSpec":{"type":"default"},"aggregations":[{"type":"doubleSum","name":"$f1","fieldName":"cdouble"}],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy + Statistics: Num rows: 9173 Data size: 1673472 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cstring2 (type: string), $f1 (type: double) + outputColumnNames: cstring2, $f1 + Statistics: Num rows: 9173 Data size: 1673472 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(cstring2), sum($f1) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 208 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 208 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: double) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), sum(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 208 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 208 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN select count(distinct cstring2 || '_'|| cstring1), sum(cdouble), min(cint) FROM druid_table_n0 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(distinct cstring2 || '_'|| cstring1), sum(cdouble), min(cint) FROM druid_table_n0 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: druid_table_n0 + properties: + druid.fieldNames vc,$f1,$f2 + druid.fieldTypes string,double,int + druid.query.json {"queryType":"groupBy","dataSource":"default.druid_table_n0","granularity":"all","dimensions":[{"type":"default","dimension":"vc","outputName":"vc","outputType":"STRING"}],"virtualColumns":[{"type":"expression","name":"vc","expression":"concat(concat(\"cstring2\",'_'),\"cstring1\")","outputType":"STRING"}],"limitSpec":{"type":"default"},"aggregations":[{"type":"doubleSum","name":"$f1","fieldName":"cdouble"},{"type":"longMin","name":"$f2","fieldName":"cint"}],"intervals":["1900-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z"]} + druid.query.type groupBy 
+ Statistics: Num rows: 9173 Data size: 1708336 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: vc (type: string), $f1 (type: double), $f2 (type: int) + outputColumnNames: vc, $f1, $f2 + Statistics: Num rows: 9173 Data size: 1708336 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(vc), sum($f1), min($f2) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 216 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 216 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: double), _col2 (type: int) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), sum(VALUE._col1), min(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 216 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 216 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 GROUP BY floor_year(`__time`) +PREHOOK: type: QUERY +PREHOOK: Input: default@druid_table_n0 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 GROUP BY floor_year(`__time`) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@druid_table_n0 +POSTHOOK: Output: hdfs://### HDFS PATH ### +6078 2.7308662809692383E7 +PREHOOK: query: select count(distinct cstring2), sum(2 * cdouble) FROM druid_table_n0 GROUP BY floor_year(`__time`) +PREHOOK: type: QUERY +PREHOOK: Input: default@druid_table_n0 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(distinct cstring2), sum(2 * cdouble) FROM druid_table_n0 GROUP BY floor_year(`__time`) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@druid_table_n0 +POSTHOOK: Output: hdfs://### HDFS PATH ### +6078 5.4617325619384766E7 +PREHOOK: query: select count(DISTINCT cstring2) FROM druid_table_n0 +PREHOOK: type: QUERY +PREHOOK: Input: default@druid_table_n0 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(DISTINCT cstring2) FROM druid_table_n0 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@druid_table_n0 +POSTHOOK: Output: hdfs://### HDFS PATH ### +6078 +PREHOOK: query: select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 +PREHOOK: type: QUERY +PREHOOK: Input: default@druid_table_n0 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(DISTINCT cstring2), sum(cdouble) FROM druid_table_n0 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@druid_table_n0 +POSTHOOK: Output: hdfs://### HDFS PATH ### +6078 2.7308662809692383E7 +PREHOOK: query: select count(distinct cstring2 || '_'|| cstring1), sum(cdouble), min(cint) FROM druid_table_n0 +PREHOOK: type: QUERY +PREHOOK: Input: default@druid_table_n0 +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(distinct cstring2 || '_'|| cstring1), sum(cdouble), min(cint) FROM druid_table_n0 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@druid_table_n0 +POSTHOOK: Output: hdfs://### HDFS PATH ### +6095 
2.7308662809692383E7 -1073279343 PREHOOK: query: DROP TABLE druid_table_n0 PREHOOK: type: DROPTABLE PREHOOK: Input: default@druid_table_n0