This is an automated email from the ASF dual-hosted git repository. krisztiankasa pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 9cba262 HIVE-24579: Incorrect Result For Groupby With Limit (Krisztian Kasa, reviewed by Zoltan Haindrich) 9cba262 is described below commit 9cba2626e1b0376dc272ab4ed36a4c72eb6d8f57 Author: Krisztian Kasa <kasakri...@gmail.com> AuthorDate: Tue Sep 28 12:39:59 2021 +0200 HIVE-24579: Incorrect Result For Groupby With Limit (Krisztian Kasa, reviewed by Zoltan Haindrich) --- .../calcite/rules/HiveAggregateSortLimitRule.java | 117 +++++++ .../hadoop/hive/ql/parse/CalcitePlanner.java | 3 +- .../queries/clientpositive/topnkey_gby_limit.q | 21 ++ .../clientpositive/llap/bucket_groupby.q.out | 32 +- .../results/clientpositive/llap/filter_union.q.out | 14 + .../clientpositive/llap/groupby1_limit.q.out | 47 +-- .../clientpositive/llap/limit_pushdown.q.out | 101 ++---- .../clientpositive/llap/limit_pushdown4.q.out | 21 +- .../llap/offset_limit_ppd_optimizer.q.out | 98 ++---- .../clientpositive/llap/topnkey_gby_limit.q.out | 379 +++++++++++++++++++++ 10 files changed, 671 insertions(+), 162 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateSortLimitRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateSortLimitRule.java new file mode 100644 index 0000000..01c6c8d --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateSortLimitRule.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import com.google.common.collect.ImmutableList; +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.rel.RelFieldCollation; +import org.apache.calcite.tools.RelBuilder; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelCollation; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortLimit; + +/** + * Rule that adds sorting to GROUP BY col0 LIMIT n in the presence of aggregate functions. + * Ex.: SELECT id, count(1) FROM t_table GROUP BY id LIMIT 2 + * + * The above query has a physical plan like Reducer 2 <- Map 1 (SIMPLE_EDGE). + * Both the map and reduce stages may have multiple Mapper and Reducer instances to enable parallel processing of data. + * Aggregate function results are calculated in two steps: + * 1) First, the mappers calculate a partial result from the rows processed by each instance. + * This result is then filtered by the Top N optimization in the mappers.
+ * 2) In the second step the reducers aggregate the partial results coming from the mappers. However, some of the partial + * results have already been filtered out by the Top N optimization. + * Each reducer generates an output file, and in the last stage the Fetch Operator chooses one of them to be the result of the + * query. In these result files only the first n rows have correct aggregation results: the ones whose key value + * falls within the top n keys. + * + * In order to get correct aggregation results, this rule adds sorting to the HiveSortLimit above the HiveAggregate, + * which enables Hive to sort-merge the results of the reducers and take the first n rows of the merged result. + * + * from: + * HiveSortLimit(fetch=[2]) + * HiveAggregate(group=[{0}], agg#0=[count()]) + * + * to: + * HiveSortLimit(sort0=[$0], dir0=[ASC], fetch=[2]) + * HiveAggregate(group=[{0}], agg#0=[count()]) + */ +public class HiveAggregateSortLimitRule extends RelOptRule { + + private static HiveAggregateSortLimitRule instance = null; + + public static final HiveAggregateSortLimitRule getInstance(HiveConf hiveConf) { + if (instance == null) { + RelFieldCollation.NullDirection defaultAscNullDirection; + if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_DEFAULT_NULLS_LAST)) { + defaultAscNullDirection = RelFieldCollation.NullDirection.LAST; + } else { + defaultAscNullDirection = RelFieldCollation.NullDirection.FIRST; + } + instance = new HiveAggregateSortLimitRule(defaultAscNullDirection); + } + + return instance; + } + + private final RelFieldCollation.NullDirection defaultAscNullDirection; + + + private HiveAggregateSortLimitRule(RelFieldCollation.NullDirection defaultAscNullDirection) { + super(operand(HiveSortLimit.class, operand(HiveAggregate.class, any())), + HiveRelFactories.HIVE_BUILDER, "HiveAggregateSortRule"); + this.defaultAscNullDirection = defaultAscNullDirection; + } + + @Override + public void onMatch(RelOptRuleCall call) { + final HiveSortLimit sortLimit = call.rel(0); + final HiveAggregate aggregate = call.rel(1); + final RelBuilder relBuilder = call.builder(); + + if (sortLimit.getFetchExpr() == null && sortLimit.getOffsetExpr() == null) { + // No limit, offset -> No Top N -> all rows are forwarded from all RS to Reducers + return; + } + + if (aggregate.getAggCallList().isEmpty()) { + // No aggregate functions, any Group By key can be in the final result + return; + } + + if (!sortLimit.getSortExps().isEmpty()) { + // Sort keys already present + return; + } + + ImmutableList.Builder<RelFieldCollation> newSortKeyBuilder = ImmutableList.builder(); + for (int i : aggregate.getGroupSet()) { + RelFieldCollation fieldCollation = + new RelFieldCollation(i, RelFieldCollation.Direction.ASCENDING, defaultAscNullDirection); + newSortKeyBuilder.add(fieldCollation); + } + + HiveRelCollation newCollation = new HiveRelCollation(newSortKeyBuilder.build()); + HiveSortLimit newSortLimit = sortLimit.copy(sortLimit.getTraitSet(), + aggregate, newCollation, sortLimit.offset, sortLimit.fetch); + + call.transformTo(newSortLimit); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 472b3d7..7cecfc8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -162,6 +162,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveConfPlannerContext; import
org.apache.hadoop.hive.ql.optimizer.calcite.HiveDefaultRelMetadataProvider; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveTezModelRelMetadataProvider; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateSortLimitRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinSwapConstraintsRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSemiJoinProjectTransposeRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.views.HiveMaterializationRelMetadataProvider; @@ -2214,7 +2215,7 @@ public class CalcitePlanner extends SemanticAnalyzer { ProjectRemoveRule.Config.DEFAULT.toRule(), HiveUnionMergeRule.INSTANCE, new HiveUnionSimpleSelectsToInlineTableRule(dummyTableScan), HiveAggregateProjectMergeRule.INSTANCE, HiveProjectMergeRule.INSTANCE_NO_FORCE, - HiveJoinCommuteRule.INSTANCE); + HiveJoinCommuteRule.INSTANCE, HiveAggregateSortLimitRule.getInstance(conf)); // 2. Run aggregate-join transpose (cost based) // If it failed because of missing stats, we continue with diff --git a/ql/src/test/queries/clientpositive/topnkey_gby_limit.q b/ql/src/test/queries/clientpositive/topnkey_gby_limit.q new file mode 100644 index 0000000..9c79d75 --- /dev/null +++ b/ql/src/test/queries/clientpositive/topnkey_gby_limit.q @@ -0,0 +1,21 @@ +SET hive.optimize.topnkey=false; + +set tez.grouping.max-size=50; +set tez.grouping.min-size=25; + +create table test(id string); + +insert into test(id) values +(4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (2), (1), (5); + +explain cbo +select id, count(1) from test group by id limit 2; +explain extended +select id, count(1) from test group by id limit 2; +select id, count(1) from test group by id limit 2; + +SET hive.optimize.topnkey=true; + +explain extended +select id, count(1) from test group by id limit 2; +select id, count(1) from test group by id limit 2; diff --git a/ql/src/test/results/clientpositive/llap/bucket_groupby.q.out b/ql/src/test/results/clientpositive/llap/bucket_groupby.q.out index 0e96b92..7e08fca 100644 --- a/ql/src/test/results/clientpositive/llap/bucket_groupby.q.out +++ b/ql/src/test/results/clientpositive/llap/bucket_groupby.q.out @@ -318,6 +318,7 @@ STAGE PLANS: #### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -361,6 +362,19 @@ STAGE PLANS: mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 316 Data size: 3792 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 316 Data size: 3792 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), VALUE._col0 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 316 Data size: 3792 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 10 Statistics: Num rows: 10 Data size: 120 Basic stats: COMPLETE Column stats: COMPLETE @@ -388,9 +402,9 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@clustergroupby POSTHOOK: Input: default@clustergroupby@ds=101 #### A masked pattern was here #### -3 416 1 10 2 74 +3 416 PREHOOK: query: explain select abs(length(key)), count(1) from clustergroupby where ds='101' group by abs(length(key)) limit 10 PREHOOK: type: QUERY @@ -413,6 
+427,7 @@ STAGE PLANS: #### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -456,6 +471,19 @@ STAGE PLANS: mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 316 Data size: 3792 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 316 Data size: 3792 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), VALUE._col0 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 316 Data size: 3792 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 10 Statistics: Num rows: 10 Data size: 120 Basic stats: COMPLETE Column stats: COMPLETE @@ -483,9 +511,9 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@clustergroupby POSTHOOK: Input: default@clustergroupby@ds=101 #### A masked pattern was here #### -3 416 1 10 2 74 +3 416 PREHOOK: query: explain select key, count(1) from clustergroupby where ds='101' group by key,'a' order by key,'a' limit 10 PREHOOK: type: QUERY diff --git a/ql/src/test/results/clientpositive/llap/filter_union.q.out b/ql/src/test/results/clientpositive/llap/filter_union.q.out index 9d0df50..c6eee0e 100644 --- a/ql/src/test/results/clientpositive/llap/filter_union.q.out +++ b/ql/src/test/results/clientpositive/llap/filter_union.q.out @@ -495,6 +495,7 @@ STAGE PLANS: #### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -537,6 +538,19 @@ STAGE PLANS: mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 316 Data size: 30020 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Statistics: Num rows: 316 Data size: 30020 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 316 Data size: 30020 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 0 Statistics: Num rows: 1 Data size: 95 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/groupby1_limit.q.out b/ql/src/test/results/clientpositive/llap/groupby1_limit.q.out index 27d0dd1..4e7e749 100644 --- a/ql/src/test/results/clientpositive/llap/groupby1_limit.q.out +++ b/ql/src/test/results/clientpositive/llap/groupby1_limit.q.out @@ -28,7 +28,7 @@ STAGE PLANS: #### A masked pattern was here #### Edges: Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -71,33 +71,34 @@ STAGE PLANS: mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 316 Data size: 30020 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 5 - Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - null sort order: - sort order: - Statistics: Num rows: 5 Data size: 475 Basic stats: 
COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.1 - value expressions: _col0 (type: string), _col1 (type: double) + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Statistics: Num rows: 316 Data size: 30020 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) Reducer 3 Execution mode: vectorized, llap Reduce Operator Tree: - Limit - Number of rows: 5 - Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE - Select Operator - expressions: UDFToInteger(VALUE._col0) (type: int), VALUE._col1 (type: double) - outputColumnNames: _col0, _col1 + Select Operator + expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 316 Data size: 30020 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 5 Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false + Select Operator + expressions: UDFToInteger(_col0) (type: int), _col1 (type: double) + outputColumnNames: _col0, _col1 Statistics: Num rows: 5 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.dest1_n125 + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.dest1_n125 Stage: Stage-2 Dependency Collection diff --git a/ql/src/test/results/clientpositive/llap/limit_pushdown.q.out b/ql/src/test/results/clientpositive/llap/limit_pushdown.q.out index e739d5d..7ce1a04 100644 --- a/ql/src/test/results/clientpositive/llap/limit_pushdown.q.out +++ b/ql/src/test/results/clientpositive/llap/limit_pushdown.q.out @@ -1040,10 +1040,7 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE), Reducer 5 (SIMPLE_EDGE) - Reducer 4 <- Map 1 (SIMPLE_EDGE) - Reducer 5 <- Reducer 4 (CUSTOM_SIMPLE_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -1075,6 +1072,13 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 316 Data size: 30020 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: bigint) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 3 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE Top N Key Operator sort order: + keys: key (type: string) @@ -1102,7 +1106,6 @@ STAGE PLANS: Execution mode: vectorized, llap LLAP IO: all inputs Reducer 2 - Execution mode: vectorized, llap Reduce Operator Tree: Group By Operator aggregations: count(VALUE._col0) @@ -1114,77 +1117,47 @@ STAGE PLANS: expressions: _col0 (type: string), _col1 (type: bigint) outputColumnNames: _col0, _col1 Limit - Number of rows: 2 - Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE Column stats: COMPLETE + Number of rows: 3 + Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column 
stats: COMPLETE Filter Operator predicate: _col0 is not null (type: boolean) - Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - null sort order: z - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE Column stats: COMPLETE - value expressions: _col1 (type: bigint) - Reducer 3 + Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE + Dummy Store Execution mode: llap Reduce Operator Tree: - Merge Join Operator - condition map: - Inner Join 0 to 1 - keys: - 0 _col0 (type: string) - 1 _col0 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 2 Data size: 380 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 4 - Statistics: Num rows: 2 Data size: 380 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false - Statistics: Num rows: 2 Data size: 380 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - Reducer 4 - Execution mode: vectorized, llap - Reduce Operator Tree: Group By Operator aggregations: count(VALUE._col0) keys: KEY._col0 (type: string) mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 316 Data size: 30020 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 3 - Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - null sort order: - sort order: - Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 0.3 - value expressions: _col0 (type: string), _col1 (type: bigint) - Reducer 5 - Execution mode: vectorized, llap - Reduce Operator Tree: - Limit - Number of rows: 3 - Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: VALUE._col0 (type: string), VALUE._col1 (type: bigint) + expressions: _col0 (type: string), _col1 (type: bigint) outputColumnNames: _col0, _col1 - Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE - Filter Operator - predicate: _col0 is not null (type: boolean) - Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - null sort order: z - sort order: + - Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE Column stats: COMPLETE - value expressions: _col1 (type: bigint) + Limit + Number of rows: 2 + Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: _col0 is not null (type: boolean) + Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE Column stats: COMPLETE + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 2 Data size: 380 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 4 + Statistics: Num rows: 2 Data size: 380 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 380 Basic 
stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-0 Fetch Operator diff --git a/ql/src/test/results/clientpositive/llap/limit_pushdown4.q.out b/ql/src/test/results/clientpositive/llap/limit_pushdown4.q.out index 70f8fcd..6047293 100644 --- a/ql/src/test/results/clientpositive/llap/limit_pushdown4.q.out +++ b/ql/src/test/results/clientpositive/llap/limit_pushdown4.q.out @@ -349,16 +349,19 @@ STAGE PLANS: mode: mergepartial outputColumnNames: _col0, _col1 Statistics: Num rows: 316 Data size: 30020 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 5 - Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Limit + Number of rows: 5 Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-0 Fetch Operator diff --git a/ql/src/test/results/clientpositive/llap/offset_limit_ppd_optimizer.q.out b/ql/src/test/results/clientpositive/llap/offset_limit_ppd_optimizer.q.out index f6d4562..d48ef82 100644 --- a/ql/src/test/results/clientpositive/llap/offset_limit_ppd_optimizer.q.out +++ b/ql/src/test/results/clientpositive/llap/offset_limit_ppd_optimizer.q.out @@ -1468,10 +1468,7 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE), Reducer 5 (SIMPLE_EDGE) - Reducer 4 <- Map 1 (SIMPLE_EDGE) - Reducer 5 <- Reducer 4 (CUSTOM_SIMPLE_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -1495,6 +1492,13 @@ STAGE PLANS: sort order: + Map-reduce partition columns: key (type: string) Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: all inputs + Map 3 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE Top N Key Operator sort order: + keys: key (type: string) @@ -1514,7 +1518,6 @@ STAGE PLANS: Execution mode: llap LLAP IO: all inputs Reducer 2 - Execution mode: llap Reduce Operator Tree: Group By Operator aggregations: count() @@ -1527,41 +1530,12 @@ STAGE PLANS: outputColumnNames: _col0, _col1 Limit Number of rows: 20 - Offset of rows: 10 + Offset of rows: 20 Statistics: Num rows: 20 Data size: 1900 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: _col0 is not null (type: boolean) Statistics: Num rows: 20 Data size: 1900 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - null sort order: z - sort order: + - Map-reduce partition columns: _col0 (type: 
string) - Statistics: Num rows: 20 Data size: 1900 Basic stats: COMPLETE Column stats: COMPLETE - value expressions: _col1 (type: bigint) - Reducer 3 - Execution mode: llap - Reduce Operator Tree: - Merge Join Operator - condition map: - Inner Join 0 to 1 - keys: - 0 _col0 (type: string) - 1 _col0 (type: string) - outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 20 Data size: 3800 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 5 - Offset of rows: 3 - Statistics: Num rows: 5 Data size: 950 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false - Statistics: Num rows: 5 Data size: 950 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - Reducer 4 + Dummy Store Execution mode: llap Reduce Operator Tree: Group By Operator @@ -1570,37 +1544,35 @@ STAGE PLANS: mode: complete outputColumnNames: _col0, _col1 Statistics: Num rows: 316 Data size: 30020 Basic stats: COMPLETE Column stats: COMPLETE - Limit - Number of rows: 20 - Offset of rows: 20 - Statistics: Num rows: 20 Data size: 1900 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - null sort order: - sort order: - Statistics: Num rows: 20 Data size: 1900 Basic stats: COMPLETE Column stats: COMPLETE - TopN Hash Memory Usage: 2.0E-5 - value expressions: _col0 (type: string), _col1 (type: bigint) - Reducer 5 - Execution mode: llap - Reduce Operator Tree: - Limit - Number of rows: 20 - Offset of rows: 20 - Statistics: Num rows: 20 Data size: 1900 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: VALUE._col0 (type: string), VALUE._col1 (type: bigint) + expressions: _col0 (type: string), _col1 (type: bigint) outputColumnNames: _col0, _col1 - Statistics: Num rows: 20 Data size: 1900 Basic stats: COMPLETE Column stats: COMPLETE - Filter Operator - predicate: _col0 is not null (type: boolean) + Limit + Number of rows: 20 + Offset of rows: 10 Statistics: Num rows: 20 Data size: 1900 Basic stats: COMPLETE Column stats: COMPLETE - Reduce Output Operator - key expressions: _col0 (type: string) - null sort order: z - sort order: + - Map-reduce partition columns: _col0 (type: string) + Filter Operator + predicate: _col0 is not null (type: boolean) Statistics: Num rows: 20 Data size: 1900 Basic stats: COMPLETE Column stats: COMPLETE - value expressions: _col1 (type: bigint) + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 20 Data size: 3800 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 5 + Offset of rows: 3 + Statistics: Num rows: 5 Data size: 950 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 950 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-0 Fetch Operator diff --git a/ql/src/test/results/clientpositive/llap/topnkey_gby_limit.q.out b/ql/src/test/results/clientpositive/llap/topnkey_gby_limit.q.out new file mode 100644 index 0000000..c4e12e1 --- /dev/null +++ 
b/ql/src/test/results/clientpositive/llap/topnkey_gby_limit.q.out @@ -0,0 +1,379 @@ +PREHOOK: query: create table test(id string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@test +POSTHOOK: query: create table test(id string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test +PREHOOK: query: insert into test(id) values +(4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (2), (1), (5) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@test +POSTHOOK: query: insert into test(id) values +(4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (3), (4), (2), (1), (5) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@test +POSTHOOK: Lineage: test.id SCRIPT [] +PREHOOK: query: explain cbo +select id, count(1) from test group by id limit 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test +#### A masked pattern was here #### +POSTHOOK: query: explain cbo +select id, count(1) from test group by id limit 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test +#### A masked pattern was here #### +CBO PLAN: +HiveSortLimit(sort0=[$0], dir0=[ASC], fetch=[2]) + HiveProject(id=[$0], $f1=[$1]) + HiveAggregate(group=[{0}], agg#0=[count()]) + HiveTableScan(table=[[default, test]], table:alias=[test]) + +PREHOOK: query: explain extended +select id, count(1) from test group by id limit 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test +#### A masked pattern was here #### +POSTHOOK: query: explain extended +select id, count(1) from test group by id limit 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test +#### A masked pattern was here #### +OPTIMIZED SQL: SELECT `id`, COUNT(*) AS `$f1` +FROM `default`.`test` +GROUP BY `id` +ORDER BY `id` +LIMIT 2 +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: test + Statistics: Num rows: 22 Data size: 1870 Basic stats: COMPLETE Column stats: COMPLETE + GatherStats: false + Select Operator + expressions: id (type: string) + outputColumnNames: id + Statistics: Num rows: 22 Data size: 1870 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + keys: id (type: string) + minReductionHashAggr: 0.77272725 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 465 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + bucketingVersion: 2 + key expressions: _col0 (type: string) + null sort order: z + numBuckets: -1 + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 5 Data size: 465 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + TopN: 2 + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: bigint) + auto parallelism: true + Execution mode: vectorized, llap + LLAP IO: all inputs + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: test + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + 
bucketing_version 2 + column.name.delimiter , + columns id + columns.types string +#### A masked pattern was here #### + name default.test + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucketing_version 2 + column.name.delimiter , + columns id + columns.comments + columns.types string +#### A masked pattern was here #### + name default.test + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test + name: default.test + Truncated Path -> Alias: + /test [test] + Reducer 2 + Execution mode: vectorized, llap + Needs Tagging: false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 465 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + bucketingVersion: 2 + key expressions: _col0 (type: string) + null sort order: z + numBuckets: -1 + sort order: + + Statistics: Num rows: 5 Data size: 465 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + TopN: 2 + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: bigint) + auto parallelism: false + Reducer 3 + Execution mode: vectorized, llap + Needs Tagging: false + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 465 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 2 + Statistics: Num rows: 2 Data size: 186 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + bucketingVersion: 2 + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 2 Data size: 186 Basic stats: COMPLETE Column stats: COMPLETE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + bucketing_version -1 + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.escape.crlf true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: 2 + Processor Tree: + ListSink + +PREHOOK: query: select id, count(1) from test group by id limit 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test +#### A masked pattern was here #### +POSTHOOK: query: select id, count(1) from test group by id limit 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test +#### A masked pattern was here #### +1 1 +2 1 +PREHOOK: query: explain extended +select id, count(1) from test group by id limit 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test +#### A masked pattern was here #### +POSTHOOK: query: explain extended +select id, count(1) from test group by id limit 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test +#### A masked pattern was here #### +OPTIMIZED SQL: SELECT `id`, 
COUNT(*) AS `$f1` +FROM `default`.`test` +GROUP BY `id` +ORDER BY `id` +LIMIT 2 +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: test + Statistics: Num rows: 22 Data size: 1870 Basic stats: COMPLETE Column stats: COMPLETE + GatherStats: false + Top N Key Operator + sort order: + + keys: id (type: string) + null sort order: z + Statistics: Num rows: 22 Data size: 1870 Basic stats: COMPLETE Column stats: COMPLETE + top n: 2 + Select Operator + expressions: id (type: string) + outputColumnNames: id + Statistics: Num rows: 22 Data size: 1870 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + keys: id (type: string) + minReductionHashAggr: 0.77272725 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 465 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + bucketingVersion: 2 + key expressions: _col0 (type: string) + null sort order: z + numBuckets: -1 + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 5 Data size: 465 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + value expressions: _col1 (type: bigint) + auto parallelism: true + Execution mode: vectorized, llap + LLAP IO: all inputs + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: test + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count -1 + bucketing_version 2 + column.name.delimiter , + columns id + columns.types string +#### A masked pattern was here #### + name default.test + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucketing_version 2 + column.name.delimiter , + columns id + columns.comments + columns.types string +#### A masked pattern was here #### + name default.test + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test + name: default.test + Truncated Path -> Alias: + /test [test] + Reducer 2 + Execution mode: vectorized, llap + Needs Tagging: false + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 465 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + bucketingVersion: 2 + key expressions: _col0 (type: string) + null sort order: z + numBuckets: -1 + sort order: + + Statistics: Num rows: 5 Data size: 465 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + value expressions: _col1 (type: bigint) + auto parallelism: false + Reducer 3 + Execution mode: vectorized, llap + Needs Tagging: false + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint) + outputColumnNames: _col0, _col1 + 
Statistics: Num rows: 5 Data size: 465 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 2 + Statistics: Num rows: 2 Data size: 186 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + bucketingVersion: 2 + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 2 Data size: 186 Basic stats: COMPLETE Column stats: COMPLETE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + bucketing_version -1 + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.escape.crlf true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: 2 + Processor Tree: + ListSink + +PREHOOK: query: select id, count(1) from test group by id limit 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@test +#### A masked pattern was here #### +POSTHOOK: query: select id, count(1) from test group by id limit 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test +#### A masked pattern was here #### +1 1 +2 1
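
The essence of the new rule is that it attaches a collation over the GROUP BY keys to the HiveSortLimit that is already present, so the final stage can merge-sort the per-reducer outputs before applying the limit; the first n rows of the merged stream are then complete aggregates rather than partial ones. Below is a minimal, self-contained sketch (not part of the commit) of how such a collation can be built with plain Calcite types. The class name, the single group-key index 0, and the NULLS LAST direction are illustrative assumptions; the rule itself derives them from HiveAggregate.getGroupSet() and hive.default.nulls.last, and wraps the result in a HiveRelCollation before copying the HiveSortLimit.

import java.util.ArrayList;
import java.util.List;

import org.apache.calcite.rel.RelCollation;
import org.apache.calcite.rel.RelCollations;
import org.apache.calcite.rel.RelFieldCollation;
import org.apache.calcite.util.ImmutableBitSet;

public class GroupByLimitCollationSketch {
  public static void main(String[] args) {
    // Hypothetical group set: the aggregate groups on column 0 only,
    // as in "SELECT id, count(1) FROM test GROUP BY id LIMIT 2".
    ImmutableBitSet groupSet = ImmutableBitSet.of(0);

    // One ascending field collation per GROUP BY key; NULLS LAST mirrors
    // the hive.default.nulls.last=true case handled by the rule.
    List<RelFieldCollation> sortKeys = new ArrayList<>();
    for (int i : groupSet) {
      sortKeys.add(new RelFieldCollation(i,
          RelFieldCollation.Direction.ASCENDING,
          RelFieldCollation.NullDirection.LAST));
    }

    // The rule wraps field collations like these in a HiveRelCollation and copies
    // the HiveSortLimit with it, which is what turns "fetch=[2]" into
    // "sort0=[$0], dir0=[ASC], fetch=[2]" in the explain cbo output above.
    RelCollation collation = RelCollations.of(sortKeys);
    System.out.println(collation);
  }
}

Sorting only on the group keys is sufficient here: it aligns the final LIMIT with the Top N key filtering performed in the mappers and reducers, which is why the first n rows of the merged output carry complete aggregate values.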