Repository: hive Updated Branches: refs/heads/master ed29b4ca9 -> f9e6522c9
HIVE-18944: Groupping sets position is set incorrectly during DPP (Zoltan Haindrich reviewed by Ashutosh Chauhan) Signed-off-by: Zoltan Haindrich <k...@rxd.hu> Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/f9e6522c Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/f9e6522c Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/f9e6522c Branch: refs/heads/master Commit: f9e6522c9c71afd1543cc1e2533e64033657128b Parents: ed29b4c Author: Zoltan Haindrich <k...@rxd.hu> Authored: Tue Mar 13 20:00:38 2018 +0100 Committer: Zoltan Haindrich <k...@rxd.hu> Committed: Tue Mar 13 20:00:38 2018 +0100 ---------------------------------------------------------------------- .../test/resources/testconfiguration.properties | 1 + .../DynamicPartitionPruningOptimization.java | 4 +- .../clientpositive/groupby_groupingset_bug.q | 100 +++++ .../llap/groupby_groupingset_bug.q.out | 366 +++++++++++++++++++ 4 files changed, 469 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/f9e6522c/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index b3e6c3b..99d3817 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -559,6 +559,7 @@ minillaplocal.query.files=\ explainuser_1.q,\ explainuser_4.q,\ groupby2.q,\ + groupby_groupingset_bug.q,\ hybridgrace_hashjoin_1.q,\ hybridgrace_hashjoin_2.q,\ is_distinct_from.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/f9e6522c/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java index 0dbfbe2..9917330 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java @@ -437,7 +437,7 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor { GroupByDesc groupBy = new GroupByDesc(GroupByDesc.Mode.HASH, outputNames, groupByExprs, new ArrayList<AggregationDesc>(), false, groupByMemoryUsage, memoryThreshold, - null, false, 0, true); + null, false, -1, true); GroupByOperator groupByOp = (GroupByOperator) OperatorFactory.getAndMakeChild( groupBy, selectOp); @@ -611,7 +611,7 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor { gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(2)); GroupByDesc groupBy = new GroupByDesc(GroupByDesc.Mode.HASH, gbOutputNames, new ArrayList<ExprNodeDesc>(), aggs, false, - groupByMemoryUsage, memoryThreshold, null, false, 0, false); + groupByMemoryUsage, memoryThreshold, null, false, -1, false); ArrayList<ColumnInfo> groupbyColInfos = new ArrayList<ColumnInfo>(); groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(0), key.getTypeInfo(), "", false)); http://git-wip-us.apache.org/repos/asf/hive/blob/f9e6522c/ql/src/test/queries/clientpositive/groupby_groupingset_bug.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/groupby_groupingset_bug.q b/ql/src/test/queries/clientpositive/groupby_groupingset_bug.q new file mode 100644 index 0000000..c6e9a1a --- /dev/null +++ b/ql/src/test/queries/clientpositive/groupby_groupingset_bug.q @@ -0,0 +1,100 @@ +set hive.explain.user=true; +set hive.optimize.index.filter=true; +set hive.auto.convert.join=true; +set hive.vectorized.execution.enabled=true; + +drop table if exists x1_store_sales; +drop table if exists x1_date_dim; +drop table if exists x1_item; + +create table x1_store_sales +( + ss_item_sk int +) +partitioned by (ss_sold_date_sk int) +stored as orc; + +create table x1_date_dim +( + d_date_sk int, + d_month_seq int, + d_year int, + d_moy int +) +stored as orc; + +create table x1_item +( + i_item_sk int, + i_category char(10), + i_current_price decimal(7,2) +) +stored as orc; + +insert into x1_date_dim values (1,1,2000,2), + (1,2,2001,2); +insert into x1_store_sales partition (ss_sold_date_sk=1) values (1); +insert into x1_store_sales partition (ss_sold_date_sk=2) values (2); + +insert into x1_item values (1,2,1),(1,2,1),(2,2,1); + +alter table x1_store_sales partition (ss_sold_date_sk=1) update statistics set( +'numRows'='123456', +'rawDataSize'='1234567'); + +alter table x1_date_dim update statistics set( +'numRows'='56', +'rawDataSize'='81449'); + +alter table x1_item update statistics set( +'numRows'='18', +'rawDataSize'='32710'); + +explain +select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + ,x1_item i + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and s.ss_item_sk = i.i_item_sk + and d.d_month_seq in + (select distinct (d_month_seq) + from x1_date_dim + where d_year = 2000 and d_year*d_moy > 200000 + and d_moy = 2 ) + and i.i_current_price > + (select min(j.i_current_price) + from x1_item j + where j.i_category = i.i_category) + + group by d.d_month_seq + order by cnt + limit 100; + + +select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + ,x1_item i + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and s.ss_item_sk = i.i_item_sk + and d.d_month_seq in + (select distinct (d_month_seq) + from x1_date_dim + where d_year = 2000 and d_year*d_moy > 200000 + and d_moy = 2 ) + and i.i_current_price > + (select min(j.i_current_price) + from x1_item j + where j.i_category = i.i_category) + + group by d.d_month_seq + order by cnt + limit 100; + http://git-wip-us.apache.org/repos/asf/hive/blob/f9e6522c/ql/src/test/results/clientpositive/llap/groupby_groupingset_bug.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/groupby_groupingset_bug.q.out b/ql/src/test/results/clientpositive/llap/groupby_groupingset_bug.q.out new file mode 100644 index 0000000..6ba8811 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/groupby_groupingset_bug.q.out @@ -0,0 +1,366 @@ +PREHOOK: query: drop table if exists x1_store_sales +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists x1_store_sales +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table if exists x1_date_dim +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists x1_date_dim +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table if exists x1_item +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists x1_item +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table x1_store_sales +( + ss_item_sk int +) +partitioned by (ss_sold_date_sk int) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@x1_store_sales +POSTHOOK: query: create table x1_store_sales +( + ss_item_sk int +) +partitioned by (ss_sold_date_sk int) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@x1_store_sales +PREHOOK: query: create table x1_date_dim +( + d_date_sk int, + d_month_seq int, + d_year int, + d_moy int +) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@x1_date_dim +POSTHOOK: query: create table x1_date_dim +( + d_date_sk int, + d_month_seq int, + d_year int, + d_moy int +) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@x1_date_dim +PREHOOK: query: create table x1_item +( + i_item_sk int, + i_category char(10), + i_current_price decimal(7,2) +) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@x1_item +POSTHOOK: query: create table x1_item +( + i_item_sk int, + i_category char(10), + i_current_price decimal(7,2) +) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@x1_item +PREHOOK: query: insert into x1_date_dim values (1,1,2000,2), + (1,2,2001,2) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@x1_date_dim +POSTHOOK: query: insert into x1_date_dim values (1,1,2000,2), + (1,2,2001,2) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@x1_date_dim +POSTHOOK: Lineage: x1_date_dim.d_date_sk SCRIPT [] +POSTHOOK: Lineage: x1_date_dim.d_month_seq SCRIPT [] +POSTHOOK: Lineage: x1_date_dim.d_moy SCRIPT [] +POSTHOOK: Lineage: x1_date_dim.d_year SCRIPT [] +PREHOOK: query: insert into x1_store_sales partition (ss_sold_date_sk=1) values (1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@x1_store_sales@ss_sold_date_sk=1 +POSTHOOK: query: insert into x1_store_sales partition (ss_sold_date_sk=1) values (1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@x1_store_sales@ss_sold_date_sk=1 +POSTHOOK: Lineage: x1_store_sales PARTITION(ss_sold_date_sk=1).ss_item_sk SCRIPT [] +PREHOOK: query: insert into x1_store_sales partition (ss_sold_date_sk=2) values (2) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@x1_store_sales@ss_sold_date_sk=2 +POSTHOOK: query: insert into x1_store_sales partition (ss_sold_date_sk=2) values (2) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@x1_store_sales@ss_sold_date_sk=2 +POSTHOOK: Lineage: x1_store_sales PARTITION(ss_sold_date_sk=2).ss_item_sk SCRIPT [] +PREHOOK: query: insert into x1_item values (1,2,1),(1,2,1),(2,2,1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@x1_item +POSTHOOK: query: insert into x1_item values (1,2,1),(1,2,1),(2,2,1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@x1_item +POSTHOOK: Lineage: x1_item.i_category SCRIPT [] +POSTHOOK: Lineage: x1_item.i_current_price SCRIPT [] +POSTHOOK: Lineage: x1_item.i_item_sk SCRIPT [] +PREHOOK: query: alter table x1_store_sales partition (ss_sold_date_sk=1) update statistics set( +'numRows'='123456', +'rawDataSize'='1234567') +PREHOOK: type: ALTERTABLE_UPDATEPARTSTATS +PREHOOK: Input: default@x1_store_sales +PREHOOK: Output: default@x1_store_sales@ss_sold_date_sk=1 +POSTHOOK: query: alter table x1_store_sales partition (ss_sold_date_sk=1) update statistics set( +'numRows'='123456', +'rawDataSize'='1234567') +POSTHOOK: type: ALTERTABLE_UPDATEPARTSTATS +POSTHOOK: Input: default@x1_store_sales +POSTHOOK: Input: default@x1_store_sales@ss_sold_date_sk=1 +POSTHOOK: Output: default@x1_store_sales@ss_sold_date_sk=1 +PREHOOK: query: alter table x1_date_dim update statistics set( +'numRows'='56', +'rawDataSize'='81449') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@x1_date_dim +PREHOOK: Output: default@x1_date_dim +POSTHOOK: query: alter table x1_date_dim update statistics set( +'numRows'='56', +'rawDataSize'='81449') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@x1_date_dim +POSTHOOK: Output: default@x1_date_dim +PREHOOK: query: alter table x1_item update statistics set( +'numRows'='18', +'rawDataSize'='32710') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@x1_item +PREHOOK: Output: default@x1_item +POSTHOOK: query: alter table x1_item update statistics set( +'numRows'='18', +'rawDataSize'='32710') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@x1_item +POSTHOOK: Output: default@x1_item +PREHOOK: query: explain +select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + ,x1_item i + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and s.ss_item_sk = i.i_item_sk + and d.d_month_seq in + (select distinct (d_month_seq) + from x1_date_dim + where d_year = 2000 and d_year*d_moy > 200000 + and d_moy = 2 ) + and i.i_current_price > + (select min(j.i_current_price) + from x1_item j + where j.i_category = i.i_category) + + group by d.d_month_seq + order by cnt + limit 100 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + ,x1_item i + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and s.ss_item_sk = i.i_item_sk + and d.d_month_seq in + (select distinct (d_month_seq) + from x1_date_dim + where d_year = 2000 and d_year*d_moy > 200000 + and d_moy = 2 ) + and i.i_current_price > + (select min(j.i_current_price) + from x1_item j + where j.i_category = i.i_category) + + group by d.d_month_seq + order by cnt + limit 100 +POSTHOOK: type: QUERY +Plan optimized by CBO. + +Vertex dependency in root stage +Map 1 <- Reducer 3 (BROADCAST_EDGE) +Map 4 <- Map 1 (BROADCAST_EDGE), Map 7 (BROADCAST_EDGE) +Map 7 <- Reducer 10 (BROADCAST_EDGE), Reducer 9 (BROADCAST_EDGE) +Reducer 10 <- Reducer 9 (CUSTOM_SIMPLE_EDGE) +Reducer 3 <- Map 2 (SIMPLE_EDGE) +Reducer 5 <- Map 4 (SIMPLE_EDGE) +Reducer 6 <- Reducer 5 (SIMPLE_EDGE) +Reducer 9 <- Map 8 (SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:100 + Stage-1 + Reducer 6 vectorized, llap + File Output Operator [FS_145] + Limit [LIM_144] (rows=1 width=8) + Number of rows:100 + Select Operator [SEL_143] (rows=1 width=8) + Output:["_col0"] + <-Reducer 5 [SIMPLE_EDGE] vectorized, llap + SHUFFLE [RS_142] + Select Operator [SEL_141] (rows=1 width=8) + Output:["_col0"] + Group By Operator [GBY_140] (rows=1 width=12) + Output:["_col0","_col1"],aggregations:["count(VALUE._col0)"],keys:KEY._col0 + <-Map 4 [SIMPLE_EDGE] vectorized, llap + SHUFFLE [RS_139] + PartitionCols:_col0 + Group By Operator [GBY_138] (rows=1 width=12) + Output:["_col0","_col1"],aggregations:["count()"],keys:_col9 + Map Join Operator [MAPJOIN_137] (rows=5185194 width=4) + Conds:MAPJOIN_136._col7=RS_130._col0(Inner),Output:["_col9"] + <-Map 7 [BROADCAST_EDGE] vectorized, llap + BROADCAST [RS_130] + PartitionCols:_col0 + Map Join Operator [MAPJOIN_129] (rows=28 width=8) + Conds:SEL_128._col1=RS_121._col0(Inner),Output:["_col0","_col1"] + <-Reducer 9 [BROADCAST_EDGE] vectorized, llap + PARTITION_ONLY_SHUFFLE [RS_121] + PartitionCols:_col0 + Group By Operator [GBY_120] (rows=1 width=4) + Output:["_col0"],keys:KEY._col0 + <-Map 8 [SIMPLE_EDGE] vectorized, llap + SHUFFLE [RS_119] + PartitionCols:_col0 + Group By Operator [GBY_118] (rows=1 width=4) + Output:["_col0"],keys:d_month_seq + Select Operator [SEL_117] (rows=9 width=12) + Output:["d_month_seq"] + Filter Operator [FIL_116] (rows=9 width=12) + predicate:(((d_year * d_moy) > 200000) and (d_moy = 2) and (d_year = 2000) and d_month_seq is not null) + TableScan [TS_16] (rows=56 width=12) + default@x1_date_dim,x1_date_dim,Tbl:COMPLETE,Col:COMPLETE,Output:["d_month_seq","d_year","d_moy"] + <-Select Operator [SEL_128] (rows=56 width=8) + Output:["_col0","_col1"] + Filter Operator [FIL_127] (rows=56 width=8) + predicate:((d_month_seq BETWEEN DynamicValue(RS_24_x1_date_dim_d_month_seq_min) AND DynamicValue(RS_24_x1_date_dim_d_month_seq_max) and in_bloom_filter(d_month_seq, DynamicValue(RS_24_x1_date_dim_d_month_seq_bloom_filter))) and d_date_sk is not null and d_month_seq is not null) + TableScan [TS_13] (rows=56 width=8) + default@x1_date_dim,d,Tbl:COMPLETE,Col:COMPLETE,Output:["d_date_sk","d_month_seq"] + <-Reducer 10 [BROADCAST_EDGE] vectorized, llap + BROADCAST [RS_126] + Group By Operator [GBY_125] (rows=1 width=12) + Output:["_col0","_col1","_col2"],aggregations:["min(VALUE._col0)","max(VALUE._col1)","bloom_filter(VALUE._col2, expectedEntries=1000000)"] + Dynamic Partitioning Event Operator [EVENT_133] (rows=1 width=8) + Group By Operator [GBY_132] (rows=1 width=8) + Output:["_col0"],keys:_col0 + Select Operator [SEL_131] (rows=28 width=8) + Output:["_col0"] + Please refer to the previous Map Join Operator [MAPJOIN_129] + <-Map Join Operator [MAPJOIN_136] (rows=370371 width=4) + Conds:RS_30._col0=SEL_135._col0(Inner),Output:["_col7"] + <-Map 1 [BROADCAST_EDGE] llap + BROADCAST [RS_30] + PartitionCols:_col0 + Map Join Operator [MAPJOIN_100] (rows=6 width=228) + Conds:SEL_2._col1=RS_113._col2(Inner),Output:["_col0","_col2","_col3","_col4"],residual filter predicates:{(_col2 > CASE WHEN (_col4 is null) THEN (null) ELSE (_col3) END)} + <-Reducer 3 [BROADCAST_EDGE] vectorized, llap + BROADCAST [RS_113] + PartitionCols:_col2 + Select Operator [SEL_112] (rows=1 width=201) + Output:["_col0","_col1","_col2"] + Group By Operator [GBY_111] (rows=1 width=197) + Output:["_col0","_col1"],aggregations:["min(VALUE._col0)"],keys:KEY._col0 + <-Map 2 [SIMPLE_EDGE] vectorized, llap + SHUFFLE [RS_110] + PartitionCols:_col0 + Group By Operator [GBY_109] (rows=1 width=197) + Output:["_col0","_col1"],aggregations:["min(i_current_price)"],keys:i_category + Filter Operator [FIL_108] (rows=18 width=197) + predicate:i_category is not null + TableScan [TS_3] (rows=18 width=197) + default@x1_item,j,Tbl:COMPLETE,Col:COMPLETE,Output:["i_category","i_current_price"] + <-Select Operator [SEL_2] (rows=18 width=201) + Output:["_col0","_col1","_col2"] + Filter Operator [FIL_57] (rows=18 width=201) + predicate:i_item_sk is not null + TableScan [TS_0] (rows=18 width=201) + default@x1_item,i,Tbl:COMPLETE,Col:COMPLETE,Output:["i_item_sk","i_category","i_current_price"] + <-Select Operator [SEL_135] (rows=123457 width=8) + Output:["_col0","_col1"] + Filter Operator [FIL_134] (rows=123457 width=8) + predicate:ss_item_sk is not null + TableScan [TS_10] (rows=123457 width=8) + default@x1_store_sales,s,Tbl:COMPLETE,Col:COMPLETE,Output:["ss_item_sk"] + +PREHOOK: query: select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + ,x1_item i + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and s.ss_item_sk = i.i_item_sk + and d.d_month_seq in + (select distinct (d_month_seq) + from x1_date_dim + where d_year = 2000 and d_year*d_moy > 200000 + and d_moy = 2 ) + and i.i_current_price > + (select min(j.i_current_price) + from x1_item j + where j.i_category = i.i_category) + + group by d.d_month_seq + order by cnt + limit 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@x1_date_dim +PREHOOK: Input: default@x1_item +PREHOOK: Input: default@x1_store_sales +PREHOOK: Input: default@x1_store_sales@ss_sold_date_sk=1 +PREHOOK: Input: default@x1_store_sales@ss_sold_date_sk=2 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + ,x1_item i + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and s.ss_item_sk = i.i_item_sk + and d.d_month_seq in + (select distinct (d_month_seq) + from x1_date_dim + where d_year = 2000 and d_year*d_moy > 200000 + and d_moy = 2 ) + and i.i_current_price > + (select min(j.i_current_price) + from x1_item j + where j.i_category = i.i_category) + + group by d.d_month_seq + order by cnt + limit 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@x1_date_dim +POSTHOOK: Input: default@x1_item +POSTHOOK: Input: default@x1_store_sales +POSTHOOK: Input: default@x1_store_sales@ss_sold_date_sk=1 +POSTHOOK: Input: default@x1_store_sales@ss_sold_date_sk=2 +#### A masked pattern was here ####