Repository: hive Updated Branches: refs/heads/branch-2.0 06f8d74f7 -> 77c384da4
HIVE-12824 : CBO doesn't get triggered when aggregate function is used within windowing function (Ashutosh Chauhan via Jesus Camacho Rodriguez) Signed-off-by: Ashutosh Chauhan <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/77c384da Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/77c384da Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/77c384da Branch: refs/heads/branch-2.0 Commit: 77c384da402eca67d3bcc19c3e97fd4c625d2dc1 Parents: 06f8d74 Author: Ashutosh Chauhan <[email protected]> Authored: Fri Jan 8 17:45:47 2016 -0800 Committer: Ashutosh Chauhan <[email protected]> Committed: Mon Jan 11 14:08:12 2016 -0800 ---------------------------------------------------------------------- .../translator/PlanModifierForASTConv.java | 14 ++++ .../test/queries/clientpositive/windowing_gby.q | 1 + .../clientpositive/groupby_resolution.q.out | 4 +- .../results/clientpositive/quotedid_basic.q.out | 32 +++++--- .../spark/groupby_resolution.q.out | 4 +- .../clientpositive/tez/windowing_gby.q.out | 81 +++++++++++--------- 6 files changed, 83 insertions(+), 53 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/77c384da/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/PlanModifierForASTConv.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/PlanModifierForASTConv.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/PlanModifierForASTConv.java index b77beb8..e2fbb4f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/PlanModifierForASTConv.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/PlanModifierForASTConv.java @@ -32,10 +32,12 @@ import org.apache.calcite.rel.core.Join; import 
org.apache.calcite.rel.core.Project; import org.apache.calcite.rel.core.SetOp; import org.apache.calcite.rel.core.Sort; +import org.apache.calcite.rel.core.Window.RexWinAggCall; import org.apache.calcite.rel.rules.MultiJoin; import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeFactory; import org.apache.calcite.rex.RexNode; +import org.apache.calcite.rex.RexOver; import org.apache.calcite.sql.SqlAggFunction; import org.apache.calcite.util.Pair; import org.slf4j.Logger; @@ -295,6 +297,18 @@ public class PlanModifierForASTConv { validParent = false; } + if (parent instanceof Project) { + for (RexNode child : parent.getChildExps()) { + if (child instanceof RexOver || child instanceof RexWinAggCall) { + // Hive can't handle select rank() over(order by sum(c1)/sum(c2)) from t1 group by c3 + // but can handle select rank() over (order by c4) from + // (select sum(c1)/sum(c2) as c4 from t1 group by c3) t2; + // so introduce a project on top of this gby. 
+ return false; + } + } + } + return validParent; } http://git-wip-us.apache.org/repos/asf/hive/blob/77c384da/ql/src/test/queries/clientpositive/windowing_gby.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/windowing_gby.q b/ql/src/test/queries/clientpositive/windowing_gby.q index b948f76..d844f11 100644 --- a/ql/src/test/queries/clientpositive/windowing_gby.q +++ b/ql/src/test/queries/clientpositive/windowing_gby.q @@ -1,3 +1,4 @@ +set hive.mapred.mode=nonstrict; explain select rank() over (order by return_ratio) as return_rank from (select sum(wr.cint)/sum(ws.c_int) as return_ratio http://git-wip-us.apache.org/repos/asf/hive/blob/77c384da/ql/src/test/results/clientpositive/groupby_resolution.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/groupby_resolution.q.out b/ql/src/test/results/clientpositive/groupby_resolution.q.out index ea40014..9e58b75 100644 --- a/ql/src/test/results/clientpositive/groupby_resolution.q.out +++ b/ql/src/test/results/clientpositive/groupby_resolution.q.out @@ -666,10 +666,10 @@ STAGE PLANS: sort order: ++ Map-reduce partition columns: 0 (type: int) Statistics: Num rows: 83 Data size: 881 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: string), _col1 (type: bigint) + value expressions: _col0 (type: string) Reduce Operator Tree: Select Operator - expressions: VALUE._col0 (type: string), VALUE._col1 (type: bigint) + expressions: VALUE._col0 (type: string), KEY.reducesinkkey1 (type: bigint) outputColumnNames: _col0, _col1 Statistics: Num rows: 83 Data size: 881 Basic stats: COMPLETE Column stats: NONE PTF Operator http://git-wip-us.apache.org/repos/asf/hive/blob/77c384da/ql/src/test/results/clientpositive/quotedid_basic.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/quotedid_basic.q.out 
b/ql/src/test/results/clientpositive/quotedid_basic.q.out index 519f647..43b63f2 100644 --- a/ql/src/test/results/clientpositive/quotedid_basic.q.out +++ b/ql/src/test/results/clientpositive/quotedid_basic.q.out @@ -175,12 +175,16 @@ STAGE PLANS: mode: mergepartial outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + Select Operator + expressions: _col0 (type: string), _col1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Stage: Stage-2 Map Reduce @@ -279,12 +283,16 @@ STAGE PLANS: mode: mergepartial outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + Select Operator + expressions: _col0 (type: string), _col1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe Stage: 
Stage-2 Map Reduce http://git-wip-us.apache.org/repos/asf/hive/blob/77c384da/ql/src/test/results/clientpositive/spark/groupby_resolution.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/groupby_resolution.q.out b/ql/src/test/results/clientpositive/spark/groupby_resolution.q.out index cb2c9bd..cef5b23 100644 --- a/ql/src/test/results/clientpositive/spark/groupby_resolution.q.out +++ b/ql/src/test/results/clientpositive/spark/groupby_resolution.q.out @@ -659,11 +659,11 @@ STAGE PLANS: sort order: ++ Map-reduce partition columns: 0 (type: int) Statistics: Num rows: 83 Data size: 881 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: string), _col1 (type: bigint) + value expressions: _col0 (type: string) Reducer 4 Reduce Operator Tree: Select Operator - expressions: VALUE._col0 (type: string), VALUE._col1 (type: bigint) + expressions: VALUE._col0 (type: string), KEY.reducesinkkey1 (type: bigint) outputColumnNames: _col0, _col1 Statistics: Num rows: 83 Data size: 881 Basic stats: COMPLETE Column stats: NONE PTF Operator http://git-wip-us.apache.org/repos/asf/hive/blob/77c384da/ql/src/test/results/clientpositive/tez/windowing_gby.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/windowing_gby.q.out b/ql/src/test/results/clientpositive/tez/windowing_gby.q.out index e65533f..72d1f14 100644 --- a/ql/src/test/results/clientpositive/tez/windowing_gby.q.out +++ b/ql/src/test/results/clientpositive/tez/windowing_gby.q.out @@ -10,7 +10,7 @@ POSTHOOK: query: explain from cbo_t3 ws join alltypesorc wr on ws.value = wr.cstring1 group by ws.c_boolean ) in_web POSTHOOK: type: QUERY -Plan not optimized by CBO. +Plan optimized by CBO. 
Vertex dependency in root stage Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) @@ -22,77 +22,84 @@ Stage-0 limit:-1 Stage-1 Reducer 4 - File Output Operator [FS_17] + File Output Operator [FS_19] compressed:false Statistics:Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE table:{"input format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat","serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"} - Select Operator [SEL_15] + Select Operator [SEL_17] outputColumnNames:["_col0"] Statistics:Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE - PTF Operator [PTF_14] - Function definitions:[{"Input definition":{"type:":"WINDOWING"}},{"name:":"windowingtablefunction","order by:":"_col0","partition by:":"0"}] + PTF Operator [PTF_16] + Function definitions:[{"Input definition":{"type:":"WINDOWING"}},{"name:":"windowingtablefunction","order by:":"(UDFToDouble(_col1) / UDFToDouble(_col2))","partition by:":"0"}] Statistics:Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE - Select Operator [SEL_13] - | outputColumnNames:["_col0"] + Select Operator [SEL_15] + | outputColumnNames:["_col1","_col2"] | Statistics:Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE |<-Reducer 3 [SIMPLE_EDGE] - Reduce Output Operator [RS_12] - key expressions:0 (type: int), _col0 (type: double) + Reduce Output Operator [RS_14] + key expressions:0 (type: int), (UDFToDouble(_col1) / UDFToDouble(_col2)) (type: double) Map-reduce partition columns:0 (type: int) sort order:++ Statistics:Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE - Select Operator [SEL_11] - outputColumnNames:["_col0"] + value expressions:_col1 (type: bigint), _col2 (type: bigint) + Select Operator [SEL_13] + outputColumnNames:["_col1","_col2"] Statistics:Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: 
NONE - Group By Operator [GBY_10] + Group By Operator [GBY_12] | aggregations:["sum(VALUE._col0)","sum(VALUE._col1)"] | keys:KEY._col0 (type: boolean) | outputColumnNames:["_col0","_col1","_col2"] | Statistics:Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE |<-Reducer 2 [SIMPLE_EDGE] - Reduce Output Operator [RS_9] + Reduce Output Operator [RS_11] key expressions:_col0 (type: boolean) Map-reduce partition columns:_col0 (type: boolean) sort order:+ Statistics:Num rows: 13516 Data size: 2906160 Basic stats: COMPLETE Column stats: NONE value expressions:_col1 (type: bigint), _col2 (type: bigint) - Group By Operator [GBY_8] - aggregations:["sum(_col10)","sum(_col2)"] - keys:_col4 (type: boolean) + Group By Operator [GBY_10] + aggregations:["sum(_col3)","sum(_col1)"] + keys:_col2 (type: boolean) outputColumnNames:["_col0","_col1","_col2"] Statistics:Num rows: 13516 Data size: 2906160 Basic stats: COMPLETE Column stats: NONE - Select Operator [SEL_7] - outputColumnNames:["_col4","_col10","_col2"] + Select Operator [SEL_9] + outputColumnNames:["_col2","_col3","_col1"] Statistics:Num rows: 13516 Data size: 2906160 Basic stats: COMPLETE Column stats: NONE - Merge Join Operator [MERGEJOIN_22] + Merge Join Operator [MERGEJOIN_24] | condition map:[{"":"Inner Join 0 to 1"}] - | keys:{"0":"value (type: string)","1":"cstring1 (type: string)"} - | outputColumnNames:["_col2","_col4","_col10"] + | keys:{"0":"_col0 (type: string)","1":"_col1 (type: string)"} + | outputColumnNames:["_col1","_col2","_col3"] | Statistics:Num rows: 13516 Data size: 2906160 Basic stats: COMPLETE Column stats: NONE |<-Map 1 [SIMPLE_EDGE] - | Reduce Output Operator [RS_3] - | key expressions:value (type: string) - | Map-reduce partition columns:value (type: string) + | Reduce Output Operator [RS_6] + | key expressions:_col0 (type: string) + | Map-reduce partition columns:_col0 (type: string) | sort order:+ | Statistics:Num rows: 20 Data size: 262 Basic stats: COMPLETE Column 
stats: NONE - | value expressions:c_int (type: int), c_boolean (type: boolean) - | Filter Operator [FIL_20] - | predicate:value is not null (type: boolean) + | value expressions:_col1 (type: int), _col2 (type: boolean) + | Select Operator [SEL_2] + | outputColumnNames:["_col0","_col1","_col2"] | Statistics:Num rows: 20 Data size: 262 Basic stats: COMPLETE Column stats: NONE - | TableScan [TS_0] - | alias:ws + | Filter Operator [FIL_22] + | predicate:value is not null (type: boolean) | Statistics:Num rows: 20 Data size: 262 Basic stats: COMPLETE Column stats: NONE + | TableScan [TS_0] + | alias:ws + | Statistics:Num rows: 20 Data size: 262 Basic stats: COMPLETE Column stats: NONE |<-Map 5 [SIMPLE_EDGE] - Reduce Output Operator [RS_5] - key expressions:cstring1 (type: string) - Map-reduce partition columns:cstring1 (type: string) + Reduce Output Operator [RS_7] + key expressions:_col1 (type: string) + Map-reduce partition columns:_col1 (type: string) sort order:+ Statistics:Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE - value expressions:cint (type: int) - Filter Operator [FIL_21] - predicate:cstring1 is not null (type: boolean) + value expressions:_col0 (type: int) + Select Operator [SEL_5] + outputColumnNames:["_col0","_col1"] Statistics:Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE - TableScan [TS_1] - alias:wr + Filter Operator [FIL_23] + predicate:cstring1 is not null (type: boolean) Statistics:Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + TableScan [TS_3] + alias:wr + Statistics:Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
