Repository: hive Updated Branches: refs/heads/master 44f955ed7 -> 6fa9f6339
HIVE-20262 : Implement stats annotation rule for the UDTFOperator (George Pachitariu via Ashutosh Chauhan) Signed-off-by: Ashutosh Chauhan <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/6fa9f633 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/6fa9f633 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/6fa9f633 Branch: refs/heads/master Commit: 6fa9f63394c097547ddca194981779aa9c061317 Parents: 44f955e Author: George Pachitariu <[email protected]> Authored: Mon Jul 30 15:19:35 2018 -0700 Committer: Ashutosh Chauhan <[email protected]> Committed: Mon Jul 30 15:19:35 2018 -0700 ---------------------------------------------------------------------- .../org/apache/hadoop/hive/conf/HiveConf.java | 5 + .../annotation/AnnotateWithStatistics.java | 3 + .../stats/annotation/StatsRulesProcFactory.java | 38 +++ .../clientpositive/annotate_stats_udtf.q | 32 +++ .../clientpositive/annotate_stats_udtf.q.out | 255 +++++++++++++++++++ 5 files changed, 333 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/6fa9f633/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java ---------------------------------------------------------------------- diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 39c77b3..cce908f 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2398,6 +2398,11 @@ public class HiveConf extends Configuration { "filter operators."), HIVE_STATS_IN_MIN_RATIO("hive.stats.filter.in.min.ratio", (float) 0.05, "Output estimation of an IN filter can't be lower than this ratio"), + HIVE_STATS_UDTF_FACTOR("hive.stats.udtf.factor", (float) 1.0, + "UDTFs change the number of rows of the output. 
A common UDTF is the explode() method that creates\n" + + "multiple rows for each element in the input array. This factor is applied to the number of\n" + + "output rows and output size."), + // Concurrency HIVE_SUPPORT_CONCURRENCY("hive.support.concurrency", false, "Whether Hive supports concurrency control or not. \n" + http://git-wip-us.apache.org/repos/asf/hive/blob/6fa9f633/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java index 4b3b2ac..cfcb355 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.exec.UDTFOperator; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; import org.apache.hadoop.hive.ql.lib.Dispatcher; import org.apache.hadoop.hive.ql.lib.GraphWalker; @@ -65,6 +66,8 @@ public class AnnotateWithStatistics extends Transform { StatsRulesProcFactory.getLimitRule()); opRules.put(new RuleRegExp("RS", ReduceSinkOperator.getOperatorName() + "%"), StatsRulesProcFactory.getReduceSinkRule()); + opRules.put(new RuleRegExp("UDTF", UDTFOperator.getOperatorName() + "%"), + StatsRulesProcFactory.getUDTFRule()); // The dispatcher fires the processor corresponding to the closest matching // rule and passes the context along 
http://git-wip-us.apache.org/repos/asf/hive/blob/6fa9f633/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index 3c2b085..997e289 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -47,6 +47,7 @@ import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.RowSchema; import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.exec.UDTFOperator; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; @@ -2499,6 +2500,39 @@ public class StatsRulesProcFactory { }
   /**
+   * Stats rule for the UDTF operator. A UDTF changes the number of rows and thereby the data
+   * size, so the parent's row count and data size are both scaled by the factor configured
+   * through {@code HiveConf.ConfVars.HIVE_STATS_UDTF_FACTOR}.
+   */
+  public static class UDTFStatsRule extends DefaultStatsRule implements NodeProcessor {
+    /**
+     * Clones the statistics of the operator's first parent, scales the row count and the data
+     * size by the UDTF factor and attaches the result to the UDTF operator. The operator's
+     * statistics are left untouched when the parent carries none.
+     *
+     * @param nd the node being annotated; cast to {@link UDTFOperator}
+     * @param stack not read by this rule
+     * @param procCtx cast to {@link AnnotateStatsProcCtx}; supplies the configuration
+     * @param nodeOutputs not read by this rule
+     * @return always {@code null}
+     */
+    @Override
+    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+        Object... nodeOutputs) throws SemanticException {
+      AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
+      UDTFOperator uop = (UDTFOperator) nd;
+
+      Operator<? extends OperatorDesc> parent = uop.getParentOperators().get(0);
+
+      Statistics parentStats = parent.getStatistics();
+
+      if (parentStats != null) {
+        Statistics st = parentStats.clone();
+
+        float udtfFactor =
+            HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_UDTF_FACTOR);
+        // Multiply in double: "long * float" is evaluated in float, whose 24-bit significand
+        // rounds row counts above 2^24 even when the factor is exactly 1.0.
+        long numRows = (long) (parentStats.getNumRows() * (double) udtfFactor);
+        long dataSize = StatsUtils.safeMult(parentStats.getDataSize(), udtfFactor);
+        st.setNumRows(numRows);
+        st.setDataSize(dataSize);
+
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("[0] STATS-" + uop.toString() + ": " + st.extendedToString());
+        }
+
+        uop.setStatistics(st);
+      }
+      return null;
+    }
+  }
+
+  /**
 * Default rule is to aggregate the statistics from all its parent operators. */ public static class DefaultStatsRule implements NodeProcessor { @@ -2584,6 +2618,10 @@ public class StatsRulesProcFactory { return new ReduceSinkStatsRule(); } + public static NodeProcessor getUDTFRule() { + return new UDTFStatsRule(); + } + public static NodeProcessor getDefaultRule() { return new DefaultStatsRule(); } http://git-wip-us.apache.org/repos/asf/hive/blob/6fa9f633/ql/src/test/queries/clientpositive/annotate_stats_udtf.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/annotate_stats_udtf.q b/ql/src/test/queries/clientpositive/annotate_stats_udtf.q new file mode 100644 index 0000000..74e6ebf --- /dev/null +++ b/ql/src/test/queries/clientpositive/annotate_stats_udtf.q @@ -0,0 +1,32 @@ +-- setting up a table with multiple rows +drop table if exists HIVE_20262; +create table HIVE_20262 (a array<int>); +insert into HIVE_20262 select array(1); +insert into HIVE_20262 select array(2); + + +set hive.stats.udtf.factor=5; + +-- Test when input has a single row +explain select explode(array(1,2,3,4,5)) as col; + +-- Test when input has multiple rows +explain select explode(a) from HIVE_20262; + +-- the output data size should increase +explain select 1, r from 
HIVE_20262 + lateral view explode(a) t as r ; + + +-- Default behaviour tests: + +-- 1 is the default value +set hive.stats.udtf.factor=1; + +-- Test when input has a single row +explain select explode(array(1,2,3,4,5)) as col; + +-- Test when input has multiple rows +explain select explode(a) from HIVE_20262; + + http://git-wip-us.apache.org/repos/asf/hive/blob/6fa9f633/ql/src/test/results/clientpositive/annotate_stats_udtf.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/annotate_stats_udtf.q.out b/ql/src/test/results/clientpositive/annotate_stats_udtf.q.out new file mode 100644 index 0000000..f526487 --- /dev/null +++ b/ql/src/test/results/clientpositive/annotate_stats_udtf.q.out @@ -0,0 +1,255 @@ +PREHOOK: query: drop table if exists HIVE_20262 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists HIVE_20262 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table HIVE_20262 (a array<int>) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@HIVE_20262 +POSTHOOK: query: create table HIVE_20262 (a array<int>) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@HIVE_20262 +PREHOOK: query: insert into HIVE_20262 select array(1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@hive_20262 +POSTHOOK: query: insert into HIVE_20262 select array(1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@hive_20262 +POSTHOOK: Lineage: hive_20262.a EXPRESSION [] +PREHOOK: query: insert into HIVE_20262 select array(2) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@hive_20262 +POSTHOOK: query: insert into HIVE_20262 select array(2) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@hive_20262 +POSTHOOK: Lineage: hive_20262.a EXPRESSION [] 
+PREHOOK: query: explain select explode(array(1,2,3,4,5)) as col +PREHOOK: type: QUERY +POSTHOOK: query: explain select explode(array(1,2,3,4,5)) as col +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: _dummy_table + Row Limit Per Split: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: array(1,2,3,4,5) (type: array<int>) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE + UDTF Operator + Statistics: Num rows: 5 Data size: 320 Basic stats: COMPLETE Column stats: COMPLETE + function name: explode + Select Operator + expressions: col (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain select explode(a) from HIVE_20262 +PREHOOK: type: QUERY +POSTHOOK: query: explain select explode(a) from HIVE_20262 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: hive_20262 + Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: array<int>) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE + UDTF Operator + Statistics: Num rows: 10 Data size: 
10 Basic stats: COMPLETE Column stats: NONE + function name: explode + Select Operator + expressions: col (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 10 Data size: 10 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 10 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain select 1, r from HIVE_20262 + lateral view explode(a) t as r +PREHOOK: type: QUERY +POSTHOOK: query: explain select 1, r from HIVE_20262 + lateral view explode(a) t as r +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: hive_20262 + Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE + Lateral View Forward + Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE + Select Operator + Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE + Lateral View Join Operator + outputColumnNames: _col4 + Statistics: Num rows: 12 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: 1 (type: int), _col4 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 12 Data size: 12 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 12 Data size: 12 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Select Operator + 
expressions: a (type: array<int>) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE + UDTF Operator + Statistics: Num rows: 10 Data size: 10 Basic stats: COMPLETE Column stats: NONE + function name: explode + Lateral View Join Operator + outputColumnNames: _col4 + Statistics: Num rows: 12 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: 1 (type: int), _col4 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 12 Data size: 12 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 12 Data size: 12 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain select explode(array(1,2,3,4,5)) as col +PREHOOK: type: QUERY +POSTHOOK: query: explain select explode(array(1,2,3,4,5)) as col +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: _dummy_table + Row Limit Per Split: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: array(1,2,3,4,5) (type: array<int>) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE + UDTF Operator + Statistics: Num rows: 1 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE + function name: explode + Select Operator + expressions: col (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num 
rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain select explode(a) from HIVE_20262 +PREHOOK: type: QUERY +POSTHOOK: query: explain select explode(a) from HIVE_20262 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: hive_20262 + Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: a (type: array<int>) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE + UDTF Operator + Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE + function name: explode + Select Operator + expressions: col (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +
