Repository: hive
Updated Branches:
  refs/heads/branch-1 9da586fb2 -> ec1878f41
HIVE-12444 - Global Limit optimization on ACID table without base directory may throw exception ADDENDUM (Wei Zheng via Eugene Koifman)

Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/ec1878f4
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/ec1878f4
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/ec1878f4

Branch: refs/heads/branch-1
Commit: ec1878f41ab9b8e40f0bdac866e639233ca74697
Parents: 9da586f
Author: Eugene Koifman <[email protected]>
Authored: Wed Dec 2 14:03:23 2015 -0800
Committer: Eugene Koifman <[email protected]>
Committed: Wed Dec 2 14:03:23 2015 -0800

----------------------------------------------------------------------
 .../test/resources/testconfiguration.properties |  3 +-
 .../hive/ql/optimizer/GenMapRedUtils.java       | 43 +++++----
 .../hadoop/hive/ql/plan/TableScanDesc.java      |  5 ++
 .../queries/clientpositive/acid_globallimit.q   | 19 ++++
 .../clientpositive/acid_globallimit.q.out       | 90 +++++++++++++++++++
 .../clientpositive/tez/acid_globallimit.q.out   | 93 ++++++++++++++++++++
 6 files changed, 234 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/ec1878f4/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 836a67a..a4a2fc6 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -57,7 +57,8 @@ minimr.query.files=auto_sortmerge_join_16.q,\
   uber_reduce.q,\
   udf_using.q
 
-minitez.query.files.shared=alter_merge_2_orc.q,\
+minitez.query.files.shared=acid_globallimit.q,\
+  alter_merge_2_orc.q,\
   alter_merge_orc.q,\
   alter_merge_stats_orc.q,\
   auto_join0.q,\

http://git-wip-us.apache.org/repos/asf/hive/blob/ec1878f4/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java index 60987b1..5708cb8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java @@ -485,6 +485,7 @@ public final class GenMapRedUtils { HiveConf conf, boolean local) throws SemanticException { ArrayList<Path> partDir = new ArrayList<Path>(); ArrayList<PartitionDesc> partDesc = new ArrayList<PartitionDesc>(); + boolean isAcidTable = false; Path tblDir = null; plan.setNameToSplitSample(parseCtx.getNameToSplitSample()); @@ -493,6 +494,7 @@ public final class GenMapRedUtils { try { TableScanOperator tsOp = (TableScanOperator) topOp; partsList = PartitionPruner.prune(tsOp, parseCtx, alias_id); + isAcidTable = ((TableScanOperator) topOp).getConf().isAcidTable(); } catch (SemanticException e) { throw e; } catch (HiveException e) { @@ -535,26 +537,31 @@ public final class GenMapRedUtils { long sizeNeeded = Integer.MAX_VALUE; int fileLimit = -1; if (parseCtx.getGlobalLimitCtx().isEnable()) { - long sizePerRow = HiveConf.getLongVar(parseCtx.getConf(), - HiveConf.ConfVars.HIVELIMITMAXROWSIZE); - sizeNeeded = parseCtx.getGlobalLimitCtx().getGlobalLimit() * sizePerRow; - // for the optimization that reduce number of input file, we limit number - // of files allowed. If more than specific number of files have to be - // selected, we skip this optimization. Since having too many files as - // inputs can cause unpredictable latency. It's not necessarily to be - // cheaper. 
- fileLimit = - HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITOPTLIMITFILE); - - if (sizePerRow <= 0 || fileLimit <= 0) { - LOG.info("Skip optimization to reduce input size of 'limit'"); + if (isAcidTable) { + LOG.info("Skip Global Limit optimization for ACID table"); parseCtx.getGlobalLimitCtx().disableOpt(); - } else if (parts.isEmpty()) { - LOG.info("Empty input: skip limit optimiztion"); } else { - LOG.info("Try to reduce input size for 'limit' " + - "sizeNeeded: " + sizeNeeded + - " file limit : " + fileLimit); + long sizePerRow = HiveConf.getLongVar(parseCtx.getConf(), + HiveConf.ConfVars.HIVELIMITMAXROWSIZE); + sizeNeeded = parseCtx.getGlobalLimitCtx().getGlobalLimit() * sizePerRow; + // for the optimization that reduce number of input file, we limit number + // of files allowed. If more than specific number of files have to be + // selected, we skip this optimization. Since having too many files as + // inputs can cause unpredictable latency. It's not necessarily to be + // cheaper. 
+ fileLimit = + HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITOPTLIMITFILE); + + if (sizePerRow <= 0 || fileLimit <= 0) { + LOG.info("Skip optimization to reduce input size of 'limit'"); + parseCtx.getGlobalLimitCtx().disableOpt(); + } else if (parts.isEmpty()) { + LOG.info("Empty input: skip limit optimiztion"); + } else { + LOG.info("Try to reduce input size for 'limit' " + + "sizeNeeded: " + sizeNeeded + + " file limit : " + fileLimit); + } } } boolean isFirstPart = true; http://git-wip-us.apache.org/repos/asf/hive/blob/ec1878f4/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java index dbb5209..1e7e617 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java @@ -26,6 +26,7 @@ import java.util.Map; import org.apache.hadoop.hive.ql.exec.PTFUtils; import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; import org.apache.hadoop.hive.ql.parse.TableSample; import org.apache.hadoop.hive.ql.plan.Explain.Level; @@ -132,6 +133,10 @@ public class TableScanDesc extends AbstractOperatorDesc { return alias; } + public boolean isAcidTable() { + return SemanticAnalyzer.isAcidTable(this.tableMetadata); + } + @Explain(displayName = "filterExpr") public String getFilterExprString() { StringBuilder sb = new StringBuilder(); http://git-wip-us.apache.org/repos/asf/hive/blob/ec1878f4/ql/src/test/queries/clientpositive/acid_globallimit.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/acid_globallimit.q b/ql/src/test/queries/clientpositive/acid_globallimit.q new file mode 100644 index 
0000000..5968e6b --- /dev/null +++ b/ql/src/test/queries/clientpositive/acid_globallimit.q @@ -0,0 +1,19 @@ +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.enforce.bucketing=true; +set hive.fetch.task.conversion=none; +set hive.limit.optimize.enable=true; + +-- Global Limit optimization does not work with ACID table. Make sure to skip it for ACID table. +CREATE TABLE acidtest1(c1 INT, c2 STRING) +CLUSTERED BY (c1) INTO 3 BUCKETS +STORED AS ORC +TBLPROPERTIES ("transactional"="true"); + +insert into table acidtest1 select cint, cstring1 from alltypesorc where cint is not null order by cint; + +explain +select cast (c1 as string) from acidtest1 limit 10; +select cast (c1 as string) from acidtest1 limit 10; + +drop table acidtest1; http://git-wip-us.apache.org/repos/asf/hive/blob/ec1878f4/ql/src/test/results/clientpositive/acid_globallimit.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/acid_globallimit.q.out b/ql/src/test/results/clientpositive/acid_globallimit.q.out new file mode 100644 index 0000000..6a2a792 --- /dev/null +++ b/ql/src/test/results/clientpositive/acid_globallimit.q.out @@ -0,0 +1,90 @@ +PREHOOK: query: -- Global Limit optimization does not work with ACID table. Make sure to skip it for ACID table. +CREATE TABLE acidtest1(c1 INT, c2 STRING) +CLUSTERED BY (c1) INTO 3 BUCKETS +STORED AS ORC +TBLPROPERTIES ("transactional"="true") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acidtest1 +POSTHOOK: query: -- Global Limit optimization does not work with ACID table. Make sure to skip it for ACID table. 
+CREATE TABLE acidtest1(c1 INT, c2 STRING) +CLUSTERED BY (c1) INTO 3 BUCKETS +STORED AS ORC +TBLPROPERTIES ("transactional"="true") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acidtest1 +PREHOOK: query: insert into table acidtest1 select cint, cstring1 from alltypesorc where cint is not null order by cint +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +PREHOOK: Output: default@acidtest1 +POSTHOOK: query: insert into table acidtest1 select cint, cstring1 from alltypesorc where cint is not null order by cint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +POSTHOOK: Output: default@acidtest1 +POSTHOOK: Lineage: acidtest1.c1 SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ] +POSTHOOK: Lineage: acidtest1.c2 SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ] +PREHOOK: query: explain +select cast (c1 as string) from acidtest1 limit 10 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select cast (c1 as string) from acidtest1 limit 10 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: acidtest1 + Statistics: Num rows: 9173 Data size: 101822 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: UDFToString(c1) (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 9173 Data size: 101822 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Statistics: Num rows: 10 Data size: 110 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 110 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + 
+ Stage: Stage-0 + Fetch Operator + limit: 10 + Processor Tree: + ListSink + +PREHOOK: query: select cast (c1 as string) from acidtest1 limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtest1 +#### A masked pattern was here #### +POSTHOOK: query: select cast (c1 as string) from acidtest1 limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtest1 +#### A masked pattern was here #### +185520768 +186950964 +186967185 +187206627 +187503456 +188474907 +190070046 +190435023 +190587882 +191372331 +PREHOOK: query: drop table acidtest1 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@acidtest1 +PREHOOK: Output: default@acidtest1 +POSTHOOK: query: drop table acidtest1 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@acidtest1 +POSTHOOK: Output: default@acidtest1 http://git-wip-us.apache.org/repos/asf/hive/blob/ec1878f4/ql/src/test/results/clientpositive/tez/acid_globallimit.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/acid_globallimit.q.out b/ql/src/test/results/clientpositive/tez/acid_globallimit.q.out new file mode 100644 index 0000000..36d0008 --- /dev/null +++ b/ql/src/test/results/clientpositive/tez/acid_globallimit.q.out @@ -0,0 +1,93 @@ +PREHOOK: query: -- Global Limit optimization does not work with ACID table. Make sure to skip it for ACID table. +CREATE TABLE acidtest1(c1 INT, c2 STRING) +CLUSTERED BY (c1) INTO 3 BUCKETS +STORED AS ORC +TBLPROPERTIES ("transactional"="true") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acidtest1 +POSTHOOK: query: -- Global Limit optimization does not work with ACID table. Make sure to skip it for ACID table. 
+CREATE TABLE acidtest1(c1 INT, c2 STRING) +CLUSTERED BY (c1) INTO 3 BUCKETS +STORED AS ORC +TBLPROPERTIES ("transactional"="true") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acidtest1 +PREHOOK: query: insert into table acidtest1 select cint, cstring1 from alltypesorc where cint is not null order by cint +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +PREHOOK: Output: default@acidtest1 +POSTHOOK: query: insert into table acidtest1 select cint, cstring1 from alltypesorc where cint is not null order by cint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +POSTHOOK: Output: default@acidtest1 +POSTHOOK: Lineage: acidtest1.c1 SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ] +POSTHOOK: Lineage: acidtest1.c2 SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ] +PREHOOK: query: explain +select cast (c1 as string) from acidtest1 limit 10 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select cast (c1 as string) from acidtest1 limit 10 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: acidtest1 + Statistics: Num rows: 9173 Data size: 101822 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: UDFToString(c1) (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 9173 Data size: 101822 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 10 + Statistics: Num rows: 10 Data size: 110 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 10 Data size: 110 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 10 + Processor Tree: + ListSink + +PREHOOK: query: select cast (c1 as string) from acidtest1 limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtest1 +#### A masked pattern was here #### +POSTHOOK: query: select cast (c1 as string) from acidtest1 limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtest1 +#### A masked pattern was here #### +185520768 +186950964 +186967185 +187206627 +187503456 +188474907 +190070046 +190435023 +190587882 +191372331 +PREHOOK: query: drop table acidtest1 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@acidtest1 +PREHOOK: Output: default@acidtest1 +POSTHOOK: query: drop table acidtest1 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@acidtest1 +POSTHOOK: Output: default@acidtest1
