hive git commit: HIVE-12444 - Global Limit optimization on ACID table without base directory may throw exception ADDENDUM (Wei Zheng via Eugene Koifman)

ekoifman Wed, 02 Dec 2015 14:09:55 -0800

Repository: hive
Updated Branches:
  refs/heads/branch-1 9da586fb2 -> ec1878f41



HIVE-12444 - Global Limit optimization on ACID table without base directory may throw exception ADDENDUM (Wei Zheng via Eugene Koifman)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/ec1878f4
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/ec1878f4
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/ec1878f4

Branch: refs/heads/branch-1
Commit: ec1878f41ab9b8e40f0bdac866e639233ca74697
Parents: 9da586f
Author: Eugene Koifman <[email protected]>
Authored: Wed Dec 2 14:03:23 2015 -0800
Committer: Eugene Koifman <[email protected]>
Committed: Wed Dec 2 14:03:23 2015 -0800

----------------------------------------------------------------------
 .../test/resources/testconfiguration.properties |  3 +-
 .../hive/ql/optimizer/GenMapRedUtils.java       | 43 +++++----
 .../hadoop/hive/ql/plan/TableScanDesc.java      |  5 ++
 .../queries/clientpositive/acid_globallimit.q   | 19 ++++
 .../clientpositive/acid_globallimit.q.out       | 90 +++++++++++++++++++
 .../clientpositive/tez/acid_globallimit.q.out   | 93 ++++++++++++++++++++
 6 files changed, 234 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/ec1878f4/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 836a67a..a4a2fc6 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -57,7 +57,8 @@ minimr.query.files=auto_sortmerge_join_16.q,\
   uber_reduce.q,\
   udf_using.q
 
-minitez.query.files.shared=alter_merge_2_orc.q,\
+minitez.query.files.shared=acid_globallimit.q,\
+  alter_merge_2_orc.q,\
   alter_merge_orc.q,\
   alter_merge_stats_orc.q,\
   auto_join0.q,\

http://git-wip-us.apache.org/repos/asf/hive/blob/ec1878f4/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
index 60987b1..5708cb8 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
@@ -485,6 +485,7 @@ public final class GenMapRedUtils {
       HiveConf conf, boolean local) throws SemanticException {
     ArrayList<Path> partDir = new ArrayList<Path>();
     ArrayList<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
+    boolean isAcidTable = false;
 
     Path tblDir = null;
     plan.setNameToSplitSample(parseCtx.getNameToSplitSample());
@@ -493,6 +494,7 @@ public final class GenMapRedUtils {
       try {
         TableScanOperator tsOp = (TableScanOperator) topOp;
         partsList = PartitionPruner.prune(tsOp, parseCtx, alias_id);
+        isAcidTable = ((TableScanOperator) topOp).getConf().isAcidTable();
       } catch (SemanticException e) {
         throw e;
       } catch (HiveException e) {
@@ -535,26 +537,31 @@ public final class GenMapRedUtils {
     long sizeNeeded = Integer.MAX_VALUE;
     int fileLimit = -1;
     if (parseCtx.getGlobalLimitCtx().isEnable()) {
-      long sizePerRow = HiveConf.getLongVar(parseCtx.getConf(),
-          HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
-      sizeNeeded = parseCtx.getGlobalLimitCtx().getGlobalLimit() * sizePerRow;
-      // for the optimization that reduce number of input file, we limit number
-      // of files allowed. If more than specific number of files have to be
-      // selected, we skip this optimization. Since having too many files as
-      // inputs can cause unpredictable latency. It's not necessarily to be
-      // cheaper.
-      fileLimit =
-          HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITOPTLIMITFILE);
-
-      if (sizePerRow <= 0 || fileLimit <= 0) {
-        LOG.info("Skip optimization to reduce input size of 'limit'");
+      if (isAcidTable) {
+        LOG.info("Skip Global Limit optimization for ACID table");
         parseCtx.getGlobalLimitCtx().disableOpt();
-      } else if (parts.isEmpty()) {
-        LOG.info("Empty input: skip limit optimiztion");
       } else {
-        LOG.info("Try to reduce input size for 'limit' " +
-            "sizeNeeded: " + sizeNeeded +
-            "  file limit : " + fileLimit);
+        long sizePerRow = HiveConf.getLongVar(parseCtx.getConf(),
+            HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
+        sizeNeeded = parseCtx.getGlobalLimitCtx().getGlobalLimit() * sizePerRow;
+        // for the optimization that reduce number of input file, we limit number
+        // of files allowed. If more than specific number of files have to be
+        // selected, we skip this optimization. Since having too many files as
+        // inputs can cause unpredictable latency. It's not necessarily to be
+        // cheaper.
+        fileLimit =
+            HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITOPTLIMITFILE);
+
+        if (sizePerRow <= 0 || fileLimit <= 0) {
+          LOG.info("Skip optimization to reduce input size of 'limit'");
+          parseCtx.getGlobalLimitCtx().disableOpt();
+        } else if (parts.isEmpty()) {
+          LOG.info("Empty input: skip limit optimiztion");
+        } else {
+          LOG.info("Try to reduce input size for 'limit' " +
+              "sizeNeeded: " + sizeNeeded +
+              "  file limit : " + fileLimit);
+        }
       }
     }
     boolean isFirstPart = true;

http://git-wip-us.apache.org/repos/asf/hive/blob/ec1878f4/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java
index dbb5209..1e7e617 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java
@@ -26,6 +26,7 @@ import java.util.Map;
 import org.apache.hadoop.hive.ql.exec.PTFUtils;
 import org.apache.hadoop.hive.ql.metadata.Table;
 import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
 import org.apache.hadoop.hive.ql.parse.TableSample;
 import org.apache.hadoop.hive.ql.plan.Explain.Level;
 
@@ -132,6 +133,10 @@ public class TableScanDesc extends AbstractOperatorDesc {
     return alias;
   }
 
+  public boolean isAcidTable() {
+    return SemanticAnalyzer.isAcidTable(this.tableMetadata);
+  }
+
   @Explain(displayName = "filterExpr")
   public String getFilterExprString() {
     StringBuilder sb = new StringBuilder();

http://git-wip-us.apache.org/repos/asf/hive/blob/ec1878f4/ql/src/test/queries/clientpositive/acid_globallimit.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/acid_globallimit.q b/ql/src/test/queries/clientpositive/acid_globallimit.q
new file mode 100644
index 0000000..5968e6b
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/acid_globallimit.q
@@ -0,0 +1,19 @@
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.enforce.bucketing=true;
+set hive.fetch.task.conversion=none;
+set hive.limit.optimize.enable=true;
+
+-- Global Limit optimization does not work with ACID table. Make sure to skip it for ACID table.
+CREATE TABLE acidtest1(c1 INT, c2 STRING)
+CLUSTERED BY (c1) INTO 3 BUCKETS
+STORED AS ORC
+TBLPROPERTIES ("transactional"="true");
+
+insert into table acidtest1 select cint, cstring1 from alltypesorc where cint is not null order by cint;
+
+explain
+select cast (c1 as string) from acidtest1 limit 10;
+select cast (c1 as string) from acidtest1 limit 10;
+
+drop table acidtest1;

http://git-wip-us.apache.org/repos/asf/hive/blob/ec1878f4/ql/src/test/results/clientpositive/acid_globallimit.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/acid_globallimit.q.out b/ql/src/test/results/clientpositive/acid_globallimit.q.out
new file mode 100644
index 0000000..6a2a792
--- /dev/null
+++ b/ql/src/test/results/clientpositive/acid_globallimit.q.out
@@ -0,0 +1,90 @@
+PREHOOK: query: -- Global Limit optimization does not work with ACID table. Make sure to skip it for ACID table.
+CREATE TABLE acidtest1(c1 INT, c2 STRING)
+CLUSTERED BY (c1) INTO 3 BUCKETS
+STORED AS ORC
+TBLPROPERTIES ("transactional"="true")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@acidtest1
+POSTHOOK: query: -- Global Limit optimization does not work with ACID table. Make sure to skip it for ACID table.
+CREATE TABLE acidtest1(c1 INT, c2 STRING)
+CLUSTERED BY (c1) INTO 3 BUCKETS
+STORED AS ORC
+TBLPROPERTIES ("transactional"="true")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@acidtest1
+PREHOOK: query: insert into table acidtest1 select cint, cstring1 from alltypesorc where cint is not null order by cint
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+PREHOOK: Output: default@acidtest1
+POSTHOOK: query: insert into table acidtest1 select cint, cstring1 from alltypesorc where cint is not null order by cint
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+POSTHOOK: Output: default@acidtest1
+POSTHOOK: Lineage: acidtest1.c1 SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ]
+POSTHOOK: Lineage: acidtest1.c2 SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ]
+PREHOOK: query: explain
+select cast (c1 as string) from acidtest1 limit 10
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select cast (c1 as string) from acidtest1 limit 10
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: acidtest1
+            Statistics: Num rows: 9173 Data size: 101822 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: UDFToString(c1) (type: string)
+              outputColumnNames: _col0
+              Statistics: Num rows: 9173 Data size: 101822 Basic stats: COMPLETE Column stats: NONE
+              Limit
+                Number of rows: 10
+                Statistics: Num rows: 10 Data size: 110 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 10 Data size: 110 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 10
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select cast (c1 as string) from acidtest1 limit 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@acidtest1
+#### A masked pattern was here ####
+POSTHOOK: query: select cast (c1 as string) from acidtest1 limit 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@acidtest1
+#### A masked pattern was here ####
+185520768
+186950964
+186967185
+187206627
+187503456
+188474907
+190070046
+190435023
+190587882
+191372331
+PREHOOK: query: drop table acidtest1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@acidtest1
+PREHOOK: Output: default@acidtest1
+POSTHOOK: query: drop table acidtest1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@acidtest1
+POSTHOOK: Output: default@acidtest1

http://git-wip-us.apache.org/repos/asf/hive/blob/ec1878f4/ql/src/test/results/clientpositive/tez/acid_globallimit.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/acid_globallimit.q.out b/ql/src/test/results/clientpositive/tez/acid_globallimit.q.out
new file mode 100644
index 0000000..36d0008
--- /dev/null
+++ b/ql/src/test/results/clientpositive/tez/acid_globallimit.q.out
@@ -0,0 +1,93 @@
+PREHOOK: query: -- Global Limit optimization does not work with ACID table. Make sure to skip it for ACID table.
+CREATE TABLE acidtest1(c1 INT, c2 STRING)
+CLUSTERED BY (c1) INTO 3 BUCKETS
+STORED AS ORC
+TBLPROPERTIES ("transactional"="true")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@acidtest1
+POSTHOOK: query: -- Global Limit optimization does not work with ACID table. Make sure to skip it for ACID table.
+CREATE TABLE acidtest1(c1 INT, c2 STRING)
+CLUSTERED BY (c1) INTO 3 BUCKETS
+STORED AS ORC
+TBLPROPERTIES ("transactional"="true")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@acidtest1
+PREHOOK: query: insert into table acidtest1 select cint, cstring1 from alltypesorc where cint is not null order by cint
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+PREHOOK: Output: default@acidtest1
+POSTHOOK: query: insert into table acidtest1 select cint, cstring1 from alltypesorc where cint is not null order by cint
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+POSTHOOK: Output: default@acidtest1
+POSTHOOK: Lineage: acidtest1.c1 SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ]
+POSTHOOK: Lineage: acidtest1.c2 SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ]
+PREHOOK: query: explain
+select cast (c1 as string) from acidtest1 limit 10
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select cast (c1 as string) from acidtest1 limit 10
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: acidtest1
+                  Statistics: Num rows: 9173 Data size: 101822 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: UDFToString(c1) (type: string)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 9173 Data size: 101822 Basic stats: COMPLETE Column stats: NONE
+                    Limit
+                      Number of rows: 10
+                      Statistics: Num rows: 10 Data size: 110 Basic stats: COMPLETE Column stats: NONE
+                      File Output Operator
+                        compressed: false
+                        Statistics: Num rows: 10 Data size: 110 Basic stats: COMPLETE Column stats: NONE
+                        table:
+                            input format: org.apache.hadoop.mapred.TextInputFormat
+                            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 10
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select cast (c1 as string) from acidtest1 limit 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@acidtest1
+#### A masked pattern was here ####
+POSTHOOK: query: select cast (c1 as string) from acidtest1 limit 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@acidtest1
+#### A masked pattern was here ####
+185520768
+186950964
+186967185
+187206627
+187503456
+188474907
+190070046
+190435023
+190587882
+191372331
+PREHOOK: query: drop table acidtest1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@acidtest1
+PREHOOK: Output: default@acidtest1
+POSTHOOK: query: drop table acidtest1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@acidtest1
+POSTHOOK: Output: default@acidtest1

hive git commit: HIVE-12444 - Global Limit optimization on ACID table without base directory may throw exception ADDENDUM (Wei Zheng via Eugene Koifman)

Reply via email to