Repository: hive
Updated Branches:
  refs/heads/branch-2.0 98d516770 -> 5b7230d8e
  refs/heads/master 4185d9b8e -> 43837e8ef


HIVE-12945 : Bucket pruning: bucketing for -ve hashcodes have historical issues 
(Gopal V, reviewed by Sergey Shelukhin, Prasanth Jayachandran)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/43837e8e
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/43837e8e
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/43837e8e

Branch: refs/heads/master
Commit: 43837e8ef0a74b8ac777f0de9227c9bf233a27a6
Parents: 4185d9b
Author: Sergey Shelukhin <[email protected]>
Authored: Mon Feb 1 10:33:15 2016 -0800
Committer: Sergey Shelukhin <[email protected]>
Committed: Mon Feb 1 10:33:15 2016 -0800

----------------------------------------------------------------------
 .../org/apache/hadoop/hive/conf/HiveConf.java   |  4 +
 .../test/resources/testconfiguration.properties |  1 +
 .../optimizer/FixedBucketPruningOptimizer.java  | 25 +++++-
 .../hadoop/hive/ql/optimizer/Optimizer.java     |  4 +-
 .../queries/clientpositive/bucketpruning1.q     |  6 +-
 .../clientpositive/tez/bucketpruning1.q.out     | 86 ++++++++++++++++++++
 6 files changed, 122 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/43837e8e/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index ff376a8..6678de6 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2366,6 +2366,10 @@ public class HiveConf extends Configuration {
         "When pruning is enabled, filters on bucket columns will be processed by \n" +
         "filtering the splits against a bitset of included buckets. This needs predicates \n"+
         "produced by hive.optimize.ppd and hive.optimize.index.filters."),
+    TEZ_OPTIMIZE_BUCKET_PRUNING_COMPAT(
+        "hive.tez.bucket.pruning.compat", true,
+        "When pruning is enabled, handle possibly broken inserts due to negative hashcodes.\n" +
+        "This occasionally doubles the data scan cost, but is default enabled for safety"),
     TEZ_DYNAMIC_PARTITION_PRUNING(
         "hive.tez.dynamic.partition.pruning", true,
         "When dynamic pruning is enabled, joins on partition keys will be processed by sending\n" +

http://git-wip-us.apache.org/repos/asf/hive/blob/43837e8e/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index a6e599c..dfd221e 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -380,6 +380,7 @@ minitez.query.files=bucket_map_join_tez1.q,\
   constprog_dpp.q,\
   dynamic_partition_pruning.q,\
   dynamic_partition_pruning_2.q,\
+  bucketpruning1.q,\
   explainuser_1.q,\
   explainuser_2.q,\
   explainuser_3.q,\

http://git-wip-us.apache.org/repos/asf/hive/blob/43837e8e/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
index c63318e..9e9beb0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java
@@ -65,6 +65,12 @@ public class FixedBucketPruningOptimizer extends Transform {
   private static final Log LOG = LogFactory
       .getLog(FixedBucketPruningOptimizer.class.getName());
 
+  private final boolean compat;
+
+  public FixedBucketPruningOptimizer(boolean compat) {
+    this.compat = compat;
+  }
+
   public class NoopWalker implements NodeProcessor {
     @Override
     public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
@@ -229,6 +235,14 @@ public class FixedBucketPruningOptimizer extends Transform {
         Object convCols[] = new Object[] {conv.convert(literal)};
         int n = ObjectInspectorUtils.getBucketNumber(convCols, new ObjectInspector[]{constOI}, ctxt.getNumBuckets());
         bs.set(n);
+        if (ctxt.isCompat()) {
+          int h = ObjectInspectorUtils.getBucketHashCode(convCols, new ObjectInspector[]{constOI});
+          // -ve hashcodes had conversion to positive done in different ways in the past
+          // abs() is now obsolete and all inserts now use & Integer.MAX_VALUE 
+          // the compat mode assumes that old data could've been loaded using the other conversion
+          n = ObjectInspectorUtils.getBucketNumber(Math.abs(h), ctxt.getNumBuckets());
+          bs.set(n);
+        }
       }
       if (bs.cardinality() < ctxt.getNumBuckets()) {
         // there is a valid bucket pruning filter
@@ -252,12 +266,14 @@ public class FixedBucketPruningOptimizer extends Transform {
   public final class FixedBucketPruningOptimizerCtxt implements
       NodeProcessorCtx {
     public final ParseContext pctx;
+    private final boolean compat;
     private int numBuckets;
     private PrunedPartitionList partitions;
     private List<String> bucketCols;
     private List<StructField> schema;
 
-    public FixedBucketPruningOptimizerCtxt(ParseContext pctx) {
+    public FixedBucketPruningOptimizerCtxt(boolean compat, ParseContext pctx) {
+      this.compat = compat;
       this.pctx = pctx;
     }
 
@@ -292,12 +308,17 @@ public class FixedBucketPruningOptimizer extends Transform {
     public void setNumBuckets(int numBuckets) {
       this.numBuckets = numBuckets;
     }
+
+    // compatibility mode enabled
+    public boolean isCompat() {
+      return this.compat;
+    }
   }
 
   @Override
   public ParseContext transform(ParseContext pctx) throws SemanticException {
     // create a the context for walking operators
-    FixedBucketPruningOptimizerCtxt opPartWalkerCtx = new FixedBucketPruningOptimizerCtxt(
+    FixedBucketPruningOptimizerCtxt opPartWalkerCtx = new FixedBucketPruningOptimizerCtxt(compat,
         pctx);
 
     // Retrieve all partitions generated from partition pruner and partition

http://git-wip-us.apache.org/repos/asf/hive/blob/43837e8e/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
index 60a2e02..c06b8fc 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
@@ -187,7 +187,9 @@ public class Optimizer {
         HiveConf.ConfVars.TEZ_OPTIMIZE_BUCKET_PRUNING)
         && HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTPPD)
         && HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTINDEXFILTER)) {
-      transformations.add(new FixedBucketPruningOptimizer());
+      final boolean compatMode =
+          HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.TEZ_OPTIMIZE_BUCKET_PRUNING_COMPAT);
+      transformations.add(new FixedBucketPruningOptimizer(compatMode));
     }
 
     if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION)) {

http://git-wip-us.apache.org/repos/asf/hive/blob/43837e8e/ql/src/test/queries/clientpositive/bucketpruning1.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/bucketpruning1.q b/ql/src/test/queries/clientpositive/bucketpruning1.q
index 6c689f1..0f797f7 100644
--- a/ql/src/test/queries/clientpositive/bucketpruning1.q
+++ b/ql/src/test/queries/clientpositive/bucketpruning1.q
@@ -54,6 +54,11 @@ select * from srcbucket_pruned where (key=1 or key=2) and ds='2008-04-08';
 explain extended
 select * from srcbucket_pruned where (key=1 or key=2) and value = 'One' and ds='2008-04-08';
 
+-- compat case (-15 = 1 & 15)
+
+explain extended
+select * from srcbucket_pruned where key = -15;
+
 -- valid but irrelevant case (all buckets selected)
 
 explain extended
@@ -95,4 +100,3 @@ select * from srcbucket_unpruned where key in (3, 5);
 
 explain extended
 select * from srcbucket_unpruned where key = 1;
-

http://git-wip-us.apache.org/repos/asf/hive/blob/43837e8e/ql/src/test/results/clientpositive/tez/bucketpruning1.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/bucketpruning1.q.out b/ql/src/test/results/clientpositive/tez/bucketpruning1.q.out
index 3b90687..68b516f 100644
--- a/ql/src/test/results/clientpositive/tez/bucketpruning1.q.out
+++ b/ql/src/test/results/clientpositive/tez/bucketpruning1.q.out
@@ -1237,6 +1237,92 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
+PREHOOK: query: -- compat case (-15 = 1 & 15)
+
+explain extended
+select * from srcbucket_pruned where key = -15
+PREHOOK: type: QUERY
+POSTHOOK: query: -- compat case (-15 = 1 & 15)
+
+explain extended
+select * from srcbucket_pruned where key = -15
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  
+TOK_QUERY
+   TOK_FROM
+      TOK_TABREF
+         TOK_TABNAME
+            srcbucket_pruned
+   TOK_INSERT
+      TOK_DESTINATION
+         TOK_DIR
+            TOK_TMP_FILE
+      TOK_SELECT
+         TOK_SELEXPR
+            TOK_ALLCOLREF
+      TOK_WHERE
+         =
+            TOK_TABLE_OR_COL
+               key
+            -
+               15
+
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: srcbucket_pruned
+                  filterExpr: (key = -15) (type: boolean)
+                  buckets included: [1,15,] of 16
+                  Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL 
Column stats: NONE
+                  GatherStats: false
+                  Filter Operator
+                    isSamplingPred: false
+                    predicate: (key = -15) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL 
Column stats: NONE
+                    Select Operator
+                      expressions: -15 (type: int), value (type: string), ds 
(type: string)
+                      outputColumnNames: _col0, _col1, _col2
+                      Statistics: Num rows: 1 Data size: 0 Basic stats: 
PARTIAL Column stats: NONE
+                      File Output Operator
+                        compressed: false
+                        GlobalTableId: 0
+#### A masked pattern was here ####
+                        NumFilesPerFileSink: 1
+                        Statistics: Num rows: 1 Data size: 0 Basic stats: 
PARTIAL Column stats: NONE
+#### A masked pattern was here ####
+                        table:
+                            input format: 
org.apache.hadoop.mapred.TextInputFormat
+                            output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                            properties:
+                              columns _col0,_col1,_col2
+                              columns.types int:string:string
+                              escape.delim \
+                              
hive.serialization.extend.additional.nesting.levels true
+                              serialization.escape.crlf true
+                              serialization.format 1
+                              serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                            serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                        TotalFiles: 1
+                        GatherStats: false
+                        MultiFileSpray: false
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
 PREHOOK: query: -- valid but irrelevant case (all buckets selected)
 
 explain extended

Reply via email to