hive git commit: HIVE-20187 : Incorrect query results in hive when hive.convert.join.bucket.mapjoin.tez is set to true (Deepak Jaiswal, reviewed by Gunther Hagleitner)

djaiswal Sat, 25 Aug 2018 21:49:06 -0700

Repository: hive
Updated Branches:
  refs/heads/master b627fa8fa -> 1cb7e7734



HIVE-20187 : Incorrect query results in hive when 
hive.convert.join.bucket.mapjoin.tez is set to true (Deepak Jaiswal, reviewed 
by Gunther Hagleitner)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/1cb7e773
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/1cb7e773
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/1cb7e773

Branch: refs/heads/master
Commit: 1cb7e77348af7bea53e472cef561fcece3f25b35
Parents: b627fa8
Author: Deepak Jaiswal <[email protected]>
Authored: Sat Aug 25 21:47:55 2018 -0700
Committer: Deepak Jaiswal <[email protected]>
Committed: Sat Aug 25 21:47:55 2018 -0700

----------------------------------------------------------------------
 .../annotation/OpTraitsRulesProcFactory.java    |  13 ++
 .../clientpositive/bucket_map_join_tez2.q       |  19 ++
 .../llap/bucket_map_join_tez2.q.out             | 212 +++++++++++++++++++
 .../clientpositive/llap/limit_pushdown.q.out    |  83 +++++---
 .../llap/offset_limit_ppd_optimizer.q.out       |  85 +++++---
 .../clientpositive/llap/tez_smb_main.q.out      |  12 +-
 .../spark/bucket_map_join_tez2.q.out            | 212 +++++++++++++++++++
 7 files changed, 565 insertions(+), 71 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/1cb7e773/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
----------------------------------------------------------------------
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
index 89db530..8f75126 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
@@ -110,6 +110,19 @@ public class OpTraitsRulesProcFactory {
           for (List<String> cols : parentOpTraits.getBucketColNames()) {
             for (String col : cols) {
               for (Entry<String, ExprNodeDesc> entry : 
rs.getColumnExprMap().entrySet()) {
+                // Make sure this entry is in key columns.
+                boolean isKey = false;
+                for (ExprNodeDesc keyDesc : rs.getConf().getKeyCols()) {
+                  if (keyDesc.isSame(entry.getValue())) {
+                    isKey = true;
+                    break;
+                  }
+                }
+
+                // skip if not a key
+                if (!isKey) {
+                  continue;
+                }
                 // Fetch the column expression. There should be atleast one.
                 Map<Integer, ExprNodeDesc> colMap = new HashMap<>();
                 boolean found = false;

http://git-wip-us.apache.org/repos/asf/hive/blob/1cb7e773/ql/src/test/queries/clientpositive/bucket_map_join_tez2.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/bucket_map_join_tez2.q 
b/ql/src/test/queries/clientpositive/bucket_map_join_tez2.q
index adcf696..ae1ec44 100644
--- a/ql/src/test/queries/clientpositive/bucket_map_join_tez2.q
+++ b/ql/src/test/queries/clientpositive/bucket_map_join_tez2.q
@@ -138,3 +138,22 @@ explain select a.key, b.key from tab_part_n11 a join 
tab_part_n11 c on a.key = c
 set test.comment=External tables, bucket map join should be disabled;
 set test.comment;
 explain select a.key, b.key from tab_part_ext a join tab_part_ext c on a.key = 
c.key join tab_part_ext b on a.value = b.value;
+
+-- HIVE-20187 : Must not create a bucket map join (BMJ): the join is on join_col, but my_fact is bucketed on bucket_col
+create table my_fact(AMT decimal(20,3),bucket_col string ,join_col string )
+PARTITIONED BY (FISCAL_YEAR string ,ACCOUNTING_PERIOD string )
+CLUSTERED BY (bucket_col) INTO 10
+BUCKETS
+stored as ORC
+;
+create table my_dim(join_col string,filter_col string) stored as orc;
+
+INSERT INTO my_dim VALUES("1", "VAL1"), ("2", "VAL2"), ("3", "VAL3"), ("4", 
"VAL4");
+INSERT OVERWRITE TABLE my_fact PARTITION(FISCAL_YEAR="2015", 
ACCOUNTING_PERIOD="20") VALUES(1.11, "20", "1"), (1.11, "20", "1"), (1.12, 
"20", "2"), (1.12, "20", "3"), (1.12, "11", "3"), (1.12, "9", "3");
+
+explain  extended
+select bucket_col, my_dim.join_col as account1,my_fact.accounting_period
+FROM my_fact JOIN my_dim ON my_fact.join_col = my_dim.join_col
+WHERE my_fact.fiscal_year = '2015'
+AND my_dim.filter_col IN ( 'VAL1', 'VAL2' )
+and my_fact.accounting_period in (10);
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hive/blob/1cb7e773/ql/src/test/results/clientpositive/llap/bucket_map_join_tez2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/bucket_map_join_tez2.q.out 
b/ql/src/test/results/clientpositive/llap/bucket_map_join_tez2.q.out
index 4f042ce..bf64a15 100644
--- a/ql/src/test/results/clientpositive/llap/bucket_map_join_tez2.q.out
+++ b/ql/src/test/results/clientpositive/llap/bucket_map_join_tez2.q.out
@@ -2180,3 +2180,215 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
+PREHOOK: query: create table my_fact(AMT decimal(20,3),bucket_col string 
,join_col string )
+PARTITIONED BY (FISCAL_YEAR string ,ACCOUNTING_PERIOD string )
+CLUSTERED BY (bucket_col) INTO 10
+BUCKETS
+stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@my_fact
+POSTHOOK: query: create table my_fact(AMT decimal(20,3),bucket_col string 
,join_col string )
+PARTITIONED BY (FISCAL_YEAR string ,ACCOUNTING_PERIOD string )
+CLUSTERED BY (bucket_col) INTO 10
+BUCKETS
+stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@my_fact
+PREHOOK: query: create table my_dim(join_col string,filter_col string) stored 
as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@my_dim
+POSTHOOK: query: create table my_dim(join_col string,filter_col string) stored 
as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@my_dim
+PREHOOK: query: INSERT INTO my_dim VALUES("1", "VAL1"), ("2", "VAL2"), ("3", 
"VAL3"), ("4", "VAL4")
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@my_dim
+POSTHOOK: query: INSERT INTO my_dim VALUES("1", "VAL1"), ("2", "VAL2"), ("3", 
"VAL3"), ("4", "VAL4")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@my_dim
+POSTHOOK: Lineage: my_dim.filter_col SCRIPT []
+POSTHOOK: Lineage: my_dim.join_col SCRIPT []
+PREHOOK: query: INSERT OVERWRITE TABLE my_fact PARTITION(FISCAL_YEAR="2015", 
ACCOUNTING_PERIOD="20") VALUES(1.11, "20", "1"), (1.11, "20", "1"), (1.12, 
"20", "2"), (1.12, "20", "3"), (1.12, "11", "3"), (1.12, "9", "3")
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@my_fact@fiscal_year=2015/accounting_period=20
+POSTHOOK: query: INSERT OVERWRITE TABLE my_fact PARTITION(FISCAL_YEAR="2015", 
ACCOUNTING_PERIOD="20") VALUES(1.11, "20", "1"), (1.11, "20", "1"), (1.12, 
"20", "2"), (1.12, "20", "3"), (1.12, "11", "3"), (1.12, "9", "3")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@my_fact@fiscal_year=2015/accounting_period=20
+POSTHOOK: Lineage: my_fact 
PARTITION(fiscal_year=2015,accounting_period=20).amt SCRIPT []
+POSTHOOK: Lineage: my_fact 
PARTITION(fiscal_year=2015,accounting_period=20).bucket_col SCRIPT []
+POSTHOOK: Lineage: my_fact 
PARTITION(fiscal_year=2015,accounting_period=20).join_col SCRIPT []
+PREHOOK: query: explain  extended
+select bucket_col, my_dim.join_col as account1,my_fact.accounting_period
+FROM my_fact JOIN my_dim ON my_fact.join_col = my_dim.join_col
+WHERE my_fact.fiscal_year = '2015'
+AND my_dim.filter_col IN ( 'VAL1', 'VAL2' )
+and my_fact.accounting_period in (10)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain  extended
+select bucket_col, my_dim.join_col as account1,my_fact.accounting_period
+FROM my_fact JOIN my_dim ON my_fact.join_col = my_dim.join_col
+WHERE my_fact.fiscal_year = '2015'
+AND my_dim.filter_col IN ( 'VAL1', 'VAL2' )
+and my_fact.accounting_period in (10)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 2 <- Map 1 (BROADCAST_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: my_fact
+                  filterExpr: ((fiscal_year = '2015') and 
(UDFToDouble(accounting_period) = 10.0D) and join_col is not null) (type: 
boolean)
+                  Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE 
Column stats: PARTIAL
+                  GatherStats: false
+                  Filter Operator
+                    isSamplingPred: false
+                    predicate: ((UDFToDouble(accounting_period) = 10.0D) and 
(fiscal_year = '2015') and join_col is not null) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 736 Basic stats: 
COMPLETE Column stats: PARTIAL
+                    Select Operator
+                      expressions: bucket_col (type: string), join_col (type: 
string), accounting_period (type: string)
+                      outputColumnNames: _col0, _col1, _col3
+                      Statistics: Num rows: 1 Data size: 640 Basic stats: 
COMPLETE Column stats: PARTIAL
+                      Reduce Output Operator
+                        key expressions: _col1 (type: string)
+                        null sort order: a
+                        sort order: +
+                        Map-reduce partition columns: _col1 (type: string)
+                        Statistics: Num rows: 1 Data size: 640 Basic stats: 
COMPLETE Column stats: PARTIAL
+                        tag: 0
+                        value expressions: _col0 (type: string), _col3 (type: 
string)
+                        auto parallelism: true
+            Execution mode: vectorized, llap
+            LLAP IO: unknown
+        Map 2 
+            Map Operator Tree:
+                TableScan
+                  alias: my_dim
+                  filterExpr: ((filter_col) IN ('VAL1', 'VAL2') and join_col 
is not null) (type: boolean)
+                  Statistics: Num rows: 4 Data size: 1472 Basic stats: 
COMPLETE Column stats: NONE
+                  GatherStats: false
+                  Filter Operator
+                    isSamplingPred: false
+                    predicate: ((filter_col) IN ('VAL1', 'VAL2') and join_col 
is not null) (type: boolean)
+                    Statistics: Num rows: 4 Data size: 1472 Basic stats: 
COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: join_col (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 4 Data size: 1472 Basic stats: 
COMPLETE Column stats: NONE
+                      Map Join Operator
+                        condition map:
+                             Inner Join 0 to 1
+                        Estimated key counts: Map 1 => 1
+                        keys:
+                          0 _col1 (type: string)
+                          1 _col0 (type: string)
+                        outputColumnNames: _col0, _col3, _col4
+                        input vertices:
+                          0 Map 1
+                        Position of Big Table: 1
+                        Statistics: Num rows: 4 Data size: 1619 Basic stats: 
COMPLETE Column stats: NONE
+                        Select Operator
+                          expressions: _col0 (type: string), _col4 (type: 
string), _col3 (type: string)
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 4 Data size: 1619 Basic stats: 
COMPLETE Column stats: NONE
+                          File Output Operator
+                            compressed: false
+                            GlobalTableId: 0
+#### A masked pattern was here ####
+                            NumFilesPerFileSink: 1
+                            Statistics: Num rows: 4 Data size: 1619 Basic 
stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+                            table:
+                                input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                                output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                                properties:
+                                  columns _col0,_col1,_col2
+                                  columns.types string:string:string
+                                  escape.delim \
+                                  
hive.serialization.extend.additional.nesting.levels true
+                                  serialization.escape.crlf true
+                                  serialization.format 1
+                                  serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                                serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                            TotalFiles: 1
+                            GatherStats: false
+                            MultiFileSpray: false
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+            Path -> Alias:
+#### A masked pattern was here ####
+            Path -> Partition:
+#### A masked pattern was here ####
+                Partition
+                  base file name: my_dim
+                  input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                  output format: 
org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                  properties:
+                    COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+                    bucket_count -1
+                    bucketing_version 2
+                    column.name.delimiter ,
+                    columns join_col,filter_col
+                    columns.comments 
+                    columns.types string:string
+#### A masked pattern was here ####
+                    name default.my_dim
+                    numFiles 1
+                    numRows 4
+                    rawDataSize 692
+                    serialization.ddl struct my_dim { string join_col, string 
filter_col}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                    totalSize 338
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                
+                    input format: 
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                    output format: 
org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                    properties:
+                      COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+                      bucket_count -1
+                      bucketing_version 2
+                      column.name.delimiter ,
+                      columns join_col,filter_col
+                      columns.comments 
+                      columns.types string:string
+#### A masked pattern was here ####
+                      name default.my_dim
+                      numFiles 1
+                      numRows 4
+                      rawDataSize 692
+                      serialization.ddl struct my_dim { string join_col, 
string filter_col}
+                      serialization.format 1
+                      serialization.lib 
org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                      totalSize 338
+#### A masked pattern was here ####
+                    serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                    name: default.my_dim
+                  name: default.my_dim
+            Truncated Path -> Alias:
+              /my_dim [my_dim]
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+

http://git-wip-us.apache.org/repos/asf/hive/blob/1cb7e773/ql/src/test/results/clientpositive/llap/limit_pushdown.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/limit_pushdown.q.out 
b/ql/src/test/results/clientpositive/llap/limit_pushdown.q.out
index 4fc1419..1d19fc0 100644
--- a/ql/src/test/results/clientpositive/llap/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/llap/limit_pushdown.q.out
@@ -923,8 +923,10 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 3 (CUSTOM_SIMPLE_EDGE)
-        Reducer 3 <- Map 1 (SIMPLE_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (SIMPLE_EDGE), Reducer 5 (SIMPLE_EDGE)
+        Reducer 4 <- Map 1 (SIMPLE_EDGE)
+        Reducer 5 <- Reducer 4 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
       Vertices:
         Map 1 
@@ -975,18 +977,7 @@ STAGE PLANS:
             Execution mode: vectorized, llap
             LLAP IO: no inputs
         Reducer 2 
-            Reduce Operator Tree:
-              Select Operator
-                expressions: VALUE._col0 (type: string), VALUE._col1 (type: 
bigint)
-                outputColumnNames: _col0, _col1
-                Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE 
Column stats: COMPLETE
-                Limit
-                  Number of rows: 3
-                  Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE 
Column stats: COMPLETE
-                  Filter Operator
-                    predicate: _col0 is not null (type: boolean)
-                    Statistics: Num rows: 3 Data size: 285 Basic stats: 
COMPLETE Column stats: COMPLETE
-            Execution mode: llap
+            Execution mode: vectorized, llap
             Reduce Operator Tree:
               Group By Operator
                 aggregations: count(VALUE._col0)
@@ -1000,25 +991,34 @@ STAGE PLANS:
                   Filter Operator
                     predicate: _col0 is not null (type: boolean)
                     Statistics: Num rows: 2 Data size: 190 Basic stats: 
COMPLETE Column stats: COMPLETE
-                    Merge Join Operator
-                      condition map:
-                           Inner Join 0 to 1
-                      keys:
-                        0 _col0 (type: string)
-                        1 _col0 (type: string)
-                      outputColumnNames: _col0, _col1, _col2, _col3
-                      Statistics: Num rows: 2 Data size: 380 Basic stats: 
COMPLETE Column stats: COMPLETE
-                      Limit
-                        Number of rows: 4
-                        Statistics: Num rows: 2 Data size: 380 Basic stats: 
COMPLETE Column stats: COMPLETE
-                        File Output Operator
-                          compressed: false
-                          Statistics: Num rows: 2 Data size: 380 Basic stats: 
COMPLETE Column stats: COMPLETE
-                          table:
-                              input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
-                              output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
-                              serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    Reduce Output Operator
+                      key expressions: _col0 (type: string)
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: string)
+                      Statistics: Num rows: 2 Data size: 190 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      value expressions: _col1 (type: bigint)
         Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 _col0 (type: string)
+                  1 _col0 (type: string)
+                outputColumnNames: _col0, _col1, _col2, _col3
+                Statistics: Num rows: 2 Data size: 380 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Limit
+                  Number of rows: 4
+                  Statistics: Num rows: 2 Data size: 380 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 2 Data size: 380 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    table:
+                        input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 4 
             Execution mode: vectorized, llap
             Reduce Operator Tree:
               Group By Operator
@@ -1035,6 +1035,25 @@ STAGE PLANS:
                     Statistics: Num rows: 3 Data size: 285 Basic stats: 
COMPLETE Column stats: COMPLETE
                     TopN Hash Memory Usage: 0.3
                     value expressions: _col0 (type: string), _col1 (type: 
bigint)
+        Reducer 5 
+            Execution mode: vectorized, llap
+            Reduce Operator Tree:
+              Select Operator
+                expressions: VALUE._col0 (type: string), VALUE._col1 (type: 
bigint)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Limit
+                  Number of rows: 3
+                  Statistics: Num rows: 3 Data size: 285 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Filter Operator
+                    predicate: _col0 is not null (type: boolean)
+                    Statistics: Num rows: 3 Data size: 285 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: string)
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: string)
+                      Statistics: Num rows: 3 Data size: 285 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      value expressions: _col1 (type: bigint)
 
   Stage: Stage-0
     Fetch Operator

http://git-wip-us.apache.org/repos/asf/hive/blob/1cb7e773/ql/src/test/results/clientpositive/llap/offset_limit_ppd_optimizer.q.out
----------------------------------------------------------------------
diff --git 
a/ql/src/test/results/clientpositive/llap/offset_limit_ppd_optimizer.q.out 
b/ql/src/test/results/clientpositive/llap/offset_limit_ppd_optimizer.q.out
index 2e8d5f3..d95025c 100644
--- a/ql/src/test/results/clientpositive/llap/offset_limit_ppd_optimizer.q.out
+++ b/ql/src/test/results/clientpositive/llap/offset_limit_ppd_optimizer.q.out
@@ -1317,8 +1317,10 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 3 (CUSTOM_SIMPLE_EDGE)
-        Reducer 3 <- Map 1 (SIMPLE_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (SIMPLE_EDGE), Reducer 5 (SIMPLE_EDGE)
+        Reducer 4 <- Map 1 (SIMPLE_EDGE)
+        Reducer 5 <- Reducer 4 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
       Vertices:
         Map 1 
@@ -1345,18 +1347,6 @@ STAGE PLANS:
             Execution mode: llap
             LLAP IO: no inputs
         Reducer 2 
-            Reduce Operator Tree:
-              Select Operator
-                expressions: VALUE._col0 (type: string), VALUE._col1 (type: 
bigint)
-                outputColumnNames: _col0, _col1
-                Statistics: Num rows: 20 Data size: 1900 Basic stats: COMPLETE 
Column stats: COMPLETE
-                Limit
-                  Number of rows: 20
-                  Offset of rows: 20
-                  Statistics: Num rows: 20 Data size: 1900 Basic stats: 
COMPLETE Column stats: COMPLETE
-                  Filter Operator
-                    predicate: _col0 is not null (type: boolean)
-                    Statistics: Num rows: 20 Data size: 1900 Basic stats: 
COMPLETE Column stats: COMPLETE
             Execution mode: llap
             Reduce Operator Tree:
               Group By Operator
@@ -1372,28 +1362,37 @@ STAGE PLANS:
                   Filter Operator
                     predicate: _col0 is not null (type: boolean)
                     Statistics: Num rows: 20 Data size: 1900 Basic stats: 
COMPLETE Column stats: COMPLETE
-                    Merge Join Operator
-                      condition map:
-                           Inner Join 0 to 1
-                      keys:
-                        0 _col0 (type: string)
-                        1 _col0 (type: string)
-                      outputColumnNames: _col0, _col1, _col2, _col3
-                      Statistics: Num rows: 20 Data size: 3800 Basic stats: 
COMPLETE Column stats: COMPLETE
-                      Limit
-                        Number of rows: 5
-                        Offset of rows: 3
-                        Statistics: Num rows: 5 Data size: 950 Basic stats: 
COMPLETE Column stats: COMPLETE
-                        File Output Operator
-                          compressed: false
-                          Statistics: Num rows: 5 Data size: 950 Basic stats: 
COMPLETE Column stats: COMPLETE
-                          table:
-                              input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
-                              output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
-                              serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    Reduce Output Operator
+                      key expressions: _col0 (type: string)
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: string)
+                      Statistics: Num rows: 20 Data size: 1900 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      value expressions: _col1 (type: bigint)
         Reducer 3 
             Execution mode: llap
             Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 _col0 (type: string)
+                  1 _col0 (type: string)
+                outputColumnNames: _col0, _col1, _col2, _col3
+                Statistics: Num rows: 20 Data size: 3800 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Limit
+                  Number of rows: 5
+                  Offset of rows: 3
+                  Statistics: Num rows: 5 Data size: 950 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 5 Data size: 950 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    table:
+                        input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 4 
+            Execution mode: llap
+            Reduce Operator Tree:
               Group By Operator
                 aggregations: count()
                 keys: KEY._col0 (type: string)
@@ -1409,6 +1408,26 @@ STAGE PLANS:
                     Statistics: Num rows: 20 Data size: 1900 Basic stats: 
COMPLETE Column stats: COMPLETE
                     TopN Hash Memory Usage: 2.0E-5
                     value expressions: _col0 (type: string), _col1 (type: 
bigint)
+        Reducer 5 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Select Operator
+                expressions: VALUE._col0 (type: string), VALUE._col1 (type: 
bigint)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 20 Data size: 1900 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Limit
+                  Number of rows: 20
+                  Offset of rows: 20
+                  Statistics: Num rows: 20 Data size: 1900 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: _col0 is not null (type: boolean)
+                    Statistics: Num rows: 20 Data size: 1900 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: string)
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: string)
+                      Statistics: Num rows: 20 Data size: 1900 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      value expressions: _col1 (type: bigint)
 
   Stage: Stage-0
     Fetch Operator

http://git-wip-us.apache.org/repos/asf/hive/blob/1cb7e773/ql/src/test/results/clientpositive/llap/tez_smb_main.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/tez_smb_main.q.out 
b/ql/src/test/results/clientpositive/llap/tez_smb_main.q.out
index 9929989..1809fa7 100644
--- a/ql/src/test/results/clientpositive/llap/tez_smb_main.q.out
+++ b/ql/src/test/results/clientpositive/llap/tez_smb_main.q.out
@@ -592,7 +592,7 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Map 1 <- Map 4 (CUSTOM_EDGE)
+        Map 1 <- Map 4 (BROADCAST_EDGE)
         Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
         Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
@@ -721,7 +721,7 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Map 1 <- Map 4 (CUSTOM_EDGE)
+        Map 1 <- Map 4 (BROADCAST_EDGE)
         Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
         Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
@@ -853,7 +853,7 @@ POSTHOOK: Input: default@tab_n11@ds=2008-04-08
 POSTHOOK: Input: default@tab_part_n12
 POSTHOOK: Input: default@tab_part_n12@ds=2008-04-08
 #### A masked pattern was here ####
-9
+40
 PREHOOK: query: explain select count(*) from tab_n11 a join tab_part_n12 b on 
a.value = b.value
 PREHOOK: type: QUERY
 POSTHOOK: query: explain select count(*) from tab_n11 a join tab_part_n12 b on 
a.value = b.value
@@ -1426,7 +1426,7 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Map 1 <- Map 2 (CUSTOM_EDGE)
+        Map 1 <- Map 2 (BROADCAST_EDGE)
         Map 3 <- Map 1 (CUSTOM_EDGE)
         Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
@@ -1549,7 +1549,7 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Map 1 <- Map 2 (CUSTOM_EDGE)
+        Map 1 <- Map 2 (BROADCAST_EDGE)
         Map 3 <- Map 1 (CUSTOM_EDGE)
         Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
@@ -1675,7 +1675,7 @@ POSTHOOK: Input: default@tab_n11@ds=2008-04-08
 POSTHOOK: Input: default@tab_part_n12
 POSTHOOK: Input: default@tab_part_n12@ds=2008-04-08
 #### A masked pattern was here ####
-9
+40
 PREHOOK: query: explain
 select count(*) from (select s1.key as key, s1.value as value from tab_n11 s1 
join tab_n11 s3 on s1.key=s3.key
 UNION  ALL

http://git-wip-us.apache.org/repos/asf/hive/blob/1cb7e773/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out
----------------------------------------------------------------------
diff --git 
a/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out 
b/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out
index 243cbc3..7bc0739 100644
--- a/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out
+++ b/ql/src/test/results/clientpositive/spark/bucket_map_join_tez2.q.out
@@ -2172,3 +2172,215 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
+PREHOOK: query: create table my_fact(AMT decimal(20,3),bucket_col string 
,join_col string )
+PARTITIONED BY (FISCAL_YEAR string ,ACCOUNTING_PERIOD string )
+CLUSTERED BY (bucket_col) INTO 10
+BUCKETS
+stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@my_fact
+POSTHOOK: query: create table my_fact(AMT decimal(20,3),bucket_col string 
,join_col string )
+PARTITIONED BY (FISCAL_YEAR string ,ACCOUNTING_PERIOD string )
+CLUSTERED BY (bucket_col) INTO 10
+BUCKETS
+stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@my_fact
+PREHOOK: query: create table my_dim(join_col string,filter_col string) stored 
as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@my_dim
+POSTHOOK: query: create table my_dim(join_col string,filter_col string) stored 
as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@my_dim
+PREHOOK: query: INSERT INTO my_dim VALUES("1", "VAL1"), ("2", "VAL2"), ("3", 
"VAL3"), ("4", "VAL4")
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@my_dim
+POSTHOOK: query: INSERT INTO my_dim VALUES("1", "VAL1"), ("2", "VAL2"), ("3", 
"VAL3"), ("4", "VAL4")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@my_dim
+POSTHOOK: Lineage: my_dim.filter_col SCRIPT []
+POSTHOOK: Lineage: my_dim.join_col SCRIPT []
+PREHOOK: query: INSERT OVERWRITE TABLE my_fact PARTITION(FISCAL_YEAR="2015", 
ACCOUNTING_PERIOD="20") VALUES(1.11, "20", "1"), (1.11, "20", "1"), (1.12, 
"20", "2"), (1.12, "20", "3"), (1.12, "11", "3"), (1.12, "9", "3")
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@my_fact@fiscal_year=2015/accounting_period=20
+POSTHOOK: query: INSERT OVERWRITE TABLE my_fact PARTITION(FISCAL_YEAR="2015", 
ACCOUNTING_PERIOD="20") VALUES(1.11, "20", "1"), (1.11, "20", "1"), (1.12, 
"20", "2"), (1.12, "20", "3"), (1.12, "11", "3"), (1.12, "9", "3")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@my_fact@fiscal_year=2015/accounting_period=20
+POSTHOOK: Lineage: my_fact 
PARTITION(fiscal_year=2015,accounting_period=20).amt SCRIPT []
+POSTHOOK: Lineage: my_fact 
PARTITION(fiscal_year=2015,accounting_period=20).bucket_col SCRIPT []
+POSTHOOK: Lineage: my_fact 
PARTITION(fiscal_year=2015,accounting_period=20).join_col SCRIPT []
+PREHOOK: query: explain  extended
+select bucket_col, my_dim.join_col as account1,my_fact.accounting_period
+FROM my_fact JOIN my_dim ON my_fact.join_col = my_dim.join_col
+WHERE my_fact.fiscal_year = '2015'
+AND my_dim.filter_col IN ( 'VAL1', 'VAL2' )
+and my_fact.accounting_period in (10)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain  extended
+select bucket_col, my_dim.join_col as account1,my_fact.accounting_period
+FROM my_fact JOIN my_dim ON my_fact.join_col = my_dim.join_col
+WHERE my_fact.fiscal_year = '2015'
+AND my_dim.filter_col IN ( 'VAL1', 'VAL2' )
+and my_fact.accounting_period in (10)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-2 is a root stage
+  Stage-1 depends on stages: Stage-2
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-2
+    Spark
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: my_fact
+                  filterExpr: ((fiscal_year = '2015') and 
(UDFToDouble(accounting_period) = 10.0D) and join_col is not null) (type: 
boolean)
+                  Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL 
Column stats: NONE
+                  GatherStats: false
+                  Filter Operator
+                    isSamplingPred: false
+                    predicate: ((UDFToDouble(accounting_period) = 10.0D) and 
(fiscal_year = '2015') and join_col is not null) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL 
Column stats: NONE
+                    Select Operator
+                      expressions: bucket_col (type: string), join_col (type: 
string), accounting_period (type: string)
+                      outputColumnNames: _col0, _col1, _col3
+                      Statistics: Num rows: 1 Data size: 0 Basic stats: 
PARTIAL Column stats: NONE
+                      Spark HashTable Sink Operator
+                        keys:
+                          0 _col1 (type: string)
+                          1 _col0 (type: string)
+                        Position of Big Table: 1
+            Execution mode: vectorized
+            Local Work:
+              Map Reduce Local Work
+
+  Stage: Stage-1
+    Spark
+#### A masked pattern was here ####
+      Vertices:
+        Map 2 
+            Map Operator Tree:
+                TableScan
+                  alias: my_dim
+                  filterExpr: ((filter_col) IN ('VAL1', 'VAL2') and join_col 
is not null) (type: boolean)
+                  Statistics: Num rows: 4 Data size: 692 Basic stats: COMPLETE 
Column stats: NONE
+                  GatherStats: false
+                  Filter Operator
+                    isSamplingPred: false
+                    predicate: ((filter_col) IN ('VAL1', 'VAL2') and join_col 
is not null) (type: boolean)
+                    Statistics: Num rows: 4 Data size: 692 Basic stats: 
COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: join_col (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 4 Data size: 692 Basic stats: 
COMPLETE Column stats: NONE
+                      Map Join Operator
+                        condition map:
+                             Inner Join 0 to 1
+                        keys:
+                          0 _col1 (type: string)
+                          1 _col0 (type: string)
+                        outputColumnNames: _col0, _col3, _col4
+                        input vertices:
+                          0 Map 1
+                        Position of Big Table: 1
+                        Statistics: Num rows: 4 Data size: 761 Basic stats: 
PARTIAL Column stats: NONE
+                        Select Operator
+                          expressions: _col0 (type: string), _col4 (type: 
string), _col3 (type: string)
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 4 Data size: 761 Basic stats: 
PARTIAL Column stats: NONE
+                          File Output Operator
+                            compressed: false
+                            GlobalTableId: 0
+#### A masked pattern was here ####
+                            NumFilesPerFileSink: 1
+                            Statistics: Num rows: 4 Data size: 761 Basic 
stats: PARTIAL Column stats: NONE
+#### A masked pattern was here ####
+                            table:
+                                input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                                output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                                properties:
+                                  columns _col0,_col1,_col2
+                                  columns.types string:string:string
+                                  escape.delim \
+                                  
hive.serialization.extend.additional.nesting.levels true
+                                  serialization.escape.crlf true
+                                  serialization.format 1
+                                  serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                                serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                            TotalFiles: 1
+                            GatherStats: false
+                            MultiFileSpray: false
+            Execution mode: vectorized
+            Local Work:
+              Map Reduce Local Work
+            Path -> Alias:
+#### A masked pattern was here ####
+            Path -> Partition:
+#### A masked pattern was here ####
+                Partition
+                  base file name: my_dim
+                  input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                  output format: 
org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                  properties:
+                    COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+                    bucket_count -1
+                    bucketing_version 2
+                    column.name.delimiter ,
+                    columns join_col,filter_col
+                    columns.comments 
+                    columns.types string:string
+#### A masked pattern was here ####
+                    name default.my_dim
+                    numFiles 1
+                    numRows 4
+                    rawDataSize 692
+                    serialization.ddl struct my_dim { string join_col, string 
filter_col}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                    totalSize 338
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                
+                    input format: 
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                    output format: 
org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                    properties:
+                      COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+                      bucket_count -1
+                      bucketing_version 2
+                      column.name.delimiter ,
+                      columns join_col,filter_col
+                      columns.comments 
+                      columns.types string:string
+#### A masked pattern was here ####
+                      name default.my_dim
+                      numFiles 1
+                      numRows 4
+                      rawDataSize 692
+                      serialization.ddl struct my_dim { string join_col, 
string filter_col}
+                      serialization.format 1
+                      serialization.lib 
org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                      totalSize 338
+#### A masked pattern was here ####
+                    serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                    name: default.my_dim
+                  name: default.my_dim
+            Truncated Path -> Alias:
+              /my_dim [$hdt$_1:my_dim]
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+

hive git commit: HIVE-20187 : Incorrect query results in hive when hive.convert.join.bucket.mapjoin.tez is set to true (Deepak Jaiswal, reviewed by Gunther Hagleitner)

Reply via email to