This is an automated email from the ASF dual-hosted git repository.

krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new f6732333fa9 HIVE-27069: Incorrect results with bucket map join (Dayakar M, reviewed by Krisztian Kasa)
f6732333fa9 is described below

commit f6732333fa9dae07b7b09eaf76241ea69f2492db
Author: Dayakar M <[email protected]>
AuthorDate: Mon Apr 17 11:39:59 2023 +0530

    HIVE-27069: Incorrect results with bucket map join (Dayakar M, reviewed by Krisztian Kasa)
---
 .../test/resources/testconfiguration.properties    |   1 +
 .../hive/ql/optimizer/ReduceSinkMapJoinProc.java   |   3 +-
 .../clientpositive/bucketmapjoin_with_subquery.q   |  52 +++
 .../tez/bucketmapjoin_with_subquery.q.out          | 355 +++++++++++++++++++++
 4 files changed, 410 insertions(+), 1 deletion(-)

diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 46c824bccac..b85c50bc098 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -17,6 +17,7 @@ minitez.query.files.shared=\
 # specific to tez and cannot be added to minillap.
 minitez.query.files=\
   acid_vectorization_original_tez.q,\
+  bucketmapjoin_with_subquery.q,\
   delete_orig_table.q,\
   explainanalyze_1.q,\
   explainanalyze_3.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java
index bd6c41819d5..ea192e8af9a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ReduceSinkMapJoinProc.java
@@ -277,7 +277,8 @@ public class ReduceSinkMapJoinProc implements SemanticNodeProcessor {
         edgeType = EdgeType.CUSTOM_SIMPLE_EDGE;
       }
     }
-    if (edgeType == EdgeType.CUSTOM_EDGE) {
+    if (edgeType == EdgeType.CUSTOM_EDGE || (edgeType == EdgeType.CUSTOM_SIMPLE_EDGE && !mapJoinOp.getConf()
+        .isDynamicPartitionHashJoin())) {
       // disable auto parallelism for bucket map joins
       parentRS.getConf().setReducerTraits(EnumSet.of(FIXED));
     }
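
For context on the hunk above: before this change, only CUSTOM_EDGE parents (bucket map joins) had their reducer traits pinned; the fix also pins CUSTOM_SIMPLE_EDGE parents unless the map join is a dynamic partition hash join, so auto reducer parallelism can no longer change a reducer count that the join's bucketing depends on. A minimal sketch of the resulting predicate, as a hypothetical helper (not part of the patch; EdgeType and the MapJoinDesc returned by mapJoinOp.getConf() are the types already used in ReduceSinkMapJoinProc):

    // Hypothetical helper restating the fixed condition above; not part of the patch.
    // Reducer parallelism must stay FIXED whenever the reducer count is tied to the
    // join's bucketing: letting auto reducer parallelism rescale it mis-routes
    // bucketed rows, which is the incorrect-results symptom in HIVE-27069.
    private static boolean mustPinReducerParallelism(EdgeType edgeType, MapJoinDesc joinConf) {
      return edgeType == EdgeType.CUSTOM_EDGE
          || (edgeType == EdgeType.CUSTOM_SIMPLE_EDGE && !joinConf.isDynamicPartitionHashJoin());
    }
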
diff --git a/ql/src/test/queries/clientpositive/bucketmapjoin_with_subquery.q b/ql/src/test/queries/clientpositive/bucketmapjoin_with_subquery.q
new file mode 100644
index 00000000000..dd7274014c8
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/bucketmapjoin_with_subquery.q
@@ -0,0 +1,52 @@
+-- HIVE-27069
+set hive.query.results.cache.enabled=false;
+set hive.compute.query.using.stats=false;
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.auto.convert.join=true;
+set hive.auto.convert.sortmerge.join=true;
+set hive.auto.convert.sortmerge.join.to.mapjoin=true;
+set hive.convert.join.bucket.mapjoin.tez=true;
+set hive.fetch.task.conversion=none;
+set hive.merge.nway.joins=false;
+set hive.optimize.dynamic.partition.hashjoin=true;
+set hive.optimize.index.filter=true;
+set hive.optimize.remove.sq_count_check=true;
+set hive.prewarm.enabled=false;
+set hive.join.inner.residual=false;
+set hive.limit.optimize.enable=true;
+set hive.mapjoin.bucket.cache.size=10000;
+set hive.strict.managed.tables=true;
+set hive.tez.auto.reducer.parallelism=true;
+set hive.tez.bucket.pruning=true;
+
+CREATE TABLE DUP_TEST (id int , in_date timestamp , sample varchar(100)) stored as orc tblproperties('transactional'='true', 'transactional_properties'='default');
+
+CREATE TABLE DUP_TEST_TARGET (id int , in_date timestamp , sample varchar(100)) CLUSTERED by (ID) INTO 5 BUCKETS STORED AS ORC tblproperties('transactional'='true', 'transactional_properties'='default');
+
+INSERT INTO DUP_TEST
+(id , in_date , sample)
+values
+(1  , '2023-04-14 10:11:12.111' , 'test1'),
+(2  , '2023-04-14 10:11:12.111' , 'test2'),
+(3  , '2023-04-14 10:11:12.111' , 'test3'),
+(4  , '2023-04-14 10:11:12.111' , 'test4'),
+(5  , '2023-04-14 10:11:12.111' , 'test5'),
+(6  , '2023-04-14 10:11:12.111' , 'test6'),
+(7  , '2023-04-14 10:11:12.111' , 'test7'),
+(8  , '2023-04-14 10:11:12.111' , 'test8'),
+(9  , '2023-04-14 10:11:12.111' , 'test9');
+
+-- Run merge into the target table for the first time
+MERGE INTO DUP_TEST_TARGET T USING (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+    WHEN MATCHED THEN UPDATE SET  in_date = S.in_date , sample = S.sample
+    WHEN NOT MATCHED THEN INSERT VALUES (S.id, S.in_date , S.sample);
+
+explain vectorization detail select * from DUP_TEST_TARGET T join (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id;
+
+select * from DUP_TEST_TARGET T join (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id;
+
+select * from DUP_TEST_TARGET T join DUP_TEST S ON T.id = S.id;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/tez/bucketmapjoin_with_subquery.q.out b/ql/src/test/results/clientpositive/tez/bucketmapjoin_with_subquery.q.out
new file mode 100644
index 00000000000..7ce5bb945e8
--- /dev/null
+++ b/ql/src/test/results/clientpositive/tez/bucketmapjoin_with_subquery.q.out
@@ -0,0 +1,355 @@
+PREHOOK: query: CREATE TABLE DUP_TEST (id int , in_date timestamp , sample varchar(100)) stored as orc tblproperties('transactional'='true', 'transactional_properties'='default')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@DUP_TEST
+POSTHOOK: query: CREATE TABLE DUP_TEST (id int , in_date timestamp , sample varchar(100)) stored as orc tblproperties('transactional'='true', 'transactional_properties'='default')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@DUP_TEST
+PREHOOK: query: CREATE TABLE DUP_TEST_TARGET (id int , in_date timestamp , sample varchar(100)) CLUSTERED by (ID) INTO 5 BUCKETS STORED AS ORC tblproperties('transactional'='true', 'transactional_properties'='default')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@DUP_TEST_TARGET
+POSTHOOK: query: CREATE TABLE DUP_TEST_TARGET (id int , in_date timestamp , sample varchar(100)) CLUSTERED by (ID) INTO 5 BUCKETS STORED AS ORC tblproperties('transactional'='true', 'transactional_properties'='default')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@DUP_TEST_TARGET
+PREHOOK: query: INSERT INTO DUP_TEST
+(id , in_date , sample)
+values
+(1  , '2023-04-14 10:11:12.111' , 'test1'),
+(2  , '2023-04-14 10:11:12.111' , 'test2'),
+(3  , '2023-04-14 10:11:12.111' , 'test3'),
+(4  , '2023-04-14 10:11:12.111' , 'test4'),
+(5  , '2023-04-14 10:11:12.111' , 'test5'),
+(6  , '2023-04-14 10:11:12.111' , 'test6'),
+(7  , '2023-04-14 10:11:12.111' , 'test7'),
+(8  , '2023-04-14 10:11:12.111' , 'test8'),
+(9  , '2023-04-14 10:11:12.111' , 'test9')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@dup_test
+POSTHOOK: query: INSERT INTO DUP_TEST
+(id , in_date , sample)
+values
+(1  , '2023-04-14 10:11:12.111' , 'test1'),
+(2  , '2023-04-14 10:11:12.111' , 'test2'),
+(3  , '2023-04-14 10:11:12.111' , 'test3'),
+(4  , '2023-04-14 10:11:12.111' , 'test4'),
+(5  , '2023-04-14 10:11:12.111' , 'test5'),
+(6  , '2023-04-14 10:11:12.111' , 'test6'),
+(7  , '2023-04-14 10:11:12.111' , 'test7'),
+(8  , '2023-04-14 10:11:12.111' , 'test8'),
+(9  , '2023-04-14 10:11:12.111' , 'test9')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@dup_test
+POSTHOOK: Lineage: dup_test.id SCRIPT []
+POSTHOOK: Lineage: dup_test.in_date SCRIPT []
+POSTHOOK: Lineage: dup_test.sample SCRIPT []
+PREHOOK: query: MERGE INTO DUP_TEST_TARGET T USING (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+    WHEN MATCHED THEN UPDATE SET  in_date = S.in_date , sample = S.sample
+    WHEN NOT MATCHED THEN INSERT VALUES (S.id, S.in_date , S.sample)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dup_test
+PREHOOK: Input: default@dup_test_target
+PREHOOK: Output: default@dup_test_target
+PREHOOK: Output: default@dup_test_target
+PREHOOK: Output: default@merge_tmp_table
+POSTHOOK: query: MERGE INTO DUP_TEST_TARGET T USING (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+    WHEN MATCHED THEN UPDATE SET  in_date = S.in_date , sample = S.sample
+    WHEN NOT MATCHED THEN INSERT VALUES (S.id, S.in_date , S.sample)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dup_test
+POSTHOOK: Input: default@dup_test_target
+POSTHOOK: Output: default@dup_test_target
+POSTHOOK: Output: default@dup_test_target
+POSTHOOK: Output: default@merge_tmp_table
+POSTHOOK: Lineage: dup_test_target.id SIMPLE [(dup_test)dup_test.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: dup_test_target.id SIMPLE [(dup_test)dup_test.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: dup_test_target.in_date SIMPLE [(dup_test)dup_test.FieldSchema(name:in_date, type:timestamp, comment:null), ]
+POSTHOOK: Lineage: dup_test_target.in_date SIMPLE [(dup_test)dup_test.FieldSchema(name:in_date, type:timestamp, comment:null), ]
+POSTHOOK: Lineage: dup_test_target.sample SIMPLE [(dup_test)dup_test.FieldSchema(name:sample, type:varchar(100), comment:null), ]
+POSTHOOK: Lineage: dup_test_target.sample SIMPLE [(dup_test)dup_test.FieldSchema(name:sample, type:varchar(100), comment:null), ]
+POSTHOOK: Lineage: merge_tmp_table.val EXPRESSION [(dup_test_target)dup_test_target.FieldSchema(name:ROW__ID, type:struct<writeId:bigint,bucketId:int,rowId:bigint>, comment:), ]
+PREHOOK: query: explain vectorization detail select * from DUP_TEST_TARGET T join (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dup_test
+PREHOOK: Input: default@dup_test_target
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain vectorization detail select * from DUP_TEST_TARGET T join (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dup_test
+POSTHOOK: Input: default@dup_test_target
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 3 <- Map 1 (CUSTOM_SIMPLE_EDGE), Map 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: t
+                  filterExpr: id is not null (type: boolean)
+                  Statistics: Num rows: 9 Data size: 2052 Basic stats: COMPLETE Column stats: NONE
+                  TableScan Vectorization:
+                      native: true
+                      vectorizationSchemaColumns: [0:id:int, 1:in_date:timestamp, 2:sample:varchar(100), 3:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>, 4:ROW__IS__DELETED:boolean]
+                  Filter Operator
+                    Filter Vectorization:
+                        className: VectorFilterOperator
+                        native: true
+                        predicateExpression: SelectColumnIsNotNull(col 0:int)
+                    predicate: id is not null (type: boolean)
+                    Statistics: Num rows: 9 Data size: 2052 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: id (type: int), in_date (type: timestamp), sample (type: varchar(100))
+                      outputColumnNames: _col0, _col1, _col2
+                      Select Vectorization:
+                          className: VectorSelectOperator
+                          native: true
+                          projectedOutputColumnNums: [0, 1, 2]
+                      Statistics: Num rows: 9 Data size: 2052 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int)
+                        null sort order: z
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: int)
+                        Reduce Sink Vectorization:
+                            className: VectorReduceSinkObjectHashOperator
+                            keyColumns: 0:int
+                            native: true
+                            nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                            partitionColumns: 0:int
+                            valueColumns: 1:timestamp, 2:varchar(100)
+                        Statistics: Num rows: 9 Data size: 2052 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col1 (type: timestamp), _col2 (type: varchar(100))
+            Execution mode: vectorized
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                inputFormatFeatureSupport: [DECIMAL_64]
+                featureSupportInUse: [DECIMAL_64]
+                inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                allNative: true
+                usesVectorUDFAdaptor: false
+                vectorized: true
+                rowBatchContext:
+                    dataColumnCount: 3
+                    includeColumns: [0, 1, 2]
+                    dataColumns: id:int, in_date:timestamp, sample:varchar(100)
+                    partitionColumnCount: 0
+                    scratchColumnTypeNames: []
+        Map 2 
+            Map Operator Tree:
+                TableScan
+                  alias: dup_test
+                  filterExpr: id is not null (type: boolean)
+                  Statistics: Num rows: 9 Data size: 1197 Basic stats: COMPLETE Column stats: COMPLETE
+                  TableScan Vectorization:
+                      native: true
+                      vectorizationSchemaColumns: [0:id:int, 1:in_date:timestamp, 2:sample:varchar(100), 3:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>, 4:ROW__IS__DELETED:boolean]
+                  Filter Operator
+                    Filter Vectorization:
+                        className: VectorFilterOperator
+                        native: true
+                        predicateExpression: SelectColumnIsNotNull(col 0:int)
+                    predicate: id is not null (type: boolean)
+                    Statistics: Num rows: 9 Data size: 1197 Basic stats: COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: id (type: int), in_date (type: timestamp)
+                      null sort order: aa
+                      sort order: +-
+                      Map-reduce partition columns: id (type: int)
+                      Reduce Sink Vectorization:
+                          className: VectorReduceSinkObjectHashOperator
+                          keyColumns: 0:int, 1:timestamp
+                          native: true
+                          nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                          partitionColumns: 0:int
+                          valueColumns: 2:varchar(100)
+                      Statistics: Num rows: 9 Data size: 1197 Basic stats: COMPLETE Column stats: COMPLETE
+                      value expressions: sample (type: varchar(100))
+            Execution mode: vectorized
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                inputFormatFeatureSupport: [DECIMAL_64]
+                featureSupportInUse: [DECIMAL_64]
+                inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                allNative: true
+                usesVectorUDFAdaptor: false
+                vectorized: true
+                rowBatchContext:
+                    dataColumnCount: 3
+                    includeColumns: [0, 1, 2]
+                    dataColumns: id:int, in_date:timestamp, sample:varchar(100)
+                    partitionColumnCount: 0
+                    scratchColumnTypeNames: []
+        Reducer 3 
+            Execution mode: vectorized
+            Reduce Vectorization:
+                enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez] IS true
+                reduceColumnNullOrder: aa
+                reduceColumnSortOrder: +-
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+                rowBatchContext:
+                    dataColumnCount: 3
+                    dataColumns: KEY.reducesinkkey0:int, KEY.reducesinkkey1:timestamp, VALUE._col0:varchar(100)
+                    partitionColumnCount: 0
+                    scratchColumnTypeNames: [bigint, timestamp, string]
+            Reduce Operator Tree:
+              Select Operator
+                expressions: KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: timestamp), VALUE._col0 (type: varchar(100))
+                outputColumnNames: _col0, _col1, _col2
+                Select Vectorization:
+                    className: VectorSelectOperator
+                    native: true
+                    projectedOutputColumnNums: [0, 1, 2]
+                Statistics: Num rows: 9 Data size: 1197 Basic stats: COMPLETE Column stats: COMPLETE
+                PTF Operator
+                  Function definitions:
+                      Input definition
+                        input alias: ptf_0
+                        output shape: _col0: int, _col1: timestamp, _col2: varchar(100)
+                        type: WINDOWING
+                      Windowing table definition
+                        input alias: ptf_1
+                        name: windowingtablefunction
+                        order by: _col1 DESC NULLS FIRST
+                        partition by: _col0
+                        raw input shape:
+                        window functions:
+                            window function definition
+                              alias: ROW_NUMBER_window_0
+                              name: ROW_NUMBER
+                              window function: GenericUDAFRowNumberEvaluator
+                              window frame: ROWS PRECEDING(MAX)~FOLLOWING(MAX)
+                              isPivotResult: true
+                  PTF Vectorization:
+                      allEvaluatorsAreStreaming: true
+                      className: VectorPTFOperator
+                      evaluatorClasses: [VectorPTFEvaluatorRowNumber]
+                      functionInputExpressions: [null]
+                      functionNames: [ROW_NUMBER]
+                      keyInputColumns: [0, 1]
+                      native: true
+                      nonKeyInputColumns: [2]
+                      orderExpressions: [col 1:timestamp]
+                      outputColumns: [3, 0, 1, 2]
+                      outputTypes: [int, int, timestamp, varchar(100)]
+                      partitionExpressions: [col 0:int]
+                      streamingColumns: [3]
+                  Statistics: Num rows: 9 Data size: 1197 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    Filter Vectorization:
+                        className: VectorFilterOperator
+                        native: true
+                        predicateExpression: FilterLongColEqualLongScalar(col 3:int, val 1)
+                    predicate: (ROW_NUMBER_window_0 = 1) (type: boolean)
+                    Statistics: Num rows: 4 Data size: 532 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: _col0 (type: int), _col1 (type: timestamp), _col2 (type: varchar(100))
+                      outputColumnNames: _col0, _col1, _col2
+                      Select Vectorization:
+                          className: VectorSelectOperator
+                          native: true
+                          projectedOutputColumnNums: [0, 1, 2]
+                      Statistics: Num rows: 4 Data size: 532 Basic stats: COMPLETE Column stats: COMPLETE
+                      Map Join Operator
+                        condition map:
+                             Inner Join 0 to 1
+                        keys:
+                          0 _col0 (type: int)
+                          1 _col0 (type: int)
+                        Map Join Vectorization:
+                            bigTableKeyColumns: 0:int
+                            bigTableRetainColumnNums: [0, 1, 2]
+                            bigTableValueColumns: 0:int, 1:timestamp, 2:varchar(100)
+                            className: VectorMapJoinInnerLongOperator
+                            native: true
+                            nativeConditionsMet: hive.mapjoin.optimized.hashtable IS true, hive.vectorized.execution.mapjoin.native.enabled IS true, hive.execution.engine tez IN [tez] IS true, One MapJoin Condition IS true, No nullsafe IS true, Small table vectorizes IS true, Optimized Table and Supports Key Types IS true
+                            nonOuterSmallTableKeyMapping: []
+                            projectedOutput: 0:int, 4:timestamp, 5:varchar(100), 0:int, 1:timestamp, 2:varchar(100)
+                            smallTableValueMapping: 4:timestamp, 5:varchar(100)
+                            hashTableImplementationType: OPTIMIZED
+                        outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+                        input vertices:
+                          0 Map 1
+                        Statistics: Num rows: 9 Data size: 2257 Basic stats: COMPLETE Column stats: NONE
+                        File Output Operator
+                          compressed: false
+                          File Sink Vectorization:
+                              className: VectorFileSinkOperator
+                              native: false
+                          Statistics: Num rows: 9 Data size: 2257 Basic stats: COMPLETE Column stats: NONE
+                          table:
+                              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select * from DUP_TEST_TARGET T join (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dup_test
+PREHOOK: Input: default@dup_test_target
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from DUP_TEST_TARGET T join (SELECT id , in_date , sample FROM (SELECT id , in_date , sample ,ROW_NUMBER()
+OVER(PARTITION BY id ORDER BY in_date DESC ) AS ROW_NUMB  FROM DUP_TEST) OUTQUERY WHERE ROW_NUMB =1) as S ON T.id = S.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dup_test
+POSTHOOK: Input: default@dup_test_target
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+2      2023-04-14 10:11:12.111 test2   2       2023-04-14 10:11:12.111 test2
+3      2023-04-14 10:11:12.111 test3   3       2023-04-14 10:11:12.111 test3
+6      2023-04-14 10:11:12.111 test6   6       2023-04-14 10:11:12.111 test6
+7      2023-04-14 10:11:12.111 test7   7       2023-04-14 10:11:12.111 test7
+1      2023-04-14 10:11:12.111 test1   1       2023-04-14 10:11:12.111 test1
+4      2023-04-14 10:11:12.111 test4   4       2023-04-14 10:11:12.111 test4
+5      2023-04-14 10:11:12.111 test5   5       2023-04-14 10:11:12.111 test5
+8      2023-04-14 10:11:12.111 test8   8       2023-04-14 10:11:12.111 test8
+9      2023-04-14 10:11:12.111 test9   9       2023-04-14 10:11:12.111 test9
+PREHOOK: query: select * from DUP_TEST_TARGET T join DUP_TEST S ON T.id = S.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dup_test
+PREHOOK: Input: default@dup_test_target
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from DUP_TEST_TARGET T join DUP_TEST S ON T.id = S.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dup_test
+POSTHOOK: Input: default@dup_test_target
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1      2023-04-14 10:11:12.111 test1   1       2023-04-14 10:11:12.111 test1
+8      2023-04-14 10:11:12.111 test8   8       2023-04-14 10:11:12.111 test8
+4      2023-04-14 10:11:12.111 test4   4       2023-04-14 10:11:12.111 test4
+5      2023-04-14 10:11:12.111 test5   5       2023-04-14 10:11:12.111 test5
+3      2023-04-14 10:11:12.111 test3   3       2023-04-14 10:11:12.111 test3
+7      2023-04-14 10:11:12.111 test7   7       2023-04-14 10:11:12.111 test7
+9      2023-04-14 10:11:12.111 test9   9       2023-04-14 10:11:12.111 test9
+2      2023-04-14 10:11:12.111 test2   2       2023-04-14 10:11:12.111 test2
+6      2023-04-14 10:11:12.111 test6   6       2023-04-14 10:11:12.111 test6
