This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new f355c82a5aa HIVE-27988: Don't convert FullOuterJoin with filter to
MapJoin (Seonggon Namgung, reviewed by Denys Kuzmenko)
f355c82a5aa is described below
commit f355c82a5aa77ef1496b35c22b8ac9b84dfe1780
Author: seonggon <[email protected]>
AuthorDate: Wed Jan 10 18:15:39 2024 +0900
HIVE-27988: Don't convert FullOuterJoin with filter to MapJoin (Seonggon
Namgung, reviewed by Denys Kuzmenko)
Closes #4958
---
.../hadoop/hive/ql/optimizer/MapJoinProcessor.java | 26 +++++++++
.../llap/mapjoin_filter_on_outerjoin_tez.q.out | 13 ++---
.../llap/vector_outer_join_constants.q.out | 66 ++++++++++------------
3 files changed, 61 insertions(+), 44 deletions(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
index adf4fbe1b21..fc9cb2a98d2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
@@ -425,6 +425,32 @@ public class MapJoinProcessor extends Transform {
return false;
}
+ // Do not convert to MapJoin if FullOuterJoin has any filter expression.
+ // This partially disables HIVE-18908 optimization and solves the MapJoin correctness problems
+ // described in HIVE-27226.
+ if (joinDesc.getFilters() != null) {
+ // Unlike CommonJoinOperator.hasFilter(), we check getFilters() instead of getFilterMap() because
+ // getFilterMap() can be non-null while getFilters() is empty.
+
+ boolean hasFullOuterJoinWithFilter =
Arrays.stream(joinDesc.getConds()).anyMatch(cond -> {
+ if (cond.getType() == JoinDesc.FULL_OUTER_JOIN) {
+ Byte left = (byte) cond.getLeft();
+ Byte right = (byte) cond.getRight();
+ boolean leftHasFilter =
+ joinDesc.getFilters().containsKey(left) &&
!joinDesc.getFilters().get(left).isEmpty();
+ boolean rightHasFilter =
+ joinDesc.getFilters().containsKey(right) &&
!joinDesc.getFilters().get(right).isEmpty();
+ return leftHasFilter || rightHasFilter;
+ } else {
+ return false;
+ }
+ });
+ if (hasFullOuterJoinWithFilter) {
+ LOG.debug("FULL OUTER MapJoin not enabled: FullOuterJoin with filters
not supported");
+ return false;
+ }
+ }
+
return true;
}
diff --git a/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out b/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out
index 5080aed0950..687ec32910b 100644
--- a/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out
+++ b/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out
@@ -754,7 +754,7 @@ STAGE PLANS:
Tez
#### A masked pattern was here ####
Edges:
- Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE), Map 3 (CUSTOM_SIMPLE_EDGE)
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -790,26 +790,23 @@ STAGE PLANS:
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 2 Data size: 24 Basic stats:
COMPLETE Column stats: COMPLETE
- value expressions: _col1 (type: int), _col2 (type:
boolean), (UDFToShort((not _col2)) * 1S) (type: smallint)
+ value expressions: _col1 (type: int), _col2 (type:
boolean)
Execution mode: llap
LLAP IO: all inputs
Reducer 2
Execution mode: llap
Reduce Operator Tree:
- Map Join Operator
+ Merge Join Operator
condition map:
Full Outer Join 0 to 1
filter predicates:
0 {VALUE._col1}
1 {VALUE._col1}
keys:
- 0 KEY.reducesinkkey0 (type: int)
- 1 KEY.reducesinkkey0 (type: int)
+ 0 _col0 (type: int)
+ 1 _col0 (type: int)
outputColumnNames: _col0, _col1, _col3, _col4
- input vertices:
- 1 Map 3
Statistics: Num rows: 4 Data size: 64 Basic stats: COMPLETE
Column stats: COMPLETE
- DynamicPartitionHashJoin: true
Select Operator
expressions: _col0 (type: int), _col1 (type: int), _col3
(type: int), _col4 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3
diff --git a/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out b/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out
index 8ea8348431b..dd94e53a68b 100644
--- a/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out
@@ -184,7 +184,7 @@ POSTHOOK: type: ANALYZE_TABLE
POSTHOOK: Input: default@lday
POSTHOOK: Output: default@lday
#### A masked pattern was here ####
-Warning: Map Join MAPJOIN[79][bigTable=?] in task 'Reducer 4' is a cross
product
+Warning: Shuffle Join MERGEJOIN[79][tables = [$hdt$_0, $hdt$_1]] in Stage
'Reducer 3' is a cross product
PREHOOK: query: EXPLAIN VECTORIZATION DETAIL
select * from
(select item1.S_ID S_ID,
@@ -275,8 +275,8 @@ STAGE PLANS:
Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE), Map 7
(BROADCAST_EDGE)
Map 6 <- Map 7 (BROADCAST_EDGE)
Reducer 2 <- Map 1 (SIMPLE_EDGE)
- Reducer 3 <- Map 1 (SIMPLE_EDGE)
- Reducer 4 <- Reducer 2 (CUSTOM_SIMPLE_EDGE), Reducer 3
(CUSTOM_SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE), Reducer 4
(CUSTOM_SIMPLE_EDGE)
+ Reducer 4 <- Map 1 (SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -653,11 +653,34 @@ STAGE PLANS:
className: VectorReduceSinkEmptyKeyOperator
native: true
nativeConditionsMet:
hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine
tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true,
BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
- valueColumns: 1:int, 2:timestamp, 3:smallint
- valueExpressions: ConstantVectorExpression(val 0) ->
3:smallint
+ valueColumns: 1:int, 2:timestamp
Statistics: Num rows: 1 Data size: 44 Basic stats:
COMPLETE Column stats: COMPLETE
- value expressions: _col0 (type: int), _col1 (type:
timestamp), 0S (type: smallint)
+ value expressions: _col0 (type: int), _col1 (type:
timestamp)
Reducer 3
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Full Outer Join 0 to 1
+ filter predicates:
+ 0
+ 1 {true}
+ keys:
+ 0
+ 1
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE
Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ MergeJoin Vectorization:
+ enabled: false
+ enableConditionsNotMet: Vectorizing MergeJoin Supported IS
false
+ Reducer 4
Execution mode: vectorized, llap
Reduce Vectorization:
enabled: true
@@ -704,35 +727,6 @@ STAGE PLANS:
valueColumns: 1:int, 2:timestamp
Statistics: Num rows: 1 Data size: 44 Basic stats:
COMPLETE Column stats: COMPLETE
value expressions: _col0 (type: int), _col1 (type:
timestamp)
- Reducer 4
- Execution mode: llap
- Reduce Vectorization:
- enabled: true
- enableConditionsMet: hive.vectorized.execution.reduce.enabled
IS true, hive.execution.engine tez IN [tez] IS true
- notVectorizedReason: MAPJOIN operator: Vectorized & filtered
full-outer joins not supported
- vectorized: false
- Reduce Operator Tree:
- Map Join Operator
- condition map:
- Full Outer Join 0 to 1
- filter predicates:
- 0
- 1 {true}
- keys:
- 0
- 1
- outputColumnNames: _col0, _col1, _col2, _col3
- input vertices:
- 0 Reducer 2
- Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE
Column stats: COMPLETE
- DynamicPartitionHashJoin: true
- File Output Operator
- compressed: false
- Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE
Column stats: COMPLETE
- table:
- input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
- output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Stage: Stage-0
Fetch Operator
@@ -740,7 +734,7 @@ STAGE PLANS:
Processor Tree:
ListSink
-Warning: Map Join MAPJOIN[79][bigTable=?] in task 'Reducer 4' is a cross
product
+Warning: Shuffle Join MERGEJOIN[79][tables = [$hdt$_0, $hdt$_1]] in Stage
'Reducer 3' is a cross product
PREHOOK: query: select * from
(select item1.S_ID S_ID,
ytday1.D_DATE D_DATE