(hive) branch branch-4.0 updated: HIVE-27988: Don't convert FullOuterJoin with filter to MapJoin (Seonggon Namgung, reviewed by Denys Kuzmenko)

dkuzmenko Wed, 10 Jan 2024 01:16:30 -0800

This is an automated email from the ASF dual-hosted git repository.

dkuzmenko pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/hive.git



The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new f355c82a5aa HIVE-27988: Don't convert FullOuterJoin with filter to MapJoin (Seonggon Namgung, reviewed by Denys Kuzmenko)
f355c82a5aa is described below

commit f355c82a5aa77ef1496b35c22b8ac9b84dfe1780
Author: seonggon <[email protected]>
AuthorDate: Wed Jan 10 18:15:39 2024 +0900

    HIVE-27988: Don't convert FullOuterJoin with filter to MapJoin (Seonggon Namgung, reviewed by Denys Kuzmenko)
    
    Closes #4958
---
 .../hadoop/hive/ql/optimizer/MapJoinProcessor.java | 26 +++++++++
 .../llap/mapjoin_filter_on_outerjoin_tez.q.out     | 13 ++---
 .../llap/vector_outer_join_constants.q.out         | 66 ++++++++++------------
 3 files changed, 61 insertions(+), 44 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
index adf4fbe1b21..fc9cb2a98d2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
@@ -425,6 +425,32 @@ public class MapJoinProcessor extends Transform {
       return false;
     }
 
+    // Do not convert to MapJoin if FullOuterJoin has any filter expression.
+    // This partially disables HIVE-18908 optimization and solves the MapJoin correctness problems
+    // described in HIVE-27226.
+    if (joinDesc.getFilters() != null) {
+      // Unlike CommonJoinOperator.hasFilter(), we check getFilters() instead of getFilterMap() because
+      // getFilterMap() can be non-null while getFilters() is empty.
+
+      boolean hasFullOuterJoinWithFilter = 
Arrays.stream(joinDesc.getConds()).anyMatch(cond -> {
+        if (cond.getType() == JoinDesc.FULL_OUTER_JOIN) {
+          Byte left = (byte) cond.getLeft();
+          Byte right = (byte) cond.getRight();
+          boolean leftHasFilter =
+              joinDesc.getFilters().containsKey(left) && 
!joinDesc.getFilters().get(left).isEmpty();
+          boolean rightHasFilter =
+              joinDesc.getFilters().containsKey(right) && 
!joinDesc.getFilters().get(right).isEmpty();
+          return leftHasFilter || rightHasFilter;
+        } else {
+          return false;
+        }
+      });
+      if (hasFullOuterJoinWithFilter) {
+        LOG.debug("FULL OUTER MapJoin not enabled: FullOuterJoin with filters 
not supported");
+        return false;
+      }
+    }
+
     return true;
   }
 
diff --git a/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out b/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out
index 5080aed0950..687ec32910b 100644
--- a/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out
+++ b/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out
@@ -754,7 +754,7 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE), Map 3 (CUSTOM_SIMPLE_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
 #### A masked pattern was here ####
       Vertices:
         Map 1 
@@ -790,26 +790,23 @@ STAGE PLANS:
                       sort order: +
                       Map-reduce partition columns: _col0 (type: int)
                       Statistics: Num rows: 2 Data size: 24 Basic stats: 
COMPLETE Column stats: COMPLETE
-                      value expressions: _col1 (type: int), _col2 (type: 
boolean), (UDFToShort((not _col2)) * 1S) (type: smallint)
+                      value expressions: _col1 (type: int), _col2 (type: 
boolean)
             Execution mode: llap
             LLAP IO: all inputs
         Reducer 2 
             Execution mode: llap
             Reduce Operator Tree:
-              Map Join Operator
+              Merge Join Operator
                 condition map:
                      Full Outer Join 0 to 1
                 filter predicates:
                   0 {VALUE._col1}
                   1 {VALUE._col1}
                 keys:
-                  0 KEY.reducesinkkey0 (type: int)
-                  1 KEY.reducesinkkey0 (type: int)
+                  0 _col0 (type: int)
+                  1 _col0 (type: int)
                 outputColumnNames: _col0, _col1, _col3, _col4
-                input vertices:
-                  1 Map 3
                 Statistics: Num rows: 4 Data size: 64 Basic stats: COMPLETE 
Column stats: COMPLETE
-                DynamicPartitionHashJoin: true
                 Select Operator
                   expressions: _col0 (type: int), _col1 (type: int), _col3 
(type: int), _col4 (type: int)
                   outputColumnNames: _col0, _col1, _col2, _col3
diff --git a/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out b/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out
index 8ea8348431b..dd94e53a68b 100644
--- a/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out
@@ -184,7 +184,7 @@ POSTHOOK: type: ANALYZE_TABLE
 POSTHOOK: Input: default@lday
 POSTHOOK: Output: default@lday
 #### A masked pattern was here ####
-Warning: Map Join MAPJOIN[79][bigTable=?] in task 'Reducer 4' is a cross 
product
+Warning: Shuffle Join MERGEJOIN[79][tables = [$hdt$_0, $hdt$_1]] in Stage 
'Reducer 3' is a cross product
 PREHOOK: query: EXPLAIN VECTORIZATION DETAIL
 select * from
 (select        item1.S_ID  S_ID,
@@ -275,8 +275,8 @@ STAGE PLANS:
         Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE), Map 7 
(BROADCAST_EDGE)
         Map 6 <- Map 7 (BROADCAST_EDGE)
         Reducer 2 <- Map 1 (SIMPLE_EDGE)
-        Reducer 3 <- Map 1 (SIMPLE_EDGE)
-        Reducer 4 <- Reducer 2 (CUSTOM_SIMPLE_EDGE), Reducer 3 
(CUSTOM_SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE), Reducer 4 
(CUSTOM_SIMPLE_EDGE)
+        Reducer 4 <- Map 1 (SIMPLE_EDGE)
 #### A masked pattern was here ####
       Vertices:
         Map 1 
@@ -653,11 +653,34 @@ STAGE PLANS:
                         className: VectorReduceSinkEmptyKeyOperator
                         native: true
                         nativeConditionsMet: 
hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine 
tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, 
BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
-                        valueColumns: 1:int, 2:timestamp, 3:smallint
-                        valueExpressions: ConstantVectorExpression(val 0) -> 
3:smallint
+                        valueColumns: 1:int, 2:timestamp
                     Statistics: Num rows: 1 Data size: 44 Basic stats: 
COMPLETE Column stats: COMPLETE
-                    value expressions: _col0 (type: int), _col1 (type: 
timestamp), 0S (type: smallint)
+                    value expressions: _col0 (type: int), _col1 (type: 
timestamp)
         Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Full Outer Join 0 to 1
+                filter predicates:
+                  0 
+                  1 {true}
+                keys:
+                  0 
+                  1 
+                outputColumnNames: _col0, _col1, _col2, _col3
+                Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE 
Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  table:
+                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            MergeJoin Vectorization:
+                enabled: false
+                enableConditionsNotMet: Vectorizing MergeJoin Supported IS 
false
+        Reducer 4 
             Execution mode: vectorized, llap
             Reduce Vectorization:
                 enabled: true
@@ -704,35 +727,6 @@ STAGE PLANS:
                         valueColumns: 1:int, 2:timestamp
                     Statistics: Num rows: 1 Data size: 44 Basic stats: 
COMPLETE Column stats: COMPLETE
                     value expressions: _col0 (type: int), _col1 (type: 
timestamp)
-        Reducer 4 
-            Execution mode: llap
-            Reduce Vectorization:
-                enabled: true
-                enableConditionsMet: hive.vectorized.execution.reduce.enabled 
IS true, hive.execution.engine tez IN [tez] IS true
-                notVectorizedReason: MAPJOIN operator: Vectorized & filtered 
full-outer joins not supported
-                vectorized: false
-            Reduce Operator Tree:
-              Map Join Operator
-                condition map:
-                     Full Outer Join 0 to 1
-                filter predicates:
-                  0 
-                  1 {true}
-                keys:
-                  0 
-                  1 
-                outputColumnNames: _col0, _col1, _col2, _col3
-                input vertices:
-                  0 Reducer 2
-                Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE 
Column stats: COMPLETE
-                DynamicPartitionHashJoin: true
-                File Output Operator
-                  compressed: false
-                  Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE 
Column stats: COMPLETE
-                  table:
-                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
-                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
-                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
 
   Stage: Stage-0
     Fetch Operator
@@ -740,7 +734,7 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
-Warning: Map Join MAPJOIN[79][bigTable=?] in task 'Reducer 4' is a cross 
product
+Warning: Shuffle Join MERGEJOIN[79][tables = [$hdt$_0, $hdt$_1]] in Stage 
'Reducer 3' is a cross product
 PREHOOK: query: select * from
 (select        item1.S_ID  S_ID,
                 ytday1.D_DATE  D_DATE

(hive) branch branch-4.0 updated: HIVE-27988: Don't convert FullOuterJoin with filter to MapJoin (Seonggon Namgung, reviewed by Denys Kuzmenko)

Reply via email to