This is an automated email from the ASF dual-hosted git repository.

krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new c054a47c7c2 HIVE-29290: Wrong results when n-way join contains both 
anti and outer join (#6173)
c054a47c7c2 is described below

commit c054a47c7c22875c93ca0d29b7ac730cf7deba04
Author: Krisztian Kasa <[email protected]>
AuthorDate: Wed Nov 12 06:43:36 2025 +0100

    HIVE-29290: Wrong results when n-way join contains both anti and outer join 
(#6173)
---
 .../hadoop/hive/ql/exec/CommonJoinOperator.java    |   8 +-
 .../hive/ql/exec/CommonMergeJoinOperator.java      |   3 +-
 ql/src/test/queries/clientpositive/antijoin3.q     |  22 ++
 .../results/clientpositive/llap/antijoin3.q.out    | 250 +++++++++++++++++++++
 4 files changed, 281 insertions(+), 2 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java 
b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java
index 58ae7f3ef42..bb0e274212e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java
@@ -990,7 +990,13 @@ protected void checkAndGenObject() throws HiveException {
         } else {
           if (!alw.hasRows()) {
             hasEmpty = true;
-            alw.addRow(dummyObj[i]);
+            if (!isRightOfAntiJoin) {
+              alw.addRow(dummyObj[i]);
+            }
+          } else if (isRightOfAntiJoin && !needsPostEvaluation)  {
+            // For anti join the right side should be empty. For 
needsPostEvaluation case we will
+            // wait till evaluation is done. For other cases we can directly 
return from here.
+            return;
           } else if (!hasEmpty && alw.isSingleRow()) {
             if (hasAnyFiltered(alias, alw.rowIter().first())) {
               hasEmpty = true;
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonMergeJoinOperator.java 
b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonMergeJoinOperator.java
index c677796de73..f9e7a40e10e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonMergeJoinOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonMergeJoinOperator.java
@@ -428,7 +428,8 @@ private List<Byte> joinObject(int[] smallestPos, boolean 
clear) throws HiveExcep
 
   private void putDummyOrEmpty(Byte i) {
     // put a empty list or null
-    if (noOuterJoin) {
+    boolean isRightOfAntiJoin = (i != 0 && condn[i-1].getType() == 
JoinDesc.ANTI_JOIN);
+    if (noOuterJoin || isRightOfAntiJoin) {
       storage[i] = emptyList;
     } else {
       storage[i] = dummyObjVectors[i];
diff --git a/ql/src/test/queries/clientpositive/antijoin3.q 
b/ql/src/test/queries/clientpositive/antijoin3.q
new file mode 100644
index 00000000000..b2ce2b0fc80
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/antijoin3.q
@@ -0,0 +1,22 @@
+set hive.merge.nway.joins=true;
+
+create table taba(id string);
+create table tabb(id string);
+create table tabc(id string);
+
+INSERT INTO TABLE taba VALUES ('1'),('2');
+INSERT INTO TABLE tabc VALUES ('1'),('2'),('2');
+
+explain
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join 
tabc C on (C.id = A.id) where B.id is null;
+explain cbo
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join 
tabc C on (C.id = A.id) where B.id is null;
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join 
tabc C on (C.id = A.id) where B.id is null;
+
+INSERT INTO TABLE tabb VALUES ('1');
+
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join 
tabc C on (C.id = A.id) where B.id is null;
+
+INSERT INTO TABLE tabb VALUES ('2');
+
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join 
tabc C on (C.id = A.id) where B.id is null;
diff --git a/ql/src/test/results/clientpositive/llap/antijoin3.q.out 
b/ql/src/test/results/clientpositive/llap/antijoin3.q.out
new file mode 100644
index 00000000000..b12a4fe603f
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/antijoin3.q.out
@@ -0,0 +1,250 @@
+PREHOOK: query: create table taba(id string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@taba
+POSTHOOK: query: create table taba(id string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@taba
+PREHOOK: query: create table tabb(id string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tabb
+POSTHOOK: query: create table tabb(id string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tabb
+PREHOOK: query: create table tabc(id string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tabc
+POSTHOOK: query: create table tabc(id string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tabc
+PREHOOK: query: INSERT INTO TABLE taba VALUES ('1'),('2')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@taba
+POSTHOOK: query: INSERT INTO TABLE taba VALUES ('1'),('2')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@taba
+POSTHOOK: Lineage: taba.id SCRIPT []
+PREHOOK: query: INSERT INTO TABLE tabc VALUES ('1'),('2'),('2')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tabc
+POSTHOOK: query: INSERT INTO TABLE tabc VALUES ('1'),('2'),('2')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tabc
+POSTHOOK: Lineage: tabc.id SCRIPT []
+PREHOOK: query: explain
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join 
tabc C on (C.id = A.id) where B.id is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@taba
+PREHOOK: Input: default@tabb
+PREHOOK: Input: default@tabc
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join 
tabc C on (C.id = A.id) where B.id is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@taba
+POSTHOOK: Input: default@tabb
+POSTHOOK: Input: default@tabc
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE), Map 4 
(SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: a
+                  Statistics: Num rows: 2 Data size: 170 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Select Operator
+                    expressions: id (type: string)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 2 Data size: 170 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: string)
+                      null sort order: z
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: string)
+                      Statistics: Num rows: 2 Data size: 170 Basic stats: 
COMPLETE Column stats: COMPLETE
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Map 3 
+            Map Operator Tree:
+                TableScan
+                  alias: b
+                  filterExpr: id is not null (type: boolean)
+                  Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE 
Column stats: NONE
+                  Filter Operator
+                    predicate: id is not null (type: boolean)
+                    Statistics: Num rows: 1 Data size: 184 Basic stats: 
COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: id (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 1 Data size: 184 Basic stats: 
COMPLETE Column stats: NONE
+                      Group By Operator
+                        keys: _col0 (type: string)
+                        minReductionHashAggr: 0.99
+                        mode: hash
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 1 Data size: 184 Basic stats: 
COMPLETE Column stats: NONE
+                        Reduce Output Operator
+                          key expressions: _col0 (type: string)
+                          null sort order: z
+                          sort order: +
+                          Map-reduce partition columns: _col0 (type: string)
+                          Statistics: Num rows: 1 Data size: 184 Basic stats: 
COMPLETE Column stats: NONE
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Map 4 
+            Map Operator Tree:
+                TableScan
+                  alias: c
+                  filterExpr: id is not null (type: boolean)
+                  Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Filter Operator
+                    predicate: id is not null (type: boolean)
+                    Statistics: Num rows: 3 Data size: 255 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: id (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 3 Data size: 255 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        null sort order: z
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 3 Data size: 255 Basic stats: 
COMPLETE Column stats: COMPLETE
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Anti Join 0 to 1
+                     Left Outer Join 0 to 2
+                keys:
+                  0 _col0 (type: string)
+                  1 _col0 (type: string)
+                  2 _col0 (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 6 Data size: 561 Basic stats: COMPLETE 
Column stats: NONE
+                Select Operator
+                  expressions: _col0 (type: string), null (type: string), 
_col1 (type: string)
+                  outputColumnNames: _col0, _col1, _col2
+                  Statistics: Num rows: 6 Data size: 561 Basic stats: COMPLETE 
Column stats: NONE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 6 Data size: 561 Basic stats: 
COMPLETE Column stats: NONE
+                    table:
+                        input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain cbo
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join 
tabc C on (C.id = A.id) where B.id is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@taba
+PREHOOK: Input: default@tabb
+PREHOOK: Input: default@tabc
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join 
tabc C on (C.id = A.id) where B.id is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@taba
+POSTHOOK: Input: default@tabb
+POSTHOOK: Input: default@tabc
+#### A masked pattern was here ####
+CBO PLAN:
+HiveProject(id=[$1], id1=[null:VARCHAR(2147483647) CHARACTER SET "UTF-16LE"], 
id2=[$0])
+  HiveJoin(condition=[=($0, $1)], joinType=[right], algorithm=[none], 
cost=[not available])
+    HiveProject(id=[$0])
+      HiveFilter(condition=[IS NOT NULL($0)])
+        HiveTableScan(table=[[default, tabc]], table:alias=[c])
+    HiveAntiJoin(condition=[=($0, $1)], joinType=[anti])
+      HiveProject(id=[$0])
+        HiveTableScan(table=[[default, taba]], table:alias=[a])
+      HiveProject(id=[$0])
+        HiveFilter(condition=[IS NOT NULL($0)])
+          HiveTableScan(table=[[default, tabb]], table:alias=[b])
+
+PREHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) 
left outer join tabc C on (C.id = A.id) where B.id is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@taba
+PREHOOK: Input: default@tabb
+PREHOOK: Input: default@tabc
+#### A masked pattern was here ####
+POSTHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) 
left outer join tabc C on (C.id = A.id) where B.id is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@taba
+POSTHOOK: Input: default@tabb
+POSTHOOK: Input: default@tabc
+#### A masked pattern was here ####
+1      NULL    1
+2      NULL    2
+2      NULL    2
+PREHOOK: query: INSERT INTO TABLE tabb VALUES ('1')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tabb
+POSTHOOK: query: INSERT INTO TABLE tabb VALUES ('1')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tabb
+POSTHOOK: Lineage: tabb.id SCRIPT []
+PREHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) 
left outer join tabc C on (C.id = A.id) where B.id is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@taba
+PREHOOK: Input: default@tabb
+PREHOOK: Input: default@tabc
+#### A masked pattern was here ####
+POSTHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) 
left outer join tabc C on (C.id = A.id) where B.id is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@taba
+POSTHOOK: Input: default@tabb
+POSTHOOK: Input: default@tabc
+#### A masked pattern was here ####
+2      NULL    2
+2      NULL    2
+PREHOOK: query: INSERT INTO TABLE tabb VALUES ('2')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tabb
+POSTHOOK: query: INSERT INTO TABLE tabb VALUES ('2')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tabb
+POSTHOOK: Lineage: tabb.id SCRIPT []
+PREHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) 
left outer join tabc C on (C.id = A.id) where B.id is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@taba
+PREHOOK: Input: default@tabb
+PREHOOK: Input: default@tabc
+#### A masked pattern was here ####
+POSTHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) 
left outer join tabc C on (C.id = A.id) where B.id is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@taba
+POSTHOOK: Input: default@tabb
+POSTHOOK: Input: default@tabc
+#### A masked pattern was here ####

Reply via email to