This is an automated email from the ASF dual-hosted git repository.
krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new c054a47c7c2 HIVE-29290: Wrong results when n-way join contains both
anti and outer join (#6173)
c054a47c7c2 is described below
commit c054a47c7c22875c93ca0d29b7ac730cf7deba04
Author: Krisztian Kasa <[email protected]>
AuthorDate: Wed Nov 12 06:43:36 2025 +0100
HIVE-29290: Wrong results when n-way join contains both anti and outer join
(#6173)
---
.../hadoop/hive/ql/exec/CommonJoinOperator.java | 8 +-
.../hive/ql/exec/CommonMergeJoinOperator.java | 3 +-
ql/src/test/queries/clientpositive/antijoin3.q | 22 ++
.../results/clientpositive/llap/antijoin3.q.out | 250 +++++++++++++++++++++
4 files changed, 281 insertions(+), 2 deletions(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java
index 58ae7f3ef42..bb0e274212e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java
@@ -990,7 +990,13 @@ protected void checkAndGenObject() throws HiveException {
} else {
if (!alw.hasRows()) {
hasEmpty = true;
- alw.addRow(dummyObj[i]);
+ if (!isRightOfAntiJoin) {
+ alw.addRow(dummyObj[i]);
+ }
+ } else if (isRightOfAntiJoin && !needsPostEvaluation) {
+ // For anti join the right side should be empty. For
needsPostEvaluation case we will
+ // wait till evaluation is done. For other cases we can directly
return from here.
+ return;
} else if (!hasEmpty && alw.isSingleRow()) {
if (hasAnyFiltered(alias, alw.rowIter().first())) {
hasEmpty = true;
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonMergeJoinOperator.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonMergeJoinOperator.java
index c677796de73..f9e7a40e10e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonMergeJoinOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonMergeJoinOperator.java
@@ -428,7 +428,8 @@ private List<Byte> joinObject(int[] smallestPos, boolean
clear) throws HiveExcep
private void putDummyOrEmpty(Byte i) {
// put a empty list or null
- if (noOuterJoin) {
+ boolean isRightOfAntiJoin = (i != 0 && condn[i-1].getType() ==
JoinDesc.ANTI_JOIN);
+ if (noOuterJoin || isRightOfAntiJoin) {
storage[i] = emptyList;
} else {
storage[i] = dummyObjVectors[i];
diff --git a/ql/src/test/queries/clientpositive/antijoin3.q
b/ql/src/test/queries/clientpositive/antijoin3.q
new file mode 100644
index 00000000000..b2ce2b0fc80
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/antijoin3.q
@@ -0,0 +1,22 @@
+set hive.merge.nway.joins=true;
+
+create table taba(id string);
+create table tabb(id string);
+create table tabc(id string);
+
+INSERT INTO TABLE taba VALUES ('1'),('2');
+INSERT INTO TABLE tabc VALUES ('1'),('2'),('2');
+
+explain
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join
tabc C on (C.id = A.id) where B.id is null;
+explain cbo
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join
tabc C on (C.id = A.id) where B.id is null;
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join
tabc C on (C.id = A.id) where B.id is null;
+
+INSERT INTO TABLE tabb VALUES ('1');
+
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join
tabc C on (C.id = A.id) where B.id is null;
+
+INSERT INTO TABLE tabb VALUES ('2');
+
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join
tabc C on (C.id = A.id) where B.id is null;
diff --git a/ql/src/test/results/clientpositive/llap/antijoin3.q.out
b/ql/src/test/results/clientpositive/llap/antijoin3.q.out
new file mode 100644
index 00000000000..b12a4fe603f
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/antijoin3.q.out
@@ -0,0 +1,250 @@
+PREHOOK: query: create table taba(id string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@taba
+POSTHOOK: query: create table taba(id string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@taba
+PREHOOK: query: create table tabb(id string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tabb
+POSTHOOK: query: create table tabb(id string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tabb
+PREHOOK: query: create table tabc(id string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tabc
+POSTHOOK: query: create table tabc(id string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tabc
+PREHOOK: query: INSERT INTO TABLE taba VALUES ('1'),('2')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@taba
+POSTHOOK: query: INSERT INTO TABLE taba VALUES ('1'),('2')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@taba
+POSTHOOK: Lineage: taba.id SCRIPT []
+PREHOOK: query: INSERT INTO TABLE tabc VALUES ('1'),('2'),('2')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tabc
+POSTHOOK: query: INSERT INTO TABLE tabc VALUES ('1'),('2'),('2')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tabc
+POSTHOOK: Lineage: tabc.id SCRIPT []
+PREHOOK: query: explain
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join
tabc C on (C.id = A.id) where B.id is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@taba
+PREHOOK: Input: default@tabb
+PREHOOK: Input: default@tabc
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join
tabc C on (C.id = A.id) where B.id is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@taba
+POSTHOOK: Input: default@tabb
+POSTHOOK: Input: default@tabc
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE), Map 4
(SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: a
+ Statistics: Num rows: 2 Data size: 170 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: id (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2 Data size: 170 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 2 Data size: 170 Basic stats:
COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: b
+ filterExpr: id is not null (type: boolean)
+ Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE
Column stats: NONE
+ Filter Operator
+ predicate: id is not null (type: boolean)
+ Statistics: Num rows: 1 Data size: 184 Basic stats:
COMPLETE Column stats: NONE
+ Select Operator
+ expressions: id (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 184 Basic stats:
COMPLETE Column stats: NONE
+ Group By Operator
+ keys: _col0 (type: string)
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 184 Basic stats:
COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 1 Data size: 184 Basic stats:
COMPLETE Column stats: NONE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map 4
+ Map Operator Tree:
+ TableScan
+ alias: c
+ filterExpr: id is not null (type: boolean)
+ Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE
Column stats: COMPLETE
+ Filter Operator
+ predicate: id is not null (type: boolean)
+ Statistics: Num rows: 3 Data size: 255 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: id (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 3 Data size: 255 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 3 Data size: 255 Basic stats:
COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Anti Join 0 to 1
+ Left Outer Join 0 to 2
+ keys:
+ 0 _col0 (type: string)
+ 1 _col0 (type: string)
+ 2 _col0 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 6 Data size: 561 Basic stats: COMPLETE
Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), null (type: string),
_col1 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 6 Data size: 561 Basic stats: COMPLETE
Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 6 Data size: 561 Basic stats:
COMPLETE Column stats: NONE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain cbo
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join
tabc C on (C.id = A.id) where B.id is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@taba
+PREHOOK: Input: default@tabb
+PREHOOK: Input: default@tabc
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo
+select * from taba A left outer join tabb B on (A.id = B.id) left outer join
tabc C on (C.id = A.id) where B.id is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@taba
+POSTHOOK: Input: default@tabb
+POSTHOOK: Input: default@tabc
+#### A masked pattern was here ####
+CBO PLAN:
+HiveProject(id=[$1], id1=[null:VARCHAR(2147483647) CHARACTER SET "UTF-16LE"],
id2=[$0])
+ HiveJoin(condition=[=($0, $1)], joinType=[right], algorithm=[none],
cost=[not available])
+ HiveProject(id=[$0])
+ HiveFilter(condition=[IS NOT NULL($0)])
+ HiveTableScan(table=[[default, tabc]], table:alias=[c])
+ HiveAntiJoin(condition=[=($0, $1)], joinType=[anti])
+ HiveProject(id=[$0])
+ HiveTableScan(table=[[default, taba]], table:alias=[a])
+ HiveProject(id=[$0])
+ HiveFilter(condition=[IS NOT NULL($0)])
+ HiveTableScan(table=[[default, tabb]], table:alias=[b])
+
+PREHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id)
left outer join tabc C on (C.id = A.id) where B.id is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@taba
+PREHOOK: Input: default@tabb
+PREHOOK: Input: default@tabc
+#### A masked pattern was here ####
+POSTHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id)
left outer join tabc C on (C.id = A.id) where B.id is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@taba
+POSTHOOK: Input: default@tabb
+POSTHOOK: Input: default@tabc
+#### A masked pattern was here ####
+1 NULL 1
+2 NULL 2
+2 NULL 2
+PREHOOK: query: INSERT INTO TABLE tabb VALUES ('1')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tabb
+POSTHOOK: query: INSERT INTO TABLE tabb VALUES ('1')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tabb
+POSTHOOK: Lineage: tabb.id SCRIPT []
+PREHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id)
left outer join tabc C on (C.id = A.id) where B.id is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@taba
+PREHOOK: Input: default@tabb
+PREHOOK: Input: default@tabc
+#### A masked pattern was here ####
+POSTHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id)
left outer join tabc C on (C.id = A.id) where B.id is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@taba
+POSTHOOK: Input: default@tabb
+POSTHOOK: Input: default@tabc
+#### A masked pattern was here ####
+2 NULL 2
+2 NULL 2
+PREHOOK: query: INSERT INTO TABLE tabb VALUES ('2')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tabb
+POSTHOOK: query: INSERT INTO TABLE tabb VALUES ('2')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tabb
+POSTHOOK: Lineage: tabb.id SCRIPT []
+PREHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id)
left outer join tabc C on (C.id = A.id) where B.id is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@taba
+PREHOOK: Input: default@tabb
+PREHOOK: Input: default@tabc
+#### A masked pattern was here ####
+POSTHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id)
left outer join tabc C on (C.id = A.id) where B.id is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@taba
+POSTHOOK: Input: default@tabb
+POSTHOOK: Input: default@tabc
+#### A masked pattern was here ####