This is an automated email from the ASF dual-hosted git repository.

krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new f9a969c87ba HIVE-29175: Wrong Anti join conversion leading to missing 
results (#6055)
f9a969c87ba is described below

commit f9a969c87bac441cba07143f457e8a46c8dbe56e
Author: Dayakar M <daya.apa...@gmail.com>
AuthorDate: Mon Sep 8 12:20:15 2025 +0530

    HIVE-29175: Wrong Anti join conversion leading to missing results (#6055)
---
 .../hive/ql/optimizer/calcite/HiveCalciteUtil.java |  22 ++++
 .../calcite/rules/HiveAntiSemiJoinRule.java        |   5 +
 .../queries/clientpositive/antijoin_conversion.q   |  29 ++++++
 .../clientpositive/llap/antijoin_conversion.q.out  | 112 +++++++++++++++++++++
 4 files changed, 168 insertions(+)

diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
index a19a6c74ec3..a7305326c00 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
@@ -1272,6 +1272,28 @@ public static boolean 
hasAllExpressionsFromRightSide(RelNode joinRel, List<RexNo
     return true;
   }
 
+  /**
+   * Checks the operands in the join conditions are only from left side.
+   *
+   * @param joinRel Join node
+   * @return true if the join condition operands are from right and left side, 
false otherwise.
+   */
+  public static boolean checkIfJoinConditionOnlyUsesLeftOperands(Join joinRel) 
{
+    RexNode condition = joinRel.getCondition();
+    RelNode leftRel = joinRel.getLeft();
+    int leftFieldCount = leftRel.getRowType().getFieldCount();
+    ImmutableBitSet leftBitmap = ImmutableBitSet.range(leftFieldCount);
+    List<RexNode> conditions = RelOptUtil.conjunctions(condition);
+    for (RexNode cond : conditions) {
+      ImmutableBitSet condBitmap = RelOptUtil.InputFinder.bits(cond);
+      // here condition becomes true if both the operands are from left table
+      if (leftBitmap.contains(condBitmap)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
   /**
    * Extracts inputs referenced by aggregate operator.
    */
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
index 3697ec2c4aa..3c4c2ab693c 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
@@ -97,6 +97,11 @@ protected void perform(RelOptRuleCall call, Project project, 
Filter filter, Join
       return;
     }
 
+    // if one of the operand from join condition is not from right table then 
no need to convert to anti join.
+    if (HiveCalciteUtil.checkIfJoinConditionOnlyUsesLeftOperands(join)) {
+      return;
+    }
+
     LOG.debug("Matched HiveAntiJoinRule");
 
     // Build anti join with same left, right child and condition as original 
left outer join.
diff --git a/ql/src/test/queries/clientpositive/antijoin_conversion.q 
b/ql/src/test/queries/clientpositive/antijoin_conversion.q
index 70cb8b39d5a..71e5a097aed 100644
--- a/ql/src/test/queries/clientpositive/antijoin_conversion.q
+++ b/ql/src/test/queries/clientpositive/antijoin_conversion.q
@@ -20,3 +20,32 @@ select n.* from n left outer join t on (n.a=t.a) where 
cast(t.a as float) is nul
 select assert_true(count(1)=4) from n left outer join t on (n.a=t.a) where 
cast(t.a as float) is null;
 
 
+create table tab1 (col1 int, col2 int);
+create table tab2 (col1 int, col2 int);
+
+insert into tab1 values (123, 1000), (456, 1000), (123, 5000);
+
+insert into tab2 values (123, 1000), (456, 1000), (788, 1000);
+
+select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null;
+
+set hive.auto.convert.anti.join=true;
+
+explain CBO select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null;
+
+select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null;
diff --git a/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out 
b/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out
index 6e2cae9f0cf..c25c8427a27 100644
--- a/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out
+++ b/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out
@@ -282,3 +282,115 @@ POSTHOOK: Input: default@n
 POSTHOOK: Input: default@t
 #### A masked pattern was here ####
 NULL
+PREHOOK: query: create table tab1 (col1 int, col2 int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tab1
+POSTHOOK: query: create table tab1 (col1 int, col2 int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tab1
+PREHOOK: query: create table tab2 (col1 int, col2 int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tab2
+POSTHOOK: query: create table tab2 (col1 int, col2 int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tab2
+PREHOOK: query: insert into tab1 values (123, 1000), (456, 1000), (123, 5000)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tab1
+POSTHOOK: query: insert into tab1 values (123, 1000), (456, 1000), (123, 5000)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tab1
+POSTHOOK: Lineage: tab1.col1 SCRIPT []
+POSTHOOK: Lineage: tab1.col2 SCRIPT []
+PREHOOK: query: insert into tab2 values (123, 1000), (456, 1000), (788, 1000)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tab2
+POSTHOOK: query: insert into tab2 values (123, 1000), (456, 1000), (788, 1000)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tab2
+POSTHOOK: Lineage: tab2.col1 SCRIPT []
+POSTHOOK: Lineage: tab2.col2 SCRIPT []
+PREHOOK: query: select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tab1
+PREHOOK: Input: default@tab2
+#### A masked pattern was here ####
+POSTHOOK: query: select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tab1
+POSTHOOK: Input: default@tab2
+#### A masked pattern was here ####
+123    5000
+PREHOOK: query: explain CBO select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tab1
+PREHOOK: Input: default@tab2
+#### A masked pattern was here ####
+POSTHOOK: query: explain CBO select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tab1
+POSTHOOK: Input: default@tab2
+#### A masked pattern was here ####
+CBO PLAN:
+HiveProject(col1=[$0], col2=[$1])
+  HiveFilter(condition=[IS NULL($4)])
+    HiveJoin(condition=[AND(=($4, $0), =($3, $1))], joinType=[left], 
algorithm=[none], cost=[not available])
+      HiveJoin(condition=[AND(=($2, $0), =($3, $1))], joinType=[left], 
algorithm=[none], cost=[not available])
+        HiveProject(col1=[$0], col2=[$1])
+          HiveTableScan(table=[[default, tab1]], table:alias=[t1])
+        HiveProject(col1=[$0], col2=[$1])
+          HiveFilter(condition=[AND(IS NOT NULL($0), IS NOT NULL($1))])
+            HiveTableScan(table=[[default, tab2]], table:alias=[t2])
+      HiveProject(col1=[$0])
+        HiveFilter(condition=[IS NOT NULL($0)])
+          HiveTableScan(table=[[default, tab1]], table:alias=[t3])
+
+PREHOOK: query: select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tab1
+PREHOOK: Input: default@tab2
+#### A masked pattern was here ####
+POSTHOOK: query: select t1.col1, t1.col2 from tab1 t1
+left join tab2 t2
+on t2.col1=t1.col1 AND t2.col2=t1.col2
+left join tab1 t3
+on t3.col1=t1.col1 AND t2.col2=t1.col2
+where t3.col1 is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tab1
+POSTHOOK: Input: default@tab2
+#### A masked pattern was here ####
+123    5000

Reply via email to