This is an automated email from the ASF dual-hosted git repository. krisztiankasa pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new f9a969c87ba HIVE-29175: Wrong Anti join conversion leading to missing results (#6055) f9a969c87ba is described below commit f9a969c87bac441cba07143f457e8a46c8dbe56e Author: Dayakar M <daya.apa...@gmail.com> AuthorDate: Mon Sep 8 12:20:15 2025 +0530 HIVE-29175: Wrong Anti join conversion leading to missing results (#6055) --- .../hive/ql/optimizer/calcite/HiveCalciteUtil.java | 22 ++++ .../calcite/rules/HiveAntiSemiJoinRule.java | 5 + .../queries/clientpositive/antijoin_conversion.q | 29 ++++++ .../clientpositive/llap/antijoin_conversion.q.out | 112 +++++++++++++++++++++ 4 files changed, 168 insertions(+) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java index a19a6c74ec3..a7305326c00 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java @@ -1272,6 +1272,28 @@ public static boolean hasAllExpressionsFromRightSide(RelNode joinRel, List<RexNo return true; } + /** + * Checks the operands in the join conditions are only from left side. + * + * @param joinRel Join node + * @return true if the join condition operands are from right and left side, false otherwise. + */ + public static boolean checkIfJoinConditionOnlyUsesLeftOperands(Join joinRel) { + RexNode condition = joinRel.getCondition(); + RelNode leftRel = joinRel.getLeft(); + int leftFieldCount = leftRel.getRowType().getFieldCount(); + ImmutableBitSet leftBitmap = ImmutableBitSet.range(leftFieldCount); + List<RexNode> conditions = RelOptUtil.conjunctions(condition); + for (RexNode cond : conditions) { + ImmutableBitSet condBitmap = RelOptUtil.InputFinder.bits(cond); + // here condition becomes true if both the operands are from left table + if (leftBitmap.contains(condBitmap)) { + return true; + } + } + return false; + } + /** * Extracts inputs referenced by aggregate operator. */ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java index 3697ec2c4aa..3c4c2ab693c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java @@ -97,6 +97,11 @@ protected void perform(RelOptRuleCall call, Project project, Filter filter, Join return; } + // if one of the operand from join condition is not from right table then no need to convert to anti join. + if (HiveCalciteUtil.checkIfJoinConditionOnlyUsesLeftOperands(join)) { + return; + } + LOG.debug("Matched HiveAntiJoinRule"); // Build anti join with same left, right child and condition as original left outer join. diff --git a/ql/src/test/queries/clientpositive/antijoin_conversion.q b/ql/src/test/queries/clientpositive/antijoin_conversion.q index 70cb8b39d5a..71e5a097aed 100644 --- a/ql/src/test/queries/clientpositive/antijoin_conversion.q +++ b/ql/src/test/queries/clientpositive/antijoin_conversion.q @@ -20,3 +20,32 @@ select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is nul select assert_true(count(1)=4) from n left outer join t on (n.a=t.a) where cast(t.a as float) is null; +create table tab1 (col1 int, col2 int); +create table tab2 (col1 int, col2 int); + +insert into tab1 values (123, 1000), (456, 1000), (123, 5000); + +insert into tab2 values (123, 1000), (456, 1000), (788, 1000); + +select t1.col1, t1.col2 from tab1 t1 +left join tab2 t2 +on t2.col1=t1.col1 AND t2.col2=t1.col2 +left join tab1 t3 +on t3.col1=t1.col1 AND t2.col2=t1.col2 +where t3.col1 is null; + +set hive.auto.convert.anti.join=true; + +explain CBO select t1.col1, t1.col2 from tab1 t1 +left join tab2 t2 +on t2.col1=t1.col1 AND t2.col2=t1.col2 +left join tab1 t3 +on t3.col1=t1.col1 AND t2.col2=t1.col2 +where t3.col1 is null; + +select t1.col1, t1.col2 from tab1 t1 +left join tab2 t2 +on t2.col1=t1.col1 AND t2.col2=t1.col2 +left join tab1 t3 +on t3.col1=t1.col1 AND t2.col2=t1.col2 +where t3.col1 is null; diff --git a/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out b/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out index 6e2cae9f0cf..c25c8427a27 100644 --- a/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out +++ b/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out @@ -282,3 +282,115 @@ POSTHOOK: Input: default@n POSTHOOK: Input: default@t #### A masked pattern was here #### NULL +PREHOOK: query: create table tab1 (col1 int, col2 int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tab1 +POSTHOOK: query: create table tab1 (col1 int, col2 int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tab1 +PREHOOK: query: create table tab2 (col1 int, col2 int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tab2 +POSTHOOK: query: create table tab2 (col1 int, col2 int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tab2 +PREHOOK: query: insert into tab1 values (123, 1000), (456, 1000), (123, 5000) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@tab1 +POSTHOOK: query: insert into tab1 values (123, 1000), (456, 1000), (123, 5000) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@tab1 +POSTHOOK: Lineage: tab1.col1 SCRIPT [] +POSTHOOK: Lineage: tab1.col2 SCRIPT [] +PREHOOK: query: insert into tab2 values (123, 1000), (456, 1000), (788, 1000) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@tab2 +POSTHOOK: query: insert into tab2 values (123, 1000), (456, 1000), (788, 1000) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@tab2 +POSTHOOK: Lineage: tab2.col1 SCRIPT [] +POSTHOOK: Lineage: tab2.col2 SCRIPT [] +PREHOOK: query: select t1.col1, t1.col2 from tab1 t1 +left join tab2 t2 +on t2.col1=t1.col1 AND t2.col2=t1.col2 +left join tab1 t3 +on t3.col1=t1.col1 AND t2.col2=t1.col2 +where t3.col1 is null +PREHOOK: type: QUERY +PREHOOK: Input: default@tab1 +PREHOOK: Input: default@tab2 +#### A masked pattern was here #### +POSTHOOK: query: select t1.col1, t1.col2 from tab1 t1 +left join tab2 t2 +on t2.col1=t1.col1 AND t2.col2=t1.col2 +left join tab1 t3 +on t3.col1=t1.col1 AND t2.col2=t1.col2 +where t3.col1 is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab1 +POSTHOOK: Input: default@tab2 +#### A masked pattern was here #### +123 5000 +PREHOOK: query: explain CBO select t1.col1, t1.col2 from tab1 t1 +left join tab2 t2 +on t2.col1=t1.col1 AND t2.col2=t1.col2 +left join tab1 t3 +on t3.col1=t1.col1 AND t2.col2=t1.col2 +where t3.col1 is null +PREHOOK: type: QUERY +PREHOOK: Input: default@tab1 +PREHOOK: Input: default@tab2 +#### A masked pattern was here #### +POSTHOOK: query: explain CBO select t1.col1, t1.col2 from tab1 t1 +left join tab2 t2 +on t2.col1=t1.col1 AND t2.col2=t1.col2 +left join tab1 t3 +on t3.col1=t1.col1 AND t2.col2=t1.col2 +where t3.col1 is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab1 +POSTHOOK: Input: default@tab2 +#### A masked pattern was here #### +CBO PLAN: +HiveProject(col1=[$0], col2=[$1]) + HiveFilter(condition=[IS NULL($4)]) + HiveJoin(condition=[AND(=($4, $0), =($3, $1))], joinType=[left], algorithm=[none], cost=[not available]) + HiveJoin(condition=[AND(=($2, $0), =($3, $1))], joinType=[left], algorithm=[none], cost=[not available]) + HiveProject(col1=[$0], col2=[$1]) + HiveTableScan(table=[[default, tab1]], table:alias=[t1]) + HiveProject(col1=[$0], col2=[$1]) + HiveFilter(condition=[AND(IS NOT NULL($0), IS NOT NULL($1))]) + HiveTableScan(table=[[default, tab2]], table:alias=[t2]) + HiveProject(col1=[$0]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, tab1]], table:alias=[t3]) + +PREHOOK: query: select t1.col1, t1.col2 from tab1 t1 +left join tab2 t2 +on t2.col1=t1.col1 AND t2.col2=t1.col2 +left join tab1 t3 +on t3.col1=t1.col1 AND t2.col2=t1.col2 +where t3.col1 is null +PREHOOK: type: QUERY +PREHOOK: Input: default@tab1 +PREHOOK: Input: default@tab2 +#### A masked pattern was here #### +POSTHOOK: query: select t1.col1, t1.col2 from tab1 t1 +left join tab2 t2 +on t2.col1=t1.col1 AND t2.col2=t1.col2 +left join tab1 t3 +on t3.col1=t1.col1 AND t2.col2=t1.col2 +where t3.col1 is null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tab1 +POSTHOOK: Input: default@tab2 +#### A masked pattern was here #### +123 5000