This is an automated email from the ASF dual-hosted git repository.
kgyrtkirk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 59d462ad3e0 HIVE-26135: Invalid Anti join conversion may cause missing
results (#3205) (Zoltan Haindrich reviewed by Krisztian Kasa)
59d462ad3e0 is described below
commit 59d462ad3e023352ffbd57b4f2446e497a421252
Author: Zoltan Haindrich <[email protected]>
AuthorDate: Tue Apr 26 17:22:49 2022 +0200
HIVE-26135: Invalid Anti join conversion may cause missing results (#3205)
(Zoltan Haindrich reviewed by Krisztian Kasa)
---
.../hive/ql/optimizer/calcite/HiveCalciteUtil.java | 18 +-
.../calcite/rules/HiveAntiSemiJoinRule.java | 23 +-
.../queries/clientpositive/antijoin_conversion.q | 22 ++
.../clientpositive/llap/antijoin_conversion.q.out | 280 +++++++++++++++++++++
4 files changed, 339 insertions(+), 4 deletions(-)
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
index d925f159fba..160bfb86f6c 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveCalciteUtil.java
@@ -630,7 +630,7 @@ public class HiveCalciteUtil {
}
};
- public static ImmutableList<RexNode> getPredsNotPushedAlready(RelNode inp,
List<RexNode> predsToPushDown) {
+ public static ImmutableList<RexNode> getPredsNotPushedAlready(RelNode inp,
List<RexNode> predsToPushDown) {
return getPredsNotPushedAlready(Sets.<String>newHashSet(), inp,
predsToPushDown);
}
@@ -1238,6 +1238,22 @@ public class HiveCalciteUtil {
return false;
}
+ public static boolean hasAllExpressionsFromRightSide(RelNode joinRel,
List<RexNode> expressions) {
+ List<RelDataTypeField> joinFields = joinRel.getRowType().getFieldList();
+ int nTotalFields = joinFields.size();
+ List<RelDataTypeField> leftFields =
(joinRel.getInputs().get(0)).getRowType().getFieldList();
+ int nFieldsLeft = leftFields.size();
+ ImmutableBitSet rightBitmap = ImmutableBitSet.range(nFieldsLeft,
nTotalFields);
+
+ for (RexNode node : expressions) {
+ ImmutableBitSet inputBits = RelOptUtil.InputFinder.bits(node);
+ if (!rightBitmap.contains(inputBits)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
/**
* Extracts inputs referenced by aggregate operator.
*/
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
index 14a64c3d75c..3697ec2c4aa 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java
@@ -20,12 +20,15 @@ package org.apache.hadoop.hive.ql.optimizer.calcite.rules;
import org.apache.calcite.plan.RelOptRule;
import org.apache.calcite.plan.RelOptRuleCall;
import org.apache.calcite.plan.RelOptUtil;
+import org.apache.calcite.plan.Strong;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.core.Filter;
import org.apache.calcite.rel.core.Join;
import org.apache.calcite.rel.core.JoinRelType;
import org.apache.calcite.rel.core.Project;
+import org.apache.calcite.rex.RexCall;
import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.rex.RexVisitorImpl;
import org.apache.calcite.sql.SqlKind;
import org.apache.calcite.sql.fun.SqlStdOperatorTable;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
@@ -36,8 +39,7 @@ import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
+import java.util.concurrent.atomic.AtomicBoolean;
/**
* Planner rule that converts a join plus filter to anti join.
@@ -136,7 +138,8 @@ public class HiveAntiSemiJoinRule extends RelOptRule {
for (RexNode filterNode : aboveFilters) {
if (filterNode.getKind() == SqlKind.IS_NULL) {
// Null filter from right side table can be removed and its a
pre-condition for anti join conversion.
- if (HiveCalciteUtil.hasAnyExpressionFromRightSide(join,
Collections.singletonList(filterNode))) {
+ if (HiveCalciteUtil.hasAllExpressionsFromRightSide(join,
Collections.singletonList(filterNode))
+ && isStrong(((RexCall) filterNode).getOperands().get(0))) {
hasNullFilterOnRightSide = true;
} else {
filterList.add(filterNode);
@@ -157,4 +160,18 @@ public class HiveAntiSemiJoinRule extends RelOptRule {
}
return filterList;
}
+
+ private boolean isStrong(RexNode rexNode) {
+ AtomicBoolean hasCast = new AtomicBoolean(false);
+ rexNode.accept(new RexVisitorImpl<Void>(true) {
+ @Override
+ public Void visitCall(RexCall call) {
+ if (call.getKind() == SqlKind.CAST) {
+ hasCast.set(true);
+ }
+ return super.visitCall(call);
+ }
+ });
+ return !hasCast.get() && Strong.isStrong(rexNode);
+ }
}
diff --git a/ql/src/test/queries/clientpositive/antijoin_conversion.q
b/ql/src/test/queries/clientpositive/antijoin_conversion.q
new file mode 100644
index 00000000000..70cb8b39d5a
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/antijoin_conversion.q
@@ -0,0 +1,22 @@
+
+drop table if exists t;
+drop table if exists n;
+
+create table t(a string) stored as orc;
+create table n(a string) stored as orc;
+
+insert into t values ('a'),('1'),('2'),(null);
+insert into n values ('a'),('b'),('1'),('3'),(null);
+
+explain
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is
null;
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is
null;
+select assert_true(count(1)=4) from n left outer join t on (n.a=t.a) where
cast(t.a as float) is null;
+
+set hive.auto.convert.anti.join=false;
+explain
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is
null;
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is
null;
+select assert_true(count(1)=4) from n left outer join t on (n.a=t.a) where
cast(t.a as float) is null;
+
+
diff --git a/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out
b/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out
new file mode 100644
index 00000000000..583a06994ae
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/antijoin_conversion.q.out
@@ -0,0 +1,280 @@
+PREHOOK: query: drop table if exists t
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists t
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table if exists n
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists n
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table t(a string) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t
+POSTHOOK: query: create table t(a string) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t
+PREHOOK: query: create table n(a string) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@n
+POSTHOOK: query: create table n(a string) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@n
+PREHOOK: query: insert into t values ('a'),('1'),('2'),(null)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t
+POSTHOOK: query: insert into t values ('a'),('1'),('2'),(null)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t
+POSTHOOK: Lineage: t.a SCRIPT []
+PREHOOK: query: insert into n values ('a'),('b'),('1'),('3'),(null)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@n
+POSTHOOK: query: insert into n values ('a'),('b'),('1'),('3'),(null)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@n
+POSTHOOK: Lineage: n.a SCRIPT []
+PREHOOK: query: explain
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is
null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@n
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is
null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@n
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: n
+ Statistics: Num rows: 5 Data size: 425 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: a (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 5 Data size: 425 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 5 Data size: 425 Basic stats:
COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: t
+ filterExpr: a is not null (type: boolean)
+ Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE
Column stats: COMPLETE
+ Filter Operator
+ predicate: a is not null (type: boolean)
+ Statistics: Num rows: 3 Data size: 255 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: a (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 3 Data size: 255 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 3 Data size: 255 Basic stats:
COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Left Outer Join 0 to 1
+ keys:
+ 0 _col0 (type: string)
+ 1 _col0 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 5 Data size: 680 Basic stats: COMPLETE
Column stats: COMPLETE
+ Filter Operator
+ predicate: UDFToFloat(_col1) is null (type: boolean)
+ Statistics: Num rows: 2 Data size: 340 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: _col0 (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2 Data size: 170 Basic stats:
COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 2 Data size: 170 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select n.* from n left outer join t on (n.a=t.a) where
cast(t.a as float) is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@n
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: select n.* from n left outer join t on (n.a=t.a) where
cast(t.a as float) is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@n
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+3
+a
+b
+NULL
+PREHOOK: query: select assert_true(count(1)=4) from n left outer join t on
(n.a=t.a) where cast(t.a as float) is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@n
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: select assert_true(count(1)=4) from n left outer join t on
(n.a=t.a) where cast(t.a as float) is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@n
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+NULL
+PREHOOK: query: explain
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is
null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@n
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select n.* from n left outer join t on (n.a=t.a) where cast(t.a as float) is
null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@n
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: n
+ Statistics: Num rows: 5 Data size: 425 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: a (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 5 Data size: 425 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 5 Data size: 425 Basic stats:
COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: t
+ filterExpr: a is not null (type: boolean)
+ Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE
Column stats: COMPLETE
+ Filter Operator
+ predicate: a is not null (type: boolean)
+ Statistics: Num rows: 3 Data size: 255 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: a (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 3 Data size: 255 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 3 Data size: 255 Basic stats:
COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Left Outer Join 0 to 1
+ keys:
+ 0 _col0 (type: string)
+ 1 _col0 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 5 Data size: 680 Basic stats: COMPLETE
Column stats: COMPLETE
+ Filter Operator
+ predicate: UDFToFloat(_col1) is null (type: boolean)
+ Statistics: Num rows: 2 Data size: 340 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: _col0 (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2 Data size: 170 Basic stats:
COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 2 Data size: 170 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select n.* from n left outer join t on (n.a=t.a) where
cast(t.a as float) is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@n
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: select n.* from n left outer join t on (n.a=t.a) where
cast(t.a as float) is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@n
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+3
+a
+b
+NULL
+PREHOOK: query: select assert_true(count(1)=4) from n left outer join t on
(n.a=t.a) where cast(t.a as float) is null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@n
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: select assert_true(count(1)=4) from n left outer join t on
(n.a=t.a) where cast(t.a as float) is null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@n
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+NULL