amansinha100 commented on code in PR #6062:
URL: https://github.com/apache/hive/pull/6062#discussion_r2326291191
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java:
##########
@@ -135,43 +142,76 @@ private List<RexNode> getResidualFilterNodes(Filter
filter, Join join) {
List<RexNode> aboveFilters =
RelOptUtil.conjunctions(filter.getCondition());
boolean hasNullFilterOnRightSide = false;
List<RexNode> filterList = new ArrayList<>();
+ final ImmutableBitSet notNullColumnsFromRightSide =
getNotNullColumnsFromRightSide(join);
+
for (RexNode filterNode : aboveFilters) {
- if (filterNode.getKind() == SqlKind.IS_NULL) {
- // Null filter from right side table can be removed and its a
pre-condition for anti join conversion.
- if (HiveCalciteUtil.hasAllExpressionsFromRightSide(join,
Collections.singletonList(filterNode))
- && isStrong(((RexCall) filterNode).getOperands().get(0))) {
- hasNullFilterOnRightSide = true;
- } else {
- filterList.add(filterNode);
- }
- } else {
- if (HiveCalciteUtil.hasAnyExpressionFromRightSide(join,
Collections.singletonList(filterNode))) {
- // If some non null condition is present from right side, we can not
convert the join to anti join as
- // anti join does not project the fields from right side.
- return null;
- } else {
- filterList.add(filterNode);
- }
+ final ImmutableBitSet usedFields =
RelOptUtil.InputFinder.bits(filterNode);
+ boolean usesFieldFromRHS = usedFields.intersects(rhsFields);
+
+ if(!usesFieldFromRHS) {
+ // Only LHS fields or constants, so the filterNode is part of the
residual filter
+ filterList.add(filterNode);
+ continue;
+ }
+
+ // In the following we check for filter nodes that let us deduce that
+ // "an (originally) not-null column of RHS IS NULL because the LHS row
will not be matched"
+
+ boolean usesRHSFieldsOnly = rhsFields.contains(usedFields);
+ if (!usesRHSFieldsOnly) {
+ // If there is a mix between LHS and RHS fields, don't convert to
anti-join
+ return Optional.empty();
+ }
+
+ if(filterNode.getKind() != SqlKind.IS_NULL) {
Review Comment:
Since this is a quicker conditional check, it would be better to move this
up a bit before line 160 so we can avoid executing the contains() if possible.
##########
ql/src/test/queries/clientpositive/antijoin3.q:
##########
@@ -0,0 +1,39 @@
+SET hive.vectorized.execution.enabled=false;
+set hive.mapred.mode=nonstrict;
+SET hive.auto.convert.join=false;
+SET hive.auto.convert.anti.join=true;
+-- SORT_QUERY_RESULTS
+
+create table antijoin3_t1 (t1id int not null, t1notnull string not null,
t1nullable string);
+create table antijoin3_t2 (t2id int not null, t2notnull string not null,
t2nullable string);
+
+insert into antijoin3_t1 values
+(0, "val_0", null),
+(1, "val_1", null),
+(2, "val_2", "val_2"),
+(3, "val_3", "val_3"),
+(4, "val_4", "val_4");
+
+insert into antijoin3_t2 values
+(0, "val_0", null),
+(1, "val_1", null),
+(4, "val_4", "val_4"),
+(5, "val_5", "val_5");
+
+-- do not introduce anti-join if filtering a nullable column with IS NULL
Review Comment:
One more suggestion for a test case would where the right column is not
coming from a base table but from another Left Outer Join so it becomes
nullable even if it was nonnullable in the base table. This is just to check
if the nullable property is being properly propagated up the operators and if
so, this PR's changes would work fine. If for some reason it is not getting
propagated, it could uncover a hidden bug.
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAntiSemiJoinRule.java:
##########
@@ -135,43 +142,76 @@ private List<RexNode> getResidualFilterNodes(Filter
filter, Join join) {
List<RexNode> aboveFilters =
RelOptUtil.conjunctions(filter.getCondition());
boolean hasNullFilterOnRightSide = false;
List<RexNode> filterList = new ArrayList<>();
+ final ImmutableBitSet notNullColumnsFromRightSide =
getNotNullColumnsFromRightSide(join);
+
for (RexNode filterNode : aboveFilters) {
- if (filterNode.getKind() == SqlKind.IS_NULL) {
- // Null filter from right side table can be removed and its a
pre-condition for anti join conversion.
- if (HiveCalciteUtil.hasAllExpressionsFromRightSide(join,
Collections.singletonList(filterNode))
- && isStrong(((RexCall) filterNode).getOperands().get(0))) {
- hasNullFilterOnRightSide = true;
- } else {
- filterList.add(filterNode);
- }
- } else {
- if (HiveCalciteUtil.hasAnyExpressionFromRightSide(join,
Collections.singletonList(filterNode))) {
- // If some non null condition is present from right side, we can not
convert the join to anti join as
- // anti join does not project the fields from right side.
- return null;
- } else {
- filterList.add(filterNode);
- }
+ final ImmutableBitSet usedFields =
RelOptUtil.InputFinder.bits(filterNode);
+ boolean usesFieldFromRHS = usedFields.intersects(rhsFields);
+
+ if(!usesFieldFromRHS) {
+ // Only LHS fields or constants, so the filterNode is part of the
residual filter
+ filterList.add(filterNode);
+ continue;
+ }
+
+ // In the following we check for filter nodes that let us deduce that
+ // "an (originally) not-null column of RHS IS NULL because the LHS row
will not be matched"
+
+ boolean usesRHSFieldsOnly = rhsFields.contains(usedFields);
+ if (!usesRHSFieldsOnly) {
+ // If there is a mix between LHS and RHS fields, don't convert to
anti-join
+ return Optional.empty();
+ }
+
+ if(filterNode.getKind() != SqlKind.IS_NULL) {
+ return Optional.empty();
+ }
+
+ // Null filter from right side table can be removed and it is a
pre-condition for anti join conversion.
+ RexNode arg = ((RexCall) filterNode).getOperands().get(0);
+ if (isStrong(arg, notNullColumnsFromRightSide)) {
+ hasNullFilterOnRightSide = true;
+ } else if(!isStrong(arg, rhsFields)) {
+ // if all RHS fields are null and the IS NULL is still not fulfilled,
bail out
+ return Optional.empty();
}
}
if (!hasNullFilterOnRightSide) {
- return null;
+ return Optional.empty();
+ }
+ return Optional.of(filterList);
+ }
+
+ private ImmutableBitSet getNotNullColumnsFromRightSide(RelNode joinRel) {
+ // we need to shift the indices of the second child to the right
+ int shift = (joinRel.getInput(0)).getRowType().getFieldCount();
+
+ ImmutableBitSet.Builder builder = ImmutableBitSet.builder();
+ List<RelDataTypeField> rhsFields =
joinRel.getInput(1).getRowType().getFieldList();
+ for(RelDataTypeField field : rhsFields) {
+ if(!field.getType().isNullable()) {
+ builder.set(shift+field.getIndex());
+ }
}
- return filterList;
+ return builder.build();
}
- private boolean isStrong(RexNode rexNode) {
- AtomicBoolean hasCast = new AtomicBoolean(false);
Review Comment:
Previously, since AtomicBoolean was used, I am wondering there might have
been a good reason. Was there a concurrency issue that motivated the original
implementation. Changing it to a MutableBoolean means that this variable is
not thread safe.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]