NGA-TRAN commented on code in PR #17319:
URL: https://github.com/apache/datafusion/pull/17319#discussion_r2354128075
##########
datafusion/optimizer/src/extract_equijoin_predicate.rs:
##########
@@ -82,6 +82,45 @@ impl OptimizerRule for ExtractEquijoinPredicate {
let (equijoin_predicates, non_equijoin_expr) =
split_eq_and_noneq_join_predicate(expr, left_schema,
right_schema)?;
+ // Equi-join operators like HashJoin support a special behavior
+ // that evaluates `NULL = NULL` as true instead of NULL.
Therefore,
+ // we transform `t1.c1 IS NOT DISTINCT FROM t2.c1` into an
equi-join
+ // and set the `NullEquality` configuration in the join
operator.
+ // This allows certain queries to use Hash Join instead of
+ // Nested Loop Join, resulting in better performance.
Review Comment:
Nice comment
##########
datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt:
##########
@@ -0,0 +1,249 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Test IS NOT DISTINCT FROM join functionality
+# This tests the optimizer's ability to convert IS NOT DISTINCT FROM joins
+# to equijoins with proper null equality handling
+
+statement ok
+CREATE TABLE t0 (
+ id INT,
+ val INT
+)
+
+statement ok
+CREATE TABLE t1 (
+ id INT,
+ val INT
+)
+
+statement ok
+CREATE TABLE t2 (
+ id INT,
+ val INT
+)
+
+statement ok
+INSERT INTO t0 VALUES
+(1, 10),
+(2, NULL),
+(5, 50)
+
+statement ok
+INSERT INTO t1 VALUES
+(1, 10),
+(2, NULL),
+(3, 30)
+
+statement ok
+INSERT INTO t2 VALUES
+(1, 10),
+(2, NULL),
+(4, 40)
+
+# Test basic IS NOT DISTINCT FROM join functionality
+query IIII rowsort
+SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val
+----
+1 1 10 10
+2 2 NULL NULL
+
+# Test that IS NOT DISTINCT FROM join produces HashJoin when used alone
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val
+----
+logical_plan
+01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+02)--Inner Join: t1.val = t2.val
+03)----TableScan: t1 projection=[id, val]
+04)----TableScan: t2 projection=[id, val]
+physical_plan
+01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as
val]
+02)--CoalesceBatchesExec: target_batch_size=8192
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@1, val@1)]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+05)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+# For nested expression comparision, it should still able to be converted to
Hash Join
+query IIII rowsort
+SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON ((t1.val+1) IS NOT DISTINCT FROM (t2.val+1)) AND ((t1.val + 1) IS
NOT DISTINCT FROM 11);
+----
+1 1 10 10
+
+# The plan should includ HashJoin
Review Comment:
```suggestion
# The plan should include HashJoin
```
##########
datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt:
##########
@@ -0,0 +1,249 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Test IS NOT DISTINCT FROM join functionality
+# This tests the optimizer's ability to convert IS NOT DISTINCT FROM joins
+# to equijoins with proper null equality handling
+
+statement ok
+CREATE TABLE t0 (
+ id INT,
+ val INT
+)
+
+statement ok
+CREATE TABLE t1 (
+ id INT,
+ val INT
+)
+
+statement ok
+CREATE TABLE t2 (
+ id INT,
+ val INT
+)
+
+statement ok
+INSERT INTO t0 VALUES
+(1, 10),
+(2, NULL),
+(5, 50)
+
+statement ok
+INSERT INTO t1 VALUES
+(1, 10),
+(2, NULL),
+(3, 30)
+
+statement ok
+INSERT INTO t2 VALUES
+(1, 10),
+(2, NULL),
+(4, 40)
Review Comment:
Can you add these tests:
1. One is NULL but the other one is NOT NULL. If you add `(6, NULL)` to `t1`
and `(6, 6)` to `t2`, you will be able to verify that with your current SQL
below
2. Mixed condition `IS NOT DISTINCT` and `IS DISTINCT`
##########
datafusion/optimizer/src/extract_equijoin_predicate.rs:
##########
@@ -82,6 +82,45 @@ impl OptimizerRule for ExtractEquijoinPredicate {
let (equijoin_predicates, non_equijoin_expr) =
split_eq_and_noneq_join_predicate(expr, left_schema,
right_schema)?;
+ // Equi-join operators like HashJoin support a special behavior
+ // that evaluates `NULL = NULL` as true instead of NULL.
Therefore,
+ // we transform `t1.c1 IS NOT DISTINCT FROM t2.c1` into an
equi-join
+ // and set the `NullEquality` configuration in the join
operator.
+ // This allows certain queries to use Hash Join instead of
+ // Nested Loop Join, resulting in better performance.
+ //
+ // Only convert when there are NO equijoin predicates, to be
conservative.
+ if on.is_empty()
+ && equijoin_predicates.is_empty()
+ && non_equijoin_expr.is_some()
+ {
+ // SAFETY: checked in the outer `if`
+ let expr = non_equijoin_expr.clone().unwrap();
+ let (equijoin_predicates, non_equijoin_expr) =
+ split_is_not_distinct_from_and_other_join_predicate(
+ expr,
+ left_schema,
+ right_schema,
+ )?;
+
+ if !equijoin_predicates.is_empty() {
+ on.extend(equijoin_predicates);
+
+ return Ok(Transformed::yes(LogicalPlan::Join(Join {
+ left,
+ right,
+ on,
+ filter: non_equijoin_expr,
+ join_type,
+ join_constraint,
+ schema,
+ // According to `is not distinct from`'s
semantics, it's
+ // safe to override it
+ null_equality: NullEquality::NullEqualsNull,
Review Comment:
👍
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]