This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 14656f5756 feat(small): Display `NullEquality` in join executor's
`EXPLAIN` output (#17664)
14656f5756 is described below
commit 14656f575632c39efca55c7339edaf9998ccd7d9
Author: Yongting You <[email protected]>
AuthorDate: Sat Sep 20 12:05:24 2025 +0800
feat(small): Display `NullEquality` in join executor's `EXPLAIN` output
(#17664)
* Clarify null-equal explain expectations
* Format null equality display strings
* fix test
* review: more concise message
* review: more concise message
---
.../physical_optimizer/projection_pushdown.rs | 2 +-
.../core/tests/physical_optimizer/test_utils.rs | 2 +-
.../physical-plan/src/joins/hash_join/exec.rs | 19 +++++++-
.../src/joins/sort_merge_join/exec.rs | 19 ++++++--
.../test_files/create_external_table.slt | 2 +-
.../test_files/join_is_not_distinct_from.slt | 53 +++++++++++++++++++---
datafusion/sqllogictest/test_files/union.slt | 8 ++--
7 files changed, 87 insertions(+), 18 deletions(-)
diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs
b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs
index 0a75d9f52e..c51a5e02c9 100644
--- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs
+++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs
@@ -1281,7 +1281,7 @@ fn test_hash_join_after_projection() -> Result<()> {
&JoinType::Inner,
None,
PartitionMode::Auto,
- NullEquality::NullEqualsNull,
+ NullEquality::NullEqualsNothing,
)?);
let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
vec![
diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs
b/datafusion/core/tests/physical_optimizer/test_utils.rs
index 7c9fb9de53..f1154c98d3 100644
--- a/datafusion/core/tests/physical_optimizer/test_utils.rs
+++ b/datafusion/core/tests/physical_optimizer/test_utils.rs
@@ -236,7 +236,7 @@ pub fn hash_join_exec(
join_type,
None,
PartitionMode::Partitioned,
- NullEquality::NullEqualsNull,
+ NullEquality::NullEqualsNothing,
)?))
}
diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs
b/datafusion/physical-plan/src/joins/hash_join/exec.rs
index c8ed196039..728497444c 100644
--- a/datafusion/physical-plan/src/joins/hash_join/exec.rs
+++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs
@@ -725,6 +725,12 @@ impl DisplayAs for HashJoinExec {
} else {
"".to_string()
};
+ let display_null_equality =
+ if matches!(self.null_equality(),
NullEquality::NullEqualsNull) {
+ ", NullsEqual: true"
+ } else {
+ ""
+ };
let on = self
.on
.iter()
@@ -733,8 +739,13 @@ impl DisplayAs for HashJoinExec {
.join(", ");
write!(
f,
- "HashJoinExec: mode={:?}, join_type={:?}, on=[{}]{}{}",
- self.mode, self.join_type, on, display_filter,
display_projections,
+ "HashJoinExec: mode={:?}, join_type={:?}, on=[{}]{}{}{}",
+ self.mode,
+ self.join_type,
+ on,
+ display_filter,
+ display_projections,
+ display_null_equality,
)
}
DisplayFormatType::TreeRender => {
@@ -753,6 +764,10 @@ impl DisplayAs for HashJoinExec {
writeln!(f, "on={on}")?;
+ if matches!(self.null_equality(),
NullEquality::NullEqualsNull) {
+ writeln!(f, "NullsEqual: true")?;
+ }
+
if let Some(filter) = self.filter.as_ref() {
writeln!(f, "filter={filter}")?;
}
diff --git a/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs
b/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs
index 3ee8bf5260..8330ec0900 100644
--- a/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs
+++ b/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs
@@ -351,15 +351,22 @@ impl DisplayAs for SortMergeJoinExec {
.map(|(c1, c2)| format!("({c1}, {c2})"))
.collect::<Vec<String>>()
.join(", ");
+ let display_null_equality =
+ if matches!(self.null_equality(),
NullEquality::NullEqualsNull) {
+ ", NullsEqual: true"
+ } else {
+ ""
+ };
write!(
f,
- "SortMergeJoin: join_type={:?}, on=[{}]{}",
+ "SortMergeJoin: join_type={:?}, on=[{}]{}{}",
self.join_type,
on,
self.filter.as_ref().map_or("".to_string(), |f| format!(
", filter={}",
f.expression()
- ))
+ )),
+ display_null_equality,
)
}
DisplayFormatType::TreeRender => {
@@ -375,7 +382,13 @@ impl DisplayAs for SortMergeJoinExec {
if self.join_type() != JoinType::Inner {
writeln!(f, "join_type={:?}", self.join_type)?;
}
- writeln!(f, "on={on}")
+ writeln!(f, "on={on}")?;
+
+ if matches!(self.null_equality(),
NullEquality::NullEqualsNull) {
+ writeln!(f, "NullsEqual: true")?;
+ }
+
+ Ok(())
}
}
}
diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt
b/datafusion/sqllogictest/test_files/create_external_table.slt
index 4a803c981a..1e6183f48b 100644
--- a/datafusion/sqllogictest/test_files/create_external_table.slt
+++ b/datafusion/sqllogictest/test_files/create_external_table.slt
@@ -302,4 +302,4 @@ CREATE EXTERNAL TABLE release.bar STORED AS parquet
LOCATION '../../parquet-test
statement error DataFusion error: SQL error: ParserError\("'IF NOT EXISTS'
cannot coexist with 'REPLACE'"\)
CREATE OR REPLACE EXTERNAL TABLE IF NOT EXISTS t_conflict(c1 int)
STORED AS CSV
-LOCATION 'foo.csv';
\ No newline at end of file
+LOCATION 'foo.csv';
diff --git a/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt
b/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt
index fa8b9950ad..0336cfc2d3 100644
--- a/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt
+++ b/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt
@@ -81,10 +81,51 @@ logical_plan
physical_plan
01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as
val]
02)--CoalesceBatchesExec: target_batch_size=8192
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@1, val@1)]
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@1, val@1)],
NullsEqual: true
04)------DataSourceExec: partitions=1, partition_sizes=[1]
05)------DataSourceExec: partitions=1, partition_sizes=[1]
+statement ok
+set datafusion.explain.format = "tree";
+
+# Tree explain should highlight null equality semantics
+query TT
+EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
+FROM t1
+JOIN t2 ON t1.val IS NOT DISTINCT FROM t2.val
+----
+physical_plan
+01)┌───────────────────────────┐
+02)│ ProjectionExec │
+03)│ -------------------- │
+04)│ t1_id: id │
+05)│ t2_id: id │
+06)│ val: val │
+07)└─────────────┬─────────────┘
+08)┌─────────────┴─────────────┐
+09)│ CoalesceBatchesExec │
+10)│ -------------------- │
+11)│ target_batch_size: │
+12)│ 8192 │
+13)└─────────────┬─────────────┘
+14)┌─────────────┴─────────────┐
+15)│ HashJoinExec │
+16)│ -------------------- │
+17)│ NullsEqual: true ├──────────────┐
+18)│ │ │
+19)│ on: (val = val) │ │
+20)└─────────────┬─────────────┘ │
+21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+22)│ DataSourceExec ││ DataSourceExec │
+23)│ -------------------- ││ -------------------- │
+24)│ bytes: 288 ││ bytes: 288 │
+25)│ format: memory ││ format: memory │
+26)│ rows: 1 ││ rows: 1 │
+27)└───────────────────────────┘└───────────────────────────┘
+
+statement ok
+set datafusion.explain.format = "indent";
+
# For nested expression comparision, it should still able to be converted to
Hash Join
query IIII rowsort
SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val
@@ -108,7 +149,7 @@ logical_plan
physical_plan
01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as
val]
02)--CoalesceBatchesExec: target_batch_size=8192
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1.val +
Int64(1)@2, t2.val + Int64(1)@2)], projection=[id@0, val@1, id@3, val@4]
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1.val +
Int64(1)@2, t2.val + Int64(1)@2)], projection=[id@0, val@1, id@3, val@4],
NullsEqual: true
04)------CoalescePartitionsExec
05)--------ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS
Int64) + 1 as t1.val + Int64(1)]
06)----------RepartitionExec: partitioning=RoundRobinBatch(4),
input_partitions=1
@@ -139,7 +180,7 @@ logical_plan
physical_plan
01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as
val]
02)--CoalesceBatchesExec: target_batch_size=8192
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1.val +
Int64(1)@2, t2.val + Int64(1)@2)], filter=CAST(val@0 AS Int64) % 3 IS DISTINCT
FROM CAST(val@1 AS Int64) % 3, projection=[id@0, val@1, id@3, val@4]
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(t1.val +
Int64(1)@2, t2.val + Int64(1)@2)], filter=CAST(val@0 AS Int64) % 3 IS DISTINCT
FROM CAST(val@1 AS Int64) % 3, projection=[id@0, val@1, id@3, val@4],
NullsEqual: true
04)------ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS Int64)
+ 1 as t1.val + Int64(1)]
05)--------DataSourceExec: partitions=1, partition_sizes=[1]
06)------ProjectionExec: expr=[id@0 as id, val@1 as val, CAST(val@1 AS Int64)
+ 1 as t2.val + Int64(1)]
@@ -201,11 +242,11 @@ logical_plan
physical_plan
01)ProjectionExec: expr=[id@0 as t1_id, id@2 as t2_id, val@1 as val, val@3 as
val]
02)--CoalesceBatchesExec: target_batch_size=8192
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@0, val@1)],
projection=[id@1, val@2, id@3, val@4]
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@0, val@1)],
projection=[id@1, val@2, id@3, val@4], NullsEqual: true
04)------DataSourceExec: partitions=1, partition_sizes=[1]
05)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
06)--------CoalesceBatchesExec: target_batch_size=8192
-07)----------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@1,
val@1)]
+07)----------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val@1,
val@1)], NullsEqual: true
08)------------DataSourceExec: partitions=1, partition_sizes=[1]
09)------------DataSourceExec: partitions=1, partition_sizes=[1]
@@ -246,7 +287,7 @@ JOIN t4 ON (t3.val1 IS NOT DISTINCT FROM t4.val1) AND
(t3.val2 IS NOT DISTINCT F
01)ProjectionExec: expr=[id@0 as t3_id, id@3 as t4_id, val1@1 as val1, val1@4
as val1, val2@2 as val2, val2@5 as val2]
02)--CoalesceBatchesExec: target_batch_size=8192
02)--Inner Join: t3.val1 = t4.val1, t3.val2 = t4.val2
-03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val1@1, val1@1),
(val2@2, val2@2)]
+03)----HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(val1@1, val1@1),
(val2@2, val2@2)], NullsEqual: true
03)----TableScan: t3 projection=[id, val1, val2]
04)------DataSourceExec: partitions=1, partition_sizes=[1]
04)----TableScan: t4 projection=[id, val1, val2]
diff --git a/datafusion/sqllogictest/test_files/union.slt
b/datafusion/sqllogictest/test_files/union.slt
index 996ba0d70a..1f7605d220 100644
--- a/datafusion/sqllogictest/test_files/union.slt
+++ b/datafusion/sqllogictest/test_files/union.slt
@@ -308,7 +308,7 @@ logical_plan
physical_plan
01)UnionExec
02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(id@0,
CAST(t2.id AS Int32)@2), (name@1, name@1)]
+03)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(id@0,
CAST(t2.id AS Int32)@2), (name@1, name@1)], NullsEqual: true
04)------CoalescePartitionsExec
05)--------AggregateExec: mode=FinalPartitioned, gby=[id@0 as id, name@1 as
name], aggr=[]
06)----------CoalesceBatchesExec: target_batch_size=2
@@ -321,7 +321,7 @@ physical_plan
13)----------DataSourceExec: partitions=1, partition_sizes=[1]
14)--ProjectionExec: expr=[CAST(id@0 AS Int32) as id, name@1 as name]
15)----CoalesceBatchesExec: target_batch_size=2
-16)------HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(CAST(t2.id
AS Int32)@2, id@0), (name@1, name@1)], projection=[id@0, name@1]
+16)------HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(CAST(t2.id
AS Int32)@2, id@0), (name@1, name@1)], projection=[id@0, name@1], NullsEqual:
true
17)--------CoalescePartitionsExec
18)----------ProjectionExec: expr=[id@0 as id, name@1 as name, CAST(id@0 AS
Int32) as CAST(t2.id AS Int32)]
19)------------AggregateExec: mode=FinalPartitioned, gby=[id@0 as id, name@1
as name], aggr=[]
@@ -378,7 +378,7 @@ logical_plan
physical_plan
01)UnionExec
02)--CoalesceBatchesExec: target_batch_size=2
-03)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0,
name@0)]
+03)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0,
name@0)], NullsEqual: true
04)------CoalescePartitionsExec
05)--------AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[]
06)----------CoalesceBatchesExec: target_batch_size=2
@@ -389,7 +389,7 @@ physical_plan
11)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
12)--------DataSourceExec: partitions=1, partition_sizes=[1]
13)--CoalesceBatchesExec: target_batch_size=2
-14)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0,
name@0)]
+14)----HashJoinExec: mode=CollectLeft, join_type=LeftAnti, on=[(name@0,
name@0)], NullsEqual: true
15)------CoalescePartitionsExec
16)--------AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[]
17)----------CoalesceBatchesExec: target_batch_size=2
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]