nathanb9 commented on code in PR #22810:
URL: https://github.com/apache/datafusion/pull/22810#discussion_r3478519759


##########
datafusion/sqllogictest/test_files/subquery.slt:
##########
@@ -2437,3 +2437,45 @@ DROP TABLE sq_count_customer;
 
 statement ok
 DROP TABLE sq_count_orders;
+
+# Regression test: `NOT IN` is a null-aware anti join. When the subquery 
yields a
+# NULL the predicate is never TRUE, so the query must return zero rows. This 
must
+# hold regardless of the chosen physical join operator. Previously, with
+# prefer_hash_join = false and multiple partitions, the planner routed the
+# null-aware anti join to SortMergeJoin (which is not null-aware) and returned
+# wrong results; null-aware anti joins must use the CollectLeft HashJoin.
+statement ok
+set datafusion.optimizer.prefer_hash_join = false;
+
+statement ok
+CREATE TABLE nia_left(x INT) AS VALUES (1), (2), (3), (4);
+
+statement ok
+CREATE TABLE nia_right_with_null(y INT) AS VALUES (2), (NULL);
+
+statement ok
+CREATE TABLE nia_right_no_null(y INT) AS VALUES (2), (4);
+
+# Subquery contains a NULL -> NOT IN must return no rows.
+query I
+SELECT x FROM nia_left WHERE x NOT IN (SELECT y FROM nia_right_with_null) 
ORDER BY x;
+----
+
+# Subquery has no NULL -> NOT IN behaves like a normal anti join.

Review Comment:
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to