Github user gatorsmile commented on a diff in the pull request:
https://github.com/apache/spark/pull/21425#discussion_r190694156
--- Diff: sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
---
@@ -275,6 +277,135 @@ class SubquerySuite extends QueryTest with
SharedSQLContext {
}
+ // ``col NOT IN expr'' is quite difficult to reason about. There are
many edge cases, some of the
+ // rules are not very intuitive, and precedence and treatment of null
values is somewhat
+ // unintuitive. To make this simpler to understand, I've come up with a
plain English way of
+ // describing the expected behavior of this query.
+ //
+ // - If the subquery is empty (i.e. returns no rows), the row should be
returned, regardless of
+ // whether the filtered columns include nulls.
+ // - If the subquery contains a result with all nulls, then the row
should not be returned.
+ // - If for all non-null filter columns there exists a row in the
subquery in which each column
+ // either
+ // 1. is equal to the corresponding filter column or
+ // 2. is null
+ // then the row should not be returned. (This includes the case where
all filter columns are
+ // null.)
+ // - Otherwise, the row should be returned.
+ //
+ // Using these rules, we can come up with a set of test cases for
single-column and multi-column
+ // NOT IN test cases.
+ test("NOT IN single column with nulls predicate subquery") {
+ // Test cases for single-column ``WHERE a NOT IN (SELECT c FROM r
...)'':
+ // | # | does subquery include null? | is a null? | a = c? | row with
a included in result? |
+ // | 1 | empty | | | yes
|
+ // | 2 | yes | | | no
|
+ // | 3 | no | yes | | no
|
+ // | 4 | no | no | yes | no
|
+ // | 5 | no | no | no | yes
|
+ Seq(row((null, 5.0)), row((3, 3.0))).toDF("a",
"b").createOrReplaceTempView("m")
+ Seq(row((2, 3.0)), row((2, 3.0)), row((null, 5.0))).toDF("c",
"d").createOrReplaceTempView("s")
+
+ // Single-column test cases
+ val subqueryIsEmpty = "d > 6.0"
+ val cIncludesNull = "d = 5.0"
+ val cDoesNotMatchA = "d = 3.0"
+ val cMatchesA = "d = 5.0"
+ val aIsNull = "b = 5.0"
+ val aIsNotNull = "b = 3.0"
+
+ val includesNullRow = Row(null, 5.0) :: Nil
+ val includesNotNullRow = Row(3, 3.0) :: Nil
+ val doesNotIncludeRow = Nil
--- End diff --
-> `returnNullRow`, `returnNotNullRow`, `returnEmpty`
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]