[spark] branch master updated: [SPARK-25154][SQL] Support NOT IN sub-queries inside nested OR conditions

yamamuro Fri, 10 Apr 2020 16:31:26 -0700

This is an automated email from the ASF dual-hosted git repository.

yamamuro pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new f0e2fc3  [SPARK-25154][SQL] Support NOT IN sub-queries inside nested 
OR conditions
f0e2fc3 is described below

commit f0e2fc37d1dc2a85fd08c87add5106bb51305182
Author: Dilip Biswal <dkbis...@gmail.com>
AuthorDate: Sat Apr 11 08:28:11 2020 +0900

    [SPARK-25154][SQL] Support NOT IN sub-queries inside nested OR conditions
    
    ### What changes were proposed in this pull request?
    
    Currently NOT IN subqueries (predicated null aware subquery) are not 
allowed inside OR expressions. We currently catch this condition in 
checkAnalysis and throw an error.
    
    This PR enhances the subquery rewrite to support this type of queries.
    
    Query
    ```SQL
    SELECT * FROM s1 WHERE a > 5 or b NOT IN (SELECT c FROM s2);
    ```
    Optimized Plan
    ```SQL
    == Optimized Logical Plan ==
    Project [a#3, b#4]
    +- Filter ((a#3 > 5) || NOT exists#7)
       +- Join ExistenceJoin(exists#7), ((b#4 = c#5) || isnull((b#4 = c#5)))
          :- HiveTableRelation `default`.`s1`, 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#3, b#4]
          +- Project [c#5]
             +- HiveTableRelation `default`.`s2`, 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [c#5, d#6]
    ```
    This is rework from #22141.
    The original author of this PR is dilipbiswal.
    
    Closes #22141
    
    ### Why are the changes needed?
    
    For better usability.
    
    ### Does this PR introduce any user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Added new tests in SQLQueryTestSuite, RewriteSubquerySuite and 
SubquerySuite.
    Output from DB2 as a reference:
    
[nested-not-db2.txt](https://github.com/apache/spark/files/2299945/nested-not-db2.txt)
    
    Closes #28158 from maropu/pr22141.
    
    Lead-authored-by: Dilip Biswal <dkbis...@gmail.com>
    Co-authored-by: Takeshi Yamamuro <yamam...@apache.org>
    Co-authored-by: Dilip Biswal <dbis...@us.ibm.com>
    Signed-off-by: Takeshi Yamamuro <yamam...@apache.org>
---
 .../sql/catalyst/analysis/CheckAnalysis.scala      |   4 -
 .../spark/sql/catalyst/expressions/subquery.scala  |  18 --
 .../spark/sql/catalyst/optimizer/subquery.scala    |  23 +-
 .../sql/catalyst/analysis/AnalysisErrorSuite.scala |  15 -
 .../catalyst/optimizer/RewriteSubquerySuite.scala  |  19 +-
 .../apache/spark/sql/catalyst/plans/PlanTest.scala |   9 +-
 .../inputs/subquery/in-subquery/nested-not-in.sql  | 198 ++++++++++++
 .../subquery/in-subquery/nested-not-in.sql.out     | 332 +++++++++++++++++++++
 .../scala/org/apache/spark/sql/SubquerySuite.scala |   6 +-
 9 files changed, 580 insertions(+), 44 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 066dc6d..9e325d0 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -200,10 +200,6 @@ trait CheckAnalysis extends PredicateHelper {
               s"filter expression '${f.condition.sql}' " +
                 s"of type ${f.condition.dataType.catalogString} is not a 
boolean.")
 
-          case Filter(condition, _) if 
hasNullAwarePredicateWithinNot(condition) =>
-            failAnalysis("Null-aware predicate sub-queries cannot be used in 
nested " +
-              s"conditions: $condition")
-
           case j @ Join(_, _, _, Some(condition), _) if condition.dataType != 
BooleanType =>
             failAnalysis(
               s"join condition '${condition.sql}' " +
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala
index e33cff2..f46a1c6 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala
@@ -106,24 +106,6 @@ object SubExprUtils extends PredicateHelper {
   }
 
   /**
-   * Returns whether there are any null-aware predicate subqueries inside Not. 
If not, we could
-   * turn the null-aware predicate into not-null-aware predicate.
-   */
-  def hasNullAwarePredicateWithinNot(condition: Expression): Boolean = {
-    splitConjunctivePredicates(condition).exists {
-      case _: Exists | Not(_: Exists) => false
-      case _: InSubquery | Not(_: InSubquery) => false
-      case e => e.find { x =>
-        x.isInstanceOf[Not] && e.find {
-          case _: InSubquery => true
-          case _ => false
-        }.isDefined
-      }.isDefined
-    }
-
-  }
-
-  /**
    * Returns an expression after removing the OuterReference shell.
    */
   def stripOuterReference(e: Expression): Expression = e.transform { case 
OuterReference(r) => r }
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala
index b697462..6fdd211 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala
@@ -162,12 +162,33 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] 
with PredicateHelper {
       plan: LogicalPlan): (Option[Expression], LogicalPlan) = {
     var newPlan = plan
     val newExprs = exprs.map { e =>
-      e transformUp {
+      e transformDown {
         case Exists(sub, conditions, _) =>
           val exists = AttributeReference("exists", BooleanType, nullable = 
false)()
           newPlan =
             buildJoin(newPlan, sub, ExistenceJoin(exists), 
conditions.reduceLeftOption(And))
           exists
+        case Not(InSubquery(values, ListQuery(sub, conditions, _, _))) =>
+          val exists = AttributeReference("exists", BooleanType, nullable = 
false)()
+          // Deduplicate conflicting attributes if any.
+          val newSub = dedupSubqueryOnSelfJoin(newPlan, sub, Some(values))
+          val inConditions = values.zip(sub.output).map(EqualTo.tupled)
+          // To handle a null-aware predicate not-in-subquery in nested 
conditions
+          // (e.g., `v > 0 OR t1.id NOT IN (SELECT id FROM t2)`), we transform
+          // `inConditon` (t1.id=t2.id) into `(inCondition) OR 
ISNULL(inCondition)`.
+          //
+          // For example, `SELECT * FROM t1 WHERE v > 0 OR t1.id NOT IN 
(SELECT id FROM t2)`
+          // is transformed into a plan below;
+          // == Optimized Logical Plan ==
+          // Project [id#78, v#79]
+          // +- Filter ((v#79 > 0) OR NOT exists#83)
+          //   +- Join ExistenceJoin(exists#83), ((id#78 = id#80) OR 
isnull((id#78 = id#80)))
+          //     :- Relation[id#78,v#79] parquet
+          //     +- Relation[id#80] parquet
+          val nullAwareJoinConds = inConditions.map(c => Or(c, IsNull(c)))
+          val finalJoinCond = (nullAwareJoinConds ++ 
conditions).reduceLeft(And)
+          newPlan = Join(newPlan, newSub, ExistenceJoin(exists), 
Some(finalJoinCond), JoinHint.NONE)
+          Not(exists)
         case InSubquery(values, ListQuery(sub, conditions, _, _)) =>
           val exists = AttributeReference("exists", BooleanType, nullable = 
false)()
           // Deduplicate conflicting attributes if any.
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
index 09e0d9c..166ffec 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
@@ -614,21 +614,6 @@ class AnalysisErrorSuite extends AnalysisTest {
         " in Filter" :: Nil)
   }
 
-  test("PredicateSubQuery is used is a nested condition") {
-    val a = AttributeReference("a", IntegerType)()
-    val b = AttributeReference("b", IntegerType)()
-    val c = AttributeReference("c", BooleanType)()
-    val plan1 = Filter(Cast(Not(InSubquery(Seq(a), 
ListQuery(LocalRelation(b)))), BooleanType),
-      LocalRelation(a))
-    assertAnalysisError(plan1,
-      "Null-aware predicate sub-queries cannot be used in nested conditions" 
:: Nil)
-
-    val plan2 = Filter(
-      Or(Not(InSubquery(Seq(a), ListQuery(LocalRelation(b)))), c), 
LocalRelation(a, c))
-    assertAnalysisError(plan2,
-      "Null-aware predicate sub-queries cannot be used in nested conditions" 
:: Nil)
-  }
-
   test("PredicateSubQuery correlated predicate is nested in an illegal plan") {
     val a = AttributeReference("a", IntegerType)()
     val b = AttributeReference("b", IntegerType)()
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala
index f00d22e..0608ded 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala
@@ -19,8 +19,8 @@ package org.apache.spark.sql.catalyst.optimizer
 
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
-import org.apache.spark.sql.catalyst.expressions.ListQuery
-import org.apache.spark.sql.catalyst.plans.{LeftSemi, PlanTest}
+import org.apache.spark.sql.catalyst.expressions.{IsNull, ListQuery, Not}
+import org.apache.spark.sql.catalyst.plans.{ExistenceJoin, LeftSemi, PlanTest}
 import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
 
@@ -52,4 +52,19 @@ class RewriteSubquerySuite extends PlanTest {
     comparePlans(optimized, correctAnswer)
   }
 
+  test("NOT-IN subquery nested inside OR") {
+    val relation1 = LocalRelation('a.int, 'b.int)
+    val relation2 = LocalRelation('c.int, 'd.int)
+    val exists = 'exists.boolean.notNull
+
+    val query = relation1.where('b === 1 || 
Not('a.in(ListQuery(relation2.select('c))))).select('a)
+    val correctAnswer = relation1
+      .join(relation2.select('c), ExistenceJoin(exists), Some('a === 'c || 
IsNull('a === 'c)))
+      .where('b === 1 || Not(exists))
+      .select('a)
+      .analyze
+    val optimized = Optimize.execute(query.analyze)
+
+    comparePlans(optimized, correctAnswer)
+  }
 }
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala
index b28e6de..9600357 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala
@@ -109,10 +109,17 @@ trait PlanTestBase extends PredicateHelper with SQLHelper 
{ self: Suite =>
       case sample: Sample =>
         sample.copy(seed = 0L)
       case Join(left, right, joinType, condition, hint) if condition.isDefined 
=>
+        val newJoinType = joinType match {
+          case ExistenceJoin(a: Attribute) =>
+            val newAttr = AttributeReference(a.name, a.dataType, 
a.nullable)(exprId = ExprId(0))
+            ExistenceJoin(newAttr)
+          case other => other
+        }
+
         val newCondition =
           
splitConjunctivePredicates(condition.get).map(rewriteEqual).sortBy(_.hashCode())
             .reduce(And)
-        Join(left, right, joinType, Some(newCondition), hint)
+        Join(left, right, newJoinType, Some(newCondition), hint)
     }
   }
 
diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/nested-not-in.sql
 
b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/nested-not-in.sql
new file mode 100644
index 0000000..2f6835b
--- /dev/null
+++ 
b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/nested-not-in.sql
@@ -0,0 +1,198 @@
+-- Tests NOT-IN subqueries nested inside OR expression(s).
+
+CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES
+  (100, "emp 1", 10),
+  (200, "emp 2", NULL),
+  (300, "emp 3", 20),
+  (400, "emp 4", 30),
+  (500, "emp 5", NULL),
+  (600, "emp 6", 100),
+  (800, "emp 8", 70)
+AS EMP(id, emp_name, dept_id);
+
+CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES
+  (10, "dept 1", "CA"),
+  (20, "dept 2", "NY"),
+  (30, "dept 3", "TX"),
+  (40, "dept 4 - unassigned", "OR"),
+  (50, "dept 5 - unassigned", "NJ"),
+  (70, "dept 7", "FL")
+AS DEPT(dept_id, dept_name, state);
+
+CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES
+  ("emp 1", 10.00D),
+  ("emp 1", 20.00D),
+  ("emp 2", 300.00D),
+  ("emp 2", 100.00D),
+  ("emp 3", 300.00D),
+  ("emp 4", 100.00D),
+  ("emp 5", 1000.00D),
+  ("emp 6 - no dept", 500.00D)
+AS BONUS(emp_name, bonus_amt);
+
+CREATE TEMPORARY VIEW ADDRESS AS SELECT * FROM VALUES
+  (100, "emp 1", "addr1"),
+  (200, null, "addr2"),
+  (null, "emp 3", "addr3"),
+  (null, null, "addr4"),
+  (600, "emp 6", "addr6"),
+  (800, "emp 8", "addr8")
+AS ADDRESS(id, emp_name, address);
+
+CREATE TEMPORARY VIEW S1 AS SELECT * FROM VALUES
+  (null, null), (5, 5), (8, 8), (11, 11) AS s1(a, b);
+CREATE TEMPORARY VIEW S2 AS SELECT * FROM VALUES
+  (7, 7), (8, 8), (11, 11), (null, null) AS s2(c, d);
+
+-- null produced from both sides.
+-- TC.01.01
+SELECT id, 
+       dept_id 
+FROM   emp 
+WHERE  id = 600
+       OR id = 500 
+       OR dept_id NOT IN (SELECT dept_id 
+                          FROM   emp);
+
+-- null produced from right side
+-- TC.01.02
+SELECT id, 
+       dept_id 
+FROM   emp 
+WHERE  id = 800 
+       OR (dept_id IS NOT NULL 
+           AND dept_id NOT IN (SELECT dept_id 
+                                FROM   emp));
+
+-- null produced on left side
+-- TC.01.03
+SELECT id,
+       dept_id
+FROM   emp
+WHERE  id = 100
+       OR dept_id NOT IN (SELECT dept_id
+                           FROM   emp
+                           WHERE dept_id IS NOT NULL);
+
+-- no null in both left and right
+-- TC.01.04
+SELECT id, 
+       dept_id 
+FROM   emp 
+WHERE  id = 200 
+       OR (dept_id IS NOT NULL        
+       AND dept_id + 100 NOT IN (SELECT dept_id 
+                           FROM   emp
+                           WHERE dept_id IS NOT NULL));
+
+-- complex nesting
+-- TC.01.05
+SELECT id, 
+       dept_id, 
+       emp_name 
+FROM   emp 
+WHERE  emp_name IN (SELECT emp_name 
+                    FROM   bonus) 
+        OR (dept_id IS NOT NULL 
+            AND dept_id NOT IN (SELECT dept_id 
+                                FROM   dept));
+
+-- complex nesting, exists in disjunction with not-in
+-- TC.01.06
+SELECT id, 
+       dept_id, 
+       emp_name 
+FROM   emp 
+WHERE  EXISTS (SELECT emp_name 
+               FROM   bonus 
+               WHERE  emp.emp_name = bonus.emp_name) 
+       OR (dept_id IS NOT NULL 
+           AND dept_id NOT IN (SELECT dept_id 
+                               FROM   dept));
+
+-- multiple columns in not-in
+-- TC.01.07
+SELECT id,
+       dept_id,
+       emp_name
+FROM   emp
+WHERE  dept_id = 10
+OR (id, emp_name) NOT IN (SELECT id, emp_name FROM address);
+
+-- multiple columns in not-in
+-- TC.01.08
+SELECT id, 
+       dept_id, 
+       emp_name 
+FROM   emp 
+WHERE  dept_id = 10 
+        OR (( id, emp_name ) NOT IN (SELECT id, 
+                                             emp_name 
+                                      FROM   address 
+                                      WHERE  id IS NOT NULL 
+                                             AND emp_name IS NOT NULL) 
+             AND id > 400 );
+-- correlated not-in along with disjunction
+-- TC.01.09
+SELECT id, 
+       dept_id, 
+       emp_name 
+FROM   emp 
+WHERE  dept_id = 10 
+       OR emp_name NOT IN (SELECT emp_name 
+                                  FROM   address 
+                                  WHERE  id IS NOT NULL 
+                                  AND emp_name IS NOT NULL
+                                  AND emp.id = address.id);
+
+-- multiple not-in(s) in side disjunction`
+-- TC.01.10
+SELECT id, 
+       dept_id, 
+       emp_name 
+FROM   emp 
+WHERE  id NOT IN (SELECT id 
+                         FROM   address 
+                         WHERE  id IS NOT NULL 
+                         AND emp_name IS NOT NULL
+                         AND id >= 400)
+       OR emp_name NOT IN (SELECT emp_name 
+                                  FROM   address 
+                                  WHERE  id IS NOT NULL 
+                                  AND emp_name IS NOT NULL
+                                  AND emp.id = address.id
+                                  AND id < 400);
+
+-- NOT (NOT IN (SUBQ))
+SELECT * 
+FROM   s1 
+WHERE  NOT (a NOT IN (SELECT c 
+                      FROM   s2));
+
+-- NOT (OR (expression, IN-SUBQ)) 
+SELECT * 
+FROM   s1 
+WHERE  NOT (a > 5 
+            OR a IN (SELECT c 
+                     FROM   s2));
+
+-- NOT (OR (expression, NOT-IN-SUB)
+SELECT * 
+FROM   s1 
+WHERE  NOT (a > 5 
+            OR a NOT IN (SELECT c 
+                         FROM   s2));
+
+-- NOT (AND (expression, IN-SUB))
+SELECT * 
+FROM   s1 
+WHERE  NOT (a > 5 
+            AND a IN (SELECT c 
+                      FROM   s2));
+ 
+-- NOT (AND (expression, NOT-IN-SUBQ))
+SELECT * 
+FROM   s1 
+WHERE  NOT (a > 5 
+            AND a NOT IN (SELECT c 
+                          FROM   s2));
diff --git 
a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/nested-not-in.sql.out
 
b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/nested-not-in.sql.out
new file mode 100644
index 0000000..86f967e
--- /dev/null
+++ 
b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/nested-not-in.sql.out
@@ -0,0 +1,332 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 21
+
+
+-- !query
+CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES
+  (100, "emp 1", 10),
+  (200, "emp 2", NULL),
+  (300, "emp 3", 20),
+  (400, "emp 4", 30),
+  (500, "emp 5", NULL),
+  (600, "emp 6", 100),
+  (800, "emp 8", 70)
+AS EMP(id, emp_name, dept_id)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES
+  (10, "dept 1", "CA"),
+  (20, "dept 2", "NY"),
+  (30, "dept 3", "TX"),
+  (40, "dept 4 - unassigned", "OR"),
+  (50, "dept 5 - unassigned", "NJ"),
+  (70, "dept 7", "FL")
+AS DEPT(dept_id, dept_name, state)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES
+  ("emp 1", 10.00D),
+  ("emp 1", 20.00D),
+  ("emp 2", 300.00D),
+  ("emp 2", 100.00D),
+  ("emp 3", 300.00D),
+  ("emp 4", 100.00D),
+  ("emp 5", 1000.00D),
+  ("emp 6 - no dept", 500.00D)
+AS BONUS(emp_name, bonus_amt)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+CREATE TEMPORARY VIEW ADDRESS AS SELECT * FROM VALUES
+  (100, "emp 1", "addr1"),
+  (200, null, "addr2"),
+  (null, "emp 3", "addr3"),
+  (null, null, "addr4"),
+  (600, "emp 6", "addr6"),
+  (800, "emp 8", "addr8")
+AS ADDRESS(id, emp_name, address)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+CREATE TEMPORARY VIEW S1 AS SELECT * FROM VALUES
+  (null, null), (5, 5), (8, 8), (11, 11) AS s1(a, b)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+CREATE TEMPORARY VIEW S2 AS SELECT * FROM VALUES
+  (7, 7), (8, 8), (11, 11), (null, null) AS s2(c, d)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+SELECT id, 
+       dept_id 
+FROM   emp 
+WHERE  id = 600
+       OR id = 500 
+       OR dept_id NOT IN (SELECT dept_id 
+                          FROM   emp)
+-- !query schema
+struct<id:int,dept_id:int>
+-- !query output
+500    NULL
+600    100
+
+
+-- !query
+SELECT id, 
+       dept_id 
+FROM   emp 
+WHERE  id = 800 
+       OR (dept_id IS NOT NULL 
+           AND dept_id NOT IN (SELECT dept_id 
+                                FROM   emp))
+-- !query schema
+struct<id:int,dept_id:int>
+-- !query output
+800    70
+
+
+-- !query
+SELECT id,
+       dept_id
+FROM   emp
+WHERE  id = 100
+       OR dept_id NOT IN (SELECT dept_id
+                           FROM   emp
+                           WHERE dept_id IS NOT NULL)
+-- !query schema
+struct<id:int,dept_id:int>
+-- !query output
+100    10
+
+
+-- !query
+SELECT id, 
+       dept_id 
+FROM   emp 
+WHERE  id = 200 
+       OR (dept_id IS NOT NULL        
+       AND dept_id + 100 NOT IN (SELECT dept_id 
+                           FROM   emp
+                           WHERE dept_id IS NOT NULL))
+-- !query schema
+struct<id:int,dept_id:int>
+-- !query output
+100    10
+200    NULL
+300    20
+400    30
+600    100
+800    70
+
+
+-- !query
+SELECT id, 
+       dept_id, 
+       emp_name 
+FROM   emp 
+WHERE  emp_name IN (SELECT emp_name 
+                    FROM   bonus) 
+        OR (dept_id IS NOT NULL 
+            AND dept_id NOT IN (SELECT dept_id 
+                                FROM   dept))
+-- !query schema
+struct<id:int,dept_id:int,emp_name:string>
+-- !query output
+100    10      emp 1
+200    NULL    emp 2
+300    20      emp 3
+400    30      emp 4
+500    NULL    emp 5
+600    100     emp 6
+
+
+-- !query
+SELECT id, 
+       dept_id, 
+       emp_name 
+FROM   emp 
+WHERE  EXISTS (SELECT emp_name 
+               FROM   bonus 
+               WHERE  emp.emp_name = bonus.emp_name) 
+       OR (dept_id IS NOT NULL 
+           AND dept_id NOT IN (SELECT dept_id 
+                               FROM   dept))
+-- !query schema
+struct<id:int,dept_id:int,emp_name:string>
+-- !query output
+100    10      emp 1
+200    NULL    emp 2
+300    20      emp 3
+400    30      emp 4
+500    NULL    emp 5
+600    100     emp 6
+
+
+-- !query
+SELECT id,
+       dept_id,
+       emp_name
+FROM   emp
+WHERE  dept_id = 10
+OR (id, emp_name) NOT IN (SELECT id, emp_name FROM address)
+-- !query schema
+struct<id:int,dept_id:int,emp_name:string>
+-- !query output
+100    10      emp 1
+
+
+-- !query
+SELECT id, 
+       dept_id, 
+       emp_name 
+FROM   emp 
+WHERE  dept_id = 10 
+        OR (( id, emp_name ) NOT IN (SELECT id, 
+                                             emp_name 
+                                      FROM   address 
+                                      WHERE  id IS NOT NULL 
+                                             AND emp_name IS NOT NULL) 
+             AND id > 400 )
+-- !query schema
+struct<id:int,dept_id:int,emp_name:string>
+-- !query output
+100    10      emp 1
+500    NULL    emp 5
+
+
+-- !query
+SELECT id, 
+       dept_id, 
+       emp_name 
+FROM   emp 
+WHERE  dept_id = 10 
+       OR emp_name NOT IN (SELECT emp_name 
+                                  FROM   address 
+                                  WHERE  id IS NOT NULL 
+                                  AND emp_name IS NOT NULL
+                                  AND emp.id = address.id)
+-- !query schema
+struct<id:int,dept_id:int,emp_name:string>
+-- !query output
+100    10      emp 1
+200    NULL    emp 2
+300    20      emp 3
+400    30      emp 4
+500    NULL    emp 5
+
+
+-- !query
+SELECT id, 
+       dept_id, 
+       emp_name 
+FROM   emp 
+WHERE  id NOT IN (SELECT id 
+                         FROM   address 
+                         WHERE  id IS NOT NULL 
+                         AND emp_name IS NOT NULL
+                         AND id >= 400)
+       OR emp_name NOT IN (SELECT emp_name 
+                                  FROM   address 
+                                  WHERE  id IS NOT NULL 
+                                  AND emp_name IS NOT NULL
+                                  AND emp.id = address.id
+                                  AND id < 400)
+-- !query schema
+struct<id:int,dept_id:int,emp_name:string>
+-- !query output
+100    10      emp 1
+200    NULL    emp 2
+300    20      emp 3
+400    30      emp 4
+500    NULL    emp 5
+600    100     emp 6
+800    70      emp 8
+
+
+-- !query
+SELECT * 
+FROM   s1 
+WHERE  NOT (a NOT IN (SELECT c 
+                      FROM   s2))
+-- !query schema
+struct<a:int,b:int>
+-- !query output
+11     11
+8      8
+
+
+-- !query
+SELECT * 
+FROM   s1 
+WHERE  NOT (a > 5 
+            OR a IN (SELECT c 
+                     FROM   s2))
+-- !query schema
+struct<a:int,b:int>
+-- !query output
+
+
+
+-- !query
+SELECT * 
+FROM   s1 
+WHERE  NOT (a > 5 
+            OR a NOT IN (SELECT c 
+                         FROM   s2))
+-- !query schema
+struct<a:int,b:int>
+-- !query output
+
+
+
+-- !query
+SELECT * 
+FROM   s1 
+WHERE  NOT (a > 5 
+            AND a IN (SELECT c 
+                      FROM   s2))
+-- !query schema
+struct<a:int,b:int>
+-- !query output
+5      5
+
+
+-- !query
+SELECT * 
+FROM   s1 
+WHERE  NOT (a > 5 
+            AND a NOT IN (SELECT c 
+                          FROM   s2))
+-- !query schema
+struct<a:int,b:int>
+-- !query output
+11     11
+5      5
+8      8
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
index ff8f94c..1b66ce53 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
@@ -288,10 +288,10 @@ class SubquerySuite extends QueryTest with 
SharedSparkSession with AdaptiveSpark
         " or l.a in (select c from r where l.b < r.d)"),
       Row(2, 1.0) :: Row(2, 1.0) :: Row(3, 3.0) :: Row(6, null) :: Nil)
 
-    intercept[AnalysisException] {
+    checkAnswer(
       sql("select * from l where a not in (select c from r)" +
-        " or a not in (select c from r where c is not null)")
-    }
+        " or a not in (select c from r where c is not null)"),
+      Row(1, 2.0) :: Row(1, 2.0) :: Nil)
   }
 
   test("complex IN predicate subquery") {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-25154][SQL] Support NOT IN sub-queries inside nested OR conditions

Reply via email to