This is an automated email from the ASF dual-hosted git repository. yamamuro pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new f0e2fc3 [SPARK-25154][SQL] Support NOT IN sub-queries inside nested OR conditions f0e2fc3 is described below commit f0e2fc37d1dc2a85fd08c87add5106bb51305182 Author: Dilip Biswal <dkbis...@gmail.com> AuthorDate: Sat Apr 11 08:28:11 2020 +0900 [SPARK-25154][SQL] Support NOT IN sub-queries inside nested OR conditions ### What changes were proposed in this pull request? Currently NOT IN subqueries (predicated null aware subquery) are not allowed inside OR expressions. We currently catch this condition in checkAnalysis and throw an error. This PR enhances the subquery rewrite to support this type of queries. Query ```SQL SELECT * FROM s1 WHERE a > 5 or b NOT IN (SELECT c FROM s2); ``` Optimized Plan ```SQL == Optimized Logical Plan == Project [a#3, b#4] +- Filter ((a#3 > 5) || NOT exists#7) +- Join ExistenceJoin(exists#7), ((b#4 = c#5) || isnull((b#4 = c#5))) :- HiveTableRelation `default`.`s1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#3, b#4] +- Project [c#5] +- HiveTableRelation `default`.`s2`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [c#5, d#6] ``` This is rework from #22141. The original author of this PR is dilipbiswal. Closes #22141 ### Why are the changes needed? For better usability. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Added new tests in SQLQueryTestSuite, RewriteSubquerySuite and SubquerySuite. Output from DB2 as a reference: [nested-not-db2.txt](https://github.com/apache/spark/files/2299945/nested-not-db2.txt) Closes #28158 from maropu/pr22141. Lead-authored-by: Dilip Biswal <dkbis...@gmail.com> Co-authored-by: Takeshi Yamamuro <yamam...@apache.org> Co-authored-by: Dilip Biswal <dbis...@us.ibm.com> Signed-off-by: Takeshi Yamamuro <yamam...@apache.org> --- .../sql/catalyst/analysis/CheckAnalysis.scala | 4 - .../spark/sql/catalyst/expressions/subquery.scala | 18 -- .../spark/sql/catalyst/optimizer/subquery.scala | 23 +- .../sql/catalyst/analysis/AnalysisErrorSuite.scala | 15 - .../catalyst/optimizer/RewriteSubquerySuite.scala | 19 +- .../apache/spark/sql/catalyst/plans/PlanTest.scala | 9 +- .../inputs/subquery/in-subquery/nested-not-in.sql | 198 ++++++++++++ .../subquery/in-subquery/nested-not-in.sql.out | 332 +++++++++++++++++++++ .../scala/org/apache/spark/sql/SubquerySuite.scala | 6 +- 9 files changed, 580 insertions(+), 44 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 066dc6d..9e325d0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -200,10 +200,6 @@ trait CheckAnalysis extends PredicateHelper { s"filter expression '${f.condition.sql}' " + s"of type ${f.condition.dataType.catalogString} is not a boolean.") - case Filter(condition, _) if hasNullAwarePredicateWithinNot(condition) => - failAnalysis("Null-aware predicate sub-queries cannot be used in nested " + - s"conditions: $condition") - case j @ Join(_, _, _, Some(condition), _) if condition.dataType != BooleanType => failAnalysis( s"join condition '${condition.sql}' " + diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala index e33cff2..f46a1c6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala @@ -106,24 +106,6 @@ object SubExprUtils extends PredicateHelper { } /** - * Returns whether there are any null-aware predicate subqueries inside Not. If not, we could - * turn the null-aware predicate into not-null-aware predicate. - */ - def hasNullAwarePredicateWithinNot(condition: Expression): Boolean = { - splitConjunctivePredicates(condition).exists { - case _: Exists | Not(_: Exists) => false - case _: InSubquery | Not(_: InSubquery) => false - case e => e.find { x => - x.isInstanceOf[Not] && e.find { - case _: InSubquery => true - case _ => false - }.isDefined - }.isDefined - } - - } - - /** * Returns an expression after removing the OuterReference shell. */ def stripOuterReference(e: Expression): Expression = e.transform { case OuterReference(r) => r } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala index b697462..6fdd211 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala @@ -162,12 +162,33 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { plan: LogicalPlan): (Option[Expression], LogicalPlan) = { var newPlan = plan val newExprs = exprs.map { e => - e transformUp { + e transformDown { case Exists(sub, conditions, _) => val exists = AttributeReference("exists", BooleanType, nullable = false)() newPlan = buildJoin(newPlan, sub, ExistenceJoin(exists), conditions.reduceLeftOption(And)) exists + case Not(InSubquery(values, ListQuery(sub, conditions, _, _))) => + val exists = AttributeReference("exists", BooleanType, nullable = false)() + // Deduplicate conflicting attributes if any. + val newSub = dedupSubqueryOnSelfJoin(newPlan, sub, Some(values)) + val inConditions = values.zip(sub.output).map(EqualTo.tupled) + // To handle a null-aware predicate not-in-subquery in nested conditions + // (e.g., `v > 0 OR t1.id NOT IN (SELECT id FROM t2)`), we transform + // `inConditon` (t1.id=t2.id) into `(inCondition) OR ISNULL(inCondition)`. + // + // For example, `SELECT * FROM t1 WHERE v > 0 OR t1.id NOT IN (SELECT id FROM t2)` + // is transformed into a plan below; + // == Optimized Logical Plan == + // Project [id#78, v#79] + // +- Filter ((v#79 > 0) OR NOT exists#83) + // +- Join ExistenceJoin(exists#83), ((id#78 = id#80) OR isnull((id#78 = id#80))) + // :- Relation[id#78,v#79] parquet + // +- Relation[id#80] parquet + val nullAwareJoinConds = inConditions.map(c => Or(c, IsNull(c))) + val finalJoinCond = (nullAwareJoinConds ++ conditions).reduceLeft(And) + newPlan = Join(newPlan, newSub, ExistenceJoin(exists), Some(finalJoinCond), JoinHint.NONE) + Not(exists) case InSubquery(values, ListQuery(sub, conditions, _, _)) => val exists = AttributeReference("exists", BooleanType, nullable = false)() // Deduplicate conflicting attributes if any. diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 09e0d9c..166ffec 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -614,21 +614,6 @@ class AnalysisErrorSuite extends AnalysisTest { " in Filter" :: Nil) } - test("PredicateSubQuery is used is a nested condition") { - val a = AttributeReference("a", IntegerType)() - val b = AttributeReference("b", IntegerType)() - val c = AttributeReference("c", BooleanType)() - val plan1 = Filter(Cast(Not(InSubquery(Seq(a), ListQuery(LocalRelation(b)))), BooleanType), - LocalRelation(a)) - assertAnalysisError(plan1, - "Null-aware predicate sub-queries cannot be used in nested conditions" :: Nil) - - val plan2 = Filter( - Or(Not(InSubquery(Seq(a), ListQuery(LocalRelation(b)))), c), LocalRelation(a, c)) - assertAnalysisError(plan2, - "Null-aware predicate sub-queries cannot be used in nested conditions" :: Nil) - } - test("PredicateSubQuery correlated predicate is nested in an illegal plan") { val a = AttributeReference("a", IntegerType)() val b = AttributeReference("b", IntegerType)() diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala index f00d22e..0608ded 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala @@ -19,8 +19,8 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.ListQuery -import org.apache.spark.sql.catalyst.plans.{LeftSemi, PlanTest} +import org.apache.spark.sql.catalyst.expressions.{IsNull, ListQuery, Not} +import org.apache.spark.sql.catalyst.plans.{ExistenceJoin, LeftSemi, PlanTest} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor @@ -52,4 +52,19 @@ class RewriteSubquerySuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("NOT-IN subquery nested inside OR") { + val relation1 = LocalRelation('a.int, 'b.int) + val relation2 = LocalRelation('c.int, 'd.int) + val exists = 'exists.boolean.notNull + + val query = relation1.where('b === 1 || Not('a.in(ListQuery(relation2.select('c))))).select('a) + val correctAnswer = relation1 + .join(relation2.select('c), ExistenceJoin(exists), Some('a === 'c || IsNull('a === 'c))) + .where('b === 1 || Not(exists)) + .select('a) + .analyze + val optimized = Optimize.execute(query.analyze) + + comparePlans(optimized, correctAnswer) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala index b28e6de..9600357 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala @@ -109,10 +109,17 @@ trait PlanTestBase extends PredicateHelper with SQLHelper { self: Suite => case sample: Sample => sample.copy(seed = 0L) case Join(left, right, joinType, condition, hint) if condition.isDefined => + val newJoinType = joinType match { + case ExistenceJoin(a: Attribute) => + val newAttr = AttributeReference(a.name, a.dataType, a.nullable)(exprId = ExprId(0)) + ExistenceJoin(newAttr) + case other => other + } + val newCondition = splitConjunctivePredicates(condition.get).map(rewriteEqual).sortBy(_.hashCode()) .reduce(And) - Join(left, right, joinType, Some(newCondition), hint) + Join(left, right, newJoinType, Some(newCondition), hint) } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/nested-not-in.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/nested-not-in.sql new file mode 100644 index 0000000..2f6835b --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/nested-not-in.sql @@ -0,0 +1,198 @@ +-- Tests NOT-IN subqueries nested inside OR expression(s). + +CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES + (100, "emp 1", 10), + (200, "emp 2", NULL), + (300, "emp 3", 20), + (400, "emp 4", 30), + (500, "emp 5", NULL), + (600, "emp 6", 100), + (800, "emp 8", 70) +AS EMP(id, emp_name, dept_id); + +CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES + (10, "dept 1", "CA"), + (20, "dept 2", "NY"), + (30, "dept 3", "TX"), + (40, "dept 4 - unassigned", "OR"), + (50, "dept 5 - unassigned", "NJ"), + (70, "dept 7", "FL") +AS DEPT(dept_id, dept_name, state); + +CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES + ("emp 1", 10.00D), + ("emp 1", 20.00D), + ("emp 2", 300.00D), + ("emp 2", 100.00D), + ("emp 3", 300.00D), + ("emp 4", 100.00D), + ("emp 5", 1000.00D), + ("emp 6 - no dept", 500.00D) +AS BONUS(emp_name, bonus_amt); + +CREATE TEMPORARY VIEW ADDRESS AS SELECT * FROM VALUES + (100, "emp 1", "addr1"), + (200, null, "addr2"), + (null, "emp 3", "addr3"), + (null, null, "addr4"), + (600, "emp 6", "addr6"), + (800, "emp 8", "addr8") +AS ADDRESS(id, emp_name, address); + +CREATE TEMPORARY VIEW S1 AS SELECT * FROM VALUES + (null, null), (5, 5), (8, 8), (11, 11) AS s1(a, b); +CREATE TEMPORARY VIEW S2 AS SELECT * FROM VALUES + (7, 7), (8, 8), (11, 11), (null, null) AS s2(c, d); + +-- null produced from both sides. +-- TC.01.01 +SELECT id, + dept_id +FROM emp +WHERE id = 600 + OR id = 500 + OR dept_id NOT IN (SELECT dept_id + FROM emp); + +-- null produced from right side +-- TC.01.02 +SELECT id, + dept_id +FROM emp +WHERE id = 800 + OR (dept_id IS NOT NULL + AND dept_id NOT IN (SELECT dept_id + FROM emp)); + +-- null produced on left side +-- TC.01.03 +SELECT id, + dept_id +FROM emp +WHERE id = 100 + OR dept_id NOT IN (SELECT dept_id + FROM emp + WHERE dept_id IS NOT NULL); + +-- no null in both left and right +-- TC.01.04 +SELECT id, + dept_id +FROM emp +WHERE id = 200 + OR (dept_id IS NOT NULL + AND dept_id + 100 NOT IN (SELECT dept_id + FROM emp + WHERE dept_id IS NOT NULL)); + +-- complex nesting +-- TC.01.05 +SELECT id, + dept_id, + emp_name +FROM emp +WHERE emp_name IN (SELECT emp_name + FROM bonus) + OR (dept_id IS NOT NULL + AND dept_id NOT IN (SELECT dept_id + FROM dept)); + +-- complex nesting, exists in disjunction with not-in +-- TC.01.06 +SELECT id, + dept_id, + emp_name +FROM emp +WHERE EXISTS (SELECT emp_name + FROM bonus + WHERE emp.emp_name = bonus.emp_name) + OR (dept_id IS NOT NULL + AND dept_id NOT IN (SELECT dept_id + FROM dept)); + +-- multiple columns in not-in +-- TC.01.07 +SELECT id, + dept_id, + emp_name +FROM emp +WHERE dept_id = 10 +OR (id, emp_name) NOT IN (SELECT id, emp_name FROM address); + +-- multiple columns in not-in +-- TC.01.08 +SELECT id, + dept_id, + emp_name +FROM emp +WHERE dept_id = 10 + OR (( id, emp_name ) NOT IN (SELECT id, + emp_name + FROM address + WHERE id IS NOT NULL + AND emp_name IS NOT NULL) + AND id > 400 ); +-- correlated not-in along with disjunction +-- TC.01.09 +SELECT id, + dept_id, + emp_name +FROM emp +WHERE dept_id = 10 + OR emp_name NOT IN (SELECT emp_name + FROM address + WHERE id IS NOT NULL + AND emp_name IS NOT NULL + AND emp.id = address.id); + +-- multiple not-in(s) in side disjunction` +-- TC.01.10 +SELECT id, + dept_id, + emp_name +FROM emp +WHERE id NOT IN (SELECT id + FROM address + WHERE id IS NOT NULL + AND emp_name IS NOT NULL + AND id >= 400) + OR emp_name NOT IN (SELECT emp_name + FROM address + WHERE id IS NOT NULL + AND emp_name IS NOT NULL + AND emp.id = address.id + AND id < 400); + +-- NOT (NOT IN (SUBQ)) +SELECT * +FROM s1 +WHERE NOT (a NOT IN (SELECT c + FROM s2)); + +-- NOT (OR (expression, IN-SUBQ)) +SELECT * +FROM s1 +WHERE NOT (a > 5 + OR a IN (SELECT c + FROM s2)); + +-- NOT (OR (expression, NOT-IN-SUB) +SELECT * +FROM s1 +WHERE NOT (a > 5 + OR a NOT IN (SELECT c + FROM s2)); + +-- NOT (AND (expression, IN-SUB)) +SELECT * +FROM s1 +WHERE NOT (a > 5 + AND a IN (SELECT c + FROM s2)); + +-- NOT (AND (expression, NOT-IN-SUBQ)) +SELECT * +FROM s1 +WHERE NOT (a > 5 + AND a NOT IN (SELECT c + FROM s2)); diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/nested-not-in.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/nested-not-in.sql.out new file mode 100644 index 0000000..86f967e --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/nested-not-in.sql.out @@ -0,0 +1,332 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 21 + + +-- !query +CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES + (100, "emp 1", 10), + (200, "emp 2", NULL), + (300, "emp 3", 20), + (400, "emp 4", 30), + (500, "emp 5", NULL), + (600, "emp 6", 100), + (800, "emp 8", 70) +AS EMP(id, emp_name, dept_id) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES + (10, "dept 1", "CA"), + (20, "dept 2", "NY"), + (30, "dept 3", "TX"), + (40, "dept 4 - unassigned", "OR"), + (50, "dept 5 - unassigned", "NJ"), + (70, "dept 7", "FL") +AS DEPT(dept_id, dept_name, state) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES + ("emp 1", 10.00D), + ("emp 1", 20.00D), + ("emp 2", 300.00D), + ("emp 2", 100.00D), + ("emp 3", 300.00D), + ("emp 4", 100.00D), + ("emp 5", 1000.00D), + ("emp 6 - no dept", 500.00D) +AS BONUS(emp_name, bonus_amt) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TEMPORARY VIEW ADDRESS AS SELECT * FROM VALUES + (100, "emp 1", "addr1"), + (200, null, "addr2"), + (null, "emp 3", "addr3"), + (null, null, "addr4"), + (600, "emp 6", "addr6"), + (800, "emp 8", "addr8") +AS ADDRESS(id, emp_name, address) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TEMPORARY VIEW S1 AS SELECT * FROM VALUES + (null, null), (5, 5), (8, 8), (11, 11) AS s1(a, b) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TEMPORARY VIEW S2 AS SELECT * FROM VALUES + (7, 7), (8, 8), (11, 11), (null, null) AS s2(c, d) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT id, + dept_id +FROM emp +WHERE id = 600 + OR id = 500 + OR dept_id NOT IN (SELECT dept_id + FROM emp) +-- !query schema +struct<id:int,dept_id:int> +-- !query output +500 NULL +600 100 + + +-- !query +SELECT id, + dept_id +FROM emp +WHERE id = 800 + OR (dept_id IS NOT NULL + AND dept_id NOT IN (SELECT dept_id + FROM emp)) +-- !query schema +struct<id:int,dept_id:int> +-- !query output +800 70 + + +-- !query +SELECT id, + dept_id +FROM emp +WHERE id = 100 + OR dept_id NOT IN (SELECT dept_id + FROM emp + WHERE dept_id IS NOT NULL) +-- !query schema +struct<id:int,dept_id:int> +-- !query output +100 10 + + +-- !query +SELECT id, + dept_id +FROM emp +WHERE id = 200 + OR (dept_id IS NOT NULL + AND dept_id + 100 NOT IN (SELECT dept_id + FROM emp + WHERE dept_id IS NOT NULL)) +-- !query schema +struct<id:int,dept_id:int> +-- !query output +100 10 +200 NULL +300 20 +400 30 +600 100 +800 70 + + +-- !query +SELECT id, + dept_id, + emp_name +FROM emp +WHERE emp_name IN (SELECT emp_name + FROM bonus) + OR (dept_id IS NOT NULL + AND dept_id NOT IN (SELECT dept_id + FROM dept)) +-- !query schema +struct<id:int,dept_id:int,emp_name:string> +-- !query output +100 10 emp 1 +200 NULL emp 2 +300 20 emp 3 +400 30 emp 4 +500 NULL emp 5 +600 100 emp 6 + + +-- !query +SELECT id, + dept_id, + emp_name +FROM emp +WHERE EXISTS (SELECT emp_name + FROM bonus + WHERE emp.emp_name = bonus.emp_name) + OR (dept_id IS NOT NULL + AND dept_id NOT IN (SELECT dept_id + FROM dept)) +-- !query schema +struct<id:int,dept_id:int,emp_name:string> +-- !query output +100 10 emp 1 +200 NULL emp 2 +300 20 emp 3 +400 30 emp 4 +500 NULL emp 5 +600 100 emp 6 + + +-- !query +SELECT id, + dept_id, + emp_name +FROM emp +WHERE dept_id = 10 +OR (id, emp_name) NOT IN (SELECT id, emp_name FROM address) +-- !query schema +struct<id:int,dept_id:int,emp_name:string> +-- !query output +100 10 emp 1 + + +-- !query +SELECT id, + dept_id, + emp_name +FROM emp +WHERE dept_id = 10 + OR (( id, emp_name ) NOT IN (SELECT id, + emp_name + FROM address + WHERE id IS NOT NULL + AND emp_name IS NOT NULL) + AND id > 400 ) +-- !query schema +struct<id:int,dept_id:int,emp_name:string> +-- !query output +100 10 emp 1 +500 NULL emp 5 + + +-- !query +SELECT id, + dept_id, + emp_name +FROM emp +WHERE dept_id = 10 + OR emp_name NOT IN (SELECT emp_name + FROM address + WHERE id IS NOT NULL + AND emp_name IS NOT NULL + AND emp.id = address.id) +-- !query schema +struct<id:int,dept_id:int,emp_name:string> +-- !query output +100 10 emp 1 +200 NULL emp 2 +300 20 emp 3 +400 30 emp 4 +500 NULL emp 5 + + +-- !query +SELECT id, + dept_id, + emp_name +FROM emp +WHERE id NOT IN (SELECT id + FROM address + WHERE id IS NOT NULL + AND emp_name IS NOT NULL + AND id >= 400) + OR emp_name NOT IN (SELECT emp_name + FROM address + WHERE id IS NOT NULL + AND emp_name IS NOT NULL + AND emp.id = address.id + AND id < 400) +-- !query schema +struct<id:int,dept_id:int,emp_name:string> +-- !query output +100 10 emp 1 +200 NULL emp 2 +300 20 emp 3 +400 30 emp 4 +500 NULL emp 5 +600 100 emp 6 +800 70 emp 8 + + +-- !query +SELECT * +FROM s1 +WHERE NOT (a NOT IN (SELECT c + FROM s2)) +-- !query schema +struct<a:int,b:int> +-- !query output +11 11 +8 8 + + +-- !query +SELECT * +FROM s1 +WHERE NOT (a > 5 + OR a IN (SELECT c + FROM s2)) +-- !query schema +struct<a:int,b:int> +-- !query output + + + +-- !query +SELECT * +FROM s1 +WHERE NOT (a > 5 + OR a NOT IN (SELECT c + FROM s2)) +-- !query schema +struct<a:int,b:int> +-- !query output + + + +-- !query +SELECT * +FROM s1 +WHERE NOT (a > 5 + AND a IN (SELECT c + FROM s2)) +-- !query schema +struct<a:int,b:int> +-- !query output +5 5 + + +-- !query +SELECT * +FROM s1 +WHERE NOT (a > 5 + AND a NOT IN (SELECT c + FROM s2)) +-- !query schema +struct<a:int,b:int> +-- !query output +11 11 +5 5 +8 8 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index ff8f94c..1b66ce53 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -288,10 +288,10 @@ class SubquerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark " or l.a in (select c from r where l.b < r.d)"), Row(2, 1.0) :: Row(2, 1.0) :: Row(3, 3.0) :: Row(6, null) :: Nil) - intercept[AnalysisException] { + checkAnswer( sql("select * from l where a not in (select c from r)" + - " or a not in (select c from r where c is not null)") - } + " or a not in (select c from r where c is not null)"), + Row(1, 2.0) :: Row(1, 2.0) :: Nil) } test("complex IN predicate subquery") { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org