This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push: new 7520fc160aa [SPARK-44551][SQL] Fix behavior of null IN (empty list) in expression execution 7520fc160aa is described below commit 7520fc160aa75360e29d5527e2a59c5605f144d4 Author: Jack Chen <jack.c...@databricks.com> AuthorDate: Wed Aug 9 15:10:51 2023 +0800 [SPARK-44551][SQL] Fix behavior of null IN (empty list) in expression execution ### What changes were proposed in this pull request? `null IN (empty list)` incorrectly evaluates to null, when it should evaluate to false. (The reason it should be false is because a IN (b1, b2) is defined as a = b1 OR a = b2, and an empty IN list is treated as an empty OR which is false. This is specified by ANSI SQL.) Many places in Spark execution (In, InSet, InSubquery) and optimization (OptimizeIn, NullPropagation) implemented this wrong behavior. This is a longstanding correctness issue which has existed since null support for IN expressions was first added to Spark. This PR fixes Spark execution (In, InSet, InSubquery). See previous PR https://github.com/apache/spark/pull/42007 for optimization fixes. The behavior is under a flag, which will be available to revert to the legacy behavior if needed. This flag is set to disable the new behavior until all of the fix PRs are complete. See [this doc](https://docs.google.com/document/d/1k8AY8oyT-GI04SnP7eXttPDnDj-Ek-c3luF2zL6DPNU/edit) for more information. ### Why are the changes needed? Fix wrong SQL semantics ### Does this PR introduce _any_ user-facing change? Not yet, but will fix wrong SQL semantics when enabled ### How was this patch tested? Unit tests PredicateSuite and tests added in previous PR https://github.com/apache/spark/pull/42007 Closes #42163 from jchen5/null-in-empty-exec. Authored-by: Jack Chen <jack.c...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit f9058d69b2af774e78677ac4cad55c7c91eb42ae) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../sql/catalyst/expressions/predicates.scala | 196 ++++++++++++--------- .../spark/sql/catalyst/optimizer/expressions.scala | 4 +- .../sql/catalyst/expressions/PredicateSuite.scala | 35 ++-- .../subquery/in-subquery/in-null-semantics.sql | 3 +- .../subquery/in-subquery/in-null-semantics.sql.out | 4 +- .../scala/org/apache/spark/sql/EmptyInSuite.scala | 3 +- 6 files changed, 144 insertions(+), 101 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index ee2ba7c73d1..31b872e04ce 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -468,6 +468,8 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { override def foldable: Boolean = children.forall(_.foldable) final override val nodePatterns: Seq[TreePattern] = Seq(IN) + private val legacyNullInEmptyBehavior = + SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR) override lazy val canonicalized: Expression = { val basic = withNewChildren(children.map(_.canonicalized)).asInstanceOf[In] @@ -481,88 +483,104 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { override def toString: String = s"$value IN ${list.mkString("(", ",", ")")}" override def eval(input: InternalRow): Any = { - val evaluatedValue = value.eval(input) - if (evaluatedValue == null) { - null + if (list.isEmpty && !legacyNullInEmptyBehavior) { + // IN (empty list) is always false under current behavior. + // Under legacy behavior it's null if the left side is null, otherwise false (SPARK-44550). + false } else { - var hasNull = false - list.foreach { e => - val v = e.eval(input) - if (v == null) { - hasNull = true - } else if (ordering.equiv(v, evaluatedValue)) { - return true - } - } - if (hasNull) { + val evaluatedValue = value.eval(input) + if (evaluatedValue == null) { null } else { - false + var hasNull = false + list.foreach { e => + val v = e.eval(input) + if (v == null) { + hasNull = true + } else if (ordering.equiv(v, evaluatedValue)) { + return true + } + } + if (hasNull) { + null + } else { + false + } } } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val javaDataType = CodeGenerator.javaType(value.dataType) - val valueGen = value.genCode(ctx) - val listGen = list.map(_.genCode(ctx)) - // inTmpResult has 3 possible values: - // -1 means no matches found and there is at least one value in the list evaluated to null - val HAS_NULL = -1 - // 0 means no matches found and all values in the list are not null - val NOT_MATCHED = 0 - // 1 means one value in the list is matched - val MATCHED = 1 - val tmpResult = ctx.freshName("inTmpResult") - val valueArg = ctx.freshName("valueArg") - // All the blocks are meant to be inside a do { ... } while (false); loop. - // The evaluation of variables can be stopped when we find a matching value. - val listCode = listGen.map(x => - s""" - |${x.code} - |if (${x.isNull}) { - | $tmpResult = $HAS_NULL; // ${ev.isNull} = true; - |} else if (${ctx.genEqual(value.dataType, valueArg, x.value)}) { - | $tmpResult = $MATCHED; // ${ev.isNull} = false; ${ev.value} = true; - | continue; - |} + if (list.isEmpty && !legacyNullInEmptyBehavior) { + // IN (empty list) is always false under current behavior. + // Under legacy behavior it's null if the left side is null, otherwise false (SPARK-44550). + ev.copy(code = + code""" + |final boolean ${ev.isNull} = false; + |final boolean ${ev.value} = false; """.stripMargin) - - val codes = ctx.splitExpressionsWithCurrentInputs( - expressions = listCode, - funcName = "valueIn", - extraArguments = (javaDataType, valueArg) :: (CodeGenerator.JAVA_BYTE, tmpResult) :: Nil, - returnType = CodeGenerator.JAVA_BYTE, - makeSplitFunction = body => - s""" - |do { - | $body - |} while (false); - |return $tmpResult; - """.stripMargin, - foldFunctions = _.map { funcCall => + } else { + val javaDataType = CodeGenerator.javaType(value.dataType) + val valueGen = value.genCode(ctx) + val listGen = list.map(_.genCode(ctx)) + // inTmpResult has 3 possible values: + // -1 means no matches found and there is at least one value in the list evaluated to null + val HAS_NULL = -1 + // 0 means no matches found and all values in the list are not null + val NOT_MATCHED = 0 + // 1 means one value in the list is matched + val MATCHED = 1 + val tmpResult = ctx.freshName("inTmpResult") + val valueArg = ctx.freshName("valueArg") + // All the blocks are meant to be inside a do { ... } while (false); loop. + // The evaluation of variables can be stopped when we find a matching value. + val listCode = listGen.map(x => s""" - |$tmpResult = $funcCall; - |if ($tmpResult == $MATCHED) { + |${x.code} + |if (${x.isNull}) { + | $tmpResult = $HAS_NULL; // ${ev.isNull} = true; + |} else if (${ctx.genEqual(value.dataType, valueArg, x.value)}) { + | $tmpResult = $MATCHED; // ${ev.isNull} = false; ${ev.value} = true; | continue; |} - """.stripMargin - }.mkString("\n")) - - ev.copy(code = - code""" - |${valueGen.code} - |byte $tmpResult = $HAS_NULL; - |if (!${valueGen.isNull}) { - | $tmpResult = $NOT_MATCHED; - | $javaDataType $valueArg = ${valueGen.value}; - | do { - | $codes - | } while (false); - |} - |final boolean ${ev.isNull} = ($tmpResult == $HAS_NULL); - |final boolean ${ev.value} = ($tmpResult == $MATCHED); - """.stripMargin) + """.stripMargin) + + val codes = ctx.splitExpressionsWithCurrentInputs( + expressions = listCode, + funcName = "valueIn", + extraArguments = (javaDataType, valueArg) :: (CodeGenerator.JAVA_BYTE, tmpResult) :: Nil, + returnType = CodeGenerator.JAVA_BYTE, + makeSplitFunction = body => + s""" + |do { + | $body + |} while (false); + |return $tmpResult; + """.stripMargin, + foldFunctions = _.map { funcCall => + s""" + |$tmpResult = $funcCall; + |if ($tmpResult == $MATCHED) { + | continue; + |} + """.stripMargin + }.mkString("\n")) + + ev.copy(code = + code""" + |${valueGen.code} + |byte $tmpResult = $HAS_NULL; + |if (!${valueGen.isNull}) { + | $tmpResult = $NOT_MATCHED; + | $javaDataType $valueArg = ${valueGen.value}; + | do { + | $codes + | } while (false); + |} + |final boolean ${ev.isNull} = ($tmpResult == $HAS_NULL); + |final boolean ${ev.value} = ($tmpResult == $MATCHED); + """.stripMargin) + } } override def sql: String = { @@ -607,16 +625,27 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with override def nullable: Boolean = child.nullable || hasNull final override val nodePatterns: Seq[TreePattern] = Seq(INSET) + private val legacyNullInEmptyBehavior = + SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR) - protected override def nullSafeEval(value: Any): Any = { - if (set.contains(value)) { - true - } else if (isNaN(value)) { - hasNaN - } else if (hasNull) { - null - } else { + override def eval(input: InternalRow): Any = { + if (hset.isEmpty && !legacyNullInEmptyBehavior) { + // IN (empty list) is always false under current behavior. + // Under legacy behavior it's null if the left side is null, otherwise false (SPARK-44550). false + } else { + val value = child.eval(input) + if (value == null) { + null + } else if (set.contains(value)) { + true + } else if (isNaN(value)) { + hasNaN + } else if (hasNull) { + null + } else { + false + } } } @@ -629,7 +658,16 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - if (canBeComputedUsingSwitch && hset.size <= SQLConf.get.optimizerInSetSwitchThreshold) { + if (hset.isEmpty && !legacyNullInEmptyBehavior) { + // IN (empty list) is always false under current behavior. + // Under legacy behavior it's null if the left side is null, otherwise false (SPARK-44550). + ev.copy(code = + code""" + ${CodeGenerator.JAVA_BOOLEAN} ${ev.value} = false; + ${CodeGenerator.JAVA_BOOLEAN} ${ev.isNull} = false; + """ + ) + } else if (canBeComputedUsingSwitch && hset.size <= SQLConf.get.optimizerInSetSwitchThreshold) { genCodeWithSwitch(ctx, ev) } else { genCodeWithSet(ctx, ev) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 8cb560199c0..d4f0f72c935 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -284,11 +284,11 @@ object OptimizeIn extends Rule[LogicalPlan] { _.containsPattern(IN), ruleId) { case q: LogicalPlan => q.transformExpressionsDownWithPruning(_.containsPattern(IN), ruleId) { case In(v, list) if list.isEmpty => + // IN (empty list) is always false under current behavior. + // Under legacy behavior it's null if the left side is null, otherwise false (SPARK-44550). if (!SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)) { FalseLiteral } else { - // Incorrect legacy behavior optimizes to null if the left side is null, and otherwise - // to false. If(IsNotNull(v), FalseLiteral, Literal(null, BooleanType)) } case expr @ In(v, list) if expr.inSetConvertible => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index 73cc9aca568..55e0dd21794 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -134,20 +134,27 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { } test("basic IN/INSET predicate test") { - checkInAndInSet(In(NonFoldableLiteral.create(null, IntegerType), Seq(Literal(1), - Literal(2))), null) - checkInAndInSet(In(NonFoldableLiteral.create(null, IntegerType), - Seq(NonFoldableLiteral.create(null, IntegerType))), null) - checkInAndInSet(In(NonFoldableLiteral.create(null, IntegerType), Seq.empty), null) - checkInAndInSet(In(Literal(1), Seq.empty), false) - checkInAndInSet(In(Literal(1), Seq(NonFoldableLiteral.create(null, IntegerType))), null) - checkInAndInSet(In(Literal(1), Seq(Literal(1), NonFoldableLiteral.create(null, IntegerType))), - true) - checkInAndInSet(In(Literal(2), Seq(Literal(1), NonFoldableLiteral.create(null, IntegerType))), - null) - checkInAndInSet(In(Literal(1), Seq(Literal(1), Literal(2))), true) - checkInAndInSet(In(Literal(2), Seq(Literal(1), Literal(2))), true) - checkInAndInSet(In(Literal(3), Seq(Literal(1), Literal(2))), false) + Seq(true, false).foreach { legacyNullInBehavior => + withSQLConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR.key -> legacyNullInBehavior.toString) { + checkInAndInSet(In(NonFoldableLiteral.create(null, IntegerType), Seq(Literal(1), + Literal(2))), null) + checkInAndInSet(In(NonFoldableLiteral.create(null, IntegerType), + Seq(NonFoldableLiteral.create(null, IntegerType))), null) + checkInAndInSet(In(NonFoldableLiteral.create(null, IntegerType), Seq.empty), + expected = if (legacyNullInBehavior) null else false) + checkInAndInSet(In(Literal(1), Seq.empty), false) + checkInAndInSet(In(Literal(1), Seq(NonFoldableLiteral.create(null, IntegerType))), null) + checkInAndInSet(In(Literal(1), + Seq(Literal(1), NonFoldableLiteral.create(null, IntegerType))), + true) + checkInAndInSet(In(Literal(2), + Seq(Literal(1), NonFoldableLiteral.create(null, IntegerType))), + null) + checkInAndInSet(In(Literal(1), Seq(Literal(1), Literal(2))), true) + checkInAndInSet(In(Literal(2), Seq(Literal(1), Literal(2))), true) + checkInAndInSet(In(Literal(3), Seq(Literal(1), Literal(2))), false) + } + } checkEvaluation( And(In(Literal(1), Seq(Literal(1), Literal(2))), In(Literal(2), Seq(Literal(1), diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-null-semantics.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-null-semantics.sql index b893d8970b4..cc01887a4e2 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-null-semantics.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-null-semantics.sql @@ -1,7 +1,7 @@ create temp view v (c) as values (1), (null); create temp view v_empty (e) as select 1 where false; --- Note: tables and temp views hit different optimization/execution codepaths +-- Note: tables and temp views hit different optimization/execution codepaths: expressions over temp views are evaled at query compilation time by ConvertToLocalRelation create table t(c int) using json; insert into t values (1), (null); create table t2(d int) using json; @@ -29,7 +29,6 @@ select null not in (select e from v_empty); -- IN subquery which is not rewritten to join - here we use IN in the ON condition because that is a case that doesn't get rewritten to join in RewritePredicateSubquery, so we can observe the execution behavior of InSubquery directly -- Correct results: column t2.d should be NULL because the ON condition is always false --- This will be fixed by the execution fixes. select * from t left join t2 on (t.c in (select e from t_empty)) is null; select * from t left join t2 on (t.c not in (select e from t_empty)) is null; diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-null-semantics.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-null-semantics.sql.out index 39b03576baa..169b49fda84 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-null-semantics.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-null-semantics.sql.out @@ -137,7 +137,7 @@ select * from t left join t2 on (t.c in (select e from t_empty)) is null struct<c:int,d:int> -- !query output 1 NULL -NULL 2 +NULL NULL -- !query @@ -146,7 +146,7 @@ select * from t left join t2 on (t.c not in (select e from t_empty)) is null struct<c:int,d:int> -- !query output 1 NULL -NULL 2 +NULL NULL -- !query diff --git a/sql/core/src/test/scala/org/apache/spark/sql/EmptyInSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/EmptyInSuite.scala index c9e016c891e..1534aba28c4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/EmptyInSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/EmptyInSuite.scala @@ -49,9 +49,8 @@ with SharedSparkSession { withSQLConf( SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> excludedRules, SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR.key -> legacyNullInBehavior.toString) { - // We still get legacy behavior with disableOptimizeIn until execution is also fixed val expectedResultForNullInEmpty = - if (legacyNullInBehavior || disableOptimizeIn) null else false + if (legacyNullInBehavior) null else false val df = t.select(col("a"), col("a").isin(emptylist: _*)) checkAnswer( df, --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org