This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.5 by this push:
     new 7520fc160aa [SPARK-44551][SQL] Fix behavior of null IN (empty list) in 
expression execution
7520fc160aa is described below

commit 7520fc160aa75360e29d5527e2a59c5605f144d4
Author: Jack Chen <jack.c...@databricks.com>
AuthorDate: Wed Aug 9 15:10:51 2023 +0800

    [SPARK-44551][SQL] Fix behavior of null IN (empty list) in expression 
execution
    
    ### What changes were proposed in this pull request?
    `null IN (empty list)` incorrectly evaluates to null, when it should 
evaluate to false. (The reason it should be false is because a IN (b1, b2) is 
defined as a = b1 OR a = b2, and an empty IN list is treated as an empty OR 
which is false. This is specified by ANSI SQL.)
    
    Many places in Spark execution (In, InSet, InSubquery) and optimization 
(OptimizeIn, NullPropagation) implemented this wrong behavior. This is a 
longstanding correctness issue which has existed since null support for IN 
expressions was first added to Spark.
    
    This PR fixes Spark execution (In, InSet, InSubquery). See previous PR 
https://github.com/apache/spark/pull/42007 for optimization fixes.
    
    The behavior is under a flag, which will be available to revert to the 
legacy behavior if needed. This flag is set to disable the new behavior until 
all of the fix PRs are complete.
    
    See [this 
doc](https://docs.google.com/document/d/1k8AY8oyT-GI04SnP7eXttPDnDj-Ek-c3luF2zL6DPNU/edit)
 for more information.
    
    ### Why are the changes needed?
    Fix wrong SQL semantics
    
    ### Does this PR introduce _any_ user-facing change?
    Not yet, but will fix wrong SQL semantics when enabled
    
    ### How was this patch tested?
    Unit tests PredicateSuite and tests added in previous PR 
https://github.com/apache/spark/pull/42007
    
    Closes #42163 from jchen5/null-in-empty-exec.
    
    Authored-by: Jack Chen <jack.c...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
    (cherry picked from commit f9058d69b2af774e78677ac4cad55c7c91eb42ae)
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../sql/catalyst/expressions/predicates.scala      | 196 ++++++++++++---------
 .../spark/sql/catalyst/optimizer/expressions.scala |   4 +-
 .../sql/catalyst/expressions/PredicateSuite.scala  |  35 ++--
 .../subquery/in-subquery/in-null-semantics.sql     |   3 +-
 .../subquery/in-subquery/in-null-semantics.sql.out |   4 +-
 .../scala/org/apache/spark/sql/EmptyInSuite.scala  |   3 +-
 6 files changed, 144 insertions(+), 101 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index ee2ba7c73d1..31b872e04ce 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -468,6 +468,8 @@ case class In(value: Expression, list: Seq[Expression]) 
extends Predicate {
   override def foldable: Boolean = children.forall(_.foldable)
 
   final override val nodePatterns: Seq[TreePattern] = Seq(IN)
+  private val legacyNullInEmptyBehavior =
+    SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)
 
   override lazy val canonicalized: Expression = {
     val basic = withNewChildren(children.map(_.canonicalized)).asInstanceOf[In]
@@ -481,88 +483,104 @@ case class In(value: Expression, list: Seq[Expression]) 
extends Predicate {
   override def toString: String = s"$value IN ${list.mkString("(", ",", ")")}"
 
   override def eval(input: InternalRow): Any = {
-    val evaluatedValue = value.eval(input)
-    if (evaluatedValue == null) {
-      null
+    if (list.isEmpty && !legacyNullInEmptyBehavior) {
+      // IN (empty list) is always false under current behavior.
+      // Under legacy behavior it's null if the left side is null, otherwise 
false (SPARK-44550).
+      false
     } else {
-      var hasNull = false
-      list.foreach { e =>
-        val v = e.eval(input)
-        if (v == null) {
-          hasNull = true
-        } else if (ordering.equiv(v, evaluatedValue)) {
-          return true
-        }
-      }
-      if (hasNull) {
+      val evaluatedValue = value.eval(input)
+      if (evaluatedValue == null) {
         null
       } else {
-        false
+        var hasNull = false
+        list.foreach { e =>
+          val v = e.eval(input)
+          if (v == null) {
+            hasNull = true
+          } else if (ordering.equiv(v, evaluatedValue)) {
+            return true
+          }
+        }
+        if (hasNull) {
+          null
+        } else {
+          false
+        }
       }
     }
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-    val javaDataType = CodeGenerator.javaType(value.dataType)
-    val valueGen = value.genCode(ctx)
-    val listGen = list.map(_.genCode(ctx))
-    // inTmpResult has 3 possible values:
-    // -1 means no matches found and there is at least one value in the list 
evaluated to null
-    val HAS_NULL = -1
-    // 0 means no matches found and all values in the list are not null
-    val NOT_MATCHED = 0
-    // 1 means one value in the list is matched
-    val MATCHED = 1
-    val tmpResult = ctx.freshName("inTmpResult")
-    val valueArg = ctx.freshName("valueArg")
-    // All the blocks are meant to be inside a do { ... } while (false); loop.
-    // The evaluation of variables can be stopped when we find a matching 
value.
-    val listCode = listGen.map(x =>
-      s"""
-         |${x.code}
-         |if (${x.isNull}) {
-         |  $tmpResult = $HAS_NULL; // ${ev.isNull} = true;
-         |} else if (${ctx.genEqual(value.dataType, valueArg, x.value)}) {
-         |  $tmpResult = $MATCHED; // ${ev.isNull} = false; ${ev.value} = true;
-         |  continue;
-         |}
+    if (list.isEmpty && !legacyNullInEmptyBehavior) {
+      // IN (empty list) is always false under current behavior.
+      // Under legacy behavior it's null if the left side is null, otherwise 
false (SPARK-44550).
+      ev.copy(code =
+        code"""
+              |final boolean ${ev.isNull} = false;
+              |final boolean ${ev.value} = false;
        """.stripMargin)
-
-    val codes = ctx.splitExpressionsWithCurrentInputs(
-      expressions = listCode,
-      funcName = "valueIn",
-      extraArguments = (javaDataType, valueArg) :: (CodeGenerator.JAVA_BYTE, 
tmpResult) :: Nil,
-      returnType = CodeGenerator.JAVA_BYTE,
-      makeSplitFunction = body =>
-        s"""
-           |do {
-           |  $body
-           |} while (false);
-           |return $tmpResult;
-         """.stripMargin,
-      foldFunctions = _.map { funcCall =>
+    } else {
+      val javaDataType = CodeGenerator.javaType(value.dataType)
+      val valueGen = value.genCode(ctx)
+      val listGen = list.map(_.genCode(ctx))
+      // inTmpResult has 3 possible values:
+      // -1 means no matches found and there is at least one value in the list 
evaluated to null
+      val HAS_NULL = -1
+      // 0 means no matches found and all values in the list are not null
+      val NOT_MATCHED = 0
+      // 1 means one value in the list is matched
+      val MATCHED = 1
+      val tmpResult = ctx.freshName("inTmpResult")
+      val valueArg = ctx.freshName("valueArg")
+      // All the blocks are meant to be inside a do { ... } while (false); 
loop.
+      // The evaluation of variables can be stopped when we find a matching 
value.
+      val listCode = listGen.map(x =>
         s"""
-           |$tmpResult = $funcCall;
-           |if ($tmpResult == $MATCHED) {
+           |${x.code}
+           |if (${x.isNull}) {
+           |  $tmpResult = $HAS_NULL; // ${ev.isNull} = true;
+           |} else if (${ctx.genEqual(value.dataType, valueArg, x.value)}) {
+           |  $tmpResult = $MATCHED; // ${ev.isNull} = false; ${ev.value} = 
true;
            |  continue;
            |}
-         """.stripMargin
-      }.mkString("\n"))
-
-    ev.copy(code =
-      code"""
-         |${valueGen.code}
-         |byte $tmpResult = $HAS_NULL;
-         |if (!${valueGen.isNull}) {
-         |  $tmpResult = $NOT_MATCHED;
-         |  $javaDataType $valueArg = ${valueGen.value};
-         |  do {
-         |    $codes
-         |  } while (false);
-         |}
-         |final boolean ${ev.isNull} = ($tmpResult == $HAS_NULL);
-         |final boolean ${ev.value} = ($tmpResult == $MATCHED);
-       """.stripMargin)
+         """.stripMargin)
+
+      val codes = ctx.splitExpressionsWithCurrentInputs(
+        expressions = listCode,
+        funcName = "valueIn",
+        extraArguments = (javaDataType, valueArg) :: (CodeGenerator.JAVA_BYTE, 
tmpResult) :: Nil,
+        returnType = CodeGenerator.JAVA_BYTE,
+        makeSplitFunction = body =>
+          s"""
+             |do {
+             |  $body
+             |} while (false);
+             |return $tmpResult;
+           """.stripMargin,
+        foldFunctions = _.map { funcCall =>
+          s"""
+             |$tmpResult = $funcCall;
+             |if ($tmpResult == $MATCHED) {
+             |  continue;
+             |}
+           """.stripMargin
+        }.mkString("\n"))
+
+      ev.copy(code =
+        code"""
+           |${valueGen.code}
+           |byte $tmpResult = $HAS_NULL;
+           |if (!${valueGen.isNull}) {
+           |  $tmpResult = $NOT_MATCHED;
+           |  $javaDataType $valueArg = ${valueGen.value};
+           |  do {
+           |    $codes
+           |  } while (false);
+           |}
+           |final boolean ${ev.isNull} = ($tmpResult == $HAS_NULL);
+           |final boolean ${ev.value} = ($tmpResult == $MATCHED);
+         """.stripMargin)
+    }
   }
 
   override def sql: String = {
@@ -607,16 +625,27 @@ case class InSet(child: Expression, hset: Set[Any]) 
extends UnaryExpression with
   override def nullable: Boolean = child.nullable || hasNull
 
   final override val nodePatterns: Seq[TreePattern] = Seq(INSET)
+  private val legacyNullInEmptyBehavior =
+    SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)
 
-  protected override def nullSafeEval(value: Any): Any = {
-    if (set.contains(value)) {
-      true
-    } else if (isNaN(value)) {
-      hasNaN
-    } else if (hasNull) {
-      null
-    } else {
+  override def eval(input: InternalRow): Any = {
+    if (hset.isEmpty && !legacyNullInEmptyBehavior) {
+      // IN (empty list) is always false under current behavior.
+      // Under legacy behavior it's null if the left side is null, otherwise 
false (SPARK-44550).
       false
+    } else {
+      val value = child.eval(input)
+      if (value == null) {
+        null
+      } else if (set.contains(value)) {
+        true
+      } else if (isNaN(value)) {
+        hasNaN
+      } else if (hasNull) {
+        null
+      } else {
+        false
+      }
     }
   }
 
@@ -629,7 +658,16 @@ case class InSet(child: Expression, hset: Set[Any]) 
extends UnaryExpression with
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-    if (canBeComputedUsingSwitch && hset.size <= 
SQLConf.get.optimizerInSetSwitchThreshold) {
+    if (hset.isEmpty && !legacyNullInEmptyBehavior) {
+      // IN (empty list) is always false under current behavior.
+      // Under legacy behavior it's null if the left side is null, otherwise 
false (SPARK-44550).
+      ev.copy(code =
+        code"""
+        ${CodeGenerator.JAVA_BOOLEAN} ${ev.value} = false;
+        ${CodeGenerator.JAVA_BOOLEAN} ${ev.isNull} = false;
+        """
+      )
+    } else if (canBeComputedUsingSwitch && hset.size <= 
SQLConf.get.optimizerInSetSwitchThreshold) {
       genCodeWithSwitch(ctx, ev)
     } else {
       genCodeWithSet(ctx, ev)
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
index 8cb560199c0..d4f0f72c935 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
@@ -284,11 +284,11 @@ object OptimizeIn extends Rule[LogicalPlan] {
     _.containsPattern(IN), ruleId) {
     case q: LogicalPlan => 
q.transformExpressionsDownWithPruning(_.containsPattern(IN), ruleId) {
       case In(v, list) if list.isEmpty =>
+        // IN (empty list) is always false under current behavior.
+        // Under legacy behavior it's null if the left side is null, otherwise 
false (SPARK-44550).
         if (!SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)) {
           FalseLiteral
         } else {
-          // Incorrect legacy behavior optimizes to null if the left side is 
null, and otherwise
-          // to false.
           If(IsNotNull(v), FalseLiteral, Literal(null, BooleanType))
         }
       case expr @ In(v, list) if expr.inSetConvertible =>
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
index 73cc9aca568..55e0dd21794 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
@@ -134,20 +134,27 @@ class PredicateSuite extends SparkFunSuite with 
ExpressionEvalHelper {
   }
 
   test("basic IN/INSET predicate test") {
-    checkInAndInSet(In(NonFoldableLiteral.create(null, IntegerType), 
Seq(Literal(1),
-      Literal(2))), null)
-    checkInAndInSet(In(NonFoldableLiteral.create(null, IntegerType),
-      Seq(NonFoldableLiteral.create(null, IntegerType))), null)
-    checkInAndInSet(In(NonFoldableLiteral.create(null, IntegerType), 
Seq.empty), null)
-    checkInAndInSet(In(Literal(1), Seq.empty), false)
-    checkInAndInSet(In(Literal(1), Seq(NonFoldableLiteral.create(null, 
IntegerType))), null)
-    checkInAndInSet(In(Literal(1), Seq(Literal(1), 
NonFoldableLiteral.create(null, IntegerType))),
-      true)
-    checkInAndInSet(In(Literal(2), Seq(Literal(1), 
NonFoldableLiteral.create(null, IntegerType))),
-      null)
-    checkInAndInSet(In(Literal(1), Seq(Literal(1), Literal(2))), true)
-    checkInAndInSet(In(Literal(2), Seq(Literal(1), Literal(2))), true)
-    checkInAndInSet(In(Literal(3), Seq(Literal(1), Literal(2))), false)
+    Seq(true, false).foreach { legacyNullInBehavior =>
+      withSQLConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR.key -> 
legacyNullInBehavior.toString) {
+        checkInAndInSet(In(NonFoldableLiteral.create(null, IntegerType), 
Seq(Literal(1),
+          Literal(2))), null)
+        checkInAndInSet(In(NonFoldableLiteral.create(null, IntegerType),
+          Seq(NonFoldableLiteral.create(null, IntegerType))), null)
+        checkInAndInSet(In(NonFoldableLiteral.create(null, IntegerType), 
Seq.empty),
+          expected = if (legacyNullInBehavior) null else false)
+        checkInAndInSet(In(Literal(1), Seq.empty), false)
+        checkInAndInSet(In(Literal(1), Seq(NonFoldableLiteral.create(null, 
IntegerType))), null)
+        checkInAndInSet(In(Literal(1),
+          Seq(Literal(1), NonFoldableLiteral.create(null, IntegerType))),
+          true)
+        checkInAndInSet(In(Literal(2),
+          Seq(Literal(1), NonFoldableLiteral.create(null, IntegerType))),
+          null)
+        checkInAndInSet(In(Literal(1), Seq(Literal(1), Literal(2))), true)
+        checkInAndInSet(In(Literal(2), Seq(Literal(1), Literal(2))), true)
+        checkInAndInSet(In(Literal(3), Seq(Literal(1), Literal(2))), false)
+      }
+    }
 
     checkEvaluation(
       And(In(Literal(1), Seq(Literal(1), Literal(2))), In(Literal(2), 
Seq(Literal(1),
diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-null-semantics.sql
 
b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-null-semantics.sql
index b893d8970b4..cc01887a4e2 100644
--- 
a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-null-semantics.sql
+++ 
b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-null-semantics.sql
@@ -1,7 +1,7 @@
 create temp view v (c) as values (1), (null);
 create temp view v_empty (e) as select 1 where false;
 
--- Note: tables and temp views hit different optimization/execution codepaths
+-- Note: tables and temp views hit different optimization/execution codepaths: 
expressions over temp views are evaled at query compilation time by 
ConvertToLocalRelation
 create table t(c int) using json;
 insert into t values (1), (null);
 create table t2(d int) using json;
@@ -29,7 +29,6 @@ select null not in (select e from v_empty);
 
 -- IN subquery which is not rewritten to join - here we use IN in the ON 
condition because that is a case that doesn't get rewritten to join in 
RewritePredicateSubquery, so we can observe the execution behavior of 
InSubquery directly
 -- Correct results: column t2.d should be NULL because the ON condition is 
always false
--- This will be fixed by the execution fixes.
 select * from t left join t2 on (t.c in (select e from t_empty)) is null;
 select * from t left join t2 on (t.c not in (select e from t_empty)) is null;
 
diff --git 
a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-null-semantics.sql.out
 
b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-null-semantics.sql.out
index 39b03576baa..169b49fda84 100644
--- 
a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-null-semantics.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-null-semantics.sql.out
@@ -137,7 +137,7 @@ select * from t left join t2 on (t.c in (select e from 
t_empty)) is null
 struct<c:int,d:int>
 -- !query output
 1      NULL
-NULL   2
+NULL   NULL
 
 
 -- !query
@@ -146,7 +146,7 @@ select * from t left join t2 on (t.c not in (select e from 
t_empty)) is null
 struct<c:int,d:int>
 -- !query output
 1      NULL
-NULL   2
+NULL   NULL
 
 
 -- !query
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/EmptyInSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/EmptyInSuite.scala
index c9e016c891e..1534aba28c4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/EmptyInSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/EmptyInSuite.scala
@@ -49,9 +49,8 @@ with SharedSparkSession {
         withSQLConf(
           SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> excludedRules,
           SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR.key -> 
legacyNullInBehavior.toString) {
-          // We still get legacy behavior with disableOptimizeIn until 
execution is also fixed
           val expectedResultForNullInEmpty =
-            if (legacyNullInBehavior || disableOptimizeIn) null else false
+            if (legacyNullInBehavior) null else false
           val df = t.select(col("a"), col("a").isin(emptylist: _*))
           checkAnswer(
             df,


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to