(spark) branch branch-4.1 updated: [SPARK-56840][SQL] Avoid unresolved NullIf type lookup

ptoth Thu, 14 May 2026 02:32:14 -0700

This is an automated email from the ASF dual-hosted git repository.

peter-toth pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new 1d8d3d3c275b [SPARK-56840][SQL] Avoid unresolved NullIf type lookup
1d8d3d3c275b is described below

commit 1d8d3d3c275bade3649f29b154d990238ed34a45
Author: Chao Sun <[email protected]>
AuthorDate: Thu May 14 11:18:47 2026 +0200

    [SPARK-56840][SQL] Avoid unresolved NullIf type lookup
    
    ### Why are the changes needed?
    
    `NULLIF` builds its replacement expression before analysis has resolved all child expressions.
    For nested field references, the existing implementation can read the left operand's data type
    too early while constructing the null branch, which can fail analysis even though the SQL shape
    is valid.
    
    SPARK-56840 tracks this analyzer failure.
    
    ### What changes were proposed in this PR?
    
    - Build the `NULLIF` null branch with a lazy typed-null placeholder so construction does not eagerly
      read the unresolved left operand type, while `NullIf.replacement.dataType` remains valid once the
      operand type is available.
    - Make that placeholder `RuntimeReplaceable`, so `ReplaceExpressions` restores an ordinary typed
      `Literal(null, ...)` before later optimizer rules run and existing null-literal simplifications
      continue to apply.
    - Add focused regressions for:
      - nested struct-field `nullif(c.provider, lower(...))` analysis in both
        `ALWAYS_INLINE_COMMON_EXPR` modes;
      - `NullIf` replacement type reporting before type coercion;
      - optimizer replacement back to a normal null literal;
      - explain output avoiding exposure of the internal helper name.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes. Valid `NULLIF` expressions over unresolved nested field references that previously could fail
    during analysis now resolve and execute successfully.
    
    ### How was this patch tested?
    
    - `build/sbt 'catalyst/testOnly org.apache.spark.sql.catalyst.expressions.NullExpressionsSuite -- -z "NullIf replacement preserves its data type before type coercion"'`
    - `build/sbt 'catalyst/testOnly org.apache.spark.sql.catalyst.optimizer.OptimizerSuite -- -z "NullIf typed null branch is replaced with a null literal"'`
    - `build/sbt 'sql/testOnly org.apache.spark.sql.DataFrameFunctionsSuite -- -z "nullif function"'`
    - `build/sbt 'sql/testOnly org.apache.spark.sql.ExplainSuite -- -z "explain for these functions; use range to avoid constant folding"'`
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Generated-by: Codex (GPT-5.5)
    
    Closes #55838 from sunchao/dev/chao/codex/oss-nullif-unresolved.
    
    Authored-by: Chao Sun <[email protected]>
    Signed-off-by: Peter Toth <[email protected]>
    (cherry picked from commit 5949ab30b41860574ab57b94a8848464b5e127a7)
    Signed-off-by: Peter Toth <[email protected]>
---
 .../sql/catalyst/expressions/nullExpressions.scala | 19 +++++++++++++++--
 .../expressions/NullExpressionsSuite.scala         | 10 +++++++++
 .../sql/catalyst/optimizer/OptimizerSuite.scala    | 24 ++++++++++++++++++++--
 .../apache/spark/sql/DataFrameFunctionsSuite.scala |  7 +++++++
 .../scala/org/apache/spark/sql/ExplainSuite.scala  |  1 +
 5 files changed, 57 insertions(+), 4 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
index 1aa1d0b25e44..e7be588c4b46 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
@@ -145,6 +145,21 @@ case class Coalesce(children: Seq[Expression])
     copy(children = newChildren)
 }
 
+private case class TypedNullLiteral(child: Expression)
+    extends UnaryExpression with RuntimeReplaceable {
+  override def nullable: Boolean = true
+
+  override def dataType: DataType = child.dataType
+
+  override def toString: String = "null"
+
+  override def sql: String = "NULL"
+
+  override lazy val replacement: Expression = Literal.create(null, 
child.dataType)
+
+  override protected def withNewChildInternal(newChild: Expression): 
TypedNullLiteral =
+    copy(child = newChild)
+}
 
 @ExpressionDescription(
   usage = "_FUNC_(expr1, expr2) - Returns null if `expr1` equals to `expr2`, 
or `expr1` otherwise.",
@@ -162,10 +177,10 @@ case class NullIf(left: Expression, right: Expression, 
replacement: Expression)
     this(left, right,
       if (!SQLConf.get.getConf(SQLConf.ALWAYS_INLINE_COMMON_EXPR)) {
         With(left) { case Seq(ref) =>
-          If(EqualTo(ref, right), Literal.create(null, left.dataType), ref)
+          If(EqualTo(ref, right), TypedNullLiteral(ref), ref)
         }
       } else {
-        If(EqualTo(left, right), Literal.create(null, left.dataType), left)
+        If(EqualTo(left, right), TypedNullLiteral(left), left)
       }
     )
   }
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
index c74a9e35833d..bb4aed9b4002 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
@@ -143,6 +143,16 @@ class NullExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper {
     assert(analyze(new Nvl(floatLit, doubleLit)).dataType == DoubleType)
   }
 
+  test("NullIf replacement preserves its data type before type coercion") {
+    Seq(true, false).foreach { alwaysInlineCommonExpr =>
+      withSQLConf(SQLConf.ALWAYS_INLINE_COMMON_EXPR.key -> 
alwaysInlineCommonExpr.toString) {
+        val nullIf = new NullIf(Literal(1), Literal(1))
+        assert(nullIf.dataType == IntegerType)
+        assert(nullIf.replacement.dataType == IntegerType)
+      }
+    }
+  }
+
   test("AtLeastNNonNulls") {
     val mix = Seq(Literal("x"),
       Literal.create(null, StringType),
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
index 70a2ae94109f..057e4ceaf0a0 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
@@ -21,13 +21,13 @@ import org.apache.spark.SparkException
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
-import org.apache.spark.sql.catalyst.expressions.{Add, Alias, ArrayCompact, 
AttributeReference, CreateArray, CreateStruct, IntegerLiteral, Literal, 
MapFromEntries, Multiply, NamedExpression, Remainder}
+import org.apache.spark.sql.catalyst.expressions.{Add, Alias, ArrayCompact, 
AttributeReference, CreateArray, CreateStruct, IntegerLiteral, Literal, 
MapFromEntries, Multiply, NamedExpression, NullIf, Remainder, 
RuntimeReplaceable}
 import org.apache.spark.sql.catalyst.expressions.aggregate.Sum
 import org.apache.spark.sql.catalyst.plans.PlanTest
 import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LocalRelation, 
LogicalPlan, OneRowRelation, Project}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{ArrayType, IntegerType, MapType, 
StructField, StructType}
+import org.apache.spark.sql.types.{ArrayType, BooleanType, IntegerType, 
MapType, StructField, StructType}
 
 /**
  * A dummy optimizer rule for testing that decrements integer literals until 0.
@@ -334,4 +334,24 @@ class OptimizerSuite extends PlanTest {
     assert(optimized2.schema ===
       StructType(StructField("map", MapType(IntegerType, IntegerType, false), 
false) :: Nil))
   }
+
+  test("NullIf typed null branch is replaced with a null literal") {
+    val optimizer = new SimpleTestOptimizer() {
+      override def defaultBatches: Seq[Batch] =
+        Batch("test", fixedPoint,
+          ReplaceExpressions) :: Nil
+    }
+
+    withSQLConf(SQLConf.ALWAYS_INLINE_COMMON_EXPR.key -> "true") {
+      val nullIf = new NullIf(Literal(true), Literal(true))
+      val plan = Project(Alias(nullIf, "out")() :: Nil, 
OneRowRelation()).analyze
+      val optimized = optimizer.execute(plan)
+
+      assert(optimized.expressions.exists(_.exists {
+        case Literal(null, BooleanType) => true
+        case _ => false
+      }))
+      
assert(optimized.expressions.forall(!_.exists(_.isInstanceOf[RuntimeReplaceable])))
+    }
+  }
 }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index fc6d3023ed07..d4a7801b395f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -350,6 +350,13 @@ class DataFrameFunctionsSuite extends QueryTest with 
SharedSparkSession {
            "expression" -> "\"id\"",
            "expressionAnyValue" -> "\"any_value(id)\"")
         )
+
+        val nestedDf = Seq("error_multiple_providers", "openai")
+          .toDF("provider")
+          .select(struct(col("provider")).as("c"))
+        checkAnswer(
+          nestedDf.select(nullif(col("c.provider"), 
lower(lit("ERROR_MULTIPLE_PROVIDERS")))),
+          Seq(Row(null), Row("openai")))
       }
     }
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
index b27122a8de2b..4c62c47971ad 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
@@ -251,6 +251,7 @@ class ExplainSuite extends ExplainSuiteHelper with 
DisableAdaptiveExecutionSuite
     checkKeywordsExistsInExplain(df,
       "Project [id#xL AS ifnull(id, 1)#xL, if ((id#xL = 1)) null " +
         "else id#xL AS nullif(id, 1)#xL, id#xL AS nvl(id, 1)#xL, 1 AS nvl2(id, 
1, 2)#x]")
+    checkKeywordsNotExistsInExplain(df, ExtendedMode, "typednullliteral")
   }
 
   test("SPARK-26659: explain of DataWritingCommandExec should not contain 
duplicate cmd.nodeName") {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch branch-4.1 updated: [SPARK-56840][SQL] Avoid unresolved NullIf type lookup

Reply via email to