This is an automated email from the ASF dual-hosted git repository.
peter-toth pushed a commit to branch branch-4.x
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.x by this push:
new 3ac019a6abe2 [SPARK-56840][SQL] Avoid unresolved NullIf type lookup
3ac019a6abe2 is described below
commit 3ac019a6abe22a1ff9127b0fd519034d28e50ab5
Author: Chao Sun <[email protected]>
AuthorDate: Thu May 14 11:18:47 2026 +0200
[SPARK-56840][SQL] Avoid unresolved NullIf type lookup
### Why are the changes needed?
`NULLIF` builds its replacement expression before analysis has resolved all
child expressions.
For nested field references, the existing implementation can read the left
operand's data type too early while constructing the null branch, which can
fail analysis even though the SQL shape is valid.
SPARK-56840 tracks this analyzer failure.
### What changes were proposed in this PR?
- Build the `NULLIF` null branch with a lazy typed-null placeholder so
  construction does not eagerly read the unresolved left operand type, while
  `NullIf.replacement.dataType` remains valid once the operand type is
  available.
- Make that placeholder `RuntimeReplaceable`, so `ReplaceExpressions`
  restores an ordinary typed `Literal(null, ...)` before later optimizer
  rules run and existing null-literal simplifications continue to apply.
- Add focused regressions for:
  - nested struct-field `nullif(c.provider, lower(...))` analysis in both
    `ALWAYS_INLINE_COMMON_EXPR` modes;
  - `NullIf` replacement type reporting before type coercion;
  - optimizer replacement back to a normal null literal;
  - explain output avoiding exposure of the internal helper name.
### Does this PR introduce _any_ user-facing change?
Yes. Valid `NULLIF` expressions over unresolved nested field references
that previously could fail during analysis now resolve and execute
successfully.
### How was this patch tested?
- `build/sbt 'catalyst/testOnly org.apache.spark.sql.catalyst.expressions.NullExpressionsSuite -- -z "NullIf replacement preserves its data type before type coercion"'`
- `build/sbt 'catalyst/testOnly org.apache.spark.sql.catalyst.optimizer.OptimizerSuite -- -z "NullIf typed null branch is replaced with a null literal"'`
- `build/sbt 'sql/testOnly org.apache.spark.sql.DataFrameFunctionsSuite -- -z "nullif function"'`
- `build/sbt 'sql/testOnly org.apache.spark.sql.ExplainSuite -- -z "explain for these functions; use range to avoid constant folding"'`
### Was this patch authored or co-authored using generative AI tooling?
Generated-by: Codex (GPT-5.5)
Closes #55838 from sunchao/dev/chao/codex/oss-nullif-unresolved.
Authored-by: Chao Sun <[email protected]>
Signed-off-by: Peter Toth <[email protected]>
(cherry picked from commit 5949ab30b41860574ab57b94a8848464b5e127a7)
Signed-off-by: Peter Toth <[email protected]>
---
.../sql/catalyst/expressions/nullExpressions.scala | 19 +++++++++++++++--
.../expressions/NullExpressionsSuite.scala | 10 +++++++++
.../sql/catalyst/optimizer/OptimizerSuite.scala | 24 ++++++++++++++++++++--
.../apache/spark/sql/DataFrameFunctionsSuite.scala | 7 +++++++
.../scala/org/apache/spark/sql/ExplainSuite.scala | 1 +
5 files changed, 57 insertions(+), 4 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
index 1aa1d0b25e44..e7be588c4b46 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
@@ -145,6 +145,21 @@ case class Coalesce(children: Seq[Expression])
copy(children = newChildren)
}
+private case class TypedNullLiteral(child: Expression)
+ extends UnaryExpression with RuntimeReplaceable {
+ override def nullable: Boolean = true
+
+ override def dataType: DataType = child.dataType
+
+ override def toString: String = "null"
+
+ override def sql: String = "NULL"
+
+ override lazy val replacement: Expression = Literal.create(null,
child.dataType)
+
+ override protected def withNewChildInternal(newChild: Expression):
TypedNullLiteral =
+ copy(child = newChild)
+}
@ExpressionDescription(
usage = "_FUNC_(expr1, expr2) - Returns null if `expr1` equals to `expr2`,
or `expr1` otherwise.",
@@ -162,10 +177,10 @@ case class NullIf(left: Expression, right: Expression,
replacement: Expression)
this(left, right,
if (!SQLConf.get.getConf(SQLConf.ALWAYS_INLINE_COMMON_EXPR)) {
With(left) { case Seq(ref) =>
- If(EqualTo(ref, right), Literal.create(null, left.dataType), ref)
+ If(EqualTo(ref, right), TypedNullLiteral(ref), ref)
}
} else {
- If(EqualTo(left, right), Literal.create(null, left.dataType), left)
+ If(EqualTo(left, right), TypedNullLiteral(left), left)
}
)
}
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
index c74a9e35833d..bb4aed9b4002 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
@@ -143,6 +143,16 @@ class NullExpressionsSuite extends SparkFunSuite with
ExpressionEvalHelper {
assert(analyze(new Nvl(floatLit, doubleLit)).dataType == DoubleType)
}
+ test("NullIf replacement preserves its data type before type coercion") {
+ Seq(true, false).foreach { alwaysInlineCommonExpr =>
+ withSQLConf(SQLConf.ALWAYS_INLINE_COMMON_EXPR.key ->
alwaysInlineCommonExpr.toString) {
+ val nullIf = new NullIf(Literal(1), Literal(1))
+ assert(nullIf.dataType == IntegerType)
+ assert(nullIf.replacement.dataType == IntegerType)
+ }
+ }
+ }
+
test("AtLeastNNonNulls") {
val mix = Seq(Literal("x"),
Literal.create(null, StringType),
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
index 70a2ae94109f..057e4ceaf0a0 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
@@ -21,13 +21,13 @@ import org.apache.spark.SparkException
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
-import org.apache.spark.sql.catalyst.expressions.{Add, Alias, ArrayCompact,
AttributeReference, CreateArray, CreateStruct, IntegerLiteral, Literal,
MapFromEntries, Multiply, NamedExpression, Remainder}
+import org.apache.spark.sql.catalyst.expressions.{Add, Alias, ArrayCompact,
AttributeReference, CreateArray, CreateStruct, IntegerLiteral, Literal,
MapFromEntries, Multiply, NamedExpression, NullIf, Remainder,
RuntimeReplaceable}
import org.apache.spark.sql.catalyst.expressions.aggregate.Sum
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LocalRelation,
LogicalPlan, OneRowRelation, Project}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{ArrayType, IntegerType, MapType,
StructField, StructType}
+import org.apache.spark.sql.types.{ArrayType, BooleanType, IntegerType,
MapType, StructField, StructType}
/**
* A dummy optimizer rule for testing that decrements integer literals until 0.
@@ -334,4 +334,24 @@ class OptimizerSuite extends PlanTest {
assert(optimized2.schema ===
StructType(StructField("map", MapType(IntegerType, IntegerType, false),
false) :: Nil))
}
+
+ test("NullIf typed null branch is replaced with a null literal") {
+ val optimizer = new SimpleTestOptimizer() {
+ override def defaultBatches: Seq[Batch] =
+ Batch("test", fixedPoint,
+ ReplaceExpressions) :: Nil
+ }
+
+ withSQLConf(SQLConf.ALWAYS_INLINE_COMMON_EXPR.key -> "true") {
+ val nullIf = new NullIf(Literal(true), Literal(true))
+ val plan = Project(Alias(nullIf, "out")() :: Nil,
OneRowRelation()).analyze
+ val optimized = optimizer.execute(plan)
+
+ assert(optimized.expressions.exists(_.exists {
+ case Literal(null, BooleanType) => true
+ case _ => false
+ }))
+
assert(optimized.expressions.forall(!_.exists(_.isInstanceOf[RuntimeReplaceable])))
+ }
+ }
}
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index a0d9d2e9f40d..7faccbde997d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -350,6 +350,13 @@ class DataFrameFunctionsSuite extends SharedSparkSession {
"expression" -> "\"id\"",
"expressionAnyValue" -> "\"any_value(id)\"")
)
+
+ val nestedDf = Seq("error_multiple_providers", "openai")
+ .toDF("provider")
+ .select(struct(col("provider")).as("c"))
+ checkAnswer(
+ nestedDf.select(nullif(col("c.provider"),
lower(lit("ERROR_MULTIPLE_PROVIDERS")))),
+ Seq(Row(null), Row("openai")))
}
}
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
index 04be1e8fcfba..af52204dbb7f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
@@ -251,6 +251,7 @@ class ExplainSuite extends ExplainSuiteHelper with
DisableAdaptiveExecutionSuite
checkKeywordsExistsInExplain(df,
"Project [id#xL AS ifnull(id, 1)#xL, if ((id#xL = 1)) null " +
"else id#xL AS nullif(id, 1)#xL, id#xL AS nvl(id, 1)#xL, 1 AS nvl2(id,
1, 2)#x]")
+ checkKeywordsNotExistsInExplain(df, ExtendedMode, "typednullliteral")
}
test("SPARK-26659: explain of DataWritingCommandExec should not contain
duplicate cmd.nodeName") {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]