This is an automated email from the ASF dual-hosted git repository.
cloud-fan pushed a commit to branch branch-4.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.2 by this push:
new c259da2ed506 [SPARK-48091][SQL] Preserve aliases inside lambda when
ExtractGenerator restructures plan
c259da2ed506 is described below
commit c259da2ed506f24c9bfe4d0f7c2d50a882b90ee6
Author: Shrirang Mhalgi <[email protected]>
AuthorDate: Fri May 22 18:37:37 2026 +0800
[SPARK-48091][SQL] Preserve aliases inside lambda when ExtractGenerator
restructures plan
### What changes were proposed in this pull request?
Fix `ExtractGenerator` to preserve aliases inside lambda functions when
restructuring the plan.
Previously, `ExtractGenerator` called `trimNonTopLevelAliases` on all
expressions in the project list before extracting the generator. This stripped
aliases inside lambda functions (e.g., `struct(x.as("data"))`) before
`CreateStruct` could resolve them into struct field names.
The fix in this commit lives in `AliasHelper.trimAliases`: it no longer
descends into `UnresolvedFunction` nodes, so aliases nested in a
not-yet-resolved call such as `struct(x.as("data"))` are kept until function
resolution, where `CreateStruct` turns them into struct field names.
### Why are the changes needed?
When `explode` is used together with `transform` in the same `select`, aliases
used inside the transformed column's `struct()` are ignored: field names become
auto-generated (x_1, x_2) instead of the user-specified alias. This only
happens with the DataFrame/Dataset API, not with SQL.
### Does this PR introduce _any_ user-facing change?
Yes. Struct field aliases inside transform lambdas are now correctly
preserved when explode (or any generator) is in the same `select`.
### How was this patch tested?
Added a test in `GeneratorFunctionSuite` verifying that struct field
aliases are preserved when explode and transform are used together, including
single and multiple aliases.
### Was this patch authored or co-authored using generative AI tooling?
Yes.
Closes #55892 from shrirangmhalgi/SPARK-48091-explode-transform-alias.
Authored-by: Shrirang Mhalgi <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
(cherry picked from commit ccdb31acfae01000601a2ba70bf5168704c04e78)
Signed-off-by: Wenchen Fan <[email protected]>
---
.../sql/catalyst/expressions/AliasHelper.scala | 6 +++-
.../apache/spark/sql/GeneratorFunctionSuite.scala | 33 +++++++++++++++++++++-
2 files changed, 37 insertions(+), 2 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala
index 2340385dcdd6..f1cb20ca4061 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
import scala.annotation.tailrec
-import org.apache.spark.sql.catalyst.analysis.MultiAlias
+import org.apache.spark.sql.catalyst.analysis.{MultiAlias, UnresolvedFunction}
import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Project}
import org.apache.spark.sql.catalyst.trees.CurrentOrigin
@@ -112,6 +112,10 @@ trait AliasHelper {
}
protected def trimAliases(e: Expression): Expression = e match {
+ // SPARK-48091: Do not descend into unresolved function calls. Aliases
inside them
+ // (e.g., UnresolvedFunction("struct", Seq(Alias(x, "data")))) carry
semantic information
+ // that ResolveFunctions -> CreateStruct.apply consumes to produce field
names.
+ case u: UnresolvedFunction => u
// The children of `CreateNamedStruct` may use `Alias` to carry metadata
and we should not
// trim them.
case c: CreateNamedStruct => c.mapChildren {
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala
index 015ea9defae9..58f399bf797f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.trees.LeafLike
import org.apache.spark.sql.functions._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession
-import org.apache.spark.sql.types.{IntegerType, StructType}
+import org.apache.spark.sql.types.{ArrayType, IntegerType, StructType}
class GeneratorFunctionSuite extends SharedSparkSession {
import testImplicits._
@@ -765,6 +765,37 @@ class GeneratorFunctionSuite extends SharedSparkSession {
Seq(Row(0, 10, 0, 10), Row(1, 20, 1, 20))
)
}
+
+ test("SPARK-48091: explode with transform should preserve struct field
aliases") {
+ val df = spark.createDataFrame(Seq((1, Array(1, 2, 3), Array(4, 5, 6))))
+ .toDF("id", "my_array", "my_array2")
+
+ // Without explode - aliases should work (baseline)
+ val good = df.select(
+ transform(col("my_array2"), x => struct(x.as("data"))).as("my_struct")
+ )
+ assert(good.schema("my_struct").dataType.asInstanceOf[ArrayType]
+ .elementType.asInstanceOf[StructType].fieldNames.toSeq === Seq("data"))
+
+ // With explode in same select - aliases should still be preserved
+ val result = df.select(
+ explode(col("my_array")).as("exploded"),
+ transform(col("my_array2"), x => struct(x.as("data"))).as("my_struct")
+ )
+ assert(result.schema("my_struct").dataType.asInstanceOf[ArrayType]
+ .elementType.asInstanceOf[StructType].fieldNames.toSeq === Seq("data"))
+
+ // Multiple aliases inside struct
+ val result2 = df.select(
+ explode(col("my_array")).as("exploded"),
+ transform(col("my_array2"),
+ x => struct(x.as("value"), col("id").as("key"))
+ ).as("my_struct")
+ )
+ val fields2 = result2.schema("my_struct").dataType.asInstanceOf[ArrayType]
+ .elementType.asInstanceOf[StructType].fieldNames.toSeq
+ assert(fields2 === Seq("value", "key"))
+ }
}
case class EmptyGenerator() extends Generator with LeafLike[Expression] {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]