[
https://issues.apache.org/jira/browse/SPARK-30855?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Benoit Roy updated SPARK-30855:
-------------------------------
Description:
An exception occurs when using a _star expand_ selection ({{col.*}}) after
performing an {{explode}} on an array of structs.
I am testing this on the 3.0.0-preview2 release of Spark.
Here's a public repo containing a very simple Scala test case that reproduces
the issue:
{code:java}
git clone git@github.com:benoitroy/spark-30855.git{code}
Simply execute the *Spark30855Tests* class.
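Assuming the repo builds with sbt (the build tool isn't stated above, so treat the exact invocation as a sketch), the suite can be run from the repo root with:
{code:java}
sbt "testOnly *Tests"{code}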
On a simple schema such as:
{code:java}
root
|-- k1: string (nullable = true)
|-- k2: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- k2.k1: struct (nullable = true)
| | | |-- k2.k1.k1: string (nullable = true)
| | | |-- k2.k1.k2: string (nullable = true)
| | |-- k2.k2: string (nullable = true) {code}
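For reference, a single JSON record like the following (values are illustrative; the actual data.json lives in the repo) produces this schema when read with {{spark.read.json}}:
{code:java}
{"k1": "v1", "k2": [{"k2.k1": {"k2.k1.k1": "a", "k2.k1.k2": "b"}, "k2.k2": "c"}]}{code}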
The following test case will fail on the 'col.*' selection.
{code:java}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.scalatest.funsuite.AnyFunSuite

class Spark38055Tests extends AnyFunSuite {

  test("star expand after explode on an array of structs") {
    // Path to the JSON test data checked into the repo.
    val path = "src/test/data/json/data.json"
    // Local SparkSession for the test.
    val spark = SparkSession
      .builder()
      .appName("Testing.")
      .config("spark.master", "local")
      .getOrCreate()

    val df = spark.read.json(path)
    // SUCCESS!
    df.printSchema()
    // SUCCESS!
    df.select(explode(col("k2"))).show()
    // SUCCESS!
    df.select(explode(col("k2"))).select("col.*").printSchema()
    // FAIL!
    df.select(explode(col("k2"))).select("col.*").show()
  }
} {code}
The test class demonstrates two cases: one that fails (as shown above) and one
that succeeds. The two cases differ only slightly in schema. The succeeding
case works on the following schema:
{code:java}
root
|-- k1: string (nullable = true)
|-- k2: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- k2.k1: struct (nullable = true)
| | | |-- k2.k1.k1: string (nullable = true)
| | |-- k2.k2: string (nullable = true) {code}
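Again for reference, a single JSON record like the following (values are illustrative, not taken from the repo) yields this second schema:
{code:java}
{"k1": "v1", "k2": [{"k2.k1": {"k2.k1.k1": "a"}, "k2.k2": "c"}]}{code}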
You will notice that this schema simply removes one field ('k2.k1.k2') from
the nested struct 'k2.k1'.
The stack trace produced by the failing case is shown below:
{code:java}
org.apache.spark.sql.catalyst.errors.package$TreeNodeException: Binding attribute, tree: _gen_alias_23#23
    at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
    at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:75)
    at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:74)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDown$1(TreeNode.scala:286)
    at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:72)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:286)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDown$3(TreeNode.scala:291)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$mapChildren$1(TreeNode.scala:376)
    at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:214)
    at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:374)
    at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:327)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:291)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDown$3(TreeNode.scala:291)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$mapChildren$1(TreeNode.scala:376)
    at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:214)
    at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:374)
    at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:327)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:291)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:275)
    at org.apache.spark.sql.catalyst.expressions.BindReferences$.bindReference(BoundAttribute.scala:74)
    at org.apache.spark.sql.catalyst.expressions.BindReferences$.$anonfun$bindReferences$1(BoundAttribute.scala:96)
    at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:237)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at scala.collection.TraversableLike.map(TraversableLike.scala:237)
    at scala.collection.TraversableLike.map$(TraversableLike.scala:230)
    at scala.collection.AbstractTraversable.map(Traversable.scala:108)
    at org.apache.spark.sql.catalyst.expressions.BindReferences$.bindReferences(BoundAttribute.scala:96)
    at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:63)
    at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:193)
    at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:148)
    at org.apache.spark.sql.execution.InputAdapter.consume(WholeStageCodegenExec.scala:495)
    at org.apache.spark.sql.execution.InputRDDCodegen.doProduce(WholeStageCodegenExec.scala:482)
    at org.apache.spark.sql.execution.InputRDDCodegen.doProduce$(WholeStageCodegenExec.scala:455)
    at org.apache.spark.sql.execution.InputAdapter.doProduce(WholeStageCodegenExec.scala:495)
    at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:94)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:211)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:208)
    at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:89)
    at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:89)
    at org.apache.spark.sql.execution.InputAdapter.produce(WholeStageCodegenExec.scala:495)
    at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:49)
    at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:94)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:211)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:208)
    at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:89)
    at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:89)
    at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:39)
    at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:629)
    at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:689)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:173)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:211)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:208)
    at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:169)
    at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:313)
    at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:405)
    at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
    at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3482)
    at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2581)
    at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3472)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$4(SQLExecution.scala:100)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:87)
    at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3468)
    at org.apache.spark.sql.Dataset.head(Dataset.scala:2581)
    at org.apache.spark.sql.Dataset.take(Dataset.scala:2788)
    at org.apache.spark.sql.Dataset.getRows(Dataset.scala:297)
    at org.apache.spark.sql.Dataset.showString(Dataset.scala:334)
    at org.apache.spark.sql.Dataset.show(Dataset.scala:816)
    at org.apache.spark.sql.Dataset.show(Dataset.scala:775)
    at org.apache.spark.sql.Dataset.show(Dataset.scala:784)
    at Spark38055Tests.$anonfun$new$1(Spark38055Tests.scala:21)
    at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
    at org.scalatest.OutcomeOf.outcomeOf(OutcomeOf.scala:85)
    at org.scalatest.OutcomeOf.outcomeOf$(OutcomeOf.scala:83)
    at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104)
    at org.scalatest.Transformer.apply(Transformer.scala:22)
    at org.scalatest.Transformer.apply(Transformer.scala:20)
    at org.scalatest.funsuite.AnyFunSuiteLike$$anon$1.apply(AnyFunSuiteLike.scala:189)
    at org.scalatest.TestSuite.withFixture(TestSuite.scala:196)
    at org.scalatest.TestSuite.withFixture$(TestSuite.scala:195)
    at org.scalatest.funsuite.AnyFunSuite.withFixture(AnyFunSuite.scala:1562)
    at org.scalatest.funsuite.AnyFunSuiteLike.invokeWithFixture$1(AnyFunSuiteLike.scala:187)
    at org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$runTest$1(AnyFunSuiteLike.scala:199)
    at org.scalatest.SuperEngine.runTestImpl(Engine.scala:306)
    at org.scalatest.funsuite.AnyFunSuiteLike.runTest(AnyFunSuiteLike.scala:199)
    at org.scalatest.funsuite.AnyFunSuiteLike.runTest$(AnyFunSuiteLike.scala:181)
    at org.scalatest.funsuite.AnyFunSuite.runTest(AnyFunSuite.scala:1562)
    at org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$runTests$1(AnyFunSuiteLike.scala:232)
    at org.scalatest.SuperEngine.$anonfun$runTestsInBranch$1(Engine.scala:413)
    at scala.collection.immutable.List.foreach(List.scala:392)
    at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401)
    at org.scalatest.SuperEngine.runTestsInBranch(Engine.scala:396)
    at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:475)
    at org.scalatest.funsuite.AnyFunSuiteLike.runTests(AnyFunSuiteLike.scala:232)
    at org.scalatest.funsuite.AnyFunSuiteLike.runTests$(AnyFunSuiteLike.scala:231)
    at org.scalatest.funsuite.AnyFunSuite.runTests(AnyFunSuite.scala:1562)
    at org.scalatest.Suite.run(Suite.scala:1112)
    at org.scalatest.Suite.run$(Suite.scala:1094)
    at org.scalatest.funsuite.AnyFunSuite.org$scalatest$funsuite$AnyFunSuiteLike$$super$run(AnyFunSuite.scala:1562)
    at org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$run$1(AnyFunSuiteLike.scala:236)
    at org.scalatest.SuperEngine.runImpl(Engine.scala:535)
    at org.scalatest.funsuite.AnyFunSuiteLike.run(AnyFunSuiteLike.scala:236)
    at org.scalatest.funsuite.AnyFunSuiteLike.run$(AnyFunSuiteLike.scala:235)
    at org.scalatest.funsuite.AnyFunSuite.run(AnyFunSuite.scala:1562)
    at org.scalatest.tools.SuiteRunner.run(SuiteRunner.scala:45)
    at org.scalatest.tools.Runner$.$anonfun$doRunRunRunDaDoRunRun$13(Runner.scala:1314)
    at org.scalatest.tools.Runner$.$anonfun$doRunRunRunDaDoRunRun$13$adapted(Runner.scala:1308)
    at scala.collection.immutable.List.foreach(List.scala:392)
    at org.scalatest.tools.Runner$.doRunRunRunDaDoRunRun(Runner.scala:1308)
    at org.scalatest.tools.Runner$.$anonfun$runOptionallyWithPassFailReporter$24(Runner.scala:993)
    at org.scalatest.tools.Runner$.$anonfun$runOptionallyWithPassFailReporter$24$adapted(Runner.scala:971)
    at org.scalatest.tools.Runner$.withClassLoaderAndDispatchReporter(Runner.scala:1474)
    at org.scalatest.tools.Runner$.runOptionallyWithPassFailReporter(Runner.scala:971)
    at org.scalatest.tools.Runner$.run(Runner.scala:798)
    at org.scalatest.tools.Runner.run(Runner.scala)
    at org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.runScalaTest2(ScalaTestRunner.java:133)
    at org.jetbrains.plugins.scala.testingSupport.scalaTest.ScalaTestRunner.main(ScalaTestRunner.java:27)
Caused by: java.lang.RuntimeException: Couldn't find _gen_alias_23#23 in col#11
    at scala.sys.package$.error(package.scala:30)
    at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.$anonfun$applyOrElse$1(BoundAttribute.scala:81)
    at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
    ... 121 more{code}
> Issue using 'explode' function followed by a (*)star expand selection of
> resulting struct
> -----------------------------------------------------------------------------------------
>
> Key: SPARK-30855
> URL: https://issues.apache.org/jira/browse/SPARK-30855
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 3.0.0
> Reporter: Benoit Roy
> Priority: Major
>
--
This message was sent by Atlassian Jira
(v8.3.4#803005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]