[
https://issues.apache.org/jira/browse/SPARK-43522?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Heedo Lee updated SPARK-43522:
------------------------------
Description:
When creating a struct column in a DataFrame, code that ran without problems
in version 3.3.1 does not work in version 3.4.0.
Example
{code:java}
val testDF = Seq("a=b,c=d,d=f").toDF.withColumn("key_value", split('value,
",")).withColumn("map_entry", transform(col("key_value"), x => struct(split(x,
"=").getItem(0), split(x, "=").getItem(1) ) )){code}
In 3.3.1
{code:java}
testDF.show()
+-----------+---------------+--------------------+
| value| key_value| map_entry|
+-----------+---------------+--------------------+
|a=b,c=d,d=f|[a=b, c=d, d=f]|[{a, b}, {c, d}, ...|
+-----------+---------------+--------------------+
root
|-- value: string (nullable = true)
|-- key_value: array (nullable = true)
| |-- element: string (containsNull = false)
|-- map_entry: array (nullable = true)
| |-- element: struct (containsNull = false)
| | |-- col1: string (nullable = true)
| | |-- col2: string (nullable = true)
|-- aaa: map (nullable = true)
| |-- key: string
| |-- value: string (valueContainsNull = true) {code}
In 3.4.0
{code:java}
org.apache.spark.sql.AnalysisException:
[DATATYPE_MISMATCH.CREATE_NAMED_STRUCT_WITHOUT_FOLDABLE_STRING] Cannot resolve
"struct(split(namedlambdavariable(), =, -1)[0], split(namedlambdavariable(), =,
-1)[1])" due to data type mismatch: Only foldable `STRING` expressions are
allowed to appear at odd position, but they are ["0", "1"].;
'Project [value#41, key_value#45, transform(key_value#45,
lambdafunction(struct(0, split(lambda x_3#49, =, -1)[0], 1, split(lambda
x_3#49, =, -1)[1]), lambda x_3#49, false)) AS map_entry#48]
+- Project [value#41, split(value#41, ,, -1) AS key_value#45]
+- LocalRelation [value#41] at
org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.dataTypeMismatch(package.scala:73)
at
org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$5(CheckAnalysis.scala:269)
at
org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$5$adapted(CheckAnalysis.scala:256)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:295)
at
org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$foreachUp$1(TreeNode.scala:294)
at
org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$foreachUp$1$adapted(TreeNode.scala:294)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at scala.collection.IterableLike.foreach(IterableLike.scala:74)
at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:294)
at
org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$foreachUp$1(TreeNode.scala:294)
at
org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$foreachUp$1$adapted(TreeNode.scala:294)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
....
{code}
However, if you add an alias to the struct elements, you get the same result as
in the previous version.
{code:java}
val testDF = Seq("a=b,c=d,d=f").toDF.withColumn("key_value", split('value,
",")).withColumn("map_entry", transform(col("key_value"), x => struct(split(x,
"=").getItem(0).as("col1") , split(x, "=").getItem(1).as("col2") ) )){code}
was:
When creating a struct column in a DataFrame, code that ran without problems
in version 3.3.1 does not work in version 3.4.0.
Example
{code:java}
val testDF = Seq("a=b,c=d,d=f").toDF.withColumn("key_value", split('value,
",")).withColumn("map_entry", transform(col("key_value"), x => struct(split(x,
"=").getItem(0), split(x, "=").getItem(1) ) )){code}
In 3.3.1
{code:java}
testDF.show()
+-----------+---------------+--------------------+ | value|
key_value| map_entry|
+-----------+---------------+--------------------+ |a=b,c=d,d=f|[a=b, c=d,
d=f]|[{a, b}, {c, d}, ...| +-----------+---------------+--------------------+
testDF.printSchema
root |-- value: string (nullable = true) |-- key_value: array (nullable =
true) | |-- element: string (containsNull = false) |-- map_entry: array
(nullable = true) | |-- element: struct (containsNull = false) | |
|-- col1: string (nullable = true) | | |-- col2: string (nullable = true)
{code}
In 3.4.0
{code:java}
org.apache.spark.sql.AnalysisException:
[DATATYPE_MISMATCH.CREATE_NAMED_STRUCT_WITHOUT_FOLDABLE_STRING] Cannot resolve
"struct(split(namedlambdavariable(), =, -1)[0], split(namedlambdavariable(), =,
-1)[1])" due to data type mismatch: Only foldable `STRING` expressions are
allowed to appear at odd position, but they are ["0", "1"].;
'Project [value#41, key_value#45, transform(key_value#45,
lambdafunction(struct(0, split(lambda x_3#49, =, -1)[0], 1, split(lambda
x_3#49, =, -1)[1]), lambda x_3#49, false)) AS map_entry#48]
+- Project [value#41, split(value#41, ,, -1) AS key_value#45]
+- LocalRelation [value#41] at
org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.dataTypeMismatch(package.scala:73)
at
org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$5(CheckAnalysis.scala:269)
at
org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$5$adapted(CheckAnalysis.scala:256)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:295)
at
org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$foreachUp$1(TreeNode.scala:294)
at
org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$foreachUp$1$adapted(TreeNode.scala:294)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at scala.collection.IterableLike.foreach(IterableLike.scala:74)
at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:294)
at
org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$foreachUp$1(TreeNode.scala:294)
at
org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$foreachUp$1$adapted(TreeNode.scala:294)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
....
{code}
However, if you add an alias to the struct elements, you get the same result as
in the previous version.
{code:java}
val testDF = Seq("a=b,c=d,d=f").toDF.withColumn("key_value", split('value,
",")).withColumn("map_entry", transform(col("key_value"), x => struct(split(x,
"=").getItem(0).as("col1") , split(x, "=").getItem(1).as("col2") ) )){code}
> Creating struct column occurs error 'org.apache.spark.sql.AnalysisException
> [DATATYPE_MISMATCH.CREATE_NAMED_STRUCT_WITHOUT_FOLDABLE_STRING]'
> ---------------------------------------------------------------------------------------------------------------------------------------------
>
> Key: SPARK-43522
> URL: https://issues.apache.org/jira/browse/SPARK-43522
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 3.4.0
> Reporter: Heedo Lee
> Priority: Minor
>
> When creating a struct column in a DataFrame, code that ran without
> problems in version 3.3.1 does not work in version 3.4.0.
>
> Example
> {code:java}
> val testDF = Seq("a=b,c=d,d=f").toDF.withColumn("key_value", split('value,
> ",")).withColumn("map_entry", transform(col("key_value"), x =>
> struct(split(x, "=").getItem(0), split(x, "=").getItem(1) ) )){code}
>
> In 3.3.1
>
> {code:java}
>
> testDF.show()
> +-----------+---------------+--------------------+
> | value| key_value| map_entry|
> +-----------+---------------+--------------------+
> |a=b,c=d,d=f|[a=b, c=d, d=f]|[{a, b}, {c, d}, ...|
> +-----------+---------------+--------------------+
>
> root
> |-- value: string (nullable = true)
> |-- key_value: array (nullable = true)
> | |-- element: string (containsNull = false)
> |-- map_entry: array (nullable = true)
> | |-- element: struct (containsNull = false)
> | | |-- col1: string (nullable = true)
> | | |-- col2: string (nullable = true)
> |-- aaa: map (nullable = true)
> | |-- key: string
> | |-- value: string (valueContainsNull = true) {code}
>
>
> In 3.4.0
>
> {code:java}
> org.apache.spark.sql.AnalysisException:
> [DATATYPE_MISMATCH.CREATE_NAMED_STRUCT_WITHOUT_FOLDABLE_STRING] Cannot
> resolve "struct(split(namedlambdavariable(), =, -1)[0],
> split(namedlambdavariable(), =, -1)[1])" due to data type mismatch: Only
> foldable `STRING` expressions are allowed to appear at odd position, but they
> are ["0", "1"].;
> 'Project [value#41, key_value#45, transform(key_value#45,
> lambdafunction(struct(0, split(lambda x_3#49, =, -1)[0], 1, split(lambda
> x_3#49, =, -1)[1]), lambda x_3#49, false)) AS map_entry#48]
> +- Project [value#41, split(value#41, ,, -1) AS key_value#45]
> +- LocalRelation [value#41] at
> org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.dataTypeMismatch(package.scala:73)
> at
> org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$5(CheckAnalysis.scala:269)
> at
> org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$5$adapted(CheckAnalysis.scala:256)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:295)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$foreachUp$1(TreeNode.scala:294)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$foreachUp$1$adapted(TreeNode.scala:294)
> at scala.collection.Iterator.foreach(Iterator.scala:943)
> at scala.collection.Iterator.foreach$(Iterator.scala:943)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
> at scala.collection.IterableLike.foreach(IterableLike.scala:74)
> at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
> at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:294)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$foreachUp$1(TreeNode.scala:294)
> at
> org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$foreachUp$1$adapted(TreeNode.scala:294)
> at scala.collection.Iterator.foreach(Iterator.scala:943)
> at scala.collection.Iterator.foreach$(Iterator.scala:943)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
> ....
>
> {code}
>
> However, if you add an alias to the struct elements, you get the same result
> as in the previous version.
>
> {code:java}
> val testDF = Seq("a=b,c=d,d=f").toDF.withColumn("key_value", split('value,
> ",")).withColumn("map_entry", transform(col("key_value"), x =>
> struct(split(x, "=").getItem(0).as("col1") , split(x,
> "=").getItem(1).as("col2") ) )){code}
>
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]