This is an automated email from the ASF dual-hosted git repository. viirya pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 87282f0 [SPARK-35972][SQL] When replace ExtractValue in NestedColumnAliasing we should use semanticEquals 87282f0 is described below commit 87282f04bf8810882187f6759d4a675190ca9b0b Author: Angerszhuuuu <angers....@gmail.com> AuthorDate: Tue Jul 6 00:09:34 2021 -0700 [SPARK-35972][SQL] When replace ExtractValue in NestedColumnAliasing we should use semanticEquals ### What changes were proposed in this pull request? Ideally, in a SQL query, nested columns should resolve to GetStructField with a non-None name. But there are places that can create a GetStructField with a None name, such as UnresolvedStar.expand, Dataset encoder stuff, etc. The current `nestedFieldToAlias` cannot match such expressions, which causes the job to fail. ### Why are the changes needed? Fix bug ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added UT. Closes #33183 from AngersZhuuuu/SPARK-35972. Authored-by: Angerszhuuuu <angers....@gmail.com> Signed-off-by: Liang-Chi Hsieh <vii...@gmail.com> --- .../catalyst/optimizer/NestedColumnAliasing.scala | 15 +++++++------ .../optimizer/NestedColumnAliasingSuite.scala | 25 ++++++++++++++++++++++ 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala index 9ea3c1d..9facae3b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala @@ -144,7 +144,8 @@ object NestedColumnAliasing { attr -> evAliasSeq } - val nestedFieldToAlias = attributeToExtractValuesAndAliases.values.flatten.toMap + val nestedFieldToAlias = attributeToExtractValuesAndAliases.values.flatten + .map { case (field, alias) => field.canonicalized -> alias 
}.toMap // A reference attribute can have multiple aliases for nested fields. val attrToAliases = @@ -167,10 +168,10 @@ object NestedColumnAliasing { */ def getNewProjectList( projectList: Seq[NamedExpression], - nestedFieldToAlias: Map[ExtractValue, Alias]): Seq[NamedExpression] = { + nestedFieldToAlias: Map[Expression, Alias]): Seq[NamedExpression] = { projectList.map(_.transform { - case f: ExtractValue if nestedFieldToAlias.contains(f) => - nestedFieldToAlias(f).toAttribute + case f: ExtractValue if nestedFieldToAlias.contains(f.canonicalized) => + nestedFieldToAlias(f.canonicalized).toAttribute }.asInstanceOf[NamedExpression]) } @@ -180,13 +181,13 @@ object NestedColumnAliasing { */ def replaceWithAliases( plan: LogicalPlan, - nestedFieldToAlias: Map[ExtractValue, Alias], + nestedFieldToAlias: Map[Expression, Alias], attrToAliases: AttributeMap[Seq[Alias]]): LogicalPlan = { plan.withNewChildren(plan.children.map { plan => Project(plan.output.flatMap(a => attrToAliases.getOrElse(a, Seq(a))), plan) }).transformExpressions { - case f: ExtractValue if nestedFieldToAlias.contains(f) => - nestedFieldToAlias(f).toAttribute + case f: ExtractValue if nestedFieldToAlias.contains(f.canonicalized) => + nestedFieldToAlias(f.canonicalized).toAttribute } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala index 19aec2a..e49e028 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala @@ -738,6 +738,31 @@ class NestedColumnAliasingSuite extends SchemaPruningTest { val optimized = Optimize.execute(query) comparePlans(optimized, query) } + + test("SPARK-35972: NestedColumnAliasing should consider semantic equality") { + val dataType = new StructType() + 
.add(StructField("itemid", StringType)) + .add(StructField("search_params", StructType(Seq( + StructField("col1", StringType), + StructField("col2", StringType) + )))) + val relation = LocalRelation('struct_data.struct(dataType)) + val plan = relation + .repartition(100) + .select( + GetStructField('struct_data, 1, None).as("value"), + $"struct_data.search_params.col1".as("col1"), + $"struct_data.search_params.col2".as("col2")).analyze + val query = Optimize.execute(plan) + val optimized = relation + .select(GetStructField('struct_data, 1, None).as("_extract_search_params")) + .repartition(100) + .select( + $"_extract_search_params".as("value"), + $"_extract_search_params.col1".as("col1"), + $"_extract_search_params.col2".as("col2")).analyze + comparePlans(optimized, query) + } } object NestedColumnAliasingSuite { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org