viirya commented on a change in pull request #31966:
URL: https://github.com/apache/spark/pull/31966#discussion_r613814881
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
##########
@@ -241,12 +262,68 @@ object GeneratorNestedColumnAliasing {
// On top of `Generate`, a `Project` that might have nested column
accessors.
// We try to get alias maps for both project list and generator's
children expressions.
val exprsToPrune = projectList ++ g.generator.children
- NestedColumnAliasing.getAliasSubMap(exprsToPrune,
g.qualifiedGeneratorOutput).map {
+ NestedColumnAliasing.getAliasSubMap(exprsToPrune).map {
case (nestedFieldToAlias, attrToAliases) =>
+ val (nestedFieldsOnGenerator, nestedFieldsNotOnGenerator) =
+ nestedFieldOnGeneratorOutput(nestedFieldToAlias,
g.qualifiedGeneratorOutput)
+ val (attrToAliasesOnGenerator, attrToAliasesNotOnGenerator) =
+ aliasesOnGeneratorOutput(attrToAliases, g.qualifiedGeneratorOutput)
+
+ // Push nested column accessors through `Generator`.
// Defer updating `Generate.unrequiredChildIndex` to next round of
`ColumnPruning`.
- val newChild =
- NestedColumnAliasing.replaceWithAliases(g, nestedFieldToAlias,
attrToAliases)
- Project(NestedColumnAliasing.getNewProjectList(projectList,
nestedFieldToAlias), newChild)
+ val newChild = NestedColumnAliasing.replaceWithAliases(g,
+ nestedFieldsNotOnGenerator, attrToAliasesNotOnGenerator)
+ val pushedThrough = Project(NestedColumnAliasing
+ .getNewProjectList(projectList, nestedFieldsNotOnGenerator),
newChild)
+
+ // Pruning on `Generator`'s output. We only process the single-field
case.
+ // For the multiple-field case, we cannot directly move the field extractor
into
+ // the generator expression. A workaround is to reconstruct an array
of structs
+ // from the multiple fields, but that would be more complicated and may not
be worth it.
+ // TODO(SPARK-34956): support multiple fields.
+ if (nestedFieldsOnGenerator.size > 1 || nestedFieldsOnGenerator.size
== 0) {
Review comment:
done.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]