maropu commented on a change in pull request #31966:
URL: https://github.com/apache/spark/pull/31966#discussion_r612051281
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
##########
@@ -241,12 +262,68 @@ object GeneratorNestedColumnAliasing {
// On top on `Generate`, a `Project` that might have nested column
accessors.
// We try to get alias maps for both project list and generator's
children expressions.
val exprsToPrune = projectList ++ g.generator.children
- NestedColumnAliasing.getAliasSubMap(exprsToPrune,
g.qualifiedGeneratorOutput).map {
+ NestedColumnAliasing.getAliasSubMap(exprsToPrune).map {
case (nestedFieldToAlias, attrToAliases) =>
+ val (nestedFieldsOnGenerator, nestedFieldsNotOnGenerator) =
+ nestedFieldOnGeneratorOutput(nestedFieldToAlias,
g.qualifiedGeneratorOutput)
+ val (attrToAliasesOnGenerator, attrToAliasesNotOnGenerator) =
+ aliasesOnGeneratorOutput(attrToAliases, g.qualifiedGeneratorOutput)
+
+ // Push nested column accessors through `Generator`.
// Defer updating `Generate.unrequiredChildIndex` to next round of
`ColumnPruning`.
- val newChild =
- NestedColumnAliasing.replaceWithAliases(g, nestedFieldToAlias,
attrToAliases)
- Project(NestedColumnAliasing.getNewProjectList(projectList,
nestedFieldToAlias), newChild)
+ val newChild = NestedColumnAliasing.replaceWithAliases(g,
+ nestedFieldsNotOnGenerator, attrToAliasesNotOnGenerator)
+ val pushedThrough = Project(NestedColumnAliasing
+ .getNewProjectList(projectList, nestedFieldsNotOnGenerator),
newChild)
+
+ // Pruning on `Generator`'s output. We only process single field
case.
+ // For multiple field case, we cannot directly move field extractor
into
+ // the generator expression. A workaround is to re-construct array
of struct
+ // from multiple fields. But it will be more complicated and may not
worth.
+ // TODO(SPARK-34956): support multiple fields.
+ if (nestedFieldsOnGenerator.size > 1 || nestedFieldsOnGenerator.size
== 0) {
+ pushedThrough
+ } else {
+ // Only one nested column accessor.
+ // E.g., df.select(explode($"items").as("item")).select($"item.a")
+ pushedThrough match {
+ case p @ Project(_, newG: Generate) =>
+ // Replace the child expression of `ExplodeBase` generator with
+ // nested column accessor.
+ // E.g.,
df.select(explode($"items").as("item")).select($"item.a") =>
+ // df.select(explode($"items.a").as("item.a"))
+ val rewrittenG = newG.transformExpressions {
+ case e: ExplodeBase =>
+ val extractor =
nestedFieldsOnGenerator.head._1.transformUp {
+ case _: Attribute =>
+ e.child
+ case g: GetStructField =>
+ ExtractValue(g.child, Literal(g.extractFieldName),
SQLConf.get.resolver)
+ }
+ e.withNewChildren(Seq(extractor))
+ }
+
+ // As we change the child of the generator, its output data
type must be updated.
+ val updatedGeneratorOutput = rewrittenG.generatorOutput
+ .zip(rewrittenG.generator.elementSchema.toAttributes)
+ .map { case (oldAttr, newAttr) =>
+ newAttr.withExprId(oldAttr.exprId).withName(oldAttr.name)
+ }
+ assert(updatedGeneratorOutput.length ==
rewrittenG.generatorOutput.length,
+ "Updated generator output must have same length as original
generator output.")
+ val updatedGenerate = rewrittenG.copy(generatorOutput =
updatedGeneratorOutput)
+
+ // Replace nested column accessor with generator output.
+ p.withNewChildren(Seq(updatedGenerate)).transformExpressions {
+ case f: ExtractValue if nestedFieldsOnGenerator.contains(f)
=>
+ updatedGenerate.output
+ .find(a => attrToAliasesOnGenerator.contains(a.exprId))
+ .getOrElse(f)
+ }
+
+ case _ => pushedThrough
Review comment:
If this is an unreachable path, how about throwing an illegal state
exception?
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
##########
@@ -351,6 +351,41 @@ abstract class SchemaPruningSuite
}
}
+ testSchemaPruning("SPARK-34638: nested column prune on generator output") {
+ val query1 = spark.table("contacts")
+ .select(explode(col("friends")).as("friend"))
Review comment:
Can this work correctly if `explode` has another function call as its child,
e.g., `explode(filter(friends, x -> x is not null))`?
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
##########
@@ -241,12 +262,68 @@ object GeneratorNestedColumnAliasing {
// On top on `Generate`, a `Project` that might have nested column
accessors.
// We try to get alias maps for both project list and generator's
children expressions.
val exprsToPrune = projectList ++ g.generator.children
- NestedColumnAliasing.getAliasSubMap(exprsToPrune,
g.qualifiedGeneratorOutput).map {
+ NestedColumnAliasing.getAliasSubMap(exprsToPrune).map {
case (nestedFieldToAlias, attrToAliases) =>
+ val (nestedFieldsOnGenerator, nestedFieldsNotOnGenerator) =
+ nestedFieldOnGeneratorOutput(nestedFieldToAlias,
g.qualifiedGeneratorOutput)
+ val (attrToAliasesOnGenerator, attrToAliasesNotOnGenerator) =
+ aliasesOnGeneratorOutput(attrToAliases, g.qualifiedGeneratorOutput)
+
+ // Push nested column accessors through `Generator`.
// Defer updating `Generate.unrequiredChildIndex` to next round of
`ColumnPruning`.
- val newChild =
- NestedColumnAliasing.replaceWithAliases(g, nestedFieldToAlias,
attrToAliases)
- Project(NestedColumnAliasing.getNewProjectList(projectList,
nestedFieldToAlias), newChild)
+ val newChild = NestedColumnAliasing.replaceWithAliases(g,
+ nestedFieldsNotOnGenerator, attrToAliasesNotOnGenerator)
+ val pushedThrough = Project(NestedColumnAliasing
+ .getNewProjectList(projectList, nestedFieldsNotOnGenerator),
newChild)
+
+ // Pruning on `Generator`'s output. We only process single field
case.
+ // For multiple field case, we cannot directly move field extractor
into
+ // the generator expression. A workaround is to re-construct array
of struct
+ // from multiple fields. But it will be more complicated and may not
worth.
+ // TODO(SPARK-34956): support multiple fields.
+ if (nestedFieldsOnGenerator.size > 1 || nestedFieldsOnGenerator.size
== 0) {
+ pushedThrough
+ } else {
+ // Only one nested column accessor.
+ // E.g., df.select(explode($"items").as("item")).select($"item.a")
+ pushedThrough match {
+ case p @ Project(_, newG: Generate) =>
+ // Replace the child expression of `ExplodeBase` generator with
+ // nested column accessor.
+ // E.g.,
df.select(explode($"items").as("item")).select($"item.a") =>
+ // df.select(explode($"items.a").as("item.a"))
+ val rewrittenG = newG.transformExpressions {
+ case e: ExplodeBase =>
Review comment:
Does this PR target explode-like generators only? How about the others,
e.g., `inline`?
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
##########
@@ -241,12 +262,68 @@ object GeneratorNestedColumnAliasing {
// On top on `Generate`, a `Project` that might have nested column
accessors.
// We try to get alias maps for both project list and generator's
children expressions.
val exprsToPrune = projectList ++ g.generator.children
- NestedColumnAliasing.getAliasSubMap(exprsToPrune,
g.qualifiedGeneratorOutput).map {
+ NestedColumnAliasing.getAliasSubMap(exprsToPrune).map {
case (nestedFieldToAlias, attrToAliases) =>
+ val (nestedFieldsOnGenerator, nestedFieldsNotOnGenerator) =
+ nestedFieldOnGeneratorOutput(nestedFieldToAlias,
g.qualifiedGeneratorOutput)
+ val (attrToAliasesOnGenerator, attrToAliasesNotOnGenerator) =
+ aliasesOnGeneratorOutput(attrToAliases, g.qualifiedGeneratorOutput)
+
+ // Push nested column accessors through `Generator`.
// Defer updating `Generate.unrequiredChildIndex` to next round of
`ColumnPruning`.
- val newChild =
- NestedColumnAliasing.replaceWithAliases(g, nestedFieldToAlias,
attrToAliases)
- Project(NestedColumnAliasing.getNewProjectList(projectList,
nestedFieldToAlias), newChild)
+ val newChild = NestedColumnAliasing.replaceWithAliases(g,
+ nestedFieldsNotOnGenerator, attrToAliasesNotOnGenerator)
+ val pushedThrough = Project(NestedColumnAliasing
+ .getNewProjectList(projectList, nestedFieldsNotOnGenerator),
newChild)
+
+ // Pruning on `Generator`'s output. We only process single field
case.
+ // For multiple field case, we cannot directly move field extractor
into
+ // the generator expression. A workaround is to re-construct array
of struct
+ // from multiple fields. But it will be more complicated and may not
worth.
+ // TODO(SPARK-34956): support multiple fields.
+ if (nestedFieldsOnGenerator.size > 1 || nestedFieldsOnGenerator.size
== 0) {
Review comment:
`nestedFieldsOnGenerator.size == 0` -> `nestedFieldsOnGenerator.isEmpty`
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala
##########
@@ -351,6 +351,41 @@ abstract class SchemaPruningSuite
}
}
+ testSchemaPruning("SPARK-34638: nested column prune on generator output") {
+ val query1 = spark.table("contacts")
+ .select(explode(col("friends")).as("friend"))
+ .select("friend.first")
+ checkScan(query1, "struct<friends:array<struct<first:string>>>")
+ checkAnswer(query1, Row("Susan") :: Nil)
+
+ // Currently we don't prune multiple field case.
+ val query2 = spark.table("contacts")
+ .select(explode(col("friends")).as("friend"))
+ .select("friend.first", "friend.middle")
+ checkScan(query2,
"struct<friends:array<struct<first:string,middle:string,last:string>>>")
+ checkAnswer(query2, Row("Susan", "Z.") :: Nil)
+
+ val query3 = spark.table("contacts")
+ .select(explode(col("friends")).as("friend"))
+ .select("friend.first", "friend.middle", "friend")
+ checkScan(query3,
"struct<friends:array<struct<first:string,middle:string,last:string>>>")
+ checkAnswer(query3, Row("Susan", "Z.", Row("Susan", "Z.", "Smith")) :: Nil)
+
+ withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
+ val query4 = spark.table("contacts")
+ .select(explode(col("friends")).as("friend"))
+ .select("friend.First")
+ checkScan(query4, "struct<friends:array<struct<first:string>>>")
+ checkAnswer(query4, Row("Susan") :: Nil)
+
+ val query5 = spark.table("contacts")
+ .select(explode(col("friends")).as("friend"))
+ .select("friend.MIDDLE")
+ checkScan(query5, "struct<friends:array<struct<middle:string>>>")
+ checkAnswer(query5, Row("Z.") :: Nil)
+ }
+ }
Review comment:
Could you move this case-sensitivity test into an independent test case
just like `testSchemaPruning("SPARK-34638: nested column prune on generator
output - case-sensitivity")`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]