GitHub user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/19683#discussion_r158751339 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala --- @@ -444,12 +444,22 @@ object ColumnPruning extends Rule[LogicalPlan] { f.copy(child = prunedChild(child, f.references)) case e @ Expand(_, _, child) if (child.outputSet -- e.references).nonEmpty => e.copy(child = prunedChild(child, e.references)) - case g: Generate if !g.join && (g.child.outputSet -- g.references).nonEmpty => - g.copy(child = prunedChild(g.child, g.references)) - // Turn off `join` for Generate if no column from it's child is used - case p @ Project(_, g: Generate) if g.join && p.references.subsetOf(g.generatedSet) => - p.copy(child = g.copy(join = false)) + // Sync Generate's unrequiredChildOutput with the actual needed outputs + case p @ Project(_, g: Generate) => + val actualUnrequired = g.child.outputSet -- p.references + if (actualUnrequired == AttributeSet(g.unrequiredChildOutput)) { + p + } else { + p.copy(child = g.copy(unrequiredChildOutput = actualUnrequired.toSeq)) + } + + // prune unrequired references + case g : Generate --- End diff -- We may never reach here if there is a `Project(_, g: Generate)`. We should keep the previous order: move `case g : Generate` before `case p @ Project(_, g: Generate)`, and write it as ``` case g: Generate if g.requiredChildOutput.isEmpty && (g.child.outputSet -- g.references).nonEmpty => ... ```
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org