maropu commented on a change in pull request #29950:
URL: https://github.com/apache/spark/pull/29950#discussion_r502990724
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
##########
@@ -216,7 +216,7 @@ abstract class Optimizer(catalogManager: CatalogManager)
// The following batch should be executed after batch "Join Reorder" and
"LocalRelation".
Batch("Check Cartesian Products", Once,
CheckCartesianProducts) :+
- Batch("RewriteSubquery", Once,
+ Batch("RewriteSubquery", fixedPoint,
Review comment:
`FixedPoint(1)` instead? Also, could you leave comments about why we use
it instead of `Once`?
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
##########
@@ -760,12 +756,43 @@ object CollapseProject extends Rule[LogicalPlan] {
s.copy(child = p2.copy(projectList = buildCleanedProjectList(l1,
p2.projectList)))
}
+ private def collapseProjects(plan: LogicalPlan): LogicalPlan = plan match {
+ case p1 @ Project(_, p2: Project) =>
+ val maxCommonExprs = SQLConf.get.maxCommonExprsInCollapseProject
+
+ if (haveCommonNonDeterministicOutput(p1.projectList, p2.projectList) ||
+ getLargestNumOfCommonOutput(p1.projectList, p2.projectList) >
maxCommonExprs) {
+ p1
+ } else {
+ collapseProjects(
+ p2.copy(projectList = buildCleanedProjectList(p1.projectList,
p2.projectList)))
+ }
+ case _ => plan
+ }
+
private def collectAliases(projectList: Seq[NamedExpression]):
AttributeMap[Alias] = {
AttributeMap(projectList.collect {
case a: Alias => a.toAttribute -> a
})
}
+ // Counts for the largest times common outputs from lower operator are used
in upper operators.
+ private def getLargestNumOfCommonOutput(
Review comment:
we cannot share the code between this and
`moreThanMaxAllowedCommonOutput`?
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
##########
@@ -1926,6 +1926,19 @@ object SQLConf {
.booleanConf
.createWithDefault(true)
+ val MAX_COMMON_EXPRS_IN_COLLAPSE_PROJECT =
+ buildConf("spark.sql.optimizer.maxCommonExprsInCollapseProject")
+ .doc("An integer number indicates the maximum allowed number of a common
expression " +
Review comment:
`a common expression` -> `common input attributes`?
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
##########
@@ -1926,6 +1926,19 @@ object SQLConf {
.booleanConf
.createWithDefault(true)
+ val MAX_COMMON_EXPRS_IN_COLLAPSE_PROJECT =
+ buildConf("spark.sql.optimizer.maxCommonExprsInCollapseProject")
+ .doc("An integer number indicates the maximum allowed number of a common
expression " +
+ "can be collapsed into upper Project from lower Project by optimizer
rule " +
+ "`CollapseProject`. Normally `CollapseProject` will collapse adjacent
Project " +
+ "and merge expressions. But in some edge cases, expensive expressions
might be " +
+ "duplicated many times in merged Project by this optimization. This
config sets " +
+ "a maximum number. Once an expression is duplicated more than this
number " +
+ "if merging two Project, Spark SQL will skip the merging.")
+ .version("3.1.0")
+ .intConf
Review comment:
Could you add `checkValue`?
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
##########
@@ -1926,6 +1926,19 @@ object SQLConf {
.booleanConf
.createWithDefault(true)
+ val MAX_COMMON_EXPRS_IN_COLLAPSE_PROJECT =
+ buildConf("spark.sql.optimizer.maxCommonExprsInCollapseProject")
+ .doc("An integer number indicates the maximum allowed number of a common
expression " +
Review comment:
nit: `the maximum allowed number of a common expression can be collapsed
into upper Project from lower Project ...` => `the maximum allowed number of
common input attributes when collapsing adjacent Projects ...`?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]