Ngone51 commented on a change in pull request #26629: [SPARK-29768][SQL] Column
pruning through nondeterministic expressions
URL: https://github.com/apache/spark/pull/26629#discussion_r350514186
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala
##########
@@ -74,24 +95,72 @@ object PhysicalOperation extends PredicateHelper {
case other =>
(None, Nil, other, AttributeMap(Seq()))
}
+}
- private def collectAliases(fields: Seq[Expression]):
AttributeMap[Expression] =
- AttributeMap(fields.collect {
- case a: Alias => (a.toAttribute, a.child)
- })
-
- private def substitute(aliases: AttributeMap[Expression])(expr: Expression):
Expression = {
- expr.transform {
- case a @ Alias(ref: AttributeReference, name) =>
- aliases.get(ref)
- .map(Alias(_, name)(a.exprId, a.qualifier))
- .getOrElse(a)
+/**
+ * A variant of [[PhysicalOperation]]. It matches any number of project or
filter
+ * operations even if they are non-deterministic, as long as they satisfy the
+ * requirement of CollapseProject and CombineFilters.
+ */
+object ScanOperation extends OperationHelper with PredicateHelper {
+ type ScanReturnType = Option[(Option[Seq[NamedExpression]],
+ Seq[Expression], LogicalPlan, AttributeMap[Expression])]
- case a: AttributeReference =>
- aliases.get(a)
- .map(Alias(_, a.name)(a.exprId, a.qualifier)).getOrElse(a)
+ def unapply(plan: LogicalPlan): Option[ReturnType] = {
+ collectProjectsAndFilters(plan) match {
+ case Some((fields, filters, child, _)) =>
+ Some((fields.getOrElse(child.output), filters, child))
+ case None => None
}
}
+
+ private def hasCommonNonDeterministic(expr: Seq[Expression], aliases:
AttributeMap[Expression])
+ : Boolean = {
+ expr.exists(_.collect {
+ case a: AttributeReference if aliases.contains(a) => aliases(a)
+ }.exists(!_.deterministic))
+ }
+
+ private def collectProjectsAndFilters(plan: LogicalPlan): ScanReturnType =
+ plan match {
+ case Project(fields, child) =>
+ collectProjectsAndFilters(child) match {
+ case Some((_, filters, other, aliases)) =>
+ // Follow CollapseProject and only keep going if the collected
Projects
+ // do not have common non-deterministic expressions.
Review comment:
Yeah, generally, I think so. But there's also a suggestion to reduce
coupling between `Optimizer` and `Planner`. See:
https://github.com/apache/spark/pull/8486#issuecomment-136518354
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]