viirya commented on a change in pull request #29812:
URL: https://github.com/apache/spark/pull/29812#discussion_r492414412
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
Review comment:
We don't run this rule just once, so the order should be fine.
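To make the ordering concrete, here is a minimal standalone sketch (plain Scala collections, no Catalyst types; `dedupKeepLast` is a made-up helper for illustration only) of what the new case clause computes: walking the pairs in reverse and keeping the first occurrence is equivalent to keeping the last assignment per field name, while preserving the relative order of the surviving fields.

```scala
import scala.collection.mutable

object DedupSketch {
  // Keep only the last value assigned to each name, preserving order.
  def dedupKeepLast[A](names: Seq[String], values: Seq[A]): (Seq[String], Seq[A]) = {
    val seen = mutable.HashSet.empty[String]
    // Reverse so the last occurrence is seen first, then reverse back.
    val kept = names.zip(values).reverse.filter { case (n, _) => seen.add(n) }.reverse
    (kept.map(_._1), kept.map(_._2))
  }

  def main(args: Array[String]): Unit = {
    // "a" is assigned twice; only its last value (3) survives.
    println(dedupKeepLast(Seq("a", "b", "a"), Seq(1, 2, 3)))
    // -> (List(b, a), List(2, 3))
  }
}
```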
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
Review comment:
ok.
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
+          newNames += name
+          newValues += value
+        }
+      }
+      WithFields(structExpr, names = newNames.reverse.toSeq, valExprs = newValues.reverse.toSeq)
Review comment:
You are right. It is eventually the same. This comes from improving the scalability of #29587: I added it while fixing the scalability issue, and found it useful for reducing the complexity of the `WithFields` expression tree.
I will run these rules in #29587 to simplify the expression tree before the optimizer.
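For context, here is a sketch of how such a complex tree arises, assuming the `Column.withField` API (SPARK-31317); the data and session setup are illustrative only, not from this PR.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object WithFieldsChainSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("sketch").getOrCreate()
    import spark.implicits._

    val df = Seq((1, 2)).toDF("x", "y").select(struct($"x", $"y").as("s"))
    // Each chained withField wraps the previous WithFields expression,
    // so updating "a" twice yields duplicate names for this rule to prune.
    df.withColumn("s",
      $"s".withField("a", lit(1))
          .withField("a", lit(2)) // only this assignment should survive
          .withField("b", lit(3)))
      .explain(true)

    spark.stop()
  }
}
```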
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
+          newNames += name
+          newValues += value
+        }
+      }
+      WithFields(structExpr, names = newNames.reverse.toSeq, valExprs = newValues.reverse.toSeq)
Review comment:
You are right. It is eventually the same. But in some cases, before we extend `WithFields`, the expression tree might be very complex. This comes from improving the scalability of #29587: I added it while fixing the scalability issue, and found it useful for reducing the complexity of the `WithFields` expression tree.
I will run these rules in #29587 to simplify the expression tree before the optimizer.
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
+          newNames += name
+          newValues += value
+        }
+      }
+      WithFields(structExpr, names = newNames.reverse.toSeq, valExprs = newValues.reverse.toSeq)
Review comment:
You are right. It is eventually the same. But in some cases, before we extend `WithFields`, the expression tree might be very complex. This comes from improving the scalability of #29587: I added it while fixing the scalability issue, and found it useful for reducing the complexity of the `WithFields` expression tree.
I will run these rules in #29587 to simplify the expression tree before entering the optimizer.
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,32 @@
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.internal.SQLConf
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
+  lazy val resolver = SQLConf.get.resolver
+
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (newNames.find(resolver(_, name)).isEmpty) {
Review comment:
Added a set for the case-sensitive case.
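For reference, a Catalyst resolver is just `(String, String) => Boolean`. A minimal sketch (the resolver definitions below mirror case-sensitive/insensitive resolution but are written inline for illustration) of why a plain `contains` check is not enough, and why a raw-string set is only safe on the case-sensitive path:

```scala
object ResolverSketch {
  type Resolver = (String, String) => Boolean
  val caseSensitive: Resolver = (a, b) => a == b
  val caseInsensitive: Resolver = (a, b) => a.equalsIgnoreCase(b)

  def main(args: Array[String]): Unit = {
    val seen = Seq("a", "b")
    // Under case-insensitive resolution, "A" duplicates "a" and must be
    // dropped, which a HashSet lookup on the raw string would miss.
    println(seen.exists(caseInsensitive(_, "A"))) // true
    println(seen.exists(caseSensitive(_, "A")))   // false
  }
}
```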
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
+          newNames += name
+          newValues += value
+        }
+      }
+      WithFields(structExpr, names = newNames.reverse.toSeq, valExprs = newValues.reverse.toSeq)
Review comment:
Actually, I'd like to run these rules to simplify the `WithFields` tree early, in the analysis stage. After #29587, I think it is very likely that users will write a bad `WithFields` tree. Once that happens, it is very hard to debug, and the analyzer/optimizer spends a lot of time traversing the expression tree. So I think it is very useful to keep this rule to simplify the expression tree, but I don't think we want to do `ReplaceWithFieldsExpression` in the analysis stage.
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
+          newNames += name
+          newValues += value
+        }
+      }
+      WithFields(structExpr, names = newNames.reverse.toSeq, valExprs = newValues.reverse.toSeq)
Review comment:
Actually, I'd like to run these rules to simplify the `WithFields` tree early, in the analysis stage. While fixing the scale issue in #29587, I realized that it is very likely that users will write a bad `WithFields` tree. Once that happens, it is very hard to debug, and the analyzer/optimizer spends a lot of time traversing the expression tree. So I think it is very useful to keep this rule to simplify the expression tree, but I don't think we want to do `ReplaceWithFieldsExpression` in the analysis stage.
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
package org.apache.spark.sql.catalyst.optimizer
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField,
WithFields}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
/**
- * Combines all adjacent [[WithFields]] expression into a single
[[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
*/
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+ case WithFields(structExpr, names, values) if names.distinct.length !=
names.length =>
+ val newNames = mutable.ArrayBuffer.empty[String]
+ val newValues = mutable.ArrayBuffer.empty[Expression]
+ names.zip(values).reverse.foreach { case (name, value) =>
+ if (!newNames.contains(name)) {
+ newNames += name
+ newValues += value
+ }
+ }
+ WithFields(structExpr, names = newNames.reverse.toSeq, valExprs =
newValues.reverse.toSeq)
Review comment:
I'm fine to wait until #29795.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]