viirya commented on a change in pull request #29812:
URL: https://github.com/apache/spark/pull/29812#discussion_r492414412
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 package org.apache.spark.sql.catalyst.optimizer

-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule

 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
Review comment:
We don't run this rule just once, so the order should be fine.
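To make that concrete, here is a minimal, self-contained sketch (hypothetical, not Spark's actual `RuleExecutor`) of the fixed-point execution that makes in-batch ordering unimportant: the rewrite is re-applied until the plan stops changing or an iteration cap is hit.

```scala
// Hypothetical stand-in for fixed-point rule execution: `rule` plays the role
// of one optimizer rule such as OptimizeWithFields.
def fixedPoint[T](plan: T, maxIterations: Int = 100)(rule: T => T): T = {
  var current = plan
  var iterations = 0
  var changed = true
  while (changed && iterations < maxIterations) {
    val next = rule(current)
    changed = next != current  // stop once the rule no longer rewrites anything
    current = next
    iterations += 1
  }
  current
}

// Example: fixedPoint(40)(n => if (n % 2 == 0) n / 2 else n) == 5
```

Because the batch is re-run until nothing changes, a rewrite missed in one pass is picked up in the next.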
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 package org.apache.spark.sql.catalyst.optimizer

-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule

 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
Review comment:
ok.
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 package org.apache.spark.sql.catalyst.optimizer

-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule

 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
+          newNames += name
+          newValues += value
+        }
+      }
+      WithFields(structExpr, names = newNames.reverse.toSeq, valExprs = newValues.reverse.toSeq)
Review comment:
You are right, the result is eventually the same. This change comes from the work on improving the scalability of #29587: I applied it while fixing the scalability issue there, and found it useful for reducing the complexity of the `WithFields` expression tree.
I will run these rules in #29587 to simplify the expression tree before the optimizer runs.
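For reference, the keep-last deduplication in the new case can be read as a standalone function; this is just the diff's logic extracted for illustration (`dedupKeepLast` is a hypothetical name):

```scala
import scala.collection.mutable

// When a field name appears several times, only its last (name, value) pair
// survives; surviving fields keep the relative order of their last writes.
def dedupKeepLast[A](names: Seq[String], values: Seq[A]): (Seq[String], Seq[A]) = {
  val newNames = mutable.ArrayBuffer.empty[String]
  val newValues = mutable.ArrayBuffer.empty[A]
  // Walk the pairs from the end: the first time we meet a name is its last write.
  names.zip(values).reverse.foreach { case (name, value) =>
    if (!newNames.contains(name)) {
      newNames += name
      newValues += value
    }
  }
  (newNames.reverse.toSeq, newValues.reverse.toSeq)
}

// dedupKeepLast(Seq("a", "b", "a"), Seq(1, 2, 3)) == (Seq("b", "a"), Seq(2, 3))
```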
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 package org.apache.spark.sql.catalyst.optimizer

-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule

 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
+          newNames += name
+          newValues += value
+        }
+      }
+      WithFields(structExpr, names = newNames.reverse.toSeq, valExprs = newValues.reverse.toSeq)
Review comment:
You are right, the result is eventually the same. But in some cases, before we extend `WithFields`, the expression tree can be very complex. This change comes from the work on improving the scalability of #29587: I applied it while fixing the scalability issue there, and found it useful for reducing the complexity of the `WithFields` expression tree.
I will run these rules in #29587 to simplify the expression tree before the optimizer runs.
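As a usage-level illustration of where duplicate names come from, here is a hedged sketch (assuming `Column.withField` from Spark 3.1+, with made-up column names): chained `withField` calls on the same nested field each contribute a (name, value) pair, and only the last write is observable, which is exactly the redundancy this rule removes.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

val spark = SparkSession.builder().master("local[*]").appName("withFieldsSketch").getOrCreate()

// Two writes to the same nested field "a": after adjacent WithFields are
// combined, one WithFields carries the duplicate name, and the rule keeps
// only the last value.
val df = spark.range(1).select(struct(lit(1).as("a"), lit(2).as("b")).as("s"))
val updated = df.select(
  col("s").withField("a", lit(10)).withField("a", lit(20)).as("s"))

updated.select("s.a").show()  // 20: only the last write to "a" survives
```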