viirya commented on a change in pull request #29812:
URL: https://github.com/apache/spark/pull/29812#discussion_r492414412



##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>

Review comment:
       We don't run this rule just once, so the order should be fine.

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {

Review comment:
       ok.

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
+          newNames += name
+          newValues += value
+        }
+      }
+      WithFields(structExpr, names = newNames.reverse.toSeq, valExprs = newValues.reverse.toSeq)

Review comment:
       You are right, it is eventually the same. This came out of improving the scalability of #29587: I applied it while fixing the scalability issue and found it useful for reducing the complexity of the `WithFields` expression tree.
   
   I will run these rules in #29587 to simplify the expression tree before the optimizer.
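   To make the keep-last deduplication concrete, here is a minimal standalone sketch (my illustration, not the PR's code; `names` and `values` are hypothetical inputs):
   
   ```scala
   import scala.collection.mutable
   
   // Hypothetical inputs: "a" is written twice; the last write should win.
   val names  = Seq("a", "b", "a")
   val values = Seq(1, 2, 3)
   
   // Walk right-to-left, keep only the first (i.e. rightmost) occurrence of
   // each name, then restore the original relative order.
   val seen = mutable.HashSet.empty[String]
   val deduped = names.zip(values).reverse.filter { case (n, _) => seen.add(n) }.reverse
   // deduped == Seq(("b", 2), ("a", 3))
   ```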

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
+          newNames += name
+          newValues += value
+        }
+      }
+      WithFields(structExpr, names = newNames.reverse.toSeq, valExprs = newValues.reverse.toSeq)

Review comment:
       You are right, it is eventually the same. But in some cases, before we extend `WithFields`, the expression tree might be very complex. This came out of improving the scalability of #29587: I applied it while fixing the scalability issue and found it useful for reducing the complexity of the `WithFields` expression tree.
   
   I will run these rules in #29587 to simplify the expression tree before the optimizer.

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
+          newNames += name
+          newValues += value
+        }
+      }
+      WithFields(structExpr, names = newNames.reverse.toSeq, valExprs = newValues.reverse.toSeq)

Review comment:
       You are right, it is eventually the same. But in some cases, before we extend `WithFields`, the expression tree might be very complex. This came out of improving the scalability of #29587: I applied it while fixing the scalability issue and found it useful for reducing the complexity of the `WithFields` expression tree.
   
   I will run these rules in #29587 to simplify the expression tree before entering the optimizer.

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,32 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.internal.SQLConf
 
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
+  lazy val resolver = SQLConf.get.resolver
+
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (newNames.find(resolver(_, name)).isEmpty) {

Review comment:
       Added a set for the case-sensitive case.
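   For illustration, a minimal sketch of what the set-based check could look like (my sketch, not necessarily the PR's final code; using lowercase keys only approximates Spark's case-insensitive resolver):
   
   ```scala
   import scala.collection.mutable
   
   // Hypothetical helper: dedup name/value pairs keeping the last write per
   // name, with O(1) membership checks instead of a linear find per name.
   def dedupKeepLast[A](
       names: Seq[String],
       values: Seq[A],
       caseSensitive: Boolean): (Seq[String], Seq[A]) = {
     val seen = mutable.HashSet.empty[String]
     names.zip(values).reverse.filter { case (name, _) =>
       val key = if (caseSensitive) name else name.toLowerCase
       seen.add(key) // true only for the rightmost occurrence of each key
     }.reverse.unzip
   }
   
   // dedupKeepLast(Seq("a", "A"), Seq(1, 2), caseSensitive = false)
   //   == (Seq("A"), Seq(2))
   ```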

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
+          newNames += name
+          newValues += value
+        }
+      }
+      WithFields(structExpr, names = newNames.reverse.toSeq, valExprs = newValues.reverse.toSeq)

Review comment:
       Actually, I'd like to run these rules to simplify the `WithFields` tree early, in the analysis stage. After #29587, it is very easy to write a bad `WithFields` tree. Once you hit that, it is very hard to debug, and the analyzer/optimizer spend a lot of time traversing the expression tree. So I think it is very useful to keep this rule to simplify the expression tree, but I don't think we want to do `ReplaceWithFieldsExpression` in the analysis stage.
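   For example, a sketch of how such a tree can arise through repeated `Column.withField` calls (hypothetical data, field names, and loop count; assuming the Spark 3.1 `withField` API that `WithFields` backs):
   
   ```scala
   import org.apache.spark.sql.SparkSession
   import org.apache.spark.sql.functions.{lit, struct}
   
   val spark = SparkSession.builder().master("local[1]").getOrCreate()
   import spark.implicits._
   
   val df = Seq((1, 2)).toDF("a", "b").select(struct($"a", $"b").as("s"))
   
   // Each withField call wraps the previous column in another WithFields node,
   // so the unoptimized expression tree deepens with every update.
   var col = df("s")
   for (i <- 1 to 100) {
     col = col.withField(s"f$i", lit(i))
   }
   df.select(col.as("s")) // analyzer/optimizer must walk the whole nested chain
   ```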

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
+          newNames += name
+          newValues += value
+        }
+      }
+      WithFields(structExpr, names = newNames.reverse.toSeq, valExprs = newValues.reverse.toSeq)

Review comment:
       Actually, I'd like to run these rules to simplify the `WithFields` tree early, in the analysis stage. While fixing the scalability issue in #29587, I realized it is very easy to write a bad `WithFields` tree. Once you hit that, it is very hard to debug, and the analyzer/optimizer spend a lot of time traversing the expression tree. So I think it is very useful to keep this rule to simplify the expression tree, but I don't think we want to do `ReplaceWithFieldsExpression` in the analysis stage.

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/WithFields.scala
##########
@@ -17,16 +17,29 @@
 
 package org.apache.spark.sql.catalyst.optimizer
 
-import org.apache.spark.sql.catalyst.expressions.WithFields
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField, WithFields}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 
 
 /**
- * Combines all adjacent [[WithFields]] expression into a single [[WithFields]] expression.
+ * Optimizes [[WithFields]] expression chains.
  */
-object CombineWithFields extends Rule[LogicalPlan] {
+object OptimizeWithFields extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case WithFields(structExpr, names, values) if names.distinct.length != names.length =>
+      val newNames = mutable.ArrayBuffer.empty[String]
+      val newValues = mutable.ArrayBuffer.empty[Expression]
+      names.zip(values).reverse.foreach { case (name, value) =>
+        if (!newNames.contains(name)) {
+          newNames += name
+          newValues += value
+        }
+      }
+      WithFields(structExpr, names = newNames.reverse.toSeq, valExprs = newValues.reverse.toSeq)

Review comment:
       I'm fine with waiting until #29795.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


