fqaiser94 commented on a change in pull request #29795:
URL: https://github.com/apache/spark/pull/29795#discussion_r492435473



##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
##########
@@ -39,19 +40,14 @@ object SimplifyExtractValueOps extends Rule[LogicalPlan] {
       // Remove redundant field extraction.
       case GetStructField(createNamedStruct: CreateNamedStruct, ordinal, _) =>
         createNamedStruct.valExprs(ordinal)
-      case GetStructField(w @ WithFields(struct, names, valExprs), ordinal, 
maybeName) =>
-        val name = w.dataType(ordinal).name
-        val matches = names.zip(valExprs).filter(_._1 == name)
-        if (matches.nonEmpty) {
-          // return last matching element as that is the final value for the 
field being extracted.
-          // For example, if a user submits a query like this:
-          // `$"struct_col".withField("b", lit(1)).withField("b", 
lit(2)).getField("b")`
-          // we want to return `lit(2)` (and not `lit(1)`).
-          val expr = matches.last._2
-          If(IsNull(struct), Literal(null, expr.dataType), expr)
-        } else {
-          GetStructField(struct, ordinal, maybeName)
-        }
+    case GetStructField(updateFields: UpdateFields, ordinal, _) =>
+      val structExpr = updateFields.structExpr
+      updateFields.newExprs(ordinal) match {
+        // if the struct itself is null, then any value extracted from it 
(expr) will be null
+        // so we don't need to wrap expr in If(IsNull(struct), Literal(null, 
expr.dataType), expr)
+        case expr: GetStructField if expr.child.semanticEquals(structExpr) => 
expr
+        case expr => If(IsNull(ultimateStruct(structExpr)), Literal(null, 
expr.dataType), expr)

Review comment:
       IIUC you mean put `CombineUpdateFields` in a separate batch that runs 
before the `SimplifyExtractValueOps` batch and remove the `ultimateStruct` 
method?
   
   If so, we'll miss out on some optimizations because after 
`SimplifyExtractValueOps` runs we might again end up with code containing 
`UpdateFields(UpdateFields(_, _))`. The `simplify add multiple nested fields to 
struct` test in `complexTypesSuite` is a good example of a test that will fail 
in this scenario. 

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
##########
@@ -39,19 +40,14 @@ object SimplifyExtractValueOps extends Rule[LogicalPlan] {
       // Remove redundant field extraction.
       case GetStructField(createNamedStruct: CreateNamedStruct, ordinal, _) =>
         createNamedStruct.valExprs(ordinal)
-      case GetStructField(w @ WithFields(struct, names, valExprs), ordinal, 
maybeName) =>
-        val name = w.dataType(ordinal).name
-        val matches = names.zip(valExprs).filter(_._1 == name)
-        if (matches.nonEmpty) {
-          // return last matching element as that is the final value for the 
field being extracted.
-          // For example, if a user submits a query like this:
-          // `$"struct_col".withField("b", lit(1)).withField("b", 
lit(2)).getField("b")`
-          // we want to return `lit(2)` (and not `lit(1)`).
-          val expr = matches.last._2
-          If(IsNull(struct), Literal(null, expr.dataType), expr)
-        } else {
-          GetStructField(struct, ordinal, maybeName)
-        }
+    case GetStructField(updateFields: UpdateFields, ordinal, _) =>
+      val structExpr = updateFields.structExpr
+      updateFields.newExprs(ordinal) match {
+        // if the struct itself is null, then any value extracted from it 
(expr) will be null
+        // so we don't need to wrap expr in If(IsNull(struct), Literal(null, 
expr.dataType), expr)
+        case expr: GetStructField if expr.child.semanticEquals(structExpr) => 
expr

Review comment:
       good to know, cheers

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
##########
@@ -39,19 +40,14 @@ object SimplifyExtractValueOps extends Rule[LogicalPlan] {
       // Remove redundant field extraction.
       case GetStructField(createNamedStruct: CreateNamedStruct, ordinal, _) =>
         createNamedStruct.valExprs(ordinal)
-      case GetStructField(w @ WithFields(struct, names, valExprs), ordinal, 
maybeName) =>
-        val name = w.dataType(ordinal).name
-        val matches = names.zip(valExprs).filter(_._1 == name)
-        if (matches.nonEmpty) {
-          // return last matching element as that is the final value for the 
field being extracted.
-          // For example, if a user submits a query like this:
-          // `$"struct_col".withField("b", lit(1)).withField("b", 
lit(2)).getField("b")`
-          // we want to return `lit(2)` (and not `lit(1)`).
-          val expr = matches.last._2
-          If(IsNull(struct), Literal(null, expr.dataType), expr)
-        } else {
-          GetStructField(struct, ordinal, maybeName)
-        }
+    case GetStructField(updateFields: UpdateFields, ordinal, _) =>
+      val structExpr = updateFields.structExpr
+      updateFields.newExprs(ordinal) match {
+        // if the struct itself is null, then any value extracted from it 
(expr) will be null
+        // so we don't need to wrap expr in If(IsNull(struct), Literal(null, 
expr.dataType), expr)
+        case expr: GetStructField if expr.child.semanticEquals(structExpr) => 
expr
+        case expr => If(IsNull(ultimateStruct(structExpr)), Literal(null, 
expr.dataType), expr)

Review comment:
       oh I see, passes my tests so I have made the change.

##########
File path: sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
##########
@@ -159,6 +160,14 @@ abstract class QueryTest extends PlanTest {
     checkAnswer(df, expectedAnswer.collect())
   }
 
+  protected def checkAnswer(

Review comment:
       I created another suite called `UpdateFieldsPerformanceSuite` as part of 
this PR where I reuse this function. 
   If you want, I could put the contents of `UpdateFieldsPerformanceSuite` in 
`ColumnExpressionSuite` and then I would be able to keep this function 
(`checkAnswer`) in just `ColumnExpressionSuite`. Just let me know. 




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to