cloud-fan commented on a change in pull request #29795:
URL: https://github.com/apache/spark/pull/29795#discussion_r492186072
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
##########
@@ -39,19 +40,14 @@ object SimplifyExtractValueOps extends Rule[LogicalPlan] {
       // Remove redundant field extraction.
       case GetStructField(createNamedStruct: CreateNamedStruct, ordinal, _) =>
         createNamedStruct.valExprs(ordinal)
-      case GetStructField(w @ WithFields(struct, names, valExprs), ordinal, maybeName) =>
-        val name = w.dataType(ordinal).name
-        val matches = names.zip(valExprs).filter(_._1 == name)
-        if (matches.nonEmpty) {
-          // return last matching element as that is the final value for the field being extracted.
-          // For example, if a user submits a query like this:
-          // `$"struct_col".withField("b", lit(1)).withField("b", lit(2)).getField("b")`
-          // we want to return `lit(2)` (and not `lit(1)`).
-          val expr = matches.last._2
-          If(IsNull(struct), Literal(null, expr.dataType), expr)
-        } else {
-          GetStructField(struct, ordinal, maybeName)
-        }
+      case GetStructField(updateFields: UpdateFields, ordinal, _) =>
+        val structExpr = updateFields.structExpr
+        updateFields.newExprs(ordinal) match {
+          // if the struct itself is null, then any value extracted from it (expr) will be null,
+          // so we don't need to wrap expr in If(IsNull(struct), Literal(null, expr.dataType), expr)
+          case expr: GetStructField if expr.child.semanticEquals(structExpr) => expr
Review comment:
using `semanticEquals` is safer
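For context on why: `semanticEquals` compares canonicalized expression trees, so two references to the same column that differ only cosmetically still match, where plain `==` would not. A minimal sketch against the Catalyst API (the attribute name and values are made up):

```scala
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, NamedExpression}
import org.apache.spark.sql.types.IntegerType

object SemanticEqualsDemo extends App {
  // Two references to the same underlying column (same ExprId) whose names
  // were written with different casing.
  val id = NamedExpression.newExprId
  val a = AttributeReference("col", IntegerType)(exprId = id)
  val b = AttributeReference("COL", IntegerType)(exprId = id)

  println(a == b)              // false: structural equality compares names verbatim
  println(a.semanticEquals(b)) // true: canonicalization ignores the cosmetic difference
}
```

This is why `expr.child.semanticEquals(structExpr)` in the new case is more robust than `expr.child == structExpr`.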
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
##########
@@ -39,19 +40,14 @@ object SimplifyExtractValueOps extends Rule[LogicalPlan] {
       // Remove redundant field extraction.
       case GetStructField(createNamedStruct: CreateNamedStruct, ordinal, _) =>
         createNamedStruct.valExprs(ordinal)
-      case GetStructField(w @ WithFields(struct, names, valExprs), ordinal, maybeName) =>
-        val name = w.dataType(ordinal).name
-        val matches = names.zip(valExprs).filter(_._1 == name)
-        if (matches.nonEmpty) {
-          // return last matching element as that is the final value for the field being extracted.
-          // For example, if a user submits a query like this:
-          // `$"struct_col".withField("b", lit(1)).withField("b", lit(2)).getField("b")`
-          // we want to return `lit(2)` (and not `lit(1)`).
-          val expr = matches.last._2
-          If(IsNull(struct), Literal(null, expr.dataType), expr)
-        } else {
-          GetStructField(struct, ordinal, maybeName)
-        }
+      case GetStructField(updateFields: UpdateFields, ordinal, _) =>
+        val structExpr = updateFields.structExpr
+        updateFields.newExprs(ordinal) match {
+          // if the struct itself is null, then any value extracted from it (expr) will be null,
+          // so we don't need to wrap expr in If(IsNull(struct), Literal(null, expr.dataType), expr)
+          case expr: GetStructField if expr.child.semanticEquals(structExpr) => expr
+          case expr => If(IsNull(ultimateStruct(structExpr)), Literal(null, expr.dataType), expr)
Review comment:
Shall we apply this rule only after all the `UpdateFields` have been merged? Then we wouldn't need `ultimateStruct`.
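For reference, the merge being suggested could look roughly like the sketch below: collapse nested `UpdateFields` nodes in an earlier rule so that, by the time `SimplifyExtractValueOps` runs, the null check can be performed on the base struct directly. This is a sketch only; it assumes an `UpdateFields(structExpr, fieldOps)` shape, and the rule name is made up:

```scala
import org.apache.spark.sql.catalyst.expressions.UpdateFields
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule

// Fold UpdateFields(UpdateFields(struct, ops1), ops2) into a single node so
// that no UpdateFields ever wraps another. SimplifyExtractValueOps could then
// test IsNull(structExpr) directly instead of walking down via ultimateStruct.
object CombineUpdateFields extends Rule[LogicalPlan] {
  def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
    case UpdateFields(UpdateFields(struct, fieldOps1), fieldOps2) =>
      UpdateFields(struct, fieldOps1 ++ fieldOps2)
  }
}
```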
##########
File path: sql/core/src/main/scala/org/apache/spark/sql/Column.scala
##########
@@ -901,39 +901,125 @@ class Column(val expr: Expression) extends Logging {
    *   // result: org.apache.spark.sql.AnalysisException: Ambiguous reference to fields
    * }}}
    *
+   * This method supports adding/replacing nested fields directly, e.g.
+   *
+   * {{{
+   *   val df = sql("SELECT named_struct('a', named_struct('a', 1, 'b', 2)) struct_col")
+   *   df.select($"struct_col".withField("a.c", lit(3)).withField("a.d", lit(4)))
+   *   // result: {"a":{"a":1,"b":2,"c":3,"d":4}}
+   * }}}
+   *
+   * However, if you are going to add/replace multiple nested fields, it is more efficient to
+   * extract out the nested struct before adding/replacing multiple fields, e.g.
+   *
+   * {{{
+   *   val df = sql("SELECT named_struct('a', named_struct('a', 1, 'b', 2)) struct_col")
+   *   df.select($"struct_col".withField("a", $"struct_col.a".withField("c", lit(3)).withField("d", lit(4))))
+   *   // result: {"a":{"a":1,"b":2,"c":3,"d":4}}
+   * }}}
+   *
Review comment:
I think the same issue happens with `withColumn` as well. I'm fine with documenting it in the method doc.
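To illustrate the `withColumn` analogy: each chained call wraps the plan in another `Project`, so long chains inflate analysis cost, while a single `select` builds one projection. A hedged sketch (the column names are made up):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit}

object WithColumnChainDemo extends App {
  val spark = SparkSession.builder().master("local[1]").getOrCreate()
  val df = spark.range(1).toDF("id")

  // Each withColumn call adds another Project on top of the plan.
  val chained = (1 to 50).foldLeft(df)((d, i) => d.withColumn(s"c$i", lit(i)))

  // One select yields a single Project containing all the new columns.
  val single = df.select(
    df.columns.map(col) ++ (1 to 50).map(i => lit(i).as(s"c$i")): _*)

  println(chained.queryExecution.analyzed.treeString) // deeply nested plan
  println(single.queryExecution.analyzed.treeString)  // a single Project
}
```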
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
##########
@@ -159,6 +160,14 @@ abstract class QueryTest extends PlanTest {
     checkAnswer(df, expectedAnswer.collect())
   }
 
+  protected def checkAnswer(
Review comment:
If it's only used in one test suite, let's move it there. We can move it
to `QueryTest` if we see more and more suites using it.
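If helpful, the move could be as small as a private helper inside the one suite that uses it. A hypothetical shape only, since the full signature is truncated in the diff above (the suite name, helper name, and parameters are assumptions):

```scala
import org.apache.spark.sql.{DataFrame, QueryTest, Row}
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types.StructType

class ColumnExpressionSuite extends QueryTest with SharedSparkSession {
  // Hypothetical local helper: check both the answer and the result schema,
  // without widening QueryTest's API for a single suite.
  private def checkAnswerAndSchema(
      df: DataFrame,
      expectedAnswer: Seq[Row],
      expectedSchema: StructType): Unit = {
    checkAnswer(df, expectedAnswer)
    assert(df.schema == expectedSchema)
  }
}
```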
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]