cloud-fan commented on a change in pull request #29795:
URL: https://github.com/apache/spark/pull/29795#discussion_r492186072
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
##########
@@ -39,19 +40,14 @@ object SimplifyExtractValueOps extends Rule[LogicalPlan] {
       // Remove redundant field extraction.
       case GetStructField(createNamedStruct: CreateNamedStruct, ordinal, _) =>
         createNamedStruct.valExprs(ordinal)
-      case GetStructField(w @ WithFields(struct, names, valExprs), ordinal, maybeName) =>
-        val name = w.dataType(ordinal).name
-        val matches = names.zip(valExprs).filter(_._1 == name)
-        if (matches.nonEmpty) {
-          // return last matching element as that is the final value for the field being extracted.
-          // For example, if a user submits a query like this:
-          // `$"struct_col".withField("b", lit(1)).withField("b", lit(2)).getField("b")`
-          // we want to return `lit(2)` (and not `lit(1)`).
-          val expr = matches.last._2
-          If(IsNull(struct), Literal(null, expr.dataType), expr)
-        } else {
-          GetStructField(struct, ordinal, maybeName)
-        }
+      case GetStructField(updateFields: UpdateFields, ordinal, _) =>
+        val structExpr = updateFields.structExpr
+        updateFields.newExprs(ordinal) match {
+          // if the struct itself is null, then any value extracted from it (expr) will be null,
+          // so we don't need to wrap expr in If(IsNull(struct), Literal(null, expr.dataType), expr)
+          case expr: GetStructField if expr.child.semanticEquals(structExpr) => expr
Review comment:
using `semanticEquals` is safer
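For context on why: `semanticEquals` compares canonicalized expression trees, so two references to the same column that differ only cosmetically still match, where plain `==` would not. A minimal sketch against the Catalyst API (the attribute name and values are made up):

```scala
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, NamedExpression}
import org.apache.spark.sql.types.IntegerType

object SemanticEqualsDemo extends App {
  // Two references to the same underlying column (same ExprId) whose names
  // were written with different casing.
  val id = NamedExpression.newExprId
  val a = AttributeReference("col", IntegerType)(exprId = id)
  val b = AttributeReference("COL", IntegerType)(exprId = id)

  println(a == b)              // false: structural equality compares names verbatim
  println(a.semanticEquals(b)) // true: canonicalization ignores the cosmetic difference
}
```

This is why `expr.child.semanticEquals(structExpr)` in the new case is more robust than `expr.child == structExpr`.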
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
##########
@@ -39,19 +40,14 @@ object SimplifyExtractValueOps extends Rule[LogicalPlan] {
       // Remove redundant field extraction.
       case GetStructField(createNamedStruct: CreateNamedStruct, ordinal, _) =>
         createNamedStruct.valExprs(ordinal)
-      case GetStructField(w @ WithFields(struct, names, valExprs), ordinal, maybeName) =>
-        val name = w.dataType(ordinal).name
-        val matches = names.zip(valExprs).filter(_._1 == name)
-        if (matches.nonEmpty) {
-          // return last matching element as that is the final value for the field being extracted.
-          // For example, if a user submits a query like this:
-          // `$"struct_col".withField("b", lit(1)).withField("b", lit(2)).getField("b")`
-          // we want to return `lit(2)` (and not `lit(1)`).
-          val expr = matches.last._2
-          If(IsNull(struct), Literal(null, expr.dataType), expr)
-        } else {
-          GetStructField(struct, ordinal, maybeName)
-        }
+      case GetStructField(updateFields: UpdateFields, ordinal, _) =>
+        val structExpr = updateFields.structExpr
+        updateFields.newExprs(ordinal) match {
+          // if the struct itself is null, then any value extracted from it (expr) will be null,
+          // so we don't need to wrap expr in If(IsNull(struct), Literal(null, expr.dataType), expr)
+          case expr: GetStructField if expr.child.semanticEquals(structExpr) => expr
+          case expr => If(IsNull(ultimateStruct(structExpr)), Literal(null, expr.dataType), expr)
Review comment:
Shall we apply this rule only after all the `UpdateFields` have been merged? Then we wouldn't need `ultimateStruct`.
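For reference, the merge being suggested could look roughly like the sketch below: collapse nested `UpdateFields` nodes in an earlier rule so that, by the time `SimplifyExtractValueOps` runs, the null check can be performed on the base struct directly. This is a sketch only; it assumes an `UpdateFields(structExpr, fieldOps)` shape, and the rule name is made up:

```scala
import org.apache.spark.sql.catalyst.expressions.UpdateFields
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule

// Fold UpdateFields(UpdateFields(struct, ops1), ops2) into a single node so
// that no UpdateFields ever wraps another. SimplifyExtractValueOps could then
// test IsNull(structExpr) directly instead of walking down via ultimateStruct.
object CombineUpdateFields extends Rule[LogicalPlan] {
  def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
    case UpdateFields(UpdateFields(struct, fieldOps1), fieldOps2) =>
      UpdateFields(struct, fieldOps1 ++ fieldOps2)
  }
}
```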
##########
File path: sql/core/src/main/scala/org/apache/spark/sql/Column.scala
##########
@@ -901,39 +901,125 @@ class Column(val expr: Expression) extends Logging {
    *   // result: org.apache.spark.sql.AnalysisException: Ambiguous reference to fields
    * }}}
    *
+   * This method supports adding/replacing nested fields directly, e.g.
+   *
+   * {{{
+   *   val df = sql("SELECT named_struct('a', named_struct('a', 1, 'b', 2)) struct_col")
+   *   df.select($"struct_col".withField("a.c", lit(3)).withField("a.d", lit(4)))
+   *   // result: {"a":{"a":1,"b":2,"c":3,"d":4}}
+   * }}}
+   *
+   * However, if you are going to add/replace multiple nested fields, it is more efficient to
+   * extract out the nested struct before adding/replacing multiple fields, e.g.
+   *
+   * {{{
+   *   val df = sql("SELECT named_struct('a', named_struct('a', 1, 'b', 2)) struct_col")
+   *   df.select($"struct_col".withField("a", $"struct_col.a".withField("c", lit(3)).withField("d", lit(4))))
+   *   // result: {"a":{"a":1,"b":2,"c":3,"d":4}}
+   * }}}
+   *
Review comment:
I think the same issue happens with `withColumn` as well. I'm fine with documenting it in the method doc.
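To illustrate the `withColumn` analogy: each chained call wraps the plan in another `Project`, so long chains inflate analysis cost, while a single `select` builds one projection. A hedged sketch (the column names are made up):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit}

object WithColumnChainDemo extends App {
  val spark = SparkSession.builder().master("local[1]").getOrCreate()
  val df = spark.range(1).toDF("id")

  // Each withColumn call adds another Project on top of the plan.
  val chained = (1 to 50).foldLeft(df)((d, i) => d.withColumn(s"c$i", lit(i)))

  // One select yields a single Project containing all the new columns.
  val single = df.select(
    df.columns.map(col) ++ (1 to 50).map(i => lit(i).as(s"c$i")): _*)

  println(chained.queryExecution.analyzed.treeString) // deeply nested plan
  println(single.queryExecution.analyzed.treeString)  // a single Project
}
```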
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala
##########
@@ -159,6 +160,14 @@ abstract class QueryTest extends PlanTest {
     checkAnswer(df, expectedAnswer.collect())
   }
 
+  protected def checkAnswer(
Review comment:
If it's only used in one test suite, let's move it there. We can move it
to `QueryTest` if we see more and more suites using it.
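If helpful, the move could be as small as a private helper inside the one suite that uses it. A hypothetical shape only, since the full signature is truncated in the diff above (the suite name, helper name, and parameters are assumptions):

```scala
import org.apache.spark.sql.{DataFrame, QueryTest, Row}
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types.StructType

class ColumnExpressionSuite extends QueryTest with SharedSparkSession {
  // Hypothetical local helper: check both the answer and the result schema,
  // without widening QueryTest's API for a single suite.
  private def checkAnswerAndSchema(
      df: DataFrame,
      expectedAnswer: Seq[Row],
      expectedSchema: StructType): Unit = {
    checkAnswer(df, expectedAnswer)
    assert(df.schema == expectedSchema)
  }
}
```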
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]