This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 8749c17c200 [SPARK-39699][SQL] Make CollapseProject smarter about 
collection creation expressions
8749c17c200 is described below

commit 8749c17c20062a81a0de91a0671b7bb198063406
Author: Wenchen Fan <[email protected]>
AuthorDate: Wed Jul 13 13:51:22 2022 +0800

    [SPARK-39699][SQL] Make CollapseProject smarter about collection creation 
expressions
    
    ### What changes were proposed in this pull request?
    
    The rule `CollapseProject` has been improved multiple times, to make sure 
it only collapses projects when there is no regression. However, it still has a 
problem today. For example, if the project below has `struct(c1, expensive_expr 
as c2) as s`, and the project above has `s.c2 + s.c2`, then we should not 
collapse projects because it will duplicate the expensive expression.
    
    This PR makes the rule smarter. If `CreateStruct` expression (or its 
friends) is referenced more than once, we can still collapse projects if the 
`CreateStruct` is only referenced by `GetStructField` and all the accesses to 
it are cheap. Cheap here means the result expression after we optimize 
`GetStructField` and `CreateStruct` is simple.
    
    ### Why are the changes needed?
    
    To avoid bad optimized plan produced by `CollapseProject`.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    new tests
    
    Closes #37165 from cloud-fan/qo.
    
    Authored-by: Wenchen Fan <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../catalyst/expressions/complexTypeCreator.scala  |  30 +++---
 .../spark/sql/catalyst/optimizer/Optimizer.scala   | 107 +++++++++++++++-----
 .../catalyst/optimizer/CollapseProjectSuite.scala  |  79 ++++++++++++---
 .../sql/catalyst/optimizer/complexTypesSuite.scala | 108 ++++++++++++---------
 4 files changed, 224 insertions(+), 100 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index 014e74b7641..cdeb27d0c28 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -25,8 +25,8 @@ import 
org.apache.spark.sql.catalyst.analysis.FunctionRegistry.{FUNC_ALIAS, Func
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.expressions.codegen.Block._
 import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
+import org.apache.spark.sql.catalyst.trees.{LeafLike, UnaryLike}
 import org.apache.spark.sql.catalyst.trees.TreePattern._
-import org.apache.spark.sql.catalyst.trees.UnaryLike
 import org.apache.spark.sql.catalyst.util._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
@@ -605,10 +605,16 @@ case class StringToMap(text: Expression, pairDelim: 
Expression, keyValueDelim: E
 /**
  * Represents an operation to be applied to the fields of a struct.
  */
-trait StructFieldsOperation {
+trait StructFieldsOperation extends Expression with Unevaluable {
 
   val resolver: Resolver = SQLConf.get.resolver
 
+  override def dataType: DataType = throw new IllegalStateException(
+    "StructFieldsOperation.dataType should not be called.")
+
+  override def nullable: Boolean = throw new IllegalStateException(
+    "StructFieldsOperation.nullable should not be called.")
+
   /**
    * Returns an updated list of StructFields and Expressions that will 
ultimately be used
    * as the fields argument for [[StructType]] and as the children argument for
@@ -624,7 +630,7 @@ trait StructFieldsOperation {
  * children, and thereby enable the analyzer to resolve and transform valExpr 
as necessary.
  */
 case class WithField(name: String, valExpr: Expression)
-  extends Unevaluable with StructFieldsOperation with UnaryLike[Expression] {
+  extends StructFieldsOperation with UnaryLike[Expression] {
 
   override def apply(values: Seq[(StructField, Expression)]): 
Seq[(StructField, Expression)] = {
     val newFieldExpr = (StructField(name, valExpr.dataType, valExpr.nullable), 
valExpr)
@@ -644,12 +650,6 @@ case class WithField(name: String, valExpr: Expression)
 
   override def child: Expression = valExpr
 
-  override def dataType: DataType = throw new IllegalStateException(
-    "WithField.dataType should not be called.")
-
-  override def nullable: Boolean = throw new IllegalStateException(
-    "WithField.nullable should not be called.")
-
   override def prettyName: String = "WithField"
 
   override protected def withNewChildInternal(newChild: Expression): WithField 
=
@@ -659,7 +659,7 @@ case class WithField(name: String, valExpr: Expression)
 /**
  * Drop a field by name.
  */
-case class DropField(name: String) extends StructFieldsOperation {
+case class DropField(name: String) extends StructFieldsOperation with 
LeafLike[Expression] {
   override def apply(values: Seq[(StructField, Expression)]): 
Seq[(StructField, Expression)] =
     values.filterNot { case (field, _) => resolver(field.name, name) }
 }
@@ -698,11 +698,13 @@ case class UpdateFields(structExpr: Expression, fieldOps: 
Seq[StructFieldsOperat
   override def prettyName: String = "update_fields"
 
   private lazy val newFieldExprs: Seq[(StructField, Expression)] = {
+    def getFieldExpr(i: Int): Expression = structExpr match {
+      case c: CreateNamedStruct => c.valExprs(i)
+      case _ => GetStructField(structExpr, i)
+    }
+    val fieldsWithIndex = 
structExpr.dataType.asInstanceOf[StructType].fields.zipWithIndex
     val existingFieldExprs: Seq[(StructField, Expression)] =
-      structExpr.dataType.asInstanceOf[StructType].fields.zipWithIndex.map {
-        case (field, i) => (field, GetStructField(structExpr, i))
-      }
-
+      fieldsWithIndex.map { case (field, i) => (field, getFieldExpr(i)) }
     fieldOps.foldLeft(existingFieldExprs)((exprs, op) => op(exprs))
   }
 
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index fa012aac4fa..8ab08ba878e 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -1011,24 +1011,92 @@ object CollapseProject extends Rule[LogicalPlan] with 
AliasHelper {
       .forall {
         case (reference, count) =>
           val producer = producerMap.getOrElse(reference, reference)
-          producer.deterministic && (count == 1 || alwaysInline || {
-            val relatedConsumers = 
consumers.filter(_.references.contains(reference))
-            // It's still exactly-only if there is only one reference in 
non-extract expressions,
-            // as we won't duplicate the expensive CreateStruct-like 
expressions.
-            val extractOnly = relatedConsumers.map(refCountInNonExtract(_, 
reference)).sum <= 1
-            shouldInline(producer, extractOnly)
-          })
+          val relatedConsumers = 
consumers.filter(_.references.contains(reference))
+
+          def cheapToInlineProducer: Boolean = trimAliases(producer) match {
+            // These collection creation functions are not cheap as a 
producer, but we have
+            // optimizer rules that can optimize them out if they are only 
consumed by
+            // ExtractValue (See SimplifyExtractValueOps), so we need to allow 
to inline them to
+            // avoid perf regression. As an example:
+            //   Project(s.a, s.b, Project(create_struct(a, b, c) as s, child))
+            // We should collapse these two projects and eventually get 
Project(a, b, child)
+            case e @ (_: CreateNamedStruct | _: UpdateFields | _: CreateMap | 
_: CreateArray) =>
+              // We can inline the collection creation producer if at most one 
of its access
+              // is non-cheap. Cheap access here means the access can be 
optimized by
+              // `SimplifyExtractValueOps` and become a cheap expression. For 
example,
+              // `create_struct(a, b, c).a` is a cheap access as it can be 
optimized to `a`.
+              // For a query:
+              //   Project(s.a, s, Project(create_struct(a, b, c) as s, child))
+              // We should collapse these two projects and eventually get
+              //   Project(a, create_struct(a, b, c) as s, child)
+              var nonCheapAccessSeen = false
+              def nonCheapAccessVisitor(): Boolean = {
+                // Returns true for all calls after the first.
+                try {
+                  nonCheapAccessSeen
+                } finally {
+                  nonCheapAccessSeen = true
+                }
+              }
+
+              !relatedConsumers.exists(findNonCheapAccesses(_, reference, e, 
nonCheapAccessVisitor))
+
+            case other => isCheap(other)
+          }
+
+          producer.deterministic && (count == 1 || alwaysInline || 
cheapToInlineProducer)
       }
   }
 
-  private def refCountInNonExtract(expr: Expression, ref: Attribute): Int = {
-    def refCount(e: Expression): Int = e match {
-      case a: Attribute if a.semanticEquals(ref) => 1
-      // The first child of `ExtractValue` is the complex type to be extracted.
-      case e: ExtractValue if e.children.head.semanticEquals(ref) => 0
-      case _ => e.children.map(refCount).sum
+  private object ExtractOnlyRef {
+    def unapply(expr: Expression): Option[Attribute] = expr match {
+      case a: Alias => unapply(a.child)
+      case e: ExtractValue => unapply(e.children.head)
+      case a: Attribute => Some(a)
+      case _ => None
+    }
+  }
+
+  private def inlineReference(expr: Expression, ref: Attribute, refExpr: 
Expression): Expression = {
+    expr.transformUp {
+      case a: Attribute if a.semanticEquals(ref) => refExpr
     }
-    refCount(expr)
+  }
+
+  private object SimplifyExtractValueExecutor extends 
RuleExecutor[LogicalPlan] {
+    override val batches = Batch("SimplifyExtractValueOps", FixedPoint(10),
+      SimplifyExtractValueOps,
+      // `SimplifyExtractValueOps` turns map lookup to CaseWhen, and we need 
the following two rules
+      // to further optimize CaseWhen.
+      ConstantFolding,
+      SimplifyConditionals) :: Nil
+  }
+
+  private def simplifyExtractValues(expr: Expression): Expression = {
+    val fakePlan = Project(Seq(Alias(expr, "fake")()), LocalRelation(Nil))
+    SimplifyExtractValueExecutor.execute(fakePlan)
+      .asInstanceOf[Project].projectList.head.asInstanceOf[Alias].child
+  }
+
+  // This method visits the consumer expression tree and finds non-cheap 
accesses to the reference.
+  // It returns true as long as the `nonCheapAccessVisitor` returns true.
+  private def findNonCheapAccesses(
+      consumer: Expression,
+      ref: Attribute,
+      refExpr: Expression,
+      nonCheapAccessVisitor: () => Boolean): Boolean = consumer match {
+    // Direct access to the collection creation producer is non-cheap.
+    case attr: Attribute if attr.semanticEquals(ref) =>
+      nonCheapAccessVisitor()
+
+    // If the collection creation producer is accessed by a `ExtractValue` 
chain, inline it and
+    // apply `SimplifyExtractValueOps` to see if the result expression is 
cheap.
+    case e @ ExtractOnlyRef(attr) if attr.semanticEquals(ref) =>
+      val finalExpr = simplifyExtractValues(inlineReference(e, ref, refExpr))
+      !isCheap(finalExpr) && nonCheapAccessVisitor()
+
+    case _ =>
+      consumer.children.exists(findNonCheapAccesses(_, ref, refExpr, 
nonCheapAccessVisitor))
   }
 
   /**
@@ -1053,20 +1121,13 @@ object CollapseProject extends Rule[LogicalPlan] with 
AliasHelper {
   /**
    * Check if the given expression is cheap that we can inline it.
    */
-  private def shouldInline(e: Expression, extractOnlyConsumer: Boolean): 
Boolean = e match {
+  private def isCheap(e: Expression): Boolean = e match {
     case _: Attribute | _: OuterReference => true
     case _ if e.foldable => true
     // PythonUDF is handled by the rule ExtractPythonUDFs
     case _: PythonUDF => true
     // Alias and ExtractValue are very cheap.
-    case _: Alias | _: ExtractValue => e.children.forall(shouldInline(_, 
extractOnlyConsumer))
-    // These collection create functions are not cheap, but we have optimizer 
rules that can
-    // optimize them out if they are only consumed by ExtractValue, so we need 
to allow to inline
-    // them to avoid perf regression. As an example:
-    //   Project(s.a, s.b, Project(create_struct(a, b, c) as s, child))
-    // We should collapse these two projects and eventually get Project(a, b, 
child)
-    case _: CreateNamedStruct | _: CreateArray | _: CreateMap | _: 
UpdateFields =>
-      extractOnlyConsumer
+    case _: Alias | _: ExtractValue => e.children.forall(isCheap)
     case _ => false
   }
 
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala
index 342ff3264b0..c5f506d4d68 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.optimizer
 import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
-import org.apache.spark.sql.catalyst.expressions.{Alias, Rand}
+import org.apache.spark.sql.catalyst.expressions.{Alias, Rand, UpdateFields}
 import org.apache.spark.sql.catalyst.plans.PlanTest
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
@@ -31,7 +31,8 @@ class CollapseProjectSuite extends PlanTest {
     val batches =
       Batch("Subqueries", FixedPoint(10), EliminateSubqueryAliases) ::
       Batch("CollapseProject", Once, CollapseProject) ::
-      Batch("SimplifyExtractValueOps", Once, SimplifyExtractValueOps) :: Nil
+      Batch("SimplifyExtractValueOps", Once, SimplifyExtractValueOps) ::
+      Batch("ReplaceUpdateFieldsExpression", Once, 
ReplaceUpdateFieldsExpression) :: Nil
   }
 
   val testRelation = LocalRelation($"a".int, $"b".int)
@@ -132,28 +133,76 @@ class CollapseProjectSuite extends PlanTest {
 
     val optimized = Optimize.execute(query)
     comparePlans(optimized, query)
+  }
 
-    // CreateStruct is an exception if it's only referenced by ExtractValue.
-    val query2 = testRelation
-      .select(namedStruct("a", $"a", "a_plus_1", $"a" + 1).as("struct"))
+  test("SPARK-39699: collapse project with collection creation expressions") {
+    val struct = namedStruct(
+      "a", $"a",
+      "a_plus_1", $"a" + 1,
+      "a_plus_2", $"a" + 2,
+      "nested", namedStruct("inner1", $"a" + 3, "inner2", $"a" + 4)
+    ).as("struct")
+    val baseQuery = testRelation.select(struct)
+
+    // Can collapse as there is only one non-cheap access: `struct.a_plus_1`
+    val query1 = baseQuery
       .select(($"struct".getField("a") + 
$"struct".getField("a_plus_1")).as("add"))
       .analyze
-    val optimized2 = Optimize.execute(query2)
-    val expected2 = testRelation
+    val optimized1 = Optimize.execute(query1)
+    val expected1 = testRelation
       .select(($"a" + ($"a" + 1)).as("add"))
       .analyze
-    comparePlans(optimized2, expected2)
+    comparePlans(optimized1, expected1)
 
-    // referencing `CreateStruct` only once in non-extract expression is OK.
-    val query3 = testRelation
-      .select(namedStruct("a", $"a", "a_plus_1", $"a" + 1).as("struct"))
-      .select($"struct", $"struct".getField("a"))
+    // Cannot collapse as there are two non-cheap accesses: `struct.a_plus_1` 
and `struct.a_plus_1`
+    val query2 = baseQuery
+      .select(($"struct".getField("a_plus_1") + 
$"struct".getField("a_plus_1")).as("add"))
+      .analyze
+    val optimized2 = Optimize.execute(query2)
+    comparePlans(optimized2, query2)
+
+    // Cannot collapse as there are two non-cheap accesses: `struct.a_plus_1` 
and `struct`
+    val query3 = baseQuery
+      .select($"struct".getField("a_plus_1"), $"struct")
       .analyze
     val optimized3 = Optimize.execute(query3)
-    val expected3 = testRelation
-      .select(namedStruct("a", $"a", "a_plus_1", $"a" + 1).as("struct"), 
$"a".as("struct.a"))
+    comparePlans(optimized3, query3)
+
+    // Can collapse as there is only one non-cheap access: `struct`
+    val query4 = baseQuery
+      .select($"struct".getField("a"), $"struct")
+      .analyze
+    val optimized4 = Optimize.execute(query4)
+    val expected4 = testRelation
+      .select($"a".as("struct.a"), struct)
+      .analyze
+    comparePlans(optimized4, expected4)
+
+    // Referenced by WithFields.
+    val query5 = testRelation.select(namedStruct("a", $"a", "b", $"a" + 
1).as("struct"))
+      .select(UpdateFields($"struct", "c", $"struct".getField("a")).as("u"))
+      .analyze
+    val optimized5 = Optimize.execute(query5)
+    val expected5 = testRelation
+      .select(namedStruct("a", $"a", "b", $"a" + 1, "c", 
$"a").as("struct").as("u"))
+      .analyze
+    comparePlans(optimized5, expected5)
+
+    // TODO: should collapse as the non-cheap accesses are distinct:
+    //  `struct.a_plus_1` and `struct.a_plus_2`
+    val query6 = baseQuery
+      .select(($"struct".getField("a_plus_1") + 
$"struct".getField("a_plus_2")).as("add"))
+      .analyze
+    val optimized6 = Optimize.execute(query6)
+    comparePlans(optimized6, query6)
+
+    // Cannot collapse as the two non-cheap accesses have a lineage:
+    // `struct.nested` and `struct.nested.inner1`
+    val query7 = baseQuery
+      .select($"struct".getField("nested"), 
$"struct".getField("nested").getField("inner1"))
       .analyze
-    comparePlans(optimized3, expected3)
+    val optimized7 = Optimize.execute(query7)
+    comparePlans(optimized7, query7)
   }
 
   test("preserve top-level alias metadata while collapsing projects") {
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala
index cef0dd48499..71acbdfdd2f 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala
@@ -138,33 +138,33 @@ class ComplexTypesSuite extends PlanTest with 
ExpressionEvalHelper {
           "att1", $"id",
           "att2", $"id" * $"id")),
         CreateNamedStruct(Seq(
-          "att1", $"id" + 1,
-          "att2", ($"id" + 1) * ($"id" + 1))
+          "att1", $"id" + 1L,
+          "att2", $"id")
        ))
       ) as "arr"
     )
-    val query = rel
-      .select(
-        GetArrayStructFields($"arr", StructField("att1", LongType, false), 0, 
1, false) as "a1",
-        GetArrayItem($"arr", 1) as "a2",
-        GetStructField(GetArrayItem($"arr", 1), 0, None) as "a3",
-        GetArrayItem(
-          GetArrayStructFields($"arr",
-            StructField("att1", LongType, false),
-            0,
-            1,
-            false),
-          1) as "a4")
-
-    val expected = relation
-      .select(
-        CreateArray(Seq($"id", $"id" + 1L)) as "a1",
-        CreateNamedStruct(Seq(
-          "att1", ($"id" + 1L),
-          "att2", (($"id" + 1L) * ($"id" + 1L)))) as "a2",
-        ($"id" + 1L) as "a3",
-        ($"id" + 1L) as "a4")
-    checkRule(query, expected)
+    val field = StructField("att1", LongType, false)
+
+    // Can simplify as both the two extractions result to cheap expression: 
$"id"
+    val query1 = rel.select(
+      GetArrayStructFields($"arr", field, 0, 1, false).getItem(0) as "a1",
+      $"arr".getItem(1).getField("att2") as "a2")
+    val expected1 = relation.select($"id" as "a1", $"id" as "a2")
+    checkRule(query1, expected1)
+
+    // Can simplify as only one extraction results to non-cheap expression: 
array($"id", $"id" + 1)
+    val query2 = rel.select(
+      GetArrayStructFields($"arr", field, 0, 1, false) as "a1",
+      $"arr".getItem(1).getField("att2") as "a2")
+    val expected2 = relation.select(CreateArray(Seq($"id", $"id" + 1L)) as 
"a1", $"id" as "a2")
+    checkRule(query2, expected2)
+
+    // Cannot simplify as both extraction result to non-cheap expression:
+    //   array($"id", $"id" + 1), $"id" + 1
+    val query3 = rel.select(
+      GetArrayStructFields($"arr", field, 0, 1, false) as "a1",
+      $"arr".getItem(1).getField("att1") as "a2")
+    checkRule(query3, query3)
   }
 
   test("SPARK-22570: CreateArray should not create a lot of global variables") 
{
@@ -182,27 +182,39 @@ class ComplexTypesSuite extends PlanTest with 
ExpressionEvalHelper {
     val rel = relation
       .select(
         CreateMap(Seq(
-          "r1", CreateNamedStruct(Seq("att1", $"id")),
-          "r2", CreateNamedStruct(Seq("att1", ($"id" + 1L))))) as "m")
-    val query = rel
-      .select(
-        GetMapValue($"m", "r1") as "a1",
-        GetStructField(GetMapValue($"m", "r1"), 0, None) as "a2",
-        GetMapValue($"m", "r32") as "a3",
-        GetStructField(GetMapValue($"m", "r32"), 0, None) as "a4")
-
-    val expected =
-      relation.select(
-        CreateNamedStruct(Seq("att1", $"id")) as "a1",
-        $"id" as "a2",
-        Literal.create(
-          null,
-          StructType(
-            StructField("att1", LongType, nullable = false) :: Nil
-          )
-        ) as "a3",
-        Literal.create(null, LongType) as "a4")
-    checkRule(query, expected)
+          "r1", CreateNamedStruct(Seq("att1", $"id", "att2", $"id" + 1L)),
+          "r2", CreateNamedStruct(Seq("att1", $"id" + 1L, "att2", $"id")))) as 
"m")
+    val structType = new StructType().add("att1", LongType, false).add("att2", 
LongType, false)
+
+    // Can simplify as both the two extractions result to cheap expression: 
$"id"
+    val query1 = rel.select(
+      GetMapValue($"m", "r1").getField("att1") as "a1",
+      GetMapValue($"m", "r2").getField("att2") as "a2")
+    val expected1 = relation.select($"id" as "a1", $"id" as "a2")
+    checkRule(query1, expected1)
+
+    // Can simplify as only one extraction results to non-cheap expression: 
$"id" + 1
+    val query2 = rel.select(
+      GetMapValue($"m", "r1").getField("att1") as "a1",
+      GetMapValue($"m", "r2").getField("att1") as "a2")
+    val expected2 = relation.select($"id" as "a1", ($"id" + 1L) as "a2")
+    checkRule(query2, expected2)
+
+    // Can simplify as only one extraction results to non-cheap expression: 
$"id" + 1
+    val query3 = rel.select(
+      // key "r3" does not exist, so this extraction leads to null (or failure 
with ANSI mode)
+      // which is a cheap expression.
+      GetMapValue($"m", "r3") as "a1",
+      GetMapValue($"m", "r2").getField("att1") as "a2")
+    val expected3 = relation.select(Literal(null, structType) as "a1", ($"id" 
+ 1L) as "a2")
+    checkRule(query3, expected3)
+
+    // Cannot simplify as both extraction result to non-cheap expression:
+    //   struct($"id", $"id" + 1), $"id" + 1
+    val query4 = rel.select(
+      GetMapValue($"m", "r1") as "a1",
+      GetMapValue($"m", "r2").getField("att1") as "a2")
+    checkRule(query4, query4)
   }
 
   test("simplify map ops, constant lookup, dynamic keys") {
@@ -329,7 +341,7 @@ class ComplexTypesSuite extends PlanTest with 
ExpressionEvalHelper {
             "att1", $"id",
             "att2", $"id" * $"id")),
           CreateNamedStruct(Seq(
-            "att1", $"id" + 1,
+            "att1", $"id",
             "att2", ($"id" + 1) * ($"id" + 1))
           ))
         ) as "arr")
@@ -346,8 +358,8 @@ class ComplexTypesSuite extends PlanTest with 
ExpressionEvalHelper {
 
     val expected = LocalRelation($"id".long)
       .select(
-        ($"id" + 1L) as "a1",
-        ($"id" + 1L) as "a2")
+        $"id" as "a1",
+        $"id" as "a2")
       .orderBy($"id".asc)
     checkRule(query, expected)
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to