cloud-fan commented on code in PR #37165:
URL: https://github.com/apache/spark/pull/37165#discussion_r918667893
##########
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala:
##########
@@ -132,28 +133,76 @@ class CollapseProjectSuite extends PlanTest {
val optimized = Optimize.execute(query)
comparePlans(optimized, query)
+ }
- // CreateStruct is an exception if it's only referenced by ExtractValue.
- val query2 = testRelation
- .select(namedStruct("a", $"a", "a_plus_1", $"a" + 1).as("struct"))
+ test("SPARK-39699: collapse project with collection creation expressions") {
+ val struct = namedStruct(
+ "a", $"a",
+ "a_plus_1", $"a" + 1,
+ "a_plus_2", $"a" + 2,
+ "nested", namedStruct("inner1", $"a" + 3, "inner2", $"a" + 4)
+ ).as("struct")
+ val baseQuery = testRelation.select(struct)
+
+ // Can collapse as there is only one non-cheap access: `struct.a_plus_1`
+ val query1 = baseQuery
.select(($"struct".getField("a") +
$"struct".getField("a_plus_1")).as("add"))
.analyze
- val optimized2 = Optimize.execute(query2)
- val expected2 = testRelation
+ val optimized1 = Optimize.execute(query1)
+ val expected1 = testRelation
.select(($"a" + ($"a" + 1)).as("add"))
.analyze
- comparePlans(optimized2, expected2)
+ comparePlans(optimized1, expected1)
- // referencing `CreateStruct` only once in non-extract expression is OK.
- val query3 = testRelation
- .select(namedStruct("a", $"a", "a_plus_1", $"a" + 1).as("struct"))
- .select($"struct", $"struct".getField("a"))
+ // Cannot collapse as there are two non-cheap accesses: `struct.a_plus_1`
and `struct.a_plus_1`
+ val query2 = baseQuery
+ .select(($"struct".getField("a_plus_1") +
$"struct".getField("a_plus_1")).as("add"))
+ .analyze
+ val optimized2 = Optimize.execute(query2)
+ comparePlans(optimized2, query2)
+
+ // Cannot collapse as there are two non-cheap accesses: `struct.a_plus_1`
and `struct`
+ val query3 = baseQuery
+ .select($"struct".getField("a_plus_1"), $"struct")
.analyze
val optimized3 = Optimize.execute(query3)
- val expected3 = testRelation
- .select(namedStruct("a", $"a", "a_plus_1", $"a" + 1).as("struct"),
$"a".as("struct.a"))
+ comparePlans(optimized3, query3)
+
+ // Can collapse as there is only one non-cheap access: `struct`
+ val query4 = baseQuery
+ .select($"struct".getField("a"), $"struct")
+ .analyze
+ val optimized4 = Optimize.execute(query4)
+ val expected4 = testRelation
+ .select($"a".as("struct.a"), struct)
+ .analyze
+ comparePlans(optimized4, expected4)
+
+ // Referenced by WithFields.
+ val query5 = testRelation.select(namedStruct("a", $"a", "b", $"a" +
1).as("struct"))
+ .select(UpdateFields($"struct", "c", $"struct".getField("a")).as("u"))
+ .analyze
+ val optimized5 = Optimize.execute(query5)
+ val expected5 = testRelation
+ .select(namedStruct("a", $"a", "b", $"a" + 1, "c",
$"a").as("struct").as("u"))
+ .analyze
+ comparePlans(optimized5, expected5)
+
+ // TODO: should collapse as the non-cheap accesses are distinct:
Review Comment:
Distinct accesses are not easy to distinguish, because we need to consider
lineage: e.g. `s.a` and `s.a.b` should not be treated as distinct accesses.
This PR tries to be conservative about collapsing projects and does not
consider distinct accesses.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]