cloud-fan commented on a change in pull request #31404:
URL: https://github.com/apache/spark/pull/31404#discussion_r577739918
##########
File path:
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationSuite.scala
##########
@@ -146,6 +146,96 @@ class SetOperationSuite extends PlanTest {
comparePlans(distinctUnionCorrectAnswer2, optimized2)
}
+ test("SPARK-34283: Remove unnecessary deduplicate in multiple unions") {
+ val query1 = OneRowRelation()
+ .select(Literal(1).as('a))
+ val query2 = OneRowRelation()
+ .select(Literal(2).as('b))
+ val query3 = OneRowRelation()
+ .select(Literal(3).as('c))
+
+ // D - U - D - U - query1
+ //     |       |
+ //     query3  query2
+ val unionQuery1 = Deduplicate(query1.output, Union(
+ Deduplicate(query1.output, Union(query1, query2)), query3)).analyze
+ val optimized1 = Optimize.execute(unionQuery1)
+ val deduplicateUnionCorrectAnswer1 = Deduplicate(query1.output,
+ Union(query1 :: query2 :: query3 :: Nil))
+ comparePlans(deduplicateUnionCorrectAnswer1, optimized1)
+
+ //         query1
+ //         |
+ // D - U - U - query2
+ //     |
+ //     D - U - query2
+ //         |
+ //         query3
+ val unionQuery2 = Deduplicate(query1.output, Union(Union(query1, query2),
+ Deduplicate(query2.output, Union(query2, query3)))).analyze
+ val optimized2 = Optimize.execute(unionQuery2)
+ val deduplicateUnionCorrectAnswer2 =
+ Deduplicate(query1.output, Union(query1 :: query2 :: query2 :: query3 ::
Nil))
+ comparePlans(deduplicateUnionCorrectAnswer2, optimized2)
+
+ val unionQuery3 = Deduplicate(testRelation.output,
+ Union(Deduplicate(testRelation.output,
+ Union(testRelation :: testRelation :: Nil, true, false)) ::
testRelation :: Nil,
+ true, false))
+ val optimized3 = Optimize.execute(unionQuery3)
+ val deduplicateUnionCorrectAnswer3 =
+ Deduplicate(testRelation.output,
+ Union(testRelation :: testRelation :: testRelation :: Nil, true,
false))
+ comparePlans(deduplicateUnionCorrectAnswer3, optimized3, false)
+ }
+
+ test("SPARK-34283: Keep necessary deduplicate in multiple unions") {
+ val query1 = OneRowRelation()
+ .select(Literal(1).as('a))
+ val query2 = OneRowRelation()
+ .select(Literal(2).as('b))
+ val query3 = OneRowRelation()
+ .select(Literal(3).as('c))
+ val query4 = OneRowRelation()
+ .select(Literal(4).as('d))
+
+ // U - D - U - query1
+ // |       |
+ // query3  query2
+ val unionQuery1 = Union(Deduplicate(query1.output, Union(query1, query2)),
query3).analyze
+ val optimized1 = Optimize.execute(unionQuery1)
+ val deduplicateUnionCorrectAnswer1 =
+ Union(Deduplicate(query1.output, Union(query1 :: query2 :: Nil)) ::
query3 :: Nil).analyze
+ comparePlans(deduplicateUnionCorrectAnswer1, optimized1)
Review comment:
nit: if we expect no plan change, we can just write
`comparePlans(unionQuery1, optimized1)`; there is no need to define
`deduplicateUnionCorrectAnswer1`.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]