dbatomic commented on code in PR #46722:
URL: https://github.com/apache/spark/pull/46722#discussion_r1622198569
##########
sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala:
##########
@@ -1046,6 +1046,211 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
})
}
+ test("hash join should be used for arrays of collated strings") {
+ val t1 = "T_1"
+ val t2 = "T_2"
+
+ case class HashJoinTestCase[R](collation: String, result: R)
+ val testCases = Seq(
+ HashJoinTestCase("UTF8_BINARY",
+ Seq(Row(Seq("aa"), 1, Seq("aa"), 2))),
+ HashJoinTestCase("UTF8_BINARY_LCASE",
+ Seq(Row(Seq("aa"), 1, Seq("AA"), 2), Row(Seq("aa"), 1, Seq("aa"), 2))),
+ HashJoinTestCase("UNICODE",
+ Seq(Row(Seq("aa"), 1, Seq("aa"), 2))),
+ HashJoinTestCase("UNICODE_CI",
+ Seq(Row(Seq("aa"), 1, Seq("AA"), 2), Row(Seq("aa"), 1, Seq("aa"), 2)))
+ )
+
+ testCases.foreach(t => {
+ withTable(t1, t2) {
+ sql(s"CREATE TABLE $t1 (x ARRAY<STRING COLLATE ${t.collation}>, i int)
USING PARQUET")
+ sql(s"INSERT INTO $t1 VALUES (array('aa'), 1)")
+
+ sql(s"CREATE TABLE $t2 (y ARRAY<STRING COLLATE ${t.collation}>, j int)
USING PARQUET")
+ sql(s"INSERT INTO $t2 VALUES (array('AA'), 2), (array('aa'), 2)")
+
+ val df = sql(s"SELECT * FROM $t1 JOIN $t2 ON $t1.x = $t2.y")
+ checkAnswer(df, t.result)
+
+ val queryPlan = df.queryExecution.executedPlan
+
+ // confirm that hash join is used instead of sort merge join
+ assert(
+ collectFirst(queryPlan) {
+ case _: HashJoin => ()
+ }.nonEmpty
+ )
+ assert(
+ collectFirst(queryPlan) {
+ case _: ShuffledJoin => ()
+ }.isEmpty
+ )
+
+ // if collation doesn't support binary equality, collation key should
be injected
+ if
(!CollationFactory.fetchCollation(t.collation).supportsBinaryEquality) {
Review Comment:
Can you also:
1) Verify that there is no ArrayTransform in the join keys when the collation supports binary equality.
2) Check that CollationKey is part of the ArrayTransform function when the collation does not support binary equality.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]