uros-db commented on code in PR #45216:
URL: https://github.com/apache/spark/pull/45216#discussion_r1509076371
##########
sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala:
##########
@@ -183,6 +183,266 @@ class CollationSuite extends DatasourceV2SQLBase {
}
}
+ test("checkCollation throws exception for incompatible collationIds") { // mixed-collation calls must fail
+ val left: String = "abc" // left operand; collated with 'UNICODE_CI' in every query below
+ val leftCollationName: String = "UNICODE_CI";
+ var right: String = null // right operand; collated with 'UNICODE', reassigned to a substring of left before each check
+ val rightCollationName: String = "UNICODE";
+ // contains: right = "b", taken from the middle of left
+ right = left.substring(1, 2);
+ checkError(
+ exception = intercept[SparkException] { // running the query is expected to throw
+ spark.sql(s"SELECT contains(collate('$left', '$leftCollationName')," +
+ s"collate('$right', '$rightCollationName'))").collect()
+ },
+ errorClass = "COLLATION_MISMATCH",
+ sqlState = "42K09",
+ parameters = Map( // the error is expected to name both collations
+ "collationNameLeft" -> s"$leftCollationName",
+ "collationNameRight" -> s"$rightCollationName"
+ )
+ )
+ // startsWith: right = "a", the first character of left
+ right = left.substring(0, 1);
+ checkError(
+ exception = intercept[SparkException] { // running the query is expected to throw
+ spark.sql(s"SELECT startsWith(collate('$left', '$leftCollationName'),"
+
+ s"collate('$right', '$rightCollationName'))").collect()
+ },
+ errorClass = "COLLATION_MISMATCH",
+ sqlState = "42K09",
+ parameters = Map( // same expected parameters as the contains check
+ "collationNameLeft" -> s"$leftCollationName",
+ "collationNameRight" -> s"$rightCollationName"
+ )
+ )
+ // endsWith: right = "c", the last character of left
+ right = left.substring(2, 3);
+ checkError(
+ exception = intercept[SparkException] { // running the query is expected to throw
+ spark.sql(s"SELECT endsWith(collate('$left', '$leftCollationName')," +
+ s"collate('$right', '$rightCollationName'))").collect()
+ },
+ errorClass = "COLLATION_MISMATCH",
+ sqlState = "42K09",
+ parameters = Map( // same expected parameters as the contains check
+ "collationNameLeft" -> s"$leftCollationName",
+ "collationNameRight" -> s"$rightCollationName"
+ )
+ )
+ }
+
+ test("Support contains string expression with Collation") {
+ // Test 'contains' with different collations
+ var listLeft: List[String] = List()
+ var listRight: List[String] = List()
+ var listResult: List[Boolean] = List()
+
+ // UCS_BASIC (default) & UNICODE collation
+ listLeft = List("", "c", "abc", "cde", "abde", "abcde", "C", "ABC", "CDE",
"ABDE", "ABCDE")
+ listRight = List("", "c", "abc", "cde", "abde", "abcde", "C", "ABC",
"CDE", "ABDE", "ABCDE")
+ listResult = List(
+ // "" c abc cde abde abcde C ABC CDE ABDE
ABCDE
+ true, false, false, false, false, false, false, false, false, false,
false, // ""
+ true, true, false, false, false, false, false, false, false, false,
false, // c
+ true, true, true, false, false, false, false, false, false, false,
false, // abc
+ true, true, false, true, false, false, false, false, false, false,
false, // cde
+ true, false, false, false, true, false, false, false, false, false,
false, // abde
+ true, true, true, true, false, true, false, false, false, false, false,
// abcde
+ true, false, false, false, false, false, true, false, false, false,
false, // C
+ true, false, false, false, false, false, true, true, false, false,
false, // ABC
+ true, false, false, false, false, false, true, false, true, false,
false, // CDE
+ true, false, false, false, false, false, false, false, false, true,
false, // ABDE
+ true, false, false, false, false, false, true, true, true, false, true)
// ABCDE
Review Comment:
While it may seem a bit unusual, I think this matrix approach covers a broad
spectrum of test cases and generally works really well for this set of
functions - covering various edge cases and different collation types. This was
especially useful when debugging and experimenting with new collations; e.g.,
imagine throwing Serbian (ć, Ć) or German (ä, Ä) collations into the mix with
the other possible `abc`s.
When I first wrote it as a standard linear set of tests, it was much harder
to see how and why these functions behave the way they do with different
collations, while this nicely aligned matrix gives a pretty clear overview, all
in one place. (In addition, it was previously easy to miss something and not
cover all cases, and hard-coding the expected results was extra tedious.)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]