miland-db commented on code in PR #45643:
URL: https://github.com/apache/spark/pull/45643#discussion_r1535541459
##########
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##########
@@ -70,6 +74,148 @@ class CollationStringExpressionsSuite extends QueryTest
with SharedSparkSession
})
}
+ test("INSTR check result on non-explicit default collation") {
+ checkEvaluation(StringInstr(Literal("aAads"), Literal("Aa")), 2)
+ }
+
+ test("INSTR check result on explicitly collated strings") {
+ // UTF8_BINARY_LCASE
+ checkEvaluation(StringInstr(Literal.create("aaads", StringType(1)),
+ Literal.create("Aa", StringType(1))), 1)
+ checkEvaluation(StringInstr(Collate(Literal("aaads"), "UTF8_BINARY_LCASE"),
+ Collate(Literal("Aa"), "UTF8_BINARY_LCASE")), 1)
+ // UNICODE
+ checkEvaluation(StringInstr(Literal.create("aaads", StringType(2)),
+ Literal.create("Aa", StringType(2))), 0)
+ checkEvaluation(StringInstr(Collate(Literal("aaads"), "UNICODE"),
+ Collate(Literal("Aa"), "UNICODE")), 0)
+ // UNICODE_CI
+ checkEvaluation(StringInstr(Literal.create("aaads", StringType(3)),
+ Literal.create("de", StringType(3))), 0)
+ checkEvaluation(StringInstr(Collate(Literal("aaads"), "UNICODE_CI"),
+ Collate(Literal("Aa"), "UNICODE_CI")), 0)
+ }
+
+ test("INSTR fail mismatched collation types") {
+ // UNICODE and UNICODE_CI
+ val expr1 = StringInstr(Collate(Literal("aaads"), "UNICODE"),
+ Collate(Literal("Aa"), "UNICODE_CI"))
+ assert(expr1.checkInputDataTypes() ==
+ DataTypeMismatch(
+ errorSubClass = "COLLATION_MISMATCH",
+ messageParameters = Map(
+ "collationNameLeft" -> "UNICODE",
+ "collationNameRight" -> "UNICODE_CI"
+ )
+ )
+ )
+ // DEFAULT(UTF8_BINARY) and UTF8_BINARY_LCASE
+ val expr2 = StringInstr(Literal("aaads"),
+ Collate(Literal("Aa"), "UTF8_BINARY_LCASE"))
+ assert(expr2.checkInputDataTypes() ==
+ DataTypeMismatch(
+ errorSubClass = "COLLATION_MISMATCH",
+ messageParameters = Map(
+ "collationNameLeft" -> "UTF8_BINARY",
+ "collationNameRight" -> "UTF8_BINARY_LCASE"
+ )
+ )
+ )
+ // UTF8_BINARY_LCASE and UNICODE_CI
+ val expr3 = StringInstr(Collate(Literal("aaads"), "UTF8_BINARY_LCASE"),
+ Collate(Literal("Aa"), "UNICODE_CI"))
+ assert(expr3.checkInputDataTypes() ==
+ DataTypeMismatch(
+ errorSubClass = "COLLATION_MISMATCH",
+ messageParameters = Map(
+ "collationNameLeft" -> "UTF8_BINARY_LCASE",
+ "collationNameRight" -> "UNICODE_CI"
+ )
+ )
+ )
+ }
+
+ test("FIND_IN_SET check result on non-explicit default collation") {
+ checkEvaluation(FindInSet(Literal("def"), Literal("abc,b,ab,c,def")), 5)
+ checkEvaluation(FindInSet(Literal("defg"), Literal("abc,b,ab,c,def")), 0)
+ }
+
+ test("FIND_IN_SET check result on explicitly collated strings") {
+ // UTF8_BINARY
+ checkEvaluation(FindInSet(Collate(Literal("a"), "UTF8_BINARY"),
+ Collate(Literal("abc,b,ab,c,def"), "UTF8_BINARY")), 0)
+ checkEvaluation(FindInSet(Collate(Literal("c"), "UTF8_BINARY"),
+ Collate(Literal("abc,b,ab,c,def"), "UTF8_BINARY")), 4)
+ checkEvaluation(FindInSet(Collate(Literal("AB"), "UTF8_BINARY"),
+ Collate(Literal("abc,b,ab,c,def"), "UTF8_BINARY")), 0)
+ checkEvaluation(FindInSet(Collate(Literal("abcd"), "UTF8_BINARY"),
+ Collate(Literal("abc,b,ab,c,def"), "UTF8_BINARY")), 0)
+ // UTF8_BINARY_LCASE
+ checkEvaluation(FindInSet(Collate(Literal("aB"), "UTF8_BINARY_LCASE"),
+ Collate(Literal("abc,b,ab,c,def"), "UTF8_BINARY_LCASE")), 3)
+ checkEvaluation(FindInSet(Collate(Literal("a"), "UTF8_BINARY_LCASE"),
+ Collate(Literal("abc,b,ab,c,def"), "UTF8_BINARY_LCASE")), 0)
+ checkEvaluation(FindInSet(Collate(Literal("abc"), "UTF8_BINARY_LCASE"),
+ Collate(Literal("aBc,b,ab,c,def"), "UTF8_BINARY_LCASE")), 1)
+ checkEvaluation(FindInSet(Collate(Literal("abcd"), "UTF8_BINARY_LCASE"),
+ Collate(Literal("aBc,b,ab,c,def"), "UTF8_BINARY_LCASE")), 0)
+ // UNICODE
+ checkEvaluation(FindInSet(Collate(Literal("a"), "UNICODE"),
+ Collate(Literal("abc,b,ab,c,def"), "UNICODE")), 0)
+ checkEvaluation(FindInSet(Collate(Literal("ab"), "UNICODE"),
+ Collate(Literal("abc,b,ab,c,def"), "UNICODE")), 3)
+ checkEvaluation(FindInSet(Collate(Literal("Ab"), "UNICODE"),
+ Collate(Literal("abc,b,ab,c,def"), "UNICODE")), 0)
+ // UNICODE_CI
+ checkEvaluation(FindInSet(Collate(Literal("a"), "UNICODE_CI"),
+ Collate(Literal("abc,b,ab,c,def"), "UNICODE_CI")), 0)
+ checkEvaluation(FindInSet(Collate(Literal("C"), "UNICODE_CI"),
+ Collate(Literal("abc,b,ab,c,def"), "UNICODE_CI")), 4)
+ checkEvaluation(FindInSet(Collate(Literal("DeF"), "UNICODE_CI"),
+ Collate(Literal("abc,b,ab,c,dEf"), "UNICODE_CI")), 5)
+ checkEvaluation(FindInSet(Collate(Literal("DEFG"), "UNICODE_CI"),
+ Collate(Literal("abc,b,ab,c,def"), "UNICODE_CI")), 0)
+ }
+
+ test("FIND_IN_SET fail mismatched collation types") {
+ // UNICODE and UNICODE_CI
+ val expr1 = FindInSet(Collate(Literal("a"), "UNICODE"),
+ Collate(Literal("abc,b,ab,c,def"), "UNICODE_CI"))
+ assert(expr1.checkInputDataTypes() ==
+ DataTypeMismatch(
+ errorSubClass = "COLLATION_MISMATCH",
+ messageParameters = Map(
+ "collationNameLeft" -> "UNICODE",
+ "collationNameRight" -> "UNICODE_CI"
+ )
+ )
+ )
+ // DEFAULT(UTF8_BINARY) and UTF8_BINARY_LCASE
+ val expr2 = FindInSet(Collate(Literal("a"), "UTF8_BINARY"),
+ Collate(Literal("abc,b,ab,c,def"), "UTF8_BINARY_LCASE"))
+ assert(expr2.checkInputDataTypes() ==
+ DataTypeMismatch(
+ errorSubClass = "COLLATION_MISMATCH",
+ messageParameters = Map(
+ "collationNameLeft" -> "UTF8_BINARY",
+ "collationNameRight" -> "UTF8_BINARY_LCASE"
+ )
+ )
+ )
+ // UTF8_BINARY_LCASE and UNICODE_CI
+ val expr3 = FindInSet(Collate(Literal("a"), "UTF8_BINARY_LCASE"),
+ Collate(Literal("abc,b,ab,c,def"), "UNICODE_CI"))
+ assert(expr3.checkInputDataTypes() ==
+ DataTypeMismatch(
+ errorSubClass = "COLLATION_MISMATCH",
+ messageParameters = Map(
+ "collationNameLeft" -> "UTF8_BINARY_LCASE",
+ "collationNameRight" -> "UNICODE_CI"
+ )
+ )
+ )
+ }
Review Comment:
I have E2E tests prepared. I don't have enough context on this, so I will
go with whichever option you choose.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]