GideonPotok commented on code in PR #46040:
URL: https://github.com/apache/spark/pull/46040#discussion_r1570170214
##########
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##########
@@ -212,6 +212,119 @@ class CollationStringExpressionsSuite
})
}
+ test("Support Left/Right/Substr with collation") {
+ case class SubstringTestCase(query: String, collation: String, result: Row)
+ val checks = Seq(
+ SubstringTestCase(
+ "select substr('example' collate " + "utf8_binary_lcase" + ", 1, 100)",
+ "utf8_binary_lcase",
+ Row("example")),
+ SubstringTestCase(
+ "select substr('example' collate " + "utf8_binary" + ", 2, 2)",
+ "utf8_binary",
+ Row("xa")),
+ SubstringTestCase(
+ "select right('' collate " + "utf8_binary_lcase" + ", 1)",
+ "utf8_binary_lcase",
+ Row("")),
+ SubstringTestCase(
+ "select substr('example' collate " + "unicode" + ", 0, 0)",
+ "unicode",
+ Row("")),
+ SubstringTestCase(
+ "select substr('example' collate " + "unicode_ci" + ", -3, 2)",
+ "unicode_ci",
+ Row("pl")),
+ SubstringTestCase(
+ "select substr(' a世a ' collate " + "utf8_binary_lcase" + ", 2, 3)", //
scalastyle:ignore
+ "utf8_binary_lcase",
+ Row("a世a")), // scalastyle:ignore
+ SubstringTestCase(
+ "select left(' a世a ' collate " + "utf8_binary" + ", 3)", //
scalastyle:ignore
+ "utf8_binary",
+ Row(" a世")), // scalastyle:ignore
+ SubstringTestCase(
+ "select right(' a世a ' collate " + "unicode" + ", 3)", //
scalastyle:ignore
+ "unicode",
+ Row("世a ")), // scalastyle:ignore
+ SubstringTestCase(
+ "select left('ÀÃÂĀĂȦÄäåäáâãȻȻȻȻȻǢǼÆ' collate " + "unicode_ci" + ",
3)", // scalastyle:ignore
+ "unicode_ci",
+ Row("ÀÃÂ")), // scalastyle:ignore
+ SubstringTestCase(
+ "select right('ÀÃÂĀĂȦÄäåäáâãȻȻȻȻȻǢǼÆ' collate " + "utf8_binary_lcase"
+ ", 3)", // scalastyle:ignore
+ "utf8_binary_lcase",
+ Row("ǢǼÆ")), // scalastyle:ignore
+ SubstringTestCase(
+ "select substr('' collate " + "utf8_binary_lcase" + ", 1, 1)",
+ "utf8_binary_lcase",
+ Row("")),
+ SubstringTestCase(
+ "select substr('' collate " + "unicode" + ", 1, 1)",
+ "unicode",
+ Row("")),
+ SubstringTestCase(
+ "select left('' collate " + "utf8_binary" + ", 1)",
+ "utf8_binary",
+ Row("")),
+ // improper values
+ SubstringTestCase(
+ "select left(null collate " + "utf8_binary_lcase" + ", 1)",
+ "utf8_binary_lcase",
+ Row(null)),
+ SubstringTestCase(
+ "select right(null collate " + "unicode" + ", 1)",
+ "unicode",
+ Row(null)),
+ SubstringTestCase(
+ "select substr(null collate " + "utf8_binary" + ", 1)",
+ "utf8_binary",
+ Row(null)),
+ SubstringTestCase(
+ "select substr(null collate " + "unicode_ci" + ", 1, 1)",
+ "unicode_ci",
+ Row(null)),
+ SubstringTestCase(
+ "select left(null collate " + "utf8_binary_lcase" + ", null)",
+ "utf8_binary_lcase",
+ Row(null)),
+ SubstringTestCase(
+ "select right(null collate " + "unicode" + ", null)",
+ "unicode",
+ Row(null)),
+ SubstringTestCase(
+ "select substr(null collate " + "utf8_binary" + ", null)",
+ "utf8_binary",
+ Row(null)),
+ SubstringTestCase(
+ "select substr(null collate " + "unicode_ci" + ", null, null)",
+ "unicode_ci",
+ Row(null)),
+ SubstringTestCase(
+ "select left('ÀÃÂĀĂȦÄäåäáâãȻȻȻȻȻǢǼÆ' collate " + "utf8_binary_lcase" +
", null)", // scalastyle:ignore
+ "utf8_binary_lcase",
+ Row(null)),
+ SubstringTestCase(
+ "select right('ÀÃÂĀĂȦÄäåäáâãȻȻȻȻȻǢǼÆ' collate " + "unicode" + ",
null)", // scalastyle:ignore
+ "unicode",
+ Row(null)),
+ SubstringTestCase(
+ "select substr('ÀÃÂĀĂȦÄäåäáâãȻȻȻȻȻǢǼÆ' collate " + "utf8_binary" + ",
null)", // scalastyle:ignore
+ "utf8_binary",
+ Row(null)),
+ SubstringTestCase(
+ "select substr('' collate " + "unicode_ci" + ", null, null)",
+ "unicode_ci",
+ Row(null))
Review Comment:
@uros-db I will change it accordingly. Please advise: do you want three
case classes, or a single case class with a parameter for the function name? If
the latter (one case class), how would you like me to handle the third parameter
(`len`), which `left` and `right` do not have and which is optional for
`substr`? Perhaps with an `Option[String]`?
Is the number of tests satisfactory? I reduced it from 112 tests to 25:
13 for valid values and 12 for invalid values. I can reduce it further
to 12 valid test cases and, say, 6 invalid ones if you prefer.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]