HyukjinKwon commented on code in PR #45819:
URL: https://github.com/apache/spark/pull/45819#discussion_r1548778640
##########
sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala:
##########
@@ -509,6 +497,209 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
}
}
+ test("implicit casting of collated strings") {
+ val tableName = "parquet_dummy_implicit_cast_t22"
+ withTable(tableName) {
+ spark.sql(
+ s"""
+ | CREATE TABLE $tableName(c1 STRING COLLATE UTF8_BINARY_LCASE,
+ | c2 STRING COLLATE UNICODE, c3 STRING COLLATE UNICODE_CI, c4
STRING)
+ | USING PARQUET
+ |""".stripMargin)
+ sql(s"INSERT INTO $tableName VALUES ('a', 'a', 'a', 'a')")
+ sql(s"INSERT INTO $tableName VALUES ('A', 'A', 'A', 'A')")
+
+ // collate literal to c1's collation
+ checkAnswer(sql(s"SELECT c1 FROM $tableName WHERE c1 = 'a'"),
+ Seq(Row("a"), Row("A")))
+ checkAnswer(sql(s"SELECT c1 FROM $tableName WHERE 'a' = c1"),
+ Seq(Row("a"), Row("A")))
+
+ // collate c1 to UTF8_BINARY because it is explicitly set
+ checkAnswer(sql(s"SELECT c1 FROM $tableName WHERE c1 = COLLATE('a',
'UTF8_BINARY')"),
+ Seq(Row("a")))
+
+ // fail with implicit mismatch, as function return should be considered
implicit
+ checkError(
+ exception = intercept[AnalysisException] {
+ sql(s"SELECT c1 FROM $tableName " +
+ s"WHERE c1 = SUBSTR(COLLATE('a', 'UNICODE'), 0)")
+ },
+ errorClass = "COLLATION_MISMATCH.IMPLICIT",
+ parameters = Map.empty
+ )
+
+ // in operator
+ checkAnswer(sql(s"SELECT c1 FROM $tableName WHERE c1 IN ('a')"),
+ Seq(Row("a"), Row("A")))
+ // explicitly set collation inside IN operator
+ checkAnswer(sql(s"SELECT c1 FROM $tableName WHERE c1 IN ('b',
COLLATE('a', 'UTF8_BINARY'))"),
+ Seq(Row("a")))
+
+ // concat without type mismatch
+ checkAnswer(sql(s"SELECT c1 FROM $tableName WHERE c1 || 'a' || 'a' =
'aaa'"),
+ Seq(Row("a"), Row("A")))
+ checkAnswer(sql(s"SELECT c1 FROM $tableName WHERE c1 || COLLATE(c2,
'UTF8_BINARY') = 'aa'"),
+ Seq(Row("a")))
+
+ // concat of columns of different collations is allowed
+ // as long as we don't use the result in an unsupported function
+ // TODO: (SPARK-47210) Add indeterminate support
+ checkError(
+ exception = intercept[AnalysisException] {
+ sql(s"SELECT c1 || c2 FROM $tableName")
+ },
+ errorClass = "COLLATION_MISMATCH.IMPLICIT"
+ )
+
+
+ // concat + in
+ checkAnswer(sql(s"SELECT c1 FROM $tableName where c1 || 'a' " +
+ s"IN (COLLATE('aa', 'UTF8_BINARY_LCASE'))"), Seq(Row("a"), Row("A")))
+ checkAnswer(sql(s"SELECT c1 FROM $tableName where (c1 || 'a') " +
+ s"IN (COLLATE('aa', 'UTF8_BINARY'))"), Seq(Row("a")))
+
+ // columns have different collation
+ checkError(
+ exception = intercept[AnalysisException] {
+ sql(s"SELECT c1 FROM $tableName WHERE c1 = c3")
+ },
+ errorClass = "COLLATION_MISMATCH.IMPLICIT"
+ )
+
+ // different explicit collations are set
+ checkError(
+ exception = intercept[AnalysisException] {
+ sql(
+ s"""
+ |SELECT c1 FROM $tableName
+ |WHERE COLLATE('a', 'UTF8_BINARY') = COLLATE('a', 'UNICODE')"""
+ .stripMargin)
+ },
+ errorClass = "COLLATION_MISMATCH.EXPLICIT",
+ parameters = Map(
+ "explicitTypes" -> "`string`.`string collate UNICODE`"
+ )
+ )
+
+ // in operator has different collations
+ checkError(
+ exception = intercept[AnalysisException] {
+ sql(s"SELECT c1 FROM $tableName WHERE c1 IN " +
+ "(COLLATE('a', 'UTF8_BINARY'), COLLATE('b', 'UNICODE'))")
+ },
+ errorClass = "COLLATION_MISMATCH.EXPLICIT",
+ parameters = Map(
+ "explicitTypes" -> "`string`.`string collate UNICODE`"
+ )
+ )
+ checkError(
+ exception = intercept[AnalysisException] {
+ sql(s"SELECT c1 FROM $tableName WHERE COLLATE(c1, 'UNICODE') IN " +
+ "(COLLATE('a', 'UTF8_BINARY'))")
+ },
+ errorClass = "COLLATION_MISMATCH.EXPLICIT",
+ parameters = Map(
+ "explicitTypes" -> "`string collate UNICODE`.`string`"
+ )
+ )
+
+ // concat on different implicit collations should succeed,
+ // but should fail on try of comparison
+ checkError(
+ exception = intercept[AnalysisException] {
+ sql(s"SELECT c1 FROM $tableName WHERE c1 || c3 = 'aa'")
+ },
+ errorClass = "COLLATION_MISMATCH.IMPLICIT"
+ )
+
+ // concat on different implicit collations should succeed,
+ // but should fail on try of ordering
+ checkError(
+ exception = intercept[AnalysisException] {
+ sql(s"SELECT * FROM $tableName ORDER BY c1 || c3")
+ },
+ errorClass = "COLLATION_MISMATCH.IMPLICIT"
+ )
+
+ // concat + in
+ checkAnswer(sql(s"SELECT c1 FROM $tableName WHERE c1 || COLLATE('a',
'UTF8_BINARY') IN " +
+ s"(COLLATE('aa', 'UNICODE'))"),
+ Seq(Row("a")))
+
+ // array creation supports implicit casting
+ checkAnswer(sql(s"SELECT typeof(array('a' COLLATE UNICODE, 'b')[1])"),
+ Seq(Row("string collate UNICODE")))
+
+ // contains fails with indeterminate collation
+ checkError(
+ exception = intercept[AnalysisException] {
+ sql(s"SELECT * FROM $tableName WHERE contains(c1||c3, 'a')")
+ },
+ errorClass = "COLLATION_MISMATCH.IMPLICIT"
+ )
+
+ checkError(
+ exception = intercept[AnalysisException] {
+ sql(s"SELECT array('A', 'a' COLLATE UNICODE) == array('b' COLLATE
UNICODE_CI)")
+ },
+ errorClass = "COLLATION_MISMATCH.IMPLICIT"
+ )
+ }
+ }
+
+ test("cast of default collated strings in IN expression") {
+ val tableName = "t1"
+ withTable(tableName) {
+ spark.sql(
+ s"""
+ | CREATE TABLE $tableName(utf8_binary STRING COLLATE UTF8_BINARY,
+ | utf8_binary_lcase STRING COLLATE UTF8_BINARY_LCASE)
+ | USING PARQUET
+ |""".stripMargin)
+ sql(s"INSERT INTO $tableName VALUES ('aaa', 'aaa')")
+ sql(s"INSERT INTO $tableName VALUES ('AAA', 'AAA')")
+ sql(s"INSERT INTO $tableName VALUES ('bbb', 'bbb')")
+ sql(s"INSERT INTO $tableName VALUES ('BBB', 'BBB')")
+
+ checkAnswer(sql(s"SELECT * FROM $tableName " +
+ s"WHERE utf8_binary_lcase IN " +
+ s"('aaa' COLLATE UTF8_BINARY_LCASE, 'bbb' COLLATE UTF8_BINARY_LCASE)"),
+ Seq(Row("aaa", "aaa"), Row("AAA", "AAA"), Row("bbb", "bbb"),
Row("BBB", "BBB")))
+ checkAnswer(sql(s"SELECT * FROM $tableName " +
+ s"WHERE utf8_binary_lcase IN ('aaa' COLLATE UTF8_BINARY_LCASE,
'bbb')"),
+ Seq(Row("aaa", "aaa"), Row("AAA", "AAA"), Row("bbb", "bbb"),
Row("BBB", "BBB")))
+ }
+ }
+
+ // TODO: (SPARK-47210) Add indeterminate support
Review Comment:
```suggestion
// TODO(SPARK-47210): Add indeterminate support
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]