This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 653ac5b729e2 [SPARK-47423][SQL] Collations - Set operation support for strings with collations 653ac5b729e2 is described below commit 653ac5b729e2eba9bf097905b3fd136603b7a298 Author: Aleksandar Tomic <aleksandar.to...@databricks.com> AuthorDate: Sat Mar 16 09:21:08 2024 +0500 [SPARK-47423][SQL] Collations - Set operation support for strings with collations ### What changes were proposed in this pull request? This PR fixes support for set operations on strings whose collation is not `UTF8_BINARY`. The fix is not strictly related to set operations and may resolve other problems in the collation space. The fix is to add a default value for `StringType` with a collation. Previously, the `case StringType` pattern did not catch a `StringType` with a collation; the fix is simply to match on `st: StringType` instead of relying on the `StringType` match. ### Why are the changes needed? To fix the behaviour of set operations. ### Does this PR introduce _any_ user-facing change? Yes - it fixes logic that previously didn't work. ### How was this patch tested? Golden file tests were added. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45536 from dbatomic/collations_and_set_ops. 
Authored-by: Aleksandar Tomic <aleksandar.to...@databricks.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../spark/sql/catalyst/expressions/literals.scala | 2 +- .../sql-tests/analyzer-results/collations.sql.out | 51 +++++++++++++++++++++ .../test/resources/sql-tests/inputs/collations.sql | 7 +++ .../resources/sql-tests/results/collations.sql.out | 53 ++++++++++++++++++++++ 4 files changed, 112 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 9603647db06f..eadd4c04f4b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -195,7 +195,7 @@ object Literal { case TimestampNTZType => create(0L, TimestampNTZType) case it: DayTimeIntervalType => create(0L, it) case it: YearMonthIntervalType => create(0, it) - case StringType => Literal("") + case st: StringType => Literal(UTF8String.fromString(""), st) case BinaryType => Literal("".getBytes(StandardCharsets.UTF_8)) case CalendarIntervalType => Literal(new CalendarInterval(0, 0, 0)) case arr: ArrayType => create(Array(), arr) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out index fff2d4eab717..6d9bb3470be6 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out @@ -149,6 +149,57 @@ DropTable false, false +- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t1 +-- !query +select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +-- !query analysis +Except false 
+:- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] +: +- LocalRelation [col1#x] ++- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] + +- LocalRelation [col1#x] + + +-- !query +select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +-- !query analysis +Except All true +:- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] +: +- LocalRelation [col1#x] ++- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] + +- LocalRelation [col1#x] + + +-- !query +select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +-- !query analysis +Distinct ++- Union false, false + :- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] + : +- LocalRelation [col1#x] + +- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] + +- LocalRelation [col1#x] + + +-- !query +select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +-- !query analysis +Union false, false +:- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] +: +- LocalRelation [col1#x] ++- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] + +- LocalRelation [col1#x] + + +-- !query +select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') intersect select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +-- !query analysis +Intersect false +:- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] +: +- LocalRelation [col1#x] ++- Project [collate(col1#x, utf8_binary_lcase) AS collate(col1)#x] + +- LocalRelation [col1#x] + + -- !query create table t1 (c1 struct<utf8_binary: string collate utf8_binary, 
utf8_binary_lcase: string collate utf8_binary_lcase>) USING PARQUET -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/inputs/collations.sql b/sql/core/src/test/resources/sql-tests/inputs/collations.sql index af87f7a321c2..52ce58b80823 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/collations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/collations.sql @@ -39,6 +39,13 @@ select * from t1 anti join t2 on t1.utf8_binary_lcase = t2.utf8_binary_lcase; drop table t2; drop table t1; +-- set operations +select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'); +select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'); +select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'); +select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'); +select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') intersect select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'); + -- create table with struct field create table t1 (c1 struct<utf8_binary: string collate utf8_binary, utf8_binary_lcase: string collate utf8_binary_lcase>) USING PARQUET; diff --git a/sql/core/src/test/resources/sql-tests/results/collations.sql.out b/sql/core/src/test/resources/sql-tests/results/collations.sql.out index 70ea4058655a..7d7c054c2b08 100644 --- a/sql/core/src/test/resources/sql-tests/results/collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/collations.sql.out @@ -158,6 +158,59 @@ struct<> +-- !query +select col1 collate 
utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +-- !query schema +struct<collate(col1):string collate UTF8_BINARY_LCASE> +-- !query output +zzz + + +-- !query +select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') except all select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +-- !query schema +struct<collate(col1):string collate UTF8_BINARY_LCASE> +-- !query output +aaa +bbb +zzz +zzz + + +-- !query +select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +-- !query schema +struct<collate(col1):string collate UTF8_BINARY_LCASE> +-- !query output +aaa +bbb +zzz + + +-- !query +select col1 collate utf8_binary_lcase from values ('aaa'), ('AAA'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') union all select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +-- !query schema +struct<collate(col1):string collate UTF8_BINARY_LCASE> +-- !query output +AAA +BBB +ZZZ +aaa +aaa +bbb +bbb +zzz + + +-- !query +select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb'), ('BBB'), ('zzz'), ('ZZZ') intersect select col1 collate utf8_binary_lcase from values ('aaa'), ('bbb') +-- !query schema +struct<collate(col1):string collate UTF8_BINARY_LCASE> +-- !query output +aaa +bbb + + -- !query create table t1 (c1 struct<utf8_binary: string collate utf8_binary, utf8_binary_lcase: string collate utf8_binary_lcase>) USING PARQUET -- !query schema --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org