This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 0a94e7c7b9a8 [SPARK-47302][SQL] Collate keyword as identifier
0a94e7c7b9a8 is described below
commit 0a94e7c7b9a83ecfbca59f5b93532453f462500c
Author: Aleksandar Tomic <[email protected]>
AuthorDate: Fri Mar 8 13:33:21 2024 +0300
[SPARK-47302][SQL] Collate keyword as identifier
### What changes were proposed in this pull request?
With this change we move away from using collation names as string literals
and start treating them as identifiers, since that is part of the SQL standard.
Collation names are marked as multi-part identifiers, since, in the future, we
will want to introduce user-defined collations which can be part of nested
namespaces in the catalog.
### Why are the changes needed?
Aligning with the SQL standard on collation syntax.
### Does this PR introduce _any_ user-facing change?
Yes. Collations are still not a released feature.
### How was this patch tested?
Existing tests are used.
### Was this patch authored or co-authored using generative AI tooling?
Closes #45405 from dbatomic/collate_key_word_as_identifier.
Lead-authored-by: Aleksandar Tomic <[email protected]>
Co-authored-by: Nikola Mandic <[email protected]>
Co-authored-by: Stefan Kandic <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../queries/select_collated_string.json | 2 +-
.../queries/select_collated_string.proto.bin | Bin 65 -> 63 bytes
.../connect/planner/SparkConnectProtoSuite.scala | 4 +-
python/pyspark/sql/tests/test_types.py | 18 ++++-----
python/pyspark/sql/types.py | 4 +-
.../spark/sql/catalyst/parser/SqlBaseParser.g4 | 2 +-
.../sql/catalyst/parser/DataTypeAstBuilder.scala | 2 +-
.../org/apache/spark/sql/types/DataType.scala | 2 +-
.../org/apache/spark/sql/types/StringType.scala | 2 +-
.../expressions/collationExpressions.scala | 2 +-
.../spark/sql/catalyst/parser/AstBuilder.scala | 2 +-
.../sql-functions/sql-expression-schema.md | 2 +-
.../sql-tests/analyzer-results/collations.sql.out | 26 ++++++-------
.../test/resources/sql-tests/inputs/collations.sql | 10 ++---
.../resources/sql-tests/results/collations.sql.out | 22 +++++------
.../org/apache/spark/sql/CollationSuite.scala | 42 +++++++++++++--------
.../sql/errors/QueryCompilationErrorsSuite.scala | 6 +--
17 files changed, 79 insertions(+), 69 deletions(-)
diff --git
a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json
b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json
index db065b36e345..00644e072190 100644
---
a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json
+++
b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json
@@ -8,7 +8,7 @@
"planId": "0"
},
"localRelation": {
- "schema": "struct\u003cs:string COLLATE
\u0027UCS_BASIC_LCASE\u0027\u003e"
+ "schema": "struct\u003cs:string COLLATE UCS_BASIC_LCASE\u003e"
}
},
"expressions": [{
diff --git
a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin
b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin
index 3a5661e54ce0..b247463daf43 100644
Binary files
a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin
and
b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin
differ
diff --git
a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
index bd52a16d5b22..1b50936d935a 100644
---
a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
+++
b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
@@ -1047,8 +1047,8 @@ class SparkConnectProtoSuite extends PlanTest with
SparkConnectPlanTest {
test("SPARK-47144: Collated string") {
Seq("UCS_BASIC", "UCS_BASIC_LCASE", "UNICODE",
"UNICODE_CI").map(collationName =>
Seq(
- s"select 'abc' collate '$collationName'",
- s"select collation('abc' collate '$collationName')").map(query =>
+ s"select 'abc' collate $collationName",
+ s"select collation('abc' collate $collationName)").map(query =>
comparePlans(connect.sql(query), spark.sql(query))))
}
diff --git a/python/pyspark/sql/tests/test_types.py
b/python/pyspark/sql/tests/test_types.py
index e0f81dc08743..a0dfdce1a96e 100644
--- a/python/pyspark/sql/tests/test_types.py
+++ b/python/pyspark/sql/tests/test_types.py
@@ -862,15 +862,15 @@ class TypesTestsMixin:
if k != "varchar" and k != "char":
self.assertEqual(t(), _parse_datatype_string(k))
self.assertEqual(IntegerType(), _parse_datatype_string("int"))
- self.assertEqual(StringType(), _parse_datatype_string("string COLLATE
'UCS_BASIC'"))
+ self.assertEqual(StringType(), _parse_datatype_string("string COLLATE
UCS_BASIC"))
self.assertEqual(StringType(0), _parse_datatype_string("string"))
- self.assertEqual(StringType(0), _parse_datatype_string("string COLLATE
'UCS_BASIC'"))
- self.assertEqual(StringType(0), _parse_datatype_string("string
COLLATE 'UCS_BASIC'"))
- self.assertEqual(StringType(0), _parse_datatype_string("string
COLLATE'UCS_BASIC'"))
- self.assertEqual(StringType(1), _parse_datatype_string("string COLLATE
'UCS_BASIC_LCASE'"))
- self.assertEqual(StringType(1), _parse_datatype_string("string COLLATE
'UCS_BASIC_LCASE'"))
- self.assertEqual(StringType(2), _parse_datatype_string("string COLLATE
'UNICODE'"))
- self.assertEqual(StringType(3), _parse_datatype_string("string COLLATE
'UNICODE_CI'"))
+ self.assertEqual(StringType(0), _parse_datatype_string("string COLLATE
UCS_BASIC"))
+ self.assertEqual(StringType(0), _parse_datatype_string("string COLLATE
UCS_BASIC"))
+ self.assertEqual(StringType(1), _parse_datatype_string("string COLLATE
UCS_BASIC_LCASE"))
+ self.assertEqual(StringType(2), _parse_datatype_string("string COLLATE
UNICODE"))
+ self.assertEqual(StringType(2), _parse_datatype_string("string COLLATE
`UNICODE`"))
+ self.assertEqual(StringType(3), _parse_datatype_string("string COLLATE
UNICODE_CI"))
+ self.assertEqual(StringType(3), _parse_datatype_string("string COLLATE
`UNICODE_CI`"))
self.assertEqual(CharType(1), _parse_datatype_string("char(1)"))
self.assertEqual(CharType(10), _parse_datatype_string("char( 10 )"))
self.assertEqual(CharType(11), _parse_datatype_string("char( 11)"))
@@ -1410,7 +1410,7 @@ class TypesTestsMixin:
def test_collated_string(self):
dfs = [
- self.spark.sql("SELECT 'abc' collate 'UCS_BASIC_LCASE'"),
+ self.spark.sql("SELECT 'abc' collate UCS_BASIC_LCASE"),
self.spark.createDataFrame([], StructType([StructField("id",
StringType(1))])),
]
for df in dfs:
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index c4647b90767f..a30f41ae4023 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -261,7 +261,7 @@ class StringType(AtomicType):
def collationIdToName(self) -> str:
return (
- " COLLATE '%s'" % StringType.collationNames[self.collationId]
+ " COLLATE %s" % StringType.collationNames[self.collationId]
if self.collationId != 0
else ""
)
@@ -1486,7 +1486,7 @@ _all_complex_types: Dict[str, Type[Union[ArrayType,
MapType, StructType]]] = dic
(v.typeName(), v) for v in _complex_types
)
-_COLLATED_STRING = re.compile(r"string\s+COLLATE\s+'([\w_]+)'")
+_COLLATED_STRING = re.compile(r"string\s+COLLATE\s+([\w_]+|`[\w_]`)")
_LENGTH_CHAR = re.compile(r"char\(\s*(\d+)\s*\)")
_LENGTH_VARCHAR = re.compile(r"varchar\(\s*(\d+)\s*\)")
_FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(-?\d+)\s*\)")
diff --git
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
index ca01de4ffdc2..801cc62491a2 100644
---
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
+++
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
@@ -1096,7 +1096,7 @@ colPosition
;
collateClause
- : COLLATE collationName=stringLit
+ : COLLATE collationName=identifier
;
type
diff --git
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala
b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala
index 0d2822e13efc..5c2df6a6e9d9 100644
---
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala
+++
b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala
@@ -218,6 +218,6 @@ class DataTypeAstBuilder extends
SqlBaseParserBaseVisitor[AnyRef] {
* Returns a collation name.
*/
override def visitCollateClause(ctx: CollateClauseContext): String =
withOrigin(ctx) {
- string(visitStringLit(ctx.stringLit))
+ ctx.identifier.getText
}
}
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
index efaf6e6bfd6a..2ffe0ba379dd 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -117,7 +117,7 @@ object DataType {
private val FIXED_DECIMAL = """decimal\(\s*(\d+)\s*,\s*(\-?\d+)\s*\)""".r
private val CHAR_TYPE = """char\(\s*(\d+)\s*\)""".r
private val VARCHAR_TYPE = """varchar\(\s*(\d+)\s*\)""".r
- private val COLLATED_STRING_TYPE = """string\s+COLLATE\s+'([\w_]+)'""".r
+ private val COLLATED_STRING_TYPE =
"""string\s+COLLATE\s+([\w_]+|`[\w_]`)""".r
def fromDDL(ddl: String): DataType = {
parseTypeWithFallback(
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala
b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala
index 313f525742ae..b434c98edd2a 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala
@@ -47,7 +47,7 @@ class StringType private(val collationId: Int) extends
AtomicType with Serializa
*/
override def typeName: String =
if (isDefaultCollation) "string"
- else s"string COLLATE
'${CollationFactory.fetchCollation(collationId).collationName}'"
+ else s"string COLLATE
${CollationFactory.fetchCollation(collationId).collationName}"
override def equals(obj: Any): Boolean =
obj.isInstanceOf[StringType] && obj.asInstanceOf[StringType].collationId
== collationId
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala
index e51d9a67b166..99237d978dee 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala
@@ -37,7 +37,7 @@ import org.apache.spark.sql.types._
Examples:
> SET spark.sql.collation.enabled=true;
spark.sql.collation.enabled true
- > SELECT COLLATION('Spark SQL' _FUNC_ 'UCS_BASIC_LCASE');
+ > SELECT COLLATION('Spark SQL' _FUNC_ UCS_BASIC_LCASE);
UCS_BASIC_LCASE
> SET spark.sql.collation.enabled=false;
spark.sql.collation.enabled false
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index 48d8b8ad8f4d..9fed85b2a2a7 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -2194,7 +2194,7 @@ class AstBuilder extends DataTypeAstBuilder with
SQLConfHelper with Logging {
if (!SQLConf.get.collationEnabled) {
throw QueryCompilationErrors.collationNotEnabledError()
}
- string(visitStringLit(ctx.stringLit))
+ ctx.identifier.getText
}
/**
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index e20db3b49589..33c26fe3c036 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -81,7 +81,7 @@
| org.apache.spark.sql.catalyst.expressions.Chr | char | SELECT char(65) |
struct<char(65):string> |
| org.apache.spark.sql.catalyst.expressions.Chr | chr | SELECT chr(65) |
struct<chr(65):string> |
| org.apache.spark.sql.catalyst.expressions.Coalesce | coalesce | SELECT
coalesce(NULL, 1, NULL) | struct<coalesce(NULL, 1, NULL):int> |
-| org.apache.spark.sql.catalyst.expressions.CollateExpressionBuilder | collate
| SELECT COLLATION('Spark SQL' collate 'UCS_BASIC_LCASE') |
struct<collation(collate(Spark SQL)):string> |
+| org.apache.spark.sql.catalyst.expressions.CollateExpressionBuilder | collate
| SELECT COLLATION('Spark SQL' collate UCS_BASIC_LCASE) |
struct<collation(collate(Spark SQL)):string> |
| org.apache.spark.sql.catalyst.expressions.Collation | collation | SELECT
collation('Spark SQL') | struct<collation(Spark SQL):string> |
| org.apache.spark.sql.catalyst.expressions.Concat | concat | SELECT
concat('Spark', 'SQL') | struct<concat(Spark, SQL):string> |
| org.apache.spark.sql.catalyst.expressions.ConcatWs | concat_ws | SELECT
concat_ws(' ', 'Spark', 'SQL') | struct<concat_ws( , Spark, SQL):string> |
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
index 76f20e7f8dcb..794185c5cba4 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
@@ -1,6 +1,6 @@
-- Automatically generated by SQLQueryTestSuite
-- !query
-create table t1(ucs_basic string collate 'ucs_basic', ucs_basic_lcase string
collate 'ucs_basic_lcase') using parquet
+create table t1(ucs_basic string collate ucs_basic, ucs_basic_lcase string
collate ucs_basic_lcase) using parquet
-- !query analysis
CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`, false
@@ -9,7 +9,7 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`,
false
insert into t1 values('aaa', 'aaa')
-- !query analysis
InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in
comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`,
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included
in comparison]/{warehouse_dir}/t1), [ucs_basic, ucs_basic_lcase]
-+- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string
COLLATE 'UCS_BASIC_LCASE') AS ucs_basic_lcase#x]
++- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string
COLLATE UCS_BASIC_LCASE) AS ucs_basic_lcase#x]
+- LocalRelation [col1#x, col2#x]
@@ -17,7 +17,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_d
insert into t1 values('AAA', 'AAA')
-- !query analysis
InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in
comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`,
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included
in comparison]/{warehouse_dir}/t1), [ucs_basic, ucs_basic_lcase]
-+- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string
COLLATE 'UCS_BASIC_LCASE') AS ucs_basic_lcase#x]
++- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string
COLLATE UCS_BASIC_LCASE) AS ucs_basic_lcase#x]
+- LocalRelation [col1#x, col2#x]
@@ -25,7 +25,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_d
insert into t1 values('bbb', 'bbb')
-- !query analysis
InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in
comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`,
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included
in comparison]/{warehouse_dir}/t1), [ucs_basic, ucs_basic_lcase]
-+- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string
COLLATE 'UCS_BASIC_LCASE') AS ucs_basic_lcase#x]
++- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string
COLLATE UCS_BASIC_LCASE) AS ucs_basic_lcase#x]
+- LocalRelation [col1#x, col2#x]
@@ -33,7 +33,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_d
insert into t1 values('BBB', 'BBB')
-- !query analysis
InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in
comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`,
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included
in comparison]/{warehouse_dir}/t1), [ucs_basic, ucs_basic_lcase]
-+- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string
COLLATE 'UCS_BASIC_LCASE') AS ucs_basic_lcase#x]
++- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string
COLLATE UCS_BASIC_LCASE) AS ucs_basic_lcase#x]
+- LocalRelation [col1#x, col2#x]
@@ -63,7 +63,7 @@ Project [ucs_basic#x, ucs_basic_lcase#x]
-- !query
-select * from t1 where ucs_basic_lcase = 'aaa' collate 'ucs_basic_lcase'
+select * from t1 where ucs_basic_lcase = 'aaa' collate ucs_basic_lcase
-- !query analysis
Project [ucs_basic#x, ucs_basic_lcase#x]
+- Filter (ucs_basic_lcase#x = collate(aaa, ucs_basic_lcase))
@@ -81,7 +81,7 @@ Project [ucs_basic#x, ucs_basic_lcase#x]
-- !query
-select * from t1 where ucs_basic_lcase < 'bbb' collate 'ucs_basic_lcase'
+select * from t1 where ucs_basic_lcase < 'bbb' collate ucs_basic_lcase
-- !query analysis
Project [ucs_basic#x, ucs_basic_lcase#x]
+- Filter (ucs_basic_lcase#x < collate(bbb, ucs_basic_lcase))
@@ -103,7 +103,7 @@ Project [ucs_basic#x, ucs_basic_lcase#x]
-- !query
-create table t2(ucs_basic string collate 'ucs_basic', ucs_basic_lcase string
collate 'ucs_basic_lcase') using parquet
+create table t2(ucs_basic string collate ucs_basic, ucs_basic_lcase string
collate ucs_basic_lcase) using parquet
-- !query analysis
CreateDataSourceTableCommand `spark_catalog`.`default`.`t2`, false
@@ -112,7 +112,7 @@ CreateDataSourceTableCommand
`spark_catalog`.`default`.`t2`, false
insert into t2 values('aaa', 'aaa')
-- !query analysis
InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_dir}/t2, false, Parquet, [path=file:[not included in
comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`,
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included
in comparison]/{warehouse_dir}/t2), [ucs_basic, ucs_basic_lcase]
-+- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string
COLLATE 'UCS_BASIC_LCASE') AS ucs_basic_lcase#x]
++- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string
COLLATE UCS_BASIC_LCASE) AS ucs_basic_lcase#x]
+- LocalRelation [col1#x, col2#x]
@@ -120,7 +120,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_d
insert into t2 values('bbb', 'bbb')
-- !query analysis
InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_dir}/t2, false, Parquet, [path=file:[not included in
comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`,
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included
in comparison]/{warehouse_dir}/t2), [ucs_basic, ucs_basic_lcase]
-+- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string
COLLATE 'UCS_BASIC_LCASE') AS ucs_basic_lcase#x]
++- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string
COLLATE UCS_BASIC_LCASE) AS ucs_basic_lcase#x]
+- LocalRelation [col1#x, col2#x]
@@ -150,7 +150,7 @@ DropTable false, false
-- !query
-create table t1 (c1 struct<ucs_basic: string collate 'ucs_basic',
ucs_basic_lcase: string collate 'ucs_basic_lcase'>) USING PARQUET
+create table t1 (c1 struct<ucs_basic: string collate ucs_basic,
ucs_basic_lcase: string collate ucs_basic_lcase>) USING PARQUET
-- !query analysis
CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`, false
@@ -159,7 +159,7 @@ CreateDataSourceTableCommand
`spark_catalog`.`default`.`t1`, false
INSERT INTO t1 VALUES (named_struct('ucs_basic', 'aaa', 'ucs_basic_lcase',
'aaa'))
-- !query analysis
InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in
comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`,
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included
in comparison]/{warehouse_dir}/t1), [c1]
-+- Project [named_struct(ucs_basic, col1#x.ucs_basic, ucs_basic_lcase,
cast(col1#x.ucs_basic_lcase as string COLLATE 'UCS_BASIC_LCASE')) AS c1#x]
++- Project [named_struct(ucs_basic, col1#x.ucs_basic, ucs_basic_lcase,
cast(col1#x.ucs_basic_lcase as string COLLATE UCS_BASIC_LCASE)) AS c1#x]
+- LocalRelation [col1#x]
@@ -167,7 +167,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_d
INSERT INTO t1 VALUES (named_struct('ucs_basic', 'AAA', 'ucs_basic_lcase',
'AAA'))
-- !query analysis
InsertIntoHadoopFsRelationCommand file:[not included in
comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in
comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`,
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included
in comparison]/{warehouse_dir}/t1), [c1]
-+- Project [named_struct(ucs_basic, col1#x.ucs_basic, ucs_basic_lcase,
cast(col1#x.ucs_basic_lcase as string COLLATE 'UCS_BASIC_LCASE')) AS c1#x]
++- Project [named_struct(ucs_basic, col1#x.ucs_basic, ucs_basic_lcase,
cast(col1#x.ucs_basic_lcase as string COLLATE UCS_BASIC_LCASE)) AS c1#x]
+- LocalRelation [col1#x]
diff --git a/sql/core/src/test/resources/sql-tests/inputs/collations.sql
b/sql/core/src/test/resources/sql-tests/inputs/collations.sql
index 96e82c695069..91a159b39aee 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/collations.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/collations.sql
@@ -1,7 +1,7 @@
-- test cases for collation support
-- Create a test table with data
-create table t1(ucs_basic string collate 'ucs_basic', ucs_basic_lcase string
collate 'ucs_basic_lcase') using parquet;
+create table t1(ucs_basic string collate ucs_basic, ucs_basic_lcase string
collate ucs_basic_lcase) using parquet;
insert into t1 values('aaa', 'aaa');
insert into t1 values('AAA', 'AAA');
insert into t1 values('bbb', 'bbb');
@@ -17,19 +17,19 @@ select count(*) from t1 group by ucs_basic_lcase;
select * from t1 where ucs_basic = 'aaa';
-- filter equal ucs_basic_lcase
-select * from t1 where ucs_basic_lcase = 'aaa' collate 'ucs_basic_lcase';
+select * from t1 where ucs_basic_lcase = 'aaa' collate ucs_basic_lcase;
-- filter less then ucs_basic
select * from t1 where ucs_basic < 'bbb';
-- filter less then ucs_basic_lcase
-select * from t1 where ucs_basic_lcase < 'bbb' collate 'ucs_basic_lcase';
+select * from t1 where ucs_basic_lcase < 'bbb' collate ucs_basic_lcase;
-- inner join
select l.ucs_basic, r.ucs_basic_lcase from t1 l join t1 r on l.ucs_basic_lcase
= r.ucs_basic_lcase;
-- create second table for anti-join
-create table t2(ucs_basic string collate 'ucs_basic', ucs_basic_lcase string
collate 'ucs_basic_lcase') using parquet;
+create table t2(ucs_basic string collate ucs_basic, ucs_basic_lcase string
collate ucs_basic_lcase) using parquet;
insert into t2 values('aaa', 'aaa');
insert into t2 values('bbb', 'bbb');
@@ -40,7 +40,7 @@ drop table t2;
drop table t1;
-- create table with struct field
-create table t1 (c1 struct<ucs_basic: string collate 'ucs_basic',
ucs_basic_lcase: string collate 'ucs_basic_lcase'>) USING PARQUET;
+create table t1 (c1 struct<ucs_basic: string collate ucs_basic,
ucs_basic_lcase: string collate ucs_basic_lcase>) USING PARQUET;
INSERT INTO t1 VALUES (named_struct('ucs_basic', 'aaa', 'ucs_basic_lcase',
'aaa'));
INSERT INTO t1 VALUES (named_struct('ucs_basic', 'AAA', 'ucs_basic_lcase',
'AAA'));
diff --git a/sql/core/src/test/resources/sql-tests/results/collations.sql.out
b/sql/core/src/test/resources/sql-tests/results/collations.sql.out
index 49d6b94cae96..7814ae59173d 100644
--- a/sql/core/src/test/resources/sql-tests/results/collations.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/collations.sql.out
@@ -1,6 +1,6 @@
-- Automatically generated by SQLQueryTestSuite
-- !query
-create table t1(ucs_basic string collate 'ucs_basic', ucs_basic_lcase string
collate 'ucs_basic_lcase') using parquet
+create table t1(ucs_basic string collate ucs_basic, ucs_basic_lcase string
collate ucs_basic_lcase) using parquet
-- !query schema
struct<>
-- !query output
@@ -62,15 +62,15 @@ struct<count(1):bigint>
-- !query
select * from t1 where ucs_basic = 'aaa'
-- !query schema
-struct<ucs_basic:string,ucs_basic_lcase:string COLLATE 'UCS_BASIC_LCASE'>
+struct<ucs_basic:string,ucs_basic_lcase:string COLLATE UCS_BASIC_LCASE>
-- !query output
aaa aaa
-- !query
-select * from t1 where ucs_basic_lcase = 'aaa' collate 'ucs_basic_lcase'
+select * from t1 where ucs_basic_lcase = 'aaa' collate ucs_basic_lcase
-- !query schema
-struct<ucs_basic:string,ucs_basic_lcase:string COLLATE 'UCS_BASIC_LCASE'>
+struct<ucs_basic:string,ucs_basic_lcase:string COLLATE UCS_BASIC_LCASE>
-- !query output
AAA AAA
aaa aaa
@@ -79,7 +79,7 @@ aaa aaa
-- !query
select * from t1 where ucs_basic < 'bbb'
-- !query schema
-struct<ucs_basic:string,ucs_basic_lcase:string COLLATE 'UCS_BASIC_LCASE'>
+struct<ucs_basic:string,ucs_basic_lcase:string COLLATE UCS_BASIC_LCASE>
-- !query output
AAA AAA
BBB BBB
@@ -87,9 +87,9 @@ aaa aaa
-- !query
-select * from t1 where ucs_basic_lcase < 'bbb' collate 'ucs_basic_lcase'
+select * from t1 where ucs_basic_lcase < 'bbb' collate ucs_basic_lcase
-- !query schema
-struct<ucs_basic:string,ucs_basic_lcase:string COLLATE 'UCS_BASIC_LCASE'>
+struct<ucs_basic:string,ucs_basic_lcase:string COLLATE UCS_BASIC_LCASE>
-- !query output
AAA AAA
aaa aaa
@@ -98,7 +98,7 @@ aaa aaa
-- !query
select l.ucs_basic, r.ucs_basic_lcase from t1 l join t1 r on l.ucs_basic_lcase
= r.ucs_basic_lcase
-- !query schema
-struct<ucs_basic:string,ucs_basic_lcase:string COLLATE 'UCS_BASIC_LCASE'>
+struct<ucs_basic:string,ucs_basic_lcase:string COLLATE UCS_BASIC_LCASE>
-- !query output
AAA AAA
AAA aaa
@@ -111,7 +111,7 @@ bbb bbb
-- !query
-create table t2(ucs_basic string collate 'ucs_basic', ucs_basic_lcase string
collate 'ucs_basic_lcase') using parquet
+create table t2(ucs_basic string collate ucs_basic, ucs_basic_lcase string
collate ucs_basic_lcase) using parquet
-- !query schema
struct<>
-- !query output
@@ -137,7 +137,7 @@ struct<>
-- !query
select * from t1 anti join t2 on t1.ucs_basic_lcase = t2.ucs_basic_lcase
-- !query schema
-struct<ucs_basic:string,ucs_basic_lcase:string COLLATE 'UCS_BASIC_LCASE'>
+struct<ucs_basic:string,ucs_basic_lcase:string COLLATE UCS_BASIC_LCASE>
-- !query output
@@ -159,7 +159,7 @@ struct<>
-- !query
-create table t1 (c1 struct<ucs_basic: string collate 'ucs_basic',
ucs_basic_lcase: string collate 'ucs_basic_lcase'>) USING PARQUET
+create table t1 (c1 struct<ucs_basic: string collate ucs_basic,
ucs_basic_lcase: string collate ucs_basic_lcase>) USING PARQUET
-- !query schema
struct<>
-- !query output
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
index 3632c06bfe7c..6f183e60f589 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
@@ -37,18 +37,18 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
test("collate returns proper type") {
Seq("ucs_basic", "ucs_basic_lcase", "unicode", "unicode_ci").foreach {
collationName =>
- checkAnswer(sql(s"select 'aaa' collate '$collationName'"), Row("aaa"))
+ checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa"))
val collationId = CollationFactory.collationNameToId(collationName)
- assert(sql(s"select 'aaa' collate '$collationName'").schema(0).dataType
+ assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType
== StringType(collationId))
}
}
test("collation name is case insensitive") {
Seq("uCs_BasIc", "uCs_baSic_Lcase", "uNicOde", "UNICODE_ci").foreach {
collationName =>
- checkAnswer(sql(s"select 'aaa' collate '$collationName'"), Row("aaa"))
+ checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa"))
val collationId = CollationFactory.collationNameToId(collationName)
- assert(sql(s"select 'aaa' collate '$collationName'").schema(0).dataType
+ assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType
== StringType(collationId))
}
}
@@ -56,7 +56,7 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
test("collation expression returns name of collation") {
Seq("ucs_basic", "ucs_basic_lcase", "unicode", "unicode_ci").foreach {
collationName =>
checkAnswer(
- sql(s"select collation('aaa' collate '$collationName')"),
Row(collationName.toUpperCase()))
+ sql(s"select collation('aaa' collate $collationName)"),
Row(collationName.toUpperCase()))
}
}
@@ -132,7 +132,7 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
test("invalid collation name throws exception") {
checkError(
- exception = intercept[SparkException] { sql("select 'aaa' collate
'UCS_BASIS'") },
+ exception = intercept[SparkException] { sql("select 'aaa' collate
UCS_BASIS") },
errorClass = "COLLATION_INVALID_NAME",
sqlState = "42704",
parameters = Map("proposal" -> "UCS_BASIC", "collationName" ->
"UCS_BASIS"))
@@ -153,7 +153,7 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
).foreach {
case (collationName, left, right, expected) =>
checkAnswer(
- sql(s"select '$left' collate '$collationName' = '$right' collate
'$collationName'"),
+ sql(s"select '$left' collate $collationName = '$right' collate
$collationName"),
Row(expected))
checkAnswer(
sql(s"select collate('$left', '$collationName') = collate('$right',
'$collationName')"),
@@ -178,7 +178,7 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
).foreach {
case (collationName, left, right, expected) =>
checkAnswer(
- sql(s"select '$left' collate '$collationName' < '$right' collate
'$collationName'"),
+ sql(s"select '$left' collate $collationName < '$right' collate
$collationName"),
Row(expected))
checkAnswer(
sql(s"select collate('$left', '$collationName') < collate('$right',
'$collationName')"),
@@ -420,9 +420,9 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
val tableNameBinary = "T_BINARY"
withTable(tableNameNonBinary) {
withTable(tableNameBinary) {
- sql(s"CREATE TABLE $tableNameNonBinary (c STRING COLLATE
'UCS_BASIC_LCASE') USING PARQUET")
+ sql(s"CREATE TABLE $tableNameNonBinary (c STRING COLLATE
UCS_BASIC_LCASE) USING PARQUET")
sql(s"INSERT INTO $tableNameNonBinary VALUES ('aaa')")
- sql(s"CREATE TABLE $tableNameBinary (c STRING COLLATE 'UCS_BASIC')
USING PARQUET")
+ sql(s"CREATE TABLE $tableNameBinary (c STRING COLLATE UCS_BASIC) USING
PARQUET")
sql(s"INSERT INTO $tableNameBinary VALUES ('aaa')")
val dfNonBinary = sql(s"SELECT COUNT(*), c FROM $tableNameNonBinary
GROUP BY c")
@@ -438,6 +438,16 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
}
}
+ test("text writing to parquet with collation enclosed with backticks") {
+ withTempPath{ path =>
+ sql(s"select 'a' COLLATE `UNICODE`").write.parquet(path.getAbsolutePath)
+
+ checkAnswer(
+ spark.read.parquet(path.getAbsolutePath),
+ Row("a"))
+ }
+ }
+
test("create table with collation") {
val tableName = "parquet_dummy_tbl"
val collationName = "UCS_BASIC_LCASE"
@@ -446,7 +456,7 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
withTable(tableName) {
sql(
s"""
- |CREATE TABLE $tableName (c1 STRING COLLATE '$collationName')
+ |CREATE TABLE $tableName (c1 STRING COLLATE $collationName)
|USING PARQUET
|""".stripMargin)
@@ -467,7 +477,7 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
sql(
s"""
|CREATE TABLE $tableName
- |(c1 STRUCT<name: STRING COLLATE '$collationName', age: INT>)
+ |(c1 STRUCT<name: STRING COLLATE $collationName, age: INT>)
|USING PARQUET
|""".stripMargin)
@@ -502,7 +512,7 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
sql(
s"""
|ALTER TABLE $tableName
- |ADD COLUMN c2 STRING COLLATE '$collationName'
+ |ADD COLUMN c2 STRING COLLATE $collationName
|""".stripMargin)
sql(s"INSERT INTO $tableName VALUES ('aaa', 'aaa')")
@@ -522,7 +532,7 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
withTable(tableName) {
sql(
s"""
- |CREATE TABLE $tableName (c1 string COLLATE '$collationName')
+ |CREATE TABLE $tableName (c1 string COLLATE $collationName)
|USING $v2Source
|""".stripMargin)
@@ -552,7 +562,7 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
sql(
s"""
|CREATE TABLE $tableName
- |(id INT, c1 STRING COLLATE 'UNICODE', c2 string)
+ |(id INT, c1 STRING COLLATE UNICODE, c2 string)
|USING parquet
|PARTITIONED BY (${partitionColumns.mkString(",")})
|""".stripMargin)
@@ -570,7 +580,7 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
createTable(partitionColumns: _*)
},
errorClass = "INVALID_PARTITION_COLUMN_DATA_TYPE",
- parameters = Map("type" -> "\"STRING COLLATE 'UNICODE'\"")
+ parameters = Map("type" -> "\"STRING COLLATE UNICODE\"")
);
}
}
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala
index 72250f9a7fdc..c9198c86c720 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala
@@ -955,9 +955,9 @@ class QueryCompilationErrorsSuite
test("SPARK-47102: the collation feature is off without collate builder
call") {
withSQLConf(SQLConf.COLLATION_ENABLED.key -> "false") {
Seq(
- "CREATE TABLE t(col STRING COLLATE 'UNICODE_CI') USING parquet",
- "CREATE TABLE t(col STRING COLLATE 'UNKNOWN_COLLATION_STRING') USING
parquet",
- "SELECT 'aaa' COLLATE 'UNICODE_CI'",
+ "CREATE TABLE t(col STRING COLLATE UNICODE_CI) USING parquet",
+ "CREATE TABLE t(col STRING COLLATE UNKNOWN_COLLATION_STRING) USING
parquet",
+ "SELECT 'aaa' COLLATE UNICODE_CI",
"select collation('aaa')"
).foreach { sqlText =>
checkError(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]