This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 0a94e7c7b9a8 [SPARK-47302][SQL] Collate keyword as identifier 0a94e7c7b9a8 is described below commit 0a94e7c7b9a83ecfbca59f5b93532453f462500c Author: Aleksandar Tomic <aleksandar.to...@databricks.com> AuthorDate: Fri Mar 8 13:33:21 2024 +0300 [SPARK-47302][SQL] Collate keyword as identifier ### What changes were proposed in this pull request? With this change we move away from using collation names as string literals and start treating them as identifiers, since that is part of the SQL standard. Collation names are marked as multi-part identifiers, since, in the future, we will want to introduce user-defined collations which can be part of nested namespaces in the catalog. ### Why are the changes needed? Aligning with the SQL standard on collation syntax. ### Does this PR introduce _any_ user-facing change? Yes. Collations are still not a released feature. ### How was this patch tested? Existing tests are used. ### Was this patch authored or co-authored using generative AI tooling? Closes #45405 from dbatomic/collate_key_word_as_identifier. 
Lead-authored-by: Aleksandar Tomic <aleksandar.to...@databricks.com> Co-authored-by: Nikola Mandic <nikola.man...@databricks.com> Co-authored-by: Stefan Kandic <stefan.kan...@databricks.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../queries/select_collated_string.json | 2 +- .../queries/select_collated_string.proto.bin | Bin 65 -> 63 bytes .../connect/planner/SparkConnectProtoSuite.scala | 4 +- python/pyspark/sql/tests/test_types.py | 18 ++++----- python/pyspark/sql/types.py | 4 +- .../spark/sql/catalyst/parser/SqlBaseParser.g4 | 2 +- .../sql/catalyst/parser/DataTypeAstBuilder.scala | 2 +- .../org/apache/spark/sql/types/DataType.scala | 2 +- .../org/apache/spark/sql/types/StringType.scala | 2 +- .../expressions/collationExpressions.scala | 2 +- .../spark/sql/catalyst/parser/AstBuilder.scala | 2 +- .../sql-functions/sql-expression-schema.md | 2 +- .../sql-tests/analyzer-results/collations.sql.out | 26 ++++++------- .../test/resources/sql-tests/inputs/collations.sql | 10 ++--- .../resources/sql-tests/results/collations.sql.out | 22 +++++------ .../org/apache/spark/sql/CollationSuite.scala | 42 +++++++++++++-------- .../sql/errors/QueryCompilationErrorsSuite.scala | 6 +-- 17 files changed, 79 insertions(+), 69 deletions(-) diff --git a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json index db065b36e345..00644e072190 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json @@ -8,7 +8,7 @@ "planId": "0" }, "localRelation": { - "schema": "struct\u003cs:string COLLATE \u0027UCS_BASIC_LCASE\u0027\u003e" + "schema": "struct\u003cs:string COLLATE UCS_BASIC_LCASE\u003e" } }, "expressions": [{ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin index 3a5661e54ce0..b247463daf43 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin differ diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala index bd52a16d5b22..1b50936d935a 100644 --- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala @@ -1047,8 +1047,8 @@ class SparkConnectProtoSuite extends PlanTest with SparkConnectPlanTest { test("SPARK-47144: Collated string") { Seq("UCS_BASIC", "UCS_BASIC_LCASE", "UNICODE", "UNICODE_CI").map(collationName => Seq( - s"select 'abc' collate '$collationName'", - s"select collation('abc' collate '$collationName')").map(query => + s"select 'abc' collate $collationName", + s"select collation('abc' collate $collationName)").map(query => comparePlans(connect.sql(query), spark.sql(query)))) } diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index e0f81dc08743..a0dfdce1a96e 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -862,15 +862,15 @@ class TypesTestsMixin: if k != "varchar" and k != "char": self.assertEqual(t(), _parse_datatype_string(k)) self.assertEqual(IntegerType(), _parse_datatype_string("int")) - self.assertEqual(StringType(), _parse_datatype_string("string COLLATE 'UCS_BASIC'")) + self.assertEqual(StringType(), 
_parse_datatype_string("string COLLATE UCS_BASIC")) self.assertEqual(StringType(0), _parse_datatype_string("string")) - self.assertEqual(StringType(0), _parse_datatype_string("string COLLATE 'UCS_BASIC'")) - self.assertEqual(StringType(0), _parse_datatype_string("string COLLATE 'UCS_BASIC'")) - self.assertEqual(StringType(0), _parse_datatype_string("string COLLATE'UCS_BASIC'")) - self.assertEqual(StringType(1), _parse_datatype_string("string COLLATE 'UCS_BASIC_LCASE'")) - self.assertEqual(StringType(1), _parse_datatype_string("string COLLATE 'UCS_BASIC_LCASE'")) - self.assertEqual(StringType(2), _parse_datatype_string("string COLLATE 'UNICODE'")) - self.assertEqual(StringType(3), _parse_datatype_string("string COLLATE 'UNICODE_CI'")) + self.assertEqual(StringType(0), _parse_datatype_string("string COLLATE UCS_BASIC")) + self.assertEqual(StringType(0), _parse_datatype_string("string COLLATE UCS_BASIC")) + self.assertEqual(StringType(1), _parse_datatype_string("string COLLATE UCS_BASIC_LCASE")) + self.assertEqual(StringType(2), _parse_datatype_string("string COLLATE UNICODE")) + self.assertEqual(StringType(2), _parse_datatype_string("string COLLATE `UNICODE`")) + self.assertEqual(StringType(3), _parse_datatype_string("string COLLATE UNICODE_CI")) + self.assertEqual(StringType(3), _parse_datatype_string("string COLLATE `UNICODE_CI`")) self.assertEqual(CharType(1), _parse_datatype_string("char(1)")) self.assertEqual(CharType(10), _parse_datatype_string("char( 10 )")) self.assertEqual(CharType(11), _parse_datatype_string("char( 11)")) @@ -1410,7 +1410,7 @@ class TypesTestsMixin: def test_collated_string(self): dfs = [ - self.spark.sql("SELECT 'abc' collate 'UCS_BASIC_LCASE'"), + self.spark.sql("SELECT 'abc' collate UCS_BASIC_LCASE"), self.spark.createDataFrame([], StructType([StructField("id", StringType(1))])), ] for df in dfs: diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index c4647b90767f..a30f41ae4023 100644 --- 
a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -261,7 +261,7 @@ class StringType(AtomicType): def collationIdToName(self) -> str: return ( - " COLLATE '%s'" % StringType.collationNames[self.collationId] + " COLLATE %s" % StringType.collationNames[self.collationId] if self.collationId != 0 else "" ) @@ -1486,7 +1486,7 @@ _all_complex_types: Dict[str, Type[Union[ArrayType, MapType, StructType]]] = dic (v.typeName(), v) for v in _complex_types ) -_COLLATED_STRING = re.compile(r"string\s+COLLATE\s+'([\w_]+)'") +_COLLATED_STRING = re.compile(r"string\s+COLLATE\s+([\w_]+|`[\w_]`)") _LENGTH_CHAR = re.compile(r"char\(\s*(\d+)\s*\)") _LENGTH_VARCHAR = re.compile(r"varchar\(\s*(\d+)\s*\)") _FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(-?\d+)\s*\)") diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index ca01de4ffdc2..801cc62491a2 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -1096,7 +1096,7 @@ colPosition ; collateClause - : COLLATE collationName=stringLit + : COLLATE collationName=identifier ; type diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala index 0d2822e13efc..5c2df6a6e9d9 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeAstBuilder.scala @@ -218,6 +218,6 @@ class DataTypeAstBuilder extends SqlBaseParserBaseVisitor[AnyRef] { * Returns a collation name. 
*/ override def visitCollateClause(ctx: CollateClauseContext): String = withOrigin(ctx) { - string(visitStringLit(ctx.stringLit)) + ctx.identifier.getText } } diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala index efaf6e6bfd6a..2ffe0ba379dd 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -117,7 +117,7 @@ object DataType { private val FIXED_DECIMAL = """decimal\(\s*(\d+)\s*,\s*(\-?\d+)\s*\)""".r private val CHAR_TYPE = """char\(\s*(\d+)\s*\)""".r private val VARCHAR_TYPE = """varchar\(\s*(\d+)\s*\)""".r - private val COLLATED_STRING_TYPE = """string\s+COLLATE\s+'([\w_]+)'""".r + private val COLLATED_STRING_TYPE = """string\s+COLLATE\s+([\w_]+|`[\w_]`)""".r def fromDDL(ddl: String): DataType = { parseTypeWithFallback( diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala index 313f525742ae..b434c98edd2a 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala @@ -47,7 +47,7 @@ class StringType private(val collationId: Int) extends AtomicType with Serializa */ override def typeName: String = if (isDefaultCollation) "string" - else s"string COLLATE '${CollationFactory.fetchCollation(collationId).collationName}'" + else s"string COLLATE ${CollationFactory.fetchCollation(collationId).collationName}" override def equals(obj: Any): Boolean = obj.isInstanceOf[StringType] && obj.asInstanceOf[StringType].collationId == collationId diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala index e51d9a67b166..99237d978dee 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala @@ -37,7 +37,7 @@ import org.apache.spark.sql.types._ Examples: > SET spark.sql.collation.enabled=true; spark.sql.collation.enabled true - > SELECT COLLATION('Spark SQL' _FUNC_ 'UCS_BASIC_LCASE'); + > SELECT COLLATION('Spark SQL' _FUNC_ UCS_BASIC_LCASE); UCS_BASIC_LCASE > SET spark.sql.collation.enabled=false; spark.sql.collation.enabled false diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 48d8b8ad8f4d..9fed85b2a2a7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -2194,7 +2194,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { if (!SQLConf.get.collationEnabled) { throw QueryCompilationErrors.collationNotEnabledError() } - string(visitStringLit(ctx.stringLit)) + ctx.identifier.getText } /** diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index e20db3b49589..33c26fe3c036 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -81,7 +81,7 @@ | org.apache.spark.sql.catalyst.expressions.Chr | char | SELECT char(65) | struct<char(65):string> | | org.apache.spark.sql.catalyst.expressions.Chr | chr | SELECT chr(65) | struct<chr(65):string> | | org.apache.spark.sql.catalyst.expressions.Coalesce | coalesce | SELECT coalesce(NULL, 1, NULL) | struct<coalesce(NULL, 1, NULL):int> | -| org.apache.spark.sql.catalyst.expressions.CollateExpressionBuilder | collate | SELECT 
COLLATION('Spark SQL' collate 'UCS_BASIC_LCASE') | struct<collation(collate(Spark SQL)):string> | +| org.apache.spark.sql.catalyst.expressions.CollateExpressionBuilder | collate | SELECT COLLATION('Spark SQL' collate UCS_BASIC_LCASE) | struct<collation(collate(Spark SQL)):string> | | org.apache.spark.sql.catalyst.expressions.Collation | collation | SELECT collation('Spark SQL') | struct<collation(Spark SQL):string> | | org.apache.spark.sql.catalyst.expressions.Concat | concat | SELECT concat('Spark', 'SQL') | struct<concat(Spark, SQL):string> | | org.apache.spark.sql.catalyst.expressions.ConcatWs | concat_ws | SELECT concat_ws(' ', 'Spark', 'SQL') | struct<concat_ws( , Spark, SQL):string> | diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out index 76f20e7f8dcb..794185c5cba4 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out @@ -1,6 +1,6 @@ -- Automatically generated by SQLQueryTestSuite -- !query -create table t1(ucs_basic string collate 'ucs_basic', ucs_basic_lcase string collate 'ucs_basic_lcase') using parquet +create table t1(ucs_basic string collate ucs_basic, ucs_basic_lcase string collate ucs_basic_lcase) using parquet -- !query analysis CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`, false @@ -9,7 +9,7 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`, false insert into t1 values('aaa', 'aaa') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [ucs_basic, ucs_basic_lcase] -+- Project [cast(col1#x as string) AS ucs_basic#x, 
cast(col2#x as string COLLATE 'UCS_BASIC_LCASE') AS ucs_basic_lcase#x] ++- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string COLLATE UCS_BASIC_LCASE) AS ucs_basic_lcase#x] +- LocalRelation [col1#x, col2#x] @@ -17,7 +17,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d insert into t1 values('AAA', 'AAA') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [ucs_basic, ucs_basic_lcase] -+- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string COLLATE 'UCS_BASIC_LCASE') AS ucs_basic_lcase#x] ++- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string COLLATE UCS_BASIC_LCASE) AS ucs_basic_lcase#x] +- LocalRelation [col1#x, col2#x] @@ -25,7 +25,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d insert into t1 values('bbb', 'bbb') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [ucs_basic, ucs_basic_lcase] -+- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string COLLATE 'UCS_BASIC_LCASE') AS ucs_basic_lcase#x] ++- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string COLLATE UCS_BASIC_LCASE) AS ucs_basic_lcase#x] +- LocalRelation [col1#x, col2#x] @@ -33,7 +33,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d insert into t1 values('BBB', 'BBB') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [ucs_basic, ucs_basic_lcase] -+- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string COLLATE 'UCS_BASIC_LCASE') AS ucs_basic_lcase#x] ++- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string COLLATE UCS_BASIC_LCASE) AS ucs_basic_lcase#x] +- LocalRelation [col1#x, col2#x] @@ -63,7 +63,7 @@ Project [ucs_basic#x, ucs_basic_lcase#x] -- !query -select * from t1 where ucs_basic_lcase = 'aaa' collate 'ucs_basic_lcase' +select * from t1 where ucs_basic_lcase = 'aaa' collate ucs_basic_lcase -- !query analysis Project [ucs_basic#x, ucs_basic_lcase#x] +- Filter (ucs_basic_lcase#x = collate(aaa, ucs_basic_lcase)) @@ -81,7 +81,7 @@ Project [ucs_basic#x, ucs_basic_lcase#x] -- !query -select * from t1 where ucs_basic_lcase < 'bbb' collate 'ucs_basic_lcase' +select * from t1 where ucs_basic_lcase < 'bbb' collate ucs_basic_lcase -- !query analysis Project [ucs_basic#x, ucs_basic_lcase#x] +- Filter (ucs_basic_lcase#x < collate(bbb, ucs_basic_lcase)) @@ -103,7 +103,7 @@ Project [ucs_basic#x, ucs_basic_lcase#x] -- !query -create table t2(ucs_basic string collate 'ucs_basic', ucs_basic_lcase string collate 'ucs_basic_lcase') using parquet +create table t2(ucs_basic string collate ucs_basic, ucs_basic_lcase string collate ucs_basic_lcase) using parquet -- !query analysis CreateDataSourceTableCommand `spark_catalog`.`default`.`t2`, false @@ -112,7 +112,7 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`t2`, false insert into t2 values('aaa', 'aaa') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t2, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`, 
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t2), [ucs_basic, ucs_basic_lcase] -+- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string COLLATE 'UCS_BASIC_LCASE') AS ucs_basic_lcase#x] ++- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string COLLATE UCS_BASIC_LCASE) AS ucs_basic_lcase#x] +- LocalRelation [col1#x, col2#x] @@ -120,7 +120,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d insert into t2 values('bbb', 'bbb') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t2, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t2), [ucs_basic, ucs_basic_lcase] -+- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string COLLATE 'UCS_BASIC_LCASE') AS ucs_basic_lcase#x] ++- Project [cast(col1#x as string) AS ucs_basic#x, cast(col2#x as string COLLATE UCS_BASIC_LCASE) AS ucs_basic_lcase#x] +- LocalRelation [col1#x, col2#x] @@ -150,7 +150,7 @@ DropTable false, false -- !query -create table t1 (c1 struct<ucs_basic: string collate 'ucs_basic', ucs_basic_lcase: string collate 'ucs_basic_lcase'>) USING PARQUET +create table t1 (c1 struct<ucs_basic: string collate ucs_basic, ucs_basic_lcase: string collate ucs_basic_lcase>) USING PARQUET -- !query analysis CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`, false @@ -159,7 +159,7 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`t1`, false INSERT INTO t1 VALUES (named_struct('ucs_basic', 'aaa', 'ucs_basic_lcase', 'aaa')) -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, 
`spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [c1] -+- Project [named_struct(ucs_basic, col1#x.ucs_basic, ucs_basic_lcase, cast(col1#x.ucs_basic_lcase as string COLLATE 'UCS_BASIC_LCASE')) AS c1#x] ++- Project [named_struct(ucs_basic, col1#x.ucs_basic, ucs_basic_lcase, cast(col1#x.ucs_basic_lcase as string COLLATE UCS_BASIC_LCASE)) AS c1#x] +- LocalRelation [col1#x] @@ -167,7 +167,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d INSERT INTO t1 VALUES (named_struct('ucs_basic', 'AAA', 'ucs_basic_lcase', 'AAA')) -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t1], Append, `spark_catalog`.`default`.`t1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t1), [c1] -+- Project [named_struct(ucs_basic, col1#x.ucs_basic, ucs_basic_lcase, cast(col1#x.ucs_basic_lcase as string COLLATE 'UCS_BASIC_LCASE')) AS c1#x] ++- Project [named_struct(ucs_basic, col1#x.ucs_basic, ucs_basic_lcase, cast(col1#x.ucs_basic_lcase as string COLLATE UCS_BASIC_LCASE)) AS c1#x] +- LocalRelation [col1#x] diff --git a/sql/core/src/test/resources/sql-tests/inputs/collations.sql b/sql/core/src/test/resources/sql-tests/inputs/collations.sql index 96e82c695069..91a159b39aee 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/collations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/collations.sql @@ -1,7 +1,7 @@ -- test cases for collation support -- Create a test table with data -create table t1(ucs_basic string collate 'ucs_basic', ucs_basic_lcase string collate 'ucs_basic_lcase') using parquet; +create table t1(ucs_basic string collate ucs_basic, ucs_basic_lcase string collate ucs_basic_lcase) using parquet; insert into t1 values('aaa', 'aaa'); insert into t1 
values('AAA', 'AAA'); insert into t1 values('bbb', 'bbb'); @@ -17,19 +17,19 @@ select count(*) from t1 group by ucs_basic_lcase; select * from t1 where ucs_basic = 'aaa'; -- filter equal ucs_basic_lcase -select * from t1 where ucs_basic_lcase = 'aaa' collate 'ucs_basic_lcase'; +select * from t1 where ucs_basic_lcase = 'aaa' collate ucs_basic_lcase; -- filter less then ucs_basic select * from t1 where ucs_basic < 'bbb'; -- filter less then ucs_basic_lcase -select * from t1 where ucs_basic_lcase < 'bbb' collate 'ucs_basic_lcase'; +select * from t1 where ucs_basic_lcase < 'bbb' collate ucs_basic_lcase; -- inner join select l.ucs_basic, r.ucs_basic_lcase from t1 l join t1 r on l.ucs_basic_lcase = r.ucs_basic_lcase; -- create second table for anti-join -create table t2(ucs_basic string collate 'ucs_basic', ucs_basic_lcase string collate 'ucs_basic_lcase') using parquet; +create table t2(ucs_basic string collate ucs_basic, ucs_basic_lcase string collate ucs_basic_lcase) using parquet; insert into t2 values('aaa', 'aaa'); insert into t2 values('bbb', 'bbb'); @@ -40,7 +40,7 @@ drop table t2; drop table t1; -- create table with struct field -create table t1 (c1 struct<ucs_basic: string collate 'ucs_basic', ucs_basic_lcase: string collate 'ucs_basic_lcase'>) USING PARQUET; +create table t1 (c1 struct<ucs_basic: string collate ucs_basic, ucs_basic_lcase: string collate ucs_basic_lcase>) USING PARQUET; INSERT INTO t1 VALUES (named_struct('ucs_basic', 'aaa', 'ucs_basic_lcase', 'aaa')); INSERT INTO t1 VALUES (named_struct('ucs_basic', 'AAA', 'ucs_basic_lcase', 'AAA')); diff --git a/sql/core/src/test/resources/sql-tests/results/collations.sql.out b/sql/core/src/test/resources/sql-tests/results/collations.sql.out index 49d6b94cae96..7814ae59173d 100644 --- a/sql/core/src/test/resources/sql-tests/results/collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/collations.sql.out @@ -1,6 +1,6 @@ -- Automatically generated by SQLQueryTestSuite -- !query -create table 
t1(ucs_basic string collate 'ucs_basic', ucs_basic_lcase string collate 'ucs_basic_lcase') using parquet +create table t1(ucs_basic string collate ucs_basic, ucs_basic_lcase string collate ucs_basic_lcase) using parquet -- !query schema struct<> -- !query output @@ -62,15 +62,15 @@ struct<count(1):bigint> -- !query select * from t1 where ucs_basic = 'aaa' -- !query schema -struct<ucs_basic:string,ucs_basic_lcase:string COLLATE 'UCS_BASIC_LCASE'> +struct<ucs_basic:string,ucs_basic_lcase:string COLLATE UCS_BASIC_LCASE> -- !query output aaa aaa -- !query -select * from t1 where ucs_basic_lcase = 'aaa' collate 'ucs_basic_lcase' +select * from t1 where ucs_basic_lcase = 'aaa' collate ucs_basic_lcase -- !query schema -struct<ucs_basic:string,ucs_basic_lcase:string COLLATE 'UCS_BASIC_LCASE'> +struct<ucs_basic:string,ucs_basic_lcase:string COLLATE UCS_BASIC_LCASE> -- !query output AAA AAA aaa aaa @@ -79,7 +79,7 @@ aaa aaa -- !query select * from t1 where ucs_basic < 'bbb' -- !query schema -struct<ucs_basic:string,ucs_basic_lcase:string COLLATE 'UCS_BASIC_LCASE'> +struct<ucs_basic:string,ucs_basic_lcase:string COLLATE UCS_BASIC_LCASE> -- !query output AAA AAA BBB BBB @@ -87,9 +87,9 @@ aaa aaa -- !query -select * from t1 where ucs_basic_lcase < 'bbb' collate 'ucs_basic_lcase' +select * from t1 where ucs_basic_lcase < 'bbb' collate ucs_basic_lcase -- !query schema -struct<ucs_basic:string,ucs_basic_lcase:string COLLATE 'UCS_BASIC_LCASE'> +struct<ucs_basic:string,ucs_basic_lcase:string COLLATE UCS_BASIC_LCASE> -- !query output AAA AAA aaa aaa @@ -98,7 +98,7 @@ aaa aaa -- !query select l.ucs_basic, r.ucs_basic_lcase from t1 l join t1 r on l.ucs_basic_lcase = r.ucs_basic_lcase -- !query schema -struct<ucs_basic:string,ucs_basic_lcase:string COLLATE 'UCS_BASIC_LCASE'> +struct<ucs_basic:string,ucs_basic_lcase:string COLLATE UCS_BASIC_LCASE> -- !query output AAA AAA AAA aaa @@ -111,7 +111,7 @@ bbb bbb -- !query -create table t2(ucs_basic string collate 'ucs_basic', ucs_basic_lcase 
string collate 'ucs_basic_lcase') using parquet +create table t2(ucs_basic string collate ucs_basic, ucs_basic_lcase string collate ucs_basic_lcase) using parquet -- !query schema struct<> -- !query output @@ -137,7 +137,7 @@ struct<> -- !query select * from t1 anti join t2 on t1.ucs_basic_lcase = t2.ucs_basic_lcase -- !query schema -struct<ucs_basic:string,ucs_basic_lcase:string COLLATE 'UCS_BASIC_LCASE'> +struct<ucs_basic:string,ucs_basic_lcase:string COLLATE UCS_BASIC_LCASE> -- !query output @@ -159,7 +159,7 @@ struct<> -- !query -create table t1 (c1 struct<ucs_basic: string collate 'ucs_basic', ucs_basic_lcase: string collate 'ucs_basic_lcase'>) USING PARQUET +create table t1 (c1 struct<ucs_basic: string collate ucs_basic, ucs_basic_lcase: string collate ucs_basic_lcase>) USING PARQUET -- !query schema struct<> -- !query output diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala index 3632c06bfe7c..6f183e60f589 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala @@ -37,18 +37,18 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { test("collate returns proper type") { Seq("ucs_basic", "ucs_basic_lcase", "unicode", "unicode_ci").foreach { collationName => - checkAnswer(sql(s"select 'aaa' collate '$collationName'"), Row("aaa")) + checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) val collationId = CollationFactory.collationNameToId(collationName) - assert(sql(s"select 'aaa' collate '$collationName'").schema(0).dataType + assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType == StringType(collationId)) } } test("collation name is case insensitive") { Seq("uCs_BasIc", "uCs_baSic_Lcase", "uNicOde", "UNICODE_ci").foreach { collationName => - checkAnswer(sql(s"select 'aaa' collate '$collationName'"), Row("aaa")) + 
checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) val collationId = CollationFactory.collationNameToId(collationName) - assert(sql(s"select 'aaa' collate '$collationName'").schema(0).dataType + assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType == StringType(collationId)) } } @@ -56,7 +56,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { test("collation expression returns name of collation") { Seq("ucs_basic", "ucs_basic_lcase", "unicode", "unicode_ci").foreach { collationName => checkAnswer( - sql(s"select collation('aaa' collate '$collationName')"), Row(collationName.toUpperCase())) + sql(s"select collation('aaa' collate $collationName)"), Row(collationName.toUpperCase())) } } @@ -132,7 +132,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { test("invalid collation name throws exception") { checkError( - exception = intercept[SparkException] { sql("select 'aaa' collate 'UCS_BASIS'") }, + exception = intercept[SparkException] { sql("select 'aaa' collate UCS_BASIS") }, errorClass = "COLLATION_INVALID_NAME", sqlState = "42704", parameters = Map("proposal" -> "UCS_BASIC", "collationName" -> "UCS_BASIS")) @@ -153,7 +153,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { ).foreach { case (collationName, left, right, expected) => checkAnswer( - sql(s"select '$left' collate '$collationName' = '$right' collate '$collationName'"), + sql(s"select '$left' collate $collationName = '$right' collate $collationName"), Row(expected)) checkAnswer( sql(s"select collate('$left', '$collationName') = collate('$right', '$collationName')"), @@ -178,7 +178,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { ).foreach { case (collationName, left, right, expected) => checkAnswer( - sql(s"select '$left' collate '$collationName' < '$right' collate '$collationName'"), + sql(s"select '$left' collate $collationName < 
'$right' collate $collationName"), Row(expected)) checkAnswer( sql(s"select collate('$left', '$collationName') < collate('$right', '$collationName')"), @@ -420,9 +420,9 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { val tableNameBinary = "T_BINARY" withTable(tableNameNonBinary) { withTable(tableNameBinary) { - sql(s"CREATE TABLE $tableNameNonBinary (c STRING COLLATE 'UCS_BASIC_LCASE') USING PARQUET") + sql(s"CREATE TABLE $tableNameNonBinary (c STRING COLLATE UCS_BASIC_LCASE) USING PARQUET") sql(s"INSERT INTO $tableNameNonBinary VALUES ('aaa')") - sql(s"CREATE TABLE $tableNameBinary (c STRING COLLATE 'UCS_BASIC') USING PARQUET") + sql(s"CREATE TABLE $tableNameBinary (c STRING COLLATE UCS_BASIC) USING PARQUET") sql(s"INSERT INTO $tableNameBinary VALUES ('aaa')") val dfNonBinary = sql(s"SELECT COUNT(*), c FROM $tableNameNonBinary GROUP BY c") @@ -438,6 +438,16 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { } } + test("text writing to parquet with collation enclosed with backticks") { + withTempPath{ path => + sql(s"select 'a' COLLATE `UNICODE`").write.parquet(path.getAbsolutePath) + + checkAnswer( + spark.read.parquet(path.getAbsolutePath), + Row("a")) + } + } + test("create table with collation") { val tableName = "parquet_dummy_tbl" val collationName = "UCS_BASIC_LCASE" @@ -446,7 +456,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { withTable(tableName) { sql( s""" - |CREATE TABLE $tableName (c1 STRING COLLATE '$collationName') + |CREATE TABLE $tableName (c1 STRING COLLATE $collationName) |USING PARQUET |""".stripMargin) @@ -467,7 +477,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { sql( s""" |CREATE TABLE $tableName - |(c1 STRUCT<name: STRING COLLATE '$collationName', age: INT>) + |(c1 STRUCT<name: STRING COLLATE $collationName, age: INT>) |USING PARQUET |""".stripMargin) @@ -502,7 +512,7 @@ class CollationSuite extends 
DatasourceV2SQLBase with AdaptiveSparkPlanHelper { sql( s""" |ALTER TABLE $tableName - |ADD COLUMN c2 STRING COLLATE '$collationName' + |ADD COLUMN c2 STRING COLLATE $collationName |""".stripMargin) sql(s"INSERT INTO $tableName VALUES ('aaa', 'aaa')") @@ -522,7 +532,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { withTable(tableName) { sql( s""" - |CREATE TABLE $tableName (c1 string COLLATE '$collationName') + |CREATE TABLE $tableName (c1 string COLLATE $collationName) |USING $v2Source |""".stripMargin) @@ -552,7 +562,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { sql( s""" |CREATE TABLE $tableName - |(id INT, c1 STRING COLLATE 'UNICODE', c2 string) + |(id INT, c1 STRING COLLATE UNICODE, c2 string) |USING parquet |PARTITIONED BY (${partitionColumns.mkString(",")}) |""".stripMargin) @@ -570,7 +580,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { createTable(partitionColumns: _*) }, errorClass = "INVALID_PARTITION_COLUMN_DATA_TYPE", - parameters = Map("type" -> "\"STRING COLLATE 'UNICODE'\"") + parameters = Map("type" -> "\"STRING COLLATE UNICODE\"") ); } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala index 72250f9a7fdc..c9198c86c720 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala @@ -955,9 +955,9 @@ class QueryCompilationErrorsSuite test("SPARK-47102: the collation feature is off without collate builder call") { withSQLConf(SQLConf.COLLATION_ENABLED.key -> "false") { Seq( - "CREATE TABLE t(col STRING COLLATE 'UNICODE_CI') USING parquet", - "CREATE TABLE t(col STRING COLLATE 'UNKNOWN_COLLATION_STRING') USING parquet", - "SELECT 'aaa' COLLATE 'UNICODE_CI'", + "CREATE TABLE t(col 
STRING COLLATE UNICODE_CI) USING parquet", + "CREATE TABLE t(col STRING COLLATE UNKNOWN_COLLATION_STRING) USING parquet", + "SELECT 'aaa' COLLATE UNICODE_CI", "select collation('aaa')" ).foreach { sqlText => checkError( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org