(spark) branch master updated: [SPARK-47102][SQL] Add the `COLLATION_ENABLED` config flag

maxgekk Tue, 05 Mar 2024 04:13:52 -0800

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 6534a3398ae9 [SPARK-47102][SQL] Add the `COLLATION_ENABLED` config flag
6534a3398ae9 is described below

commit 6534a3398ae9f4e14b02f4821bccde4b3671dc17
Author: Mihailo Milosevic <[email protected]>
AuthorDate: Tue Mar 5 15:11:26 2024 +0300

    [SPARK-47102][SQL] Add the `COLLATION_ENABLED` config flag
    
    ### What changes were proposed in this pull request?
    This PR adds the `COLLATION_ENABLED` config to `SQLConf` and introduces a new 
error class, `UNSUPPORTED_FEATURE.COLLATION`, to report an appropriate error when 
the collation feature, which is still under development, is used.
    
    Closes #45218
    
    ### Why are the changes needed?
    We want to gate collations behind this flag. These changes disable 
usage of the `collate` and `collation` functions, along with any `COLLATE` syntax, 
when the flag is set to false. By default, the flag is set to false outside of 
tests (its default value is `Utils.isTesting`).
    
    ### Does this PR introduce _any_ user-facing change?
    Yes. It introduces a new error along with an appropriate message.
    
    ### How was this patch tested?
    ```
    ./build/mvn -Dtest=none 
-DwildcardSuites=org.apache.spark.sql.errors.QueryCompilationErrorsSuite test
    ./build/mvn -Dtest=none 
-DwildcardSuites=org.apache.spark.sql.catalyst.expressions.CollationExpressionSuite
 test
    ./build/mvn -Dtest=none 
-DwildcardSuites=org.apache.spark.sql.CollationSuite test
    ```
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #45285 from mihailom-db/SPARK-47102.
    
    Authored-by: Mihailo Milosevic <[email protected]>
    Signed-off-by: Max Gekk <[email protected]>
---
 .../src/main/resources/error/error-classes.json    |  5 ++++
 ...r-conditions-unsupported-feature-error-class.md |  4 +++
 .../expressions/collationExpressions.scala         | 27 +++++++++++++++++--
 .../spark/sql/catalyst/parser/AstBuilder.scala     |  7 +++++
 .../spark/sql/errors/QueryCompilationErrors.scala  |  6 +++++
 .../org/apache/spark/sql/internal/SQLConf.scala    | 10 +++++++
 .../spark/sql/execution/datasources/rules.scala    | 21 ++++++++++++++-
 .../sql/internal/BaseSessionStateBuilder.scala     |  1 +
 .../sql/errors/QueryCompilationErrorsSuite.scala   | 31 ++++++++++++++++++++++
 .../spark/sql/hive/HiveSessionStateBuilder.scala   |  1 +
 10 files changed, 110 insertions(+), 3 deletions(-)

diff --git a/common/utils/src/main/resources/error/error-classes.json 
b/common/utils/src/main/resources/error/error-classes.json
index 7cf3e9c533ca..c61f348e63d8 100644
--- a/common/utils/src/main/resources/error/error-classes.json
+++ b/common/utils/src/main/resources/error/error-classes.json
@@ -3921,6 +3921,11 @@
           "Catalog <catalogName> does not support <operation>."
         ]
       },
+      "COLLATION" : {
+        "message" : [
+          "Collation is not yet supported."
+        ]
+      },
       "COMBINATION_QUERY_RESULT_CLAUSES" : {
         "message" : [
           "Combination of ORDER BY/SORT BY/DISTRIBUTE BY/CLUSTER BY."
diff --git a/docs/sql-error-conditions-unsupported-feature-error-class.md 
b/docs/sql-error-conditions-unsupported-feature-error-class.md
index 8d42ecdce790..e580ecc63b18 100644
--- a/docs/sql-error-conditions-unsupported-feature-error-class.md
+++ b/docs/sql-error-conditions-unsupported-feature-error-class.md
@@ -58,6 +58,10 @@ The ANALYZE TABLE command does not support views.
 
 Catalog `<catalogName>` does not support `<operation>`.
 
+## COLLATION
+
+Collation is not yet supported.
+
 ## COMBINATION_QUERY_RESULT_CLAUSES
 
 Combination of ORDER BY/SORT BY/DISTRIBUTE BY/CLUSTER BY.
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala
index a2faca95dfbc..e51d9a67b166 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala
@@ -22,8 +22,10 @@ import 
org.apache.spark.sql.catalyst.analysis.ExpressionBuilder
 import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.util.CollationFactory
 import org.apache.spark.sql.errors.QueryCompilationErrors
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 
+// scalastyle:off line.contains.tab
 @ExpressionDescription(
   usage = "_FUNC_(expr, collationName) - Marks a given expression with the 
specified collation.",
   arguments = """
@@ -33,13 +35,24 @@ import org.apache.spark.sql.types._
   """,
   examples = """
     Examples:
+      > SET spark.sql.collation.enabled=true;
+      spark.sql.collation.enabled      true
       > SELECT COLLATION('Spark SQL' _FUNC_ 'UCS_BASIC_LCASE');
-       UCS_BASIC_LCASE
+      UCS_BASIC_LCASE
+      > SET spark.sql.collation.enabled=false;
+      spark.sql.collation.enabled      false
   """,
   since = "4.0.0",
   group = "string_funcs")
+// scalastyle:on line.contains.tab
 object CollateExpressionBuilder extends ExpressionBuilder {
   override def build(funcName: String, expressions: Seq[Expression]): 
Expression = {
+    // We need to throw collationNotEnabledError before unexpectedNullError
+    // and nonFoldableArgumentError, as we do not want the user to see messages
+    // that misleadingly suggest collation is enabled
+    if (!SQLConf.get.collationEnabled) {
+      throw QueryCompilationErrors.collationNotEnabledError()
+    }
     expressions match {
       case Seq(e: Expression, collationExpr: Expression) =>
         (collationExpr.dataType, collationExpr.foldable) match {
@@ -80,15 +93,25 @@ case class Collate(child: Expression, collationName: String)
     defineCodeGen(ctx, ev, (in) => in)
 }
 
+// scalastyle:off line.contains.tab
 @ExpressionDescription(
   usage = "_FUNC_(expr) - Returns the collation name of a given expression.",
+  arguments = """
+    Arguments:
+      * expr - String expression to perform collation on.
+  """,
   examples = """
     Examples:
+      > SET spark.sql.collation.enabled=true;
+      spark.sql.collation.enabled      true
       > SELECT _FUNC_('Spark SQL');
-       UCS_BASIC
+      UCS_BASIC
+      > SET spark.sql.collation.enabled=false;
+      spark.sql.collation.enabled      false
   """,
   since = "4.0.0",
   group = "string_funcs")
+// scalastyle:on line.contains.tab
 case class Collation(child: Expression) extends UnaryExpression with 
RuntimeReplaceable {
   override def dataType: DataType = StringType
   override protected def withNewChildInternal(newChild: Expression): Collation 
= copy(newChild)
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index 096dbe840ead..48d8b8ad8f4d 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -2190,6 +2190,13 @@ class AstBuilder extends DataTypeAstBuilder with 
SQLConfHelper with Logging {
     Collate(expression(ctx.primaryExpression), collationName)
   }
 
+  override def visitCollateClause(ctx: CollateClauseContext): String = 
withOrigin(ctx) {
+    if (!SQLConf.get.collationEnabled) {
+      throw QueryCompilationErrors.collationNotEnabledError()
+    }
+    string(visitStringLit(ctx.stringLit))
+  }
+
   /**
    * Create a [[Cast]] expression.
    */
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
index 38f2228f3389..95a66f015729 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
@@ -297,6 +297,12 @@ private[sql] object QueryCompilationErrors extends 
QueryErrorsBase with Compilat
     )
   }
 
+  def collationNotEnabledError(): Throwable = {
+    new AnalysisException(
+      errorClass = "UNSUPPORTED_FEATURE.COLLATION",
+      messageParameters = Map.empty)
+  }
+
   def unresolvedUsingColForJoinError(
       colName: String, suggestion: String, side: String): Throwable = {
     new AnalysisException(
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 04b392d0c44f..b262cb29c02e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -753,6 +753,14 @@ object SQLConf {
       .checkValue(_ > 0, "The initial number of partitions must be positive.")
       .createOptional
 
+  lazy val COLLATION_ENABLED =
+    buildConf("spark.sql.collation.enabled")
+      .doc("Collations feature is under development and its use should be done 
under this " +
+        "feature flag.")
+      .version("4.0.0")
+      .booleanConf
+      .createWithDefault(Utils.isTesting)
+
   val FETCH_SHUFFLE_BLOCKS_IN_BATCH =
     buildConf("spark.sql.adaptive.fetchShuffleBlocksInBatch")
       .internal()
@@ -4962,6 +4970,8 @@ class SQLConf extends Serializable with Logging with 
SqlApiConf {
     }
   }
 
+  def collationEnabled: Boolean = getConf(COLLATION_ENABLED)
+
   def adaptiveExecutionEnabled: Boolean = getConf(ADAPTIVE_EXECUTION_ENABLED)
 
   def adaptiveExecutionLogLevel: String = getConf(ADAPTIVE_EXECUTION_LOG_LEVEL)
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
index fceef89c3111..c9ad6dcaac8f 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala
@@ -22,7 +22,7 @@ import java.util.Locale
 import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession}
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.catalog._
-import org.apache.spark.sql.catalyst.expressions.{Expression, 
InputFileBlockLength, InputFileBlockStart, InputFileName, RowOrdering}
+import org.apache.spark.sql.catalyst.expressions.{Collate, Collation, 
Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, 
RowOrdering}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes
@@ -32,6 +32,7 @@ import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.execution.command.DDLUtils
 import org.apache.spark.sql.execution.datasources.{CreateTable => 
CreateTableV1}
 import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.sources.InsertableRelation
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.util.PartitioningUtils.normalizePartitionSpec
@@ -594,3 +595,21 @@ case class QualifyLocationWithWarehouse(catalog: 
SessionCatalog) extends Rule[Lo
       c.copy(tableDesc = newTable)
   }
 }
+
+/**
+ * Plan check that throws `UNSUPPORTED_FEATURE.COLLATION` when a [[Collate]] or [[Collation]]
+ * expression is present while collations are disabled; the flag is read once per plan.
+ */
+object CollationCheck extends (LogicalPlan => Unit) {
+  def apply(plan: LogicalPlan): Unit = {
+    if (!SQLConf.get.collationEnabled) {
+      plan.foreach { operator =>
+        operator.expressions.foreach(_.foreach {
+          case _: Collate | _: Collation =>
+            throw QueryCompilationErrors.collationNotEnabledError()
+          case _ =>
+        })
+      }
+    }
+  }
+}
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala
index 00c72294ca07..63c0d116ba3a 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala
@@ -223,6 +223,7 @@ abstract class BaseSessionStateBuilder(
         HiveOnlyCheck +:
         TableCapabilityCheck +:
         CommandCheck +:
+        CollationCheck +:
         customCheckRules
   }
 
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala
index 7554f8563393..a9506ad5422d 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala
@@ -964,6 +964,37 @@ class QueryCompilationErrorsSuite
         "className" -> "org.apache.spark.sql.catalyst.expressions.UnsafeRow"))
   }
 
+  test("SPARK-47102: the collation feature is off without collate builder 
call") {
+    withSQLConf(SQLConf.COLLATION_ENABLED.key -> "false") {
+      Seq(
+        "CREATE TABLE t(col STRING COLLATE 'UNICODE_CI') USING parquet",
+        "CREATE TABLE t(col STRING COLLATE 'UNKNOWN_COLLATION_STRING') USING 
parquet",
+        "SELECT 'aaa' COLLATE 'UNICODE_CI'",
+        "select collation('aaa')"
+      ).foreach { sqlText =>
+        checkError(
+          exception = intercept[AnalysisException](sql(sqlText)),
+          errorClass = "UNSUPPORTED_FEATURE.COLLATION")
+      }
+    }
+  }
+
+  test("SPARK-47102: the collation feature is off with collate builder call") {
+    withSQLConf(SQLConf.COLLATION_ENABLED.key -> "false") {
+      Seq(
+        "SELECT collate('aaa', 'UNICODE_CI')",
+        "SELECT collate('aaa', 'UNKNOWN_COLLATION_STRING')"
+      ).foreach { sqlText =>
+        checkError(
+          exception = intercept[AnalysisException](sql(sqlText)),
+          errorClass = "UNSUPPORTED_FEATURE.COLLATION",
+          parameters = Map.empty,
+          context = ExpectedContext(
+            fragment = sqlText.substring(7), start = 7, stop = sqlText.length 
- 1))
+      }
+    }
+  }
+
   test("INTERNAL_ERROR: Convert unsupported data type from Spark to Parquet") {
     val converter = new SparkToParquetSchemaConverter
     val dummyDataType = new DataType {
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala
 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala
index 32100d060b09..416299b189cd 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala
@@ -113,6 +113,7 @@ class HiveSessionStateBuilder(
         PreReadCheck +:
         TableCapabilityCheck +:
         CommandCheck +:
+        CollationCheck +:
         customCheckRules
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-47102][SQL] Add the `COLLATION_ENABLED` config flag

Reply via email to