stefankandic commented on code in PR #48222:
URL: https://github.com/apache/spark/pull/48222#discussion_r1772925056
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala:
##########
@@ -759,6 +759,14 @@ object SQLConf {
.checkValue(_ > 0, "The initial number of partitions must be positive.")
.createOptional
+ lazy val TRIM_COLLATION_ENABLED =
+ buildConf("spark.sql.trimCollation.enabled")
Review Comment:
```suggestion
buildConf("spark.sql.collation.trim.enabled")
```
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala:
##########
@@ -640,6 +639,26 @@ case class QualifyLocationWithWarehouse(catalog:
SessionCatalog) extends Rule[Lo
}
}
+object TrimCollationCheck extends (LogicalPlan => Unit) {
Review Comment:
Why do we need a new analyzer rule for this? Isn't the check in the ast
builder enough?
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java:
##########
@@ -661,58 +775,111 @@ private static int collationNameToId(
}
if (lastPos == -1) {
throw collationInvalidNameException(originalName);
- } else {
- String locale = collationName.substring(0, lastPos);
- int collationId =
ICULocaleToId.get(ICULocaleMapUppercase.get(locale));
-
- // Try all combinations of AS/AI and CS/CI.
- CaseSensitivity caseSensitivity;
- AccentSensitivity accentSensitivity;
- if (collationName.equals(locale) ||
- collationName.equals(locale + "_AS") ||
- collationName.equals(locale + "_CS") ||
- collationName.equals(locale + "_AS_CS") ||
- collationName.equals(locale + "_CS_AS")
- ) {
- caseSensitivity = CaseSensitivity.CS;
- accentSensitivity = AccentSensitivity.AS;
- } else if (collationName.equals(locale + "_CI") ||
- collationName.equals(locale + "_AS_CI") ||
- collationName.equals(locale + "_CI_AS")) {
- caseSensitivity = CaseSensitivity.CI;
- accentSensitivity = AccentSensitivity.AS;
- } else if (collationName.equals(locale + "_AI") ||
- collationName.equals(locale + "_CS_AI") ||
- collationName.equals(locale + "_AI_CS")) {
- caseSensitivity = CaseSensitivity.CS;
- accentSensitivity = AccentSensitivity.AI;
- } else if (collationName.equals(locale + "_AI_CI") ||
- collationName.equals(locale + "_CI_AI")) {
- caseSensitivity = CaseSensitivity.CI;
- accentSensitivity = AccentSensitivity.AI;
- } else {
- throw collationInvalidNameException(originalName);
- }
+ }
+ String locale = collationName.substring(0, lastPos);
+ int collationId = ICULocaleToId.get(ICULocaleMapUppercase.get(locale));
+ collationId = SpecifierUtils.setSpecValue(collationId,
+ IMPLEMENTATION_PROVIDER_OFFSET, ImplementationProvider.ICU);
- // Build collation ID from computed specifiers.
- collationId = SpecifierUtils.setSpecValue(collationId,
- IMPLEMENTATION_PROVIDER_OFFSET, ImplementationProvider.ICU);
- collationId = SpecifierUtils.setSpecValue(collationId,
- CASE_SENSITIVITY_OFFSET, caseSensitivity);
- collationId = SpecifierUtils.setSpecValue(collationId,
- ACCENT_SENSITIVITY_OFFSET, accentSensitivity);
+ // No other specifiers present.
+ if(collationName.equals(locale)){
return collationId;
}
+ if(collationName.charAt(locale.length()) != '_'){
+ throw collationInvalidNameException(originalName);
+ }
+ // Extract remaining specifiers and trim "_" separator.
+ String remainingSpecifiers = collationName.substring(lastPos + 1);
+
+ // Initialize default specifier flags.
+ boolean isCaseSpecifierSet = false;
+ boolean isAccentSpecifierSet = false;
+ boolean isSpaceTrimmingSpecifierSet = false;
+ CaseSensitivity caseSensitivity = CaseSensitivity.CS; //
Default: Case Sensitive
+ AccentSensitivity accentSensitivity = AccentSensitivity.AS; //
Default: Accent Sensitive
+ SpaceTrimming spaceTrimming = SpaceTrimming.NONE; // Default:
No Trim
+
+ String[] specifiers = remainingSpecifiers.split("_");
+
+ // Iterate through specifiers and set corresponding flags
+ for (String specifier : specifiers) {
+ switch (specifier) {
+ case "CI":
Review Comment:
would it be possible to do something like:
```java
case "CI":
case "CS":
assertNotSet(caseSpecifierSet);
caseSensitivity = CaseSensitivity.valueOf(specifier);
caseSpecifierSet = true;
```
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java:
##########
@@ -661,58 +775,111 @@ private static int collationNameToId(
}
if (lastPos == -1) {
throw collationInvalidNameException(originalName);
- } else {
- String locale = collationName.substring(0, lastPos);
- int collationId =
ICULocaleToId.get(ICULocaleMapUppercase.get(locale));
-
- // Try all combinations of AS/AI and CS/CI.
- CaseSensitivity caseSensitivity;
- AccentSensitivity accentSensitivity;
- if (collationName.equals(locale) ||
- collationName.equals(locale + "_AS") ||
- collationName.equals(locale + "_CS") ||
- collationName.equals(locale + "_AS_CS") ||
- collationName.equals(locale + "_CS_AS")
- ) {
- caseSensitivity = CaseSensitivity.CS;
- accentSensitivity = AccentSensitivity.AS;
- } else if (collationName.equals(locale + "_CI") ||
- collationName.equals(locale + "_AS_CI") ||
- collationName.equals(locale + "_CI_AS")) {
- caseSensitivity = CaseSensitivity.CI;
- accentSensitivity = AccentSensitivity.AS;
- } else if (collationName.equals(locale + "_AI") ||
- collationName.equals(locale + "_CS_AI") ||
- collationName.equals(locale + "_AI_CS")) {
- caseSensitivity = CaseSensitivity.CS;
- accentSensitivity = AccentSensitivity.AI;
- } else if (collationName.equals(locale + "_AI_CI") ||
- collationName.equals(locale + "_CI_AI")) {
- caseSensitivity = CaseSensitivity.CI;
- accentSensitivity = AccentSensitivity.AI;
- } else {
- throw collationInvalidNameException(originalName);
- }
+ }
+ String locale = collationName.substring(0, lastPos);
+ int collationId = ICULocaleToId.get(ICULocaleMapUppercase.get(locale));
+ collationId = SpecifierUtils.setSpecValue(collationId,
+ IMPLEMENTATION_PROVIDER_OFFSET, ImplementationProvider.ICU);
- // Build collation ID from computed specifiers.
- collationId = SpecifierUtils.setSpecValue(collationId,
- IMPLEMENTATION_PROVIDER_OFFSET, ImplementationProvider.ICU);
- collationId = SpecifierUtils.setSpecValue(collationId,
- CASE_SENSITIVITY_OFFSET, caseSensitivity);
- collationId = SpecifierUtils.setSpecValue(collationId,
- ACCENT_SENSITIVITY_OFFSET, accentSensitivity);
+ // No other specifiers present.
+ if(collationName.equals(locale)){
return collationId;
}
+ if(collationName.charAt(locale.length()) != '_'){
+ throw collationInvalidNameException(originalName);
+ }
+ // Extract remaining specifiers and trim "_" separator.
+ String remainingSpecifiers = collationName.substring(lastPos + 1);
+
+ // Initialize default specifier flags.
+ boolean isCaseSpecifierSet = false;
+ boolean isAccentSpecifierSet = false;
+ boolean isSpaceTrimmingSpecifierSet = false;
+ CaseSensitivity caseSensitivity = CaseSensitivity.CS; //
Default: Case Sensitive
Review Comment:
nit: comments usually go above the line itself
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala:
##########
@@ -18,15 +18,13 @@
package org.apache.spark.sql.execution.datasources
import java.util.Locale
-
Review Comment:
seems like unrelated change
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala:
##########
@@ -640,6 +639,26 @@ case class QualifyLocationWithWarehouse(catalog:
SessionCatalog) extends Rule[Lo
}
}
+object TrimCollationCheck extends (LogicalPlan => Unit) {
Review Comment:
If you want to catch the session string type you can do it here:
https://github.com/stefankandic/spark/blob/5192476531e3265495a2da11c14ac04e122f7d22/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L779
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala:
##########
@@ -18,15 +18,13 @@
package org.apache.spark.sql.execution.datasources
import java.util.Locale
-
import scala.collection.mutable.{HashMap, HashSet}
import scala.jdk.CollectionConverters._
-
Review Comment:
ditto
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala:
##########
@@ -149,6 +149,13 @@ class SparkSqlAstBuilder extends AstBuilder {
* }}}
*/
override def visitSetCollation(ctx: SetCollationContext): LogicalPlan =
withOrigin(ctx) {
+ val collationName = ctx.collationName.getText
+ if (!SQLConf.get.trimCollationEnabled &&
+ (collationName.toUpperCase().contains("TRIM") ||
Review Comment:
ditto
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala:
##########
@@ -2557,6 +2557,13 @@ class AstBuilder extends DataTypeAstBuilder
}
override def visitCollateClause(ctx: CollateClauseContext): String =
withOrigin(ctx) {
+ val collationName = ctx.collationName.getText
+ if (!SQLConf.get.trimCollationEnabled &&
+ (collationName.toUpperCase().contains("TRIM") ||
Review Comment:
isn't this first check making the other two redundant?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]