This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 8f82d0f3dc17 [SPARK-49663][SQL] Enable RTRIM suggestions in collation
expressions
8f82d0f3dc17 is described below
commit 8f82d0f3dc1790ca17816fd7b8fb908b7c84fd90
Author: Jovan Pavlovic <[email protected]>
AuthorDate: Wed Oct 30 11:51:57 2024 +0100
[SPARK-49663][SQL] Enable RTRIM suggestions in collation expressions
### What changes were proposed in this pull request?
Add RTRIM collation suggestions for collation expressions when an invalid
collation name is specified.
### Why are the changes needed?
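Better customer experience: when a collation name is invalid, the suggested
alternatives in the error can now include the RTRIM modifier.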
### Does this PR introduce _any_ user-facing change?
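Yes. The collation names proposed in the error for an invalid collation name
may now include the `_RTRIM` modifier (e.g. `UTF8_BINARY_RTRIM`).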
### How was this patch tested?
Added tests in CollationFactorySuite.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #48545 from jovanpavl-db/rtrim_closest_suggestions.
Authored-by: Jovan Pavlovic <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../spark/sql/catalyst/util/CollationFactory.java | 19 ++++++---
.../spark/unsafe/types/CollationFactorySuite.scala | 46 +++++++++++++++-------
2 files changed, 44 insertions(+), 21 deletions(-)
diff --git
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index 3117854a432b..ad5e5ae845f8 100644
---
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -1278,19 +1278,26 @@ public final class CollationFactory {
Collation.CollationSpecUTF8.UTF8_BINARY_COLLATION.collationName,
Collation.CollationSpecUTF8.UTF8_LCASE_COLLATION.collationName
};
- validModifiers = new String[0];
+ validModifiers = new String[]{"_RTRIM"};
} else {
validRootNames = getICULocaleNames();
- validModifiers = new String[]{"_CI", "_AI", "_CS", "_AS"};
+ validModifiers = new String[]{"_CI", "_AI", "_CS", "_AS", "_RTRIM"};
}
// Split modifiers and locale name.
- final int MODIFIER_LENGTH = 3;
+ boolean foundModifier = true;
String localeName = collationName.toUpperCase();
List<String> modifiers = new ArrayList<>();
- while (Arrays.stream(validModifiers).anyMatch(localeName::endsWith)) {
- modifiers.add(localeName.substring(localeName.length() -
MODIFIER_LENGTH));
- localeName = localeName.substring(0, localeName.length() -
MODIFIER_LENGTH);
+ while (foundModifier) {
+ foundModifier = false;
+ for (String modifier : validModifiers) {
+ if (localeName.endsWith(modifier)) {
+ modifiers.add(modifier);
+ localeName = localeName.substring(0, localeName.length() -
modifier.length());
+ foundModifier = true;
+ break;
+ }
+ }
}
// Suggest version with unique modifiers.
diff --git
a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala
b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala
index 1b16432e6378..0e94073e4773 100644
---
a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala
+++
b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala
@@ -100,27 +100,33 @@ class CollationFactorySuite extends AnyFunSuite with
Matchers { // scalastyle:ig
Seq(
("UTF8_BINARY_CS", "UTF8_BINARY"),
("UTF8_BINARY_AS", "UTF8_BINARY"), // this should be UNICODE_AS
- ("UTF8_BINARY_CS_AS","UTF8_BINARY"), // this should be UNICODE_CS_AS
- ("UTF8_BINARY_AS_CS","UTF8_BINARY"),
- ("UTF8_BINARY_CI","UTF8_BINARY"),
- ("UTF8_BINARY_AI","UTF8_BINARY"),
- ("UTF8_BINARY_CI_AI","UTF8_BINARY"),
- ("UTF8_BINARY_AI_CI","UTF8_BINARY"),
- ("UTF8_BS","UTF8_LCASE"),
- ("BINARY_UTF8","ar_SAU"),
- ("UTF8_BINARY_A","UTF8_BINARY"),
- ("UNICODE_X","UNICODE"),
- ("UNICODE_CI_X","UNICODE"),
- ("UNICODE_LCASE_X","UNICODE"),
- ("UTF8_UNICODE","UTF8_LCASE"),
- ("UTF8_BINARY_UNICODE","UTF8_BINARY"),
+ ("UTF8_BINARY_CS_AS", "UTF8_BINARY"), // this should be UNICODE_CS_AS
+ ("UTF8_BINARY_AS_CS", "UTF8_BINARY"),
+ ("UTF8_BINARY_CI", "UTF8_BINARY"),
+ ("UTF8_BINARY_AI", "UTF8_BINARY"),
+ ("UTF8_BINARY_CI_AI", "UTF8_BINARY"),
+ ("UTF8_BINARY_AI_CI", "UTF8_BINARY"),
+ ("UTF8_BINARY_AI_RTRIM", "UTF8_BINARY_RTRIM"),
+ ("UTF8_BINARY_CI_RTRIM", "UTF8_BINARY_RTRIM"),
+ ("UTF8_BINARY_AI_CI_RTRIM", "UTF8_BINARY_RTRIM"),
+ ("UTF8_BS", "UTF8_LCASE"),
+ ("BINARY_UTF8", "ar_SAU"),
+ ("UTF8_BINARY_A", "UTF8_BINARY"),
+ ("UNICODE_X", "UNICODE"),
+ ("UNICODE_CI_X", "UNICODE"),
+ ("UNICODE_LCASE_X", "UNICODE"),
+ ("UNICODE_RTRIM_LCASE_X", "UNICODE"),
+ ("UTF8_UNICODE", "UTF8_LCASE"),
+ ("UTF8_BINARY_UNICODE", "UTF8_BINARY"),
("CI_UNICODE", "UNICODE"),
("LCASE_UNICODE", "UNICODE"),
+ ("RTRIM_UNICODE", "UNICODE"),
("UNICODE_UNSPECIFIED", "UNICODE"),
("UNICODE_CI_UNSPECIFIED", "UNICODE"),
("UNICODE_UNSPECIFIED_CI_UNSPECIFIED", "UNICODE"),
("UNICODE_INDETERMINATE", "UNICODE"),
- ("UNICODE_CI_INDETERMINATE", "UNICODE")
+ ("UNICODE_CI_INDETERMINATE", "UNICODE"),
+ ("UNICODE_RTRIM_INDETERMINATE", "UNICODE")
).foreach{case (collationName, proposals) =>
checkCollationNameError(collationName, proposals)
}
@@ -372,15 +378,23 @@ class CollationFactorySuite extends AnyFunSuite with
Matchers { // scalastyle:ig
("CI_en", "ceb"),
("USA_CI_en", "UNICODE"),
("en_CI_USA", "en_USA"),
+ ("en_RTRIM_USA", "en_USA"),
("CI_sr_Cyrl_SRB", "sr_Cyrl_SRB"),
+ ("RTRIM_sr_Cyrl_SRB", "sr_Cyrl_SRB"),
("sr_CI_Cyrl_SRB", "sr_Cyrl_SRB"),
+ ("sr_RTRIM_Cyrl_SRB", "sr_Cyrl_SRB"),
("sr_Cyrl_CI_SRB", "sr_Cyrl_SRB"),
+ ("sr_Cyrl_RTRIM_SRB", "sr_Cyrl_SRB"),
("CI_Cyrl_sr", "sr_Cyrl_SRB"),
+ ("RTRIM_Cyrl_sr", "sr_Cyrl_SRB"),
("Cyrl_CI_sr", "he_ISR"),
("Cyrl_CI_sr_SRB", "sr_Cyrl_SRB"),
+ ("Cyrl_RTRIM_sr_SRB", "sr_Cyrl_SRB"),
("Cyrl_sr_CI_SRB", "sr_Cyrl_SRB"),
+ ("Cyrl_sr_RTRIM_SRB", "sr_Cyrl_SRB"),
// no locale specified
("_CI_AI", "af_CI_AI, am_CI_AI, ar_CI_AI"),
+ ("_CI_AI_RTRIM", "af_CI_AI_RTRIM, am_CI_AI_RTRIM, ar_CI_AI_RTRIM"),
("", "af, am, ar")
).foreach { case (collationName, proposals) =>
checkCollationNameError(collationName, proposals)
@@ -476,6 +490,7 @@ class CollationFactorySuite extends AnyFunSuite with
Matchers { // scalastyle:ig
("UNICODE_CI_CI", "UNICODE_CI"),
("UNICODE_CI_CS", "UNICODE_CS"),
("UNICODE_CS_CI", "UNICODE_CS"),
+ ("UNICODE_RTRIM_RTRIM", "UNICODE_RTRIM"),
("UNICODE_AS_AS", "UNICODE_AS"),
("UNICODE_AI_AI", "UNICODE_AI"),
("UNICODE_AS_AI", "UNICODE_AS"),
@@ -485,6 +500,7 @@ class CollationFactorySuite extends AnyFunSuite with
Matchers { // scalastyle:ig
("UNICODE_CS_AS_CI_AI", "UNICODE_CS_AS"),
("UNICODE__CS__AS", "UNICODE_AS"),
("UNICODE-CS-AS", "UNICODE"),
+ ("UNICODE__CS__RTRIM", "UNICODE_RTRIM"),
("UNICODECSAS", "UNICODE"),
("_CS_AS_UNICODE", "UNICODE")
).foreach { case (collationName, proposals) =>
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]