This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 5d757993f4cf [SPARK-50269][SQL][TESTS] Further improve collation
support testing for various collations
5d757993f4cf is described below
commit 5d757993f4cfcd859eb11640d210a560d6136465
Author: Dejan Krakovic <[email protected]>
AuthorDate: Fri Nov 8 11:10:37 2024 +0100
[SPARK-50269][SQL][TESTS] Further improve collation support testing for
various collations
### What changes were proposed in this pull request?
Extend collation-related unit and e2e SQL tests to cover various collations in
addition to the 4 common collations already used.
This is a follow-up PR to https://github.com/apache/spark/pull/48608,
where it was decided to split the changes into separate PRs. This PR includes the
additional test suites mentioned in the comments of the original PR.
### Why are the changes needed?
Further expand collation testing coverage for various collations,
incorporating different languages, scripts, case/accent sensitivity, etc.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By extending the existing collation-related unit and e2e SQL tests.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #48799 from dejankrak-db/collation-additional-tests.
Authored-by: Dejan Krakovic <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../spark/unsafe/types/CollationSupportSuite.java | 24 ++++++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index a445cde52ad5..a696da8cf45b 100644
---
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -94,6 +94,7 @@ public class CollationSupportSuite {
assertCompare("bxx", "bü", "UNICODE", 1);
assertCompare("äü", "bü", "UNICODE_CI", -1);
assertCompare("bxx", "bü", "UNICODE_CI", 1);
+ assertCompare("cČć", "ČćC", "SR_CI_AI", 0);
// Case variation.
assertCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
assertCompare("ABCD", "abcd", "UTF8_LCASE", 0);
@@ -104,6 +105,7 @@ public class CollationSupportSuite {
assertCompare("AbCδ", "ABCΔ", "UTF8_LCASE", 0);
assertCompare("äBCd", "ÄBCD", "UNICODE", -1);
assertCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
+ assertCompare("ÈÉÊË", "EeEe", "AF_CI_AI", 0);
// One-to-many case mapping (e.g. Turkish dotted I).
assertCompare("i\u0307", "İ", "UTF8_BINARY", -1);
assertCompare("İ", "i\u0307", "UTF8_BINARY", 1);
@@ -334,6 +336,7 @@ public class CollationSupportSuite {
assertContains("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true);
assertContains("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true);
assertContains("The KKelvin.", "KKelvin,", "UTF8_LCASE", false);
+ assertContains("abčćd", "ABCCD", "SR_CI_AI", true);
// Case variation.
assertContains("aBcDe", "bcd", "UTF8_BINARY", false);
assertContains("aBcDe", "BcD", "UTF8_BINARY", true);
@@ -352,6 +355,7 @@ public class CollationSupportSuite {
assertContains("aBcDe", "BĆD", "UTF8_LCASE", false);
assertContains("aBcDe", "abćde", "UNICODE_CI", false);
assertContains("aBcDe", "AbĆdE", "UNICODE_CI", false);
+ assertContains("abEEE", "Bèêë", "AF_CI_AI", true);
// One-to-many case mapping (e.g. Turkish dotted I).
assertContains("i\u0307", "i", "UNICODE_CI", false);
assertContains("i\u0307", "\u0307", "UNICODE_CI", false);
@@ -580,6 +584,11 @@ public class CollationSupportSuite {
assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true);
assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true);
assertStartsWith("KKelvin.", "KKelvin,", "UTF8_LCASE", false);
+ assertStartsWith("Ћао", "Ца", "sr_Cyrl_CI_AI", false);
+ assertStartsWith("Ћао", "ћа", "sr_Cyrl_CI_AI", true);
+ assertStartsWith("Ćao", "Ca", "SR_CI", false);
+ assertStartsWith("Ćao", "Ca", "SR_CI_AI", true);
+ assertStartsWith("Ćao", "Ća", "SR", true);
// Case variation.
assertStartsWith("aBcDe", "abc", "UTF8_BINARY", false);
assertStartsWith("aBcDe", "aBc", "UTF8_BINARY", true);
@@ -832,6 +841,11 @@ public class CollationSupportSuite {
assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_LCASE", true);
assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_LCASE", true);
assertEndsWith("The KKelvin", "KKelvin,", "UTF8_LCASE", false);
+ assertEndsWith("Ћевапчићи", "цици", "sr_Cyrl_CI_AI", false);
+ assertEndsWith("Ћевапчићи", "чИЋи", "sr_Cyrl_CI_AI", true);
+ assertEndsWith("Ćevapčići", "cici", "SR_CI", false);
+ assertEndsWith("Ćevapčići", "cici", "SR_CI_AI", true);
+ assertEndsWith("Ćevapčići", "čići", "SR", true);
// Case variation.
assertEndsWith("aBcDe", "cde", "UTF8_BINARY", false);
assertEndsWith("aBcDe", "cDe", "UTF8_BINARY", true);
@@ -1393,6 +1407,8 @@ public class CollationSupportSuite {
assertInitCap("ÄBĆΔE", "UTF8_LCASE", "Äbćδe");
assertInitCap("ÄBĆΔE", "UNICODE", "Äbćδe");
assertInitCap("ÄBĆΔE", "UNICODE_CI", "Äbćδe");
+ assertInitCap("êéfgh", "AF_CI_AI", "Êéfgh");
+ assertInitCap("öoAÄ", "DE_CI_AI", "Öoaä");
// Case-variable character length
assertInitCap("İo", "UTF8_BINARY", "İo", "I\u0307o");
assertInitCap("İo", "UTF8_LCASE", "İo");
@@ -1580,6 +1596,8 @@ public class CollationSupportSuite {
assertStringInstr("aaadS", "Ds", "UTF8_LCASE", 4);
assertStringInstr("aaadS", "Ds", "UNICODE", 0);
assertStringInstr("aaadS", "Ds", "UNICODE_CI", 4);
+ assertStringInstr("aaaČŠčšcs", "cs", "SR", 8);
+ assertStringInstr("aaaČŠčšcs", "cs", "SR_CI_AI", 4);
// Advanced tests.
assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5);
assertStringInstr("test大千世界X大千世界", "大千", "UTF8_LCASE", 5);
@@ -2038,6 +2056,7 @@ public class CollationSupportSuite {
assertStringReplace("aBc世abc", "b", "12", "UNICODE_CI", "a12c世a12c");
assertStringReplace("a世Bcdabcd", "bC", "", "UNICODE_CI", "a世dad");
assertStringReplace("repl世ace", "Pl", "", "UNICODE_CI", "re世ace");
+ assertStringReplace("abcčšdabĆŠscd", "cs", "", "SR_CI_AI", "abcdabscd");
// One-to-many case mapping (e.g. Turkish dotted I).
assertStringReplace("abi̇12", "i", "X", "UNICODE_CI", "abi̇12");
assertStringReplace("abi̇12", "\u0307", "X", "UNICODE_CI", "abi̇12");
@@ -2231,6 +2250,8 @@ public class CollationSupportSuite {
assertStringLocate("aa", "Aaads", 1, "UTF8_LCASE", 1);
assertStringLocate("aa", "Aaads", 1, "UNICODE", 2);
assertStringLocate("aa", "Aaads", 1, "UNICODE_CI", 1);
+ assertStringLocate("ćČ", "CćČČćCČĆČcČcććČč", 3, "SR", 14);
+ assertStringLocate("ćČ", "CćČČćCČĆČcČcććČč", 3, "SR_CI_AI", 3);
// Advanced tests.
assertStringLocate("界x", "test大千世界X大千世界", 1, "UTF8_BINARY", 0);
assertStringLocate("界X", "test大千世界X大千世界", 1, "UTF8_BINARY", 8);
@@ -2581,6 +2602,7 @@ public class CollationSupportSuite {
assertSubstringIndex("test大千世界X大千世界", "X", 1, "UNICODE_CI", "test大千世界");
assertSubstringIndex("test大千世界大千世界", "千", 2, "UNICODE_CI", "test大千世界大");
assertSubstringIndex("www||APACHE||org", "||", 2, "UNICODE_CI",
"www||APACHE");
+ assertSubstringIndex("wwwèapacheËorg", "Ê", -3, "AF_CI_AI", "apacheËorg");
// One-to-many case mapping (e.g. Turkish dotted I).
assertSubstringIndex("abİo12", "i\u0307o", 1, "UNICODE_CI", "ab");
assertSubstringIndex("abİo12", "i\u0307o", -1, "UNICODE_CI", "12");
@@ -2803,6 +2825,7 @@ public class CollationSupportSuite {
assertStringTrim("UNICODE_CI", "asd", "A", "sd");
assertStringTrim("UNICODE_CI", "ASD", "a", "SD");
assertStringTrim("UNICODE_CI", "ddsXXXaa", "ASD", "XXX");
+ assertStringTrim("SR_CI_AI", "cSCšćČXXXsčšČŠsć", "čš", "XXX");
// One-to-many case mapping (e.g. Turkish dotted I)..
assertStringTrim("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
assertStringTrim("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
@@ -3730,6 +3753,7 @@ public class CollationSupportSuite {
assertStringTranslate("abcdef", "abcde", "123", "UTF8_LCASE", "123f");
assertStringTranslate("abcdef", "abcde", "123", "UNICODE", "123f");
assertStringTranslate("abcdef", "abcde", "123", "UNICODE_CI", "123f");
+ assertStringTranslate("abcdëÈêf", "ÊèË", "123", "AF_CI", "abcd321f");
// One-to-many case mapping (e.g. Turkish dotted I).
assertStringTranslate("İ", "i\u0307", "xy", "UTF8_BINARY", "İ");
assertStringTranslate("İ", "i\u0307", "xy", "UTF8_LCASE", "İ");
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]