uros-db commented on code in PR #46700:
URL: https://github.com/apache/spark/pull/46700#discussion_r1628861090
##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -26,6 +27,156 @@
// checkstyle.off: AvoidEscapedUnicodeCharacters
public class CollationSupportSuite {
+ /**
+ * A list containing some of the supported collations in Spark. Use this
list to iterate over
+ * all the important collation groups (binary, lowercase, icu) for complete
unit test coverage.
+ * Note: this list may come in handy when the Spark function result is the
same regardless of
+ * the specified collations (as often seen in some pass-through Spark
expressions).
+ */
+ private final String[] testSupportedCollations =
+ {"UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI"};
+
+ /**
+ * Collation-aware UTF8String comparison.
+ */
+
+ private void assertStringCompare(String s1, String s2, String collationName,
int expected)
+ throws SparkException {
+ UTF8String l = UTF8String.fromString(s1);
+ UTF8String r = UTF8String.fromString(s2);
+ int compare =
CollationFactory.fetchCollation(collationName).comparator.compare(l, r);
+ assertEquals(Integer.signum(expected), Integer.signum(compare));
+ }
+
+ @Test
+ public void testCompare() throws SparkException {
+ for (String collationName: testSupportedCollations) {
+ // Edge cases
+ assertStringCompare("", "", collationName, 0);
+ assertStringCompare("a", "", collationName, 1);
+ assertStringCompare("", "a", collationName, -1);
+ // Basic tests
+ assertStringCompare("a", "a", collationName, 0);
+ assertStringCompare("a", "b", collationName, -1);
+ assertStringCompare("b", "a", collationName, 1);
+ assertStringCompare("A", "A", collationName, 0);
+ assertStringCompare("A", "B", collationName, -1);
+ assertStringCompare("B", "A", collationName, 1);
+ assertStringCompare("aa", "a", collationName, 1);
+ assertStringCompare("b", "bb", collationName, -1);
+ assertStringCompare("abc", "a", collationName, 1);
+ assertStringCompare("abc", "b", collationName, -1);
+ assertStringCompare("abc", "ab", collationName, 1);
+ assertStringCompare("abc", "abc", collationName, 0);
+ // ASCII strings
+ assertStringCompare("aaaa", "aaa", collationName, 1);
+ assertStringCompare("hello", "world", collationName, -1);
+ assertStringCompare("Spark", "Spark", collationName, 0);
+ // Non-ASCII strings
+ assertStringCompare("ü", "ü", collationName, 0);
+ assertStringCompare("ü", "", collationName, 1);
+ assertStringCompare("", "ü", collationName, -1);
+ assertStringCompare("äü", "äü", collationName, 0);
+ assertStringCompare("äxx", "äx", collationName, 1);
+ assertStringCompare("a", "ä", collationName, -1);
+ }
+ // Non-ASCII strings
+ assertStringCompare("äü", "bü", "UTF8_BINARY", 1);
+ assertStringCompare("bxx", "bü", "UTF8_BINARY", -1);
+ assertStringCompare("äü", "bü", "UTF8_BINARY_LCASE", 1);
+ assertStringCompare("bxx", "bü", "UTF8_BINARY_LCASE", -1);
+ assertStringCompare("äü", "bü", "UNICODE", -1);
+ assertStringCompare("bxx", "bü", "UNICODE", 1);
+ assertStringCompare("äü", "bü", "UNICODE_CI", -1);
+ assertStringCompare("bxx", "bü", "UNICODE_CI", 1);
+ // Case variation
+ assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
+ assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0);
+ assertStringCompare("AbcD", "aBCd", "UNICODE", 1);
+ assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0);
+ // Accent variation
+ assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1);
+ assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0);
+ assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1);
+ assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
+ // Case-variable character length
+ assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1);
+ assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1);
+ assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0);
+ assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0);
+ assertStringCompare("i\u0307", "İ", "UNICODE", -1);
+ assertStringCompare("İ", "i\u0307", "UNICODE", 1);
+ assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0);
+ assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0);
+ assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+ assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+ assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+ assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+ assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0);
+ assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0);
+ assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0);
+ assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0);
+ // Conditional case mapping
+ assertStringCompare("ς", "σ", "UTF8_BINARY", -1);
+ assertStringCompare("ς", "Σ", "UTF8_BINARY", 1);
+ assertStringCompare("σ", "Σ", "UTF8_BINARY", 1);
+ assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0);
+ assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0);
+ assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0);
+ assertStringCompare("ς", "σ", "UNICODE", 1);
+ assertStringCompare("ς", "Σ", "UNICODE", 1);
+ assertStringCompare("σ", "Σ", "UNICODE", -1);
+ assertStringCompare("ς", "σ", "UNICODE_CI", 0);
+ assertStringCompare("ς", "Σ", "UNICODE_CI", 0);
+ assertStringCompare("σ", "Σ", "UNICODE_CI", 0);
+ }
+
+ private void assertLowerCaseCodePoints(UTF8String target, UTF8String
expected,
+ Boolean useCodePoints) {
+ if (useCodePoints) {
+ assertEquals(expected.toString(),
+ CollationAwareUTF8String.lowerCaseCodePoints(target.toString()));
+ } else {
+ assertEquals(expected, target.toLowerCase());
+ }
+ }
+
+ @Test
+ public void testLowerCaseCodePoints() {
+ // Edge cases
+ assertLowerCaseCodePoints(UTF8String.fromString(""),
UTF8String.fromString(""), false);
+ assertLowerCaseCodePoints(UTF8String.fromString(""),
UTF8String.fromString(""), true);
+ // Basic tests
+ assertLowerCaseCodePoints(UTF8String.fromString("abcd"),
UTF8String.fromString("abcd"), false);
+ assertLowerCaseCodePoints(UTF8String.fromString("AbCd"),
UTF8String.fromString("abcd"), false);
+ assertLowerCaseCodePoints(UTF8String.fromString("abcd"),
UTF8String.fromString("abcd"), true);
+ assertLowerCaseCodePoints(UTF8String.fromString("aBcD"),
UTF8String.fromString("abcd"), true);
+ // Accent variation
+ assertLowerCaseCodePoints(UTF8String.fromString("AbĆd"),
UTF8String.fromString("abćd"), false);
+ assertLowerCaseCodePoints(UTF8String.fromString("aBcΔ"),
UTF8String.fromString("abcδ"), true);
+ // Case-variable character length
+ assertLowerCaseCodePoints(
+ UTF8String.fromString("İoDiNe"), UTF8String.fromString("i̇odine"),
false);
+ assertLowerCaseCodePoints(
+ UTF8String.fromString("Abi̇o12"), UTF8String.fromString("abi̇o12"),
false);
+ assertLowerCaseCodePoints(
+ UTF8String.fromString("İodInE"), UTF8String.fromString("i̇odine"), true);
+ assertLowerCaseCodePoints(
+ UTF8String.fromString("aBi̇o12"), UTF8String.fromString("abi̇o12"),
true);
+ // Conditional case mapping
+ assertLowerCaseCodePoints(
+ UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"),
UTF8String.fromString("θαλασσινος"), false);
+ assertLowerCaseCodePoints(
+ UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"),
UTF8String.fromString("θαλασσινοσ"), true);
+ // Surrogate pairs are treated as invalid UTF8 sequences
+ assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[]
+ {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte)
0x80}),
+ UTF8String.fromString("\ufffd\ufffd"), false);
+ assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[]
+ {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte)
0x80}),
+ UTF8String.fromString("\ufffd\ufffd"), true);
Review Comment:
Yes, the behaviour does seem to differ, and I agree that we should go with the ICU4C logic.
We'll be adding this in a follow-up PR.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]