Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]

via GitHub Wed, 05 Jun 2024 23:43:01 -0700


uros-db commented on code in PR #46700:
URL: https://github.com/apache/spark/pull/46700#discussion_r1628861090



##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -26,6 +27,156 @@
 // checkstyle.off: AvoidEscapedUnicodeCharacters
 public class CollationSupportSuite {
 
+  /**
+   * A list containing some of the supported collations in Spark. Use this 
list to iterate over
+   * all the important collation groups (binary, lowercase, icu) for complete 
unit test coverage.
+   * Note: this list may come in handy when the Spark function result is the 
same regardless of
+   * the specified collations (as often seen in some pass-through Spark 
expressions).
+   */
+  private final String[] testSupportedCollations =
+    {"UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI"};
+
+  /**
+   * Collation-aware UTF8String comparison.
+   */
+
+  private void assertStringCompare(String s1, String s2, String collationName, 
int expected)
+          throws SparkException {
+    UTF8String l = UTF8String.fromString(s1);
+    UTF8String r = UTF8String.fromString(s2);
+    int compare = 
CollationFactory.fetchCollation(collationName).comparator.compare(l, r);
+    assertEquals(Integer.signum(expected), Integer.signum(compare));
+  }
+
+  @Test
+  public void testCompare() throws SparkException {
+    for (String collationName: testSupportedCollations) {
+      // Edge cases
+      assertStringCompare("", "", collationName, 0);
+      assertStringCompare("a", "", collationName, 1);
+      assertStringCompare("", "a", collationName, -1);
+      // Basic tests
+      assertStringCompare("a", "a", collationName, 0);
+      assertStringCompare("a", "b", collationName, -1);
+      assertStringCompare("b", "a", collationName, 1);
+      assertStringCompare("A", "A", collationName, 0);
+      assertStringCompare("A", "B", collationName, -1);
+      assertStringCompare("B", "A", collationName, 1);
+      assertStringCompare("aa", "a", collationName, 1);
+      assertStringCompare("b", "bb", collationName, -1);
+      assertStringCompare("abc", "a", collationName, 1);
+      assertStringCompare("abc", "b", collationName, -1);
+      assertStringCompare("abc", "ab", collationName, 1);
+      assertStringCompare("abc", "abc", collationName, 0);
+      // ASCII strings
+      assertStringCompare("aaaa", "aaa", collationName, 1);
+      assertStringCompare("hello", "world", collationName, -1);
+      assertStringCompare("Spark", "Spark", collationName, 0);
+      // Non-ASCII strings
+      assertStringCompare("ü", "ü", collationName, 0);
+      assertStringCompare("ü", "", collationName, 1);
+      assertStringCompare("", "ü", collationName, -1);
+      assertStringCompare("äü", "äü", collationName, 0);
+      assertStringCompare("äxx", "äx", collationName, 1);
+      assertStringCompare("a", "ä", collationName, -1);
+    }
+    // Non-ASCII strings
+    assertStringCompare("äü", "bü", "UTF8_BINARY", 1);
+    assertStringCompare("bxx", "bü", "UTF8_BINARY", -1);
+    assertStringCompare("äü", "bü", "UTF8_BINARY_LCASE", 1);
+    assertStringCompare("bxx", "bü", "UTF8_BINARY_LCASE", -1);
+    assertStringCompare("äü", "bü", "UNICODE", -1);
+    assertStringCompare("bxx", "bü", "UNICODE", 1);
+    assertStringCompare("äü", "bü", "UNICODE_CI", -1);
+    assertStringCompare("bxx", "bü", "UNICODE_CI", 1);
+    // Case variation
+    assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
+    assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("AbcD", "aBCd", "UNICODE", 1);
+    assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0);
+    // Accent variation
+    assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1);
+    assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1);
+    assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
+    // Case-variable character length
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1);
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307", "İ", "UNICODE", -1);
+    assertStringCompare("İ", "i\u0307", "UNICODE", 1);
+    assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0);
+    assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0);
+    // Conditional case mapping
+    assertStringCompare("ς", "σ", "UTF8_BINARY", -1);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "σ", "UNICODE", 1);
+    assertStringCompare("ς", "Σ", "UNICODE", 1);
+    assertStringCompare("σ", "Σ", "UNICODE", -1);
+    assertStringCompare("ς", "σ", "UNICODE_CI", 0);
+    assertStringCompare("ς", "Σ", "UNICODE_CI", 0);
+    assertStringCompare("σ", "Σ", "UNICODE_CI", 0);
+  }
+
+  private void assertLowerCaseCodePoints(UTF8String target, UTF8String 
expected,
+      Boolean useCodePoints) {
+    if (useCodePoints) {
+      assertEquals(expected.toString(),
+        CollationAwareUTF8String.lowerCaseCodePoints(target.toString()));
+    } else {
+      assertEquals(expected, target.toLowerCase());
+    }
+  }
+
+  @Test
+  public void testLowerCaseCodePoints() {
+    // Edge cases
+    assertLowerCaseCodePoints(UTF8String.fromString(""), 
UTF8String.fromString(""), false);
+    assertLowerCaseCodePoints(UTF8String.fromString(""), 
UTF8String.fromString(""), true);
+    // Basic tests
+    assertLowerCaseCodePoints(UTF8String.fromString("abcd"), 
UTF8String.fromString("abcd"), false);
+    assertLowerCaseCodePoints(UTF8String.fromString("AbCd"), 
UTF8String.fromString("abcd"), false);
+    assertLowerCaseCodePoints(UTF8String.fromString("abcd"), 
UTF8String.fromString("abcd"), true);
+    assertLowerCaseCodePoints(UTF8String.fromString("aBcD"), 
UTF8String.fromString("abcd"), true);
+    // Accent variation
+    assertLowerCaseCodePoints(UTF8String.fromString("AbĆd"), 
UTF8String.fromString("abćd"), false);
+    assertLowerCaseCodePoints(UTF8String.fromString("aBcΔ"), 
UTF8String.fromString("abcδ"), true);
+    // Case-variable character length
+    assertLowerCaseCodePoints(
+      UTF8String.fromString("İoDiNe"), UTF8String.fromString("i̇odine"), 
false);
+    assertLowerCaseCodePoints(
+      UTF8String.fromString("Abi̇o12"), UTF8String.fromString("abi̇o12"), 
false);
+    assertLowerCaseCodePoints(
+      UTF8String.fromString("İodInE"), UTF8String.fromString("i̇odine"), true);
+    assertLowerCaseCodePoints(
+      UTF8String.fromString("aBi̇o12"), UTF8String.fromString("abi̇o12"), 
true);
+    // Conditional case mapping
+    assertLowerCaseCodePoints(
+      UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), 
UTF8String.fromString("θαλασσινος"), false);
+    assertLowerCaseCodePoints(
+      UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), 
UTF8String.fromString("θαλασσινοσ"), true);
+    // Surrogate pairs are treated as invalid UTF8 sequences
+    assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[]
+      {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 
0x80}),
+      UTF8String.fromString("\ufffd\ufffd"), false);
+    assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[]
+      {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 
0x80}),
+      UTF8String.fromString("\ufffd\ufffd"), true);

Review Comment:
   yes, the behaviour does seem to differ - and I agree to go with ICU4C logic
   
   we'll be adding this in a follow-up PR



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]

Reply via email to