Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]

via GitHub Mon, 27 May 2024 01:27:28 -0700


uros-db commented on code in PR #46700:
URL: https://github.com/apache/spark/pull/46700#discussion_r1615675019



##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -17,15 +17,136 @@
 package org.apache.spark.unsafe.types;
 
 import org.apache.spark.SparkException;
+import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String;
 import org.apache.spark.sql.catalyst.util.CollationFactory;
 import org.apache.spark.sql.catalyst.util.CollationSupport;
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.*;
 
-
+// checkstyle.off: AvoidEscapedUnicodeCharacters
 public class CollationSupportSuite {
 
+  /**
+   * Collation-aware UTF8String comparison.
+   */
+
+  private void assertStringCompare(String s1, String s2, String collationName, 
int expected)
+          throws SparkException {
+    UTF8String l = UTF8String.fromString(s1);
+    UTF8String r = UTF8String.fromString(s2);
+    int compare = 
CollationFactory.fetchCollation(collationName).comparator.compare(l, r);
+    assertEquals(Integer.signum(expected), Integer.signum(compare));
+  }
+
+  @Test
+  public void testCompare() throws SparkException {
+    // Edge cases
+    assertStringCompare("", "", "UTF8_BINARY", 0);
+    assertStringCompare("a", "", "UTF8_BINARY", 1);
+    assertStringCompare("", "a", "UTF8_BINARY", -1);
+    assertStringCompare("", "", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1);
+    assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1);
+    assertStringCompare("", "", "UNICODE", 0);
+    assertStringCompare("a", "", "UNICODE", 1);
+    assertStringCompare("", "a", "UNICODE", -1);
+    assertStringCompare("", "", "UNICODE_CI", 0);
+    assertStringCompare("a", "", "UNICODE_CI", 1);
+    assertStringCompare("", "a", "UNICODE_CI", -1);
+    // Basic tests
+    assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
+    assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("AbcD", "aBCd", "UNICODE", 1);
+    assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0);
+    // Accent variation
+    assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1);
+    assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1);
+    assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
+    // Case-variable character length
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1);
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307", "İ", "UNICODE", -1);
+    assertStringCompare("İ", "i\u0307", "UNICODE", 1);
+    assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0);
+    assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0);
+    // Conditional case mapping
+    assertStringCompare("ς", "σ", "UTF8_BINARY", -1);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "σ", "UNICODE", 1);
+    assertStringCompare("ς", "Σ", "UNICODE", 1);
+    assertStringCompare("σ", "Σ", "UNICODE", -1);
+    assertStringCompare("ς", "σ", "UNICODE_CI", 0);
+    assertStringCompare("ς", "Σ", "UNICODE_CI", 0);
+    assertStringCompare("σ", "Σ", "UNICODE_CI", 0);
+  }
+
+  private void assertLcaseCompare(String target, String expected, String 
collationName)
+      throws SparkException {
+    if (collationName.equals("UTF8_BINARY")) {
+      UTF8String targetUTF8 = UTF8String.fromString(target);
+      UTF8String expectedUTF8 = UTF8String.fromString(expected);
+      assertEquals(expectedUTF8, targetUTF8.toLowerCase());
+    } else if (collationName.equals("UTF8_BINARY_LCASE")) {
+      assertEquals(expected, 
CollationAwareUTF8String.lowerCaseCodePoints(target));
+    } else {
+      int collationId = CollationFactory.collationNameToId(collationName);
+      assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, 
collationId));
+    }
+  }
+
+  @Test
+  public void testLcaseCompare() throws SparkException {
+    // Edge cases
+    assertLcaseCompare("", "", "UTF8_BINARY");
+    assertLcaseCompare("", "", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("", "", "UNICODE");
+    assertLcaseCompare("", "", "UNICODE_CI");
+    // Basic tests
+    assertLcaseCompare("abcd", "abcd", "UTF8_BINARY");
+    assertLcaseCompare("AbCd", "abcd", "UTF8_BINARY");
+    assertLcaseCompare("abcd", "abcd", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("aBcD", "abcd", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("abcd", "abcd", "UNICODE");
+    assertLcaseCompare("aBCd", "abcd", "UNICODE");
+    assertLcaseCompare("abcd", "abcd", "UNICODE_CI");
+    assertLcaseCompare("AbcD", "abcd", "UNICODE_CI");
+    // Accent variation
+    assertLcaseCompare("AbĆd", "abćd", "UTF8_BINARY");
+    assertLcaseCompare("aBcΔ", "abcδ", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("ÄbcD", "äbcd", "UNICODE");
+    assertLcaseCompare("aB́Cd", "ab́cd", "UNICODE_CI");
+    // Case-variable character length
+    assertLcaseCompare("İoDiNe", "i̇odine", "UTF8_BINARY");
+    assertLcaseCompare("Abi̇o12", "abi̇o12", "UTF8_BINARY");
+    assertLcaseCompare("İodInE", "i̇odine", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("aBi̇o12", "abi̇o12", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("İoDinE", "i̇odine", "UNICODE");
+    assertLcaseCompare("abi̇O12", "abi̇o12", "UNICODE");
+    assertLcaseCompare("İodINe", "i̇odine", "UNICODE_CI");
+    assertLcaseCompare("ABi̇o12", "abi̇o12", "UNICODE_CI");
+    // Conditional case mapping
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UTF8_BINARY");
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", "UTF8_BINARY_LCASE"); // != 
UNICODE_CI
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE");
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI");

Review Comment:
   edit: updated tests to reflect this



##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -17,15 +17,136 @@
 package org.apache.spark.unsafe.types;
 
 import org.apache.spark.SparkException;
+import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String;
 import org.apache.spark.sql.catalyst.util.CollationFactory;
 import org.apache.spark.sql.catalyst.util.CollationSupport;
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.*;
 
-
+// checkstyle.off: AvoidEscapedUnicodeCharacters
 public class CollationSupportSuite {
 
+  /**
+   * Collation-aware UTF8String comparison.
+   */
+
+  private void assertStringCompare(String s1, String s2, String collationName, 
int expected)
+          throws SparkException {
+    UTF8String l = UTF8String.fromString(s1);
+    UTF8String r = UTF8String.fromString(s2);
+    int compare = 
CollationFactory.fetchCollation(collationName).comparator.compare(l, r);
+    assertEquals(Integer.signum(expected), Integer.signum(compare));
+  }
+
+  @Test
+  public void testCompare() throws SparkException {
+    // Edge cases
+    assertStringCompare("", "", "UTF8_BINARY", 0);
+    assertStringCompare("a", "", "UTF8_BINARY", 1);
+    assertStringCompare("", "a", "UTF8_BINARY", -1);
+    assertStringCompare("", "", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1);
+    assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1);
+    assertStringCompare("", "", "UNICODE", 0);
+    assertStringCompare("a", "", "UNICODE", 1);
+    assertStringCompare("", "a", "UNICODE", -1);
+    assertStringCompare("", "", "UNICODE_CI", 0);
+    assertStringCompare("a", "", "UNICODE_CI", 1);
+    assertStringCompare("", "a", "UNICODE_CI", -1);
+    // Basic tests
+    assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
+    assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("AbcD", "aBCd", "UNICODE", 1);
+    assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0);
+    // Accent variation
+    assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1);
+    assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1);
+    assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
+    // Case-variable character length
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1);
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307", "İ", "UNICODE", -1);
+    assertStringCompare("İ", "i\u0307", "UNICODE", 1);
+    assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0);
+    assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0);
+    // Conditional case mapping
+    assertStringCompare("ς", "σ", "UTF8_BINARY", -1);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "σ", "UNICODE", 1);
+    assertStringCompare("ς", "Σ", "UNICODE", 1);
+    assertStringCompare("σ", "Σ", "UNICODE", -1);
+    assertStringCompare("ς", "σ", "UNICODE_CI", 0);
+    assertStringCompare("ς", "Σ", "UNICODE_CI", 0);
+    assertStringCompare("σ", "Σ", "UNICODE_CI", 0);
+  }
+
+  private void assertLcaseCompare(String target, String expected, String 
collationName)
+      throws SparkException {
+    if (collationName.equals("UTF8_BINARY")) {
+      UTF8String targetUTF8 = UTF8String.fromString(target);
+      UTF8String expectedUTF8 = UTF8String.fromString(expected);
+      assertEquals(expectedUTF8, targetUTF8.toLowerCase());
+    } else if (collationName.equals("UTF8_BINARY_LCASE")) {
+      assertEquals(expected, 
CollationAwareUTF8String.lowerCaseCodePoints(target));
+    } else {
+      int collationId = CollationFactory.collationNameToId(collationName);
+      assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, 
collationId));
+    }
+  }
+
+  @Test
+  public void testLcaseCompare() throws SparkException {

Review Comment:
   (resolved below)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]

Reply via email to