Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]

via GitHub Thu, 23 May 2024 20:06:50 -0700


mkaravel commented on code in PR #46700:
URL: https://github.com/apache/spark/pull/46700#discussion_r1612550274



##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -147,6 +162,45 @@ public static String toLowerCase(final String target, 
final int collationId) {
     return UCharacter.toLowerCase(locale, target);
   }
 
+  /**
+   * Converts a single code point to lowercase using ICU rules, with special 
handling for
+   * conditional case mappings (i.e. characters that map to multiple 
characters in lowercase).
+   *
+   * @param codePoint The code point to convert to lowercase.
+   * @param sb The StringBuilder to append the lowercase character to.
+   */
+  private static void lowercaseCodePoint(final int codePoint, final 
StringBuilder sb) {
+    // Latin capital letter I with dot above is mapped to 2 lowercase 
characters.
+    if (codePoint == 0x0130) {
+      sb.append("i̇");
+    }
+    // Greek final and non-final capital letter sigma should be mapped the 
same.
+    else if (codePoint == 0x03C2) {
+      sb.append("σ");
+    }
+    // All other characters should follow context-unaware ICU single-code 
point case mapping.
+    else {
+      sb.appendCodePoint(UCharacter.toLowerCase(codePoint));
+    }

Review Comment:
   I think the following is easier to read:
   ```suggestion
       // Latin capital letter I with dot above is mapped to 2 lowercase 
characters.
       if (codePoint == 0x0130) {
         // Latin capital letter I with dot above is mapped to 2 lowercase 
characters.
         sb.append("i̇");
       } else if (codePoint == 0x03C2) {
         // Greek final and non-final capital letter sigma should be mapped the 
same.
         sb.append("σ");
       } else {
         // All other characters should follow context-unaware ICU single-code 
point case mapping.
         sb.appendCodePoint(UCharacter.toLowerCase(codePoint));
       }
   ```



##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -17,15 +17,136 @@
 package org.apache.spark.unsafe.types;
 
 import org.apache.spark.SparkException;
+import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String;
 import org.apache.spark.sql.catalyst.util.CollationFactory;
 import org.apache.spark.sql.catalyst.util.CollationSupport;
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.*;
 
-
+// checkstyle.off: AvoidEscapedUnicodeCharacters
 public class CollationSupportSuite {
 
+  /**
+   * Collation-aware UTF8String comparison.
+   */
+
+  private void assertStringCompare(String s1, String s2, String collationName, 
int expected)
+          throws SparkException {
+    UTF8String l = UTF8String.fromString(s1);
+    UTF8String r = UTF8String.fromString(s2);
+    int compare = 
CollationFactory.fetchCollation(collationName).comparator.compare(l, r);
+    assertEquals(Integer.signum(expected), Integer.signum(compare));
+  }
+
+  @Test
+  public void testCompare() throws SparkException {
+    // Edge cases
+    assertStringCompare("", "", "UTF8_BINARY", 0);
+    assertStringCompare("a", "", "UTF8_BINARY", 1);
+    assertStringCompare("", "a", "UTF8_BINARY", -1);
+    assertStringCompare("", "", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1);
+    assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1);
+    assertStringCompare("", "", "UNICODE", 0);
+    assertStringCompare("a", "", "UNICODE", 1);
+    assertStringCompare("", "a", "UNICODE", -1);
+    assertStringCompare("", "", "UNICODE_CI", 0);
+    assertStringCompare("a", "", "UNICODE_CI", 1);
+    assertStringCompare("", "a", "UNICODE_CI", -1);
+    // Basic tests
+    assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
+    assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("AbcD", "aBCd", "UNICODE", 1);
+    assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0);
+    // Accent variation
+    assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1);
+    assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1);
+    assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
+    // Case-variable character length
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1);
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307", "İ", "UNICODE", -1);
+    assertStringCompare("İ", "i\u0307", "UNICODE", 1);
+    assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0);
+    assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0);
+    // Conditional case mapping
+    assertStringCompare("ς", "σ", "UTF8_BINARY", -1);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "σ", "UNICODE", 1);
+    assertStringCompare("ς", "Σ", "UNICODE", 1);
+    assertStringCompare("σ", "Σ", "UNICODE", -1);
+    assertStringCompare("ς", "σ", "UNICODE_CI", 0);
+    assertStringCompare("ς", "Σ", "UNICODE_CI", 0);
+    assertStringCompare("σ", "Σ", "UNICODE_CI", 0);
+  }
+
+  private void assertLcaseCompare(String target, String expected, String 
collationName)
+      throws SparkException {
+    if (collationName.equals("UTF8_BINARY")) {
+      UTF8String targetUTF8 = UTF8String.fromString(target);
+      UTF8String expectedUTF8 = UTF8String.fromString(expected);
+      assertEquals(expectedUTF8, targetUTF8.toLowerCase());
+    } else if (collationName.equals("UTF8_BINARY_LCASE")) {
+      assertEquals(expected, 
CollationAwareUTF8String.lowerCaseCodePoints(target));
+    } else {
+      int collationId = CollationFactory.collationNameToId(collationName);
+      assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, 
collationId));
+    }
+  }
+
+  @Test
+  public void testLcaseCompare() throws SparkException {

Review Comment:
   I think this test deserves a comment explaining what it tests.
   To be honest, I am a bit confused about what it tests. Looking at the 
implementation of `assertLcaseCompare` I would have expected that it tests the 
`toLowerCase` method of the collation, but then for UTF8_BINARY_LCASE we do not 
use the collation's `toLowerCase`.



##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -147,6 +162,45 @@ public static String toLowerCase(final String target, 
final int collationId) {
     return UCharacter.toLowerCase(locale, target);
   }
 
+  /**
+   * Converts a single code point to lowercase using ICU rules, with special 
handling for
+   * conditional case mappings (i.e. characters that map to multiple 
characters in lowercase).
+   *
+   * @param codePoint The code point to convert to lowercase.
+   * @param sb The StringBuilder to append the lowercase character to.
+   */
+  private static void lowercaseCodePoint(final int codePoint, final 
StringBuilder sb) {
+    // Latin capital letter I with dot above is mapped to 2 lowercase 
characters.
+    if (codePoint == 0x0130) {
+      sb.append("i̇");
+    }
+    // Greek final and non-final capital letter sigma should be mapped the 
same.
+    else if (codePoint == 0x03C2) {
+      sb.append("σ");
+    }
+    // All other characters should follow context-unaware ICU single-code 
point case mapping.
+    else {
+      sb.appendCodePoint(UCharacter.toLowerCase(codePoint));
+    }

Review Comment:
   Let's also check for surrogate pairs and just map them to themselves.



##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -17,15 +17,136 @@
 package org.apache.spark.unsafe.types;
 
 import org.apache.spark.SparkException;
+import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String;
 import org.apache.spark.sql.catalyst.util.CollationFactory;
 import org.apache.spark.sql.catalyst.util.CollationSupport;
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.*;
 
-
+// checkstyle.off: AvoidEscapedUnicodeCharacters
 public class CollationSupportSuite {
 
+  /**
+   * Collation-aware UTF8String comparison.
+   */
+
+  private void assertStringCompare(String s1, String s2, String collationName, 
int expected)
+          throws SparkException {
+    UTF8String l = UTF8String.fromString(s1);
+    UTF8String r = UTF8String.fromString(s2);
+    int compare = 
CollationFactory.fetchCollation(collationName).comparator.compare(l, r);
+    assertEquals(Integer.signum(expected), Integer.signum(compare));
+  }
+
+  @Test
+  public void testCompare() throws SparkException {
+    // Edge cases
+    assertStringCompare("", "", "UTF8_BINARY", 0);
+    assertStringCompare("a", "", "UTF8_BINARY", 1);
+    assertStringCompare("", "a", "UTF8_BINARY", -1);
+    assertStringCompare("", "", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1);
+    assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1);
+    assertStringCompare("", "", "UNICODE", 0);
+    assertStringCompare("a", "", "UNICODE", 1);
+    assertStringCompare("", "a", "UNICODE", -1);
+    assertStringCompare("", "", "UNICODE_CI", 0);
+    assertStringCompare("a", "", "UNICODE_CI", 1);
+    assertStringCompare("", "a", "UNICODE_CI", -1);
+    // Basic tests
+    assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
+    assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("AbcD", "aBCd", "UNICODE", 1);
+    assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0);
+    // Accent variation
+    assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1);
+    assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1);
+    assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
+    // Case-variable character length
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1);
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307", "İ", "UNICODE", -1);
+    assertStringCompare("İ", "i\u0307", "UNICODE", 1);
+    assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0);
+    assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0);
+    // Conditional case mapping
+    assertStringCompare("ς", "σ", "UTF8_BINARY", -1);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "σ", "UNICODE", 1);
+    assertStringCompare("ς", "Σ", "UNICODE", 1);
+    assertStringCompare("σ", "Σ", "UNICODE", -1);
+    assertStringCompare("ς", "σ", "UNICODE_CI", 0);
+    assertStringCompare("ς", "Σ", "UNICODE_CI", 0);
+    assertStringCompare("σ", "Σ", "UNICODE_CI", 0);
+  }
+
+  private void assertLcaseCompare(String target, String expected, String 
collationName)
+      throws SparkException {
+    if (collationName.equals("UTF8_BINARY")) {
+      UTF8String targetUTF8 = UTF8String.fromString(target);
+      UTF8String expectedUTF8 = UTF8String.fromString(expected);
+      assertEquals(expectedUTF8, targetUTF8.toLowerCase());
+    } else if (collationName.equals("UTF8_BINARY_LCASE")) {
+      assertEquals(expected, 
CollationAwareUTF8String.lowerCaseCodePoints(target));
+    } else {
+      int collationId = CollationFactory.collationNameToId(collationName);
+      assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, 
collationId));
+    }
+  }
+
+  @Test
+  public void testLcaseCompare() throws SparkException {
+    // Edge cases
+    assertLcaseCompare("", "", "UTF8_BINARY");
+    assertLcaseCompare("", "", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("", "", "UNICODE");
+    assertLcaseCompare("", "", "UNICODE_CI");
+    // Basic tests
+    assertLcaseCompare("abcd", "abcd", "UTF8_BINARY");
+    assertLcaseCompare("AbCd", "abcd", "UTF8_BINARY");
+    assertLcaseCompare("abcd", "abcd", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("aBcD", "abcd", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("abcd", "abcd", "UNICODE");
+    assertLcaseCompare("aBCd", "abcd", "UNICODE");
+    assertLcaseCompare("abcd", "abcd", "UNICODE_CI");
+    assertLcaseCompare("AbcD", "abcd", "UNICODE_CI");
+    // Accent variation
+    assertLcaseCompare("AbĆd", "abćd", "UTF8_BINARY");
+    assertLcaseCompare("aBcΔ", "abcδ", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("ÄbcD", "äbcd", "UNICODE");
+    assertLcaseCompare("aB́Cd", "ab́cd", "UNICODE_CI");
+    // Case-variable character length
+    assertLcaseCompare("İoDiNe", "i̇odine", "UTF8_BINARY");
+    assertLcaseCompare("Abi̇o12", "abi̇o12", "UTF8_BINARY");
+    assertLcaseCompare("İodInE", "i̇odine", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("aBi̇o12", "abi̇o12", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("İoDinE", "i̇odine", "UNICODE");
+    assertLcaseCompare("abi̇O12", "abi̇o12", "UNICODE");
+    assertLcaseCompare("İodINe", "i̇odine", "UNICODE_CI");
+    assertLcaseCompare("ABi̇o12", "abi̇o12", "UNICODE_CI");
+    // Conditional case mapping
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UTF8_BINARY");
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", "UTF8_BINARY_LCASE"); // != 
UNICODE_CI
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE");
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI");

Review Comment:
   If the goal is to test the `toLowerCase` method of the collation, I would 
suggest using as input string: `A\u030A` which should map to `a\u030A` for 
UTF8_BINARY and UTF8_BINARY_LCASE and to `å` for UNICODE and UNICODE_CI.
   What I am trying to get at is that lowercasing with any of the 4 collations 
in this test should give exactly the same result as long as character 
normalization is not involved. Using 
`CollationAwareUTF8String.lowerCaseCodePoints` in my opinion clouds the intent 
here. If you want to have a test that compares 
`CollationAwareUTF8String.lowerCaseCodePoints` against 
`CollationAwareUTF8String.toLowerCase` for UTF8_BINARY_LCASE, that is a 
test that makes sense, but it has nothing to do with the other collations.



##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -147,6 +162,45 @@ public static String toLowerCase(final String target, 
final int collationId) {
     return UCharacter.toLowerCase(locale, target);
   }
 
+  /**
+   * Converts a single code point to lowercase using ICU rules, with special 
handling for
+   * conditional case mappings (i.e. characters that map to multiple 
characters in lowercase).
+   *
+   * @param codePoint The code point to convert to lowercase.
+   * @param sb The StringBuilder to append the lowercase character to.
+   */
+  private static void lowercaseCodePoint(final int codePoint, final 
StringBuilder sb) {
+    // Latin capital letter I with dot above is mapped to 2 lowercase 
characters.
+    if (codePoint == 0x0130) {
+      sb.append("i̇");
+    }
+    // Greek final and non-final capital letter sigma should be mapped the 
same.
+    else if (codePoint == 0x03C2) {
+      sb.append("σ");
+    }
+    // All other characters should follow context-unaware ICU single-code 
point case mapping.
+    else {
+      sb.appendCodePoint(UCharacter.toLowerCase(codePoint));
+    }

Review Comment:
   For the two special cases it might be better/faster to just append the 
codepoint, just like what we do for the generic case:
   ```java
   if (codePoint == 0x0130) {
     sb.appendCodePoint(0x0069);
     sb.appendCodePoint(0x0307);
   } else if (codePoint == 0x03C2) {
     sb.appendCodePoint(0x03C3);
   } ...
   ```



##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -147,6 +162,45 @@ public static String toLowerCase(final String target, 
final int collationId) {
     return UCharacter.toLowerCase(locale, target);
   }
 
+  /**
+   * Converts a single code point to lowercase using ICU rules, with special 
handling for
+   * conditional case mappings (i.e. characters that map to multiple 
characters in lowercase).

Review Comment:
   ```suggestion
      * one-to-many case mappings (i.e. characters that map to multiple 
characters in lowercase).
   ```
   The term "conditional case mapping" in Unicode jargon means that in order to 
case map a character we need to look at the context of that character inside 
the string it belongs to. Basically, how we case map that character is subject 
to (depends on, or is conditional on) the context in which it appears, i.e. 
the string.



##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -147,6 +162,45 @@ public static String toLowerCase(final String target, 
final int collationId) {
     return UCharacter.toLowerCase(locale, target);
   }
 
+  /**
+   * Converts a single code point to lowercase using ICU rules, with special 
handling for
+   * conditional case mappings (i.e. characters that map to multiple 
characters in lowercase).
+   *
+   * @param codePoint The code point to convert to lowercase.
+   * @param sb The StringBuilder to append the lowercase character to.
+   */
+  private static void lowercaseCodePoint(final int codePoint, final 
StringBuilder sb) {
+    // Latin capital letter I with dot above is mapped to 2 lowercase 
characters.
+    if (codePoint == 0x0130) {
+      sb.append("i̇");
+    }
+    // Greek final and non-final capital letter sigma should be mapped the 
same.
+    else if (codePoint == 0x03C2) {
+      sb.append("σ");
+    }
+    // All other characters should follow context-unaware ICU single-code 
point case mapping.
+    else {
+      sb.appendCodePoint(UCharacter.toLowerCase(codePoint));
+    }
+  }
+
+  /**
+   * Converts an entire string to lowercase using ICU rules, code point by 
code point, with
+   * special handling for conditional case mappings (i.e. characters that map 
to multiple
+   * characters in lowercase). This method omits information about 
context-sensitive case mappings.
+   *
+   * @param target The target string to convert to lowercase.
+   * @return The string converted to lowercase in a context-unaware manner.
+   */
+  public static String lowerCaseCodePoints(final String target) {
+      StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < target.length(); ++i) {
+      int codePoint = target.codePointAt(i);
+      lowercaseCodePoint(codePoint, sb);

Review Comment:
   ```suggestion
         lowercaseCodePoint(target.codePointAt(i), sb);
   ```



##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -147,6 +162,45 @@ public static String toLowerCase(final String target, 
final int collationId) {
     return UCharacter.toLowerCase(locale, target);
   }
 
+  /**
+   * Converts a single code point to lowercase using ICU rules, with special 
handling for
+   * conditional case mappings (i.e. characters that map to multiple 
characters in lowercase).
+   *
+   * @param codePoint The code point to convert to lowercase.
+   * @param sb The StringBuilder to append the lowercase character to.
+   */
+  private static void lowercaseCodePoint(final int codePoint, final 
StringBuilder sb) {
+    // Latin capital letter I with dot above is mapped to 2 lowercase 
characters.
+    if (codePoint == 0x0130) {
+      sb.append("i̇");
+    }
+    // Greek final and non-final capital letter sigma should be mapped the 
same.
+    else if (codePoint == 0x03C2) {
+      sb.append("σ");
+    }
+    // All other characters should follow context-unaware ICU single-code 
point case mapping.
+    else {
+      sb.appendCodePoint(UCharacter.toLowerCase(codePoint));
+    }
+  }
+
+  /**
+   * Converts an entire string to lowercase using ICU rules, code point by 
code point, with
+   * special handling for conditional case mappings (i.e. characters that map 
to multiple
+   * characters in lowercase). This method omits information about 
context-sensitive case mappings.
+   *
+   * @param target The target string to convert to lowercase.
+   * @return The string converted to lowercase in a context-unaware manner.
+   */
+  public static String lowerCaseCodePoints(final String target) {
+      StringBuilder sb = new StringBuilder();

Review Comment:
   ```suggestion
       StringBuilder sb = new StringBuilder();
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]

Reply via email to