(spark) branch master updated: [SPARK-48682][SQL] Use ICU in InitCap expression for UTF8_BINARY strings

wenchen Thu, 27 Jun 2024 18:04:49 -0700

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 2c9eb1c1f00f [SPARK-48682][SQL] Use ICU in InitCap expression for 
UTF8_BINARY strings
2c9eb1c1f00f is described below

commit 2c9eb1c1f00f0d8f9434bfc62faa70f85dfa2a9c
Author: Uros Bojanic <[email protected]>
AuthorDate: Fri Jun 28 09:04:30 2024 +0800

    [SPARK-48682][SQL] Use ICU in InitCap expression for UTF8_BINARY strings
    
    ### What changes were proposed in this pull request?
    Update `InitCap` Spark expressions to use ICU case mappings for UTF8_BINARY 
collation, instead of the currently used JVM case mappings. This behaviour is 
put under the `ICU_CASE_MAPPINGS_ENABLED` flag in SQLConf, which is true by 
default. Note: the same flag is used for `Lower` & `Upper` expressions, with 
changes introduced in: https://github.com/apache/spark/pull/47043.
    
    ### Why are the changes needed?
    To keep the consistency between collations - all collations shouls use 
ICU-based case mappings, including the UTF8_BINARY collation.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, the behaviour of `initcap` string function for UTF8_BINARY will now 
rely on ICU-based case mappings. However, by turning the 
`ICU_CASE_MAPPINGS_ENABLED` flag off, users can get the old JVM-based case 
mappings. Note that the difference between the two is really subtle.
    
    ### How was this patch tested?
    Existing tests, with extended `CollationSupport` unit tests for InitCap to 
verify both ICU and JVM behaviour.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #47100 from uros-db/change-initcap.
    
    Authored-by: Uros Bojanic <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../spark/sql/catalyst/util/CollationSupport.java  | 12 +++--
 .../org/apache/spark/unsafe/types/UTF8String.java  | 57 +++++++++++++++++-----
 .../spark/unsafe/types/CollationSupportSuite.java  |  6 ++-
 .../catalyst/expressions/stringExpressions.scala   |  7 ++-
 4 files changed, 64 insertions(+), 18 deletions(-)

diff --git 
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
 
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index a5bb1fe715bb..0f10955c986a 100644
--- 
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ 
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -280,10 +280,10 @@ public final class CollationSupport {
   }
 
   public static class InitCap {
-    public static UTF8String exec(final UTF8String v, final int collationId) {
+    public static UTF8String exec(final UTF8String v, final int collationId, 
boolean useICU) {
       CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
       if (collation.supportsBinaryEquality) {
-        return execBinary(v);
+        return useICU ? execBinaryICU(v) : execBinary(v);
       } else if (collation.supportsLowercaseEquality) {
         return execLowercase(v);
       } else {
@@ -291,11 +291,12 @@ public final class CollationSupport {
       }
     }
 
-    public static String genCode(final String v, final int collationId) {
+    public static String genCode(final String v, final int collationId, 
boolean useICU) {
       CollationFactory.Collation collation = 
CollationFactory.fetchCollation(collationId);
       String expr = "CollationSupport.InitCap.exec";
       if (collation.supportsBinaryEquality) {
-        return String.format(expr + "Binary(%s)", v);
+        String funcName = useICU ? "BinaryICU" : "Binary";
+        return String.format(expr + "%s(%s)", funcName, v);
       } else if (collation.supportsLowercaseEquality) {
         return String.format(expr + "Lowercase(%s)", v);
       } else {
@@ -305,6 +306,9 @@ public final class CollationSupport {
     public static UTF8String execBinary(final UTF8String v) {
       return v.toLowerCase().toTitleCase();
     }
+    public static UTF8String execBinaryICU(final UTF8String v) {
+      return CollationAwareUTF8String.toLowerCase(v).toTitleCaseICU();
+    }
     public static UTF8String execLowercase(final UTF8String v) {
       return CollationAwareUTF8String.toTitleCase(v);
     }
diff --git 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 49d3088f8a2f..38b9b803acbe 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -31,6 +31,7 @@ import com.esotericsoftware.kryo.Kryo;
 import com.esotericsoftware.kryo.KryoSerializable;
 import com.esotericsoftware.kryo.io.Input;
 import com.esotericsoftware.kryo.io.Output;
+import com.ibm.icu.lang.UCharacter;
 
 import org.apache.spark.sql.catalyst.util.CollationFactory;
 import org.apache.spark.unsafe.Platform;
@@ -558,24 +559,35 @@ public final class UTF8String implements 
Comparable<UTF8String>, Externalizable,
   }
 
   /**
-   * Returns the title case of this string, that could be used as title.
+   * Returns the title case of this string, that could be used as title. There 
are essentially two
+   * different version of this method - one using the JVM case mapping rules, 
and the other using
+   * the ICU case mapping rules. ASCII implementation is the same for both, 
but please refer to the
+   * respective methods for the slow (non-ASCII) implementation for more 
details on the differences.
    */
   public UTF8String toTitleCase() {
     if (numBytes == 0) {
       return EMPTY_UTF8;
     }
-    // Optimization - in case of ASCII chars we can skip copying the data to 
and from StringBuilder
-    byte prev = ' ', curr;
-    for (int i = 0; i < numBytes; i++) {
-      curr = getByte(i);
-      if (prev == ' ' && curr < 0) {
-        // non-ASCII
-        return toTitleCaseSlow();
-      }
-      prev = curr;
+
+    return isFullAscii() ? toTitleCaseAscii() : toTitleCaseSlow();
+  }
+
+  public UTF8String toTitleCaseICU() {
+    if (numBytes == 0) {
+      return EMPTY_UTF8;
     }
+
+    return isFullAscii() ? toTitleCaseAscii() : toTitleCaseSlowICU();
+  }
+
+  /*
+   * Fast path to return the title case of this string, given that all 
characters are ASCII.
+   * This implementation essentially works for all collations currently 
supported in Spark.
+   * This method is more efficient, because it skips copying the data to and 
from StringBuilder.
+   */
+  private UTF8String toTitleCaseAscii() {
     byte[] bytes = new byte[numBytes];
-    prev = ' ';
+    byte prev = ' ', curr;
     for (int i = 0; i < numBytes; i++) {
       curr = getByte(i);
       if (prev == ' ') {
@@ -588,6 +600,11 @@ public final class UTF8String implements 
Comparable<UTF8String>, Externalizable,
     return fromBytes(bytes);
   }
 
+  /*
+   * Slow path to return the title case of this string, according to JVM case 
mapping rules.
+   * This is considered the "old" behaviour for UTF8_BINARY collation, and is 
not recommended.
+   * To use this, set the spark.sql.ICU_CASE_MAPPINGS_ENABLED configuration to 
`false`.
+   */
   private UTF8String toTitleCaseSlow() {
     StringBuilder sb = new StringBuilder();
     String s = toString();
@@ -601,6 +618,24 @@ public final class UTF8String implements 
Comparable<UTF8String>, Externalizable,
     return fromString(sb.toString());
   }
 
+  /*
+   * Slow path to return the title case of this string, according to ICU case 
mapping rules.
+   * This is considered the "new" behaviour for UTF8_BINARY collation, and is 
recommended.
+   * This is used by default, since spark.sql.ICU_CASE_MAPPINGS_ENABLED is set 
to `true`.
+   */
+  private UTF8String toTitleCaseSlowICU() {
+    StringBuilder sb = new StringBuilder();
+    String s = toString();
+    sb.append(s);
+    sb.setCharAt(0, (char) UCharacter.toTitleCase(sb.charAt(0)));
+    for (int i = 1; i < s.length(); i++) {
+      if (sb.charAt(i - 1) == ' ') {
+        sb.setCharAt(i, (char) UCharacter.toTitleCase(sb.charAt(i)));
+      }
+    }
+    return fromString(sb.toString());
+  }
+
   /*
    * Returns the index of the string `match` in this String. This string has 
to be a comma separated
    * list. If `match` contains a comma 0 will be returned. If the `match` 
isn't part of this String,
diff --git 
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
 
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 9602c83c6c80..d027f67c08ff 100644
--- 
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ 
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -731,7 +731,11 @@ public class CollationSupportSuite {
     UTF8String target_utf8 = UTF8String.fromString(target);
     UTF8String expected_utf8 = UTF8String.fromString(expected);
     int collationId = CollationFactory.collationNameToId(collationName);
-    assertEquals(expected_utf8, CollationSupport.InitCap.exec(target_utf8, 
collationId));
+    // Testing the new ICU-based implementation of the Lower function.
+    assertEquals(expected_utf8, CollationSupport.InitCap.exec(target_utf8, 
collationId, true));
+    // Testing the old JVM-based implementation of the Lower function.
+    assertEquals(expected_utf8, CollationSupport.InitCap.exec(target_utf8, 
collationId, false));
+    // Note: results should be the same in these tests for both ICU and 
JVM-based implementations.
   }
 
   @Test
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index e951d40d4d46..a0c796274f76 100755
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2048,14 +2048,17 @@ case class InitCap(child: Expression)
 
   final lazy val collationId: Int = 
child.dataType.asInstanceOf[StringType].collationId
 
+  // Flag to indicate whether to use ICU instead of JVM case mappings for 
UTF8_BINARY collation.
+  private final lazy val useICU = 
SQLConf.get.getConf(SQLConf.ICU_CASE_MAPPINGS_ENABLED)
+
   override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation)
   override def dataType: DataType = child.dataType
 
   override def nullSafeEval(string: Any): Any = {
-    CollationSupport.InitCap.exec(string.asInstanceOf[UTF8String], collationId)
+    CollationSupport.InitCap.exec(string.asInstanceOf[UTF8String], 
collationId, useICU)
   }
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-    defineCodeGen(ctx, ev, str => CollationSupport.InitCap.genCode(str, 
collationId))
+    defineCodeGen(ctx, ev, str => CollationSupport.InitCap.genCode(str, 
collationId, useICU))
   }
 
   override protected def withNewChildInternal(newChild: Expression): InitCap =


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-48682][SQL] Use ICU in InitCap expression for UTF8_BINARY strings

Reply via email to