This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new a7dc020bc0e5 [SPARK-48681][SQL] Use ICU in Lower/Upper expressions for
UTF8_BINARY strings
a7dc020bc0e5 is described below
commit a7dc020bc0e53a16446ca3eda54c77e374c3b23e
Author: Uros Bojanic <[email protected]>
AuthorDate: Mon Jun 24 16:20:03 2024 +0800
[SPARK-48681][SQL] Use ICU in Lower/Upper expressions for UTF8_BINARY
strings
### What changes were proposed in this pull request?
Update `Lower` & `Upper` Spark expressions to use ICU case mappings for
UTF8_BINARY collation, instead of the currently used JVM case mappings. This
behaviour is put under the `ICU_CASE_MAPPINGS_ENABLED` flag in SQLConf, which
is `true` by default.
### Why are the changes needed?
To keep the consistency between collations - all collations shouls use
ICU-based case mappings, including the UTF8_BINARY collation.
### Does this PR introduce _any_ user-facing change?
Yes, the behaviour of `lower` & `upper` string functions for UTF8_BINARY
will now rely on ICU-based case mappings. However, by turning the
`ICU_CASE_MAPPINGS_ENABLED` flag off, users can get the old JVM-based case
mappings. Note that the difference between the two is really subtle.
### How was this patch tested?
Existing tests, with extended `CollationSupport` unit tests for Lower/Upper
to verify both ICU and JVM behaviour.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #47043 from uros-db/change-lower-upper.
Authored-by: Uros Bojanic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../spark/sql/catalyst/util/CollationSupport.java | 26 ++++++++++++++--------
.../spark/unsafe/types/CollationSupportSuite.java | 12 ++++++++--
.../catalyst/expressions/stringExpressions.scala | 16 +++++++++----
.../org/apache/spark/sql/internal/SQLConf.scala | 8 +++++++
4 files changed, 47 insertions(+), 15 deletions(-)
diff --git
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index 5eebec7f1301..a5bb1fe715bb 100644
---
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -206,21 +206,22 @@ public final class CollationSupport {
}
public static class Upper {
- public static UTF8String exec(final UTF8String v, final int collationId) {
+ public static UTF8String exec(final UTF8String v, final int collationId,
boolean useICU) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality) {
- return execBinary(v);
+ return useICU ? execBinaryICU(v) : execBinary(v);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
- public static String genCode(final String v, final int collationId) {
+ public static String genCode(final String v, final int collationId,
boolean useICU) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Upper.exec";
if (collation.supportsBinaryEquality) {
- return String.format(expr + "Binary(%s)", v);
+ String funcName = useICU ? "BinaryICU" : "Binary";
+ return String.format(expr + "%s(%s)", funcName, v);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s)", v);
} else {
@@ -230,6 +231,9 @@ public final class CollationSupport {
public static UTF8String execBinary(final UTF8String v) {
return v.toUpperCase();
}
+ public static UTF8String execBinaryICU(final UTF8String v) {
+ return CollationAwareUTF8String.toUpperCase(v);
+ }
public static UTF8String execLowercase(final UTF8String v) {
return CollationAwareUTF8String.toUpperCase(v);
}
@@ -239,21 +243,22 @@ public final class CollationSupport {
}
public static class Lower {
- public static UTF8String exec(final UTF8String v, final int collationId) {
+ public static UTF8String exec(final UTF8String v, final int collationId,
boolean useICU) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality) {
- return execBinary(v);
+ return useICU ? execBinaryICU(v) : execBinary(v);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
- public static String genCode(final String v, final int collationId) {
+ public static String genCode(final String v, final int collationId,
boolean useICU) {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
- String expr = "CollationSupport.Lower.exec";
+ String expr = "CollationSupport.Lower.exec";
if (collation.supportsBinaryEquality) {
- return String.format(expr + "Binary(%s)", v);
+ String funcName = useICU ? "BinaryICU" : "Binary";
+ return String.format(expr + "%s(%s)", funcName, v);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s)", v);
} else {
@@ -263,6 +268,9 @@ public final class CollationSupport {
public static UTF8String execBinary(final UTF8String v) {
return v.toLowerCase();
}
+ public static UTF8String execBinaryICU(final UTF8String v) {
+ return CollationAwareUTF8String.toLowerCase(v);
+ }
public static UTF8String execLowercase(final UTF8String v) {
return CollationAwareUTF8String.toLowerCase(v);
}
diff --git
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 58826005fc46..99f35ef81dc6 100644
---
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -606,7 +606,11 @@ public class CollationSupportSuite {
UTF8String target_utf8 = UTF8String.fromString(target);
UTF8String expected_utf8 = UTF8String.fromString(expected);
int collationId = CollationFactory.collationNameToId(collationName);
- assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8,
collationId));
+ // Testing the new ICU-based implementation of the Upper function.
+ assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8,
collationId, true));
+ // Testing the old JVM-based implementation of the Upper function.
+ assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8,
collationId, false));
+ // Note: results should be the same in these tests for both ICU and
JVM-based implementations.
}
@Test
@@ -660,7 +664,11 @@ public class CollationSupportSuite {
UTF8String target_utf8 = UTF8String.fromString(target);
UTF8String expected_utf8 = UTF8String.fromString(expected);
int collationId = CollationFactory.collationNameToId(collationName);
- assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8,
collationId));
+ // Testing the new ICU-based implementation of the Lower function.
+ assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8,
collationId, true));
+ // Testing the old JVM-based implementation of the Lower function.
+ assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8,
collationId, false));
+ // Note: results should be the same in these tests for both ICU and
JVM-based implementations.
}
@Test
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index ac23962f41ed..e0a9d6f77edd 100755
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -455,12 +455,16 @@ case class Upper(child: Expression)
final lazy val collationId: Int =
child.dataType.asInstanceOf[StringType].collationId
- override def convert(v: UTF8String): UTF8String =
CollationSupport.Upper.exec(v, collationId)
+ // Flag to indicate whether to use ICU instead of JVM case mappings for
UTF8_BINARY collation.
+ private final lazy val useICU =
SQLConf.get.getConf(SQLConf.ICU_CASE_MAPPINGS_ENABLED)
+
+ override def convert(v: UTF8String): UTF8String =
+ CollationSupport.Upper.exec(v, collationId, useICU)
final override val nodePatterns: Seq[TreePattern] = Seq(UPPER_OR_LOWER)
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
- defineCodeGen(ctx, ev, c => CollationSupport.Upper.genCode(c, collationId))
+ defineCodeGen(ctx, ev, c => CollationSupport.Upper.genCode(c, collationId,
useICU))
}
override protected def withNewChildInternal(newChild: Expression): Upper =
copy(child = newChild)
@@ -483,12 +487,16 @@ case class Lower(child: Expression)
final lazy val collationId: Int =
child.dataType.asInstanceOf[StringType].collationId
- override def convert(v: UTF8String): UTF8String =
CollationSupport.Lower.exec(v, collationId)
+ // Flag to indicate whether to use ICU instead of JVM case mappings for
UTF8_BINARY collation.
+ private final lazy val useICU =
SQLConf.get.getConf(SQLConf.ICU_CASE_MAPPINGS_ENABLED)
+
+ override def convert(v: UTF8String): UTF8String =
+ CollationSupport.Lower.exec(v, collationId, useICU)
final override val nodePatterns: Seq[TreePattern] = Seq(UPPER_OR_LOWER)
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
- defineCodeGen(ctx, ev, c => CollationSupport.Lower.genCode(c, collationId))
+ defineCodeGen(ctx, ev, c => CollationSupport.Lower.genCode(c, collationId,
useICU))
}
override def prettyName: String =
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index fd804bc0e986..799e54aaecea 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -785,6 +785,14 @@ object SQLConf {
_ => Map())
.createWithDefault("UTF8_BINARY")
+ val ICU_CASE_MAPPINGS_ENABLED =
+ buildConf("spark.sql.icu.caseMappings.enabled")
+ .doc("When enabled we use the ICU library (instead of the JVM) to
implement case mappings" +
+ " for strings under UTF8_BINARY collation.")
+ .version("4.0.0")
+ .booleanConf
+ .createWithDefault(true)
+
val FETCH_SHUFFLE_BLOCKS_IN_BATCH =
buildConf("spark.sql.adaptive.fetchShuffleBlocksInBatch")
.internal()
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]