(spark) branch master updated: [SPARK-47693][SQL] Add optimization for lowercase comparison of UTF8String used in UTF8_BINARY_LCASE collation

wenchen Wed, 10 Apr 2024 08:25:24 -0700

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 627f6082edca [SPARK-47693][SQL] Add optimization for lowercase 
comparison of UTF8String used in UTF8_BINARY_LCASE collation
627f6082edca is described below

commit 627f6082edca0507439f0c736e179caf55e6a01d
Author: Nikola Mandic <[email protected]>
AuthorDate: Wed Apr 10 23:23:32 2024 +0800

    [SPARK-47693][SQL] Add optimization for lowercase comparison of UTF8String 
used in UTF8_BINARY_LCASE collation
    
    ### What changes were proposed in this pull request?
    
    Current collation 
[benchmarks](https://github.com/apache/spark/blob/e9f204ae93061a862e4da52c128eaf3512a66c7b/sql/core/benchmarks/CollationBenchmark-results.txt)
 indicate that `UTF8_BINARY_LCASE` collation comparisons are an order of 
magnitude slower (~7-10x) than plain binary comparisons. Improve the performance 
by optimizing the lowercase comparison function for `UTF8String` instances 
instead of performing a full lowercase conversion before the binary comparison.
    
    The optimization is based on a similar method used in `toLowerCase`, where 
we check character by character whether the conversion is valid under ASCII and 
fall back to a slow comparison of native strings otherwise. In the latter case, 
we only take into consideration the suffixes that are left to compare.
    
    Benchmarks from `CollationBenchmark` run locally show a substantial 
performance increase:
    ```
    [info] collation unit benchmarks - equalsFunction:  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
    [info] 
--------------------------------------------------------------------------------------------------------------------------
    [info] UTF8_BINARY_LCASE                                    7199           
7209          14          0.0       71988.8       1.0X
    [info] UNICODE                                              3925           
3929           5          0.0       39250.4       1.8X
    [info] UTF8_BINARY                                          3935           
3950          21          0.0       39351.2       1.8X
    [info] UNICODE_CI                                          45248          
51404        8706          0.0      452484.7       0.2X
    ```
    
    ### Why are the changes needed?
    
    To improve the performance of string comparisons under the 
UTF8_BINARY_LCASE collation.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Added unit tests to `UTF8StringSuite`.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #45816 from nikolamand-db/SPARK-47693.
    
    Authored-by: Nikola Mandic <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../spark/sql/catalyst/util/CollationFactory.java  |  2 +-
 .../org/apache/spark/unsafe/types/UTF8String.java  | 96 +++++++++++++---------
 .../apache/spark/unsafe/types/UTF8StringSuite.java | 23 ++++++
 .../CollationBenchmark-jdk21-results.txt           | 30 +++----
 sql/core/benchmarks/CollationBenchmark-results.txt | 30 +++----
 .../CollationNonASCIIBenchmark-jdk21-results.txt   | 27 ++++++
 .../CollationNonASCIIBenchmark-results.txt         | 27 ++++++
 .../execution/benchmark/CollationBenchmark.scala   | 84 +++++++++++++------
 8 files changed, 223 insertions(+), 96 deletions(-)

diff --git 
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
 
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index 119508a37e71..72a6e574707f 100644
--- 
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++ 
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -148,7 +148,7 @@ public final class CollationFactory {
     collationTable[1] = new Collation(
       "UTF8_BINARY_LCASE",
       null,
-      (s1, s2) -> s1.toLowerCase().binaryCompare(s2.toLowerCase()),
+      UTF8String::compareLowerCase,
       "1.0",
       (s) -> (long)s.toLowerCase().hashCode(),
       false,
diff --git 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index c5dfb91f06c6..2006efb07a04 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -424,21 +424,16 @@ public final class UTF8String implements 
Comparable<UTF8String>, Externalizable,
     if (numBytes == 0) {
       return EMPTY_UTF8;
     }
-
-    byte[] bytes = new byte[numBytes];
-    bytes[0] = (byte) Character.toTitleCase(getByte(0));
+    // Optimization - do char level uppercase conversion in case of chars in 
ASCII range
     for (int i = 0; i < numBytes; i++) {
-      byte b = getByte(i);
-      if (numBytesForFirstByte(b) != 1) {
-        // fallback
-        return toUpperCaseSlow();
-      }
-      int upper = Character.toUpperCase(b);
-      if (upper > 127) {
-        // fallback
+      if (getByte(i) < 0) {
+        // non-ASCII
         return toUpperCaseSlow();
       }
-      bytes[i] = (byte) upper;
+    }
+    byte[] bytes = new byte[numBytes];
+    for (int i = 0; i < numBytes; i++) {
+      bytes[i] = (byte) Character.toUpperCase(getByte(i));
     }
     return fromBytes(bytes);
   }
@@ -447,6 +442,34 @@ public final class UTF8String implements 
Comparable<UTF8String>, Externalizable,
     return fromString(toString().toUpperCase());
   }
 
+  /**
+   * Optimized lowercase comparison for UTF8_BINARY_LCASE collation
+   * a.compareLowerCase(b) is equivalent to 
a.toLowerCase().binaryCompare(b.toLowerCase())
+   */
+  public int compareLowerCase(UTF8String other) {
+    int curr;
+    for (curr = 0; curr < numBytes && curr < other.numBytes; curr++) {
+      byte left, right;
+      if ((left = getByte(curr)) < 0 || (right = other.getByte(curr)) < 0) {
+        return compareLowerCaseSuffixSlow(other, curr);
+      }
+      int lowerLeft = Character.toLowerCase(left);
+      int lowerRight = Character.toLowerCase(right);
+      if (lowerLeft != lowerRight) {
+        return lowerLeft - lowerRight;
+      }
+    }
+    return numBytes - other.numBytes;
+  }
+
+  private int compareLowerCaseSuffixSlow(UTF8String other, int pref) {
+    UTF8String suffixLeft = UTF8String.fromAddress(base, offset + pref,
+      numBytes - pref);
+    UTF8String suffixRight = UTF8String.fromAddress(other.base, other.offset + 
pref,
+      other.numBytes - pref);
+    return 
suffixLeft.toLowerCaseSlow().binaryCompare(suffixRight.toLowerCaseSlow());
+  }
+
   /**
    * Returns the lower case of this string
    */
@@ -454,21 +477,16 @@ public final class UTF8String implements 
Comparable<UTF8String>, Externalizable,
     if (numBytes == 0) {
       return EMPTY_UTF8;
     }
-
-    byte[] bytes = new byte[numBytes];
-    bytes[0] = (byte) Character.toTitleCase(getByte(0));
+    // Optimization - do char level lowercase conversion in case of chars in 
ASCII range
     for (int i = 0; i < numBytes; i++) {
-      byte b = getByte(i);
-      if (numBytesForFirstByte(b) != 1) {
-        // fallback
+      if (getByte(i) < 0) {
+        // non-ASCII
         return toLowerCaseSlow();
       }
-      int lower = Character.toLowerCase(b);
-      if (lower > 127) {
-        // fallback
-        return toLowerCaseSlow();
-      }
-      bytes[i] = (byte) lower;
+    }
+    byte[] bytes = new byte[numBytes];
+    for (int i = 0; i < numBytes; i++) {
+      bytes[i] = (byte) Character.toLowerCase(getByte(i));
     }
     return fromBytes(bytes);
   }
@@ -484,24 +502,26 @@ public final class UTF8String implements 
Comparable<UTF8String>, Externalizable,
     if (numBytes == 0) {
       return EMPTY_UTF8;
     }
-
+    // Optimization - in case of ASCII chars we can skip copying the data to 
and from StringBuilder
+    byte prev = ' ', curr;
+    for (int i = 0; i < numBytes; i++) {
+      curr = getByte(i);
+      if (prev == ' ' && curr < 0) {
+        // non-ASCII
+        return toTitleCaseSlow();
+      }
+      prev = curr;
+    }
     byte[] bytes = new byte[numBytes];
+    prev = ' ';
     for (int i = 0; i < numBytes; i++) {
-      byte b = getByte(i);
-      if (i == 0 || getByte(i - 1) == ' ') {
-        if (numBytesForFirstByte(b) != 1) {
-          // fallback
-          return toTitleCaseSlow();
-        }
-        int upper = Character.toTitleCase(b);
-        if (upper > 127) {
-          // fallback
-          return toTitleCaseSlow();
-        }
-        bytes[i] = (byte) upper;
+      curr = getByte(i);
+      if (prev == ' ') {
+        bytes[i] = (byte) Character.toTitleCase(curr);
       } else {
-        bytes[i] = b;
+        bytes[i] = curr;
       }
+      prev = curr;
     }
     return fromBytes(bytes);
   }
diff --git 
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
 
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 594b96944934..934b93c9345b 100644
--- 
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ 
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -107,6 +107,29 @@ public class UTF8StringSuite {
     assertTrue(fromString("你好123").binaryCompare(fromString("你好122")) > 0);
   }
 
+  @Test
+  public void lowercaseComparison() {
+    // SPARK-47693: Test optimized lowercase comparison of UTF8String instances
+    // ASCII
+    assertEquals(fromString("aaa").compareLowerCase(fromString("AAA")), 0);
+    assertTrue(fromString("aaa").compareLowerCase(fromString("AAAA")) < 0);
+    assertTrue(fromString("AAA").compareLowerCase(fromString("aaaa")) < 0);
+    assertTrue(fromString("a").compareLowerCase(fromString("B")) < 0);
+    assertTrue(fromString("b").compareLowerCase(fromString("A")) > 0);
+    assertEquals(fromString("aAa").compareLowerCase(fromString("AaA")), 0);
+    assertTrue(fromString("abcd").compareLowerCase(fromString("abC")) > 0);
+    assertTrue(fromString("ABC").compareLowerCase(fromString("abcd")) < 0);
+    assertEquals(fromString("abcd").compareLowerCase(fromString("abcd")), 0);
+    // non-ASCII
+    assertEquals(fromString("ü").compareLowerCase(fromString("Ü")), 0);
+    assertEquals(fromString("Äü").compareLowerCase(fromString("äÜ")), 0);
+    assertTrue(fromString("a").compareLowerCase(fromString("ä")) < 0);
+    assertTrue(fromString("a").compareLowerCase(fromString("Ä")) < 0);
+    assertTrue(fromString("A").compareLowerCase(fromString("ä")) < 0);
+    assertTrue(fromString("bä").compareLowerCase(fromString("aü")) > 0);
+    assertTrue(fromString("bxxxxxxxxxx").compareLowerCase(fromString("bü")) < 
0);
+  }
+
   protected static void testUpperandLower(String upper, String lower) {
     UTF8String us = fromString(upper);
     UTF8String ls = fromString(lower);
diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt 
b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
index e1d7a42aac61..32cbbc74e911 100644
--- a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
+++ b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
@@ -1,27 +1,27 @@
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
 AMD EPYC 7763 64-Core Processor
 collation unit benchmarks - equalsFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
--------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY_LCASE                                   29904          29937       
   47          0.0      299036.1       1.0X
-UNICODE                                              3886           3893       
   10          0.0       38863.0       7.7X
-UTF8_BINARY                                          3945           3945       
    0          0.0       39449.6       7.6X
-UNICODE_CI                                          45321          45330       
   12          0.0      453210.3       0.7X
+UTF8_BINARY_LCASE                                    6910           6912       
    3          0.0       69099.7       1.0X
+UNICODE                                              4367           4368       
    1          0.0       43669.6       1.6X
+UTF8_BINARY                                          4361           4364       
    4          0.0       43606.5       1.6X
+UNICODE_CI                                          46480          46526       
   66          0.0      464795.7       0.1X
 
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
 AMD EPYC 7763 64-Core Processor
 collation unit benchmarks - compareFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
---------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY_LCASE                                    29807          29818      
    17          0.0      298065.0       1.0X
-UNICODE                                              45704          45723      
    27          0.0      457036.2       0.7X
-UTF8_BINARY                                           6460           6464      
     7          0.0       64597.9       4.6X
-UNICODE_CI                                           45498          45508      
    14          0.0      454977.6       0.7X
+UTF8_BINARY_LCASE                                     6522           6526      
     4          0.0       65223.9       1.0X
+UNICODE                                              45792          45797      
     7          0.0      457922.3       0.1X
+UTF8_BINARY                                           7092           7112      
    29          0.0       70921.7       0.9X
+UNICODE_CI                                           47548          47564      
    22          0.0      475476.7       0.1X
 
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
 AMD EPYC 7763 64-Core Processor
 collation unit benchmarks - hashFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY_LCASE                                 23553          23595         
 59          0.0      235531.8       1.0X
-UNICODE                                          197303         197309         
  8          0.0     1973034.1       0.1X
-UTF8_BINARY                                       14389          14391         
  2          0.0      143891.2       1.6X
-UNICODE_CI                                       166880         166885         
  7          0.0     1668799.5       0.1X
+UTF8_BINARY_LCASE                                 11716          11716         
  1          0.0      117157.9       1.0X
+UNICODE                                          180133         180137         
  5          0.0     1801332.1       0.1X
+UTF8_BINARY                                       10476          10477         
  1          0.0      104757.4       1.1X
+UNICODE_CI                                       148171         148190         
 28          0.0     1481705.6       0.1X
 
diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt 
b/sql/core/benchmarks/CollationBenchmark-results.txt
index d8ebdfa695ff..4028b0f005a3 100644
--- a/sql/core/benchmarks/CollationBenchmark-results.txt
+++ b/sql/core/benchmarks/CollationBenchmark-results.txt
@@ -1,27 +1,27 @@
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
 AMD EPYC 7763 64-Core Processor
 collation unit benchmarks - equalsFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
--------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY_LCASE                                   34122          34152       
   42          0.0      341224.2       1.0X
-UNICODE                                              4520           4522       
    2          0.0       45201.8       7.5X
-UTF8_BINARY                                          4524           4526       
    2          0.0       45243.0       7.5X
-UNICODE_CI                                          52706          52711       
    7          0.0      527056.1       0.6X
+UTF8_BINARY_LCASE                                    7692           7731       
   55          0.0       76919.2       1.0X
+UNICODE                                              4378           4379       
    0          0.0       43784.6       1.8X
+UTF8_BINARY                                          4382           4396       
   19          0.0       43821.6       1.8X
+UNICODE_CI                                          48344          48360       
   23          0.0      483436.5       0.2X
 
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
 AMD EPYC 7763 64-Core Processor
 collation unit benchmarks - compareFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
---------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY_LCASE                                    33467          33474      
    10          0.0      334671.7       1.0X
-UNICODE                                              51168          51168      
     1          0.0      511677.4       0.7X
-UTF8_BINARY                                           5561           5593      
    45          0.0       55610.9       6.0X
-UNICODE_CI                                           51929          51955      
    36          0.0      519291.8       0.6X
+UTF8_BINARY_LCASE                                     9819           9820      
     0          0.0       98194.9       1.0X
+UNICODE                                              49507          49518      
    17          0.0      495066.2       0.2X
+UTF8_BINARY                                           7354           7365      
    17          0.0       73536.3       1.3X
+UNICODE_CI                                           52149          52163      
    20          0.0      521489.4       0.2X
 
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
 AMD EPYC 7763 64-Core Processor
 collation unit benchmarks - hashFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY_LCASE                                 22079          22083         
  5          0.0      220786.7       1.0X
-UNICODE                                          177636         177709         
103          0.0     1776363.9       0.1X
-UTF8_BINARY                                       11954          11956         
  3          0.0      119536.7       1.8X
-UNICODE_CI                                       158014         158038         
 35          0.0     1580135.7       0.1X
+UTF8_BINARY_LCASE                                 18110          18127         
 24          0.0      181103.9       1.0X
+UNICODE                                          171375         171435         
 85          0.0     1713752.3       0.1X
+UTF8_BINARY                                       14012          14030         
 26          0.0      140116.7       1.3X
+UNICODE_CI                                       153847         153901         
 76          0.0     1538471.1       0.1X
 
diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt 
b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt
new file mode 100644
index 000000000000..dc68b747203f
--- /dev/null
+++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt
@@ -0,0 +1,27 @@
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - equalsFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+--------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                   18244          18258       
   20          0.0      456096.4       1.0X
+UNICODE                                               498            498       
    0          0.1       12440.3      36.7X
+UTF8_BINARY                                           499            500       
    1          0.1       12467.7      36.6X
+UNICODE_CI                                          13429          13443       
   19          0.0      335725.4       1.4X
+
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - compareFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+---------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                    18377          18399      
    31          0.0      459430.5       1.0X
+UNICODE                                              14238          14240      
     3          0.0      355957.4       1.3X
+UTF8_BINARY                                            975            976      
     1          0.0       24371.3      18.9X
+UNICODE_CI                                           13819          13826      
    10          0.0      345482.6       1.3X
+
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - hashFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                  9183           9230         
 67          0.0      229564.0       1.0X
+UNICODE                                           38937          38952         
 22          0.0      973421.3       0.2X
+UTF8_BINARY                                        1376           1376         
  0          0.0       34397.5       6.7X
+UNICODE_CI                                        32881          32882         
  1          0.0      822027.4       0.3X
+
diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt 
b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt
new file mode 100644
index 000000000000..bb58968764c7
--- /dev/null
+++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt
@@ -0,0 +1,27 @@
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - equalsFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+--------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                   17881          17885       
    6          0.0      447017.7       1.0X
+UNICODE                                               493            495       
    2          0.1       12328.9      36.3X
+UTF8_BINARY                                           493            494       
    1          0.1       12331.4      36.3X
+UNICODE_CI                                          13731          13737       
    8          0.0      343284.6       1.3X
+
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - compareFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+---------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                    18041          18047      
     8          0.0      451030.2       1.0X
+UNICODE                                              14023          14047      
    34          0.0      350573.9       1.3X
+UTF8_BINARY                                           1387           1397      
    14          0.0       34680.4      13.0X
+UNICODE_CI                                           14232          14242      
    14          0.0      355808.4       1.3X
+
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - hashFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 10494          10499         
  6          0.0      262360.0       1.0X
+UNICODE                                           40410          40422         
 17          0.0     1010261.8       0.3X
+UTF8_BINARY                                        2035           2035         
  1          0.0       50877.8       5.2X
+UNICODE_CI                                        31470          31493         
 32          0.0      786752.4       0.3X
+
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
index 24e61052f561..7a93c7c495e2 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
@@ -22,31 +22,11 @@ import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
 import org.apache.spark.sql.catalyst.util.CollationFactory
 import org.apache.spark.unsafe.types.UTF8String
 
-/**
- * Benchmark to measure performance for comparisons between collated strings. 
To run this benchmark:
- * {{{
- *   1. without sbt:
- *      bin/spark-submit --class <this class>
- *        --jars <spark core test jar>,<spark catalyst test jar> <spark sql 
test jar>
- *   2. build/sbt "sql/Test/runMain 
org.apache.spark.sql.execution.benchmark.CollationBenchmark"
- *   3. generate result:
- *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this 
class>"
- *      Results will be written to "benchmarks/CollationBenchmark-results.txt".
- * }}}
- */
-
-object CollationBenchmark extends BenchmarkBase {
-  private val collationTypes = Seq("UTF8_BINARY_LCASE", "UNICODE", 
"UTF8_BINARY", "UNICODE_CI")
+abstract class CollationBenchmarkBase extends BenchmarkBase {
+  protected val collationTypes: Seq[String] =
+    Seq("UTF8_BINARY_LCASE", "UNICODE", "UTF8_BINARY", "UNICODE_CI")
 
-  def generateSeqInput(n: Long): Seq[UTF8String] = {
-    val input = Seq("ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", 
"def", "def",
-      "GHI", "ghi", "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", 
"VWX", "vwx",
-      "ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", 
"GHI", "ghi",
-      "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", 
"YZ")
-      .map(UTF8String.fromString)
-    val inputLong: Seq[UTF8String] = (0L until n).map(i => input(i.toInt % 
input.size))
-    inputLong
-  }
+  def generateSeqInput(n: Long): Seq[UTF8String]
 
   def benchmarkUTFStringEquals(collationTypes: Seq[String], utf8Strings: 
Seq[UTF8String]): Unit = {
     val sublistStrings = utf8Strings
@@ -54,7 +34,7 @@ object CollationBenchmark extends BenchmarkBase {
     val benchmark = new Benchmark(
       "collation unit benchmarks - equalsFunction",
       utf8Strings.size * 10,
-      warmupTime = 4.seconds,
+      warmupTime = 10.seconds,
       output = output)
     collationTypes.foreach(collationType => {
       val collation = CollationFactory.fetchCollation(collationType)
@@ -77,7 +57,7 @@ object CollationBenchmark extends BenchmarkBase {
     val benchmark = new Benchmark(
       "collation unit benchmarks - compareFunction",
       utf8Strings.size * 10,
-      warmupTime = 4.seconds,
+      warmupTime = 10.seconds,
       output = output)
     collationTypes.foreach(collationType => {
       val collation = CollationFactory.fetchCollation(collationType)
@@ -103,7 +83,7 @@ object CollationBenchmark extends BenchmarkBase {
     val benchmark = new Benchmark(
       "collation unit benchmarks - hashFunction",
       utf8Strings.size * 10,
-      warmupTime = 4.seconds,
+      warmupTime = 10.seconds,
       output = output)
     collationTypes.foreach(collationType => {
       val collation = CollationFactory.fetchCollation(collationType)
@@ -120,6 +100,31 @@ object CollationBenchmark extends BenchmarkBase {
     )
     benchmark.run()
   }
+}
+
+/**
+ * Benchmark to measure performance for comparisons between collated strings. 
To run this benchmark:
+ * {{{
+ *   1. without sbt:
+ *      bin/spark-submit --class <this class>
+ *        --jars <spark core test jar>,<spark catalyst test jar> <spark sql 
test jar>
+ *   2. build/sbt "sql/Test/runMain 
org.apache.spark.sql.execution.benchmark.CollationBenchmark"
+ *   3. generate result:
+ *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this 
class>"
+ *      Results will be written to "benchmarks/CollationBenchmark-results.txt".
+ * }}}
+ */
+object CollationBenchmark extends CollationBenchmarkBase {
+
+  override def generateSeqInput(n: Long): Seq[UTF8String] = {
+    val input = Seq("ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", 
"def", "def",
+      "GHI", "ghi", "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", 
"VWX", "vwx",
+      "ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", 
"GHI", "ghi",
+      "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", 
"YZ")
+      .map(UTF8String.fromString)
+    val inputLong: Seq[UTF8String] = (0L until n).map(i => input(i.toInt % 
input.size))
+    inputLong
+  }
 
   override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
     benchmarkUTFStringEquals(collationTypes, generateSeqInput(10000L))
@@ -127,3 +132,28 @@ object CollationBenchmark extends BenchmarkBase {
     benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(10000L))
   }
 }
+
+/**
+ * Measure performance of collation comparisons of non-ASCII strings.
+ */
+object CollationNonASCIIBenchmark extends CollationBenchmarkBase {
+
+  override def generateSeqInput(n: Long): Seq[UTF8String] = {
+    // scalastyle:off nonascii
+    val inputSet = Seq("A", "a", "Ä", "ä")
+    // lowercase and uppercase plain and umlaut A combinations of 3 letters 
(AAA, aäA, ...)
+    val input = (for {
+      x <- inputSet
+      y <- inputSet
+      z <- inputSet } yield x + y + z).map(UTF8String.fromString)
+    val inputLong: Seq[UTF8String] = (0L until n).map(i => input(i.toInt % 
input.size))
+    inputLong
+    // scalastyle:on nonascii
+  }
+
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    benchmarkUTFStringEquals(collationTypes, generateSeqInput(4000L))
+    benchmarkUTFStringCompare(collationTypes, generateSeqInput(4000L))
+    benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(4000L))
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-47693][SQL] Add optimization for lowercase comparison of UTF8String used in UTF8_BINARY_LCASE collation

Reply via email to