This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 627f6082edca [SPARK-47693][SQL] Add optimization for lowercase
comparison of UTF8String used in UTF8_BINARY_LCASE collation
627f6082edca is described below
commit 627f6082edca0507439f0c736e179caf55e6a01d
Author: Nikola Mandic <[email protected]>
AuthorDate: Wed Apr 10 23:23:32 2024 +0800
[SPARK-47693][SQL] Add optimization for lowercase comparison of UTF8String
used in UTF8_BINARY_LCASE collation
### What changes were proposed in this pull request?
Current collation
[benchmarks](https://github.com/apache/spark/blob/e9f204ae93061a862e4da52c128eaf3512a66c7b/sql/core/benchmarks/CollationBenchmark-results.txt)
indicate that `UTF8_BINARY_LCASE` collation comparisons are an order of magnitude
slower (~7-10x) than plain binary comparisons. Improve the performance by
adding an optimized lowercase comparison function for `UTF8String` instances instead of
performing a full lowercase conversion before the binary comparison.
The optimization is based on a similar method used in `toLowerCase`, where we
check character by character whether the conversion is valid under ASCII and fall back to
a slow comparison of native strings otherwise. In the latter case, we only take into
consideration the suffixes that are left to compare.
Benchmarks from `CollationBenchmark` run locally show a substantial
performance increase:
```
[info] collation unit benchmarks - equalsFunction: Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info]
--------------------------------------------------------------------------------------------------------------------------
[info] UTF8_BINARY_LCASE 7199
7209 14 0.0 71988.8 1.0X
[info] UNICODE 3925
3929 5 0.0 39250.4 1.8X
[info] UTF8_BINARY 3935
3950 21 0.0 39351.2 1.8X
[info] UNICODE_CI 45248
51404 8706 0.0 452484.7 0.2X
```
### Why are the changes needed?
To improve the performance of string comparisons under the UTF8_BINARY_LCASE
collation.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Added unit tests to `UTF8StringSuite`.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #45816 from nikolamand-db/SPARK-47693.
Authored-by: Nikola Mandic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../spark/sql/catalyst/util/CollationFactory.java | 2 +-
.../org/apache/spark/unsafe/types/UTF8String.java | 96 +++++++++++++---------
.../apache/spark/unsafe/types/UTF8StringSuite.java | 23 ++++++
.../CollationBenchmark-jdk21-results.txt | 30 +++----
sql/core/benchmarks/CollationBenchmark-results.txt | 30 +++----
.../CollationNonASCIIBenchmark-jdk21-results.txt | 27 ++++++
.../CollationNonASCIIBenchmark-results.txt | 27 ++++++
.../execution/benchmark/CollationBenchmark.scala | 84 +++++++++++++------
8 files changed, 223 insertions(+), 96 deletions(-)
diff --git
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index 119508a37e71..72a6e574707f 100644
---
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -148,7 +148,7 @@ public final class CollationFactory {
collationTable[1] = new Collation(
"UTF8_BINARY_LCASE",
null,
- (s1, s2) -> s1.toLowerCase().binaryCompare(s2.toLowerCase()),
+ UTF8String::compareLowerCase,
"1.0",
(s) -> (long)s.toLowerCase().hashCode(),
false,
diff --git
a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index c5dfb91f06c6..2006efb07a04 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -424,21 +424,16 @@ public final class UTF8String implements
Comparable<UTF8String>, Externalizable,
if (numBytes == 0) {
return EMPTY_UTF8;
}
-
- byte[] bytes = new byte[numBytes];
- bytes[0] = (byte) Character.toTitleCase(getByte(0));
+ // Optimization - do char level uppercase conversion in case of chars in
ASCII range
for (int i = 0; i < numBytes; i++) {
- byte b = getByte(i);
- if (numBytesForFirstByte(b) != 1) {
- // fallback
- return toUpperCaseSlow();
- }
- int upper = Character.toUpperCase(b);
- if (upper > 127) {
- // fallback
+ if (getByte(i) < 0) {
+ // non-ASCII
return toUpperCaseSlow();
}
- bytes[i] = (byte) upper;
+ }
+ byte[] bytes = new byte[numBytes];
+ for (int i = 0; i < numBytes; i++) {
+ bytes[i] = (byte) Character.toUpperCase(getByte(i));
}
return fromBytes(bytes);
}
@@ -447,6 +442,34 @@ public final class UTF8String implements
Comparable<UTF8String>, Externalizable,
return fromString(toString().toUpperCase());
}
+ /**
+ * Optimized lowercase comparison for UTF8_BINARY_LCASE collation
+ * a.compareLowerCase(b) is equivalent to
a.toLowerCase().binaryCompare(b.toLowerCase())
+ */
+ public int compareLowerCase(UTF8String other) {
+ int curr;
+ for (curr = 0; curr < numBytes && curr < other.numBytes; curr++) {
+ byte left, right;
+ if ((left = getByte(curr)) < 0 || (right = other.getByte(curr)) < 0) {
+ return compareLowerCaseSuffixSlow(other, curr);
+ }
+ int lowerLeft = Character.toLowerCase(left);
+ int lowerRight = Character.toLowerCase(right);
+ if (lowerLeft != lowerRight) {
+ return lowerLeft - lowerRight;
+ }
+ }
+ return numBytes - other.numBytes;
+ }
+
+ private int compareLowerCaseSuffixSlow(UTF8String other, int pref) {
+ UTF8String suffixLeft = UTF8String.fromAddress(base, offset + pref,
+ numBytes - pref);
+ UTF8String suffixRight = UTF8String.fromAddress(other.base, other.offset +
pref,
+ other.numBytes - pref);
+ return
suffixLeft.toLowerCaseSlow().binaryCompare(suffixRight.toLowerCaseSlow());
+ }
+
/**
* Returns the lower case of this string
*/
@@ -454,21 +477,16 @@ public final class UTF8String implements
Comparable<UTF8String>, Externalizable,
if (numBytes == 0) {
return EMPTY_UTF8;
}
-
- byte[] bytes = new byte[numBytes];
- bytes[0] = (byte) Character.toTitleCase(getByte(0));
+ // Optimization - do char level lowercase conversion in case of chars in
ASCII range
for (int i = 0; i < numBytes; i++) {
- byte b = getByte(i);
- if (numBytesForFirstByte(b) != 1) {
- // fallback
+ if (getByte(i) < 0) {
+ // non-ASCII
return toLowerCaseSlow();
}
- int lower = Character.toLowerCase(b);
- if (lower > 127) {
- // fallback
- return toLowerCaseSlow();
- }
- bytes[i] = (byte) lower;
+ }
+ byte[] bytes = new byte[numBytes];
+ for (int i = 0; i < numBytes; i++) {
+ bytes[i] = (byte) Character.toLowerCase(getByte(i));
}
return fromBytes(bytes);
}
@@ -484,24 +502,26 @@ public final class UTF8String implements
Comparable<UTF8String>, Externalizable,
if (numBytes == 0) {
return EMPTY_UTF8;
}
-
+ // Optimization - in case of ASCII chars we can skip copying the data to
and from StringBuilder
+ byte prev = ' ', curr;
+ for (int i = 0; i < numBytes; i++) {
+ curr = getByte(i);
+ if (prev == ' ' && curr < 0) {
+ // non-ASCII
+ return toTitleCaseSlow();
+ }
+ prev = curr;
+ }
byte[] bytes = new byte[numBytes];
+ prev = ' ';
for (int i = 0; i < numBytes; i++) {
- byte b = getByte(i);
- if (i == 0 || getByte(i - 1) == ' ') {
- if (numBytesForFirstByte(b) != 1) {
- // fallback
- return toTitleCaseSlow();
- }
- int upper = Character.toTitleCase(b);
- if (upper > 127) {
- // fallback
- return toTitleCaseSlow();
- }
- bytes[i] = (byte) upper;
+ curr = getByte(i);
+ if (prev == ' ') {
+ bytes[i] = (byte) Character.toTitleCase(curr);
} else {
- bytes[i] = b;
+ bytes[i] = curr;
}
+ prev = curr;
}
return fromBytes(bytes);
}
diff --git
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 594b96944934..934b93c9345b 100644
---
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -107,6 +107,29 @@ public class UTF8StringSuite {
assertTrue(fromString("你好123").binaryCompare(fromString("你好122")) > 0);
}
+ @Test
+ public void lowercaseComparison() {
+ // SPARK-47693: Test optimized lowercase comparison of UTF8String instances
+ // ASCII
+ assertEquals(fromString("aaa").compareLowerCase(fromString("AAA")), 0);
+ assertTrue(fromString("aaa").compareLowerCase(fromString("AAAA")) < 0);
+ assertTrue(fromString("AAA").compareLowerCase(fromString("aaaa")) < 0);
+ assertTrue(fromString("a").compareLowerCase(fromString("B")) < 0);
+ assertTrue(fromString("b").compareLowerCase(fromString("A")) > 0);
+ assertEquals(fromString("aAa").compareLowerCase(fromString("AaA")), 0);
+ assertTrue(fromString("abcd").compareLowerCase(fromString("abC")) > 0);
+ assertTrue(fromString("ABC").compareLowerCase(fromString("abcd")) < 0);
+ assertEquals(fromString("abcd").compareLowerCase(fromString("abcd")), 0);
+ // non-ASCII
+ assertEquals(fromString("ü").compareLowerCase(fromString("Ü")), 0);
+ assertEquals(fromString("Äü").compareLowerCase(fromString("äÜ")), 0);
+ assertTrue(fromString("a").compareLowerCase(fromString("ä")) < 0);
+ assertTrue(fromString("a").compareLowerCase(fromString("Ä")) < 0);
+ assertTrue(fromString("A").compareLowerCase(fromString("ä")) < 0);
+ assertTrue(fromString("bä").compareLowerCase(fromString("aü")) > 0);
+ assertTrue(fromString("bxxxxxxxxxx").compareLowerCase(fromString("bü")) <
0);
+ }
+
protected static void testUpperandLower(String upper, String lower) {
UTF8String us = fromString(upper);
UTF8String ls = fromString(lower);
diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
index e1d7a42aac61..32cbbc74e911 100644
--- a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
+++ b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
@@ -1,27 +1,27 @@
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY_LCASE 29904 29937
47 0.0 299036.1 1.0X
-UNICODE 3886 3893
10 0.0 38863.0 7.7X
-UTF8_BINARY 3945 3945
0 0.0 39449.6 7.6X
-UNICODE_CI 45321 45330
12 0.0 453210.3 0.7X
+UTF8_BINARY_LCASE 6910 6912
3 0.0 69099.7 1.0X
+UNICODE 4367 4368
1 0.0 43669.6 1.6X
+UTF8_BINARY 4361 4364
4 0.0 43606.5 1.6X
+UNICODE_CI 46480 46526
66 0.0 464795.7 0.1X
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY_LCASE 29807 29818
17 0.0 298065.0 1.0X
-UNICODE 45704 45723
27 0.0 457036.2 0.7X
-UTF8_BINARY 6460 6464
7 0.0 64597.9 4.6X
-UNICODE_CI 45498 45508
14 0.0 454977.6 0.7X
+UTF8_BINARY_LCASE 6522 6526
4 0.0 65223.9 1.0X
+UNICODE 45792 45797
7 0.0 457922.3 0.1X
+UTF8_BINARY 7092 7112
29 0.0 70921.7 0.9X
+UNICODE_CI 47548 47564
22 0.0 475476.7 0.1X
-OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY_LCASE 23553 23595
59 0.0 235531.8 1.0X
-UNICODE 197303 197309
8 0.0 1973034.1 0.1X
-UTF8_BINARY 14389 14391
2 0.0 143891.2 1.6X
-UNICODE_CI 166880 166885
7 0.0 1668799.5 0.1X
+UTF8_BINARY_LCASE 11716 11716
1 0.0 117157.9 1.0X
+UNICODE 180133 180137
5 0.0 1801332.1 0.1X
+UTF8_BINARY 10476 10477
1 0.0 104757.4 1.1X
+UNICODE_CI 148171 148190
28 0.0 1481705.6 0.1X
diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt
b/sql/core/benchmarks/CollationBenchmark-results.txt
index d8ebdfa695ff..4028b0f005a3 100644
--- a/sql/core/benchmarks/CollationBenchmark-results.txt
+++ b/sql/core/benchmarks/CollationBenchmark-results.txt
@@ -1,27 +1,27 @@
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
--------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY_LCASE 34122 34152
42 0.0 341224.2 1.0X
-UNICODE 4520 4522
2 0.0 45201.8 7.5X
-UTF8_BINARY 4524 4526
2 0.0 45243.0 7.5X
-UNICODE_CI 52706 52711
7 0.0 527056.1 0.6X
+UTF8_BINARY_LCASE 7692 7731
55 0.0 76919.2 1.0X
+UNICODE 4378 4379
0 0.0 43784.6 1.8X
+UTF8_BINARY 4382 4396
19 0.0 43821.6 1.8X
+UNICODE_CI 48344 48360
23 0.0 483436.5 0.2X
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY_LCASE 33467 33474
10 0.0 334671.7 1.0X
-UNICODE 51168 51168
1 0.0 511677.4 0.7X
-UTF8_BINARY 5561 5593
45 0.0 55610.9 6.0X
-UNICODE_CI 51929 51955
36 0.0 519291.8 0.6X
+UTF8_BINARY_LCASE 9819 9820
0 0.0 98194.9 1.0X
+UNICODE 49507 49518
17 0.0 495066.2 0.2X
+UTF8_BINARY 7354 7365
17 0.0 73536.3 1.3X
+UNICODE_CI 52149 52163
20 0.0 521489.4 0.2X
-OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY_LCASE 22079 22083
5 0.0 220786.7 1.0X
-UNICODE 177636 177709
103 0.0 1776363.9 0.1X
-UTF8_BINARY 11954 11956
3 0.0 119536.7 1.8X
-UNICODE_CI 158014 158038
35 0.0 1580135.7 0.1X
+UTF8_BINARY_LCASE 18110 18127
24 0.0 181103.9 1.0X
+UNICODE 171375 171435
85 0.0 1713752.3 0.1X
+UTF8_BINARY 14012 14030
26 0.0 140116.7 1.3X
+UNICODE_CI 153847 153901
76 0.0 1538471.1 0.1X
diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt
b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt
new file mode 100644
index 000000000000..dc68b747203f
--- /dev/null
+++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt
@@ -0,0 +1,27 @@
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+--------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE 18244 18258
20 0.0 456096.4 1.0X
+UNICODE 498 498
0 0.1 12440.3 36.7X
+UTF8_BINARY 499 500
1 0.1 12467.7 36.6X
+UNICODE_CI 13429 13443
19 0.0 335725.4 1.4X
+
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+---------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE 18377 18399
31 0.0 459430.5 1.0X
+UNICODE 14238 14240
3 0.0 355957.4 1.3X
+UTF8_BINARY 975 976
1 0.0 24371.3 18.9X
+UNICODE_CI 13819 13826
10 0.0 345482.6 1.3X
+
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE 9183 9230
67 0.0 229564.0 1.0X
+UNICODE 38937 38952
22 0.0 973421.3 0.2X
+UTF8_BINARY 1376 1376
0 0.0 34397.5 6.7X
+UNICODE_CI 32881 32882
1 0.0 822027.4 0.3X
+
diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt
b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt
new file mode 100644
index 000000000000..bb58968764c7
--- /dev/null
+++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt
@@ -0,0 +1,27 @@
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+--------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE 17881 17885
6 0.0 447017.7 1.0X
+UNICODE 493 495
2 0.1 12328.9 36.3X
+UTF8_BINARY 493 494
1 0.1 12331.4 36.3X
+UNICODE_CI 13731 13737
8 0.0 343284.6 1.3X
+
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+---------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE 18041 18047
8 0.0 451030.2 1.0X
+UNICODE 14023 14047
34 0.0 350573.9 1.3X
+UTF8_BINARY 1387 1397
14 0.0 34680.4 13.0X
+UNICODE_CI 14232 14242
14 0.0 355808.4 1.3X
+
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE 10494 10499
6 0.0 262360.0 1.0X
+UNICODE 40410 40422
17 0.0 1010261.8 0.3X
+UTF8_BINARY 2035 2035
1 0.0 50877.8 5.2X
+UNICODE_CI 31470 31493
32 0.0 786752.4 0.3X
+
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
index 24e61052f561..7a93c7c495e2 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
@@ -22,31 +22,11 @@ import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
import org.apache.spark.sql.catalyst.util.CollationFactory
import org.apache.spark.unsafe.types.UTF8String
-/**
- * Benchmark to measure performance for comparisons between collated strings.
To run this benchmark:
- * {{{
- * 1. without sbt:
- * bin/spark-submit --class <this class>
- * --jars <spark core test jar>,<spark catalyst test jar> <spark sql
test jar>
- * 2. build/sbt "sql/Test/runMain
org.apache.spark.sql.execution.benchmark.CollationBenchmark"
- * 3. generate result:
- * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this
class>"
- * Results will be written to "benchmarks/CollationBenchmark-results.txt".
- * }}}
- */
-
-object CollationBenchmark extends BenchmarkBase {
- private val collationTypes = Seq("UTF8_BINARY_LCASE", "UNICODE",
"UTF8_BINARY", "UNICODE_CI")
+abstract class CollationBenchmarkBase extends BenchmarkBase {
+ protected val collationTypes: Seq[String] =
+ Seq("UTF8_BINARY_LCASE", "UNICODE", "UTF8_BINARY", "UNICODE_CI")
- def generateSeqInput(n: Long): Seq[UTF8String] = {
- val input = Seq("ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF",
"def", "def",
- "GHI", "ghi", "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu",
"VWX", "vwx",
- "ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def",
"GHI", "ghi",
- "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx",
"YZ")
- .map(UTF8String.fromString)
- val inputLong: Seq[UTF8String] = (0L until n).map(i => input(i.toInt %
input.size))
- inputLong
- }
+ def generateSeqInput(n: Long): Seq[UTF8String]
def benchmarkUTFStringEquals(collationTypes: Seq[String], utf8Strings:
Seq[UTF8String]): Unit = {
val sublistStrings = utf8Strings
@@ -54,7 +34,7 @@ object CollationBenchmark extends BenchmarkBase {
val benchmark = new Benchmark(
"collation unit benchmarks - equalsFunction",
utf8Strings.size * 10,
- warmupTime = 4.seconds,
+ warmupTime = 10.seconds,
output = output)
collationTypes.foreach(collationType => {
val collation = CollationFactory.fetchCollation(collationType)
@@ -77,7 +57,7 @@ object CollationBenchmark extends BenchmarkBase {
val benchmark = new Benchmark(
"collation unit benchmarks - compareFunction",
utf8Strings.size * 10,
- warmupTime = 4.seconds,
+ warmupTime = 10.seconds,
output = output)
collationTypes.foreach(collationType => {
val collation = CollationFactory.fetchCollation(collationType)
@@ -103,7 +83,7 @@ object CollationBenchmark extends BenchmarkBase {
val benchmark = new Benchmark(
"collation unit benchmarks - hashFunction",
utf8Strings.size * 10,
- warmupTime = 4.seconds,
+ warmupTime = 10.seconds,
output = output)
collationTypes.foreach(collationType => {
val collation = CollationFactory.fetchCollation(collationType)
@@ -120,6 +100,31 @@ object CollationBenchmark extends BenchmarkBase {
)
benchmark.run()
}
+}
+
+/**
+ * Benchmark to measure performance for comparisons between collated strings.
To run this benchmark:
+ * {{{
+ * 1. without sbt:
+ * bin/spark-submit --class <this class>
+ * --jars <spark core test jar>,<spark catalyst test jar> <spark sql
test jar>
+ * 2. build/sbt "sql/Test/runMain
org.apache.spark.sql.execution.benchmark.CollationBenchmark"
+ * 3. generate result:
+ * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this
class>"
+ * Results will be written to "benchmarks/CollationBenchmark-results.txt".
+ * }}}
+ */
+object CollationBenchmark extends CollationBenchmarkBase {
+
+ override def generateSeqInput(n: Long): Seq[UTF8String] = {
+ val input = Seq("ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF",
"def", "def",
+ "GHI", "ghi", "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu",
"VWX", "vwx",
+ "ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def",
"GHI", "ghi",
+ "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx",
"YZ")
+ .map(UTF8String.fromString)
+ val inputLong: Seq[UTF8String] = (0L until n).map(i => input(i.toInt %
input.size))
+ inputLong
+ }
override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
benchmarkUTFStringEquals(collationTypes, generateSeqInput(10000L))
@@ -127,3 +132,28 @@ object CollationBenchmark extends BenchmarkBase {
benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(10000L))
}
}
+
+/**
+ * Measure performance of collation comparisons of non-ASCII strings.
+ */
+object CollationNonASCIIBenchmark extends CollationBenchmarkBase {
+
+ override def generateSeqInput(n: Long): Seq[UTF8String] = {
+ // scalastyle:off nonascii
+ val inputSet = Seq("A", "a", "Ä", "ä")
+ // lowercase and uppercase plain and umlaut A combinations of 3 letters
(AAA, aäA, ...)
+ val input = (for {
+ x <- inputSet
+ y <- inputSet
+ z <- inputSet } yield x + y + z).map(UTF8String.fromString)
+ val inputLong: Seq[UTF8String] = (0L until n).map(i => input(i.toInt %
input.size))
+ inputLong
+ // scalastyle:on nonascii
+ }
+
+ override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+ benchmarkUTFStringEquals(collationTypes, generateSeqInput(4000L))
+ benchmarkUTFStringCompare(collationTypes, generateSeqInput(4000L))
+ benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(4000L))
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]