This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 627f6082edca [SPARK-47693][SQL] Add optimization for lowercase comparison of UTF8String used in UTF8_BINARY_LCASE collation 627f6082edca is described below commit 627f6082edca0507439f0c736e179caf55e6a01d Author: Nikola Mandic <nikola.man...@databricks.com> AuthorDate: Wed Apr 10 23:23:32 2024 +0800 [SPARK-47693][SQL] Add optimization for lowercase comparison of UTF8String used in UTF8_BINARY_LCASE collation ### What changes were proposed in this pull request? Current collation [benchmarks](https://github.com/apache/spark/blob/e9f204ae93061a862e4da52c128eaf3512a66c7b/sql/core/benchmarks/CollationBenchmark-results.txt) indicate that `UTF8_BINARY_LCASE` collation comparisons are an order of magnitude slower (~7-10x) than plain binary comparisons. Improve the performance by optimizing the lowercase comparison function for `UTF8String` instances instead of performing a full lowercase conversion before binary comparison. The optimization is based on a similar method used in `toLowerCase` where we check character by character if conversion is valid under ASCII and fall back to a slow comparison of native strings. In the latter case, we only take into consideration suffixes that are left to compare. Benchmarks from `CollationBenchmark` run locally show a substantial performance increase: ``` [info] collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative [info] -------------------------------------------------------------------------------------------------------------------------- [info] UTF8_BINARY_LCASE 7199 7209 14 0.0 71988.8 1.0X [info] UNICODE 3925 3929 5 0.0 39250.4 1.8X [info] UTF8_BINARY 3935 3950 21 0.0 39351.2 1.8X [info] UNICODE_CI 45248 51404 8706 0.0 452484.7 0.2X ``` ### Why are the changes needed? To improve performance of comparisons of strings under UTF8_BINARY_LCASE collation. ### Does this PR introduce _any_ user-facing change? No. 
### How was this patch tested? Added unit tests to `UTF8StringSuite`. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45816 from nikolamand-db/SPARK-47693. Authored-by: Nikola Mandic <nikola.man...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../spark/sql/catalyst/util/CollationFactory.java | 2 +- .../org/apache/spark/unsafe/types/UTF8String.java | 96 +++++++++++++--------- .../apache/spark/unsafe/types/UTF8StringSuite.java | 23 ++++++ .../CollationBenchmark-jdk21-results.txt | 30 +++---- sql/core/benchmarks/CollationBenchmark-results.txt | 30 +++---- .../CollationNonASCIIBenchmark-jdk21-results.txt | 27 ++++++ .../CollationNonASCIIBenchmark-results.txt | 27 ++++++ .../execution/benchmark/CollationBenchmark.scala | 84 +++++++++++++------ 8 files changed, 223 insertions(+), 96 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 119508a37e71..72a6e574707f 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -148,7 +148,7 @@ public final class CollationFactory { collationTable[1] = new Collation( "UTF8_BINARY_LCASE", null, - (s1, s2) -> s1.toLowerCase().binaryCompare(s2.toLowerCase()), + UTF8String::compareLowerCase, "1.0", (s) -> (long)s.toLowerCase().hashCode(), false, diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index c5dfb91f06c6..2006efb07a04 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -424,21 +424,16 @@ public final class UTF8String implements Comparable<UTF8String>, 
Externalizable, if (numBytes == 0) { return EMPTY_UTF8; } - - byte[] bytes = new byte[numBytes]; - bytes[0] = (byte) Character.toTitleCase(getByte(0)); + // Optimization - do char level uppercase conversion in case of chars in ASCII range for (int i = 0; i < numBytes; i++) { - byte b = getByte(i); - if (numBytesForFirstByte(b) != 1) { - // fallback - return toUpperCaseSlow(); - } - int upper = Character.toUpperCase(b); - if (upper > 127) { - // fallback + if (getByte(i) < 0) { + // non-ASCII return toUpperCaseSlow(); } - bytes[i] = (byte) upper; + } + byte[] bytes = new byte[numBytes]; + for (int i = 0; i < numBytes; i++) { + bytes[i] = (byte) Character.toUpperCase(getByte(i)); } return fromBytes(bytes); } @@ -447,6 +442,34 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable, return fromString(toString().toUpperCase()); } + /** + * Optimized lowercase comparison for UTF8_BINARY_LCASE collation + * a.compareLowerCase(b) is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()) + */ + public int compareLowerCase(UTF8String other) { + int curr; + for (curr = 0; curr < numBytes && curr < other.numBytes; curr++) { + byte left, right; + if ((left = getByte(curr)) < 0 || (right = other.getByte(curr)) < 0) { + return compareLowerCaseSuffixSlow(other, curr); + } + int lowerLeft = Character.toLowerCase(left); + int lowerRight = Character.toLowerCase(right); + if (lowerLeft != lowerRight) { + return lowerLeft - lowerRight; + } + } + return numBytes - other.numBytes; + } + + private int compareLowerCaseSuffixSlow(UTF8String other, int pref) { + UTF8String suffixLeft = UTF8String.fromAddress(base, offset + pref, + numBytes - pref); + UTF8String suffixRight = UTF8String.fromAddress(other.base, other.offset + pref, + other.numBytes - pref); + return suffixLeft.toLowerCaseSlow().binaryCompare(suffixRight.toLowerCaseSlow()); + } + /** * Returns the lower case of this string */ @@ -454,21 +477,16 @@ public final class UTF8String implements 
Comparable<UTF8String>, Externalizable, if (numBytes == 0) { return EMPTY_UTF8; } - - byte[] bytes = new byte[numBytes]; - bytes[0] = (byte) Character.toTitleCase(getByte(0)); + // Optimization - do char level lowercase conversion in case of chars in ASCII range for (int i = 0; i < numBytes; i++) { - byte b = getByte(i); - if (numBytesForFirstByte(b) != 1) { - // fallback + if (getByte(i) < 0) { + // non-ASCII return toLowerCaseSlow(); } - int lower = Character.toLowerCase(b); - if (lower > 127) { - // fallback - return toLowerCaseSlow(); - } - bytes[i] = (byte) lower; + } + byte[] bytes = new byte[numBytes]; + for (int i = 0; i < numBytes; i++) { + bytes[i] = (byte) Character.toLowerCase(getByte(i)); } return fromBytes(bytes); } @@ -484,24 +502,26 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable, if (numBytes == 0) { return EMPTY_UTF8; } - + // Optimization - in case of ASCII chars we can skip copying the data to and from StringBuilder + byte prev = ' ', curr; + for (int i = 0; i < numBytes; i++) { + curr = getByte(i); + if (prev == ' ' && curr < 0) { + // non-ASCII + return toTitleCaseSlow(); + } + prev = curr; + } byte[] bytes = new byte[numBytes]; + prev = ' '; for (int i = 0; i < numBytes; i++) { - byte b = getByte(i); - if (i == 0 || getByte(i - 1) == ' ') { - if (numBytesForFirstByte(b) != 1) { - // fallback - return toTitleCaseSlow(); - } - int upper = Character.toTitleCase(b); - if (upper > 127) { - // fallback - return toTitleCaseSlow(); - } - bytes[i] = (byte) upper; + curr = getByte(i); + if (prev == ' ') { + bytes[i] = (byte) Character.toTitleCase(curr); } else { - bytes[i] = b; + bytes[i] = curr; } + prev = curr; } return fromBytes(bytes); } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 594b96944934..934b93c9345b 100644 --- 
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -107,6 +107,29 @@ public class UTF8StringSuite { assertTrue(fromString("你好123").binaryCompare(fromString("你好122")) > 0); } + @Test + public void lowercaseComparison() { + // SPARK-47693: Test optimized lowercase comparison of UTF8String instances + // ASCII + assertEquals(fromString("aaa").compareLowerCase(fromString("AAA")), 0); + assertTrue(fromString("aaa").compareLowerCase(fromString("AAAA")) < 0); + assertTrue(fromString("AAA").compareLowerCase(fromString("aaaa")) < 0); + assertTrue(fromString("a").compareLowerCase(fromString("B")) < 0); + assertTrue(fromString("b").compareLowerCase(fromString("A")) > 0); + assertEquals(fromString("aAa").compareLowerCase(fromString("AaA")), 0); + assertTrue(fromString("abcd").compareLowerCase(fromString("abC")) > 0); + assertTrue(fromString("ABC").compareLowerCase(fromString("abcd")) < 0); + assertEquals(fromString("abcd").compareLowerCase(fromString("abcd")), 0); + // non-ASCII + assertEquals(fromString("ü").compareLowerCase(fromString("Ü")), 0); + assertEquals(fromString("Äü").compareLowerCase(fromString("äÜ")), 0); + assertTrue(fromString("a").compareLowerCase(fromString("ä")) < 0); + assertTrue(fromString("a").compareLowerCase(fromString("Ä")) < 0); + assertTrue(fromString("A").compareLowerCase(fromString("ä")) < 0); + assertTrue(fromString("bä").compareLowerCase(fromString("aü")) > 0); + assertTrue(fromString("bxxxxxxxxxx").compareLowerCase(fromString("bü")) < 0); + } + protected static void testUpperandLower(String upper, String lower) { UTF8String us = fromString(upper); UTF8String ls = fromString(lower); diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt index e1d7a42aac61..32cbbc74e911 100644 --- a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt +++ 
b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt @@ -1,27 +1,27 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 29904 29937 47 0.0 299036.1 1.0X -UNICODE 3886 3893 10 0.0 38863.0 7.7X -UTF8_BINARY 3945 3945 0 0.0 39449.6 7.6X -UNICODE_CI 45321 45330 12 0.0 453210.3 0.7X +UTF8_BINARY_LCASE 6910 6912 3 0.0 69099.7 1.0X +UNICODE 4367 4368 1 0.0 43669.6 1.6X +UTF8_BINARY 4361 4364 4 0.0 43606.5 1.6X +UNICODE_CI 46480 46526 66 0.0 464795.7 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 29807 29818 17 0.0 298065.0 1.0X -UNICODE 45704 45723 27 0.0 457036.2 0.7X -UTF8_BINARY 6460 6464 7 0.0 64597.9 4.6X -UNICODE_CI 45498 45508 14 0.0 454977.6 0.7X +UTF8_BINARY_LCASE 6522 6526 4 0.0 65223.9 1.0X +UNICODE 45792 45797 7 0.0 457922.3 0.1X +UTF8_BINARY 7092 7112 29 0.0 70921.7 0.9X +UNICODE_CI 47548 47564 22 0.0 475476.7 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY_LCASE 
23553 23595 59 0.0 235531.8 1.0X -UNICODE 197303 197309 8 0.0 1973034.1 0.1X -UTF8_BINARY 14389 14391 2 0.0 143891.2 1.6X -UNICODE_CI 166880 166885 7 0.0 1668799.5 0.1X +UTF8_BINARY_LCASE 11716 11716 1 0.0 117157.9 1.0X +UNICODE 180133 180137 5 0.0 1801332.1 0.1X +UTF8_BINARY 10476 10477 1 0.0 104757.4 1.1X +UNICODE_CI 148171 148190 28 0.0 1481705.6 0.1X diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt b/sql/core/benchmarks/CollationBenchmark-results.txt index d8ebdfa695ff..4028b0f005a3 100644 --- a/sql/core/benchmarks/CollationBenchmark-results.txt +++ b/sql/core/benchmarks/CollationBenchmark-results.txt @@ -1,27 +1,27 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 34122 34152 42 0.0 341224.2 1.0X -UNICODE 4520 4522 2 0.0 45201.8 7.5X -UTF8_BINARY 4524 4526 2 0.0 45243.0 7.5X -UNICODE_CI 52706 52711 7 0.0 527056.1 0.6X +UTF8_BINARY_LCASE 7692 7731 55 0.0 76919.2 1.0X +UNICODE 4378 4379 0 0.0 43784.6 1.8X +UTF8_BINARY 4382 4396 19 0.0 43821.6 1.8X +UNICODE_CI 48344 48360 23 0.0 483436.5 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 33467 33474 10 0.0 334671.7 1.0X -UNICODE 51168 51168 1 0.0 511677.4 0.7X -UTF8_BINARY 5561 5593 45 0.0 55610.9 6.0X -UNICODE_CI 51929 51955 36 0.0 519291.8 0.6X +UTF8_BINARY_LCASE 
9819 9820 0 0.0 98194.9 1.0X +UNICODE 49507 49518 17 0.0 495066.2 0.2X +UTF8_BINARY 7354 7365 17 0.0 73536.3 1.3X +UNICODE_CI 52149 52163 20 0.0 521489.4 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY_LCASE 22079 22083 5 0.0 220786.7 1.0X -UNICODE 177636 177709 103 0.0 1776363.9 0.1X -UTF8_BINARY 11954 11956 3 0.0 119536.7 1.8X -UNICODE_CI 158014 158038 35 0.0 1580135.7 0.1X +UTF8_BINARY_LCASE 18110 18127 24 0.0 181103.9 1.0X +UNICODE 171375 171435 85 0.0 1713752.3 0.1X +UTF8_BINARY 14012 14030 26 0.0 140116.7 1.3X +UNICODE_CI 153847 153901 76 0.0 1538471.1 0.1X diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt new file mode 100644 index 000000000000..dc68b747203f --- /dev/null +++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt @@ -0,0 +1,27 @@ +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------- +UTF8_BINARY_LCASE 18244 18258 20 0.0 456096.4 1.0X +UNICODE 498 498 0 0.1 12440.3 36.7X +UTF8_BINARY 499 500 1 0.1 12467.7 36.6X +UNICODE_CI 13429 13443 19 0.0 335725.4 1.4X + +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+--------------------------------------------------------------------------------------------------------------------------- +UTF8_BINARY_LCASE 18377 18399 31 0.0 459430.5 1.0X +UNICODE 14238 14240 3 0.0 355957.4 1.3X +UTF8_BINARY 975 976 1 0.0 24371.3 18.9X +UNICODE_CI 13819 13826 10 0.0 345482.6 1.3X + +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 9183 9230 67 0.0 229564.0 1.0X +UNICODE 38937 38952 22 0.0 973421.3 0.2X +UTF8_BINARY 1376 1376 0 0.0 34397.5 6.7X +UNICODE_CI 32881 32882 1 0.0 822027.4 0.3X + diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt new file mode 100644 index 000000000000..bb58968764c7 --- /dev/null +++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt @@ -0,0 +1,27 @@ +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------- +UTF8_BINARY_LCASE 17881 17885 6 0.0 447017.7 1.0X +UNICODE 493 495 2 0.1 12328.9 36.3X +UTF8_BINARY 493 494 1 0.1 12331.4 36.3X +UNICODE_CI 13731 13737 8 0.0 343284.6 1.3X + +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +UTF8_BINARY_LCASE 18041 18047 8 0.0 451030.2 
1.0X +UNICODE 14023 14047 34 0.0 350573.9 1.3X +UTF8_BINARY 1387 1397 14 0.0 34680.4 13.0X +UNICODE_CI 14232 14242 14 0.0 355808.4 1.3X + +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 10494 10499 6 0.0 262360.0 1.0X +UNICODE 40410 40422 17 0.0 1010261.8 0.3X +UTF8_BINARY 2035 2035 1 0.0 50877.8 5.2X +UNICODE_CI 31470 31493 32 0.0 786752.4 0.3X + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala index 24e61052f561..7a93c7c495e2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala @@ -22,31 +22,11 @@ import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.unsafe.types.UTF8String -/** - * Benchmark to measure performance for comparisons between collated strings. To run this benchmark: - * {{{ - * 1. without sbt: - * bin/spark-submit --class <this class> - * --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar> - * 2. build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark" - * 3. generate result: - * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>" - * Results will be written to "benchmarks/CollationBenchmark-results.txt". 
- * }}} - */ - -object CollationBenchmark extends BenchmarkBase { - private val collationTypes = Seq("UTF8_BINARY_LCASE", "UNICODE", "UTF8_BINARY", "UNICODE_CI") +abstract class CollationBenchmarkBase extends BenchmarkBase { + protected val collationTypes: Seq[String] = + Seq("UTF8_BINARY_LCASE", "UNICODE", "UTF8_BINARY", "UNICODE_CI") - def generateSeqInput(n: Long): Seq[UTF8String] = { - val input = Seq("ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", - "GHI", "ghi", "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", - "ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", "GHI", "ghi", - "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", "YZ") - .map(UTF8String.fromString) - val inputLong: Seq[UTF8String] = (0L until n).map(i => input(i.toInt % input.size)) - inputLong - } + def generateSeqInput(n: Long): Seq[UTF8String] def benchmarkUTFStringEquals(collationTypes: Seq[String], utf8Strings: Seq[UTF8String]): Unit = { val sublistStrings = utf8Strings @@ -54,7 +34,7 @@ object CollationBenchmark extends BenchmarkBase { val benchmark = new Benchmark( "collation unit benchmarks - equalsFunction", utf8Strings.size * 10, - warmupTime = 4.seconds, + warmupTime = 10.seconds, output = output) collationTypes.foreach(collationType => { val collation = CollationFactory.fetchCollation(collationType) @@ -77,7 +57,7 @@ object CollationBenchmark extends BenchmarkBase { val benchmark = new Benchmark( "collation unit benchmarks - compareFunction", utf8Strings.size * 10, - warmupTime = 4.seconds, + warmupTime = 10.seconds, output = output) collationTypes.foreach(collationType => { val collation = CollationFactory.fetchCollation(collationType) @@ -103,7 +83,7 @@ object CollationBenchmark extends BenchmarkBase { val benchmark = new Benchmark( "collation unit benchmarks - hashFunction", utf8Strings.size * 10, - warmupTime = 4.seconds, + warmupTime = 10.seconds, output = output) 
collationTypes.foreach(collationType => { val collation = CollationFactory.fetchCollation(collationType) @@ -120,6 +100,31 @@ object CollationBenchmark extends BenchmarkBase { ) benchmark.run() } +} + +/** + * Benchmark to measure performance for comparisons between collated strings. To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class <this class> + * --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar> + * 2. build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>" + * Results will be written to "benchmarks/CollationBenchmark-results.txt". + * }}} + */ +object CollationBenchmark extends CollationBenchmarkBase { + + override def generateSeqInput(n: Long): Seq[UTF8String] = { + val input = Seq("ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", + "GHI", "ghi", "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", + "ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", "GHI", "ghi", + "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", "YZ") + .map(UTF8String.fromString) + val inputLong: Seq[UTF8String] = (0L until n).map(i => input(i.toInt % input.size)) + inputLong + } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { benchmarkUTFStringEquals(collationTypes, generateSeqInput(10000L)) @@ -127,3 +132,28 @@ object CollationBenchmark extends BenchmarkBase { benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(10000L)) } } + +/** + * Measure performance of collation comparisons of non-ASCII strings. + */ +object CollationNonASCIIBenchmark extends CollationBenchmarkBase { + + override def generateSeqInput(n: Long): Seq[UTF8String] = { + // scalastyle:off nonascii + val inputSet = Seq("A", "a", "Ä", "ä") + // lowercase and uppercase plain and umlaut A combinations of 3 letters (AAA, aäA, ...) 
+ val input = (for { + x <- inputSet + y <- inputSet + z <- inputSet } yield x + y + z).map(UTF8String.fromString) + val inputLong: Seq[UTF8String] = (0L until n).map(i => input(i.toInt % input.size)) + inputLong + // scalastyle:on nonascii + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + benchmarkUTFStringEquals(collationTypes, generateSeqInput(4000L)) + benchmarkUTFStringCompare(collationTypes, generateSeqInput(4000L)) + benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(4000L)) + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org