GideonPotok commented on code in PR #45453: URL: https://github.com/apache/spark/pull/45453#discussion_r1536889413
########## sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala: ########## @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.benchmark + +import scala.util.Random + +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.{DataFrame} +import org.apache.spark.sql.catalyst.util.CollationFactory +import org.apache.spark.sql.functions._ +import org.apache.spark.unsafe.types.UTF8String + +/** + * Benchmark to measure performance for joins. To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class <this class> + * --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar> + * 2. build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>" + * Results will be written to "benchmarks/JoinBenchmark-results.txt". 
+ * }}} + */ + +object CollationBenchmark extends SqlBasedBenchmark { + private val collationTypes = Seq("UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI") + + def generateUTF8Strings(n: Int): Seq[UTF8String] = { + // Generate n UTF8Strings + Seq("ABC", "aBC", "abc", "DEF", "def", "GHI", "ghi", "JKL", "jkl", + "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", "YZ").map(UTF8String.fromString) ++ + (18 to n).map(i => UTF8String.fromString(Random.nextString(i % 25))).sortBy(_.hashCode()) + } + + def benchmarkUTFString(collationTypes: Seq[String], utf8Strings: Seq[UTF8String]): Unit = { + val sublistStrings = utf8Strings.slice(0, 200) + val benchmark = collationTypes.foldLeft( + new Benchmark(s"collation unit benchmarks", utf8Strings.size, output = output)) { + (b, collationType) => + val collation = CollationFactory.fetchCollation(collationType) + b.addCase(s"equalsFunction - $collationType") { _ => + sublistStrings.foreach(s1 => + utf8Strings.foreach(s => + collation.equalsFunction(s, s1).booleanValue() + ) + ) + } + b.addCase(s"collator.compare - $collationType") { _ => + sublistStrings.foreach(s1 => + utf8Strings.foreach(s => + collation.comparator.compare(s, s1) + ) + ) + } + b.addCase(s"hashFunction - $collationType") { _ => + sublistStrings.foreach(s1 => + utf8Strings.foreach(s => + collation.hashFunction.applyAsLong(s) + ) + ) + } + b + } + benchmark.run() + } + + def df1: DataFrame = { + val d = spark.createDataFrame(Seq( + ("ABC", "ABC"), ("aBC", "abc"), ("abc", "ABC"), ("DEF", "DEF"), ("def", "DEF"), Review Comment: Done, the inputs are now the same. Let me know what you think. 
Local results are as follows (It shows that UTF8_BINARY and UNICODE have very fast equals function relative to the other collations): ``` [info] filter df column with collation: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative [info] ----------------------------------------------------------------------------------------------------------------------------------- [info] filter df column with collation - UTF8_BINARY_LCASE 5 6 1 1.9 528.3 1.0X [info] filter df column with collation - UNICODE 4 5 1 2.3 427.4 1.2X [info] filter df column with collation - UTF8_BINARY 5 5 1 2.2 464.2 1.1X [info] filter df column with collation - UNICODE_CI 5 6 1 1.9 531.1 1.0X [info] Running benchmark: collation unit benchmarks [info] Running case: equalsFunction - UTF8_BINARY_LCASE [info] Stopped after 2 iterations, 3897 ms [info] Running case: collator.compare - UTF8_BINARY_LCASE [info] Stopped after 2 iterations, 4164 ms [info] Running case: hashFunction - UTF8_BINARY_LCASE [info] Stopped after 2 iterations, 3782 ms [info] Running case: equalsFunction - UNICODE [info] Stopped after 6 iterations, 2142 ms [info] Running case: collator.compare - UNICODE [info] Stopped after 2 iterations, 7718 ms [info] Running case: hashFunction - UNICODE [info] Stopped after 2 iterations, 16612 ms [info] Running case: equalsFunction - UTF8_BINARY [info] Stopped after 7 iterations, 2088 ms [info] Running case: collator.compare - UTF8_BINARY [info] Stopped after 4 iterations, 2112 ms [info] Running case: hashFunction - UTF8_BINARY [info] Stopped after 3 iterations, 2324 ms [info] Running case: equalsFunction - UNICODE_CI [info] Stopped after 2 iterations, 8298 ms [info] Running case: collator.compare - UNICODE_CI [info] Stopped after 2 iterations, 6933 ms [info] Running case: hashFunction - UNICODE_CI [info] Stopped after 2 iterations, 12882 ms [info] OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Mac OS X 14.4 [info] Apple M3 Max [info] collation unit benchmarks: Best Time(ms) Avg Time(ms) 
Stdev(ms) Rate(M/s) Per Row(ns) Relative [info] ------------------------------------------------------------------------------------------------------------------------ [info] equalsFunction - UTF8_BINARY_LCASE 1948 1949 1 0.0 194813.5 1.0X [info] collator.compare - UTF8_BINARY_LCASE 2081 2082 1 0.0 208136.3 0.9X [info] hashFunction - UTF8_BINARY_LCASE 1890 1891 2 0.0 189021.8 1.0X [info] equalsFunction - UNICODE 357 357 0 0.0 35675.0 5.5X [info] collator.compare - UNICODE 3848 3859 16 0.0 384793.0 0.5X [info] hashFunction - UNICODE 8304 8306 3 0.0 830445.5 0.2X [info] equalsFunction - UTF8_BINARY 296 298 2 0.0 29608.1 6.6X [info] collator.compare - UTF8_BINARY 528 528 0 0.0 52779.0 3.7X [info] hashFunction - UTF8_BINARY 773 775 1 0.0 77336.4 2.5X [info] equalsFunction - UNICODE_CI 4141 4149 12 0.0 414060.1 0.5X [info] collator.compare - UNICODE_CI 3461 3467 8 0.0 346055.8 0.6X [info] hashFunction - UNICODE_CI 6418 6441 33 0.0 641794.3 0.3X ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
