Repository: spark Updated Branches: refs/heads/master b1328cc58 -> 669ade3a8
[SPARK-25657][SQL][TEST] Refactor HashBenchmark to use main method ## What changes were proposed in this pull request? Refactor `HashBenchmark` to use main method. 1. use `spark-submit`: ```console bin/spark-submit --class org.apache.spark.sql.HashBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar ./sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar ``` 2. Generate benchmark result: ```console SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain org.apache.spark.sql.HashBenchmark" ``` ## How was this patch tested? manual tests Closes #22651 from wangyum/SPARK-25657. Lead-authored-by: Yuming Wang <wgy...@gmail.com> Co-authored-by: Yuming Wang <yumw...@ebay.com> Co-authored-by: Dongjoon Hyun <dongj...@apache.org> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/669ade3a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/669ade3a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/669ade3a Branch: refs/heads/master Commit: 669ade3a8eed0016b5ece57d776cea0616417088 Parents: b1328cc Author: Yuming Wang <wgy...@gmail.com> Authored: Sun Oct 7 09:49:37 2018 -0700 Committer: Dongjoon Hyun <dongj...@apache.org> Committed: Sun Oct 7 09:49:37 2018 -0700 ---------------------------------------------------------------------- .../benchmarks/HashBenchmark-results.txt | 70 +++++++++ .../org/apache/spark/sql/HashBenchmark.scala | 152 +++++++------------ 2 files changed, 129 insertions(+), 93 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/669ade3a/sql/catalyst/benchmarks/HashBenchmark-results.txt ---------------------------------------------------------------------- diff --git a/sql/catalyst/benchmarks/HashBenchmark-results.txt b/sql/catalyst/benchmarks/HashBenchmark-results.txt new file mode 100644 index 0000000..2459b35 --- /dev/null +++ b/sql/catalyst/benchmarks/HashBenchmark-results.txt @@ -0,0 +1,70 @@ +================================================================================================ +single ints +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash For single ints: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 5615 / 5616 95.6 10.5 1.0X +codegen version 8400 / 8407 63.9 15.6 0.7X +codegen version 64-bit 8139 / 8145 66.0 15.2 0.7X +codegen HiveHash version 7213 / 7348 74.4 13.4 0.8X + + +================================================================================================ +single longs +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash For single longs: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 6053 / 6054 88.7 11.3 1.0X +codegen version 9367 / 9369 57.3 17.4 0.6X +codegen version 64-bit 8041 / 8051 66.8 15.0 0.8X +codegen HiveHash version 7546 / 7575 71.1 14.1 0.8X + + +================================================================================================ +normal +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash For normal: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 3181 / 3182 0.7 1517.0 1.0X +codegen version 2403 / 2403 0.9 1145.7 1.3X +codegen version 64-bit 915 / 916 2.3 436.2 3.5X +codegen HiveHash version 4505 / 4527 0.5 2148.3 0.7X + + +================================================================================================ +array +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash For array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 1828 / 1844 0.1 13946.1 1.0X +codegen version 3678 / 3804 0.0 28058.2 0.5X +codegen version 64-bit 2925 / 2931 0.0 22317.8 0.6X +codegen HiveHash version 1216 / 1217 0.1 9280.0 1.5X + + +================================================================================================ +map +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash For map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +interpreted version 0 / 0 44.3 22.6 1.0X +codegen version 176 / 176 0.0 42978.8 0.0X +codegen version 64-bit 173 / 175 0.0 42214.3 0.0X +codegen HiveHash version 44 / 44 0.1 10659.9 0.0X + + http://git-wip-us.apache.org/repos/asf/spark/blob/669ade3a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala index 7a2a66c..4226ab3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/HashBenchmark.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.benchmark.Benchmark +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection @@ -26,94 +26,87 @@ import org.apache.spark.sql.types._ /** * Benchmark for the previous interpreted hash function(InternalRow.hashCode) vs codegened * hash expressions (Murmur3Hash/xxHash64). + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class <this class> --jars <spark core test jar> <spark catalyst test jar> + * 2. build/sbt "catalyst/test:runMain <this class>" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain <this class>" + * Results will be written to "benchmarks/HashBenchmark-results.txt". + * }}} */ -object HashBenchmark { +object HashBenchmark extends BenchmarkBase { def test(name: String, schema: StructType, numRows: Int, iters: Int): Unit = { - val generator = RandomDataGenerator.forType(schema, nullable = false).get - val encoder = RowEncoder(schema) - val attrs = schema.toAttributes - val safeProjection = GenerateSafeProjection.generate(attrs, attrs) + runBenchmark(name) { + val generator = RandomDataGenerator.forType(schema, nullable = false).get + val encoder = RowEncoder(schema) + val attrs = schema.toAttributes + val safeProjection = GenerateSafeProjection.generate(attrs, attrs) - val rows = (1 to numRows).map(_ => - // The output of encoder is UnsafeRow, use safeProjection to turn in into safe format. - safeProjection(encoder.toRow(generator().asInstanceOf[Row])).copy() - ).toArray + val rows = (1 to numRows).map(_ => + // The output of encoder is UnsafeRow, use safeProjection to turn in into safe format. + safeProjection(encoder.toRow(generator().asInstanceOf[Row])).copy() + ).toArray - val benchmark = new Benchmark("Hash For " + name, iters * numRows.toLong) - benchmark.addCase("interpreted version") { _: Int => - var sum = 0 - for (_ <- 0L until iters) { - var i = 0 - while (i < numRows) { - sum += rows(i).hashCode() - i += 1 + val benchmark = new Benchmark("Hash For " + name, iters * numRows.toLong, output = output) + benchmark.addCase("interpreted version") { _: Int => + var sum = 0 + for (_ <- 0L until iters) { + var i = 0 + while (i < numRows) { + sum += rows(i).hashCode() + i += 1 + } } } - } - val getHashCode = UnsafeProjection.create(new Murmur3Hash(attrs) :: Nil, attrs) - benchmark.addCase("codegen version") { _: Int => - var sum = 0 - for (_ <- 0L until iters) { - var i = 0 - while (i < numRows) { - sum += getHashCode(rows(i)).getInt(0) - i += 1 + val getHashCode = UnsafeProjection.create(new Murmur3Hash(attrs) :: Nil, attrs) + benchmark.addCase("codegen version") { _: Int => + var sum = 0 + for (_ <- 0L until iters) { + var i = 0 + while (i < numRows) { + sum += getHashCode(rows(i)).getInt(0) + i += 1 + } } } - } - val getHashCode64b = UnsafeProjection.create(new XxHash64(attrs) :: Nil, attrs) - benchmark.addCase("codegen version 64-bit") { _: Int => - var sum = 0 - for (_ <- 0L until iters) { - var i = 0 - while (i < numRows) { - sum += getHashCode64b(rows(i)).getInt(0) - i += 1 + val getHashCode64b = UnsafeProjection.create(new XxHash64(attrs) :: Nil, attrs) + benchmark.addCase("codegen version 64-bit") { _: Int => + var sum = 0 + for (_ <- 0L until iters) { + var i = 0 + while (i < numRows) { + sum += getHashCode64b(rows(i)).getInt(0) + i += 1 + } } } - } - val getHiveHashCode = UnsafeProjection.create(new HiveHash(attrs) :: Nil, attrs) - benchmark.addCase("codegen HiveHash version") { _: Int => - var sum = 0 - for (_ <- 0L until iters) { - var i = 0 - while (i < numRows) { - sum += getHiveHashCode(rows(i)).getInt(0) - i += 1 + val getHiveHashCode = UnsafeProjection.create(new HiveHash(attrs) :: Nil, attrs) + benchmark.addCase("codegen HiveHash version") { _: Int => + var sum = 0 + for (_ <- 0L until iters) { + var i = 0 + while (i < numRows) { + sum += getHiveHashCode(rows(i)).getInt(0) + i += 1 + } } } - } - benchmark.run() + benchmark.run() + } } - def main(args: Array[String]): Unit = { + override def runBenchmarkSuite(): Unit = { val singleInt = new StructType().add("i", IntegerType) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For single ints: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 3262 / 3267 164.6 6.1 1.0X - codegen version 6448 / 6718 83.3 12.0 0.5X - codegen version 64-bit 6088 / 6154 88.2 11.3 0.5X - codegen HiveHash version 4732 / 4745 113.5 8.8 0.7X - */ test("single ints", singleInt, 1 << 15, 1 << 14) val singleLong = new StructType().add("i", LongType) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For single longs: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 3716 / 3726 144.5 6.9 1.0X - codegen version 7706 / 7732 69.7 14.4 0.5X - codegen version 64-bit 6370 / 6399 84.3 11.9 0.6X - codegen HiveHash version 4924 / 5026 109.0 9.2 0.8X - */ test("single longs", singleLong, 1 << 15, 1 << 14) val normal = new StructType() @@ -131,45 +124,18 @@ object HashBenchmark { .add("binary", BinaryType) .add("date", DateType) .add("timestamp", TimestampType) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For normal: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 2985 / 3013 0.7 1423.4 1.0X - codegen version 2422 / 2434 0.9 1155.1 1.2X - codegen version 64-bit 856 / 920 2.5 408.0 3.5X - codegen HiveHash version 4501 / 4979 0.5 2146.4 0.7X - */ test("normal", normal, 1 << 10, 1 << 11) val arrayOfInt = ArrayType(IntegerType) val array = new StructType() .add("array", arrayOfInt) .add("arrayOfArray", ArrayType(arrayOfInt)) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 3100 / 3555 0.0 23651.8 1.0X - codegen version 5779 / 5865 0.0 44088.4 0.5X - codegen version 64-bit 4738 / 4821 0.0 36151.7 0.7X - codegen HiveHash version 2200 / 2246 0.1 16785.9 1.4X - */ test("array", array, 1 << 8, 1 << 9) val mapOfInt = MapType(IntegerType, IntegerType) val map = new StructType() .add("map", mapOfInt) .add("mapOfMap", MapType(IntegerType, mapOfInt)) - /* - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Hash For map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - interpreted version 0 / 0 48.1 20.8 1.0X - codegen version 257 / 275 0.0 62768.7 0.0X - codegen version 64-bit 226 / 240 0.0 55224.5 0.0X - codegen HiveHash version 89 / 96 0.0 21708.8 0.0X - */ test("map", map, 1 << 6, 1 << 6) } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org