This is an automated email from the ASF dual-hosted git repository. dbtsai pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 6bd995b [SPARK-26958][SQL][TEST] Add NestedSchemaPruningBenchmark 6bd995b is described below commit 6bd995b1013f3753bf1df68d75b4bd9cd2337fae Author: Dongjoon Hyun <dh...@apple.com> AuthorDate: Thu Feb 21 23:39:36 2019 +0000 [SPARK-26958][SQL][TEST] Add NestedSchemaPruningBenchmark ## What changes were proposed in this pull request? This adds `NestedSchemaPruningBenchmark` to show the nested schema pruning performance clearly and to verify new PR's performance benefit and to prevent the future performance degradation. ## How was this patch tested? Manually run the benchmark. ``` SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.NestedSchemaPruningBenchmark" ``` Closes #23862 from dongjoon-hyun/SPARK-NESTED-SCHEMA-PRUNING-BM. Lead-authored-by: Dongjoon Hyun <dh...@apple.com> Co-authored-by: DB Tsai <d_t...@apple.com> Signed-off-by: DB Tsai <d_t...@apple.com> --- .../NestedSchemaPruningBenchmark-results.txt | 40 +++++ .../benchmark/NestedSchemaPruningBenchmark.scala | 163 +++++++++++++++++++++ 2 files changed, 203 insertions(+) diff --git a/sql/core/benchmarks/NestedSchemaPruningBenchmark-results.txt b/sql/core/benchmarks/NestedSchemaPruningBenchmark-results.txt new file mode 100644 index 0000000..7585cae --- /dev/null +++ b/sql/core/benchmarks/NestedSchemaPruningBenchmark-results.txt @@ -0,0 +1,40 @@ +================================================================================================ +Nested Schema Pruning Benchmark +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_201-b09 on Mac OS X 10.14.3 +Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz +Selection: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Top-level column 59 / 68 16.9 59.1 1.0X +Nested 
column 180 / 186 5.6 179.7 0.3X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_201-b09 on Mac OS X 10.14.3 +Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz +Limiting: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Top-level column 241 / 246 4.2 240.9 1.0X +Nested column 1828 / 1904 0.5 1827.5 0.1X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_201-b09 on Mac OS X 10.14.3 +Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz +Repartitioning: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Top-level column 201 / 208 5.0 200.8 1.0X +Nested column 1811 / 1864 0.6 1811.4 0.1X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_201-b09 on Mac OS X 10.14.3 +Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz +Repartitioning by exprs: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Top-level column 206 / 212 4.9 205.8 1.0X +Nested column 1814 / 1863 0.6 1814.3 0.1X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_201-b09 on Mac OS X 10.14.3 +Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz +Sorting: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Top-level column 282 / 302 3.5 281.7 1.0X +Nested column 2093 / 2199 0.5 2093.1 0.1X + + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/NestedSchemaPruningBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/NestedSchemaPruningBenchmark.scala new file mode 100644 index 0000000..ddfc8ae --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/NestedSchemaPruningBenchmark.scala @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
 See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.internal.SQLConf + +/** + * Synthetic benchmark for nested schema pruning performance. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class <this class> --jars <spark core test jar> <sql core test jar> + * 2. build/sbt "sql/test:runMain <this class>" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>" + * Results will be written to "benchmarks/NestedSchemaPruningBenchmark-results.txt". + * }}} + */ +object NestedSchemaPruningBenchmark extends SqlBasedBenchmark { + + import spark.implicits._ + + private val N = 1000000 + private val numIters = 10 + + // We use `col1 BIGINT, col2 STRUCT<_1: BIGINT, _2: STRING>` as a test schema. + // col1 and col2._1 are used for comparison. 
col2._2 mimics the burden for the other columns + private val df = spark + .range(N * 10) + .sample(false, 0.1) + .map(x => (x, (x, s"$x" * 100))) + .toDF("col1", "col2") + + private def addCase(benchmark: Benchmark, name: String, sql: String): Unit = { + benchmark.addCase(name) { _ => + spark.sql(sql).write.format("noop").save() + } + } + + private def selectBenchmark(numRows: Int, numIters: Int): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + Seq(1, 2).foreach { i => + df.write.parquet(path + s"/$i") + spark.read.parquet(path + s"/$i").createOrReplaceTempView(s"t$i") + } + + val benchmark = new Benchmark(s"Selection", numRows, numIters, output = output) + + addCase(benchmark, "Top-level column", "SELECT col1 FROM (SELECT col1 FROM t1)") + addCase(benchmark, "Nested column", "SELECT col2._1 FROM (SELECT col2 FROM t2)") + + benchmark.run() + } + } + + private def limitBenchmark(numRows: Int, numIters: Int): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + Seq(1, 2).foreach { i => + df.write.parquet(path + s"/$i") + spark.read.parquet(path + s"/$i").createOrReplaceTempView(s"t$i") + } + + val benchmark = new Benchmark(s"Limiting", numRows, numIters, output = output) + + addCase(benchmark, "Top-level column", + s"SELECT col1 FROM (SELECT col1 FROM t1 LIMIT ${Int.MaxValue})") + addCase(benchmark, "Nested column", + s"SELECT col2._1 FROM (SELECT col2 FROM t2 LIMIT ${Int.MaxValue})") + + benchmark.run() + } + } + + private def repartitionBenchmark(numRows: Int, numIters: Int): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + Seq(1, 2).foreach { i => + df.write.parquet(path + s"/$i") + spark.read.parquet(path + s"/$i").createOrReplaceTempView(s"t$i") + } + + val benchmark = new Benchmark(s"Repartitioning", numRows, numIters, output = output) + + addCase(benchmark, "Top-level column", + s"SELECT col1 FROM (SELECT /*+ REPARTITION(1) */ col1 FROM t1)") + addCase(benchmark, "Nested column", + s"SELECT 
col2._1 FROM (SELECT /*+ REPARTITION(1) */ col2 FROM t2)") + + benchmark.run() + } + } + + private def repartitionByExprBenchmark(numRows: Int, numIters: Int): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + Seq(1, 2).foreach { i => + df.write.parquet(path + s"/$i") + spark.read.parquet(path + s"/$i").createOrReplaceTempView(s"t$i") + } + + val benchmark = new Benchmark(s"Repartitioning by exprs", numRows, numIters, output = output) + + addCase(benchmark, "Top-level column", + s"SELECT col1 FROM (SELECT col1 FROM t1 DISTRIBUTE BY col1)") + addCase(benchmark, "Nested column", + s"SELECT col2._1 FROM (SELECT col2 FROM t2 DISTRIBUTE BY col2._1)") + + benchmark.run() + } + } + + private def sortBenchmark(numRows: Int, numIters: Int): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + Seq(1, 2).foreach { i => + df.write.parquet(path + s"/$i") + spark.read.parquet(path + s"/$i").createOrReplaceTempView(s"t$i") + } + + val benchmark = new Benchmark(s"Sorting", numRows, numIters, output = output) + + addCase(benchmark, "Top-level column", "SELECT col1 FROM t1 ORDER BY col1") + addCase(benchmark, "Nested column", "SELECT col2._1 FROM t2 ORDER BY col2._1") + + benchmark.run() + } + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + runBenchmark(s"Nested Schema Pruning Benchmark") { + withSQLConf (SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "true") { + selectBenchmark (N, numIters) + limitBenchmark (N, numIters) + repartitionBenchmark (N, numIters) + repartitionByExprBenchmark (N, numIters) + sortBenchmark (N, numIters) + } + } + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org