Repository: spark Updated Branches: refs/heads/master 17edfec59 -> 8be7e6bb3
[SPARK-21973][SQL] Add a new option to filter queries in TPC-DS ## What changes were proposed in this pull request? This PR added a new option to filter TPC-DS queries to run in `TPCDSQueryBenchmark`. By default, `TPCDSQueryBenchmark` runs all the TPC-DS queries. This change could enable developers to run some of the TPC-DS queries by this option, e.g., to run q2, q4, and q6 only: ``` spark-submit --class <this class> --conf spark.sql.tpcds.queryFilter="q2,q4,q6" --jars <spark sql test jar> ``` ## How was this patch tested? Manually checked. Author: Takeshi Yamamuro <yamam...@apache.org> Closes #19188 from maropu/RunPartialQueriesInTPCDS. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8be7e6bb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8be7e6bb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8be7e6bb Branch: refs/heads/master Commit: 8be7e6bb3cc8afd07c24e7dbf0f8fbe0f491d740 Parents: 17edfec Author: Takeshi Yamamuro <yamam...@apache.org> Authored: Wed Sep 13 21:54:10 2017 -0700 Committer: gatorsmile <gatorsm...@gmail.com> Committed: Wed Sep 13 21:54:10 2017 -0700 ---------------------------------------------------------------------- .../benchmark/TPCDSQueryBenchmark.scala | 23 +++++++++++++++++--- .../TPCDSQueryBenchmarkArguments.scala | 17 +++++++++++++-- 2 files changed, 35 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/8be7e6bb/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala index 63d118c..99c6df7 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.benchmark import org.apache.spark.SparkConf +import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation @@ -29,9 +30,9 @@ import org.apache.spark.util.Benchmark /** * Benchmark to measure TPCDS query performance. * To run this: - * spark-submit --class <this class> <spark sql test jar> <TPCDS data location> + * spark-submit --class <this class> <spark sql test jar> --data-location <TPCDS data location> */ -object TPCDSQueryBenchmark { +object TPCDSQueryBenchmark extends Logging { val conf = new SparkConf() .setMaster("local[1]") @@ -90,7 +91,9 @@ object TPCDSQueryBenchmark { benchmark.addCase(name) { i => spark.sql(queryString).collect() } + logInfo(s"\n\n===== TPCDS QUERY BENCHMARK OUTPUT FOR $name =====\n") benchmark.run() + logInfo(s"\n\n===== FINISHED $name =====\n") } } @@ -110,6 +113,20 @@ object TPCDSQueryBenchmark { "q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", "q90", "q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99") - tpcdsAll(benchmarkArgs.dataLocation, queries = tpcdsQueries) + // If `--query-filter` defined, filters the queries that this option selects + val queriesToRun = if (benchmarkArgs.queryFilter.nonEmpty) { + val queries = tpcdsQueries.filter { case queryName => + benchmarkArgs.queryFilter.contains(queryName) + } + if (queries.isEmpty) { + throw new RuntimeException( + s"Empty queries to run. 
Bad query name filter: ${benchmarkArgs.queryFilter}") + } + queries + } else { + tpcdsQueries + } + + tpcdsAll(benchmarkArgs.dataLocation, queries = queriesToRun) } } http://git-wip-us.apache.org/repos/asf/spark/blob/8be7e6bb/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmarkArguments.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmarkArguments.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmarkArguments.scala index 8edc77b..184ffff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmarkArguments.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmarkArguments.scala @@ -17,21 +17,33 @@ package org.apache.spark.sql.execution.benchmark +import java.util.Locale + + class TPCDSQueryBenchmarkArguments(val args: Array[String]) { var dataLocation: String = null + var queryFilter: Set[String] = Set.empty parseArgs(args.toList) validateArguments() + private def optionMatch(optionName: String, s: String): Boolean = { + optionName == s.toLowerCase(Locale.ROOT) + } + private def parseArgs(inputArgs: List[String]): Unit = { var args = inputArgs - while(args.nonEmpty) { + while (args.nonEmpty) { args match { - case ("--data-location") :: value :: tail => + case optName :: value :: tail if optionMatch("--data-location", optName) => dataLocation = value args = tail + case optName :: value :: tail if optionMatch("--query-filter", optName) => + queryFilter = value.toLowerCase(Locale.ROOT).split(",").map(_.trim).toSet + args = tail + case _ => // scalastyle:off println System.err.println("Unknown/unsupported param " + args) @@ -47,6 +59,7 @@ class TPCDSQueryBenchmarkArguments(val args: Array[String]) { |Usage: spark-submit --class <this class> <spark sql test jar> [Options] |Options: | --data-location Path 
to TPCDS data + | --query-filter Queries to filter, e.g., q3,q5,q13 | |------------------------------------------------------------------------------------------------------------------ |In order to run this benchmark, please follow the instructions at --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org