Github user dongjoon-hyun commented on a diff in the pull request:
https://github.com/apache/spark/pull/22661#discussion_r224270597
--- Diff:
sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/JoinBenchmark.scala
---
@@ -19,229 +19,165 @@ package org.apache.spark.sql.execution.benchmark
import org.apache.spark.sql.execution.joins._
import org.apache.spark.sql.functions._
+import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.IntegerType
/**
* Benchmark to measure performance for aggregate primitives.
- * To run this:
- * build/sbt "sql/test-only *benchmark.JoinBenchmark"
- *
- * Benchmarks in this file are skipped in normal builds.
+ * To run this benchmark:
+ * {{{
+ * 1. without sbt:
+ * bin/spark-submit --class <this class> --jars <spark core test jar> <spark sql test jar>
+ * 2. build/sbt "sql/test:runMain <this class>"
+ * 3. generate result:
+ * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
+ * Results will be written to "benchmarks/JoinBenchmark-results.txt".
+ * }}}
*/
-class JoinBenchmark extends BenchmarkWithCodegen {
+object JoinBenchmark extends SqlBasedBenchmark {
- ignore("broadcast hash join, long key") {
+ def broadcastHashJoinLongKey(): Unit = {
val N = 20 << 20
val M = 1 << 16
- val dim = broadcast(sparkSession.range(M).selectExpr("id as k", "cast(id as string) as v"))
- runBenchmark("Join w long", N) {
- val df = sparkSession.range(N).join(dim, (col("id") % M) === col("k"))
+ val dim = broadcast(spark.range(M).selectExpr("id as k", "cast(id as string) as v"))
+ codegenBenchmark("Join w long", N) {
+ val df = spark.range(N).join(dim, (col("id") % M) === col("k"))
assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[BroadcastHashJoinExec]).isDefined)
df.count()
}
-
- /*
- Java HotSpot(TM) 64-Bit Server VM 1.7.0_60-b19 on Mac OS X 10.9.5
- Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz
- Join w long: Best/Avg Time(ms) Rate(M/s)
Per Row(ns) Relative
-
-------------------------------------------------------------------------------------------
- Join w long codegen=false 3002 / 3262 7.0
143.2 1.0X
- Join w long codegen=true 321 / 371 65.3
15.3 9.3X
- */
}
- ignore("broadcast hash join, long key with duplicates") {
+
+ def broadcastHashJoinLongKeyWithDuplicates(): Unit = {
val N = 20 << 20
val M = 1 << 16
- val dim = broadcast(sparkSession.range(M).selectExpr("id as k", "cast(id as string) as v"))
--- End diff --
So, this is a removal of a redundant one, right?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]