wangyum commented on pull request #29642:
URL: https://github.com/apache/spark/pull/29642#issuecomment-738661573
```scala
package org.apache.spark.sql.execution.benchmark
import java.io.File
import scala.util.Random
import org.apache.spark.SparkConf
import org.apache.spark.benchmark.Benchmark
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.monotonically_increasing_id
/**
 * Benchmark to measure the read performance of InSet filter pushdown.
 *
 * The same `value IN (...)` query is run against Parquet, ORC and CSV tables, once with
 * `spark.sql.optimizer.inSetRewriteMinMaxThreshold` set to `Int.MaxValue` and once with it
 * set to 10 (reported as the "(Rewrite InSet)" cases).
 *
 * To run this benchmark:
 * {{{
 *   1. without sbt: bin/spark-submit --class <this class> <spark sql test jar>
 *   2. build/sbt "sql/test:runMain <this class>"
 *   3. generate result:
 *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
 *      Results will be written to "benchmarks/InSetFilterPushdownBenchmark-results.txt".
 * }}}
 */
object InSetFilterPushdownBenchmark extends SqlBasedBenchmark {

  /** Builds a single-threaded local session with snappy compression for ORC and Parquet. */
  override def getSparkSession: SparkSession = {
    val conf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      // Since `spark.master` always exists, overrides this value
      .set("spark.master", "local[1]")
      .setIfMissing("spark.driver.memory", "3g")
      .setIfMissing("orc.compression", "snappy")
      .setIfMissing("spark.sql.parquet.compression.codec", "snappy")
    SparkSession.builder().config(conf).getOrCreate()
  }

  private val numRows = 1024 * 1024 * 15
  private val width = 5

  // For Parquet/ORC, we will use the same value for block size and compression size
  private val blockSize = org.apache.parquet.hadoop.ParquetWriter.DEFAULT_PAGE_SIZE

  // Threshold value used for the "(Rewrite InSet)" benchmark cases.
  private val RewriteThreshold = 10

  // (benchmark case label, table name) pairs, in the order the cases are reported.
  private val benchmarkedTables = Seq(
    "Parquet" -> "parquetTable",
    "ORC" -> "orcTable",
    "CSV" -> "csvTable")

  /**
   * Runs `f`, then drops the given temporary views.
   *
   * Note: this only removes temporary views. The tables in this benchmark are written with
   * `saveAsTable`, so they are cleaned up through [[withTables]] instead; this method is kept
   * for backward compatibility.
   */
  def withTempTable(tableNames: String*)(f: => Unit): Unit = {
    try f finally tableNames.foreach(spark.catalog.dropTempView)
  }

  /** Runs `f`, then drops the given tables (if they exist), even when `f` throws. */
  private def withTables(tableNames: String*)(f: => Unit): Unit = {
    try f finally tableNames.foreach(name => spark.sql(s"DROP TABLE IF EXISTS $name"))
  }

  /**
   * Writes the benchmark data set as `orcTable`, `parquetTable` and `csvTable`.
   *
   * The data has `width` string columns (`c1`..`c$width`) holding a random long cast to a
   * string, plus a `value` column from `monotonically_increasing_id()`, sorted by `value`.
   *
   * @param dir     temporary directory supplied by the caller; currently unused because the
   *                data is written with `saveAsTable`
   * @param numRows number of rows to generate
   */
  private def prepareTable(dir: File, numRows: Int): Unit = {
    import spark.implicits._
    val selectExpr = (1 to width).map(i => s"CAST(value AS STRING) c$i")
    val df = spark.range(numRows).map(_ => Random.nextLong).selectExpr(selectExpr: _*)
      .withColumn("value", monotonically_increasing_id())
      .sort("value")

    df.write.mode("overwrite")
      .option("orc.compress.size", blockSize)
      .option("orc.stripe.size", blockSize)
      .format("orc")
      .saveAsTable("orcTable")

    df.write.mode("overwrite")
      .option("parquet.block.size", blockSize)
      .format("parquet")
      .saveAsTable("parquetTable")

    df.write.mode("overwrite").format("csv").saveAsTable("csvTable")
  }

  /**
   * Registers and runs one benchmark: for every table in `benchmarkedTables`, the query
   * `SELECT selectExpr FROM <table> WHERE whereExpr` is timed with the rewrite threshold set to
   * `Int.MaxValue` and then to `RewriteThreshold`.
   *
   * @param values     row count reported to the [[Benchmark]] (used for the per-row rates)
   * @param title      benchmark title
   * @param whereExpr  SQL predicate placed in the WHERE clause
   * @param selectExpr SQL projection, `*` by default
   */
  def filterPushDownBenchmark(
      values: Int,
      title: String,
      whereExpr: String,
      selectExpr: String = "*"): Unit = {
    val benchmark = new Benchmark(title, values, minNumIters = 5, output = output)

    benchmarkedTables.foreach { case (label, table) =>
      Seq(Int.MaxValue, RewriteThreshold).foreach { threshold =>
        // Case names are kept exactly as before (including the trailing space of the baseline
        // case) so that previously generated result files remain comparable.
        val suffix = if (threshold == RewriteThreshold) "(Rewrite InSet)" else ""
        benchmark.addCase(s"$label $suffix") { _ =>
          withSQLConf("spark.sql.optimizer.inSetRewriteMinMaxThreshold" -> s"$threshold") {
            spark.sql(s"SELECT $selectExpr FROM $table WHERE $whereExpr").noop()
          }
        }
      }
    }

    benchmark.run()
  }

  /**
   * Prepares the tables once, then benchmarks every combination of IN-list size (`count`) and
   * `distribution`, where the IN-list values are drawn at random from the first
   * `distribution` percent of the `value` range.
   */
  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
    runBenchmark("Pushdown benchmark for rewrite InSet") {
      withTempPath { dir =>
        // All three tables are persistent tables created by `saveAsTable`, so they must be
        // dropped with DROP TABLE; dropping temp views would leave them behind.
        withTables(benchmarkedTables.map(_._2): _*) {
          prepareTable(dir, numRows)
          Seq(50, 1000, 5000, 20000).foreach { count =>
            Seq(1, 10, 50, 90).foreach { distribution =>
              // Computed in Long so the intermediate product cannot overflow Int.
              val upperBound = (numRows.toLong * distribution / 100).toInt
              val filter = Seq.fill(count)(Random.nextInt(upperBound))
              val whereExpr = s"value in(${filter.mkString(",")})"
              val title = s"Rewrite InSet (values count: $count, distribution: $distribution)"
              filterPushDownBenchmark(numRows, title, whereExpr)
            }
          }
        }
      }
    }
  }
}
```
Result:
```
================================================================================================
Pushdown benchmark for rewrite InSet
================================================================================================
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 50, distribution: 1): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
---------------------------------------------------------------------------------------------------------------------------------
Parquet 8289
8371 68 1.9 527.0 1.0X
Parquet (Rewrite InSet) 598
614 14 26.3 38.0 13.9X
ORC 442
454 20 35.6 28.1 18.8X
ORC (Rewrite InSet) 411
431 20 38.2 26.1 20.2X
CSV 23399
23618 154 0.7 1487.7 0.4X
CSV (Rewrite InSet) 23437
24070 744 0.7 1490.1 0.4X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 50, distribution: 10): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------
Parquet 8191
8244 50 1.9 520.8 1.0X
Parquet (Rewrite InSet) 1166
1178 13 13.5 74.1 7.0X
ORC 500
521 16 31.5 31.8 16.4X
ORC (Rewrite InSet) 514
526 8 30.6 32.7 15.9X
CSV 23447
23704 316 0.7 1490.7 0.3X
CSV (Rewrite InSet) 23639
23821 153 0.7 1502.9 0.3X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 50, distribution: 50): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------
Parquet 8157
8233 56 1.9 518.6 1.0X
Parquet (Rewrite InSet) 4224
4257 42 3.7 268.6 1.9X
ORC 513
536 25 30.7 32.6 15.9X
ORC (Rewrite InSet) 511
530 18 30.8 32.5 16.0X
CSV 23665
24270 795 0.7 1504.6 0.3X
CSV (Rewrite InSet) 23321
23596 221 0.7 1482.7 0.3X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 50, distribution: 90): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
----------------------------------------------------------------------------------------------------------------------------------
Parquet 8225
8335 84 1.9 522.9 1.0X
Parquet (Rewrite InSet) 7138
7218 115 2.2 453.8 1.2X
ORC 526
559 36 29.9 33.4 15.6X
ORC (Rewrite InSet) 507
538 24 31.1 32.2 16.2X
CSV 23411
23731 496 0.7 1488.4 0.4X
CSV (Rewrite InSet) 23470
23546 82 0.7 1492.2 0.4X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 1000, distribution: 1): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------------
Parquet 8744
8845 90 1.8 555.9 1.0X
Parquet (Rewrite InSet) 650
656 4 24.2 41.3 13.5X
ORC 535
559 16 29.4 34.0 16.4X
ORC (Rewrite InSet) 532
551 16 29.5 33.9 16.4X
CSV 30467
32289 1496 0.5 1937.0 0.3X
CSV (Rewrite InSet) 23981
24614 596 0.7 1524.7 0.4X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 1000, distribution: 10): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------------
Parquet 8383
8468 82 1.9 533.0 1.0X
Parquet (Rewrite InSet) 1351
1362 9 11.6 85.9 6.2X
ORC 1048
1069 19 15.0 66.6 8.0X
ORC (Rewrite InSet) 1052
1071 28 15.0 66.9 8.0X
CSV 30950
32767 1238 0.5 1967.7 0.3X
CSV (Rewrite InSet) 24209
24513 396 0.6 1539.2 0.3X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 1000, distribution: 50): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------------
Parquet 8402
8481 55 1.9 534.2 1.0X
Parquet (Rewrite InSet) 4532
4677 186 3.5 288.1 1.9X
ORC 2621
2659 46 6.0 166.6 3.2X
ORC (Rewrite InSet) 2631
2738 193 6.0 167.2 3.2X
CSV 30098
30226 79 0.5 1913.6 0.3X
CSV (Rewrite InSet) 27913
28481 693 0.6 1774.7 0.3X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 1000, distribution: 90): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------------
Parquet 8420
8468 65 1.9 535.4 1.0X
Parquet (Rewrite InSet) 7621
7781 191 2.1 484.5 1.1X
ORC 3108
3167 53 5.1 197.6 2.7X
ORC (Rewrite InSet) 3089
3175 59 5.1 196.4 2.7X
CSV 30555
32254 1187 0.5 1942.6 0.3X
CSV (Rewrite InSet) 31091
31607 480 0.5 1976.7 0.3X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 5000, distribution: 1): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-----------------------------------------------------------------------------------------------------------------------------------
Parquet 9125
9170 55 1.7 580.2 1.0X
Parquet (Rewrite InSet) 1206
1234 18 13.0 76.6 7.6X
ORC 1244
1254 7 12.6 79.1 7.3X
ORC (Rewrite InSet) 1236
1250 12 12.7 78.6 7.4X
CSV 350424
355583 1016 0.0 22279.3 0.0X
CSV (Rewrite InSet) 28577
28875 458 0.6 1816.9 0.3X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 5000, distribution: 10): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------------
Parquet 9162
9408 253 1.7 582.5 1.0X
Parquet (Rewrite InSet) 1911
1930 13 8.2 121.5 4.8X
ORC 1774
1809 41 8.9 112.8 5.2X
ORC (Rewrite InSet) 1769
1785 24 8.9 112.5 5.2X
CSV 364909
368618 NaN 0.0 23200.3 0.0X
CSV (Rewrite InSet) 58985
59425 287 0.3 3750.1 0.2X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 5000, distribution: 50): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------------
Parquet 9218
9499 173 1.7 586.1 1.0X
Parquet (Rewrite InSet) 5109
5139 32 3.1 324.8 1.8X
ORC 4089
4137 72 3.8 260.0 2.3X
ORC (Rewrite InSet) 4056
4121 93 3.9 257.9 2.3X
CSV 359994
364490 790 0.0 22887.8 0.0X
CSV (Rewrite InSet) 196472
202225 721 0.1 12491.4 0.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 5000, distribution: 90): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------------
Parquet 9147
9247 64 1.7 581.6 1.0X
Parquet (Rewrite InSet) 8369
8520 179 1.9 532.1 1.1X
ORC 6267
6305 47 2.5 398.4 1.5X
ORC (Rewrite InSet) 6289
6435 199 2.5 399.8 1.5X
CSV 369254
371915 697 0.0 23476.6 0.0X
CSV (Rewrite InSet) 326837
329082 NaN 0.0 20779.7 0.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 20000, distribution: 1): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------------------
Parquet 11866
11944 105 1.3 754.4 1.0X
Parquet (Rewrite InSet) 3578
3670 81 4.4 227.5 3.3X
ORC 4119
4152 33 3.8 261.9 2.9X
ORC (Rewrite InSet) 4054
4181 84 3.9 257.7 2.9X
CSV 2319345
2350577 153 0.0 147460.0 0.0X
CSV (Rewrite InSet) 55273
56287 821 0.3 3514.2 0.2X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 20000, distribution: 10): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------
Parquet 12194
12287 91 1.3 775.3 1.0X
Parquet (Rewrite InSet) 4442
4479 42 3.5 282.4 2.7X
ORC 4805
4847 53 3.3 305.5 2.5X
ORC (Rewrite InSet) 4746
4838 94 3.3 301.7 2.6X
CSV 2958262
2979920 967 0.0 188081.2 0.0X
CSV (Rewrite InSet) 322782
329114 1177 0.0 20521.9 0.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 20000, distribution: 50): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------
Parquet 12138
12205 69 1.3 771.7 1.0X
Parquet (Rewrite InSet) 7760
7901 160 2.0 493.3 1.6X
ORC 7072
7263 148 2.2 449.6 1.7X
ORC (Rewrite InSet) 7094
7225 87 2.2 451.0 1.7X
CSV 2906664
2948342 220 0.0 184800.7 0.0X
CSV (Rewrite InSet) 1367893
1393413 1348 0.0 86968.3 0.0X
Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux
3.10.0-957.10.1.el7.x86_64
Intel Core Processor (Broadwell, IBRS)
Rewrite InSet (values count: 20000, distribution: 90): Best Time(ms) Avg
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
-------------------------------------------------------------------------------------------------------------------------------------
Parquet 12230
12593 387 1.3 777.5 1.0X
Parquet (Rewrite InSet) 11262
11580 263 1.4 716.0 1.1X
ORC 9712
9794 75 1.6 617.5 1.3X
ORC (Rewrite InSet) 9658
9763 109 1.6 614.1 1.3X
CSV 2776344
2807999 1140 0.0 176515.2 0.0X
CSV (Rewrite InSet) 2506408
2519162 802 0.0 159353.1 0.0X
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]