wangyum commented on pull request #29642:
URL: https://github.com/apache/spark/pull/29642#issuecomment-738661573


   ```scala
   package org.apache.spark.sql.execution.benchmark
   
   import java.io.File
   
   import scala.util.Random
   
   import org.apache.spark.SparkConf
   import org.apache.spark.benchmark.Benchmark
   import org.apache.spark.sql.SparkSession
   import org.apache.spark.sql.functions.monotonically_increasing_id
   
   /**
    * Benchmark to measure the read performance of InSet filter pushdown.
    * To run this benchmark:
    * {{{
    *   1. without sbt: bin/spark-submit --class <this class> <spark sql test jar>
    *   2. build/sbt "sql/test:runMain <this class>"
    *   3. generate result:
    *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
    *      Results will be written to "benchmarks/InSetFilterPushdownBenchmark-results.txt".
    * }}}
    */
   object InSetFilterPushdownBenchmark extends SqlBasedBenchmark {

     /**
      * Builds the session used by every benchmark case: a single-threaded local master, with
      * driver memory and the ORC/Parquet compression settings applied only when not already set.
      */
     override def getSparkSession: SparkSession = {
       val conf = new SparkConf()
         .setAppName(this.getClass.getSimpleName)
         // Since `spark.master` always exists, overrides this value
         .set("spark.master", "local[1]")
         .setIfMissing("spark.driver.memory", "3g")
         .setIfMissing("orc.compression", "snappy")
         .setIfMissing("spark.sql.parquet.compression.codec", "snappy")

       SparkSession.builder().config(conf).getOrCreate()
     }

     private val numRows = 1024 * 1024 * 15
     private val width = 5
     // For Parquet/ORC, we will use the same value for block size and compression size
     private val blockSize = org.apache.parquet.hadoop.ParquetWriter.DEFAULT_PAGE_SIZE

     // Optimizer setting that every benchmark case overrides.
     private val rewriteThresholdKey = "spark.sql.optimizer.inSetRewriteMinMaxThreshold"
     // Value applied for the "(Rewrite InSet)" cases; the plain cases use Int.MaxValue.
     // NOTE(review): presumably Int.MaxValue effectively disables the rewrite -- confirm against
     // the optimizer rule that reads this setting.
     private val rewriteThreshold = 10
     // (case label, table name) pairs, in the order their cases are registered.
     private val benchmarkedTables =
       Seq("Parquet" -> "parquetTable", "ORC" -> "orcTable", "CSV" -> "csvTable")

     /**
      * Runs `f`, then removes every relation named in `tableNames`, even when `f` throws.
      *
      * Each name is first dropped as a temporary view. When no such view exists it is dropped as
      * a catalog table instead: `prepareTable` creates its relations with `saveAsTable`, and
      * `dropTempView` alone would leave those behind.
      *
      * @param tableNames names of the temporary views or tables to drop once `f` has finished
      * @param f          the body to run while the relations exist
      */
     def withTempTable(tableNames: String*)(f: => Unit): Unit = {
       try f finally {
         tableNames.foreach { name =>
           if (!spark.catalog.dropTempView(name)) {
             spark.sql(s"DROP TABLE IF EXISTS $name")
           }
         }
       }
     }

     /**
      * Writes one generated data set three times, as `orcTable`, `parquetTable` and `csvTable`.
      *
      * Every row carries `width` string columns (`c1`, `c2`, ...) that are all cast from the same
      * random long, plus a `value` column produced by `monotonically_increasing_id()`. The rows
      * are sorted by `value` before being written.
      *
      * @param dir     temporary directory supplied by the caller; currently unused, because the
      *                tables are written through `saveAsTable` rather than to an explicit path
      * @param numRows number of rows to generate
      */
     private def prepareTable(dir: File, numRows: Int): Unit = {
       import spark.implicits._
       val stringColumns = (1 to width).map(i => s"CAST(value AS STRING) c$i")
       val df = spark.range(numRows)
         .map(_ => Random.nextLong)
         .selectExpr(stringColumns: _*)
         .withColumn("value", monotonically_increasing_id())
         .sort("value")

       df.write.mode("overwrite")
         .option("orc.compress.size", blockSize)
         .option("orc.stripe.size", blockSize)
         .format("orc")
         .saveAsTable("orcTable")

       df.write.mode("overwrite")
         .option("parquet.block.size", blockSize)
         .format("parquet")
         .saveAsTable("parquetTable")

       df.write.mode("overwrite").format("csv").saveAsTable("csvTable")
     }

     /**
      * Registers and runs the benchmark cases for one filter expression.
      *
      * For each table in `benchmarkedTables` two cases are added, in this order: one with the
      * threshold setting at `Int.MaxValue` and one, labelled "(Rewrite InSet)", with it at
      * `rewriteThreshold`. Each case runs `SELECT selectExpr FROM <table> WHERE whereExpr`.
      *
      * @param values     cardinality passed to the `Benchmark` constructor
      * @param title      benchmark title
      * @param whereExpr  SQL predicate placed after `WHERE`
      * @param selectExpr SQL projection placed after `SELECT`; defaults to `*`
      */
     def filterPushDownBenchmark(
         values: Int,
         title: String,
         whereExpr: String,
         selectExpr: String = "*"): Unit = {
       val benchmark = new Benchmark(title, values, minNumIters = 5, output = output)

       benchmarkedTables.foreach { case (format, table) =>
         Seq(Int.MaxValue, rewriteThreshold).foreach { threshold =>
           val suffix = if (threshold == rewriteThreshold) "(Rewrite InSet)" else ""
           // Keeps the original label shape, including the trailing space of the plain cases.
           val name = s"$format $suffix"
           benchmark.addCase(name) { _ =>
             withSQLConf(rewriteThresholdKey -> s"$threshold") {
               spark.sql(s"SELECT $selectExpr FROM $table WHERE $whereExpr").noop()
             }
           }
         }
       }

       benchmark.run()
     }

     /**
      * Prepares the tables once, then benchmarks an `IN` filter on `value` for every combination
      * of list size (`count`) and `distribution`. The list holds `count` random integers drawn
      * from `[0, numRows * distribution / 100)`.
      *
      * All three tables created by `prepareTable` are dropped when the suite finishes.
      */
     override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
       runBenchmark("Pushdown benchmark for rewrite InSet") {
         withTempPath { dir =>
           withTempTable(benchmarkedTables.map(_._2): _*) {
             prepareTable(dir, numRows)
             Seq(50, 1000, 5000, 20000).foreach { count =>
               Seq(1, 10, 50, 90).foreach { distribution =>
                 // Exclusive upper bound of the sampled values; it stays within Int range for
                 // the distributions used here (at most 90% of numRows).
                 val upperBound = numRows * distribution / 100
                 val filter = Range(0, count).map(_ => Random.nextInt(upperBound))
                 val whereExpr = s"value in(${filter.mkString(",")})"
                 val title = s"Rewrite InSet (values count: $count, distribution: $distribution)"
                 filterPushDownBenchmark(numRows, title, whereExpr)
               }
             }
           }
         }
       }
     }
   }
   ```
   
   Result:
   ```
   
================================================================================================
   Pushdown benchmark for rewrite InSet
   
================================================================================================
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 50, distribution: 1):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
---------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                     8289           
8371          68          1.9         527.0       1.0X
   Parquet (Rewrite InSet)                                      598            
614          14         26.3          38.0      13.9X
   ORC                                                          442            
454          20         35.6          28.1      18.8X
   ORC (Rewrite InSet)                                          411            
431          20         38.2          26.1      20.2X
   CSV                                                        23399          
23618         154          0.7        1487.7       0.4X
   CSV (Rewrite InSet)                                        23437          
24070         744          0.7        1490.1       0.4X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 50, distribution: 10):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
----------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                      8191           
8244          50          1.9         520.8       1.0X
   Parquet (Rewrite InSet)                                      1166           
1178          13         13.5          74.1       7.0X
   ORC                                                           500            
521          16         31.5          31.8      16.4X
   ORC (Rewrite InSet)                                           514            
526           8         30.6          32.7      15.9X
   CSV                                                         23447          
23704         316          0.7        1490.7       0.3X
   CSV (Rewrite InSet)                                         23639          
23821         153          0.7        1502.9       0.3X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 50, distribution: 50):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
----------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                      8157           
8233          56          1.9         518.6       1.0X
   Parquet (Rewrite InSet)                                      4224           
4257          42          3.7         268.6       1.9X
   ORC                                                           513            
536          25         30.7          32.6      15.9X
   ORC (Rewrite InSet)                                           511            
530          18         30.8          32.5      16.0X
   CSV                                                         23665          
24270         795          0.7        1504.6       0.3X
   CSV (Rewrite InSet)                                         23321          
23596         221          0.7        1482.7       0.3X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 50, distribution: 90):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
----------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                      8225           
8335          84          1.9         522.9       1.0X
   Parquet (Rewrite InSet)                                      7138           
7218         115          2.2         453.8       1.2X
   ORC                                                           526            
559          36         29.9          33.4      15.6X
   ORC (Rewrite InSet)                                           507            
538          24         31.1          32.2      16.2X
   CSV                                                         23411          
23731         496          0.7        1488.4       0.4X
   CSV (Rewrite InSet)                                         23470          
23546          82          0.7        1492.2       0.4X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 1000, distribution: 1):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
-----------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                       8744           
8845          90          1.8         555.9       1.0X
   Parquet (Rewrite InSet)                                        650           
 656           4         24.2          41.3      13.5X
   ORC                                                            535           
 559          16         29.4          34.0      16.4X
   ORC (Rewrite InSet)                                            532           
 551          16         29.5          33.9      16.4X
   CSV                                                          30467          
32289        1496          0.5        1937.0       0.3X
   CSV (Rewrite InSet)                                          23981          
24614         596          0.7        1524.7       0.4X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 1000, distribution: 10):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
------------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                        8383          
 8468          82          1.9         533.0       1.0X
   Parquet (Rewrite InSet)                                        1351          
 1362           9         11.6          85.9       6.2X
   ORC                                                            1048          
 1069          19         15.0          66.6       8.0X
   ORC (Rewrite InSet)                                            1052          
 1071          28         15.0          66.9       8.0X
   CSV                                                           30950          
32767        1238          0.5        1967.7       0.3X
   CSV (Rewrite InSet)                                           24209          
24513         396          0.6        1539.2       0.3X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 1000, distribution: 50):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
------------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                        8402          
 8481          55          1.9         534.2       1.0X
   Parquet (Rewrite InSet)                                        4532          
 4677         186          3.5         288.1       1.9X
   ORC                                                            2621          
 2659          46          6.0         166.6       3.2X
   ORC (Rewrite InSet)                                            2631          
 2738         193          6.0         167.2       3.2X
   CSV                                                           30098          
30226          79          0.5        1913.6       0.3X
   CSV (Rewrite InSet)                                           27913          
28481         693          0.6        1774.7       0.3X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 1000, distribution: 90):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
------------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                        8420          
 8468          65          1.9         535.4       1.0X
   Parquet (Rewrite InSet)                                        7621          
 7781         191          2.1         484.5       1.1X
   ORC                                                            3108          
 3167          53          5.1         197.6       2.7X
   ORC (Rewrite InSet)                                            3089          
 3175          59          5.1         196.4       2.7X
   CSV                                                           30555          
32254        1187          0.5        1942.6       0.3X
   CSV (Rewrite InSet)                                           31091          
31607         480          0.5        1976.7       0.3X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 5000, distribution: 1):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
-----------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                       9125           
9170          55          1.7         580.2       1.0X
   Parquet (Rewrite InSet)                                       1206           
1234          18         13.0          76.6       7.6X
   ORC                                                           1244           
1254           7         12.6          79.1       7.3X
   ORC (Rewrite InSet)                                           1236           
1250          12         12.7          78.6       7.4X
   CSV                                                         350424         
355583        1016          0.0       22279.3       0.0X
   CSV (Rewrite InSet)                                          28577          
28875         458          0.6        1816.9       0.3X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 5000, distribution: 10):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
------------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                        9162          
 9408         253          1.7         582.5       1.0X
   Parquet (Rewrite InSet)                                        1911          
 1930          13          8.2         121.5       4.8X
   ORC                                                            1774          
 1809          41          8.9         112.8       5.2X
   ORC (Rewrite InSet)                                            1769          
 1785          24          8.9         112.5       5.2X
   CSV                                                          364909         
368618         NaN          0.0       23200.3       0.0X
   CSV (Rewrite InSet)                                           58985          
59425         287          0.3        3750.1       0.2X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 5000, distribution: 50):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
------------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                        9218          
 9499         173          1.7         586.1       1.0X
   Parquet (Rewrite InSet)                                        5109          
 5139          32          3.1         324.8       1.8X
   ORC                                                            4089          
 4137          72          3.8         260.0       2.3X
   ORC (Rewrite InSet)                                            4056          
 4121          93          3.9         257.9       2.3X
   CSV                                                          359994         
364490         790          0.0       22887.8       0.0X
   CSV (Rewrite InSet)                                          196472         
202225         721          0.1       12491.4       0.0X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 5000, distribution: 90):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
------------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                        9147          
 9247          64          1.7         581.6       1.0X
   Parquet (Rewrite InSet)                                        8369          
 8520         179          1.9         532.1       1.1X
   ORC                                                            6267          
 6305          47          2.5         398.4       1.5X
   ORC (Rewrite InSet)                                            6289          
 6435         199          2.5         399.8       1.5X
   CSV                                                          369254         
371915         697          0.0       23476.6       0.0X
   CSV (Rewrite InSet)                                          326837         
329082         NaN          0.0       20779.7       0.0X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 20000, distribution: 1):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
------------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                       11866          
11944         105          1.3         754.4       1.0X
   Parquet (Rewrite InSet)                                        3578          
 3670          81          4.4         227.5       3.3X
   ORC                                                            4119          
 4152          33          3.8         261.9       2.9X
   ORC (Rewrite InSet)                                            4054          
 4181          84          3.9         257.7       2.9X
   CSV                                                         2319345        
2350577         153          0.0      147460.0       0.0X
   CSV (Rewrite InSet)                                           55273          
56287         821          0.3        3514.2       0.2X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 20000, distribution: 10):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
-------------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                        12194         
 12287          91          1.3         775.3       1.0X
   Parquet (Rewrite InSet)                                         4442         
  4479          42          3.5         282.4       2.7X
   ORC                                                             4805         
  4847          53          3.3         305.5       2.5X
   ORC (Rewrite InSet)                                             4746         
  4838          94          3.3         301.7       2.6X
   CSV                                                          2958262        
2979920         967          0.0      188081.2       0.0X
   CSV (Rewrite InSet)                                           322782         
329114        1177          0.0       20521.9       0.0X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 20000, distribution: 50):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
-------------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                        12138         
 12205          69          1.3         771.7       1.0X
   Parquet (Rewrite InSet)                                         7760         
  7901         160          2.0         493.3       1.6X
   ORC                                                             7072         
  7263         148          2.2         449.6       1.7X
   ORC (Rewrite InSet)                                             7094         
  7225          87          2.2         451.0       1.7X
   CSV                                                          2906664        
2948342         220          0.0      184800.7       0.0X
   CSV (Rewrite InSet)                                          1367893        
1393413        1348          0.0       86968.3       0.0X
   
   Java HotSpot(TM) 64-Bit Server VM 1.8.0_221-b11 on Linux 
3.10.0-957.10.1.el7.x86_64
   Intel Core Processor (Broadwell, IBRS)
   Rewrite InSet (values count: 20000, distribution: 90):  Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   
-------------------------------------------------------------------------------------------------------------------------------------
   Parquet                                                        12230         
 12593         387          1.3         777.5       1.0X
   Parquet (Rewrite InSet)                                        11262         
 11580         263          1.4         716.0       1.1X
   ORC                                                             9712         
  9794          75          1.6         617.5       1.3X
   ORC (Rewrite InSet)                                             9658         
  9763         109          1.6         614.1       1.3X
   CSV                                                          2776344        
2807999        1140          0.0      176515.2       0.0X
   CSV (Rewrite InSet)                                          2506408        
2519162         802          0.0      159353.1       0.0X
   
   ```


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to