Repository: spark
Updated Branches:
  refs/heads/master 83e19d5b8 -> 8115e6b26


[SPARK-25662][SQL][TEST] Refactor DataSourceReadBenchmark to use main method

## What changes were proposed in this pull request?

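1. Refactor `DataSourceReadBenchmark` to extend `BenchmarkBase` and run via a main method, writing its results to `sql/core/benchmarks/DataSourceReadBenchmark-results.txt` instead of keeping them as inline code comments.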

## How was this patch tested?

Manually tested and regenerated results.
```
SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.DataSourceReadBenchmark"
```

Closes #22664 from peter-toth/SPARK-25662.

Lead-authored-by: Peter Toth <[email protected]>
Co-authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: DB Tsai <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8115e6b2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8115e6b2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8115e6b2

Branch: refs/heads/master
Commit: 8115e6b26916c42491d712c06c73c045f4ee17e1
Parents: 83e19d5
Author: Peter Toth <[email protected]>
Authored: Thu Oct 11 20:27:07 2018 +0000
Committer: DB Tsai <[email protected]>
Committed: Thu Oct 11 20:27:07 2018 +0000

----------------------------------------------------------------------
 .../DataSourceReadBenchmark-results.txt         | 269 +++++++++++++++++
 .../benchmark/DataSourceReadBenchmark.scala     | 300 +++----------------
 2 files changed, 316 insertions(+), 253 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/8115e6b2/sql/core/benchmarks/DataSourceReadBenchmark-results.txt
----------------------------------------------------------------------
diff --git a/sql/core/benchmarks/DataSourceReadBenchmark-results.txt b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt
new file mode 100644
index 0000000..2d3bae4
--- /dev/null
+++ b/sql/core/benchmarks/DataSourceReadBenchmark-results.txt
@@ -0,0 +1,269 @@
+================================================================================================
+SQL Single Numeric Column Scan
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single TINYINT Column Scan:          Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                     21508 / 22112          0.7        
1367.5       1.0X
+SQL Json                                      8705 / 8825          1.8         
553.4       2.5X
+SQL Parquet Vectorized                         157 /  186        100.0         
 10.0     136.7X
+SQL Parquet MR                                1789 / 1794          8.8         
113.8      12.0X
+SQL ORC Vectorized                             156 /  166        100.9         
  9.9     138.0X
+SQL ORC Vectorized with copy                   218 /  225         72.1         
 13.9      98.6X
+SQL ORC MR                                    1448 / 1492         10.9         
 92.0      14.9X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Parquet Reader Single TINYINT Column Scan: Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+ParquetReader Vectorized                       202 /  211         77.7         
 12.9       1.0X
+ParquetReader Vectorized -> Row                118 /  120        133.5         
  7.5       1.7X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single SMALLINT Column Scan:         Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                     23282 / 23312          0.7        
1480.2       1.0X
+SQL Json                                      9187 / 9189          1.7         
584.1       2.5X
+SQL Parquet Vectorized                         204 /  218         77.0         
 13.0     114.0X
+SQL Parquet MR                                1941 / 1953          8.1         
123.4      12.0X
+SQL ORC Vectorized                             217 /  225         72.6         
 13.8     107.5X
+SQL ORC Vectorized with copy                   279 /  289         56.3         
 17.8      83.4X
+SQL ORC MR                                    1541 / 1549         10.2         
 98.0      15.1X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Parquet Reader Single SMALLINT Column Scan: Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+ParquetReader Vectorized                       288 /  297         54.6         
 18.3       1.0X
+ParquetReader Vectorized -> Row                255 /  257         61.7         
 16.2       1.1X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single INT Column Scan:              Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                     24990 / 25012          0.6        
1588.8       1.0X
+SQL Json                                      9837 / 9865          1.6         
625.4       2.5X
+SQL Parquet Vectorized                         170 /  180         92.3         
 10.8     146.6X
+SQL Parquet MR                                2319 / 2328          6.8         
147.4      10.8X
+SQL ORC Vectorized                             293 /  301         53.7         
 18.6      85.3X
+SQL ORC Vectorized with copy                   297 /  309         52.9         
 18.9      84.0X
+SQL ORC MR                                    1667 / 1674          9.4         
106.0      15.0X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Parquet Reader Single INT Column Scan:   Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+ParquetReader Vectorized                       257 /  274         61.3         
 16.3       1.0X
+ParquetReader Vectorized -> Row                259 /  264         60.8         
 16.4       1.0X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single BIGINT Column Scan:           Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                     32537 / 32554          0.5        
2068.7       1.0X
+SQL Json                                    12610 / 12668          1.2         
801.7       2.6X
+SQL Parquet Vectorized                         258 /  276         61.0         
 16.4     126.2X
+SQL Parquet MR                                2422 / 2435          6.5         
154.0      13.4X
+SQL ORC Vectorized                             378 /  385         41.6         
 24.0      86.2X
+SQL ORC Vectorized with copy                   381 /  389         41.3         
 24.2      85.4X
+SQL ORC MR                                    1797 / 1819          8.8         
114.3      18.1X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Parquet Reader Single BIGINT Column Scan: Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+ParquetReader Vectorized                       352 /  368         44.7         
 22.4       1.0X
+ParquetReader Vectorized -> Row                351 /  359         44.8         
 22.3       1.0X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single FLOAT Column Scan:            Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                     27179 / 27184          0.6        
1728.0       1.0X
+SQL Json                                    12578 / 12585          1.3         
799.7       2.2X
+SQL Parquet Vectorized                         161 /  171         97.5         
 10.3     168.5X
+SQL Parquet MR                                2361 / 2395          6.7         
150.1      11.5X
+SQL ORC Vectorized                             473 /  480         33.3         
 30.0      57.5X
+SQL ORC Vectorized with copy                   478 /  483         32.9         
 30.4      56.8X
+SQL ORC MR                                    1858 / 1859          8.5         
118.2      14.6X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Parquet Reader Single FLOAT Column Scan: Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+ParquetReader Vectorized                       251 /  255         62.7         
 15.9       1.0X
+ParquetReader Vectorized -> Row                255 /  259         61.8         
 16.2       1.0X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single DOUBLE Column Scan:           Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                     34797 / 34830          0.5        
2212.3       1.0X
+SQL Json                                    17806 / 17828          0.9        
1132.1       2.0X
+SQL Parquet Vectorized                         260 /  269         60.6         
 16.5     134.0X
+SQL Parquet MR                                2512 / 2534          6.3         
159.7      13.9X
+SQL ORC Vectorized                             582 /  593         27.0         
 37.0      59.8X
+SQL ORC Vectorized with copy                   576 /  584         27.3         
 36.6      60.4X
+SQL ORC MR                                    2309 / 2313          6.8         
146.8      15.1X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Parquet Reader Single DOUBLE Column Scan: Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+ParquetReader Vectorized                       350 /  363         44.9         
 22.3       1.0X
+ParquetReader Vectorized -> Row                350 /  366         44.9         
 22.3       1.0X
+
+
+================================================================================================
+Int and String Scan
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Int and String Scan:                     Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                     22486 / 22590          0.5        
2144.5       1.0X
+SQL Json                                    14124 / 14195          0.7        
1347.0       1.6X
+SQL Parquet Vectorized                        2342 / 2347          4.5         
223.4       9.6X
+SQL Parquet MR                                4660 / 4664          2.2         
444.4       4.8X
+SQL ORC Vectorized                            2378 / 2379          4.4         
226.8       9.5X
+SQL ORC Vectorized with copy                  2548 / 2571          4.1         
243.0       8.8X
+SQL ORC MR                                    4206 / 4211          2.5         
401.1       5.3X
+
+
+================================================================================================
+Repeated String Scan
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Repeated String:                         Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                     12150 / 12178          0.9        
1158.7       1.0X
+SQL Json                                      7012 / 7014          1.5         
668.7       1.7X
+SQL Parquet Vectorized                         792 /  796         13.2         
 75.5      15.3X
+SQL Parquet MR                                1961 / 1975          5.3         
187.0       6.2X
+SQL ORC Vectorized                             482 /  485         21.8         
 46.0      25.2X
+SQL ORC Vectorized with copy                   710 /  715         14.8         
 67.7      17.1X
+SQL ORC MR                                    2081 / 2083          5.0         
198.5       5.8X
+
+
+================================================================================================
+Partitioned Table Scan
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Partitioned Table:                       Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Data column - CSV                           31789 / 31791          0.5        
2021.1       1.0X
+Data column - Json                          12873 / 12918          1.2         
818.4       2.5X
+Data column - Parquet Vectorized               267 /  280         58.9         
 17.0     119.1X
+Data column - Parquet MR                      3387 / 3402          4.6         
215.3       9.4X
+Data column - ORC Vectorized                   391 /  453         40.2         
 24.9      81.2X
+Data column - ORC Vectorized with copy         392 /  398         40.2         
 24.9      81.2X
+Data column - ORC MR                          2508 / 2512          6.3         
159.4      12.7X
+Partition column - CSV                        6965 / 6977          2.3         
442.8       4.6X
+Partition column - Json                       5563 / 5576          2.8         
353.7       5.7X
+Partition column - Parquet Vectorized           65 /   78        241.1         
  4.1     487.2X
+Partition column - Parquet MR                 1811 / 1811          8.7         
115.1      17.6X
+Partition column - ORC Vectorized               66 /   73        239.0         
  4.2     483.0X
+Partition column - ORC Vectorized with copy        65 /   70        241.1      
     4.1     487.3X
+Partition column - ORC MR                     1775 / 1778          8.9         
112.8      17.9X
+Both columns - CSV                          30032 / 30113          0.5        
1909.4       1.1X
+Both columns - Json                         13941 / 13959          1.1         
886.3       2.3X
+Both columns - Parquet Vectorized              312 /  330         50.3         
 19.9     101.7X
+Both columns - Parquet MR                     3858 / 3862          4.1         
245.3       8.2X
+Both columns - ORC Vectorized                  431 /  437         36.5         
 27.4      73.8X
+Both column - ORC Vectorized with copy         523 /  529         30.1         
 33.3      60.7X
+Both columns - ORC MR                         2712 / 2805          5.8         
172.4      11.7X
+
+
+================================================================================================
+String with Nulls Scan
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+String with Nulls Scan:                  Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                     13525 / 13823          0.8        
1289.9       1.0X
+SQL Json                                      9913 / 9921          1.1         
945.3       1.4X
+SQL Parquet Vectorized                        1517 / 1517          6.9         
144.7       8.9X
+SQL Parquet MR                                3996 / 4008          2.6         
381.1       3.4X
+ParquetReader Vectorized                      1120 / 1128          9.4         
106.8      12.1X
+SQL ORC Vectorized                            1203 / 1224          8.7         
114.7      11.2X
+SQL ORC Vectorized with copy                  1639 / 1646          6.4         
156.3       8.3X
+SQL ORC MR                                    3720 / 3780          2.8         
354.7       3.6X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+String with Nulls Scan:                  Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                     15860 / 15877          0.7        
1512.5       1.0X
+SQL Json                                      7676 / 7688          1.4         
732.0       2.1X
+SQL Parquet Vectorized                        1072 / 1084          9.8         
102.2      14.8X
+SQL Parquet MR                                2890 / 2897          3.6         
275.6       5.5X
+ParquetReader Vectorized                      1052 / 1053         10.0         
100.4      15.1X
+SQL ORC Vectorized                            1248 / 1248          8.4         
119.0      12.7X
+SQL ORC Vectorized with copy                  1627 / 1637          6.4         
155.2       9.7X
+SQL ORC MR                                    3365 / 3369          3.1         
320.9       4.7X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+String with Nulls Scan:                  Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                     13401 / 13561          0.8        
1278.1       1.0X
+SQL Json                                      5253 / 5303          2.0         
500.9       2.6X
+SQL Parquet Vectorized                         233 /  242         45.0         
 22.2      57.6X
+SQL Parquet MR                                1791 / 1796          5.9         
170.8       7.5X
+ParquetReader Vectorized                       236 /  238         44.4         
 22.5      56.7X
+SQL ORC Vectorized                             453 /  473         23.2         
 43.2      29.6X
+SQL ORC Vectorized with copy                   573 /  577         18.3         
 54.7      23.4X
+SQL ORC MR                                    1846 / 1850          5.7         
176.0       7.3X
+
+
+================================================================================================
+Single Column Scan From Wide Columns
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Single Column Scan from 10 columns:      Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                       3147 / 3148          0.3        
3001.1       1.0X
+SQL Json                                      2666 / 2693          0.4        
2542.9       1.2X
+SQL Parquet Vectorized                          54 /   58         19.5         
 51.3      58.5X
+SQL Parquet MR                                 220 /  353          4.8         
209.9      14.3X
+SQL ORC Vectorized                              63 /   77         16.8         
 59.7      50.3X
+SQL ORC Vectorized with copy                    63 /   66         16.7         
 59.8      50.2X
+SQL ORC MR                                     317 /  321          3.3         
302.2       9.9X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Single Column Scan from 50 columns:      Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                       7902 / 7921          0.1        
7536.2       1.0X
+SQL Json                                      9467 / 9491          0.1        
9028.6       0.8X
+SQL Parquet Vectorized                          73 /   79         14.3         
 69.8     108.0X
+SQL Parquet MR                                 239 /  247          4.4         
228.0      33.1X
+SQL ORC Vectorized                              78 /   84         13.4         
 74.6     101.0X
+SQL ORC Vectorized with copy                    78 /   88         13.4         
 74.4     101.3X
+SQL ORC MR                                     910 /  918          1.2         
867.6       8.7X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Single Column Scan from 100 columns:     Best/Avg Time(ms)    Rate(M/s)   Per 
Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+SQL CSV                                     13539 / 13543          0.1       
12912.0       1.0X
+SQL Json                                    17420 / 17446          0.1       
16613.1       0.8X
+SQL Parquet Vectorized                         103 /  120         10.2         
 98.1     131.6X
+SQL Parquet MR                                 250 /  258          4.2         
238.9      54.1X
+SQL ORC Vectorized                              99 /  104         10.6         
 94.6     136.5X
+SQL ORC Vectorized with copy                   100 /  106         10.5         
 95.6     135.1X
+SQL ORC MR                                    1653 / 1659          0.6        
1576.3       8.2X
+
+

http://git-wip-us.apache.org/repos/asf/spark/blob/8115e6b2/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala
index 51a7f9f..a1e7f9e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala
@@ -22,7 +22,7 @@ import scala.collection.JavaConverters._
 import scala.util.Random
 
 import org.apache.spark.SparkConf
-import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
 import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row, SparkSession}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.plans.SQLHelper
@@ -34,10 +34,16 @@ import org.apache.spark.sql.vectorized.ColumnVector
 
 /**
  * Benchmark to measure data source read performance.
- * To run this:
- *  spark-submit --class <this class> <spark sql test jar>
+ * To run this benchmark:
+ * {{{
+ *   1. without sbt: bin/spark-submit --class <this class>
+ *        --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar>
+ *   2. build/sbt "sql/test:runMain <this class>"
+ *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
+ *      Results will be written to "benchmarks/DataSourceReadBenchmark-results.txt".
+ * }}}
  */
-object DataSourceReadBenchmark extends SQLHelper {
+object DataSourceReadBenchmark extends BenchmarkBase with SQLHelper {
   val conf = new SparkConf()
     .setAppName("DataSourceReadBenchmark")
     // Since `spark.master` always exists, overrides this value
@@ -93,11 +99,16 @@ object DataSourceReadBenchmark extends SQLHelper {
 
   def numericScanBenchmark(values: Int, dataType: DataType): Unit = {
     // Benchmarks running through spark sql.
-    val sqlBenchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values)
+    val sqlBenchmark = new Benchmark(
+      s"SQL Single ${dataType.sql} Column Scan",
+      values,
+      output = output)
 
     // Benchmarks driving reader component directly.
     val parquetReaderBenchmark = new Benchmark(
-      s"Parquet Reader Single ${dataType.sql} Column Scan", values)
+      s"Parquet Reader Single ${dataType.sql} Column Scan",
+      values,
+      output = output)
 
     withTempPath { dir =>
       withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") {
@@ -140,74 +151,6 @@ object DataSourceReadBenchmark extends SQLHelper {
           }
         }
 
-        /*
-        OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 
4.14.33-51.37.amzn1.x86_64
-        Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
-        SQL Single TINYINT Column Scan:      Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                 22964 / 23096          0.7     
   1460.0       1.0X
-        SQL Json                                  8469 / 8593          1.9     
    538.4       2.7X
-        SQL Parquet Vectorized                     164 /  177         95.8     
     10.4     139.9X
-        SQL Parquet MR                            1687 / 1706          9.3     
    107.2      13.6X
-        SQL ORC Vectorized                         191 /  197         82.3     
     12.2     120.2X
-        SQL ORC Vectorized with copy               215 /  219         73.2     
     13.7     106.9X
-        SQL ORC MR                                1392 / 1412         11.3     
     88.5      16.5X
-
-
-        SQL Single SMALLINT Column Scan:     Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                 24090 / 24097          0.7     
   1531.6       1.0X
-        SQL Json                                  8791 / 8813          1.8     
    558.9       2.7X
-        SQL Parquet Vectorized                     204 /  212         77.0     
     13.0     117.9X
-        SQL Parquet MR                            1813 / 1850          8.7     
    115.3      13.3X
-        SQL ORC Vectorized                         226 /  230         69.7     
     14.4     106.7X
-        SQL ORC Vectorized with copy               295 /  298         53.3     
     18.8      81.6X
-        SQL ORC MR                                1526 / 1549         10.3     
     97.1      15.8X
-
-
-        SQL Single INT Column Scan:          Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                 25637 / 25791          0.6     
   1629.9       1.0X
-        SQL Json                                  9532 / 9570          1.7     
    606.0       2.7X
-        SQL Parquet Vectorized                     181 /  191         86.8     
     11.5     141.5X
-        SQL Parquet MR                            2210 / 2227          7.1     
    140.5      11.6X
-        SQL ORC Vectorized                         309 /  317         50.9     
     19.6      83.0X
-        SQL ORC Vectorized with copy               316 /  322         49.8     
     20.1      81.2X
-        SQL ORC MR                                1650 / 1680          9.5     
    104.9      15.5X
-
-
-        SQL Single BIGINT Column Scan:       Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                 31617 / 31764          0.5     
   2010.1       1.0X
-        SQL Json                                12440 / 12451          1.3     
    790.9       2.5X
-        SQL Parquet Vectorized                     284 /  315         55.4     
     18.0     111.4X
-        SQL Parquet MR                            2382 / 2390          6.6     
    151.5      13.3X
-        SQL ORC Vectorized                         398 /  403         39.5     
     25.3      79.5X
-        SQL ORC Vectorized with copy               410 /  413         38.3     
     26.1      77.1X
-        SQL ORC MR                                1783 / 1813          8.8     
    113.4      17.7X
-
-
-        SQL Single FLOAT Column Scan:        Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                 26679 / 26742          0.6     
   1696.2       1.0X
-        SQL Json                                12490 / 12541          1.3     
    794.1       2.1X
-        SQL Parquet Vectorized                     174 /  183         90.4     
     11.1     153.3X
-        SQL Parquet MR                            2201 / 2223          7.1     
    140.0      12.1X
-        SQL ORC Vectorized                         415 /  429         37.9     
     26.4      64.3X
-        SQL ORC Vectorized with copy               422 /  428         37.2     
     26.9      63.2X
-        SQL ORC MR                                1767 / 1773          8.9     
    112.3      15.1X
-
-
-        SQL Single DOUBLE Column Scan:       Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                 34223 / 34324          0.5     
   2175.8       1.0X
-        SQL Json                                17784 / 17785          0.9     
   1130.7       1.9X
-        SQL Parquet Vectorized                     277 /  283         56.7     
     17.6     123.4X
-        SQL Parquet MR                            2356 / 2386          6.7     
    149.8      14.5X
-        SQL ORC Vectorized                         533 /  536         29.5     
     33.9      64.2X
-        SQL ORC Vectorized with copy               541 /  546         29.1     
     34.4      63.3X
-        SQL ORC MR                                2166 / 2177          7.3     
    137.7      15.8X
-        */
         sqlBenchmark.run()
 
         // Driving the parquet reader in batch mode directly.
@@ -279,51 +222,13 @@ object DataSourceReadBenchmark extends SQLHelper {
           }
         }
 
-        /*
-        OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 
4.14.33-51.37.amzn1.x86_64
-        Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
-        Single TINYINT Column Scan:          Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        ParquetReader Vectorized                   198 /  202         79.4     
     12.6       1.0X
-        ParquetReader Vectorized -> Row            119 /  121        132.3     
      7.6       1.7X
-
-
-        Single SMALLINT Column Scan:         Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        ParquetReader Vectorized                   282 /  287         55.8     
     17.9       1.0X
-        ParquetReader Vectorized -> Row            246 /  247         64.0     
     15.6       1.1X
-
-
-        Single INT Column Scan:              Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        ParquetReader Vectorized                   258 /  262         60.9     
     16.4       1.0X
-        ParquetReader Vectorized -> Row            259 /  260         60.8     
     16.5       1.0X
-
-
-        Single BIGINT Column Scan:           Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        ParquetReader Vectorized                   361 /  369         43.6     
     23.0       1.0X
-        ParquetReader Vectorized -> Row            361 /  371         43.6     
     22.9       1.0X
-
-
-        Single FLOAT Column Scan:            Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        ParquetReader Vectorized                   253 /  261         62.2     
     16.1       1.0X
-        ParquetReader Vectorized -> Row            254 /  256         61.9     
     16.2       1.0X
-
-
-        Single DOUBLE Column Scan:           Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        ParquetReader Vectorized                   357 /  364         44.0     
     22.7       1.0X
-        ParquetReader Vectorized -> Row            358 /  366         44.0     
     22.7       1.0X
-        */
         parquetReaderBenchmark.run()
       }
     }
   }
 
   def intStringScanBenchmark(values: Int): Unit = {
-    val benchmark = new Benchmark("Int and String Scan", values)
+    val benchmark = new Benchmark("Int and String Scan", values, output = output)
 
     withTempPath { dir =>
       withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") {
@@ -368,26 +273,13 @@ object DataSourceReadBenchmark extends SQLHelper {
           }
         }
 
-        /*
-        OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 
4.14.33-51.37.amzn1.x86_64
-        Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
-        Int and String Scan:                 Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                 27145 / 27158          0.4     
   2588.7       1.0X
-        SQL Json                                12969 / 13337          0.8     
   1236.8       2.1X
-        SQL Parquet Vectorized                    2419 / 2448          4.3     
    230.7      11.2X
-        SQL Parquet MR                            4631 / 4633          2.3     
    441.7       5.9X
-        SQL ORC Vectorized                        2412 / 2465          4.3     
    230.0      11.3X
-        SQL ORC Vectorized with copy              2633 / 2675          4.0     
    251.1      10.3X
-        SQL ORC MR                                4280 / 4350          2.4     
    408.2       6.3X
-        */
         benchmark.run()
       }
     }
   }
 
   def repeatedStringScanBenchmark(values: Int): Unit = {
-    val benchmark = new Benchmark("Repeated String", values)
+    val benchmark = new Benchmark("Repeated String", values, output = output)
 
     withTempPath { dir =>
       withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") {
@@ -432,26 +324,13 @@ object DataSourceReadBenchmark extends SQLHelper {
           }
         }
 
-        /*
-        OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 
4.14.33-51.37.amzn1.x86_64
-        Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
-        Repeated String:                     Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                 17345 / 17424          0.6     
   1654.1       1.0X
-        SQL Json                                  8639 / 8664          1.2     
    823.9       2.0X
-        SQL Parquet Vectorized                     839 /  854         12.5     
     80.0      20.7X
-        SQL Parquet MR                            1771 / 1775          5.9     
    168.9       9.8X
-        SQL ORC Vectorized                         550 /  569         19.1     
     52.4      31.6X
-        SQL ORC Vectorized with copy               785 /  849         13.4     
     74.9      22.1X
-        SQL ORC MR                                2168 / 2202          4.8     
    206.7       8.0X
-        */
         benchmark.run()
       }
     }
   }
 
   def partitionTableScanBenchmark(values: Int): Unit = {
-    val benchmark = new Benchmark("Partitioned Table", values)
+    val benchmark = new Benchmark("Partitioned Table", values, output = output)
 
     withTempPath { dir =>
       withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") {
@@ -562,40 +441,13 @@ object DataSourceReadBenchmark extends SQLHelper {
           }
         }
 
-        /*
-        OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 
4.14.33-51.37.amzn1.x86_64
-        Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
-        Partitioned Table:                   Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        Data column - CSV                       32613 / 32841          0.5     
   2073.4       1.0X
-        Data column - Json                      13343 / 13469          1.2     
    848.3       2.4X
-        Data column - Parquet Vectorized           302 /  318         52.1     
     19.2     108.0X
-        Data column - Parquet MR                  2908 / 2924          5.4     
    184.9      11.2X
-        Data column - ORC Vectorized               412 /  425         38.1     
     26.2      79.1X
-        Data column - ORC Vectorized with copy     442 /  446         35.6     
     28.1      73.8X
-        Data column - ORC MR                      2390 / 2396          6.6     
    152.0      13.6X
-        Partition column - CSV                    9626 / 9683          1.6     
    612.0       3.4X
-        Partition column - Json                 10909 / 10923          1.4     
    693.6       3.0X
-        Partition column - Parquet Vectorized       69 /   76        228.4     
      4.4     473.6X
-        Partition column - Parquet MR             1898 / 1933          8.3     
    120.7      17.2X
-        Partition column - ORC Vectorized           67 /   74        236.0     
      4.2     489.4X
-        Partition column - ORC Vectorized with copy 65 /   72        241.9     
      4.1     501.6X
-        Partition column - ORC MR                 1743 / 1749          9.0     
    110.8      18.7X
-        Both columns - CSV                      35523 / 35552          0.4     
   2258.5       0.9X
-        Both columns - Json                     13676 / 13681          1.2     
    869.5       2.4X
-        Both columns - Parquet Vectorized          317 /  326         49.5     
     20.2     102.7X
-        Both columns - Parquet MR                 3333 / 3336          4.7     
    211.9       9.8X
-        Both columns - ORC Vectorized              441 /  446         35.6     
     28.1      73.9X
-        Both column - ORC Vectorized with copy     517 /  524         30.4     
     32.9      63.1X
-        Both columns - ORC MR                     2574 / 2577          6.1     
    163.6      12.7X
-        */
         benchmark.run()
       }
     }
   }
 
   def stringWithNullsScanBenchmark(values: Int, fractionOfNulls: Double): Unit = {
-    val benchmark = new Benchmark("String with Nulls Scan", values)
+    val benchmark = new Benchmark("String with Nulls Scan", values, output = output)
 
     withTempPath { dir =>
       withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") {
@@ -673,51 +525,16 @@ object DataSourceReadBenchmark extends SQLHelper {
           }
         }
 
-        /*
-        OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 
4.14.33-51.37.amzn1.x86_64
-        Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
-        String with Nulls Scan:              Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                 14875 / 14920          0.7     
   1418.6       1.0X
-        SQL Json                                10974 / 10992          1.0     
   1046.5       1.4X
-        SQL Parquet Vectorized                    1711 / 1750          6.1     
    163.2       8.7X
-        SQL Parquet MR                            3838 / 3884          2.7     
    366.0       3.9X
-        ParquetReader Vectorized                  1155 / 1168          9.1     
    110.2      12.9X
-        SQL ORC Vectorized                        1341 / 1380          7.8     
    127.9      11.1X
-        SQL ORC Vectorized with copy              1659 / 1716          6.3     
    158.2       9.0X
-        SQL ORC MR                                3594 / 3634          2.9     
    342.7       4.1X
-
-
-        String with Nulls Scan:              Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                 17219 / 17264          0.6     
   1642.1       1.0X
-        SQL Json                                  8843 / 8864          1.2     
    843.3       1.9X
-        SQL Parquet Vectorized                    1169 / 1178          9.0     
    111.4      14.7X
-        SQL Parquet MR                            2676 / 2697          3.9     
    255.2       6.4X
-        ParquetReader Vectorized                  1068 / 1071          9.8     
    101.8      16.1X
-        SQL ORC Vectorized                        1319 / 1319          7.9     
    125.8      13.1X
-        SQL ORC Vectorized with copy              1638 / 1639          6.4     
    156.2      10.5X
-        SQL ORC MR                                3230 / 3257          3.2     
    308.1       5.3X
-
-
-        String with Nulls Scan:              Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                 13976 / 14053          0.8     
   1332.8       1.0X
-        SQL Json                                  5166 / 5176          2.0     
    492.6       2.7X
-        SQL Parquet Vectorized                     274 /  282         38.2     
     26.2      50.9X
-        SQL Parquet MR                            1553 / 1555          6.8     
    148.1       9.0X
-        ParquetReader Vectorized                   241 /  246         43.5     
     23.0      57.9X
-        SQL ORC Vectorized                         476 /  479         22.0     
     45.4      29.3X
-        SQL ORC Vectorized with copy               584 /  588         17.9     
     55.7      23.9X
-        SQL ORC MR                                1720 / 1734          6.1     
    164.1       8.1X
-        */
         benchmark.run()
       }
     }
   }
 
   def columnsBenchmark(values: Int, width: Int): Unit = {
-    val benchmark = new Benchmark(s"Single Column Scan from $width columns", values)
+    val benchmark = new Benchmark(
+      s"Single Column Scan from $width columns",
+      values,
+      output = output)
 
     withTempPath { dir =>
       withTempTable("t1", "csvTable", "jsonTable", "parquetTable", "orcTable") {
@@ -763,58 +580,35 @@ object DataSourceReadBenchmark extends SQLHelper {
           }
         }
 
-        /*
-        OpenJDK 64-Bit Server VM 1.8.0_171-b10 on Linux 
4.14.33-51.37.amzn1.x86_64
-        Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
-        Single Column Scan from 10 columns:  Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                   3478 / 3481          0.3     
   3316.4       1.0X
-        SQL Json                                  2646 / 2654          0.4     
   2523.6       1.3X
-        SQL Parquet Vectorized                      67 /   72         15.8     
     63.5      52.2X
-        SQL Parquet MR                             207 /  214          5.1     
    197.6      16.8X
-        SQL ORC Vectorized                          69 /   76         15.2     
     66.0      50.3X
-        SQL ORC Vectorized with copy                70 /   76         15.0     
     66.5      49.9X
-        SQL ORC MR                                 299 /  303          3.5     
    285.1      11.6X
-
-
-        Single Column Scan from 50 columns:  Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                   9214 / 9236          0.1     
   8786.7       1.0X
-        SQL Json                                  9943 / 9978          0.1     
   9482.7       0.9X
-        SQL Parquet Vectorized                      77 /   86         13.6     
     73.3     119.8X
-        SQL Parquet MR                             229 /  235          4.6     
    218.6      40.2X
-        SQL ORC Vectorized                          84 /   96         12.5     
     80.0     109.9X
-        SQL ORC Vectorized with copy                83 /   91         12.6     
     79.4     110.7X
-        SQL ORC MR                                 843 /  854          1.2     
    804.0      10.9X
-
-
-        Single Column Scan from 100 columns  Best/Avg Time(ms)    Rate(M/s)   
Per Row(ns)   Relative
-        
--------------------------------------------------------------------------------------------
-        SQL CSV                                 16503 / 16622          0.1     
  15738.9       1.0X
-        SQL Json                                19109 / 19184          0.1     
  18224.2       0.9X
-        SQL Parquet Vectorized                      99 /  108         10.6     
     94.3     166.8X
-        SQL Parquet MR                             253 /  264          4.1     
    241.6      65.1X
-        SQL ORC Vectorized                         107 /  114          9.8     
    101.6     154.8X
-        SQL ORC Vectorized with copy               107 /  118          9.8     
    102.1     154.1X
-        SQL ORC MR                                1526 / 1529          0.7     
   1455.3      10.8X
-        */
         benchmark.run()
       }
     }
   }
 
-  def main(args: Array[String]): Unit = {
-    Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { dataType =>
-      numericScanBenchmark(1024 * 1024 * 15, dataType)
+  override def runBenchmarkSuite(): Unit = {
+    runBenchmark("SQL Single Numeric Column Scan") {
+      Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach {
+        dataType => numericScanBenchmark(1024 * 1024 * 15, dataType)
+      }
+    }
+    runBenchmark("Int and String Scan") {
+      intStringScanBenchmark(1024 * 1024 * 10)
     }
-    intStringScanBenchmark(1024 * 1024 * 10)
-    repeatedStringScanBenchmark(1024 * 1024 * 10)
-    partitionTableScanBenchmark(1024 * 1024 * 15)
-    for (fractionOfNulls <- List(0.0, 0.50, 0.95)) {
-      stringWithNullsScanBenchmark(1024 * 1024 * 10, fractionOfNulls)
+    runBenchmark("Repeated String Scan") {
+      repeatedStringScanBenchmark(1024 * 1024 * 10)
     }
-    for (columnWidth <- List(10, 50, 100)) {
-      columnsBenchmark(1024 * 1024 * 1, columnWidth)
+    runBenchmark("Partitioned Table Scan") {
+      partitionTableScanBenchmark(1024 * 1024 * 15)
+    }
+    runBenchmark("String with Nulls Scan") {
+      for (fractionOfNulls <- List(0.0, 0.50, 0.95)) {
+        stringWithNullsScanBenchmark(1024 * 1024 * 10, fractionOfNulls)
+      }
+    }
+    runBenchmark("Single Column Scan From Wide Columns") {
+      for (columnWidth <- List(10, 50, 100)) {
+        columnsBenchmark(1024 * 1024 * 1, columnWidth)
+      }
     }
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to