This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 3339dee  [SPARK-31311][SQL][TESTS] Benchmark date-time rebasing in ORC 
datasource
3339dee is described below

commit 3339deee0776cea64ce557d18427e8beb8fb8745
Author: Max Gekk <[email protected]>
AuthorDate: Wed Apr 1 07:02:26 2020 +0000

    [SPARK-31311][SQL][TESTS] Benchmark date-time rebasing in ORC datasource
    
    ### What changes were proposed in this pull request?
    In the PR, I propose to add new benchmarks to `DateTimeRebaseBenchmark` for 
saving and loading dates/timestamps to/from ORC files. I extracted common code 
from the benchmark for the Parquet datasource and placed it into the methods 
`caseName()` and `getPath()`. Added benchmarks for ORC save/load dates before 
and after 1582-10-15 because an implementation may have different performance 
for dates before the Julian-to-Gregorian calendar cutover day; see #28067 as an example.
    
    ### Why are the changes needed?
    To have a baseline for future optimizations of 
`fromJavaDate()`/`toJavaDate()` and `toJavaTimestamp()`/`fromJavaTimestamp()` 
in `DateTimeUtils`. The methods are used while saving/loading dates/timestamps 
by the ORC datasource.
    
    ### Does this PR introduce any user-facing change?
    No
    
    ### How was this patch tested?
    By running the updated benchmark `DateTimeRebaseBenchmark` via the command:
    ```
    SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain 
org.apache.spark.sql.execution.benchmark.DateTimeRebaseBenchmark"
    ```
    in the environment:
    
    | Item | Description |
    | ---- | ----|
    | Region | us-west-2 (Oregon) |
    | Instance | r3.xlarge |
    | AMI | ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190722.1 
(ami-06f2f779464715dc5) |
    | Java | OpenJDK 1.8.0_242-8u242/11.0.6+10 |
    
    Closes #28076 from MaxGekk/rebase-benchmark-orc.
    
    Lead-authored-by: Max Gekk <[email protected]>
    Co-authored-by: Maxim Gekk <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
    (cherry picked from commit 91af87d34ee11490164ca5f1023d9818775dceb7)
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../DateTimeRebaseBenchmark-jdk11-results.txt      |  97 +++++++++++-----
 .../benchmarks/DateTimeRebaseBenchmark-results.txt |  97 +++++++++++-----
 .../benchmark/DateTimeRebaseBenchmark.scala        | 129 ++++++++++++++-------
 3 files changed, 227 insertions(+), 96 deletions(-)

diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt 
b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt
index 4fed511..01b0639 100644
--- a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt
@@ -6,48 +6,89 @@ OpenJDK 64-Bit Server VM 
11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Save dates to parquet:                    Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-after 1582, noop                                   9304           9304         
  0         10.7          93.0       1.0X
-before 1582, noop                                  9187           9187         
  0         10.9          91.9       1.0X
-after 1582, rebase off                            22054          22054         
  0          4.5         220.5       0.4X
-after 1582, rebase on                             20361          20361         
  0          4.9         203.6       0.5X
-before 1582, rebase off                           20286          20286         
  0          4.9         202.9       0.5X
-before 1582, rebase on                            22230          22230         
  0          4.5         222.3       0.4X
+after 1582, noop                                   9299           9299         
  0         10.8          93.0       1.0X
+before 1582, noop                                  9220           9220         
  0         10.8          92.2       1.0X
+after 1582, rebase off                            20390          20390         
  0          4.9         203.9       0.5X
+after 1582, rebase on                             20378          20378         
  0          4.9         203.8       0.5X
+before 1582, rebase off                           20069          20069         
  0          5.0         200.7       0.5X
+before 1582, rebase on                            20637          20637         
  0          4.8         206.4       0.5X
 
 OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 
4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Load dates from parquet:                  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-after 1582, vec off, rebase off                   12773          12866         
129          7.8         127.7       1.0X
-after 1582, vec off, rebase on                    13063          13086         
 39          7.7         130.6       1.0X
-after 1582, vec on, rebase off                     3678           3719         
 61         27.2          36.8       3.5X
-after 1582, vec on, rebase on                      5078           5121         
 52         19.7          50.8       2.5X
-before 1582, vec off, rebase off                  12942          12972         
 42          7.7         129.4       1.0X
-before 1582, vec off, rebase on                   13866          13904         
 58          7.2         138.7       0.9X
-before 1582, vec on, rebase off                    3678           3711         
 43         27.2          36.8       3.5X
-before 1582, vec on, rebase on                     5621           5657         
 44         17.8          56.2       2.3X
+after 1582, vec off, rebase off                   12927          13017         
 78          7.7         129.3       1.0X
+after 1582, vec off, rebase on                    13127          13176         
 50          7.6         131.3       1.0X
+after 1582, vec on, rebase off                     3725           3779         
 91         26.8          37.3       3.5X
+after 1582, vec on, rebase on                      5134           5221         
 99         19.5          51.3       2.5X
+before 1582, vec off, rebase off                  13049          13061         
 16          7.7         130.5       1.0X
+before 1582, vec off, rebase on                   13877          13916         
 51          7.2         138.8       0.9X
+before 1582, vec on, rebase off                    3702           3736         
 56         27.0          37.0       3.5X
+before 1582, vec on, rebase on                     5567           5637         
 78         18.0          55.7       2.3X
 
 OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 
4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Save timestamps to parquet:               Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-after 1582, noop                                   2983           2983         
  0         33.5          29.8       1.0X
-before 1582, noop                                  2979           2979         
  0         33.6          29.8       1.0X
-after 1582, rebase off                            17452          17452         
  0          5.7         174.5       0.2X
-after 1582, rebase on                             70193          70193         
  0          1.4         701.9       0.0X
-before 1582, rebase off                           17784          17784         
  0          5.6         177.8       0.2X
-before 1582, rebase on                            83498          83498         
  0          1.2         835.0       0.0X
+after 1582, noop                                   2988           2988         
  0         33.5          29.9       1.0X
+before 1582, noop                                  3000           3000         
  0         33.3          30.0       1.0X
+after 1582, rebase off                            16163          16163         
  0          6.2         161.6       0.2X
+after 1582, rebase on                             68399          68399         
  0          1.5         684.0       0.0X
+before 1582, rebase off                           16921          16921         
  0          5.9         169.2       0.2X
+before 1582, rebase on                            74425          74425         
  0          1.3         744.3       0.0X
 
 OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 
4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Load timestamps from parquet:             Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-after 1582, vec off, rebase off                   15114          15151         
 32          6.6         151.1       1.0X
-after 1582, vec off, rebase on                    45804          45912         
126          2.2         458.0       0.3X
-after 1582, vec on, rebase off                     4900           4947         
 56         20.4          49.0       3.1X
-after 1582, vec on, rebase on                     34599          34650         
 45          2.9         346.0       0.4X
-before 1582, vec off, rebase off                  15093          15174         
 70          6.6         150.9       1.0X
-before 1582, vec off, rebase on                   47367          47472         
121          2.1         473.7       0.3X
-before 1582, vec on, rebase off                    4884           4952         
 80         20.5          48.8       3.1X
-before 1582, vec on, rebase on                    35831          35883         
 59          2.8         358.3       0.4X
+after 1582, vec off, rebase off                   15147          15258         
 97          6.6         151.5       1.0X
+after 1582, vec off, rebase on                    45035          45101         
 60          2.2         450.3       0.3X
+after 1582, vec on, rebase off                     4934           5012         
100         20.3          49.3       3.1X
+after 1582, vec on, rebase on                     34263          34360         
 88          2.9         342.6       0.4X
+before 1582, vec off, rebase off                  15177          15220         
 37          6.6         151.8       1.0X
+before 1582, vec off, rebase on                   46754          46761         
 12          2.1         467.5       0.3X
+before 1582, vec on, rebase off                    4892           4956         
 61         20.4          48.9       3.1X
+before 1582, vec on, rebase on                    35989          36014         
 22          2.8         359.9       0.4X
+
+
+================================================================================================
+Rebasing dates/timestamps in ORC datasource
+================================================================================================
+
+OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 
4.15.0-1063-aws
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Save dates to ORC:                        Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+after 1582, noop                                   9295           9295         
  0         10.8          93.0       1.0X
+before 1582, noop                                  9352           9352         
  0         10.7          93.5       1.0X
+after 1582                                        17112          17112         
  0          5.8         171.1       0.5X
+before 1582                                       17979          17979         
  0          5.6         179.8       0.5X
+
+OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 
4.15.0-1063-aws
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Load dates from ORC:                      Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+after 1582, vec off                               20874          20905         
 38          4.8         208.7       1.0X
+after 1582, vec on                                 3813           3844         
 28         26.2          38.1       5.5X
+before 1582, vec off                              25912          25949         
 38          3.9         259.1       0.8X
+before 1582, vec on                                4322           4343         
 19         23.1          43.2       4.8X
+
+OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 
4.15.0-1063-aws
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Save timestamps to ORC:                   Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+after 1582, noop                                   3003           3003         
  0         33.3          30.0       1.0X
+before 1582, noop                                  3012           3012         
  0         33.2          30.1       1.0X
+after 1582                                        41031          41031         
  0          2.4         410.3       0.1X
+before 1582                                       44436          44436         
  0          2.3         444.4       0.1X
+
+OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 
4.15.0-1063-aws
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Load timestamps from ORC:                 Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+after 1582, vec off                               28477          28582         
 92          3.5         284.8       1.0X
+after 1582, vec on                                20754          20924         
237          4.8         207.5       1.4X
+before 1582, vec off                              32858          32921         
 58          3.0         328.6       0.9X
+before 1582, vec on                               25734          25769         
 30          3.9         257.3       1.1X
 
 
diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt 
b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt
index ee48627..b353013 100644
--- a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt
+++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt
@@ -6,48 +6,89 @@ OpenJDK 64-Bit Server VM 
1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Save dates to parquet:                    Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-after 1582, noop                                   9582           9582         
  0         10.4          95.8       1.0X
-before 1582, noop                                  9473           9473         
  0         10.6          94.7       1.0X
-after 1582, rebase off                            21431          21431         
  0          4.7         214.3       0.4X
-after 1582, rebase on                             22156          22156         
  0          4.5         221.6       0.4X
-before 1582, rebase off                           21399          21399         
  0          4.7         214.0       0.4X
-before 1582, rebase on                            22927          22927         
  0          4.4         229.3       0.4X
+after 1582, noop                                   9691           9691         
  0         10.3          96.9       1.0X
+before 1582, noop                                  9024           9024         
  0         11.1          90.2       1.1X
+after 1582, rebase off                            21195          21195         
  0          4.7         211.9       0.5X
+after 1582, rebase on                             20045          20045         
  0          5.0         200.4       0.5X
+before 1582, rebase off                           20039          20039         
  0          5.0         200.4       0.5X
+before 1582, rebase on                            20451          20451         
  0          4.9         204.5       0.5X
 
 OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 
4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Load dates from parquet:                  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-after 1582, vec off, rebase off                   12637          12736         
111          7.9         126.4       1.0X
-after 1582, vec off, rebase on                    13463          13531         
 61          7.4         134.6       0.9X
-after 1582, vec on, rebase off                     3693           3703         
  8         27.1          36.9       3.4X
-after 1582, vec on, rebase on                      5242           5252         
  9         19.1          52.4       2.4X
-before 1582, vec off, rebase off                  13055          13169         
126          7.7         130.5       1.0X
-before 1582, vec off, rebase on                   14067          14270         
185          7.1         140.7       0.9X
-before 1582, vec on, rebase off                    3697           3702         
  7         27.1          37.0       3.4X
-before 1582, vec on, rebase on                     6058           6097         
 34         16.5          60.6       2.1X
+after 1582, vec off, rebase off                   13207          13339         
116          7.6         132.1       1.0X
+after 1582, vec off, rebase on                    13408          13446         
 57          7.5         134.1       1.0X
+after 1582, vec on, rebase off                     3680           3712         
 39         27.2          36.8       3.6X
+after 1582, vec on, rebase on                      5229           5261         
 29         19.1          52.3       2.5X
+before 1582, vec off, rebase off                  13135          13164         
 25          7.6         131.4       1.0X
+before 1582, vec off, rebase on                   13946          14033         
 94          7.2         139.5       0.9X
+before 1582, vec on, rebase off                    3689           3726         
 49         27.1          36.9       3.6X
+before 1582, vec on, rebase on                     5679           5687         
  9         17.6          56.8       2.3X
 
 OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 
4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Save timestamps to parquet:               Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-after 1582, noop                                   2713           2713         
  0         36.9          27.1       1.0X
-before 1582, noop                                  2715           2715         
  0         36.8          27.2       1.0X
-after 1582, rebase off                            16768          16768         
  0          6.0         167.7       0.2X
-after 1582, rebase on                             82811          82811         
  0          1.2         828.1       0.0X
-before 1582, rebase off                           17052          17052         
  0          5.9         170.5       0.2X
-before 1582, rebase on                            95134          95134         
  0          1.1         951.3       0.0X
+after 1582, noop                                   2720           2720         
  0         36.8          27.2       1.0X
+before 1582, noop                                  2712           2712         
  0         36.9          27.1       1.0X
+after 1582, rebase off                            16626          16626         
  0          6.0         166.3       0.2X
+after 1582, rebase on                             85136          85136         
  0          1.2         851.4       0.0X
+before 1582, rebase off                           16855          16855         
  0          5.9         168.6       0.2X
+before 1582, rebase on                           106121         106121         
  0          0.9        1061.2       0.0X
 
 OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 
4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Load timestamps from parquet:             Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-after 1582, vec off, rebase off                   15200          15321         
194          6.6         152.0       1.0X
-after 1582, vec off, rebase on                    63160          63337         
177          1.6         631.6       0.2X
-after 1582, vec on, rebase off                     4891           4928         
 43         20.4          48.9       3.1X
-after 1582, vec on, rebase on                     45474          45484         
 10          2.2         454.7       0.3X
-before 1582, vec off, rebase off                  15203          15330         
110          6.6         152.0       1.0X
-before 1582, vec off, rebase on                   65588          65664         
 73          1.5         655.9       0.2X
-before 1582, vec on, rebase off                    4844           4916         
105         20.6          48.4       3.1X
-before 1582, vec on, rebase on                    47815          47943         
162          2.1         478.2       0.3X
+after 1582, vec off, rebase off                   15198          15301         
 90          6.6         152.0       1.0X
+after 1582, vec off, rebase on                    55210          55370         
140          1.8         552.1       0.3X
+after 1582, vec on, rebase off                     4859           4880         
 19         20.6          48.6       3.1X
+after 1582, vec on, rebase on                     44758          44824         
 85          2.2         447.6       0.3X
+before 1582, vec off, rebase off                  15206          15316         
112          6.6         152.1       1.0X
+before 1582, vec off, rebase on                   60452          60588         
222          1.7         604.5       0.3X
+before 1582, vec on, rebase off                    4892           4933         
 36         20.4          48.9       3.1X
+before 1582, vec on, rebase on                    46871          46950         
 82          2.1         468.7       0.3X
+
+
+================================================================================================
+Rebasing dates/timestamps in ORC datasource
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 
4.15.0-1063-aws
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Save dates to ORC:                        Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+after 1582, noop                                   9102           9102         
  0         11.0          91.0       1.0X
+before 1582, noop                                  9099           9099         
  0         11.0          91.0       1.0X
+after 1582                                        17652          17652         
  0          5.7         176.5       0.5X
+before 1582                                       18284          18284         
  0          5.5         182.8       0.5X
+
+OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 
4.15.0-1063-aws
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Load dates from ORC:                      Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+after 1582, vec off                               25169          25215         
 48          4.0         251.7       1.0X
+after 1582, vec on                                 3701           3717         
 16         27.0          37.0       6.8X
+before 1582, vec off                              26919          27045         
182          3.7         269.2       0.9X
+before 1582, vec on                                4169           4192         
 31         24.0          41.7       6.0X
+
+OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 
4.15.0-1063-aws
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Save timestamps to ORC:                   Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+after 1582, noop                                   2906           2906         
  0         34.4          29.1       1.0X
+before 1582, noop                                  2863           2863         
  0         34.9          28.6       1.0X
+after 1582                                        48858          48858         
  0          2.0         488.6       0.1X
+before 1582                                       50945          50945         
  0          2.0         509.5       0.1X
+
+OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 
4.15.0-1063-aws
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Load timestamps from ORC:                 Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+after 1582, vec off                               40925          40955         
 26          2.4         409.2       1.0X
+after 1582, vec on                                31246          31404         
164          3.2         312.5       1.3X
+before 1582, vec off                              44634          44680         
 40          2.2         446.3       0.9X
+before 1582, vec on                               35578          35834         
282          2.8         355.8       1.2X
 
 
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala
index 48ceccc..6285461 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution.benchmark
 
+import java.io.File
 import java.time.{LocalDate, LocalDateTime, LocalTime, ZoneOffset}
 
 import org.apache.spark.benchmark.Benchmark
@@ -91,68 +92,116 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark {
     }
   }
 
+  private def benchmarkInputs(benchmark: Benchmark, rowsNum: Int, dateTime: 
String): Unit = {
+    benchmark.addCase("after 1582, noop", 1) { _ =>
+      genDF(rowsNum, dateTime, after1582 = true).noop()
+    }
+    benchmark.addCase("before 1582, noop", 1) { _ =>
+      genDF(rowsNum, dateTime, after1582 = false).noop()
+    }
+  }
+
+  private def flagToStr(flag: Boolean): String = {
+    if (flag) "on" else "off"
+  }
+
+  private def caseName(
+      after1582: Boolean,
+      rebase: Option[Boolean] = None,
+      vec: Option[Boolean] = None): String = {
+    val period = if (after1582) "after" else "before"
+    val vecFlag = vec.map(flagToStr).map(flag => s", vec $flag").getOrElse("")
+    val rebaseFlag = rebase.map(flagToStr).map(flag => s", rebase 
$flag").getOrElse("")
+    s"$period 1582$vecFlag$rebaseFlag"
+  }
+
+  private def getPath(
+      basePath: File,
+      dateTime: String,
+      after1582: Boolean,
+      rebase: Option[Boolean] = None): String = {
+    val period = if (after1582) "after" else "before"
+    val rebaseFlag = rebase.map(flagToStr).map(flag => s"_$flag").getOrElse("")
+    basePath.getAbsolutePath + s"/${dateTime}_${period}_1582$rebaseFlag"
+  }
+
   override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    val rowsNum = 100000000
+
     withTempPath { path =>
       runBenchmark("Rebasing dates/timestamps in Parquet datasource") {
-        val rowsNum = 100000000
         Seq("date", "timestamp").foreach { dateTime =>
           val benchmark = new Benchmark(s"Save ${dateTime}s to parquet", 
rowsNum, output = output)
-          benchmark.addCase("after 1582, noop", 1) { _ =>
-            genDF(rowsNum, dateTime, after1582 = true).noop()
-          }
-          benchmark.addCase("before 1582, noop", 1) { _ =>
-            genDF(rowsNum, dateTime, after1582 = false).noop()
-          }
-
-          def save(after1582: Boolean, rebase: Boolean): Unit = {
-            val period = if (after1582) "after" else "before"
-            val rebaseFlag = if (rebase) "on" else "off"
-            val caseName = s"$period 1582, rebase $rebaseFlag"
-            benchmark.addCase(caseName, 1) { _ =>
-              withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_WRITE.key 
-> rebase.toString) {
-                val df = genDF(rowsNum, dateTime, after1582)
-                val pathToWrite = path.getAbsolutePath + 
s"/${dateTime}_${period}_1582_$rebaseFlag"
-                df.write
-                  .mode("overwrite")
-                  .format("parquet")
-                  .save(pathToWrite)
-              }
-            }
-          }
-
+          benchmarkInputs(benchmark, rowsNum, dateTime)
           Seq(true, false).foreach { after1582 =>
             Seq(false, true).foreach { rebase =>
-              save(after1582, rebase)
+              benchmark.addCase(caseName(after1582, Some(rebase)), 1) { _ =>
+                withSQLConf(
+                  SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_WRITE.key -> 
rebase.toString) {
+                  genDF(rowsNum, dateTime, after1582)
+                    .write
+                    .mode("overwrite")
+                    .format("parquet")
+                    .save(getPath(path, dateTime, after1582, Some(rebase)))
+                }
+              }
             }
           }
           benchmark.run()
 
           val benchmark2 = new Benchmark(
             s"Load ${dateTime}s from parquet", rowsNum, output = output)
-
-          def load(after1582: Boolean, vec: Boolean, rebase: Boolean): Unit = {
-            val period = if (after1582) "after" else "before"
-            val rebaseFlag = if (rebase) "on" else "off"
-            val vecFlag = if (vec) "on" else "off"
-            val caseName = s"$period 1582, vec $vecFlag, rebase $rebaseFlag"
-            benchmark2.addCase(caseName, 3) { _ =>
-              withSQLConf(
-                SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vec.toString,
-                SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_READ.key -> 
rebase.toString) {
-                val pathToRead = path.getAbsolutePath + 
s"/${dateTime}_${period}_1582_$rebaseFlag"
-                spark.read.format("parquet").load(pathToRead).noop()
+          Seq(true, false).foreach { after1582 =>
+            Seq(false, true).foreach { vec =>
+              Seq(false, true).foreach { rebase =>
+                benchmark2.addCase(caseName(after1582, Some(rebase), 
Some(vec)), 3) { _ =>
+                  withSQLConf(
+                    SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> 
vec.toString,
+                    SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_READ.key -> 
rebase.toString) {
+                    spark.read
+                      .format("parquet")
+                      .load(getPath(path, dateTime, after1582, Some(rebase)))
+                      .noop()
+                  }
+                }
               }
             }
           }
+          benchmark2.run()
+        }
+      }
+    }
+
+    withTempPath { path =>
+      runBenchmark("Rebasing dates/timestamps in ORC datasource") {
+        Seq("date", "timestamp").foreach { dateTime =>
+          val benchmark = new Benchmark(s"Save ${dateTime}s to ORC", rowsNum, 
output = output)
+          benchmarkInputs(benchmark, rowsNum, dateTime)
+          Seq(true, false).foreach { after1582 =>
+            benchmark.addCase(caseName(after1582), 1) { _ =>
+              genDF(rowsNum, dateTime, after1582)
+                .write
+                .mode("overwrite")
+                .format("orc")
+                .save(getPath(path, dateTime, after1582))
+            }
+          }
+          benchmark.run()
 
+          val benchmark2 = new Benchmark(s"Load ${dateTime}s from ORC", 
rowsNum, output = output)
           Seq(true, false).foreach { after1582 =>
             Seq(false, true).foreach { vec =>
-              Seq(false, true).foreach { rebase =>
-                load(after1582, vec, rebase)
+              benchmark2.addCase(caseName(after1582, vec = Some(vec)), 3) { _ 
=>
+                withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> 
vec.toString) {
+                  spark
+                    .read
+                    .format("orc")
+                    .load(getPath(path, dateTime, after1582))
+                    .noop()
+                }
               }
             }
           }
-
           benchmark2.run()
         }
       }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to