This is an automated email from the ASF dual-hosted git repository.

yangjie01 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 1e14c5c9edb0 [SPARK-54693][CORE][TESTS] Add LZ4TPCDSDataBenchmark
1e14c5c9edb0 is described below

commit 1e14c5c9edb06cf8894101c726d8d62b9ce498af
Author: Cheng Pan <[email protected]>
AuthorDate: Mon Dec 15 16:19:02 2025 +0800

    [SPARK-54693][CORE][TESTS] Add LZ4TPCDSDataBenchmark
    
    ### What changes were proposed in this pull request?
    
    Add `LZ4TPCDSDataBenchmark`, which tests `LZ4CompressionCodec` against the TPCDS 
`catalog_sales.dat` (SF1) file; its size is about 283M.
    
    ### Why are the changes needed?
    
    Add a benchmark to measure the perf impact of the lz4 security upgrade.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Benchmark results were added later. Since the change includes a refactor that touches 
`ZStandardTPCDSDataBenchmark`, its benchmark results were also updated.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #53453 from pan3793/SPARK-54693.
    
    Lead-authored-by: Cheng Pan <[email protected]>
    Co-authored-by: pan3793 <[email protected]>
    Signed-off-by: yangjie01 <[email protected]>
---
 .github/workflows/benchmark.yml                    |  4 +-
 .../LZ4TPCDSDataBenchmark-jdk21-results.txt        | 17 +++++
 core/benchmarks/LZ4TPCDSDataBenchmark-results.txt  | 17 +++++
 .../ZStandardTPCDSDataBenchmark-jdk21-results.txt  | 60 ++++++++---------
 .../ZStandardTPCDSDataBenchmark-results.txt        | 68 +++++++++----------
 .../apache/spark/io/LZ4TPCDSDataBenchmark.scala    | 78 ++++++++++++++++++++++
 .../org/apache/spark/io/TPCDSDataBenchmark.scala   | 64 ++++++++++++++++++
 .../spark/io/ZStandardTPCDSDataBenchmark.scala     | 31 ++++-----
 8 files changed, 254 insertions(+), 85 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 3e90bb329be5..b0454c016640 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -72,7 +72,7 @@ jobs:
   # Any TPC-DS related updates on this job need to be applied to tpcds-1g job 
of build_and_test.yml as well
   tpcds-1g-gen:
     name: "Generate an TPC-DS dataset with SF=1"
-    if: contains(inputs.class, 'TPCDSQueryBenchmark') || 
contains(inputs.class, 'ZStandardTPCDSDataBenchmark') || contains(inputs.class, 
'*')
+    if: contains(inputs.class, 'TPCDSQueryBenchmark') || 
contains(inputs.class, 'LZ4TPCDSDataBenchmark') || contains(inputs.class, 
'ZStandardTPCDSDataBenchmark') || contains(inputs.class, '*')
     runs-on: ubuntu-latest
     env:
       SPARK_LOCAL_IP: localhost
@@ -177,7 +177,7 @@ jobs:
         distribution: zulu
         java-version: ${{ inputs.jdk }}
     - name: Cache TPC-DS generated data
-      if: contains(inputs.class, 'TPCDSQueryBenchmark') || 
contains(inputs.class, 'ZStandardTPCDSDataBenchmark') || contains(inputs.class, 
'*')
+      if: contains(inputs.class, 'TPCDSQueryBenchmark') || 
contains(inputs.class, 'LZ4TPCDSDataBenchmark') || contains(inputs.class, 
'ZStandardTPCDSDataBenchmark') || contains(inputs.class, '*')
       id: cache-tpcds-sf-1
       uses: actions/cache@v4
       with:
diff --git a/core/benchmarks/LZ4TPCDSDataBenchmark-jdk21-results.txt 
b/core/benchmarks/LZ4TPCDSDataBenchmark-jdk21-results.txt
new file mode 100644
index 000000000000..64fb5dc7875a
--- /dev/null
+++ b/core/benchmarks/LZ4TPCDSDataBenchmark-jdk21-results.txt
@@ -0,0 +1,17 @@
+================================================================================================
+Benchmark LZ4CompressionCodec
+================================================================================================
+
+OpenJDK 64-Bit Server VM 21.0.9+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
+Compression:                              Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Compression 4 times                                2621           2633         
 16          0.0   655315176.3       1.0X
+
+OpenJDK 64-Bit Server VM 21.0.9+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
+Decompression:                            Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Decompression 4 times                              2270           2286         
 23          0.0   567457203.5       1.0X
+
+
diff --git a/core/benchmarks/LZ4TPCDSDataBenchmark-results.txt 
b/core/benchmarks/LZ4TPCDSDataBenchmark-results.txt
new file mode 100644
index 000000000000..ff05d27454ec
--- /dev/null
+++ b/core/benchmarks/LZ4TPCDSDataBenchmark-results.txt
@@ -0,0 +1,17 @@
+================================================================================================
+Benchmark LZ4CompressionCodec
+================================================================================================
+
+OpenJDK 64-Bit Server VM 17.0.17+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
+Compression:                              Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Compression 4 times                                2614           2621         
 10          0.0   653597000.8       1.0X
+
+OpenJDK 64-Bit Server VM 17.0.17+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
+Decompression:                            Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Decompression 4 times                              2346           2361         
 22          0.0   586455894.5       1.0X
+
+
diff --git a/core/benchmarks/ZStandardTPCDSDataBenchmark-jdk21-results.txt 
b/core/benchmarks/ZStandardTPCDSDataBenchmark-jdk21-results.txt
index 7630433a2f7a..bdf21e4ad764 100644
--- a/core/benchmarks/ZStandardTPCDSDataBenchmark-jdk21-results.txt
+++ b/core/benchmarks/ZStandardTPCDSDataBenchmark-jdk21-results.txt
@@ -2,48 +2,48 @@
 Benchmark ZStandardCompressionCodec
 
================================================================================================
 
-OpenJDK 64-Bit Server VM 21.0.8+9-LTS on Linux 6.11.0-1018-azure
+OpenJDK 64-Bit Server VM 21.0.9+10-LTS on Linux 6.11.0-1018-azure
 AMD EPYC 7763 64-Core Processor
-Benchmark ZStandardCompressionCodec:                Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+Compression:                                        Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
----------------------------------------------------------------------------------------------------------------------------------
-Compression 4 times at level 1 without buffer pool           2567           
2573           9          0.0   641834272.8       1.0X
-Compression 4 times at level 2 without buffer pool           4171           
4173           2          0.0  1042717610.7       0.6X
-Compression 4 times at level 3 without buffer pool           6202           
6205           4          0.0  1550416592.3       0.4X
-Compression 4 times at level 1 with buffer pool              2562           
2562           0          0.0   640541340.8       1.0X
-Compression 4 times at level 2 with buffer pool              4172           
4173           1          0.0  1043037897.5       0.6X
-Compression 4 times at level 3 with buffer pool              6199           
6203           4          0.0  1549874630.3       0.4X
+Compression 4 times at level 1 without buffer pool           2585           
2586           1          0.0   646239352.5       1.0X
+Compression 4 times at level 2 without buffer pool           4211           
4216           7          0.0  1052689958.8       0.6X
+Compression 4 times at level 3 without buffer pool           6328           
6328           1          0.0  1581904435.8       0.4X
+Compression 4 times at level 1 with buffer pool              2585           
2587           3          0.0   646306514.0       1.0X
+Compression 4 times at level 2 with buffer pool              4193           
4195           3          0.0  1048293261.0       0.6X
+Compression 4 times at level 3 with buffer pool              6264           
6284          28          0.0  1565962696.5       0.4X
 
-OpenJDK 64-Bit Server VM 21.0.8+9-LTS on Linux 6.11.0-1018-azure
+OpenJDK 64-Bit Server VM 21.0.9+10-LTS on Linux 6.11.0-1018-azure
 AMD EPYC 7763 64-Core Processor
-Benchmark ZStandardCompressionCodec:                    Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+Decompression:                                          Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
--------------------------------------------------------------------------------------------------------------------------------------
-Decompression 4 times from level 1 without buffer pool            856          
  875          17          0.0   214044635.0       1.0X
-Decompression 4 times from level 2 without buffer pool           1141          
 1147          10          0.0   285134371.0       0.8X
-Decompression 4 times from level 3 without buffer pool           1345          
 1345           0          0.0   336239892.8       0.6X
-Decompression 4 times from level 1 with buffer pool               844          
  851          12          0.0   210917530.8       1.0X
-Decompression 4 times from level 2 with buffer pool              1111          
 1126          22          0.0   277630951.0       0.8X
-Decompression 4 times from level 3 with buffer pool              1338          
 1348          13          0.0   334616619.0       0.6X
+Decompression 4 times from level 1 without buffer pool            845          
  877          53          0.0   211244491.3       1.0X
+Decompression 4 times from level 2 without buffer pool           1141          
 1145           6          0.0   285267829.3       0.7X
+Decompression 4 times from level 3 without buffer pool           1358          
 1366          11          0.0   339611143.3       0.6X
+Decompression 4 times from level 1 with buffer pool               856          
  862          10          0.0   213948489.5       1.0X
+Decompression 4 times from level 2 with buffer pool              1153          
 1156           5          0.0   288195943.3       0.7X
+Decompression 4 times from level 3 with buffer pool              1368          
 1375           9          0.0   342032342.0       0.6X
 
-OpenJDK 64-Bit Server VM 21.0.8+9-LTS on Linux 6.11.0-1018-azure
+OpenJDK 64-Bit Server VM 21.0.9+10-LTS on Linux 6.11.0-1018-azure
 AMD EPYC 7763 64-Core Processor
 Parallel Compression at level 3:          Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Parallel Compression with 0 workers                2099           2099         
  0          0.0   524768920.0       1.0X
-Parallel Compression with 1 workers                2119           2119         
  0          0.0   529758581.0       1.0X
-Parallel Compression with 2 workers                1087           1090         
  4          0.0   271687124.0       1.9X
-Parallel Compression with 4 workers                 761            764         
  2          0.0   190289606.5       2.8X
-Parallel Compression with 8 workers                 782            789         
  7          0.0   195588720.8       2.7X
-Parallel Compression with 16 workers                897            908         
 10          0.0   224272497.0       2.3X
+Parallel Compression with 0 workers                2128           2130         
  3          0.0   531987145.5       1.0X
+Parallel Compression with 1 workers                2117           2131         
 20          0.0   529324483.0       1.0X
+Parallel Compression with 2 workers                1106           1107         
  0          0.0   276552237.3       1.9X
+Parallel Compression with 4 workers                 771            774         
  3          0.0   192777338.5       2.8X
+Parallel Compression with 8 workers                 815            826         
 13          0.0   203648175.7       2.6X
+Parallel Compression with 16 workers                950            963         
 11          0.0   237496152.5       2.2X
 
-OpenJDK 64-Bit Server VM 21.0.8+9-LTS on Linux 6.11.0-1018-azure
+OpenJDK 64-Bit Server VM 21.0.9+10-LTS on Linux 6.11.0-1018-azure
 AMD EPYC 7763 64-Core Processor
 Parallel Compression at level 9:          Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Parallel Compression with 0 workers                8240           8242         
  2          0.0  2060062709.3       1.0X
-Parallel Compression with 1 workers                6915           6924         
 13          0.0  1728757390.0       1.2X
-Parallel Compression with 2 workers                3663           3664         
  1          0.0   915832528.8       2.2X
-Parallel Compression with 4 workers                3239           3247         
 12          0.0   809829698.3       2.5X
-Parallel Compression with 8 workers                3594           3610         
 23          0.0   898542204.8       2.3X
-Parallel Compression with 16 workers               3888           3898         
 14          0.0   972017914.3       2.1X
+Parallel Compression with 0 workers                8976           9140         
231          0.0  2244064460.3       1.0X
+Parallel Compression with 1 workers                7198           7214         
 24          0.0  1799376873.8       1.2X
+Parallel Compression with 2 workers                3836           3850         
 20          0.0   959020774.8       2.3X
+Parallel Compression with 4 workers                3271           3284         
 18          0.0   817803690.0       2.7X
+Parallel Compression with 8 workers                3806           3807         
  2          0.0   951490679.3       2.4X
+Parallel Compression with 16 workers               4004           4026         
 31          0.0  1000957120.3       2.2X
 
 
diff --git a/core/benchmarks/ZStandardTPCDSDataBenchmark-results.txt 
b/core/benchmarks/ZStandardTPCDSDataBenchmark-results.txt
index 01784a686f81..b69321de8861 100644
--- a/core/benchmarks/ZStandardTPCDSDataBenchmark-results.txt
+++ b/core/benchmarks/ZStandardTPCDSDataBenchmark-results.txt
@@ -2,48 +2,48 @@
 Benchmark ZStandardCompressionCodec
 
================================================================================================
 
-OpenJDK 64-Bit Server VM 17.0.16+8-LTS on Linux 6.11.0-1018-azure
-Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
-Benchmark ZStandardCompressionCodec:                Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+OpenJDK 64-Bit Server VM 17.0.17+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
+Compression:                                        Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
----------------------------------------------------------------------------------------------------------------------------------
-Compression 4 times at level 1 without buffer pool           2513           
2519          10          0.0   628127696.5       1.0X
-Compression 4 times at level 2 without buffer pool           3914           
3916           3          0.0   978555442.0       0.6X
-Compression 4 times at level 3 without buffer pool           5417           
5424           9          0.0  1354342306.3       0.5X
-Compression 4 times at level 1 with buffer pool              2524           
2526           4          0.0   630890054.5       1.0X
-Compression 4 times at level 2 with buffer pool              3911           
3911           0          0.0   977705511.5       0.6X
-Compression 4 times at level 3 with buffer pool              5447           
5451           5          0.0  1361830044.0       0.5X
+Compression 4 times at level 1 without buffer pool           2572           
2573           0          0.0   643092504.0       1.0X
+Compression 4 times at level 2 without buffer pool           4223           
4224           2          0.0  1055729472.5       0.6X
+Compression 4 times at level 3 without buffer pool           6306           
6307           1          0.0  1576606903.0       0.4X
+Compression 4 times at level 1 with buffer pool              2566           
2567           2          0.0   641403366.5       1.0X
+Compression 4 times at level 2 with buffer pool              4229           
4229           0          0.0  1057252850.2       0.6X
+Compression 4 times at level 3 with buffer pool              6369           
6370           1          0.0  1592238794.8       0.4X
 
-OpenJDK 64-Bit Server VM 17.0.16+8-LTS on Linux 6.11.0-1018-azure
-Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
-Benchmark ZStandardCompressionCodec:                    Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+OpenJDK 64-Bit Server VM 17.0.17+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
+Decompression:                                          Best Time(ms)   Avg 
Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
--------------------------------------------------------------------------------------------------------------------------------------
-Decompression 4 times from level 1 without buffer pool           1053          
 1056           4          0.0   263342668.0       1.0X
-Decompression 4 times from level 2 without buffer pool           1401          
 1403           3          0.0   350193821.5       0.8X
-Decompression 4 times from level 3 without buffer pool           1683          
 1683           1          0.0   420625735.0       0.6X
-Decompression 4 times from level 1 with buffer pool              1078          
 1079           2          0.0   269507650.5       1.0X
-Decompression 4 times from level 2 with buffer pool              1397          
 1403           8          0.0   349213546.5       0.8X
-Decompression 4 times from level 3 with buffer pool              1679          
 1680           2          0.0   419721856.3       0.6X
+Decompression 4 times from level 1 without buffer pool            914          
  915           1          0.0   228498988.8       1.0X
+Decompression 4 times from level 2 without buffer pool           1186          
 1188           2          0.0   296544159.0       0.8X
+Decompression 4 times from level 3 without buffer pool           1424          
 1424           0          0.0   355996464.8       0.6X
+Decompression 4 times from level 1 with buffer pool               915          
  916           2          0.0   228733774.3       1.0X
+Decompression 4 times from level 2 with buffer pool              1189          
 1192           4          0.0   297227461.0       0.8X
+Decompression 4 times from level 3 with buffer pool              1421          
 1425           5          0.0   355242896.0       0.6X
 
-OpenJDK 64-Bit Server VM 17.0.16+8-LTS on Linux 6.11.0-1018-azure
-Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
+OpenJDK 64-Bit Server VM 17.0.17+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
 Parallel Compression at level 3:          Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Parallel Compression with 0 workers                1815           1825         
 14          0.0   453734902.0       1.0X
-Parallel Compression with 1 workers                1654           1663         
 12          0.0   413617141.5       1.1X
-Parallel Compression with 2 workers                 857            859         
  2          0.0   214195314.3       2.1X
-Parallel Compression with 4 workers                 777            781         
  3          0.0   194366718.5       2.3X
-Parallel Compression with 8 workers                 784            786         
  3          0.0   195965343.5       2.3X
-Parallel Compression with 16 workers                836            858         
 19          0.0   208885356.8       2.2X
+Parallel Compression with 0 workers                2142           2143         
  2          0.0   535512459.0       1.0X
+Parallel Compression with 1 workers                2145           2146         
  2          0.0   536274942.0       1.0X
+Parallel Compression with 2 workers                1094           1100         
  9          0.0   273416030.8       2.0X
+Parallel Compression with 4 workers                 768            769         
  2          0.0   192023783.8       2.8X
+Parallel Compression with 8 workers                 831            839         
 13          0.0   207743301.0       2.6X
+Parallel Compression with 16 workers                960            972         
 16          0.0   240059822.0       2.2X
 
-OpenJDK 64-Bit Server VM 17.0.16+8-LTS on Linux 6.11.0-1018-azure
-Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
+OpenJDK 64-Bit Server VM 17.0.17+10-LTS on Linux 6.11.0-1018-azure
+AMD EPYC 7763 64-Core Processor
 Parallel Compression at level 9:          Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Parallel Compression with 0 workers                8302           8310         
 12          0.0  2075384302.0       1.0X
-Parallel Compression with 1 workers                6869           6871         
  3          0.0  1717196107.5       1.2X
-Parallel Compression with 2 workers                3570           3587         
 25          0.0   892437927.8       2.3X
-Parallel Compression with 4 workers                3152           3152         
  1          0.0   787940909.5       2.6X
-Parallel Compression with 8 workers                3463           3470         
 10          0.0   865739615.8       2.4X
-Parallel Compression with 16 workers               4061           4083         
 31          0.0  1015251097.3       2.0X
+Parallel Compression with 0 workers                9016           9031         
 21          0.0  2254017491.8       1.0X
+Parallel Compression with 1 workers                7423           7467         
 62          0.0  1855653850.8       1.2X
+Parallel Compression with 2 workers                4203           4242         
 54          0.0  1050838310.7       2.1X
+Parallel Compression with 4 workers                3472           3481         
 13          0.0   867935936.0       2.6X
+Parallel Compression with 8 workers                3952           3996         
 62          0.0   988086500.5       2.3X
+Parallel Compression with 16 workers               4081           4115         
 47          0.0  1020289087.7       2.2X
 
 
diff --git 
a/core/src/test/scala/org/apache/spark/io/LZ4TPCDSDataBenchmark.scala 
b/core/src/test/scala/org/apache/spark/io/LZ4TPCDSDataBenchmark.scala
new file mode 100644
index 000000000000..f3f2bd1057b7
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/io/LZ4TPCDSDataBenchmark.scala
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.io
+
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream, OutputStream}
+
+import org.apache.spark.SparkConf
+import org.apache.spark.benchmark.Benchmark
+
+/**
+ * Benchmark for LZ4 codec performance.
+ * {{{
+ *   To run this benchmark:
+ *   1. without sbt: bin/spark-submit --class <this class> <spark core test jar>
+ *   2. build/sbt "core/Test/runMain <this class>"
+ *   3. generate result:
+ *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "core/Test/runMain <this class>"
+ *      Results will be written to "benchmarks/LZ4TPCDSDataBenchmark-results.txt".
+ * }}}
+ */
+object LZ4TPCDSDataBenchmark extends TPCDSDataBenchmark {
+
+  /**
+   * Loads the input bytes via `prepareData()` and then runs the compression and
+   * decompression benchmarks inside one "Benchmark LZ4CompressionCodec" section.
+   *
+   * @param mainArgs command-line arguments; not used by this benchmark
+   */
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    prepareData()
+    runBenchmark("Benchmark LZ4CompressionCodec") {
+      compressionBenchmark()
+      decompressionBenchmark()
+    }
+  }
+
+  /**
+   * Times compressing `data` with [[LZ4CompressionCodec]], `N` rounds per case run.
+   * The compressed bytes go to a null output stream, so they are discarded.
+   */
+  private def compressionBenchmark(): Unit = {
+    val benchmark = new Benchmark("Compression", N, output = output)
+    val conf = new SparkConf(false)
+    benchmark.addCase(s"Compression $N times") { _ =>
+      // `0 until N` performs exactly N rounds, matching the case name and the N handed to
+      // Benchmark above; the previous `1 until N` performed only N - 1 rounds.
+      (0 until N).foreach { _ =>
+        val os = new LZ4CompressionCodec(conf)
+          .compressedOutputStream(OutputStream.nullOutputStream())
+        os.write(data)
+        os.close()
+      }
+    }
+    benchmark.run()
+  }
+
+  /**
+   * Times decompressing an LZ4-compressed copy of `data`, `N` rounds per case run.
+   */
+  private def decompressionBenchmark(): Unit = {
+    val benchmark = new Benchmark("Decompression", N, output = output)
+    val conf = new SparkConf(false)
+
+    // Compress once up front, outside the benchmark case, so the case itself only
+    // exercises the decompression path.
+    val outputStream = new ByteArrayOutputStream()
+    val out = new LZ4CompressionCodec(conf).compressedOutputStream(outputStream)
+    out.write(data)
+    out.close()
+    val bytes = outputStream.toByteArray
+
+    benchmark.addCase(s"Decompression $N times") { _ =>
+      // Exactly N rounds, consistent with the case name (was `1 until N`, i.e. N - 1).
+      (0 until N).foreach { _ =>
+        val bais = new ByteArrayInputStream(bytes)
+        val is = new LZ4CompressionCodec(conf).compressedInputStream(bais)
+        is.readAllBytes()
+        is.close()
+      }
+    }
+    benchmark.run()
+  }
+}
diff --git a/core/src/test/scala/org/apache/spark/io/TPCDSDataBenchmark.scala 
b/core/src/test/scala/org/apache/spark/io/TPCDSDataBenchmark.scala
new file mode 100644
index 000000000000..b6b419e0201a
--- /dev/null
+++ b/core/src/test/scala/org/apache/spark/io/TPCDSDataBenchmark.scala
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.io
+
+import java.nio.file.{Files, Paths}
+
+import org.apache.spark.benchmark.BenchmarkBase
+
+/**
+ * TPC-DS data preparation:
+ * <p>
+ * 1. Follow https://github.com/gregrahn/tpcds-kit.git to set up tpcds-kit
+ * <p>
+ * 2. Create a folder and export environment variable SPARK_TPCDS_DATA_TEXT
+ * {{{
+ * mkdir -p /path/of/tpcds-sf1-text
+ * export SPARK_TPCDS_DATA_TEXT=/path/of/tpcds-sf1-text
+ * }}}
+ * <p>
+ * 3. Generate TPC-DS (SF1) text data under SPARK_TPCDS_DATA_TEXT
+ * {{{
+ * tpcds-kit/tools/dsdgen \
+ *   -DISTRIBUTIONS tpcds-kit/tools/tpcds.idx \
+ *   -SCALE 1 \
+ *   -DIR $SPARK_TPCDS_DATA_TEXT
+ * }}}
+ */
+abstract class TPCDSDataBenchmark extends BenchmarkBase {
+
+  // Round count shared by subclasses: used in their case names and passed to `Benchmark`.
+  val N = 4
+
+  // Raw bytes of catalog_sales.dat; assigned by prepareData() and cleared in afterAll().
+  var data: Array[Byte] = _
+
+  /**
+   * Reads `catalog_sales.dat` from the directory named by the SPARK_TPCDS_DATA_TEXT
+   * environment variable into `data`.
+   *
+   * Fails an assertion when the environment variable is not set or when the file
+   * does not exist in that directory.
+   */
+  protected def prepareData(): Unit = {
+    val tpcDsDataDir = sys.env.get("SPARK_TPCDS_DATA_TEXT")
+    assert(tpcDsDataDir.nonEmpty, "Can not find env var SPARK_TPCDS_DATA_TEXT")
+
+    // `.get` is safe here: the assertion above has already rejected the empty case.
+    val catalogSalesDatPath = Paths.get(tpcDsDataDir.get, "catalog_sales.dat")
+    assert(Files.exists(catalogSalesDatPath), s"File $catalogSalesDatPath does not exist, " +
+      "please follow instruction to generate the TPC-DS (SF1) text data first.")
+
+    // the size of TPCDS catalog_sales.dat (SF1) is about 283M
+    data = Files.readAllBytes(catalogSalesDatPath)
+  }
+
+  /**
+   * Clears the reference to the loaded bytes.
+   * NOTE(review): presumably invoked by BenchmarkBase once the suite finishes - confirm.
+   */
+  override def afterAll(): Unit = {
+    data = null
+  }
+}
diff --git 
a/core/src/test/scala/org/apache/spark/io/ZStandardTPCDSDataBenchmark.scala 
b/core/src/test/scala/org/apache/spark/io/ZStandardTPCDSDataBenchmark.scala
index 69820c15be30..a546d03c8eb3 100644
--- a/core/src/test/scala/org/apache/spark/io/ZStandardTPCDSDataBenchmark.scala
+++ b/core/src/test/scala/org/apache/spark/io/ZStandardTPCDSDataBenchmark.scala
@@ -18,10 +18,9 @@
 package org.apache.spark.io
 
 import java.io.{ByteArrayInputStream, ByteArrayOutputStream, 
ObjectOutputStream, OutputStream}
-import java.nio.file.{Files, Paths}
 
 import org.apache.spark.SparkConf
-import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
+import org.apache.spark.benchmark.Benchmark
 import 
org.apache.spark.internal.config.{IO_COMPRESSION_ZSTD_BUFFERPOOL_ENABLED, 
IO_COMPRESSION_ZSTD_LEVEL, IO_COMPRESSION_ZSTD_WORKERS}
 
 /**
@@ -34,28 +33,19 @@ import 
org.apache.spark.internal.config.{IO_COMPRESSION_ZSTD_BUFFERPOOL_ENABLED,
  *      Results will be written to 
"benchmarks/ZStandardTPCDSDataBenchmark-results.txt".
  * }}}
  */
-object ZStandardTPCDSDataBenchmark extends BenchmarkBase {
-
-  val N = 4
-
-  // the size of TPCDS catalog_sales.dat (SF1) is about 283M
-  val data = Files.readAllBytes(Paths.get(sys.env("SPARK_TPCDS_DATA_TEXT"), 
"catalog_sales.dat"))
+object ZStandardTPCDSDataBenchmark extends TPCDSDataBenchmark {
 
   override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
-    val name = "Benchmark ZStandardCompressionCodec"
-    runBenchmark(name) {
-      val benchmark1 = new Benchmark(name, N, output = output)
-      compressionBenchmark(benchmark1, N)
-      benchmark1.run()
-
-      val benchmark2 = new Benchmark(name, N, output = output)
-      decompressionBenchmark(benchmark2, N)
-      benchmark2.run()
+    prepareData()
+    runBenchmark("Benchmark ZStandardCompressionCodec") {
+      compressionBenchmark()
+      decompressionBenchmark()
       parallelCompressionBenchmark()
     }
   }
 
-  private def compressionBenchmark(benchmark: Benchmark, N: Int): Unit = {
+  private def compressionBenchmark(): Unit = {
+    val benchmark = new Benchmark("Compression", N, output = output)
     Seq(false, true).foreach { enablePool =>
       Seq(1, 2, 3).foreach { level =>
         val conf = new SparkConf(false)
@@ -72,9 +62,11 @@ object ZStandardTPCDSDataBenchmark extends BenchmarkBase {
         }
       }
     }
+    benchmark.run()
   }
 
-  private def decompressionBenchmark(benchmark: Benchmark, N: Int): Unit = {
+  private def decompressionBenchmark(): Unit = {
+    val benchmark = new Benchmark("Decompression", N, output = output)
     Seq(false, true).foreach { enablePool =>
       Seq(1, 2, 3).foreach { level =>
         val conf = new SparkConf(false)
@@ -97,6 +89,7 @@ object ZStandardTPCDSDataBenchmark extends BenchmarkBase {
         }
       }
     }
+    benchmark.run()
   }
 
   private def parallelCompressionBenchmark(): Unit = {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to