[spark] branch master updated: [SPARK-42169][SQL] Implement code generation for to_csv function (StructsToCsv)

maxgekk Mon, 03 Jul 2023 00:13:32 -0700

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 45ae9c5cc67 [SPARK-42169][SQL] Implement code generation for to_csv 
function (StructsToCsv)
45ae9c5cc67 is described below

commit 45ae9c5cc67d379f5bbeadf8c56c032f2bdaaac0
Author: narek_karapetian <narek.karapetia...@yandex.ru>
AuthorDate: Mon Jul 3 10:13:12 2023 +0300

    [SPARK-42169][SQL] Implement code generation for to_csv function 
(StructsToCsv)
    
    ### What changes were proposed in this pull request?
    This PR implements `doGenCode` in the `StructsToCsv` class instead of 
having it extend the `CodegenFallback` trait (performance improvement).
    
    ### Why are the changes needed?
    It will improve performance.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    An additional test case was added to the 
`org.apache.spark.sql.catalyst.expressions.CsvExpressionsSuite` class.
    
    Closes #39719 from NarekDW/SPARK-42169.
    
    Authored-by: narek_karapetian <narek.karapetia...@yandex.ru>
    Signed-off-by: Max Gekk <max.g...@gmail.com>
---
 .../sql/catalyst/expressions/csvExpressions.scala  | 11 ++-
 .../catalyst/expressions/CsvExpressionsSuite.scala |  7 ++
 sql/core/benchmarks/CSVBenchmark-jdk11-results.txt | 82 +++++++++----------
 sql/core/benchmarks/CSVBenchmark-jdk17-results.txt | 82 +++++++++----------
 sql/core/benchmarks/CSVBenchmark-results.txt       | 94 +++++++++++-----------
 5 files changed, 144 insertions(+), 132 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala
index e47cf493d4c..cdab9faacd4 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
 import org.apache.spark.sql.catalyst.csv._
-import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, 
CodegenFallback, ExprCode}
 import org.apache.spark.sql.catalyst.util._
 import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase}
 import org.apache.spark.sql.internal.SQLConf
@@ -245,8 +245,7 @@ case class StructsToCsv(
      options: Map[String, String],
      child: Expression,
      timeZoneId: Option[String] = None)
-  extends UnaryExpression with TimeZoneAwareExpression with CodegenFallback 
with ExpectsInputTypes
-    with NullIntolerant {
+  extends UnaryExpression with TimeZoneAwareExpression with ExpectsInputTypes 
with NullIntolerant {
   override def nullable: Boolean = true
 
   def this(options: Map[String, String], child: Expression) = this(options, 
child, None)
@@ -293,4 +292,10 @@ case class StructsToCsv(
 
   override protected def withNewChildInternal(newChild: Expression): 
StructsToCsv =
     copy(child = newChild)
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): 
ExprCode = {
+    val structsToCsv = ctx.addReferenceObj("structsToCsv", this)
+    nullSafeCodeGen(ctx, ev,
+      eval => s"${ev.value} = (UTF8String) 
$structsToCsv.converter().apply($eval);")
+  }
 }
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala
index 1d174ed2145..a89cb58c3e0 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala
@@ -246,4 +246,11 @@ class CsvExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper with P
       CsvToStructs(schema, Map.empty, Literal.create("1 day")),
       InternalRow(new CalendarInterval(0, 1, 0)))
   }
+
+  test("StructsToCsv should not generate codes beyond 64KB") {
+    val range = Range.inclusive(1, 5000)
+    val struct = CreateStruct.create(range.map(Literal.apply))
+    val expected = range.mkString(",")
+    checkEvaluation(StructsToCsv(Map.empty, struct), expected)
+  }
 }
diff --git a/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt 
b/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt
index 7b5ea10bc4e..7fca105a8c2 100644
--- a/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt
@@ -2,69 +2,69 @@
 Benchmark to measure CSV read/write performance
 
================================================================================================
 
-OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure
+OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1040-azure
 Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
 Parsing quoted values:                    Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-One quoted string                                 38218          38618         
520          0.0      764362.7       1.0X
+One quoted string                                 43871          44151         
336          0.0      877415.7       1.0X
 
-OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure
+OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1040-azure
 Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
 Wide rows with 1000 columns:              Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Select 1000 columns                               97679          98487        
1143          0.0       97678.6       1.0X
-Select 100 columns                                39193          39339         
193          0.0       39193.1       2.5X
-Select one column                                 32781          33041         
265          0.0       32780.7       3.0X
-count()                                            7154           7228         
 86          0.1        7153.5      13.7X
-Select 100 columns, one bad input field           53968          54158         
165          0.0       53967.9       1.8X
-Select 100 columns, corrupt record field          59730          60100         
484          0.0       59730.2       1.6X
+Select 1000 columns                              115001         115810        
1382          0.0      115001.2       1.0X
+Select 100 columns                                45575          45646         
 84          0.0       45575.5       2.5X
+Select one column                                 38701          38744         
 67          0.0       38700.7       3.0X
+count()                                            8544           8556         
 12          0.1        8544.0      13.5X
+Select 100 columns, one bad input field           67789          67841         
 79          0.0       67788.5       1.7X
+Select 100 columns, corrupt record field          74026          74050         
 26          0.0       74026.4       1.6X
 
-OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure
+OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1040-azure
 Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
 Count a dataset with 10 columns:          Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Select 10 columns + count()                       15305          15627         
282          0.7        1530.5       1.0X
-Select 1 column + count()                         13688          13777         
106          0.7        1368.8       1.1X
-count()                                            3189           3214         
 39          3.1         318.9       4.8X
+Select 10 columns + count()                       16855          16980         
179          0.6        1685.5       1.0X
+Select 1 column + count()                         11053          11075         
 29          0.9        1105.3       1.5X
+count()                                            3646           3664         
 17          2.7         364.6       4.6X
 
-OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure
+OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1040-azure
 Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
 Write dates and timestamps:               Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Create a dataset of timestamps                     1630           1641         
  9          6.1         163.0       1.0X
-to_csv(timestamp)                                 11606          11665         
 76          0.9        1160.6       0.1X
-write timestamps to files                         10636          10742         
121          0.9        1063.6       0.2X
-Create a dataset of dates                          1854           1879         
 25          5.4         185.4       0.9X
-to_csv(date)                                       7522           7563         
 37          1.3         752.2       0.2X
-write dates to files                               6435           6526         
 85          1.6         643.5       0.3X
+Create a dataset of timestamps                     1864           1904         
 35          5.4         186.4       1.0X
+to_csv(timestamp)                                 12050          12258         
279          0.8        1205.0       0.2X
+write timestamps to files                         12564          12586         
 22          0.8        1256.4       0.1X
+Create a dataset of dates                          2093           2106         
 20          4.8         209.3       0.9X
+to_csv(date)                                       7216           7236         
 33          1.4         721.6       0.3X
+write dates to files                               7300           7382         
 71          1.4         730.0       0.3X
 
-OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure
+OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1040-azure
 Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
 Read dates and timestamps:                                             Best 
Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
-----------------------------------------------------------------------------------------------------------------------------------------------------
-read timestamp text from files                                                 
 2245           2310          57          4.5         224.5       1.0X
-read timestamps from files                                                     
27283          27875         513          0.4        2728.3       0.1X
-infer timestamps from files                                                    
55465          56311         859          0.2        5546.5       0.0X
-read date text from files                                                      
 2054           2088          38          4.9         205.4       1.1X
-read date from files                                                           
15957          16190         202          0.6        1595.7       0.1X
-infer date from files                                                          
33163          33319         135          0.3        3316.3       0.1X
-timestamp strings                                                              
 2518           2594          71          4.0         251.8       0.9X
-parse timestamps from Dataset[String]                                          
30168          30266          87          0.3        3016.8       0.1X
-infer timestamps from Dataset[String]                                          
58608          59332         728          0.2        5860.8       0.0X
-date strings                                                                   
 2803           2847          44          3.6         280.3       0.8X
-parse dates from Dataset[String]                                               
17613          17877         421          0.6        1761.3       0.1X
-from_csv(timestamp)                                                            
27736          28241         482          0.4        2773.6       0.1X
-from_csv(date)                                                                 
16415          16816         367          0.6        1641.5       0.1X
-infer error timestamps from Dataset[String] with default format                
18335          18494         138          0.5        1833.5       0.1X
-infer error timestamps from Dataset[String] with user-provided format          
18327          18598         422          0.5        1832.7       0.1X
-infer error timestamps from Dataset[String] with legacy format                 
18713          18907         267          0.5        1871.3       0.1X
+read timestamp text from files                                                 
 2432           2458          40          4.1         243.2       1.0X
+read timestamps from files                                                     
31897          31950          79          0.3        3189.7       0.1X
+infer timestamps from files                                                    
65093          65196          90          0.2        6509.3       0.0X
+read date text from files                                                      
 2201           2211          15          4.5         220.1       1.1X
+read date from files                                                           
16138          18869         NaN          0.6        1613.8       0.2X
+infer date from files                                                          
33633          33742         126          0.3        3363.3       0.1X
+timestamp strings                                                              
 2909           2930          34          3.4         290.9       0.8X
+parse timestamps from Dataset[String]                                          
34951          34984          39          0.3        3495.1       0.1X
+infer timestamps from Dataset[String]                                          
68347          68448          92          0.1        6834.7       0.0X
+date strings                                                                   
 3234           3256          24          3.1         323.4       0.8X
+parse dates from Dataset[String]                                               
18591          18657          96          0.5        1859.1       0.1X
+from_csv(timestamp)                                                            
32386          32476          78          0.3        3238.6       0.1X
+from_csv(date)                                                                 
17333          17402          67          0.6        1733.3       0.1X
+infer error timestamps from Dataset[String] with default format                
21486          21565          68          0.5        2148.6       0.1X
+infer error timestamps from Dataset[String] with user-provided format          
21683          21697          16          0.5        2168.3       0.1X
+infer error timestamps from Dataset[String] with legacy format                 
21327          21379          85          0.5        2132.7       0.1X
 
-OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure
+OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1040-azure
 Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
 Filters pushdown:                         Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-w/o filters                                       19420          19520         
 87          0.0      194201.0       1.0X
-pushdown disabled                                 19196          19507         
409          0.0      191958.0       1.0X
-w/ filters                                         1380           1402         
 19          0.1       13796.9      14.1X
+w/o filters                                       22031          22075         
 46          0.0      220305.7       1.0X
+pushdown disabled                                 21935          21958         
 21          0.0      219353.1       1.0X
+w/ filters                                         1466           1481         
 15          0.1       14662.5      15.0X
 
 
diff --git a/sql/core/benchmarks/CSVBenchmark-jdk17-results.txt 
b/sql/core/benchmarks/CSVBenchmark-jdk17-results.txt
index 9b86f237496..24c56a42963 100644
--- a/sql/core/benchmarks/CSVBenchmark-jdk17-results.txt
+++ b/sql/core/benchmarks/CSVBenchmark-jdk17-results.txt
@@ -2,69 +2,69 @@
 Benchmark to measure CSV read/write performance
 
================================================================================================
 
-OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure
+OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1040-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Parsing quoted values:                    Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-One quoted string                                 41215          41413         
184          0.0      824303.0       1.0X
+One quoted string                                 45085          45217         
227          0.0      901702.6       1.0X
 
-OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure
+OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1040-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Wide rows with 1000 columns:              Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Select 1000 columns                               82745          83284         
859          0.0       82744.6       1.0X
-Select 100 columns                                31408          31505         
 99          0.0       31407.6       2.6X
-Select one column                                 26527          26578         
 53          0.0       26526.6       3.1X
-count()                                            5168           5214         
 40          0.2        5167.9      16.0X
-Select 100 columns, one bad input field           50701          50802         
120          0.0       50700.8       1.6X
-Select 100 columns, corrupt record field          55347          55377         
 27          0.0       55347.2       1.5X
+Select 1000 columns                               84298          84785         
814          0.0       84297.9       1.0X
+Select 100 columns                                31424          31438         
 14          0.0       31424.4       2.7X
+Select one column                                 26201          26308         
124          0.0       26200.9       3.2X
+count()                                            5215           5226         
 11          0.2        5214.8      16.2X
+Select 100 columns, one bad input field           47515          47615         
 98          0.0       47514.7       1.8X
+Select 100 columns, corrupt record field          52608          52658         
 62          0.0       52607.6       1.6X
 
-OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure
+OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1040-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Count a dataset with 10 columns:          Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Select 10 columns + count()                       14368          14376         
 12          0.7        1436.8       1.0X
-Select 1 column + count()                          8791           8834         
 46          1.1         879.1       1.6X
-count()                                            2597           2613         
 13          3.8         259.7       5.5X
+Select 10 columns + count()                       15507          15522         
 14          0.6        1550.7       1.0X
+Select 1 column + count()                          9380           9397         
 15          1.1         938.0       1.7X
+count()                                            2932           2959         
 40          3.4         293.2       5.3X
 
-OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure
+OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1040-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Write dates and timestamps:               Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Create a dataset of timestamps                     1448           1475         
 30          6.9         144.8       1.0X
-to_csv(timestamp)                                  9021           9033         
 13          1.1         902.1       0.2X
-write timestamps to files                          8104           8113         
  8          1.2         810.4       0.2X
-Create a dataset of dates                          1510           1527         
 15          6.6         151.0       1.0X
-to_csv(date)                                       6114           6121         
 12          1.6         611.4       0.2X
-write dates to files                               5191           5196         
  5          1.9         519.1       0.3X
+Create a dataset of timestamps                     1486           1495         
  8          6.7         148.6       1.0X
+to_csv(timestamp)                                  8333           8351         
 21          1.2         833.3       0.2X
+write timestamps to files                          8628           8633         
  7          1.2         862.8       0.2X
+Create a dataset of dates                          1698           1713         
 14          5.9         169.8       0.9X
+to_csv(date)                                       5566           5579         
 15          1.8         556.6       0.3X
+write dates to files                               5561           5585         
 21          1.8         556.1       0.3X
 
-OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure
+OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1040-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Read dates and timestamps:                                             Best 
Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
-----------------------------------------------------------------------------------------------------------------------------------------------------
-read timestamp text from files                                                 
 1891           1900          11          5.3         189.1       1.0X
-read timestamps from files                                                     
25100          25122          27          0.4        2510.0       0.1X
-infer timestamps from files                                                    
50501          50568         110          0.2        5050.1       0.0X
-read date text from files                                                      
 1813           1816           4          5.5         181.3       1.0X
-read date from files                                                           
15558          15589          27          0.6        1555.8       0.1X
-infer date from files                                                          
31269          31335          84          0.3        3126.9       0.1X
-timestamp strings                                                              
 2126           2135          10          4.7         212.6       0.9X
-parse timestamps from Dataset[String]                                          
27361          27404          46          0.4        2736.1       0.1X
-infer timestamps from Dataset[String]                                          
52775          52897         146          0.2        5277.5       0.0X
-date strings                                                                   
 2421           2432          19          4.1         242.1       0.8X
-parse dates from Dataset[String]                                               
17745          17810          75          0.6        1774.5       0.1X
-from_csv(timestamp)                                                            
25839          25938         133          0.4        2583.9       0.1X
-from_csv(date)                                                                 
16625          16690          60          0.6        1662.5       0.1X
-infer error timestamps from Dataset[String] with default format                
20289          20376          76          0.5        2028.9       0.1X
-infer error timestamps from Dataset[String] with user-provided format          
20245          20326         108          0.5        2024.5       0.1X
-infer error timestamps from Dataset[String] with legacy format                 
20274          20314          36          0.5        2027.4       0.1X
+read timestamp text from files                                                 
 1910           1911           3          5.2         191.0       1.0X
+read timestamps from files                                                     
26650          26657           7          0.4        2665.0       0.1X
+infer timestamps from files                                                    
53172          53219          63          0.2        5317.2       0.0X
+read date text from files                                                      
 1859           1863           4          5.4         185.9       1.0X
+read date from files                                                           
15246          15259          20          0.7        1524.6       0.1X
+infer date from files                                                          
31002          31006           5          0.3        3100.2       0.1X
+timestamp strings                                                              
 2252           2257           5          4.4         225.2       0.8X
+parse timestamps from Dataset[String]                                          
28833          28871          34          0.3        2883.3       0.1X
+infer timestamps from Dataset[String]                                          
55417          55526         116          0.2        5541.7       0.0X
+date strings                                                                   
 2561           2568           6          3.9         256.1       0.7X
+parse dates from Dataset[String]                                               
17580          17601          19          0.6        1758.0       0.1X
+from_csv(timestamp)                                                            
26802          27121         280          0.4        2680.2       0.1X
+from_csv(date)                                                                 
16119          16126           6          0.6        1611.9       0.1X
+infer error timestamps from Dataset[String] with default format                
19595          19846         229          0.5        1959.5       0.1X
+infer error timestamps from Dataset[String] with user-provided format          
19816          19854          37          0.5        1981.6       0.1X
+infer error timestamps from Dataset[String] with legacy format                 
19810          19849          42          0.5        1981.0       0.1X
 
-OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure
+OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1040-azure
 Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
 Filters pushdown:                         Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-w/o filters                                       15487          15499         
 13          0.0      154874.0       1.0X
-pushdown disabled                                 15405          15411         
  5          0.0      154051.4       1.0X
-w/ filters                                         1166           1174         
  7          0.1       11660.4      13.3X
+w/o filters                                       16689          16693         
  5          0.0      166885.8       1.0X
+pushdown disabled                                 16610          16615         
  5          0.0      166095.3       1.0X
+w/ filters                                         1094           1096         
  2          0.1       10936.1      15.3X
 
 
diff --git a/sql/core/benchmarks/CSVBenchmark-results.txt 
b/sql/core/benchmarks/CSVBenchmark-results.txt
index eb1ec99123d..ff67054b93d 100644
--- a/sql/core/benchmarks/CSVBenchmark-results.txt
+++ b/sql/core/benchmarks/CSVBenchmark-results.txt
@@ -2,69 +2,69 @@
 Benchmark to measure CSV read/write performance
 
================================================================================================
 
-OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure
-Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+OpenJDK 64-Bit Server VM 1.8.0_372-b07 on Linux 5.15.0-1040-azure
+Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
 Parsing quoted values:                    Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-One quoted string                                 55478          55679         
175          0.0     1109556.3       1.0X
+One quoted string                                 43827          44673         
740          0.0      876536.0       1.0X
 
-OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure
-Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+OpenJDK 64-Bit Server VM 1.8.0_372-b07 on Linux 5.15.0-1040-azure
+Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
 Wide rows with 1000 columns:              Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Select 1000 columns                              113407         117690         
NaN          0.0      113407.3       1.0X
-Select 100 columns                                42483          43350         
918          0.0       42483.3       2.7X
-Select one column                                 36959          37454         
437          0.0       36958.5       3.1X
-count()                                           10248          11871        
1413          0.1       10248.2      11.1X
-Select 100 columns, one bad input field           61143          61339         
276          0.0       61143.4       1.9X
-Select 100 columns, corrupt record field          65546          65662         
170          0.0       65546.5       1.7X
+Select 1000 columns                               93035          94150        
1041          0.0       93035.3       1.0X
+Select 100 columns                                34333          34440         
185          0.0       34333.3       2.7X
+Select one column                                 28763          28860         
116          0.0       28763.1       3.2X
+count()                                            7449           7665         
300          0.1        7448.9      12.5X
+Select 100 columns, one bad input field           50278          50458         
175          0.0       50277.6       1.9X
+Select 100 columns, corrupt record field          53481          53833         
540          0.0       53480.7       1.7X
 
-OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure
-Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+OpenJDK 64-Bit Server VM 1.8.0_372-b07 on Linux 5.15.0-1040-azure
+Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
 Count a dataset with 10 columns:          Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Select 10 columns + count()                       12993          13063         
 83          0.8        1299.3       1.0X
-Select 1 column + count()                         11275          11448         
159          0.9        1127.5       1.2X
-count()                                            2804           2870         
 65          3.6         280.4       4.6X
+Select 10 columns + count()                       13070          13085         
 19          0.8        1307.0       1.0X
+Select 1 column + count()                         11406          11437         
 35          0.9        1140.6       1.1X
+count()                                            2840           2873         
 30          3.5         284.0       4.6X
 
-OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure
-Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+OpenJDK 64-Bit Server VM 1.8.0_372-b07 on Linux 5.15.0-1040-azure
+Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
 Write dates and timestamps:               Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-Create a dataset of timestamps                     1213           1270         
 50          8.2         121.3       1.0X
-to_csv(timestamp)                                  9959           9998         
 45          1.0         995.9       0.1X
-write timestamps to files                          8851           9069         
199          1.1         885.1       0.1X
-Create a dataset of dates                          1575           1758         
283          6.3         157.5       0.8X
-to_csv(date)                                       6708           6761         
 89          1.5         670.8       0.2X
-write dates to files                               5294           5330         
 38          1.9         529.4       0.2X
+Create a dataset of timestamps                     1150           1169         
 26          8.7         115.0       1.0X
+to_csv(timestamp)                                  9488           9499         
 15          1.1         948.8       0.1X
+write timestamps to files                          9194           9205         
 13          1.1         919.4       0.1X
+Create a dataset of dates                          1497           1506         
 15          6.7         149.7       0.8X
+to_csv(date)                                       6030           6041         
 18          1.7         603.0       0.2X
+write dates to files                               5722           5729         
  7          1.7         572.2       0.2X
 
-OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure
-Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+OpenJDK 64-Bit Server VM 1.8.0_372-b07 on Linux 5.15.0-1040-azure
+Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
 Read dates and timestamps:                                             Best 
Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
-----------------------------------------------------------------------------------------------------------------------------------------------------
-read timestamp text from files                                                 
 1822           1844          26          5.5         182.2       1.0X
-read timestamps from files                                                     
26595          26727         194          0.4        2659.5       0.1X
-infer timestamps from files                                                    
53063          53427         450          0.2        5306.3       0.0X
-read date text from files                                                      
 1621           1656          34          6.2         162.1       1.1X
-read date from files                                                           
13226          13452         197          0.8        1322.6       0.1X
-infer date from files                                                          
26920          28034        1013          0.4        2692.0       0.1X
-timestamp strings                                                              
 2663           2721          77          3.8         266.3       0.7X
-parse timestamps from Dataset[String]                                          
29204          29608         352          0.3        2920.4       0.1X
-infer timestamps from Dataset[String]                                          
57302          57486         198          0.2        5730.2       0.0X
-date strings                                                                   
 2835           2890          50          3.5         283.5       0.6X
-parse dates from Dataset[String]                                               
15775          15965         184          0.6        1577.5       0.1X
-from_csv(timestamp)                                                            
27509          27967         418          0.4        2750.9       0.1X
-from_csv(date)                                                                 
14847          15059         325          0.7        1484.7       0.1X
-infer error timestamps from Dataset[String] with default format                
17424          17695         317          0.6        1742.4       0.1X
-infer error timestamps from Dataset[String] with user-provided format          
17585          17706         110          0.6        1758.5       0.1X
-infer error timestamps from Dataset[String] with legacy format                 
17775          17855          69          0.6        1777.5       0.1X
+read timestamp text from files                                                 
 1528           1560          28          6.5         152.8       1.0X
+read timestamps from files                                                     
27594          27600           8          0.4        2759.4       0.1X
+infer timestamps from files                                                    
54923          54958          49          0.2        5492.3       0.0X
+read date text from files                                                      
 1388           1389           2          7.2         138.8       1.1X
+read date from files                                                           
13358          13388          43          0.7        1335.8       0.1X
+infer date from files                                                          
27254          27304          46          0.4        2725.4       0.1X
+timestamp strings                                                              
 2688           2698          11          3.7         268.8       0.6X
+parse timestamps from Dataset[String]                                          
30710          30731          21          0.3        3071.0       0.0X
+infer timestamps from Dataset[String]                                          
58123          58211         122          0.2        5812.3       0.0X
+date strings                                                                   
 2804           2805           1          3.6         280.4       0.5X
+parse dates from Dataset[String]                                               
15409          15459          58          0.6        1540.9       0.1X
+from_csv(timestamp)                                                            
29102          29113          17          0.3        2910.2       0.1X
+from_csv(date)                                                                 
15682          15687           6          0.6        1568.2       0.1X
+infer error timestamps from Dataset[String] with default format                
17912          17926          12          0.6        1791.2       0.1X
+infer error timestamps from Dataset[String] with user-provided format          
17892          17911          26          0.6        1789.2       0.1X
+infer error timestamps from Dataset[String] with legacy format                 
17929          17935          10          0.6        1792.9       0.1X
 
-OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure
-Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+OpenJDK 64-Bit Server VM 1.8.0_372-b07 on Linux 5.15.0-1040-azure
+Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz
 Filters pushdown:                         Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 
------------------------------------------------------------------------------------------------------------------------
-w/o filters                                       18371          18553         
205          0.0      183711.1       1.0X
-pushdown disabled                                 18462          18770         
290          0.0      184620.0       1.0X
-w/ filters                                         1836           1871         
 50          0.1       18357.8      10.0X
+w/o filters                                       17003          17018         
 14          0.0      170025.5       1.0X
+pushdown disabled                                 17092          17103         
 10          0.0      170919.6       1.0X
+w/ filters                                         1340           1352         
 13          0.1       13395.9      12.7X
 
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-42169][SQL] Implement code generation for to_csv function (StructsToCsv)

Reply via email to