This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 45ae9c5cc67 [SPARK-42169][SQL] Implement code generation for to_csv function (StructsToCsv) 45ae9c5cc67 is described below commit 45ae9c5cc67d379f5bbeadf8c56c032f2bdaaac0 Author: narek_karapetian <narek.karapetia...@yandex.ru> AuthorDate: Mon Jul 3 10:13:12 2023 +0300 [SPARK-42169][SQL] Implement code generation for to_csv function (StructsToCsv) ### What changes were proposed in this pull request? This PR implements `doGenCode` in the `StructsToCsv` class instead of having it extend the `CodegenFallback` trait (performance improvement). ### Why are the changes needed? It will improve performance. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? An additional test case was added to the `org.apache.spark.sql.catalyst.expressions.CsvExpressionsSuite` class. Closes #39719 from NarekDW/SPARK-42169. Authored-by: narek_karapetian <narek.karapetia...@yandex.ru> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../sql/catalyst/expressions/csvExpressions.scala | 11 ++- .../catalyst/expressions/CsvExpressionsSuite.scala | 7 ++ sql/core/benchmarks/CSVBenchmark-jdk11-results.txt | 82 +++++++++---------- sql/core/benchmarks/CSVBenchmark-jdk17-results.txt | 82 +++++++++---------- sql/core/benchmarks/CSVBenchmark-results.txt | 94 +++++++++++----------- 5 files changed, 144 insertions(+), 132 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala index e47cf493d4c..cdab9faacd4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/csvExpressions.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import 
org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch import org.apache.spark.sql.catalyst.csv._ -import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodegenFallback, ExprCode} import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase} import org.apache.spark.sql.internal.SQLConf @@ -245,8 +245,7 @@ case class StructsToCsv( options: Map[String, String], child: Expression, timeZoneId: Option[String] = None) - extends UnaryExpression with TimeZoneAwareExpression with CodegenFallback with ExpectsInputTypes - with NullIntolerant { + extends UnaryExpression with TimeZoneAwareExpression with ExpectsInputTypes with NullIntolerant { override def nullable: Boolean = true def this(options: Map[String, String], child: Expression) = this(options, child, None) @@ -293,4 +292,10 @@ case class StructsToCsv( override protected def withNewChildInternal(newChild: Expression): StructsToCsv = copy(child = newChild) + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val structsToCsv = ctx.addReferenceObj("structsToCsv", this) + nullSafeCodeGen(ctx, ev, + eval => s"${ev.value} = (UTF8String) $structsToCsv.converter().apply($eval);") + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala index 1d174ed2145..a89cb58c3e0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CsvExpressionsSuite.scala @@ -246,4 +246,11 @@ class CsvExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with P CsvToStructs(schema, Map.empty, Literal.create("1 day")), InternalRow(new CalendarInterval(0, 1, 
0))) } + + test("StructsToCsv should not generate codes beyond 64KB") { + val range = Range.inclusive(1, 5000) + val struct = CreateStruct.create(range.map(Literal.apply)) + val expected = range.mkString(",") + checkEvaluation(StructsToCsv(Map.empty, struct), expected) + } } diff --git a/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt b/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt index 7b5ea10bc4e..7fca105a8c2 100644 --- a/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt @@ -2,69 +2,69 @@ Benchmark to measure CSV read/write performance ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure +OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1040-azure Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Parsing quoted values: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -One quoted string 38218 38618 520 0.0 764362.7 1.0X +One quoted string 43871 44151 336 0.0 877415.7 1.0X -OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure +OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1040-azure Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 1000 columns 97679 98487 1143 0.0 97678.6 1.0X -Select 100 columns 39193 39339 193 0.0 39193.1 2.5X -Select one column 32781 33041 265 0.0 32780.7 3.0X -count() 7154 7228 86 0.1 7153.5 13.7X -Select 100 columns, one bad input field 53968 54158 165 0.0 53967.9 1.8X -Select 100 columns, corrupt record field 59730 60100 484 0.0 59730.2 1.6X +Select 1000 columns 115001 115810 1382 0.0 115001.2 1.0X 
+Select 100 columns 45575 45646 84 0.0 45575.5 2.5X +Select one column 38701 38744 67 0.0 38700.7 3.0X +count() 8544 8556 12 0.1 8544.0 13.5X +Select 100 columns, one bad input field 67789 67841 79 0.0 67788.5 1.7X +Select 100 columns, corrupt record field 74026 74050 26 0.0 74026.4 1.6X -OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure +OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1040-azure Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns + count() 15305 15627 282 0.7 1530.5 1.0X -Select 1 column + count() 13688 13777 106 0.7 1368.8 1.1X -count() 3189 3214 39 3.1 318.9 4.8X +Select 10 columns + count() 16855 16980 179 0.6 1685.5 1.0X +Select 1 column + count() 11053 11075 29 0.9 1105.3 1.5X +count() 3646 3664 17 2.7 364.6 4.6X -OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure +OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1040-azure Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 1630 1641 9 6.1 163.0 1.0X -to_csv(timestamp) 11606 11665 76 0.9 1160.6 0.1X -write timestamps to files 10636 10742 121 0.9 1063.6 0.2X -Create a dataset of dates 1854 1879 25 5.4 185.4 0.9X -to_csv(date) 7522 7563 37 1.3 752.2 0.2X -write dates to files 6435 6526 85 1.6 643.5 0.3X +Create a dataset of timestamps 1864 1904 35 5.4 186.4 1.0X +to_csv(timestamp) 12050 12258 279 0.8 1205.0 0.2X +write timestamps to files 12564 12586 22 0.8 1256.4 0.1X +Create a dataset of dates 2093 2106 20 4.8 209.3 0.9X +to_csv(date) 7216 7236 33 1.4 721.6 0.3X +write dates to files 7300 7382 
71 1.4 730.0 0.3X -OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure +OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1040-azure Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 2245 2310 57 4.5 224.5 1.0X -read timestamps from files 27283 27875 513 0.4 2728.3 0.1X -infer timestamps from files 55465 56311 859 0.2 5546.5 0.0X -read date text from files 2054 2088 38 4.9 205.4 1.1X -read date from files 15957 16190 202 0.6 1595.7 0.1X -infer date from files 33163 33319 135 0.3 3316.3 0.1X -timestamp strings 2518 2594 71 4.0 251.8 0.9X -parse timestamps from Dataset[String] 30168 30266 87 0.3 3016.8 0.1X -infer timestamps from Dataset[String] 58608 59332 728 0.2 5860.8 0.0X -date strings 2803 2847 44 3.6 280.3 0.8X -parse dates from Dataset[String] 17613 17877 421 0.6 1761.3 0.1X -from_csv(timestamp) 27736 28241 482 0.4 2773.6 0.1X -from_csv(date) 16415 16816 367 0.6 1641.5 0.1X -infer error timestamps from Dataset[String] with default format 18335 18494 138 0.5 1833.5 0.1X -infer error timestamps from Dataset[String] with user-provided format 18327 18598 422 0.5 1832.7 0.1X -infer error timestamps from Dataset[String] with legacy format 18713 18907 267 0.5 1871.3 0.1X +read timestamp text from files 2432 2458 40 4.1 243.2 1.0X +read timestamps from files 31897 31950 79 0.3 3189.7 0.1X +infer timestamps from files 65093 65196 90 0.2 6509.3 0.0X +read date text from files 2201 2211 15 4.5 220.1 1.1X +read date from files 16138 18869 NaN 0.6 1613.8 0.2X +infer date from files 33633 33742 126 0.3 3363.3 0.1X +timestamp strings 2909 2930 34 3.4 290.9 0.8X +parse timestamps from Dataset[String] 34951 34984 39 0.3 3495.1 0.1X +infer timestamps from Dataset[String] 68347 68448 92 0.1 6834.7 
0.0X +date strings 3234 3256 24 3.1 323.4 0.8X +parse dates from Dataset[String] 18591 18657 96 0.5 1859.1 0.1X +from_csv(timestamp) 32386 32476 78 0.3 3238.6 0.1X +from_csv(date) 17333 17402 67 0.6 1733.3 0.1X +infer error timestamps from Dataset[String] with default format 21486 21565 68 0.5 2148.6 0.1X +infer error timestamps from Dataset[String] with user-provided format 21683 21697 16 0.5 2168.3 0.1X +infer error timestamps from Dataset[String] with legacy format 21327 21379 85 0.5 2132.7 0.1X -OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure +OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1040-azure Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 19420 19520 87 0.0 194201.0 1.0X -pushdown disabled 19196 19507 409 0.0 191958.0 1.0X -w/ filters 1380 1402 19 0.1 13796.9 14.1X +w/o filters 22031 22075 46 0.0 220305.7 1.0X +pushdown disabled 21935 21958 21 0.0 219353.1 1.0X +w/ filters 1466 1481 15 0.1 14662.5 15.0X diff --git a/sql/core/benchmarks/CSVBenchmark-jdk17-results.txt b/sql/core/benchmarks/CSVBenchmark-jdk17-results.txt index 9b86f237496..24c56a42963 100644 --- a/sql/core/benchmarks/CSVBenchmark-jdk17-results.txt +++ b/sql/core/benchmarks/CSVBenchmark-jdk17-results.txt @@ -2,69 +2,69 @@ Benchmark to measure CSV read/write performance ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure +OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1040-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parsing quoted values: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -One quoted string 
41215 41413 184 0.0 824303.0 1.0X +One quoted string 45085 45217 227 0.0 901702.6 1.0X -OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure +OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1040-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 1000 columns 82745 83284 859 0.0 82744.6 1.0X -Select 100 columns 31408 31505 99 0.0 31407.6 2.6X -Select one column 26527 26578 53 0.0 26526.6 3.1X -count() 5168 5214 40 0.2 5167.9 16.0X -Select 100 columns, one bad input field 50701 50802 120 0.0 50700.8 1.6X -Select 100 columns, corrupt record field 55347 55377 27 0.0 55347.2 1.5X +Select 1000 columns 84298 84785 814 0.0 84297.9 1.0X +Select 100 columns 31424 31438 14 0.0 31424.4 2.7X +Select one column 26201 26308 124 0.0 26200.9 3.2X +count() 5215 5226 11 0.2 5214.8 16.2X +Select 100 columns, one bad input field 47515 47615 98 0.0 47514.7 1.8X +Select 100 columns, corrupt record field 52608 52658 62 0.0 52607.6 1.6X -OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure +OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1040-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns + count() 14368 14376 12 0.7 1436.8 1.0X -Select 1 column + count() 8791 8834 46 1.1 879.1 1.6X -count() 2597 2613 13 3.8 259.7 5.5X +Select 10 columns + count() 15507 15522 14 0.6 1550.7 1.0X +Select 1 column + count() 9380 9397 15 1.1 938.0 1.7X +count() 2932 2959 40 3.4 293.2 5.3X -OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure +OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1040-azure Intel(R) 
Xeon(R) Platinum 8272CL CPU @ 2.60GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 1448 1475 30 6.9 144.8 1.0X -to_csv(timestamp) 9021 9033 13 1.1 902.1 0.2X -write timestamps to files 8104 8113 8 1.2 810.4 0.2X -Create a dataset of dates 1510 1527 15 6.6 151.0 1.0X -to_csv(date) 6114 6121 12 1.6 611.4 0.2X -write dates to files 5191 5196 5 1.9 519.1 0.3X +Create a dataset of timestamps 1486 1495 8 6.7 148.6 1.0X +to_csv(timestamp) 8333 8351 21 1.2 833.3 0.2X +write timestamps to files 8628 8633 7 1.2 862.8 0.2X +Create a dataset of dates 1698 1713 14 5.9 169.8 0.9X +to_csv(date) 5566 5579 15 1.8 556.6 0.3X +write dates to files 5561 5585 21 1.8 556.1 0.3X -OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure +OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1040-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 1891 1900 11 5.3 189.1 1.0X -read timestamps from files 25100 25122 27 0.4 2510.0 0.1X -infer timestamps from files 50501 50568 110 0.2 5050.1 0.0X -read date text from files 1813 1816 4 5.5 181.3 1.0X -read date from files 15558 15589 27 0.6 1555.8 0.1X -infer date from files 31269 31335 84 0.3 3126.9 0.1X -timestamp strings 2126 2135 10 4.7 212.6 0.9X -parse timestamps from Dataset[String] 27361 27404 46 0.4 2736.1 0.1X -infer timestamps from Dataset[String] 52775 52897 146 0.2 5277.5 0.0X -date strings 2421 2432 19 4.1 242.1 0.8X -parse dates from Dataset[String] 17745 17810 75 0.6 1774.5 0.1X -from_csv(timestamp) 25839 25938 133 0.4 2583.9 0.1X -from_csv(date) 16625 
16690 60 0.6 1662.5 0.1X -infer error timestamps from Dataset[String] with default format 20289 20376 76 0.5 2028.9 0.1X -infer error timestamps from Dataset[String] with user-provided format 20245 20326 108 0.5 2024.5 0.1X -infer error timestamps from Dataset[String] with legacy format 20274 20314 36 0.5 2027.4 0.1X +read timestamp text from files 1910 1911 3 5.2 191.0 1.0X +read timestamps from files 26650 26657 7 0.4 2665.0 0.1X +infer timestamps from files 53172 53219 63 0.2 5317.2 0.0X +read date text from files 1859 1863 4 5.4 185.9 1.0X +read date from files 15246 15259 20 0.7 1524.6 0.1X +infer date from files 31002 31006 5 0.3 3100.2 0.1X +timestamp strings 2252 2257 5 4.4 225.2 0.8X +parse timestamps from Dataset[String] 28833 28871 34 0.3 2883.3 0.1X +infer timestamps from Dataset[String] 55417 55526 116 0.2 5541.7 0.0X +date strings 2561 2568 6 3.9 256.1 0.7X +parse dates from Dataset[String] 17580 17601 19 0.6 1758.0 0.1X +from_csv(timestamp) 26802 27121 280 0.4 2680.2 0.1X +from_csv(date) 16119 16126 6 0.6 1611.9 0.1X +infer error timestamps from Dataset[String] with default format 19595 19846 229 0.5 1959.5 0.1X +infer error timestamps from Dataset[String] with user-provided format 19816 19854 37 0.5 1981.6 0.1X +infer error timestamps from Dataset[String] with legacy format 19810 19849 42 0.5 1981.0 0.1X -OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure +OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1040-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 15487 15499 13 0.0 154874.0 1.0X -pushdown disabled 15405 15411 5 0.0 154051.4 1.0X -w/ filters 1166 1174 7 0.1 11660.4 13.3X +w/o filters 16689 16693 5 0.0 166885.8 1.0X +pushdown disabled 16610 16615 5 0.0 166095.3 1.0X +w/ filters 1094 1096 2 0.1 10936.1 15.3X 
diff --git a/sql/core/benchmarks/CSVBenchmark-results.txt b/sql/core/benchmarks/CSVBenchmark-results.txt index eb1ec99123d..ff67054b93d 100644 --- a/sql/core/benchmarks/CSVBenchmark-results.txt +++ b/sql/core/benchmarks/CSVBenchmark-results.txt @@ -2,69 +2,69 @@ Benchmark to measure CSV read/write performance ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_372-b07 on Linux 5.15.0-1040-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Parsing quoted values: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -One quoted string 55478 55679 175 0.0 1109556.3 1.0X +One quoted string 43827 44673 740 0.0 876536.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_372-b07 on Linux 5.15.0-1040-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 1000 columns 113407 117690 NaN 0.0 113407.3 1.0X -Select 100 columns 42483 43350 918 0.0 42483.3 2.7X -Select one column 36959 37454 437 0.0 36958.5 3.1X -count() 10248 11871 1413 0.1 10248.2 11.1X -Select 100 columns, one bad input field 61143 61339 276 0.0 61143.4 1.9X -Select 100 columns, corrupt record field 65546 65662 170 0.0 65546.5 1.7X +Select 1000 columns 93035 94150 1041 0.0 93035.3 1.0X +Select 100 columns 34333 34440 185 0.0 34333.3 2.7X +Select one column 28763 28860 116 0.0 28763.1 3.2X +count() 7449 7665 300 0.1 7448.9 12.5X +Select 100 columns, one bad input field 
50278 50458 175 0.0 50277.6 1.9X +Select 100 columns, corrupt record field 53481 53833 540 0.0 53480.7 1.7X -OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_372-b07 on Linux 5.15.0-1040-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns + count() 12993 13063 83 0.8 1299.3 1.0X -Select 1 column + count() 11275 11448 159 0.9 1127.5 1.2X -count() 2804 2870 65 3.6 280.4 4.6X +Select 10 columns + count() 13070 13085 19 0.8 1307.0 1.0X +Select 1 column + count() 11406 11437 35 0.9 1140.6 1.1X +count() 2840 2873 30 3.5 284.0 4.6X -OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_372-b07 on Linux 5.15.0-1040-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 1213 1270 50 8.2 121.3 1.0X -to_csv(timestamp) 9959 9998 45 1.0 995.9 0.1X -write timestamps to files 8851 9069 199 1.1 885.1 0.1X -Create a dataset of dates 1575 1758 283 6.3 157.5 0.8X -to_csv(date) 6708 6761 89 1.5 670.8 0.2X -write dates to files 5294 5330 38 1.9 529.4 0.2X +Create a dataset of timestamps 1150 1169 26 8.7 115.0 1.0X +to_csv(timestamp) 9488 9499 15 1.1 948.8 0.1X +write timestamps to files 9194 9205 13 1.1 919.4 0.1X +Create a dataset of dates 1497 1506 15 6.7 149.7 0.8X +to_csv(date) 6030 6041 18 1.7 603.0 0.2X +write dates to files 5722 5729 7 1.7 572.2 0.2X -OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure 
-Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_372-b07 on Linux 5.15.0-1040-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 1822 1844 26 5.5 182.2 1.0X -read timestamps from files 26595 26727 194 0.4 2659.5 0.1X -infer timestamps from files 53063 53427 450 0.2 5306.3 0.0X -read date text from files 1621 1656 34 6.2 162.1 1.1X -read date from files 13226 13452 197 0.8 1322.6 0.1X -infer date from files 26920 28034 1013 0.4 2692.0 0.1X -timestamp strings 2663 2721 77 3.8 266.3 0.7X -parse timestamps from Dataset[String] 29204 29608 352 0.3 2920.4 0.1X -infer timestamps from Dataset[String] 57302 57486 198 0.2 5730.2 0.0X -date strings 2835 2890 50 3.5 283.5 0.6X -parse dates from Dataset[String] 15775 15965 184 0.6 1577.5 0.1X -from_csv(timestamp) 27509 27967 418 0.4 2750.9 0.1X -from_csv(date) 14847 15059 325 0.7 1484.7 0.1X -infer error timestamps from Dataset[String] with default format 17424 17695 317 0.6 1742.4 0.1X -infer error timestamps from Dataset[String] with user-provided format 17585 17706 110 0.6 1758.5 0.1X -infer error timestamps from Dataset[String] with legacy format 17775 17855 69 0.6 1777.5 0.1X +read timestamp text from files 1528 1560 28 6.5 152.8 1.0X +read timestamps from files 27594 27600 8 0.4 2759.4 0.1X +infer timestamps from files 54923 54958 49 0.2 5492.3 0.0X +read date text from files 1388 1389 2 7.2 138.8 1.1X +read date from files 13358 13388 43 0.7 1335.8 0.1X +infer date from files 27254 27304 46 0.4 2725.4 0.1X +timestamp strings 2688 2698 11 3.7 268.8 0.6X +parse timestamps from Dataset[String] 30710 30731 21 0.3 3071.0 0.0X +infer timestamps from Dataset[String] 58123 58211 122 0.2 5812.3 0.0X +date strings 2804 2805 1 
3.6 280.4 0.5X +parse dates from Dataset[String] 15409 15459 58 0.6 1540.9 0.1X +from_csv(timestamp) 29102 29113 17 0.3 2910.2 0.1X +from_csv(date) 15682 15687 6 0.6 1568.2 0.1X +infer error timestamps from Dataset[String] with default format 17912 17926 12 0.6 1791.2 0.1X +infer error timestamps from Dataset[String] with user-provided format 17892 17911 26 0.6 1789.2 0.1X +infer error timestamps from Dataset[String] with legacy format 17929 17935 10 0.6 1792.9 0.1X -OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_372-b07 on Linux 5.15.0-1040-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 18371 18553 205 0.0 183711.1 1.0X -pushdown disabled 18462 18770 290 0.0 184620.0 1.0X -w/ filters 1836 1871 50 0.1 18357.8 10.0X +w/o filters 17003 17018 14 0.0 170025.5 1.0X +pushdown disabled 17092 17103 10 0.0 170919.6 1.0X +w/ filters 1340 1352 13 0.1 13395.9 12.7X --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org