This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 3192bbd2958 [SPARK-39281][SQL] Speed up Timestamp type inference with legacy format in JSON/CSV data source 3192bbd2958 is described below commit 3192bbd29585607d43d0819c6c2d3ac00180261a Author: Jia Fan <fanjiaemi...@qq.com> AuthorDate: Tue May 16 15:59:01 2023 +0300 [SPARK-39281][SQL] Speed up Timestamp type inference with legacy format in JSON/CSV data source ### What changes were proposed in this pull request? Follow-up to https://github.com/apache/spark/pull/36562 : a performance improvement for Timestamp type inference with the legacy format. In the current implementation of the CSV/JSON data sources, schema inference with the legacy format relies on methods that throw exceptions if the fields can't be converted to some data types. Throwing and catching exceptions can be slow. We can improve it by creating methods that return optional results instead. The optimization of DefaultTimestampFormatter has been implemented in https://github.com/apache/spark/pull/36562 ; this PR adds the optimization for the legacy format. The basic logic is to prevent the formatter from throwing exceptions, and then check the parse result to determine whether the parsing is successful. ### Why are the changes needed? Performance improvement for Timestamp type inference with the legacy format. When using the JSON datasource, the speedup is `67%`. The CSV datasource is also faster, but the improvement is not obvious. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added a new test. Closes #41091 from Hisoka-X/SPARK-39281_legacy_format. 
Lead-authored-by: Jia Fan <fanjiaemi...@qq.com> Co-authored-by: Hisoka <fanjiaemi...@qq.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../sql/catalyst/util/TimestampFormatter.scala | 22 ++++ .../catalyst/util/TimestampFormatterSuite.scala | 19 ++++ sql/core/benchmarks/CSVBenchmark-jdk11-results.txt | 82 +++++++------- sql/core/benchmarks/CSVBenchmark-jdk17-results.txt | 82 +++++++------- sql/core/benchmarks/CSVBenchmark-results.txt | 82 +++++++------- .../benchmarks/JsonBenchmark-jdk11-results.txt | 98 ++++++++--------- .../benchmarks/JsonBenchmark-jdk17-results.txt | 122 ++++++++++----------- sql/core/benchmarks/JsonBenchmark-results.txt | 122 ++++++++++----------- 8 files changed, 335 insertions(+), 294 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala index 2a8283bde1d..aab90ec3844 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala @@ -407,6 +407,19 @@ class LegacyFastTimestampFormatter( if (!fastDateFormat.parse(s, new ParsePosition(0), cal)) { throw new IllegalArgumentException(s"'$s' is an invalid timestamp") } + extractMicros(cal) + } + + override def parseOptional(s: String): Option[Long] = { + cal.clear() // Clear the calendar because it can be re-used many times + if (fastDateFormat.parse(s, new ParsePosition(0), cal)) { + Some(extractMicros(cal)) + } else { + None + } + } + + private def extractMicros(cal: MicrosCalendar): Long = { val micros = cal.getMicros() cal.set(Calendar.MILLISECOND, 0) val julianMicros = Math.addExact(millisToMicros(cal.getTimeInMillis), micros) @@ -451,6 +464,15 @@ class LegacySimpleTimestampFormatter( fromJavaTimestamp(new Timestamp(sdf.parse(s).getTime)) } + override def parseOptional(s: String): Option[Long] = { + val date = 
sdf.parse(s, new ParsePosition(0)) + if (date == null) { + None + } else { + Some(fromJavaTimestamp(new Timestamp(date.getTime))) + } + } + override def format(us: Long): String = { sdf.format(toJavaTimestamp(us)) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala index 10553d421ea..8f6099e96ef 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala @@ -489,4 +489,23 @@ class TimestampFormatterSuite extends DatetimeFormatterSuite { assert(formatter.parseWithoutTimeZoneOptional("2012-00-65 23:59:59.9990", false) .isEmpty) } + + test("SPARK-39281: support returning optional parse results in the legacy formatter") { + val fastFormatter = new LegacyFastTimestampFormatter( + "yyyy-MM-dd HH:mm:ss.SSSS", + locale = DateFormatter.defaultLocale, + zoneId = DateTimeTestUtils.UTC) + + val simpleFormatter = new LegacySimpleTimestampFormatter( + "yyyy-MM-dd HH:mm:ss.SSSS", + locale = DateFormatter.defaultLocale, + zoneId = DateTimeTestUtils.UTC) + + assert(fastFormatter.parseOptional("2023-12-31 23:59:59.9990").contains(1704067199999000L)) + assert(fastFormatter.parseOptional("abc").isEmpty) + + assert(simpleFormatter.parseOptional("2023-12-31 23:59:59.9990").contains(1704067208990000L)) + assert(simpleFormatter.parseOptional("abc").isEmpty) + + } } diff --git a/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt b/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt index 13135d877d9..7b5ea10bc4e 100644 --- a/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt @@ -3,68 +3,68 @@ Benchmark to measure CSV read/write performance ================================================================================================ OpenJDK 
64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Parsing quoted values: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -One quoted string 30792 30900 145 0.0 615841.7 1.0X +One quoted string 38218 38618 520 0.0 764362.7 1.0X OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 1000 columns 74830 75688 1366 0.0 74829.6 1.0X -Select 100 columns 31458 31551 86 0.0 31458.3 2.4X -Select one column 26899 26950 56 0.0 26899.4 2.8X -count() 4864 4899 31 0.2 4863.6 15.4X -Select 100 columns, one bad input field 48174 48196 27 0.0 48174.4 1.6X -Select 100 columns, corrupt record field 52823 52911 137 0.0 52823.1 1.4X +Select 1000 columns 97679 98487 1143 0.0 97678.6 1.0X +Select 100 columns 39193 39339 193 0.0 39193.1 2.5X +Select one column 32781 33041 265 0.0 32780.7 3.0X +count() 7154 7228 86 0.1 7153.5 13.7X +Select 100 columns, one bad input field 53968 54158 165 0.0 53967.9 1.8X +Select 100 columns, corrupt record field 59730 60100 484 0.0 59730.2 1.6X OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns + count() 13875 13903 41 0.7 1387.5 1.0X -Select 1 column + count() 
11740 11781 42 0.9 1174.0 1.2X -count() 2775 2784 8 3.6 277.5 5.0X +Select 10 columns + count() 15305 15627 282 0.7 1530.5 1.0X +Select 1 column + count() 13688 13777 106 0.7 1368.8 1.1X +count() 3189 3214 39 3.1 318.9 4.8X OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 1358 1382 23 7.4 135.8 1.0X -to_csv(timestamp) 10820 10870 43 0.9 1082.0 0.1X -write timestamps to files 9951 9970 28 1.0 995.1 0.1X -Create a dataset of dates 1547 1563 22 6.5 154.7 0.9X -to_csv(date) 7164 7179 14 1.4 716.4 0.2X -write dates to files 5973 5998 40 1.7 597.3 0.2X +Create a dataset of timestamps 1630 1641 9 6.1 163.0 1.0X +to_csv(timestamp) 11606 11665 76 0.9 1160.6 0.1X +write timestamps to files 10636 10742 121 0.9 1063.6 0.2X +Create a dataset of dates 1854 1879 25 5.4 185.4 0.9X +to_csv(date) 7522 7563 37 1.3 752.2 0.2X +write dates to files 6435 6526 85 1.6 643.5 0.3X OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 1688 1696 7 5.9 168.8 1.0X -read timestamps from files 28003 28016 13 0.4 2800.3 0.1X -infer timestamps from files 56206 56293 137 0.2 5620.6 0.0X -read date text from files 1560 1560 0 6.4 156.0 1.1X -read date from files 14027 14086 57 0.7 1402.7 0.1X -infer date from files 28222 28292 76 0.4 2822.2 0.1X -timestamp strings 2164 2169 9 4.6 
216.4 0.8X -parse timestamps from Dataset[String] 30744 30792 68 0.3 3074.4 0.1X -infer timestamps from Dataset[String] 59417 59438 20 0.2 5941.7 0.0X -date strings 2481 2492 11 4.0 248.1 0.7X -parse dates from Dataset[String] 16268 16353 74 0.6 1626.8 0.1X -from_csv(timestamp) 28849 29197 302 0.3 2884.9 0.1X -from_csv(date) 15273 15277 4 0.7 1527.3 0.1X -infer error timestamps from Dataset[String] with default format 17427 17493 80 0.6 1742.7 0.1X -infer error timestamps from Dataset[String] with user-provided format 17467 17509 50 0.6 1746.7 0.1X -infer error timestamps from Dataset[String] with legacy format 17501 17515 18 0.6 1750.1 0.1X +read timestamp text from files 2245 2310 57 4.5 224.5 1.0X +read timestamps from files 27283 27875 513 0.4 2728.3 0.1X +infer timestamps from files 55465 56311 859 0.2 5546.5 0.0X +read date text from files 2054 2088 38 4.9 205.4 1.1X +read date from files 15957 16190 202 0.6 1595.7 0.1X +infer date from files 33163 33319 135 0.3 3316.3 0.1X +timestamp strings 2518 2594 71 4.0 251.8 0.9X +parse timestamps from Dataset[String] 30168 30266 87 0.3 3016.8 0.1X +infer timestamps from Dataset[String] 58608 59332 728 0.2 5860.8 0.0X +date strings 2803 2847 44 3.6 280.3 0.8X +parse dates from Dataset[String] 17613 17877 421 0.6 1761.3 0.1X +from_csv(timestamp) 27736 28241 482 0.4 2773.6 0.1X +from_csv(date) 16415 16816 367 0.6 1641.5 0.1X +infer error timestamps from Dataset[String] with default format 18335 18494 138 0.5 1833.5 0.1X +infer error timestamps from Dataset[String] with user-provided format 18327 18598 422 0.5 1832.7 0.1X +infer error timestamps from Dataset[String] with legacy format 18713 18907 267 0.5 1871.3 0.1X OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -w/o filters 17666 17679 11 0.0 176663.1 1.0X -pushdown disabled 17611 17622 16 0.0 176107.8 1.0X -w/ filters 1130 1139 9 0.1 11295.5 15.6X +w/o filters 19420 19520 87 0.0 194201.0 1.0X +pushdown disabled 19196 19507 409 0.0 191958.0 1.0X +w/ filters 1380 1402 19 0.1 13796.9 14.1X diff --git a/sql/core/benchmarks/CSVBenchmark-jdk17-results.txt b/sql/core/benchmarks/CSVBenchmark-jdk17-results.txt index 952557b034e..9b86f237496 100644 --- a/sql/core/benchmarks/CSVBenchmark-jdk17-results.txt +++ b/sql/core/benchmarks/CSVBenchmark-jdk17-results.txt @@ -3,68 +3,68 @@ Benchmark to measure CSV read/write performance ================================================================================================ OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Parsing quoted values: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -One quoted string 43057 43213 259 0.0 861149.7 1.0X +One quoted string 41215 41413 184 0.0 824303.0 1.0X OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 1000 columns 99482 104630 NaN 0.0 99482.1 1.0X -Select 100 columns 34250 35621 1305 0.0 34249.7 2.9X -Select one column 34061 34275 213 0.0 34061.2 2.9X -count() 7097 7183 100 0.1 7097.2 14.0X -Select 100 columns, one bad input field 60693 63516 NaN 0.0 60693.0 1.6X -Select 100 columns, corrupt record field 71743 
72373 967 0.0 71743.2 1.4X +Select 1000 columns 82745 83284 859 0.0 82744.6 1.0X +Select 100 columns 31408 31505 99 0.0 31407.6 2.6X +Select one column 26527 26578 53 0.0 26526.6 3.1X +count() 5168 5214 40 0.2 5167.9 16.0X +Select 100 columns, one bad input field 50701 50802 120 0.0 50700.8 1.6X +Select 100 columns, corrupt record field 55347 55377 27 0.0 55347.2 1.5X OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns + count() 13285 13689 499 0.8 1328.5 1.0X -Select 1 column + count() 8957 9828 756 1.1 895.7 1.5X -count() 3041 3051 11 3.3 304.1 4.4X +Select 10 columns + count() 14368 14376 12 0.7 1436.8 1.0X +Select 1 column + count() 8791 8834 46 1.1 879.1 1.6X +count() 2597 2613 13 3.8 259.7 5.5X OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 1677 1710 29 6.0 167.7 1.0X -to_csv(timestamp) 11262 11402 153 0.9 1126.2 0.1X -write timestamps to files 11460 11492 28 0.9 1146.0 0.1X -Create a dataset of dates 1857 1948 109 5.4 185.7 0.9X -to_csv(date) 7411 7531 125 1.3 741.1 0.2X -write dates to files 6372 6467 152 1.6 637.2 0.3X +Create a dataset of timestamps 1448 1475 30 6.9 144.8 1.0X +to_csv(timestamp) 9021 9033 13 1.1 902.1 0.2X +write timestamps to files 8104 8113 8 1.2 810.4 0.2X +Create a dataset of dates 1510 1527 15 6.6 151.0 1.0X +to_csv(date) 6114 6121 12 1.6 611.4 0.2X 
+write dates to files 5191 5196 5 1.9 519.1 0.3X OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 2408 2496 90 4.2 240.8 1.0X -read timestamps from files 25407 26546 1412 0.4 2540.7 0.1X -infer timestamps from files 53186 53712 864 0.2 5318.6 0.0X -read date text from files 2245 2269 34 4.5 224.5 1.1X -read date from files 14504 15031 547 0.7 1450.4 0.2X -infer date from files 31829 32093 380 0.3 3182.9 0.1X -timestamp strings 2960 2988 26 3.4 296.0 0.8X -parse timestamps from Dataset[String] 30696 31474 715 0.3 3069.6 0.1X -infer timestamps from Dataset[String] 58994 61161 2098 0.2 5899.4 0.0X -date strings 2983 3018 44 3.4 298.3 0.8X -parse dates from Dataset[String] 15972 16140 146 0.6 1597.2 0.2X -from_csv(timestamp) 26790 27879 1022 0.4 2679.0 0.1X -from_csv(date) 15463 15806 305 0.6 1546.3 0.2X -infer error timestamps from Dataset[String] with default format 18566 19341 672 0.5 1856.6 0.1X -infer error timestamps from Dataset[String] with user-provided format 19675 19891 188 0.5 1967.5 0.1X -infer error timestamps from Dataset[String] with legacy format 20196 20244 43 0.5 2019.6 0.1X +read timestamp text from files 1891 1900 11 5.3 189.1 1.0X +read timestamps from files 25100 25122 27 0.4 2510.0 0.1X +infer timestamps from files 50501 50568 110 0.2 5050.1 0.0X +read date text from files 1813 1816 4 5.5 181.3 1.0X +read date from files 15558 15589 27 0.6 1555.8 0.1X +infer date from files 31269 31335 84 0.3 3126.9 0.1X +timestamp strings 2126 2135 10 4.7 212.6 0.9X +parse timestamps from Dataset[String] 27361 27404 46 0.4 2736.1 0.1X +infer timestamps from Dataset[String] 52775 52897 146 
0.2 5277.5 0.0X +date strings 2421 2432 19 4.1 242.1 0.8X +parse dates from Dataset[String] 17745 17810 75 0.6 1774.5 0.1X +from_csv(timestamp) 25839 25938 133 0.4 2583.9 0.1X +from_csv(date) 16625 16690 60 0.6 1662.5 0.1X +infer error timestamps from Dataset[String] with default format 20289 20376 76 0.5 2028.9 0.1X +infer error timestamps from Dataset[String] with user-provided format 20245 20326 108 0.5 2024.5 0.1X +infer error timestamps from Dataset[String] with legacy format 20274 20314 36 0.5 2027.4 0.1X OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 20354 20531 233 0.0 203535.5 1.0X -pushdown disabled 19266 19815 476 0.0 192655.7 1.1X -w/ filters 1515 1526 10 0.1 15147.9 13.4X +w/o filters 15487 15499 13 0.0 154874.0 1.0X +pushdown disabled 15405 15411 5 0.0 154051.4 1.0X +w/ filters 1166 1174 7 0.1 11660.4 13.3X diff --git a/sql/core/benchmarks/CSVBenchmark-results.txt b/sql/core/benchmarks/CSVBenchmark-results.txt index 93dacefa8cf..eb1ec99123d 100644 --- a/sql/core/benchmarks/CSVBenchmark-results.txt +++ b/sql/core/benchmarks/CSVBenchmark-results.txt @@ -3,68 +3,68 @@ Benchmark to measure CSV read/write performance ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Parsing quoted values: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -One quoted string 59323 60575 1109 0.0 1186457.9 1.0X +One 
quoted string 55478 55679 175 0.0 1109556.3 1.0X OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 1000 columns 137166 140409 2130 0.0 137165.6 1.0X -Select 100 columns 49431 49763 556 0.0 49430.7 2.8X -Select one column 41024 41352 288 0.0 41023.6 3.3X -count() 10378 11389 937 0.1 10377.8 13.2X -Select 100 columns, one bad input field 77370 77652 407 0.0 77369.7 1.8X -Select 100 columns, corrupt record field 84974 86193 1407 0.0 84974.4 1.6X +Select 1000 columns 113407 117690 NaN 0.0 113407.3 1.0X +Select 100 columns 42483 43350 918 0.0 42483.3 2.7X +Select one column 36959 37454 437 0.0 36958.5 3.1X +count() 10248 11871 1413 0.1 10248.2 11.1X +Select 100 columns, one bad input field 61143 61339 276 0.0 61143.4 1.9X +Select 100 columns, corrupt record field 65546 65662 170 0.0 65546.5 1.7X OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns + count() 16172 16387 231 0.6 1617.2 1.0X -Select 1 column + count() 12655 12963 331 0.8 1265.5 1.3X -count() 3674 3690 15 2.7 367.4 4.4X +Select 10 columns + count() 12993 13063 83 0.8 1299.3 1.0X +Select 1 column + count() 11275 11448 159 0.9 1127.5 1.2X +count() 2804 2870 65 3.6 280.4 4.6X OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Write dates and 
timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 1642 1693 45 6.1 164.2 1.0X -to_csv(timestamp) 12293 12549 258 0.8 1229.3 0.1X -write timestamps to files 11009 11067 64 0.9 1100.9 0.1X -Create a dataset of dates 1978 1993 26 5.1 197.8 0.8X -to_csv(date) 8332 8488 136 1.2 833.2 0.2X -write dates to files 6983 7017 34 1.4 698.3 0.2X +Create a dataset of timestamps 1213 1270 50 8.2 121.3 1.0X +to_csv(timestamp) 9959 9998 45 1.0 995.9 0.1X +write timestamps to files 8851 9069 199 1.1 885.1 0.1X +Create a dataset of dates 1575 1758 283 6.3 157.5 0.8X +to_csv(date) 6708 6761 89 1.5 670.8 0.2X +write dates to files 5294 5330 38 1.9 529.4 0.2X OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 2358 2366 9 4.2 235.8 1.0X -read timestamps from files 31052 31216 157 0.3 3105.2 0.1X -infer timestamps from files 61690 61964 279 0.2 6169.0 0.0X -read date text from files 2071 2126 50 4.8 207.1 1.1X -read date from files 15856 16016 187 0.6 1585.6 0.1X -infer date from files 31739 32099 404 0.3 3173.9 0.1X -timestamp strings 3335 3378 64 3.0 333.5 0.7X -parse timestamps from Dataset[String] 34779 35330 506 0.3 3477.9 0.1X -infer timestamps from Dataset[String] 66644 66767 110 0.2 6664.4 0.0X -date strings 3480 3528 45 2.9 348.0 0.7X -parse dates from Dataset[String] 18673 18824 196 0.5 1867.3 0.1X -from_csv(timestamp) 31932 32168 224 0.3 3193.2 0.1X -from_csv(date) 16794 16810 20 0.6 1679.4 0.1X -infer error timestamps from 
Dataset[String] with default format 20564 20839 239 0.5 2056.4 0.1X -infer error timestamps from Dataset[String] with user-provided format 20701 20767 59 0.5 2070.1 0.1X -infer error timestamps from Dataset[String] with legacy format 20486 20784 259 0.5 2048.6 0.1X +read timestamp text from files 1822 1844 26 5.5 182.2 1.0X +read timestamps from files 26595 26727 194 0.4 2659.5 0.1X +infer timestamps from files 53063 53427 450 0.2 5306.3 0.0X +read date text from files 1621 1656 34 6.2 162.1 1.1X +read date from files 13226 13452 197 0.8 1322.6 0.1X +infer date from files 26920 28034 1013 0.4 2692.0 0.1X +timestamp strings 2663 2721 77 3.8 266.3 0.7X +parse timestamps from Dataset[String] 29204 29608 352 0.3 2920.4 0.1X +infer timestamps from Dataset[String] 57302 57486 198 0.2 5730.2 0.0X +date strings 2835 2890 50 3.5 283.5 0.6X +parse dates from Dataset[String] 15775 15965 184 0.6 1577.5 0.1X +from_csv(timestamp) 27509 27967 418 0.4 2750.9 0.1X +from_csv(date) 14847 15059 325 0.7 1484.7 0.1X +infer error timestamps from Dataset[String] with default format 17424 17695 317 0.6 1742.4 0.1X +infer error timestamps from Dataset[String] with user-provided format 17585 17706 110 0.6 1758.5 0.1X +infer error timestamps from Dataset[String] with legacy format 17775 17855 69 0.6 1777.5 0.1X OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 21751 22231 570 0.0 217509.9 1.0X -pushdown disabled 21961 22114 135 0.0 219611.1 1.0X -w/ filters 2189 2215 24 0.0 21891.4 9.9X +w/o filters 18371 18553 205 0.0 183711.1 1.0X +pushdown disabled 18462 18770 290 0.0 184620.0 1.0X +w/ filters 1836 1871 50 0.1 18357.8 10.0X diff --git 
a/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt b/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt index 711596fa9c7..edf089b9b1e 100644 --- a/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt @@ -7,117 +7,117 @@ OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 3104 3137 30 1.6 620.8 1.0X -UTF-8 is set 4525 4556 27 1.1 904.9 0.7X +No encoding 3150 3166 27 1.6 630.1 1.0X +UTF-8 is set 4572 4585 12 1.1 914.4 0.7X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 2640 2677 33 1.9 528.0 1.0X -UTF-8 is set 3824 3850 27 1.3 764.8 0.7X +No encoding 2422 2475 50 2.1 484.4 1.0X +UTF-8 is set 3786 3796 14 1.3 757.2 0.6X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 4943 5075 115 0.2 4942.6 1.0X -UTF-8 is set 8842 8861 26 0.1 8842.4 0.6X +No encoding 5104 5170 87 0.2 5104.0 1.0X +UTF-8 is set 9229 9246 15 0.1 9228.7 0.6X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 13730 13899 180 0.0 274590.8 1.0X -UTF-8 is set 15417 15493 105 0.0 308348.5 0.9X +No encoding 13977 14153 277 0.0 279538.0 1.0X +UTF-8 is set 16231 16284 70 0.0 324628.3 0.9X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns 2109 2156 78 0.5 2109.0 1.0X -Select 1 column 1500 1509 8 0.7 1500.2 1.4X +Select 10 columns 2197 2232 42 0.5 2196.7 1.0X +Select 1 column 1560 1567 9 0.6 1560.2 1.4X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Short column without encoding 702 728 22 1.4 702.4 1.0X -Short column with UTF-8 969 1000 32 1.0 969.3 0.7X -Wide column without encoding 7944 7981 55 0.1 7943.6 0.1X -Wide column with UTF-8 14084 14094 11 0.1 14083.9 0.0X +Short column without encoding 688 709 18 1.5 688.3 1.0X +Short column with UTF-8 939 963 21 1.1 939.4 0.7X +Wide column without encoding 8049 8102 66 0.1 8048.7 0.1X +Wide column with UTF-8 14346 14368 28 0.1 14345.7 0.0X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 97 105 8 10.3 96.9 1.0X -from_json 1933 1953 24 0.5 1933.3 0.1X -json_tuple 2170 2183 14 0.5 2170.4 0.0X -get_json_object 2013 2032 21 0.5 2012.5 0.0X +Text read 101 103 2 9.9 100.5 1.0X +from_json 1960 1965 6 0.5 1960.1 0.1X +json_tuple 2226 2235 13 0.4 2226.3 0.0X +get_json_object 2077 2088 12 0.5 2077.0 0.0X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 407 417 16 12.3 81.4 1.0X -schema inferring 2553 2564 18 2.0 510.7 0.2X -parsing 2911 2922 10 1.7 582.1 0.1X +Text read 416 426 14 12.0 83.2 1.0X +schema inferring 2606 2619 18 1.9 521.3 0.2X +parsing 2871 2872 1 1.7 574.2 0.1X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 959 968 8 5.2 191.7 1.0X -Schema inferring 3226 3238 10 1.5 645.3 0.3X -Parsing without charset 3234 3244 15 1.5 646.8 0.3X -Parsing with UTF-8 4603 4615 17 1.1 920.7 0.2X +Text read 986 994 9 5.1 197.1 1.0X +Schema inferring 3347 3380 49 1.5 669.4 0.3X +Parsing without charset 3294 3306 14 1.5 658.8 0.3X +Parsing with UTF-8 4441 4447 8 1.1 888.1 0.2X OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 201 207 7 5.0 201.3 1.0X -to_json(timestamp) 1113 1127 13 0.9 1113.4 0.2X -write timestamps to files 1003 1019 14 1.0 1003.4 0.2X -Create a dataset of dates 212 217 5 4.7 211.8 1.0X -to_json(date) 796 796 1 1.3 795.8 0.3X -write dates to files 647 656 10 1.5 647.1 0.3X +Create a dataset of timestamps 175 184 11 5.7 175.1 1.0X +to_json(timestamp) 1159 1170 18 0.9 1158.7 0.2X +write timestamps to files 1026 1036 17 1.0 1026.0 0.2X +Create a dataset of dates 202 211 12 5.0 201.9 0.9X +to_json(date) 808 818 15 1.2 808.5 0.2X +write dates to files 642 652 8 1.6 642.5 0.3X OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp 
text from files 253 268 23 4.0 252.8 1.0X -read timestamps from files 2559 2568 15 0.4 2558.8 0.1X -infer timestamps from files 6594 6604 14 0.2 6594.3 0.0X -read date text from files 230 242 21 4.3 230.2 1.1X -read date from files 814 828 16 1.2 814.3 0.3X -timestamp strings 269 279 15 3.7 268.9 0.9X -parse timestamps from Dataset[String] 2779 2784 7 0.4 2778.7 0.1X -infer timestamps from Dataset[String] 6808 6811 3 0.1 6808.0 0.0X -date strings 358 366 11 2.8 357.9 0.7X -parse dates from Dataset[String] 1193 1206 19 0.8 1193.4 0.2X -from_json(timestamp) 4020 4024 5 0.2 4020.3 0.1X -from_json(date) 2399 2407 7 0.4 2398.8 0.1X -infer error timestamps from Dataset[String] with default format 1846 1875 42 0.5 1846.1 0.1X -infer error timestamps from Dataset[String] with user-provided format 1837 1844 6 0.5 1837.3 0.1X -infer error timestamps from Dataset[String] with legacy format 5826 5831 5 0.2 5825.6 0.0X +read timestamp text from files 251 274 33 4.0 251.0 1.0X +read timestamps from files 2549 2563 16 0.4 2548.9 0.1X +infer timestamps from files 6574 6576 1 0.2 6574.1 0.0X +read date text from files 230 252 35 4.3 229.9 1.1X +read date from files 815 827 16 1.2 815.2 0.3X +timestamp strings 279 289 15 3.6 278.6 0.9X +parse timestamps from Dataset[String] 2823 2829 7 0.4 2822.6 0.1X +infer timestamps from Dataset[String] 6869 6872 4 0.1 6868.6 0.0X +date strings 352 360 12 2.8 351.8 0.7X +parse dates from Dataset[String] 1260 1269 10 0.8 1259.6 0.2X +from_json(timestamp) 4010 4041 43 0.2 4010.3 0.1X +from_json(date) 2445 2454 11 0.4 2444.7 0.1X +infer error timestamps from Dataset[String] with default format 1917 1930 12 0.5 1916.8 0.1X +infer error timestamps from Dataset[String] with user-provided format 1874 1904 27 0.5 1874.3 0.1X +infer error timestamps from Dataset[String] with legacy format 1886 1904 15 0.5 1886.5 0.1X OpenJDK 64-Bit Server VM 11.0.19+7 on Linux 5.15.0-1037-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Filters pushdown: Best Time(ms) 
Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 19584 19697 98 0.0 195844.6 1.0X -pushdown disabled 19115 19124 9 0.0 191146.2 1.0X -w/ filters 713 716 3 0.1 7131.0 27.5X +w/o filters 19379 19449 81 0.0 193788.4 1.0X +pushdown disabled 18995 19002 9 0.0 189954.9 1.0X +w/ filters 732 734 4 0.1 7320.2 26.5X diff --git a/sql/core/benchmarks/JsonBenchmark-jdk17-results.txt b/sql/core/benchmarks/JsonBenchmark-jdk17-results.txt index fd4227787e1..7c68baa7e75 100644 --- a/sql/core/benchmarks/JsonBenchmark-jdk17-results.txt +++ b/sql/core/benchmarks/JsonBenchmark-jdk17-results.txt @@ -4,120 +4,120 @@ Benchmark for performance of JSON parsing Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 3557 3590 31 1.4 711.4 1.0X -UTF-8 is set 5321 5355 47 0.9 1064.2 0.7X +No encoding 2964 3045 89 1.7 592.8 1.0X +UTF-8 is set 4365 4382 18 1.1 873.1 0.7X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 2719 2839 177 1.8 543.9 1.0X -UTF-8 is set 4601 4657 60 1.1 920.2 0.6X +No encoding 2326 2381 52 2.1 465.2 1.0X +UTF-8 is set 3834 3846 17 1.3 766.7 0.6X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 5421 5580 191 0.2 5421.1 1.0X -UTF-8 is set 7294 7355 80 0.1 7294.5 0.7X +No encoding 4599 4622 26 0.2 4599.4 1.0X +UTF-8 is set 6079 6120 62 0.2 6078.8 0.8X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 14469 14801 520 0.0 289372.5 1.0X -UTF-8 is set 16764 16901 119 0.0 335271.7 0.9X +No encoding 12217 12443 256 0.0 244340.4 1.0X +UTF-8 is set 13720 13823 113 0.0 274409.6 0.9X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns 2388 2398 11 0.4 2388.1 1.0X -Select 1 column 1837 1845 7 0.5 1837.5 1.3X +Select 10 columns 2291 2308 18 0.4 2291.5 1.0X +Select 1 column 1485 1491 8 0.7 1485.2 1.5X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Short column without encoding 780 800 25 1.3 779.8 1.0X -Short column with UTF-8 1082 1093 17 0.9 1082.4 0.7X -Wide column without encoding 8264 8328 70 0.1 8263.8 0.1X -Wide column with UTF-8 9621 9662 37 0.1 9621.0 0.1X +Short column without encoding 689 691 3 1.5 688.7 1.0X +Short column with UTF-8 973 977 3 1.0 972.8 0.7X +Wide column without encoding 7239 7283 71 0.1 7238.6 0.1X +Wide column with UTF-8 9634 9667 30 0.1 9634.3 0.1X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 118 123 5 8.5 118.2 1.0X -from_json 1872 1896 26 0.5 1871.7 0.1X -json_tuple 2112 2133 29 0.5 2111.9 0.1X -get_json_object 2022 2027 6 0.5 2022.3 0.1X +Text read 95 100 9 10.5 95.1 1.0X +from_json 1638 1646 7 0.6 1638.5 0.1X +json_tuple 1971 1996 39 0.5 1970.6 0.0X +get_json_object 1799 1809 13 0.6 1799.3 0.1X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 482 490 11 10.4 96.5 1.0X -schema inferring 2844 2855 9 1.8 568.8 0.2X -parsing 3629 3645 16 1.4 725.8 0.1X +Text read 390 393 5 12.8 78.0 1.0X +schema inferring 2396 2400 3 2.1 479.2 0.2X +parsing 2899 2908 10 1.7 579.8 0.1X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 1119 1141 20 4.5 223.7 1.0X -Schema inferring 3711 3766 61 1.3 742.1 0.3X -Parsing without charset 3895 3909 22 1.3 779.1 0.3X -Parsing with UTF-8 5625 5659 45 0.9 1124.9 0.2X +Text read 923 927 4 5.4 184.6 1.0X +Schema inferring 3256 3267 11 1.5 651.3 0.3X +Parsing without charset 3347 3355 7 1.5 669.4 0.3X +Parsing with UTF-8 4877 4882 8 1.0 975.4 0.2X OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 231 242 10 4.3 231.0 1.0X -to_json(timestamp) 1302 1326 27 0.8 1301.5 0.2X -write timestamps to files 1084 1091 7 0.9 1083.7 0.2X -Create a dataset of dates 235 243 10 4.3 234.9 1.0X -to_json(date) 983 
1006 21 1.0 983.0 0.2X -write dates to files 754 760 10 1.3 753.6 0.3X +Create a dataset of timestamps 182 194 11 5.5 181.7 1.0X +to_json(timestamp) 979 987 8 1.0 978.8 0.2X +write timestamps to files 921 923 2 1.1 921.0 0.2X +Create a dataset of dates 194 201 10 5.2 193.6 0.9X +to_json(date) 726 730 4 1.4 725.7 0.3X +write dates to files 617 623 8 1.6 616.6 0.3X OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 278 288 13 3.6 278.4 1.0X -read timestamps from files 2840 2851 18 0.4 2839.6 0.1X -infer timestamps from files 7357 7363 6 0.1 7356.7 0.0X -read date text from files 261 267 8 3.8 261.2 1.1X -read date from files 979 996 17 1.0 979.1 0.3X -timestamp strings 314 331 22 3.2 314.0 0.9X -parse timestamps from Dataset[String] 3105 3147 38 0.3 3104.6 0.1X -infer timestamps from Dataset[String] 7671 7708 33 0.1 7671.0 0.0X -date strings 428 435 6 2.3 427.8 0.7X -parse dates from Dataset[String] 1468 1493 26 0.7 1468.2 0.2X -from_json(timestamp) 4160 4191 32 0.2 4159.6 0.1X -from_json(date) 2493 2527 32 0.4 2493.4 0.1X -infer error timestamps from Dataset[String] with default format 2253 2289 32 0.4 2252.6 0.1X -infer error timestamps from Dataset[String] with user-provided format 2216 2239 36 0.5 2216.1 0.1X -infer error timestamps from Dataset[String] with legacy format 6899 6942 60 0.1 6898.9 0.0X +read timestamp text from files 248 252 6 4.0 247.7 1.0X +read timestamps from files 2436 2460 22 0.4 2436.5 0.1X +infer timestamps from files 6217 6231 14 0.2 6216.7 0.0X +read date text from files 209 215 6 4.8 209.0 1.2X +read date from files 843 849 5 1.2 843.2 0.3X +timestamp strings 263 
264 1 3.8 263.2 0.9X +parse timestamps from Dataset[String] 2616 2617 1 0.4 2616.0 0.1X +infer timestamps from Dataset[String] 6254 6256 2 0.2 6254.0 0.0X +date strings 333 339 7 3.0 333.2 0.7X +parse dates from Dataset[String] 1076 1082 6 0.9 1075.6 0.2X +from_json(timestamp) 3691 3696 6 0.3 3691.1 0.1X +from_json(date) 2112 2118 6 0.5 2112.4 0.1X +infer error timestamps from Dataset[String] with default format 1753 1758 5 0.6 1753.3 0.1X +infer error timestamps from Dataset[String] with user-provided format 1741 1741 0 0.6 1740.7 0.1X +infer error timestamps from Dataset[String] with legacy format 1709 1718 8 0.6 1708.9 0.1X OpenJDK 64-Bit Server VM 17.0.7+7 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 22235 22266 27 0.0 222352.9 1.0X -pushdown disabled 21252 21325 87 0.0 212520.9 1.0X -w/ filters 1143 1172 33 0.1 11427.2 19.5X +w/o filters 18384 18399 14 0.0 183840.1 1.0X +pushdown disabled 17599 17610 10 0.0 175990.2 1.0X +w/ filters 957 985 43 0.1 9565.9 19.2X diff --git a/sql/core/benchmarks/JsonBenchmark-results.txt b/sql/core/benchmarks/JsonBenchmark-results.txt index 4d0256fae9b..55f66f7bb24 100644 --- a/sql/core/benchmarks/JsonBenchmark-results.txt +++ b/sql/core/benchmarks/JsonBenchmark-results.txt @@ -4,120 +4,120 @@ Benchmark for performance of JSON parsing Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 3871 3914 69 1.3 774.2 1.0X -UTF-8 is set 5539 5563 26 0.9 1107.8 0.7X +No encoding 3720 3843 121 1.3 743.9 1.0X +UTF-8 is set 5412 5455 45 0.9 1082.4 0.7X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 2984 2999 24 1.7 596.9 1.0X -UTF-8 is set 4875 4928 46 1.0 975.0 0.6X +No encoding 3234 3254 33 1.5 646.7 1.0X +UTF-8 is set 4847 4868 21 1.0 969.5 0.7X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 6353 6446 143 0.2 6353.4 1.0X -UTF-8 is set 10548 10647 163 0.1 10547.8 0.6X +No encoding 5702 5794 101 0.2 5702.1 1.0X +UTF-8 is set 9526 9607 73 0.1 9526.1 0.6X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 18807 18880 66 0.0 376130.9 1.0X -UTF-8 is set 20530 20554 23 0.0 410593.2 0.9X +No encoding 18318 18448 199 0.0 366367.7 1.0X +UTF-8 is set 19791 19887 99 0.0 395817.1 0.9X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns 2741 2749 12 0.4 2740.6 1.0X -Select 1 column 1916 1925 8 0.5 1916.5 1.4X +Select 10 columns 2531 2570 51 0.4 2531.3 1.0X +Select 1 column 1867 1882 16 0.5 1867.0 1.4X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Short column without encoding 901 934 29 1.1 900.8 1.0X -Short column with UTF-8 1320 1343 31 0.8 1319.9 0.7X -Wide column without encoding 13446 13544 103 0.1 13445.8 0.1X -Wide column with UTF-8 17770 17854 76 0.1 17770.0 0.1X +Short column without encoding 868 875 7 1.2 868.4 1.0X +Short column with UTF-8 1151 1163 11 0.9 1150.9 0.8X +Wide column without encoding 12063 12299 205 0.1 12063.0 0.1X +Wide column with UTF-8 16095 16136 51 0.1 16095.3 0.1X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 159 167 9 6.3 159.2 1.0X -from_json 2844 2863 25 0.4 2844.1 0.1X -json_tuple 3137 3161 23 0.3 3136.7 0.1X -get_json_object 2874 2884 9 0.3 2874.2 0.1X +Text read 165 170 4 6.1 164.7 1.0X +from_json 2339 2386 77 0.4 2338.9 0.1X +json_tuple 2667 2730 55 0.4 2667.3 0.1X +get_json_object 2627 2659 32 0.4 2627.1 0.1X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 732 745 11 6.8 146.3 1.0X -schema inferring 3260 3265 6 1.5 652.0 0.2X -parsing 3592 3645 46 1.4 718.4 0.2X +Text read 700 715 20 7.1 140.1 1.0X +schema inferring 3144 3166 20 1.6 628.7 0.2X +parsing 3261 3271 9 1.5 652.1 0.2X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 1092 1100 11 4.6 218.4 1.0X -Schema inferring 3814 3826 15 1.3 762.8 0.3X -Parsing without charset 4153 4184 32 1.2 830.7 0.3X -Parsing with UTF-8 6014 6035 22 0.8 1202.9 0.2X +Text read 1096 1105 12 4.6 219.1 1.0X +Schema inferring 3818 3830 16 1.3 763.6 0.3X +Parsing without charset 4107 4137 32 1.2 821.4 0.3X +Parsing with UTF-8 5717 5763 41 0.9 1143.3 0.2X OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 193 198 4 5.2 193.5 1.0X -to_json(timestamp) 1566 1582 14 0.6 1566.4 0.1X -write timestamps to files 1265 1274 14 0.8 1265.1 0.2X -Create a dataset of dates 232 239 10 4.3 231.9 0.8X -to_json(date) 1037 1058 18 
1.0 1037.2 0.2X -write dates to files 766 770 7 1.3 765.6 0.3X +Create a dataset of timestamps 199 202 3 5.0 198.9 1.0X +to_json(timestamp) 1458 1487 26 0.7 1458.0 0.1X +write timestamps to files 1232 1262 26 0.8 1232.5 0.2X +Create a dataset of dates 231 237 5 4.3 230.8 0.9X +to_json(date) 956 966 10 1.0 956.5 0.2X +write dates to files 785 793 10 1.3 785.4 0.3X OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------------------------- -read timestamp text from files 283 289 6 3.5 283.1 1.0X -read timestamps from files 3364 3431 60 0.3 3363.6 0.1X -infer timestamps from files 8913 8935 38 0.1 8912.6 0.0X -read date text from files 263 267 4 3.8 262.9 1.1X -read date from files 1102 1116 12 0.9 1101.7 0.3X -timestamp strings 412 426 14 2.4 412.0 0.7X -parse timestamps from Dataset[String] 3941 3956 14 0.3 3940.8 0.1X -infer timestamps from Dataset[String] 9334 9383 43 0.1 9333.8 0.0X -date strings 469 484 24 2.1 469.3 0.6X -parse dates from Dataset[String] 1565 1572 11 0.6 1564.8 0.2X -from_json(timestamp) 5825 5917 88 0.2 5824.5 0.0X -from_json(date) 3553 3574 19 0.3 3553.1 0.1X -infer error timestamps from Dataset[String] with default format 2590 2609 19 0.4 2589.9 0.1X -infer error timestamps from Dataset[String] with user-provided format 2517 2551 30 0.4 2516.8 0.1X -infer error timestamps from Dataset[String] with legacy format 6836 6876 63 0.1 6836.1 0.0X +read timestamp text from files 294 300 6 3.4 293.8 1.0X +read timestamps from files 3254 3283 49 0.3 3254.0 0.1X +infer timestamps from files 8390 8528 165 0.1 8389.8 0.0X +read date text from files 269 276 7 3.7 269.3 1.1X +read date from files 1178 1192 13 0.8 1177.8 0.2X +timestamp 
strings 406 418 15 2.5 406.2 0.7X +parse timestamps from Dataset[String] 3700 3713 16 0.3 3699.5 0.1X +infer timestamps from Dataset[String] 8604 8647 65 0.1 8604.0 0.0X +date strings 464 479 14 2.2 463.7 0.6X +parse dates from Dataset[String] 1528 1538 10 0.7 1527.7 0.2X +from_json(timestamp) 5402 5429 26 0.2 5401.8 0.1X +from_json(date) 2948 2966 17 0.3 2947.5 0.1X +infer error timestamps from Dataset[String] with default format 2358 2434 70 0.4 2357.6 0.1X +infer error timestamps from Dataset[String] with user-provided format 2363 2390 36 0.4 2362.9 0.1X +infer error timestamps from Dataset[String] with legacy format 2248 2287 35 0.4 2248.3 0.1X OpenJDK 64-Bit Server VM 1.8.0_362-b09 on Linux 5.15.0-1037-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 25753 25770 28 0.0 257534.1 1.0X -pushdown disabled 24549 24770 205 0.0 245490.3 1.0X -w/ filters 908 919 15 0.1 9081.2 28.4X +w/o filters 22544 22661 109 0.0 225436.4 1.0X +pushdown disabled 21045 21213 188 0.0 210452.6 1.1X +w/ filters 893 904 10 0.1 8931.8 25.2X --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org