This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 6fe52ad [SPARK-31414][SQL] Fix performance regression with new
TimestampFormatter for json and csv time parsing
6fe52ad is described below
commit 6fe52ad0a7eafe4293338a075ee25917127d4497
Author: Kent Yao <[email protected]>
AuthorDate: Mon Apr 13 03:11:28 2020 +0000
[SPARK-31414][SQL] Fix performance regression with new TimestampFormatter
for json and csv time parsing
With the original benchmark, where the timestamp values are valid for the
new parser, the result is:
```scala
[info] Running benchmark: Read dates and timestamps
[info] Running case: timestamp strings
[info] Stopped after 3 iterations, 5781 ms
[info] Running case: parse timestamps from Dataset[String]
[info] Stopped after 3 iterations, 44764 ms
[info] Running case: infer timestamps from Dataset[String]
[info] Stopped after 3 iterations, 93764 ms
[info] Running case: from_json(timestamp)
[info] Stopped after 3 iterations, 59021 ms
```
When we modify the benchmark to
```scala
def timestampStr: Dataset[String] = {
spark.range(0, rowsNum, 1, 1).mapPartitions { iter =>
iter.map(i => s"""{"timestamp":"1970-01-01T01:02:03.${i %
100}"}""")
}.select($"value".as("timestamp")).as[String]
}
readBench.addCase("timestamp strings", numIters) { _ =>
timestampStr.noop()
}
readBench.addCase("parse timestamps from Dataset[String]", numIters)
{ _ =>
spark.read.schema(tsSchema).json(timestampStr).noop()
}
readBench.addCase("infer timestamps from Dataset[String]", numIters)
{ _ =>
spark.read.json(timestampStr).noop()
}
```
where the timestamp values are invalid for the new parser, which causes a
fallback to the legacy parser (2.4), the result is:
```scala
[info] Running benchmark: Read dates and timestamps
[info] Running case: timestamp strings
[info] Stopped after 3 iterations, 5623 ms
[info] Running case: parse timestamps from Dataset[String]
[info] Stopped after 3 iterations, 506637 ms
[info] Running case: infer timestamps from Dataset[String]
[info] Stopped after 3 iterations, 509076 ms
```
That is about a 10x performance regression.
But if we change the timestamp pattern to `....HH:mm:ss[.SSS][XXX]`, which
makes all timestamp values valid for the new parser and so avoids the
fallback, the result is:
```scala
[info] Running benchmark: Read dates and timestamps
[info] Running case: timestamp strings
[info] Stopped after 3 iterations, 5623 ms
[info] Running case: parse timestamps from Dataset[String]
[info] Stopped after 3 iterations, 506637 ms
[info] Running case: infer timestamps from Dataset[String]
[info] Stopped after 3 iterations, 509076 ms
```
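Why are the changes needed: to fix the performance regression.
Does this introduce any user-facing change: no.
How was this tested: new tests added.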
Closes #28181 from yaooqinn/SPARK-31414.
Authored-by: Kent Yao <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
(cherry picked from commit d65f534c5ad4385b7c5198f15cb014e1d24e47c9)
Signed-off-by: Wenchen Fan <[email protected]>
---
.../apache/spark/sql/catalyst/csv/CSVOptions.scala | 9 +-
.../spark/sql/catalyst/json/JSONOptions.scala | 9 +-
sql/core/benchmarks/CSVBenchmark-jdk11-results.txt | 88 +++++++-------
sql/core/benchmarks/CSVBenchmark-results.txt | 88 +++++++-------
.../benchmarks/JsonBenchmark-jdk11-results.txt | 130 ++++++++++-----------
sql/core/benchmarks/JsonBenchmark-results.txt | 130 ++++++++++-----------
.../org/apache/spark/sql/CsvFunctionsSuite.scala | 12 ++
.../org/apache/spark/sql/JsonFunctionsSuite.scala | 12 ++
.../execution/datasources/csv/CSVBenchmark.scala | 4 +-
.../execution/datasources/json/JsonBenchmark.scala | 4 +-
10 files changed, 262 insertions(+), 224 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
index 8892037..9d09cab 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
@@ -26,6 +26,7 @@ import com.univocity.parsers.csv.{CsvParserSettings,
CsvWriterSettings, Unescape
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
class CSVOptions(
@transient val parameters: CaseInsensitiveMap[String],
@@ -148,8 +149,12 @@ class CSVOptions(
val dateFormat: String = parameters.getOrElse("dateFormat",
DateFormatter.defaultPattern)
- val timestampFormat: String =
- parameters.getOrElse("timestampFormat",
s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX")
+ val timestampFormat: String = parameters.getOrElse("timestampFormat",
+ if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) {
+ s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX"
+ } else {
+ s"${DateFormatter.defaultPattern}'T'HH:mm:ss[.SSS][XXX]"
+ })
val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
index 45c4edf..f9222f5 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
@@ -27,6 +27,7 @@ import com.fasterxml.jackson.core.json.JsonReadFeature
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
/**
* Options for parsing JSON data into Spark SQL rows.
@@ -90,8 +91,12 @@ private[sql] class JSONOptions(
val dateFormat: String = parameters.getOrElse("dateFormat",
DateFormatter.defaultPattern)
- val timestampFormat: String =
- parameters.getOrElse("timestampFormat",
s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX")
+ val timestampFormat: String = parameters.getOrElse("timestampFormat",
+ if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) {
+ s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX"
+ } else {
+ s"${DateFormatter.defaultPattern}'T'HH:mm:ss[.SSS][XXX]"
+ })
val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false)
diff --git a/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt
b/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt
index d8071e7..147a77f 100644
--- a/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt
@@ -2,66 +2,66 @@
Benchmark to measure CSV read/write performance
================================================================================================
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Parsing quoted values: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-One quoted string 44297 44515
373 0.0 885948.7 1.0X
+One quoted string 24907 29374
NaN 0.0 498130.5 1.0X
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Wide rows with 1000 columns: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Select 1000 columns 196720 197783
1560 0.0 196719.8 1.0X
-Select 100 columns 46691 46861
219 0.0 46691.4 4.2X
-Select one column 36811 36922
111 0.0 36811.3 5.3X
-count() 8520 8610
106 0.1 8520.5 23.1X
-Select 100 columns, one bad input field 67914 67994
136 0.0 67914.0 2.9X
-Select 100 columns, corrupt record field 77272 77445
214 0.0 77272.0 2.5X
+Select 1000 columns 62811 63690
1416 0.0 62811.4 1.0X
+Select 100 columns 23839 24064
230 0.0 23839.5 2.6X
+Select one column 19936 20641
827 0.1 19936.4 3.2X
+count() 4174 4380
206 0.2 4174.4 15.0X
+Select 100 columns, one bad input field 41015 42380
1688 0.0 41015.4 1.5X
+Select 100 columns, corrupt record field 46281 46338
93 0.0 46280.5 1.4X
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Count a dataset with 10 columns: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Select 10 columns + count() 25965 26054
103 0.4 2596.5 1.0X
-Select 1 column + count() 18591 18666
91 0.5 1859.1 1.4X
-count() 6102 6119
18 1.6 610.2 4.3X
+Select 10 columns + count() 10810 10997
163 0.9 1081.0 1.0X
+Select 1 column + count() 7608 7641
47 1.3 760.8 1.4X
+count() 2415 2462
77 4.1 241.5 4.5X
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Write dates and timestamps: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Create a dataset of timestamps 2142 2161
17 4.7 214.2 1.0X
-to_csv(timestamp) 14744 14950
182 0.7 1474.4 0.1X
-write timestamps to files 12078 12202
175 0.8 1207.8 0.2X
-Create a dataset of dates 2275 2291
18 4.4 227.5 0.9X
-to_csv(date) 11407 11464
51 0.9 1140.7 0.2X
-write dates to files 7638 7702
90 1.3 763.8 0.3X
+Create a dataset of timestamps 874 914
37 11.4 87.4 1.0X
+to_csv(timestamp) 7051 7223
250 1.4 705.1 0.1X
+write timestamps to files 6712 6741
31 1.5 671.2 0.1X
+Create a dataset of dates 909 945
35 11.0 90.9 1.0X
+to_csv(date) 4222 4231
8 2.4 422.2 0.2X
+write dates to files 3799 3813
14 2.6 379.9 0.2X
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Read dates and timestamps: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-read timestamp text from files 2578 2590
10 3.9 257.8 1.0X
-read timestamps from files 60103 60694
512 0.2 6010.3 0.0X
-infer timestamps from files 107871 108268
351 0.1 10787.1 0.0X
-read date text from files 2306 2310
4 4.3 230.6 1.1X
-read date from files 47415 47657
367 0.2 4741.5 0.1X
-infer date from files 35261 35447
164 0.3 3526.1 0.1X
-timestamp strings 3045 3056
11 3.3 304.5 0.8X
-parse timestamps from Dataset[String] 62221 63173
849 0.2 6222.1 0.0X
-infer timestamps from Dataset[String] 118838 119629
697 0.1 11883.8 0.0X
-date strings 3459 3481
19 2.9 345.9 0.7X
-parse dates from Dataset[String] 51026 51447
503 0.2 5102.6 0.1X
-from_csv(timestamp) 60738 61818
936 0.2 6073.8 0.0X
-from_csv(date) 46012 46278
370 0.2 4601.2 0.1X
+read timestamp text from files 1342 1364
35 7.5 134.2 1.0X
+read timestamps from files 20300 20473
247 0.5 2030.0 0.1X
+infer timestamps from files 40705 40744
54 0.2 4070.5 0.0X
+read date text from files 1146 1151
6 8.7 114.6 1.2X
+read date from files 12278 12408
117 0.8 1227.8 0.1X
+infer date from files 12734 12872
220 0.8 1273.4 0.1X
+timestamp strings 1467 1482
15 6.8 146.7 0.9X
+parse timestamps from Dataset[String] 21708 22234
477 0.5 2170.8 0.1X
+infer timestamps from Dataset[String] 42357 43253
922 0.2 4235.7 0.0X
+date strings 1512 1532
18 6.6 151.2 0.9X
+parse dates from Dataset[String] 13436 13470
33 0.7 1343.6 0.1X
+from_csv(timestamp) 20390 20486
95 0.5 2039.0 0.1X
+from_csv(date) 12592 12693
139 0.8 1259.2 0.1X
-OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.2
-Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Filters pushdown: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-w/o filters 11889 11945
52 0.0 118893.1 1.0X
-pushdown disabled 11790 11860
115 0.0 117902.3 1.0X
-w/ filters 1240 1278
33 0.1 12400.8 9.6X
+w/o filters 12535 12606
67 0.0 125348.8 1.0X
+pushdown disabled 12611 12672
91 0.0 126112.9 1.0X
+w/ filters 1093 1099
11 0.1 10928.3 11.5X
diff --git a/sql/core/benchmarks/CSVBenchmark-results.txt
b/sql/core/benchmarks/CSVBenchmark-results.txt
index b3ba69c..498ca4c 100644
--- a/sql/core/benchmarks/CSVBenchmark-results.txt
+++ b/sql/core/benchmarks/CSVBenchmark-results.txt
@@ -2,66 +2,66 @@
Benchmark to measure CSV read/write performance
================================================================================================
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Parsing quoted values: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-One quoted string 51602 51659
59 0.0 1032039.4 1.0X
+One quoted string 24073 24109
33 0.0 481463.5 1.0X
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Wide rows with 1000 columns: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Select 1000 columns 191926 192879
1615 0.0 191925.6 1.0X
-Select 100 columns 46766 46846
69 0.0 46766.1 4.1X
-Select one column 35877 35930
83 0.0 35876.8 5.3X
-count() 11186 11262
65 0.1 11186.0 17.2X
-Select 100 columns, one bad input field 59943 60107
232 0.0 59943.0 3.2X
-Select 100 columns, corrupt record field 73062 73406
479 0.0 73062.2 2.6X
+Select 1000 columns 58415 59611
2071 0.0 58414.8 1.0X
+Select 100 columns 22568 23020
594 0.0 22568.0 2.6X
+Select one column 18995 19058
99 0.1 18995.0 3.1X
+count() 5301 5332
30 0.2 5300.9 11.0X
+Select 100 columns, one bad input field 39736 40153
361 0.0 39736.1 1.5X
+Select 100 columns, corrupt record field 47195 47826
590 0.0 47195.2 1.2X
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Count a dataset with 10 columns: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Select 10 columns + count() 22389 22447
87 0.4 2238.9 1.0X
-Select 1 column + count() 14844 14890
43 0.7 1484.4 1.5X
-count() 5519 5538
18 1.8 551.9 4.1X
+Select 10 columns + count() 9884 9904
25 1.0 988.4 1.0X
+Select 1 column + count() 6794 6835
46 1.5 679.4 1.5X
+count() 2060 2065
5 4.9 206.0 4.8X
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Write dates and timestamps: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Create a dataset of timestamps 1949 1977
25 5.1 194.9 1.0X
-to_csv(timestamp) 14944 15702
714 0.7 1494.4 0.1X
-write timestamps to files 12983 12998
14 0.8 1298.3 0.2X
-Create a dataset of dates 2156 2164
7 4.6 215.6 0.9X
-to_csv(date) 9675 9709
41 1.0 967.5 0.2X
-write dates to files 7880 7897
15 1.3 788.0 0.2X
+Create a dataset of timestamps 717 732
18 14.0 71.7 1.0X
+to_csv(timestamp) 6994 7100
121 1.4 699.4 0.1X
+write timestamps to files 6417 6435
27 1.6 641.7 0.1X
+Create a dataset of dates 827 855
24 12.1 82.7 0.9X
+to_csv(date) 4408 4438
32 2.3 440.8 0.2X
+write dates to files 3738 3758
28 2.7 373.8 0.2X
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Read dates and timestamps: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-read timestamp text from files 2235 2245
10 4.5 223.5 1.0X
-read timestamps from files 54490 54690
283 0.2 5449.0 0.0X
-infer timestamps from files 104501 104737
236 0.1 10450.1 0.0X
-read date text from files 2035 2040
6 4.9 203.5 1.1X
-read date from files 39650 39707
52 0.3 3965.0 0.1X
-infer date from files 29235 29363
164 0.3 2923.5 0.1X
-timestamp strings 3412 3426
18 2.9 341.2 0.7X
-parse timestamps from Dataset[String] 66864 67804
981 0.1 6686.4 0.0X
-infer timestamps from Dataset[String] 118780 119284
837 0.1 11878.0 0.0X
-date strings 3730 3734
4 2.7 373.0 0.6X
-parse dates from Dataset[String] 48728 49071
309 0.2 4872.8 0.0X
-from_csv(timestamp) 62294 62493
260 0.2 6229.4 0.0X
-from_csv(date) 44581 44665
117 0.2 4458.1 0.1X
+read timestamp text from files 1121 1176
52 8.9 112.1 1.0X
+read timestamps from files 21298 21366
105 0.5 2129.8 0.1X
+infer timestamps from files 41008 41051
39 0.2 4100.8 0.0X
+read date text from files 962 967
5 10.4 96.2 1.2X
+read date from files 11749 11772
22 0.9 1174.9 0.1X
+infer date from files 12426 12459
29 0.8 1242.6 0.1X
+timestamp strings 1508 1519
9 6.6 150.8 0.7X
+parse timestamps from Dataset[String] 21674 21997
455 0.5 2167.4 0.1X
+infer timestamps from Dataset[String] 42141 42230
105 0.2 4214.1 0.0X
+date strings 1694 1701
8 5.9 169.4 0.7X
+parse dates from Dataset[String] 12929 12951
25 0.8 1292.9 0.1X
+from_csv(timestamp) 20603 20786
166 0.5 2060.3 0.1X
+from_csv(date) 12325 12338
12 0.8 1232.5 0.1X
-Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.2
-Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Filters pushdown: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-w/o filters 12557 12634
78 0.0 125572.9 1.0X
-pushdown disabled 12449 12509
65 0.0 124486.4 1.0X
-w/ filters 1372 1393
18 0.1 13724.8 9.1X
+w/o filters 12455 12474
22 0.0 124553.8 1.0X
+pushdown disabled 12462 12486
29 0.0 124624.9 1.0X
+w/ filters 1073 1092
18 0.1 10727.6 11.6X
diff --git a/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt
b/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt
index 920e0a7..03bc334 100644
--- a/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt
@@ -3,110 +3,110 @@ Benchmark for performance of JSON parsing
================================================================================================
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
JSON schema inferring: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 84774 84927
264 1.2 847.7 1.0X
-UTF-8 is set 119081 120155
1773 0.8 1190.8 0.7X
+No encoding 46010 46118
113 2.2 460.1 1.0X
+UTF-8 is set 54407 55427
1718 1.8 544.1 0.8X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
count a short column: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 49293 49356
70 2.0 492.9 1.0X
-UTF-8 is set 80183 80211
25 1.2 801.8 0.6X
+No encoding 26614 28220
1461 3.8 266.1 1.0X
+UTF-8 is set 42765 43400
550 2.3 427.6 0.6X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
count a wide column: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 61070 61476
536 0.2 6107.0 1.0X
-UTF-8 is set 109765 109881
102 0.1 10976.5 0.6X
+No encoding 35696 35821
113 0.3 3569.6 1.0X
+UTF-8 is set 55441 56176
1037 0.2 5544.1 0.6X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
select wide row: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 176999 178163
1008 0.0 353997.9 1.0X
-UTF-8 is set 201209 201641
614 0.0 402419.0 0.9X
+No encoding 61514 62968
NaN 0.0 123027.2 1.0X
+UTF-8 is set 72096 72933
1162 0.0 144192.7 0.9X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Select a subset of 10 columns: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Select 10 columns 18768 20587
496 0.5 1876.8 1.0X
-Select 1 column 22642 22644
3 0.4 2264.2 0.8X
+Select 10 columns 9859 9913
79 1.0 985.9 1.0X
+Select 1 column 10981 11003
36 0.9 1098.1 0.9X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
creation of JSON parser per line: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Short column without encoding 7697 7738
55 1.3 769.7 1.0X
-Short column with UTF-8 14051 14189
176 0.7 1405.1 0.5X
-Wide column without encoding 108999 110075
1085 0.1 10899.9 0.1X
-Wide column with UTF-8 157433 157779
308 0.1 15743.3 0.0X
+Short column without encoding 3555 3579
27 2.8 355.5 1.0X
+Short column with UTF-8 5204 5227
35 1.9 520.4 0.7X
+Wide column without encoding 60458 60637
164 0.2 6045.8 0.1X
+Wide column with UTF-8 77544 78111
551 0.1 7754.4 0.0X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
JSON functions: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Text read 644 647
4 15.5 64.4 1.0X
-from_json 25859 25872
12 0.4 2585.9 0.0X
-json_tuple 31679 31761
71 0.3 3167.9 0.0X
-get_json_object 24772 25220
389 0.4 2477.2 0.0X
+Text read 342 346
3 29.2 34.2 1.0X
+from_json 7123 7318
179 1.4 712.3 0.0X
+json_tuple 9843 9957
132 1.0 984.3 0.0X
+get_json_object 7827 8046
194 1.3 782.7 0.0X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Dataset of json strings: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Text read 3135 3165
52 15.9 62.7 1.0X
-schema inferring 29383 29389
10 1.7 587.7 0.1X
-parsing 32623 35183
NaN 1.5 652.5 0.1X
+Text read 1856 1884
32 26.9 37.1 1.0X
+schema inferring 16734 16900
153 3.0 334.7 0.1X
+parsing 14884 15203
470 3.4 297.7 0.1X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Json files in the per-line mode: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Text read 11874 11948
82 4.2 237.5 1.0X
-Schema inferring 42382 42398
23 1.2 847.6 0.3X
-Parsing without charset 36410 36442
54 1.4 728.2 0.3X
-Parsing with UTF-8 62412 62463
48 0.8 1248.2 0.2X
+Text read 5932 6148
228 8.4 118.6 1.0X
+Schema inferring 20836 21938
1086 2.4 416.7 0.3X
+Parsing without charset 18134 18661
457 2.8 362.7 0.3X
+Parsing with UTF-8 27734 28069
378 1.8 554.7 0.2X
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Write dates and timestamps: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Create a dataset of timestamps 2191 2209
20 4.6 219.1 1.0X
-to_json(timestamp) 18670 19042
565 0.5 1867.0 0.1X
-write timestamps to files 11836 13156
NaN 0.8 1183.6 0.2X
-Create a dataset of dates 2321 2351
33 4.3 232.1 0.9X
-to_json(date) 12703 12726
24 0.8 1270.3 0.2X
-write dates to files 8230 8303
76 1.2 823.0 0.3X
+Create a dataset of timestamps 889 914
28 11.2 88.9 1.0X
+to_json(timestamp) 7920 8172
353 1.3 792.0 0.1X
+write timestamps to files 6726 6822
129 1.5 672.6 0.1X
+Create a dataset of dates 953 963
12 10.5 95.3 0.9X
+to_json(date) 5370 5705
320 1.9 537.0 0.2X
+write dates to files 4109 4166
52 2.4 410.9 0.2X
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Read dates and timestamps: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-read timestamp text from files 2780 2795
13 3.6 278.0 1.0X
-read timestamps from files 37158 37305
137 0.3 3715.8 0.1X
-infer timestamps from files 73666 73838
149 0.1 7366.6 0.0X
-read date text from files 2597 2609
10 3.9 259.7 1.1X
-read date from files 24439 24501
56 0.4 2443.9 0.1X
-timestamp strings 3052 3064
12 3.3 305.2 0.9X
-parse timestamps from Dataset[String] 43611 43665
52 0.2 4361.1 0.1X
-infer timestamps from Dataset[String] 83745 84153
376 0.1 8374.5 0.0X
-date strings 4068 4076
10 2.5 406.8 0.7X
-parse dates from Dataset[String] 34700 34807
118 0.3 3470.0 0.1X
-from_json(timestamp) 64074 64124
53 0.2 6407.4 0.0X
-from_json(date) 52520 52617
101 0.2 5252.0 0.1X
+read timestamp text from files 1614 1675
55 6.2 161.4 1.0X
+read timestamps from files 16640 16858
209 0.6 1664.0 0.1X
+infer timestamps from files 33239 33388
227 0.3 3323.9 0.0X
+read date text from files 1310 1340
44 7.6 131.0 1.2X
+read date from files 9470 9513
41 1.1 947.0 0.2X
+timestamp strings 1303 1342
47 7.7 130.3 1.2X
+parse timestamps from Dataset[String] 17650 18073
380 0.6 1765.0 0.1X
+infer timestamps from Dataset[String] 32623 34065
1330 0.3 3262.3 0.0X
+date strings 1864 1871
7 5.4 186.4 0.9X
+parse dates from Dataset[String] 10914 11316
482 0.9 1091.4 0.1X
+from_json(timestamp) 21102 21990
929 0.5 2110.2 0.1X
+from_json(date) 15275 15961
598 0.7 1527.5 0.1X
diff --git a/sql/core/benchmarks/JsonBenchmark-results.txt
b/sql/core/benchmarks/JsonBenchmark-results.txt
index e435f57..0f188c4 100644
--- a/sql/core/benchmarks/JsonBenchmark-results.txt
+++ b/sql/core/benchmarks/JsonBenchmark-results.txt
@@ -3,110 +3,110 @@ Benchmark for performance of JSON parsing
================================================================================================
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
JSON schema inferring: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 61888 61918
27 1.6 618.9 1.0X
-UTF-8 is set 109057 113663
NaN 0.9 1090.6 0.6X
+No encoding 38998 41002
NaN 2.6 390.0 1.0X
+UTF-8 is set 61231 63282
1854 1.6 612.3 0.6X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
count a short column: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 44517 44535
29 2.2 445.2 1.0X
-UTF-8 is set 75722 75840
111 1.3 757.2 0.6X
+No encoding 28272 28338
70 3.5 282.7 1.0X
+UTF-8 is set 58681 62243
1517 1.7 586.8 0.5X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
count a wide column: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 63677 64090
633 0.2 6367.7 1.0X
-UTF-8 is set 99424 99615
185 0.1 9942.4 0.6X
+No encoding 44026 51829
1329 0.2 4402.6 1.0X
+UTF-8 is set 65839 68596
500 0.2 6583.9 0.7X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
select wide row: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 174052 174251
174 0.0 348104.1 1.0X
-UTF-8 is set 189000 189098
113 0.0 378000.9 0.9X
+No encoding 72144 74820
NaN 0.0 144287.6 1.0X
+UTF-8 is set 69571 77888
NaN 0.0 139142.3 1.0X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Select a subset of 10 columns: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Select 10 columns 18387 18473
142 0.5 1838.7 1.0X
-Select 1 column 25560 25571
13 0.4 2556.0 0.7X
+Select 10 columns 9502 9604
106 1.1 950.2 1.0X
+Select 1 column 11861 11948
109 0.8 1186.1 0.8X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
creation of JSON parser per line: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Short column without encoding 9323 9384
58 1.1 932.3 1.0X
-Short column with UTF-8 14016 14058
55 0.7 1401.6 0.7X
-Wide column without encoding 133258 133532
382 0.1 13325.8 0.1X
-Wide column with UTF-8 181212 181283
61 0.1 18121.2 0.1X
+Short column without encoding 3830 3846
15 2.6 383.0 1.0X
+Short column with UTF-8 5538 5543
7 1.8 553.8 0.7X
+Wide column without encoding 66899 69158
NaN 0.1 6689.9 0.1X
+Wide column with UTF-8 90052 93235
NaN 0.1 9005.2 0.0X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
JSON functions: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Text read 1168 1174
5 8.6 116.8 1.0X
-from_json 22604 23571
883 0.4 2260.4 0.1X
-json_tuple 29979 30053
91 0.3 2997.9 0.0X
-get_json_object 21987 22263
241 0.5 2198.7 0.1X
+Text read 659 674
13 15.2 65.9 1.0X
+from_json 7676 7943
405 1.3 767.6 0.1X
+json_tuple 9881 10172
273 1.0 988.1 0.1X
+get_json_object 7949 8055
119 1.3 794.9 0.1X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Dataset of json strings: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Text read 5831 5842
14 8.6 116.6 1.0X
-schema inferring 31372 31456
73 1.6 627.4 0.2X
-parsing 35911 36191
254 1.4 718.2 0.2X
+Text read 3314 3326
17 15.1 66.3 1.0X
+schema inferring 16549 17037
484 3.0 331.0 0.2X
+parsing 15138 15283
172 3.3 302.8 0.2X
Preparing data for benchmarking ...
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Json files in the per-line mode: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Text read 10249 10314
77 4.9 205.0 1.0X
-Schema inferring 35403 35436
40 1.4 708.1 0.3X
-Parsing without charset 32875 32879
4 1.5 657.5 0.3X
-Parsing with UTF-8 53444 53519
100 0.9 1068.9 0.2X
+Text read 5136 5446
268 9.7 102.7 1.0X
+Schema inferring 19864 20568
1191 2.5 397.3 0.3X
+Parsing without charset 17535 17888
329 2.9 350.7 0.3X
+Parsing with UTF-8 25609 25758
218 2.0 512.2 0.2X
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Write dates and timestamps: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Create a dataset of timestamps 1909 1924
17 5.2 190.9 1.0X
-to_json(timestamp) 18956 19122
208 0.5 1895.6 0.1X
-write timestamps to files 13446 13472
43 0.7 1344.6 0.1X
-Create a dataset of dates 2180 2200
28 4.6 218.0 0.9X
-to_json(date) 12780 12899
109 0.8 1278.0 0.1X
-write dates to files 7835 7865
29 1.3 783.5 0.2X
+Create a dataset of timestamps 784 790
7 12.8 78.4 1.0X
+to_json(timestamp) 8005 8055
50 1.2 800.5 0.1X
+write timestamps to files 6515 6559
45 1.5 651.5 0.1X
+Create a dataset of dates 854 881
24 11.7 85.4 0.9X
+to_json(date) 5187 5194
7 1.9 518.7 0.2X
+write dates to files 3663 3684
22 2.7 366.3 0.2X
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux
4.15.0-1044-aws
-Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
Read dates and timestamps: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-read timestamp text from files 2467 2477
9 4.1 246.7 1.0X
-read timestamps from files 40186 40342
135 0.2 4018.6 0.1X
-infer timestamps from files 82005 82079
71 0.1 8200.5 0.0X
-read date text from files 2243 2264
22 4.5 224.3 1.1X
-read date from files 24852 24863
19 0.4 2485.2 0.1X
-timestamp strings 3836 3854
16 2.6 383.6 0.6X
-parse timestamps from Dataset[String] 51521 51697
242 0.2 5152.1 0.0X
-infer timestamps from Dataset[String] 97300 97398
133 0.1 9730.0 0.0X
-date strings 4488 4491
5 2.2 448.8 0.5X
-parse dates from Dataset[String] 37918 37976
68 0.3 3791.8 0.1X
-from_json(timestamp) 69611 69632
36 0.1 6961.1 0.0X
-from_json(date) 56598 56974
347 0.2 5659.8 0.0X
+read timestamp text from files 1297 1316
26 7.7 129.7 1.0X
+read timestamps from files 16915 17723
963 0.6 1691.5 0.1X
+infer timestamps from files 33967 34304
360 0.3 3396.7 0.0X
+read date text from files 1095 1100
7 9.1 109.5 1.2X
+read date from files 8376 8513
209 1.2 837.6 0.2X
+timestamp strings 1807 1816
8 5.5 180.7 0.7X
+parse timestamps from Dataset[String] 18189 18242
74 0.5 1818.9 0.1X
+infer timestamps from Dataset[String] 37906 38547
571 0.3 3790.6 0.0X
+date strings 2191 2194
4 4.6 219.1 0.6X
+parse dates from Dataset[String] 11593 11625
33 0.9 1159.3 0.1X
+from_json(timestamp) 22589 22650
101 0.4 2258.9 0.1X
+from_json(date) 16479 16619
159 0.6 1647.9 0.1X
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
index 89fb4d5..b9e0d50 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
@@ -212,4 +212,16 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
assert(readback(0).getAs[Row](0).getAs[Date](0).getTime >= 0)
}
}
+
+ test("optional datetime parser does not affect csv time formatting") {
+ val s = "2015-08-26 12:34:46"
+ def toDF(p: String): DataFrame = sql(
+ s"""
+ |SELECT
+ | to_csv(
+ | named_struct('time', timestamp'$s'), map('timestampFormat', "$p")
+ | )
+ | """.stripMargin)
+ checkAnswer(toDF("yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), toDF("yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]"))
+ }
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala
index 8cc5c22..b989b5d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala
@@ -688,4 +688,16 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession {
options.asJava)),
Seq(Row("string")))
}
+
+ test("optional datetime parser does not affect json time formatting") {
+ val s = "2015-08-26 12:34:46"
+ def toDF(p: String): DataFrame = sql(
+ s"""
+ |SELECT
+ | to_json(
+ | named_struct('time', timestamp'$s'), map('timestampFormat', "$p")
+ | )
+ | """.stripMargin)
+ checkAnswer(toDF("yyyy-MM-dd'T'HH:mm:ss.SSSXXX"),
toDF("yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]"))
+ }
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala
index e2abb39..53d287b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala
@@ -238,7 +238,9 @@ object CSVBenchmark extends SqlBasedBenchmark {
def timestampStr: Dataset[String] = {
spark.range(0, rowsNum, 1, 1).mapPartitions { iter =>
- iter.map(i => s"1970-01-01T01:02:03.${100 + i % 100}Z")
+ iter.map {
+ i => s"1970-01-01T01:02:03.${i % 200}Z".stripSuffix(".0Z")
+ }
}.select($"value".as("timestamp")).as[String]
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala
index bcecacc..5693088 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala
@@ -445,7 +445,9 @@ object JsonBenchmark extends SqlBasedBenchmark {
def timestampStr: Dataset[String] = {
spark.range(0, rowsNum, 1, 1).mapPartitions { iter =>
- iter.map(i => s"""{"timestamp":"1970-01-01T01:02:03.${100 + i % 100}Z"}""")
+ iter.map { i =>
+ s"""{"timestamp":"1970-01-01T01:02:03.${i % 200}Z"}""".stripSuffix(".0Z")
+ }
}.select($"value".as("timestamp")).as[String]
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]