This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new bdbfe6e [SPARK-32130][SQL] Disable the JSON option `inferTimestamp`
by default
bdbfe6e is described below
commit bdbfe6ec15667835000d7e95b0c437b6ab4b251e
Author: Max Gekk <[email protected]>
AuthorDate: Wed Jul 1 15:45:39 2020 -0700
[SPARK-32130][SQL] Disable the JSON option `inferTimestamp` by default
Set the JSON option `inferTimestamp` to `false` if a user doesn't pass it as a
datasource option.
To prevent a performance regression while inferring schemas from JSON with
potential timestamp fields.
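Yes, this is a user-facing change: timestamp type inference from JSON strings
is now disabled unless the option `inferTimestamp` is set to `true`.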
- Modified existing tests in `JsonSuite` and `JsonInferSchemaSuite`.
- Regenerated results of `JsonBenchmark` in the environment:
| Item | Description |
| ---- | ----|
| Region | us-west-2 (Oregon) |
| Instance | r3.xlarge |
| AMI | ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190722.1
(ami-06f2f779464715dc5) |
| Java | OpenJDK 64-Bit Server VM 1.8.0_252 and OpenJDK 64-Bit Server VM
11.0.7+10 |
Closes #28966 from MaxGekk/json-inferTimestamps-disable-by-default.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit bcf23307f4fd70590ea10e5e9edb6e9de1f76125)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
docs/sql-migration-guide.md | 4 +
.../spark/sql/catalyst/json/JSONOptions.scala | 2 +-
.../sql/catalyst/json/JsonInferSchemaSuite.scala | 56 ++++++++------
.../benchmarks/JsonBenchmark-jdk11-results.txt | 86 +++++++++++-----------
sql/core/benchmarks/JsonBenchmark-results.txt | 86 +++++++++++-----------
.../sql/execution/datasources/json/JsonSuite.scala | 6 +-
6 files changed, 129 insertions(+), 111 deletions(-)
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index b7b01d0..fb3fe09 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -22,6 +22,10 @@ license: |
* Table of contents
{:toc}
+## Upgrading from Spark SQL 3.0 to 3.0.1
+
+- In Spark 3.0, JSON datasource and JSON function `schema_of_json` infer
TimestampType from string values if they match to the pattern defined by the
JSON option `timestampFormat`. Since version 3.0.1, the timestamp type
inference is disabled by default. Set the JSON option `inferTimestamp` to
`true` to enable such type inference.
+
## Upgrading from Spark SQL 2.4 to 3.0
### Dataset/DataFrame APIs
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
index f9222f5..70a673b 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
@@ -133,7 +133,7 @@ private[sql] class JSONOptions(
* Enables inferring of TimestampType from strings matched to the timestamp
pattern
* defined by the timestampFormat option.
*/
- val inferTimestamp: Boolean =
parameters.get("inferTimestamp").map(_.toBoolean).getOrElse(true)
+ val inferTimestamp: Boolean =
parameters.get("inferTimestamp").map(_.toBoolean).getOrElse(false)
/** Build a Jackson [[JsonFactory]] using JSON options. */
def buildJsonFactory(): JsonFactory = {
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
index bce917c..8290b38 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
@@ -35,22 +35,29 @@ class JsonInferSchemaSuite extends SparkFunSuite with
SQLHelper {
assert(inferSchema.inferField(parser) === expectedType)
}
- def checkTimestampType(pattern: String, json: String): Unit = {
- checkType(Map("timestampFormat" -> pattern), json, TimestampType)
+ def checkTimestampType(pattern: String, json: String, inferTimestamp:
Boolean): Unit = {
+ checkType(
+ Map("timestampFormat" -> pattern, "inferTimestamp" ->
inferTimestamp.toString),
+ json,
+ if (inferTimestamp) TimestampType else StringType)
}
test("inferring timestamp type") {
- Seq("legacy", "corrected").foreach { legacyParserPolicy =>
- withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy)
{
- checkTimestampType("yyyy", """{"a": "2018"}""")
- checkTimestampType("yyyy=MM", """{"a": "2018=12"}""")
- checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""")
- checkTimestampType(
- "yyyy-MM-dd'T'HH:mm:ss.SSS",
- """{"a": "2018-12-02T21:04:00.123"}""")
- checkTimestampType(
- "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
- """{"a": "2018-12-02T21:04:00.123567+01:00"}""")
+ Seq(true, false).foreach { inferTimestamp =>
+ Seq("legacy", "corrected").foreach { legacyParserPolicy =>
+ withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key ->
legacyParserPolicy) {
+ checkTimestampType("yyyy", """{"a": "2018"}""", inferTimestamp)
+ checkTimestampType("yyyy=MM", """{"a": "2018=12"}""", inferTimestamp)
+ checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""",
inferTimestamp)
+ checkTimestampType(
+ "yyyy-MM-dd'T'HH:mm:ss.SSS",
+ """{"a": "2018-12-02T21:04:00.123"}""",
+ inferTimestamp)
+ checkTimestampType(
+ "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
+ """{"a": "2018-12-02T21:04:00.123567+01:00"}""",
+ inferTimestamp)
+ }
}
}
}
@@ -71,16 +78,19 @@ class JsonInferSchemaSuite extends SparkFunSuite with
SQLHelper {
}
test("skip decimal type inferring") {
- Seq("legacy", "corrected").foreach { legacyParserPolicy =>
- withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy)
{
- checkType(
- options = Map(
- "prefersDecimal" -> "false",
- "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
- ),
- json = """{"a": "20181202.210400123"}""",
- dt = TimestampType
- )
+ Seq(true, false).foreach { inferTimestamp =>
+ Seq("legacy", "corrected").foreach { legacyParserPolicy =>
+ withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key ->
legacyParserPolicy) {
+ checkType(
+ options = Map(
+ "prefersDecimal" -> "false",
+ "timestampFormat" -> "yyyyMMdd.HHmmssSSS",
+ "inferTimestamp" -> inferTimestamp.toString
+ ),
+ json = """{"a": "20181202.210400123"}""",
+ dt = if (inferTimestamp) TimestampType else StringType
+ )
+ }
}
}
}
diff --git a/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt
b/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt
index d0cd591..ff37084 100644
--- a/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt
@@ -7,106 +7,106 @@ OpenJDK 64-Bit Server VM
11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-106
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
JSON schema inferring: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 68879 68993
116 1.5 688.8 1.0X
-UTF-8 is set 115270 115602
455 0.9 1152.7 0.6X
+No encoding 69219 69342
116 1.4 692.2 1.0X
+UTF-8 is set 143950 143986
55 0.7 1439.5 0.5X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
count a short column: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 47452 47538
113 2.1 474.5 1.0X
-UTF-8 is set 77330 77354
30 1.3 773.3 0.6X
+No encoding 57828 57913
136 1.7 578.3 1.0X
+UTF-8 is set 83649 83711
60 1.2 836.5 0.7X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
count a wide column: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 60470 60900
534 0.2 6047.0 1.0X
-UTF-8 is set 104733 104931
189 0.1 10473.3 0.6X
+No encoding 64560 65193
1023 0.2 6456.0 1.0X
+UTF-8 is set 102925 103174
216 0.1 10292.5 0.6X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
select wide row: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 130302 131072
976 0.0 260604.6 1.0X
-UTF-8 is set 150860 151284
377 0.0 301720.1 0.9X
+No encoding 131002 132316
1160 0.0 262003.1 1.0X
+UTF-8 is set 152128 152371
332 0.0 304256.5 0.9X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Select a subset of 10 columns: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Select 10 columns 18619 18684
99 0.5 1861.9 1.0X
-Select 1 column 24227 24270
38 0.4 2422.7 0.8X
+Select 10 columns 19376 19514
160 0.5 1937.6 1.0X
+Select 1 column 24089 24156
58 0.4 2408.9 0.8X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
creation of JSON parser per line: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Short column without encoding 7947 7971
21 1.3 794.7 1.0X
-Short column with UTF-8 12700 12753
58 0.8 1270.0 0.6X
-Wide column without encoding 92632 92955
463 0.1 9263.2 0.1X
-Wide column with UTF-8 147013 147170
188 0.1 14701.3 0.1X
+Short column without encoding 8131 8219
103 1.2 813.1 1.0X
+Short column with UTF-8 13464 13508
44 0.7 1346.4 0.6X
+Wide column without encoding 108012 108598
914 0.1 10801.2 0.1X
+Wide column with UTF-8 150988 151369
412 0.1 15098.8 0.1X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
JSON functions: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Text read 713 734
19 14.0 71.3 1.0X
-from_json 22019 22429
456 0.5 2201.9 0.0X
-json_tuple 27987 28047
74 0.4 2798.7 0.0X
-get_json_object 21468 21870
350 0.5 2146.8 0.0X
+Text read 753 765
18 13.3 75.3 1.0X
+from_json 23182 23446
230 0.4 2318.2 0.0X
+json_tuple 31129 31304
181 0.3 3112.9 0.0X
+get_json_object 22821 23073
225 0.4 2282.1 0.0X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Dataset of json strings: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Text read 2887 2910
24 17.3 57.7 1.0X
-schema inferring 31793 31843
43 1.6 635.9 0.1X
-parsing 36791 37104
294 1.4 735.8 0.1X
+Text read 3078 3101
26 16.2 61.6 1.0X
+schema inferring 30225 30434
333 1.7 604.5 0.1X
+parsing 32237 32308
63 1.6 644.7 0.1X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Json files in the per-line mode: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Text read 10570 10611
45 4.7 211.4 1.0X
-Schema inferring 48729 48763
41 1.0 974.6 0.2X
-Parsing without charset 35490 35648
141 1.4 709.8 0.3X
-Parsing with UTF-8 63853 63994
163 0.8 1277.1 0.2X
+Text read 10835 10900
86 4.6 216.7 1.0X
+Schema inferring 37720 37805
110 1.3 754.4 0.3X
+Parsing without charset 35464 35538
100 1.4 709.3 0.3X
+Parsing with UTF-8 67311 67738
381 0.7 1346.2 0.2X
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Write dates and timestamps: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Create a dataset of timestamps 2187 2190
5 4.6 218.7 1.0X
-to_json(timestamp) 16262 16503
323 0.6 1626.2 0.1X
-write timestamps to files 11679 11692
12 0.9 1167.9 0.2X
-Create a dataset of dates 2297 2310
12 4.4 229.7 1.0X
-to_json(date) 10904 10956
46 0.9 1090.4 0.2X
-write dates to files 6610 6645
35 1.5 661.0 0.3X
+Create a dataset of timestamps 2208 2222
14 4.5 220.8 1.0X
+to_json(timestamp) 14299 14570
285 0.7 1429.9 0.2X
+write timestamps to files 12955 12969
13 0.8 1295.5 0.2X
+Create a dataset of dates 2297 2323
30 4.4 229.7 1.0X
+to_json(date) 8509 8561
74 1.2 850.9 0.3X
+write dates to files 6786 6827
45 1.5 678.6 0.3X
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Read dates and timestamps: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-read timestamp text from files 2524 2530
9 4.0 252.4 1.0X
-read timestamps from files 41002 41052
59 0.2 4100.2 0.1X
-infer timestamps from files 84621 84939
526 0.1 8462.1 0.0X
-read date text from files 2292 2302
9 4.4 229.2 1.1X
-read date from files 16954 16976
21 0.6 1695.4 0.1X
-timestamp strings 3067 3077
13 3.3 306.7 0.8X
-parse timestamps from Dataset[String] 48690 48971
243 0.2 4869.0 0.1X
-infer timestamps from Dataset[String] 97463 97786
338 0.1 9746.3 0.0X
-date strings 3952 3956
3 2.5 395.2 0.6X
-parse dates from Dataset[String] 24210 24241
30 0.4 2421.0 0.1X
-from_json(timestamp) 71710 72242
629 0.1 7171.0 0.0X
-from_json(date) 42465 42481
13 0.2 4246.5 0.1X
+read timestamp text from files 2598 2613
18 3.8 259.8 1.0X
+read timestamps from files 42007 42028
19 0.2 4200.7 0.1X
+infer timestamps from files 18102 18120
28 0.6 1810.2 0.1X
+read date text from files 2355 2360
5 4.2 235.5 1.1X
+read date from files 17420 17458
33 0.6 1742.0 0.1X
+timestamp strings 3099 3101
3 3.2 309.9 0.8X
+parse timestamps from Dataset[String] 48188 48215
25 0.2 4818.8 0.1X
+infer timestamps from Dataset[String] 22929 22988
102 0.4 2292.9 0.1X
+date strings 4090 4103
11 2.4 409.0 0.6X
+parse dates from Dataset[String] 24952 25068
139 0.4 2495.2 0.1X
+from_json(timestamp) 66038 66352
413 0.2 6603.8 0.0X
+from_json(date) 43755 43782
27 0.2 4375.5 0.1X
diff --git a/sql/core/benchmarks/JsonBenchmark-results.txt
b/sql/core/benchmarks/JsonBenchmark-results.txt
index 46d2410..0e4ce90 100644
--- a/sql/core/benchmarks/JsonBenchmark-results.txt
+++ b/sql/core/benchmarks/JsonBenchmark-results.txt
@@ -7,106 +7,106 @@ OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on
Linux 4.15.0-1063-aw
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
JSON schema inferring: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 63981 64044
56 1.6 639.8 1.0X
-UTF-8 is set 112672 113350
962 0.9 1126.7 0.6X
+No encoding 64950 65182
306 1.5 649.5 1.0X
+UTF-8 is set 129566 129796
229 0.8 1295.7 0.5X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
count a short column: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 51256 51449
180 2.0 512.6 1.0X
-UTF-8 is set 83694 83859
148 1.2 836.9 0.6X
+No encoding 50896 51277
372 2.0 509.0 1.0X
+UTF-8 is set 89712 89763
49 1.1 897.1 0.6X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
count a wide column: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 58440 59097
569 0.2 5844.0 1.0X
-UTF-8 is set 102746 102883
198 0.1 10274.6 0.6X
+No encoding 59415 59785
372 0.2 5941.5 1.0X
+UTF-8 is set 103059 103165
156 0.1 10305.9 0.6X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
select wide row: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-No encoding 128982 129304
356 0.0 257965.0 1.0X
-UTF-8 is set 147247 147415
231 0.0 294494.1 0.9X
+No encoding 132951 133122
288 0.0 265901.9 1.0X
+UTF-8 is set 149318 149441
107 0.0 298635.3 0.9X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Select a subset of 10 columns: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Select 10 columns 18837 19048
331 0.5 1883.7 1.0X
-Select 1 column 24707 24723
14 0.4 2470.7 0.8X
+Select 10 columns 18491 18552
85 0.5 1849.1 1.0X
+Select 1 column 25908 25946
65 0.4 2590.8 0.7X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
creation of JSON parser per line: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Short column without encoding 8218 8234
17 1.2 821.8 1.0X
-Short column with UTF-8 12374 12438
107 0.8 1237.4 0.7X
-Wide column without encoding 136918 137298
345 0.1 13691.8 0.1X
-Wide column with UTF-8 176961 177142
257 0.1 17696.1 0.0X
+Short column without encoding 9264 9307
49 1.1 926.4 1.0X
+Short column with UTF-8 14707 14727
17 0.7 1470.7 0.6X
+Wide column without encoding 141138 141347
276 0.1 14113.8 0.1X
+Wide column with UTF-8 179601 180035
664 0.1 17960.1 0.1X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
JSON functions: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Text read 1268 1278
12 7.9 126.8 1.0X
-from_json 23348 23479
176 0.4 2334.8 0.1X
-json_tuple 29606 30221
1024 0.3 2960.6 0.0X
-get_json_object 21898 22148
226 0.5 2189.8 0.1X
+Text read 1173 1184
9 8.5 117.3 1.0X
+from_json 23432 23738
338 0.4 2343.2 0.1X
+json_tuple 32573 32851
358 0.3 3257.3 0.0X
+get_json_object 22442 22489
47 0.4 2244.2 0.1X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Dataset of json strings: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Text read 5887 5944
49 8.5 117.7 1.0X
-schema inferring 46696 47054
312 1.1 933.9 0.1X
-parsing 32336 32450
129 1.5 646.7 0.2X
+Text read 5656 5680
31 8.8 113.1 1.0X
+schema inferring 33283 33337
64 1.5 665.7 0.2X
+parsing 41771 41929
178 1.2 835.4 0.1X
Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Json files in the per-line mode: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Text read 9756 9769
11 5.1 195.1 1.0X
-Schema inferring 51318 51433
108 1.0 1026.4 0.2X
-Parsing without charset 43609 43743
118 1.1 872.2 0.2X
-Parsing with UTF-8 60775 60844
106 0.8 1215.5 0.2X
+Text read 9626 9668
39 5.2 192.5 1.0X
+Schema inferring 39489 39579
91 1.3 789.8 0.2X
+Parsing without charset 38096 38232
125 1.3 761.9 0.3X
+Parsing with UTF-8 64565 64725
165 0.8 1291.3 0.1X
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Write dates and timestamps: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-Create a dataset of timestamps 1998 2015
17 5.0 199.8 1.0X
-to_json(timestamp) 18156 18317
263 0.6 1815.6 0.1X
-write timestamps to files 12912 12917
5 0.8 1291.2 0.2X
-Create a dataset of dates 2209 2270
53 4.5 220.9 0.9X
-to_json(date) 9433 9489
90 1.1 943.3 0.2X
-write dates to files 6915 6923
8 1.4 691.5 0.3X
+Create a dataset of timestamps 1898 1912
13 5.3 189.8 1.0X
+to_json(timestamp) 20011 20092
119 0.5 2001.1 0.1X
+write timestamps to files 13388 13427
35 0.7 1338.8 0.1X
+Create a dataset of dates 2351 2368
18 4.3 235.1 0.8X
+to_json(date) 11884 11913
40 0.8 1188.4 0.2X
+write dates to files 7317 7326
9 1.4 731.7 0.3X
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux
4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Read dates and timestamps: Best Time(ms) Avg Time(ms)
Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
-read timestamp text from files 2395 2412
17 4.2 239.5 1.0X
-read timestamps from files 47269 47334
89 0.2 4726.9 0.1X
-infer timestamps from files 91806 91851
67 0.1 9180.6 0.0X
-read date text from files 2118 2133
13 4.7 211.8 1.1X
-read date from files 17267 17340
115 0.6 1726.7 0.1X
-timestamp strings 3906 3935
26 2.6 390.6 0.6X
-parse timestamps from Dataset[String] 52244 52534
279 0.2 5224.4 0.0X
-infer timestamps from Dataset[String] 100488 100714
198 0.1 10048.8 0.0X
-date strings 4572 4584
12 2.2 457.2 0.5X
-parse dates from Dataset[String] 26749 26768
17 0.4 2674.9 0.1X
-from_json(timestamp) 71414 71867
556 0.1 7141.4 0.0X
-from_json(date) 45322 45549
250 0.2 4532.2 0.1X
+read timestamp text from files 2316 2324
13 4.3 231.6 1.0X
+read timestamps from files 43712 43900
165 0.2 4371.2 0.1X
+infer timestamps from files 19302 19328
38 0.5 1930.2 0.1X
+read date text from files 2090 2099
11 4.8 209.0 1.1X
+read date from files 18914 18940
44 0.5 1891.4 0.1X
+timestamp strings 3785 3793
11 2.6 378.5 0.6X
+parse timestamps from Dataset[String] 51177 51353
160 0.2 5117.7 0.0X
+infer timestamps from Dataset[String] 27907 28119
186 0.4 2790.7 0.1X
+date strings 4446 4452
6 2.2 444.6 0.5X
+parse dates from Dataset[String] 28124 28172
55 0.4 2812.4 0.1X
+from_json(timestamp) 71432 71827
354 0.1 7143.2 0.0X
+from_json(date) 46497 46651
163 0.2 4649.7 0.0X
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index 19ec586..d95d4e0 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -2610,7 +2610,9 @@ abstract class JsonSuite extends QueryTest with
SharedSparkSession with TestJson
}
test("inferring timestamp type") {
- def schemaOf(jsons: String*): StructType =
spark.read.json(jsons.toDS).schema
+ def schemaOf(jsons: String*): StructType = {
+ spark.read.option("inferTimestamp", true).json(jsons.toDS).schema
+ }
assert(schemaOf(
"""{"a":"2018-12-17T10:11:12.123-01:00"}""",
@@ -2633,6 +2635,7 @@ abstract class JsonSuite extends QueryTest with
SharedSparkSession with TestJson
val timestampsWithFormatPath =
s"${dir.getCanonicalPath}/timestampsWithFormat.json"
val timestampsWithFormat = spark.read
.option("timestampFormat", "dd/MM/yyyy HH:mm")
+ .option("inferTimestamp", true)
.json(datesRecords)
assert(timestampsWithFormat.schema === customSchema)
@@ -2645,6 +2648,7 @@ abstract class JsonSuite extends QueryTest with
SharedSparkSession with TestJson
val readBack = spark.read
.option("timestampFormat", "yyyy-MM-dd HH:mm:ss")
.option(DateTimeUtils.TIMEZONE_OPTION, "UTC")
+ .option("inferTimestamp", true)
.json(timestampsWithFormatPath)
assert(readBack.schema === customSchema)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]