[GitHub] spark pull request #23150: [SPARK-26178][SQL] Use java.time API for parsing ...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/23150 --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #23150: [SPARK-26178][SQL] Use java.time API for parsing ...
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/23150#discussion_r238195206 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala --- @@ -1618,6 +1618,13 @@ object SQLConf { "a SparkConf entry.") .booleanConf .createWithDefault(true) + + val LEGACY_TIME_PARSER_ENABLED = buildConf("spark.sql.legacy.timeParser.enabled") +.doc("When set to true, java.text.SimpleDateFormat is using for formatting and parsing " + + " dates/timestamps in a locale-sensitive manner. When set to false, classes from " + + "java.time.* packages are using for the same purpose.") --- End diff -- ditto --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #23150: [SPARK-26178][SQL] Use java.time API for parsing ...
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/23150#discussion_r238195153 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala --- @@ -1618,6 +1618,13 @@ object SQLConf { "a SparkConf entry.") .booleanConf .createWithDefault(true) + + val LEGACY_TIME_PARSER_ENABLED = buildConf("spark.sql.legacy.timeParser.enabled") +.doc("When set to true, java.text.SimpleDateFormat is using for formatting and parsing " + --- End diff -- nit `using` -> `used` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #23150: [SPARK-26178][SQL] Use java.time API for parsing ...
Github user MaxGekk commented on a diff in the pull request: https://github.com/apache/spark/pull/23150#discussion_r238075711 --- Diff: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala --- @@ -86,62 +85,74 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper { // null. Seq(true, false).foreach { b => val options = new CSVOptions(Map("nullValue" -> "null"), false, "GMT") - val converter = -parser.makeConverter("_1", StringType, nullable = b, options = options) + val parser = new UnivocityParser(StructType(Seq.empty), options) + val converter = parser.makeConverter("_1", StringType, nullable = b) assert(converter.apply("") == UTF8String.fromString("")) } } test("Throws exception for empty string with non null type") { - val options = new CSVOptions(Map.empty[String, String], false, "GMT") +val options = new CSVOptions(Map.empty[String, String], false, "GMT") +val parser = new UnivocityParser(StructType(Seq.empty), options) val exception = intercept[RuntimeException]{ - parser.makeConverter("_1", IntegerType, nullable = false, options = options).apply("") + parser.makeConverter("_1", IntegerType, nullable = false).apply("") } assert(exception.getMessage.contains("null value found but field _1 is not nullable.")) } test("Types are cast correctly") { val options = new CSVOptions(Map.empty[String, String], false, "GMT") -assert(parser.makeConverter("_1", ByteType, options = options).apply("10") == 10) -assert(parser.makeConverter("_1", ShortType, options = options).apply("10") == 10) -assert(parser.makeConverter("_1", IntegerType, options = options).apply("10") == 10) -assert(parser.makeConverter("_1", LongType, options = options).apply("10") == 10) -assert(parser.makeConverter("_1", FloatType, options = options).apply("1.00") == 1.0) -assert(parser.makeConverter("_1", DoubleType, options = options).apply("1.00") == 1.0) -assert(parser.makeConverter("_1", BooleanType, options = options).apply("true") == true) -
-val timestampsOptions = +var parser = new UnivocityParser(StructType(Seq.empty), options) +assert(parser.makeConverter("_1", ByteType).apply("10") == 10) +assert(parser.makeConverter("_1", ShortType).apply("10") == 10) +assert(parser.makeConverter("_1", IntegerType).apply("10") == 10) +assert(parser.makeConverter("_1", LongType).apply("10") == 10) +assert(parser.makeConverter("_1", FloatType).apply("1.00") == 1.0) +assert(parser.makeConverter("_1", DoubleType).apply("1.00") == 1.0) +assert(parser.makeConverter("_1", BooleanType).apply("true") == true) + +var timestampsOptions = new CSVOptions(Map("timestampFormat" -> "dd/MM/yyyy hh:mm"), false, "GMT") +parser = new UnivocityParser(StructType(Seq.empty), timestampsOptions) val customTimestamp = "31/01/2015 00:00" -val expectedTime = timestampsOptions.timestampFormat.parse(customTimestamp).getTime -val castedTimestamp = - parser.makeConverter("_1", TimestampType, nullable = true, options = timestampsOptions) +var format = FastDateFormat.getInstance( + timestampsOptions.timestampFormat, timestampsOptions.timeZone, timestampsOptions.locale) +val expectedTime = format.parse(customTimestamp).getTime +val castedTimestamp = parser.makeConverter("_1", TimestampType, nullable = true) .apply(customTimestamp) assert(castedTimestamp == expectedTime * 1000L) val customDate = "31/01/2015" val dateOptions = new CSVOptions(Map("dateFormat" -> "dd/MM/yyyy"), false, "GMT") -val expectedDate = dateOptions.dateFormat.parse(customDate).getTime -val castedDate = - parser.makeConverter("_1", DateType, nullable = true, options = dateOptions) -.apply(customTimestamp) -assert(castedDate == DateTimeUtils.millisToDays(expectedDate)) +parser = new UnivocityParser(StructType(Seq.empty), dateOptions) +format = FastDateFormat.getInstance( + dateOptions.dateFormat, dateOptions.timeZone, dateOptions.locale) +val expectedDate = format.parse(customDate).getTime +val castedDate = parser.makeConverter("_1", DateType, nullable = true) +.apply(customDate) 
+assert(castedDate == DateTimeUtils.millisToDays(expectedDate, TimeZone.getTimeZone("GMT"))) val timestamp = "2015-01-01 00:00:00" -assert(parser.makeConverter("_1", TimestampType, options = options).apply(timestamp) == - DateTimeUtils.stringToTime(timestamp).getTime * 1000L) -assert(parser.makeConverter("_1", DateType, options =
[GitHub] spark pull request #23150: [SPARK-26178][SQL] Use java.time API for parsing ...
Github user MaxGekk commented on a diff in the pull request: https://github.com/apache/spark/pull/23150#discussion_r238075664 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala --- @@ -622,10 +623,11 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te val options = Map( "header" -> "true", "inferSchema" -> "false", - "dateFormat" -> "dd/MM/yyyy hh:mm") + "dateFormat" -> "dd/MM/yyyy HH:mm") --- End diff -- According to iso 8601: ``` h clock-hour-of-am-pm (1-12) number 12 H hour-of-day (0-23) number 0 ``` but real data is not in the allowed range: ``` date 26/08/2015 18:00 27/10/2014 18:30 28/01/2016 20:00 ``` --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #23150: [SPARK-26178][SQL] Use java.time API for parsing ...
Github user MaxGekk commented on a diff in the pull request: https://github.com/apache/spark/pull/23150#discussion_r238075585 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala --- @@ -1107,7 +1107,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te test("SPARK-18699 put malformed records in a `columnNameOfCorruptRecord` field") { Seq(false, true).foreach { multiLine => - val schema = new StructType().add("a", IntegerType).add("b", TimestampType) + val schema = new StructType().add("a", IntegerType).add("b", DateType) --- End diff -- I changed the type because the supposedly valid date `"1983-08-04"` cannot be parsed with the default timestamp pattern. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #23150: [SPARK-26178][SQL] Use java.time API for parsing ...
Github user mgaido91 commented on a diff in the pull request: https://github.com/apache/spark/pull/23150#discussion_r237451758 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala --- @@ -23,10 +23,16 @@ import scala.util.control.Exception.allCatch import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.analysis.TypeCoercion -import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.catalyst.util.DateTimeFormatter import org.apache.spark.sql.types._ -object CSVInferSchema { +class CSVInferSchema(val options: CSVOptions) extends Serializable { --- End diff -- since we get the `CSVOptions` in the constructor, shall we remove it as a parameter of the several methods? it is pretty confusing which one is used right now... --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #23150: [SPARK-26178][SQL] Use java.time API for parsing ...
GitHub user MaxGekk opened a pull request: https://github.com/apache/spark/pull/23150 [SPARK-26178][SQL] Use java.time API for parsing timestamps and dates from CSV ## What changes were proposed in this pull request? In the PR, I propose to use **java.time API** for parsing timestamps and dates from CSV content with microseconds precision. The SQL config `spark.sql.legacy.timeParser.enabled` allow to switch back to previous behaviour with using `java.text.SimpleDateFormat`/`FastDateFormat` for parsing/generating timestamps/dates. ## How was this patch tested? It was tested by `UnivocityParserSuite`, `CsvExpressionsSuite`, `CsvFunctionsSuite` and `CsvSuite`. You can merge this pull request into a Git repository by running: $ git pull https://github.com/MaxGekk/spark-1 time-parser Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/23150.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #23150 commit 74a76c2f78ad139993f3bbe0f2ff8f1c81c3bd84 Author: Maxim Gekk Date: 2018-11-24T11:37:55Z New and legacy time parser commit 63cf6112085029c52e4aee6f9bb2e6b84ce18a96 Author: Maxim Gekk Date: 2018-11-24T12:00:10Z Add config spark.sql.legacy.timeParser.enabled commit 2a2ab83a5ecb251ce81e7f12a8c0d3067f88b2d5 Author: Maxim Gekk Date: 2018-11-24T13:24:07Z Fallback legacy parser commit 667bf9f65a90ac69b8cbbad77a17e21f9dd18733 Author: Maxim Gekk Date: 2018-11-24T15:54:19Z something commit 227a7bdc53bdd022e9c365b410810c58f56e8bea Author: Maxim Gekk Date: 2018-11-25T12:15:15Z Using instances commit 73ee56088bf4d2856c454a7bbd4171b61cfe4614 Author: Maxim Gekk Date: 2018-11-25T13:52:02Z Added generator commit f35f6e13270eb994ac97627da79497673b4fe686 Author: Maxim Gekk Date: 2018-11-25T18:03:17Z Refactoring of TimeFormatter commit 1c09b58e6fe3e0fd565c852dcb73dc012fa56819 Author: Maxim Gekk Date: 2018-11-25T18:06:22Z Renaming to 
DateTimeFormatter commit 7b213d5b2ae404c87f090da622a78d3d19fee6a9 Author: Maxim Gekk Date: 2018-11-25T18:32:54Z Added DateFormatter commit 242ba474dcf112b48bd286811daed86a66366c39 Author: Maxim Gekk Date: 2018-11-25T19:58:08Z Default values in parsing commit db48ee6918eef06e19c3bdf64e3c44f4541cc294 Author: Maxim Gekk Date: 2018-11-25T21:09:08Z Parse as date type because format for timestamp is not not matched to values commit e18841b38050ac411a507a2a2643584f2c8739ce Author: Maxim Gekk Date: 2018-11-25T21:53:11Z Fix tests commit 8db023834b680f336ff5a0e08253ba2cb3b6e3b7 Author: Maxim Gekk Date: 2018-11-25T23:09:20Z CSVSuite passed commit 0b9ed92a456d60db0934340f37e0bd428b2f6a42 Author: Maxim Gekk Date: 2018-11-26T22:00:10Z Fix imports commit 799ebb3432dec7fe1e1099d68a3f1c09e714aa8e Author: Maxim Gekk Date: 2018-11-26T22:03:19Z Revert test back commit 5a223919439e2d22814b92c0e1e572b3c318566f Author: Maxim Gekk Date: 2018-11-26T22:17:11Z Set timeZone commit f287b77d94de9e9f466c0ff2c2370f22a46b48f7 Author: Maxim Gekk Date: 2018-11-26T22:44:42Z Removing default for micros because it causes conflicts in parsing --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org