[jira] [Commented] (SPARK-19228) inferSchema function processed csv date column as string and "dateFormat" DataSource option is ignored
[ https://issues.apache.org/jira/browse/SPARK-19228?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16723960#comment-16723960 ] ASF GitHub Bot commented on SPARK-19228: HyukjinKwon closed pull request #21363: [SPARK-19228][SQL] Migrate on Java 8 time from FastDateFormat for meet the ISO8601 URL: https://github.com/apache/spark/pull/21363 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 80f15053005ff..9eaf6a2862a0f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.catalyst.util import java.sql.{Date, Timestamp} import java.text.{DateFormat, SimpleDateFormat} +import java.time.LocalDateTime +import java.time.temporal.ChronoField import java.util.{Calendar, Locale, TimeZone} import java.util.concurrent.ConcurrentHashMap import java.util.function.{Function => JFunction} @@ -143,6 +145,12 @@ object DateTimeUtils { millisLocal - getOffsetFromLocalMillis(millisLocal, timeZone) } + def dateTimeToMicroseconds(localDateTime: LocalDateTime, timeZone: TimeZone): Long = { +val microOfSecond = localDateTime.getLong(ChronoField.MICRO_OF_SECOND) +val epochSecond = localDateTime.atZone(timeZone.toZoneId).toInstant.getEpochSecond +epochSecond * 1000000L + microOfSecond + } + def dateToString(days: SQLDate): String = getThreadLocalDateFormat.format(toJavaDate(days)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index cbf6106697f30..cd1b7395b97d5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -19,11 +19,14 @@ package org.apache.spark.sql.catalyst.util import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat +import java.time.LocalDateTime +import java.time.format.DateTimeFormatter import java.util.{Calendar, Locale, TimeZone} import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.util.DateTimeUtils._ import org.apache.spark.unsafe.types.UTF8String +import org.junit.Assert.assertEquals class DateTimeUtilsSuite extends SparkFunSuite { @@ -645,6 +648,18 @@ class DateTimeUtilsSuite extends SparkFunSuite { } } + test("Java 8 LocalDateTime to microseconds") { +val nanos = "2015-05-09 00:10:23.999750987" +var formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSSSSSSSS") +val localDateTimeInNanos = LocalDateTime.parse(nanos, formatter) +val timeInMicros = dateTimeToMicroseconds(localDateTimeInNanos, TimeZonePST) +assertEquals(1431155423999750L, timeInMicros) +val micros = "2015-05-09 00:10:23.999750" +formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSSSSS") +val localDateTimeInMicros = LocalDateTime.parse(micros, formatter) +assertEquals(timeInMicros, dateTimeToMicroseconds(localDateTimeInMicros, TimeZonePST)) + } + test("daysToMillis and millisToDays") { val c = Calendar.getInstance(TimeZonePST) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala index a585cbed2551b..6239f5666cd4f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala @@ -90,6 +90,7 @@ private[csv] object CSVInferSchema { // DecimalTypes have different precisions and scales, so we try to find the common type. findTightestCommonType(typeSoFar, tryParseDecimal(field, options)).getOrElse(StringType) case DoubleType => tryParseDouble(field, options) +case DateType => tryParseDate(field, options) case TimestampType => tryParseTimestamp(field, options) case BooleanType => tryParseBoolean(field, options) case StringType => StringType @@ -140,14 +141,23 @@ private[csv] object CSVInferSchema { private def tryParseDouble(field: String, options: CSVOptions): DataType = { if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field, options)) { DoubleT
[jira] [Commented] (SPARK-19228) inferSchema function processed csv date column as string and "dateFormat" DataSource option is ignored
[ https://issues.apache.org/jira/browse/SPARK-19228?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16480582#comment-16480582 ] Apache Spark commented on SPARK-19228: -- User 'sergey-rubtsov' has created a pull request for this issue: https://github.com/apache/spark/pull/21363 > inferSchema function processed csv date column as string and "dateFormat" > DataSource option is ignored > -- > > Key: SPARK-19228 > URL: https://issues.apache.org/jira/browse/SPARK-19228 > Project: Spark > Issue Type: Bug > Components: Input/Output, SQL >Affects Versions: 2.1.0 >Reporter: Sergey Rubtsov >Priority: Major > Labels: easyfix > Original Estimate: 6h > Remaining Estimate: 6h > > Current FastDateFormat can't properly parse date and timestamp and does not > meet the ISO8601. > That is why there is now supporting for inferring DateType and custom > "dateFormat" option for csv parsing. > For example, I need to process user.csv like this: > {code:java} > id,project,started,ended > sergey.rubtsov,project0,12/12/2012,10/10/2015 > {code} > When I add date format options: > {code:java} > Dataset users = spark.read().format("csv").option("mode", > "PERMISSIVE").option("header", "true") > .option("inferSchema", > "true").option("dateFormat", > "dd/MM/yyyy").load("src/main/resources/user.csv"); > users.printSchema(); > {code} > expected scheme should be > {code:java} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: date (nullable = true) > |-- ended: date (nullable = true) > {code} > but the actual result is: > {code:java} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: string (nullable = true) > |-- ended: string (nullable = true) > {code} > This mean that date processed as string and "dateFormat" option is ignored. 
> If I add option > {code:java} > .option("timestampFormat", "dd/MM/yyyy") > {code} > result is: > {code:java} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: timestamp (nullable = true) > |-- ended: timestamp (nullable = true) > {code} > I think, the issue is somewhere in object CSVInferSchema, function > inferField, lines 80-97 and > method "tryParseDate" need to be added before/after "tryParseTimestamp", or > date/timestamp process logic need to be changed. -- This message was sent by Atlassian JIRA (v7.6.3#76005) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-19228) inferSchema function processed csv date column as string and "dateFormat" DataSource option is ignored
[ https://issues.apache.org/jira/browse/SPARK-19228?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16478811#comment-16478811 ] Sergey Rubtsov commented on SPARK-19228: Java 8 contains new java.time module, also it can fix an old bug with parse string to SQL's timestamp value in microseconds accuracy: https://issues.apache.org/jira/browse/SPARK-10681.x > inferSchema function processed csv date column as string and "dateFormat" > DataSource option is ignored > -- > > Key: SPARK-19228 > URL: https://issues.apache.org/jira/browse/SPARK-19228 > Project: Spark > Issue Type: Bug > Components: Input/Output, SQL >Affects Versions: 2.1.0 >Reporter: Sergey Rubtsov >Priority: Major > Labels: easyfix > Original Estimate: 6h > Remaining Estimate: 6h > > I need to process user.csv like this: > {code} > id,project,started,ended > sergey.rubtsov,project0,12/12/2012,10/10/2015 > {code} > When I add date format options: > {code} > Dataset users = spark.read().format("csv").option("mode", > "PERMISSIVE").option("header", "true") > .option("inferSchema", > "true").option("dateFormat", > "dd/MM/").load("src/main/resources/user.csv"); > users.printSchema(); > {code} > expected scheme should be > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: date (nullable = true) > |-- ended: date (nullable = true) > {code} > but the actual result is: > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: string (nullable = true) > |-- ended: string (nullable = true) > {code} > This mean that date processed as string and "dateFormat" option is ignored. 
> If I add option > {code} > .option("timestampFormat", "dd/MM/yyyy") > {code} > result is: > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: timestamp (nullable = true) > |-- ended: timestamp (nullable = true) > {code} > I think, the issue is somewhere in object CSVInferSchema, function > inferField, lines 80-97 and > method "tryParseDate" need to be added before/after "tryParseTimestamp", or > date/timestamp process logic need to be changed. -- This message was sent by Atlassian JIRA (v7.6.3#76005) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-19228) inferSchema function processed csv date column as string and "dateFormat" DataSource option is ignored
[ https://issues.apache.org/jira/browse/SPARK-19228?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16309509#comment-16309509 ] Apache Spark commented on SPARK-19228: -- User 'sergey-rubtsov' has created a pull request for this issue: https://github.com/apache/spark/pull/20140 > inferSchema function processed csv date column as string and "dateFormat" > DataSource option is ignored > -- > > Key: SPARK-19228 > URL: https://issues.apache.org/jira/browse/SPARK-19228 > Project: Spark > Issue Type: Bug > Components: Input/Output, SQL >Affects Versions: 2.1.0 >Reporter: Sergey Rubtsov > Labels: easyfix > Original Estimate: 6h > Remaining Estimate: 6h > > I need to process user.csv like this: > {code} > id,project,started,ended > sergey.rubtsov,project0,12/12/2012,10/10/2015 > {code} > When I add date format options: > {code} > Dataset users = spark.read().format("csv").option("mode", > "PERMISSIVE").option("header", "true") > .option("inferSchema", > "true").option("dateFormat", > "dd/MM/").load("src/main/resources/user.csv"); > users.printSchema(); > {code} > expected scheme should be > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: date (nullable = true) > |-- ended: date (nullable = true) > {code} > but the actual result is: > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: string (nullable = true) > |-- ended: string (nullable = true) > {code} > This mean that date processed as string and "dateFormat" option is ignored. 
> If I add option > {code} > .option("timestampFormat", "dd/MM/yyyy") > {code} > result is: > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: timestamp (nullable = true) > |-- ended: timestamp (nullable = true) > {code} > I think, the issue is somewhere in object CSVInferSchema, function > inferField, lines 80-97 and > method "tryParseDate" need to be added before/after "tryParseTimestamp", or > date/timestamp process logic need to be changed. -- This message was sent by Atlassian JIRA (v6.4.14#64029) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-19228) inferSchema function processed csv date column as string and "dateFormat" DataSource option is ignored
[ https://issues.apache.org/jira/browse/SPARK-19228?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15925775#comment-15925775 ] Sergey Rubtsov commented on SPARK-19228: Hi [~hyukjin.kwon], Updated pull request: https://github.com/apache/spark/pull/16735 Please, take a look. Couldn't run tests in CSVSuite locally on my Windows OS, apologize for the possible test fails > inferSchema function processed csv date column as string and "dateFormat" > DataSource option is ignored > -- > > Key: SPARK-19228 > URL: https://issues.apache.org/jira/browse/SPARK-19228 > Project: Spark > Issue Type: Bug > Components: Input/Output, SQL >Affects Versions: 2.1.0 >Reporter: Sergey Rubtsov > Labels: easyfix > Original Estimate: 6h > Remaining Estimate: 6h > > I need to process user.csv like this: > {code} > id,project,started,ended > sergey.rubtsov,project0,12/12/2012,10/10/2015 > {code} > When I add date format options: > {code} > Dataset users = spark.read().format("csv").option("mode", > "PERMISSIVE").option("header", "true") > .option("inferSchema", > "true").option("dateFormat", > "dd/MM/").load("src/main/resources/user.csv"); > users.printSchema(); > {code} > expected scheme should be > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: date (nullable = true) > |-- ended: date (nullable = true) > {code} > but the actual result is: > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: string (nullable = true) > |-- ended: string (nullable = true) > {code} > This mean that date processed as string and "dateFormat" option is ignored. 
> If I add option > {code} > .option("timestampFormat", "dd/MM/yyyy") > {code} > result is: > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: timestamp (nullable = true) > |-- ended: timestamp (nullable = true) > {code} > I think, the issue is somewhere in object CSVInferSchema, function > inferField, lines 80-97 and > method "tryParseDate" need to be added before/after "tryParseTimestamp", or > date/timestamp process logic need to be changed. -- This message was sent by Atlassian JIRA (v6.3.15#6346) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-19228) inferSchema function processed csv date column as string and "dateFormat" DataSource option is ignored
[ https://issues.apache.org/jira/browse/SPARK-19228?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15825860#comment-15825860 ] Sergey Rubtsov commented on SPARK-19228: Okey, I will do it. > inferSchema function processed csv date column as string and "dateFormat" > DataSource option is ignored > -- > > Key: SPARK-19228 > URL: https://issues.apache.org/jira/browse/SPARK-19228 > Project: Spark > Issue Type: Bug > Components: Input/Output, SQL >Affects Versions: 2.1.0 >Reporter: Sergey Rubtsov > Labels: easyfix > Original Estimate: 6h > Remaining Estimate: 6h > > I need to process user.csv like this: > {code} > id,project,started,ended > sergey.rubtsov,project0,12/12/2012,10/10/2015 > {code} > When I add date format options: > {code} > Dataset users = spark.read().format("csv").option("mode", > "PERMISSIVE").option("header", "true") > .option("inferSchema", > "true").option("dateFormat", > "dd/MM/").load("src/main/resources/user.csv"); > users.printSchema(); > {code} > expected scheme should be > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: date (nullable = true) > |-- ended: date (nullable = true) > {code} > but the actual result is: > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: string (nullable = true) > |-- ended: string (nullable = true) > {code} > This mean that date processed as string and "dateFormat" option is ignored. 
> If I add option > {code} > .option("timestampFormat", "dd/MM/yyyy") > {code} > result is: > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: timestamp (nullable = true) > |-- ended: timestamp (nullable = true) > {code} > I think, the issue is somewhere in object CSVInferSchema, function > inferField, lines 80-97 and > method "tryParseDate" need to be added before/after "tryParseTimestamp", or > date/timestamp process logic need to be changed. -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-19228) inferSchema function processed csv date column as string and "dateFormat" DataSource option is ignored
[ https://issues.apache.org/jira/browse/SPARK-19228?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15823419#comment-15823419 ] Hyukjin Kwon commented on SPARK-19228: -- Yes, inferring {{DateType}} is currently not supported (we don't have {{tryParseDate}}). As we now have two different options, {{dateFormat}} for {{DateType}} and {{timestampFormat}} for {{TimestampType}} each which were unified with {{dateFormat}} option before, I think it'd make sense to make this possible by introducing {{tryParseDate}}. > inferSchema function processed csv date column as string and "dateFormat" > DataSource option is ignored > -- > > Key: SPARK-19228 > URL: https://issues.apache.org/jira/browse/SPARK-19228 > Project: Spark > Issue Type: Bug > Components: Input/Output, SQL >Affects Versions: 2.1.0 >Reporter: Sergey Rubtsov > Labels: easyfix > Original Estimate: 6h > Remaining Estimate: 6h > > I need to process user.csv like this: > {code} > id,project,started,ended > sergey.rubtsov,project0,12/12/2012,10/10/2015 > {code} > When I add date format options: > {code} > Dataset users = spark.read().format("csv").option("mode", > "PERMISSIVE").option("header", "true") > .option("inferSchema", > "true").option("dateFormat", > "dd/MM/").load("src/main/resources/user.csv"); > users.printSchema(); > {code} > expected scheme should be > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: date (nullable = true) > |-- ended: date (nullable = true) > {code} > but the actual result is: > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: string (nullable = true) > |-- ended: string (nullable = true) > {code} > This mean that date processed as string and "dateFormat" option is ignored > and date processed as string. 
> If I add option > {code} > .option("timestampFormat", "dd/MM/yyyy") > {code} > result is: > {code} > root > |-- id: string (nullable = true) > |-- project: string (nullable = true) > |-- started: timestamp (nullable = true) > |-- ended: timestamp (nullable = true) > {code} > I think, the issue is somewhere in object CSVInferSchema, function > inferField, lines 80-97 and > method "tryParseDate" need to be added before/after "tryParseTimestamp", or > date/timestamp process logic need to be changed. -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org