cloud-fan commented on a change in pull request #26507: [SPARK-29904][SQL][2.4] 
Parse timestamps in microsecond precision by JSON/CSV datasources
URL: https://github.com/apache/spark/pull/26507#discussion_r346661193
 
 

 ##########
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
 ##########
 @@ -1164,4 +1167,48 @@ object DateTimeUtils {
     threadLocalTimestampFormat.remove()
     threadLocalDateFormat.remove()
   }
+
+  /**
+   * The custom sub-class of `GregorianCalendar` is needed to get access to
+   * the array of parsed `fields` immediately after parsing. We cannot use
+   * the `get()` method because it performs normalization of the fraction
+   * part. Accordingly, the `MILLISECOND` field doesn't contain original value.
+   */
+  class MicrosCalendar(tz: TimeZone) extends GregorianCalendar(tz, Locale.US) {
+    // Converts parsed `MILLISECOND` field to seconds fraction in microsecond precision.
+    // For example, if the fraction pattern is `SSSS`, then `digitsInFraction` = 4; if the
+    // `MILLISECOND` field was parsed to `1234`, the result is `123400` microseconds.
+    def getMicros(digitsInFraction: Int): SQLTimestamp = {
+      // Append 6 zeros (multiply by `MICROS_PER_SECOND`) to the field: 1234 -> 1234000000
+      val d = fields(Calendar.MILLISECOND) * MICROS_PER_SECOND
+      // Take the first 6 digits from `d`: 1234000000 -> 123400
+      // The dropped tail has exactly `digitsInFraction` digits (`0000`), so divide by
+      // 10 ^ digitsInFraction: the result is `(1234 * 1000000) / (10 ^ digitsInFraction)`
+      d / Decimal.POW_10(digitsInFraction)
+    }
+  }
+
+  /**
+   * An instance of this class is intended to be reused many times. It contains helper objects
+   * that can be reused between `parse()` invocations.
+   * @param format The parser itself.
+   * @param digitsInFraction The number of digits in the seconds fraction 
precalculated
+   *                         from the pattern. For `ss.SSSS`, it is 4.
+   * @param cal The calendar which can get microseconds from the second 
fraction.
+   */
+  class DateTimeParser(format: FastDateFormat, digitsInFraction: Int, cal: 
MicrosCalendar) {
+    def parse(s: String): SQLTimestamp = {
+      cal.clear() // Clear the calendar because it can be re-used many times
+      if (!format.parse(s, new ParsePosition(0), cal)) {
+        throw new IllegalArgumentException(s"'$s' is an invalid timestamp")
+      }
+      val micros = cal.getMicros(digitsInFraction)
+      cal.set(Calendar.MILLISECOND, 0)
+      cal.getTimeInMillis * MICROS_PER_MILLIS + micros
+    }
+  }
+
+  def getDateTimeParser(format: FastDateFormat, tz: TimeZone): DateTimeParser 
= {
 
 Review comment:
   We can get the time zone via `format.getTimeZone`.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to