This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 488b7bab9 Parse timestamps with arbitrary seconds fraction (#3858)
488b7bab9 is described below

commit 488b7bab9fa2b0b615639f6a7dd73b4acb794e1e
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Tue Mar 14 18:52:22 2023 +0000

    Parse timestamps with arbitrary seconds fraction (#3858)
    
    * Parse timestamps with arbitrary seconds fraction
    
    * Review feedback
---
 arrow-cast/src/parse.rs | 98 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 67 insertions(+), 31 deletions(-)

diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs
index 38fb4fc29..36bc8777c 100644
--- a/arrow-cast/src/parse.rs
+++ b/arrow-cast/src/parse.rs
@@ -23,6 +23,14 @@ use arrow_schema::ArrowError;
 use chrono::prelude::*;
 use std::str::FromStr;
 
+#[inline]
+fn parse_nanos<const N: usize>(digits: &[u8]) -> u32 {
+    digits[..N] // first N digit values, most significant first
+        .iter()
+        .fold(0_u32, |acc, v| acc * 10 + *v as u32) // base-10 accumulate
+        * 10_u32.pow((9 - N) as _) // scale N fractional digits to nanoseconds
+}
+
 /// Helper for parsing timestamps
 struct TimestampParser {
     /// The timestamp bytes to parse minus `b'0'`
@@ -89,34 +97,23 @@ impl TimestampParser {
                 let second = self.digits[17] * 10 + self.digits[18];
                 let time = NaiveTime::from_hms_opt(hour as _, minute as _, 
second as _)?;
 
-                let millis = || {
-                    self.digits[20] as u32 * 100_000_000
-                        + self.digits[21] as u32 * 10_000_000
-                        + self.digits[22] as u32 * 1_000_000
-                };
-
-                let micros = || {
-                    self.digits[23] as u32 * 100_000
-                        + self.digits[24] as u32 * 10_000
-                        + self.digits[25] as u32 * 1_000
-                };
-
-                let nanos = || {
-                    self.digits[26] as u32 * 100
-                        + self.digits[27] as u32 * 10
-                        + self.digits[28] as u32
-                };
-
                 match self.test(19, b'.') {
-                    true => match (self.mask >> 20).trailing_ones() {
-                        3 => Some((time.with_nanosecond(millis())?, 23)),
-                        6 => Some((time.with_nanosecond(millis() + micros())?, 
26)),
-                        9 => Some((
-                            time.with_nanosecond(millis() + micros() + 
nanos())?,
-                            29,
-                        )),
-                        _ => None,
-                    },
+                    true => {
+                        let digits = (self.mask >> 20).trailing_ones();
+                        let nanos = match digits {
+                            0 => return None,
+                            1 => parse_nanos::<1>(&self.digits[20..21]),
+                            2 => parse_nanos::<2>(&self.digits[20..22]),
+                            3 => parse_nanos::<3>(&self.digits[20..23]),
+                            4 => parse_nanos::<4>(&self.digits[20..24]),
+                            5 => parse_nanos::<5>(&self.digits[20..25]),
+                            6 => parse_nanos::<6>(&self.digits[20..26]),
+                            7 => parse_nanos::<7>(&self.digits[20..27]),
+                            8 => parse_nanos::<8>(&self.digits[20..28]),
+                            _ => parse_nanos::<9>(&self.digits[20..29]),
+                        };
+                        Some((time.with_nanosecond(nanos)?, 20 + digits as 
usize))
+                    }
                     false => Some((time, 19)),
                 }
             }
@@ -195,8 +192,16 @@ pub fn string_to_datetime<T: TimeZone>(
         return Err(err("invalid timestamp separator"));
     }
 
-    let (time, tz_offset) = parser.time().ok_or_else(|| err("error parsing 
time"))?;
+    let (time, mut tz_offset) = parser.time().ok_or_else(|| err("error parsing 
time"))?;
     let datetime = date.and_time(time);
+
+    if tz_offset == 32 {
+        // Decimal overrun: skip surplus digits (bounds check before indexing)
+        while tz_offset < bytes.len() && bytes[tz_offset].is_ascii_digit() {
+            tz_offset += 1;
+        }
+    }
+
     if bytes.len() <= tz_offset {
         let offset = timezone.offset_from_local_datetime(&datetime);
         let offset = offset
@@ -983,6 +988,38 @@ mod tests {
         )
     }
 
+    #[test]
+    fn string_to_timestamp_chrono() {
+        let cases = [
+            "2020-09-08T13:42:29Z",
+            "1969-01-01T00:00:00.1Z",
+            "2020-09-08T12:00:12.12345678+00:00",
+            "2020-09-08T12:00:12+00:00",
+            "2020-09-08T12:00:12.1+00:00",
+            "2020-09-08T12:00:12.12+00:00",
+            "2020-09-08T12:00:12.123+00:00",
+            "2020-09-08T12:00:12.1234+00:00",
+            "2020-09-08T12:00:12.12345+00:00",
+            "2020-09-08T12:00:12.123456+00:00",
+            "2020-09-08T12:00:12.1234567+00:00",
+            "2020-09-08T12:00:12.12345678+00:00",
+            "2020-09-08T12:00:12.123456789+00:00",
+            "2020-09-08T12:00:12.12345678912z",
+            "2020-09-08T12:00:12.123456789123Z",
+            "2020-09-08T12:00:12.123456789123+02:00",
+            "2020-09-08T12:00:12.12345678912345Z",
+            "2020-09-08T12:00:12.1234567891234567+02:00",
+        ];
+
+        for case in cases { // each case must match chrono's RFC 3339 result
+            let chrono = DateTime::parse_from_rfc3339(case).unwrap();
+            let chrono_utc = chrono.with_timezone(&Utc);
+
+            let custom = string_to_datetime(&Utc, case).unwrap();
+            assert_eq!(chrono_utc, custom)
+        }
+    }
+
     #[test]
     fn string_to_timestamp_invalid() {
         // Test parsing invalid formats
@@ -1002,11 +1039,10 @@ mod tests {
             ("2015-01-20T25:35:20-08:00", "error parsing time"),
             ("1997-01-10T09:61:56.123Z", "error parsing time"),
             ("1997-01-10T09:61:90.123Z", "error parsing time"),
-            ("1997-01-10T12:00:56.12Z", "error parsing time"),
-            ("1997-01-10T12:00:56.1234Z", "error parsing time"),
-            ("1997-01-10T12:00:56.12345Z", "error parsing time"),
             ("1997-01-10T12:00:6.123Z", "error parsing time"),
             ("1997-01-31T092656.123Z", "error parsing time"),
+            ("1997-01-10T12:00:06.", "error parsing time"),
+            ("1997-01-10T12:00:06. ", "error parsing time"),
         ];
 
         for (s, ctx) in cases {

Reply via email to