This is an automated email from the ASF dual-hosted git repository.
parthc pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new cfd8a1c0a fix: add timezone and special formats support for cast
string to timestamp (#3730)
cfd8a1c0a is described below
commit cfd8a1c0afa9156aacee85b92e3934a7f0bf4af2
Author: Parth Chandra <[email protected]>
AuthorDate: Wed Mar 25 13:14:44 2026 -0700
fix: add timezone and special formats support for cast string to timestamp
(#3730)
* fix: add timezone and special formats support for cast string to timestamp
* improve splitting of date_part
---
native/spark-expr/src/conversion_funcs/string.rs | 295 ++++++++++++++++-----
.../scala/org/apache/comet/CometCastSuite.scala | 26 +-
2 files changed, 249 insertions(+), 72 deletions(-)
diff --git a/native/spark-expr/src/conversion_funcs/string.rs
b/native/spark-expr/src/conversion_funcs/string.rs
index cd1c643be..cdff90a4e 100644
--- a/native/spark-expr/src/conversion_funcs/string.rs
+++ b/native/spark-expr/src/conversion_funcs/string.rs
@@ -34,9 +34,11 @@ use std::str::FromStr;
use std::sync::{Arc, LazyLock};
macro_rules! cast_utf8_to_timestamp {
- ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident,
$tz:expr) => {{
+ // $tz is a timezone::Tz value containing the session timezone.
+ // $to_tz_str is a string containing the to_type timezone
+ ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident,
$tz:expr, $to_tz_str:expr) => {{
let len = $array.len();
- let mut cast_array =
PrimitiveArray::<$array_type>::builder(len).with_timezone("UTC");
+ let mut cast_array =
PrimitiveArray::<$array_type>::builder(len).with_timezone($to_tz_str);
let mut cast_err: Option<SparkError> = None;
for i in 0..len {
if $array.is_null(i) {
@@ -675,16 +677,21 @@ pub(crate) fn cast_string_to_timestamp(
.downcast_ref::<GenericStringArray<i32>>()
.expect("Expected a string array");
- let tz = &timezone::Tz::from_str(timezone_str).unwrap();
+ let tz = &timezone::Tz::from_str(timezone_str)
+ .map_err(|_| SparkError::Internal(format!("Invalid timezone string:
{timezone_str}")))?;
let cast_array: ArrayRef = match to_type {
- DataType::Timestamp(_, _) => cast_utf8_to_timestamp!(
- string_array,
- eval_mode,
- TimestampMicrosecondType,
- timestamp_parser,
- tz
- )?,
+ DataType::Timestamp(_, tz_opt) => {
+ let to_tz = tz_opt.as_deref().unwrap_or("UTC");
+ cast_utf8_to_timestamp!(
+ string_array,
+ eval_mode,
+ TimestampMicrosecondType,
+ timestamp_parser,
+ tz,
+ to_tz
+ )?
+ }
_ => unreachable!("Invalid data type {:?} in cast from string",
to_type),
};
Ok(cast_array)
@@ -967,20 +974,31 @@ fn get_timestamp_values<T: TimeZone>(
timestamp_type: &str,
tz: &T,
) -> SparkResult<Option<i64>> {
- let values: Vec<_> = value.split(['T', '-', ':', '.']).collect();
- let year = values[0].parse::<i32>().unwrap_or_default();
+ // Handle negative year: strip leading '-' and remember the sign.
+ let (sign, date_part) = if let Some(stripped) = value.strip_prefix('-') {
+ (-1i32, stripped)
+ } else {
+ (1i32, value)
+ };
+ let mut parts = date_part.split(['T', ' ', '-', ':', '.']);
+ let year = sign
+ * parts
+ .next()
+ .unwrap_or("")
+ .parse::<i32>()
+ .unwrap_or_default();
// NaiveDate (used internally by chrono's with_ymd_and_hms) is bounded to
±262142.
if !(-262143..=262142).contains(&year) {
return Ok(None);
}
- let month = values.get(1).map_or(1, |m| m.parse::<u32>().unwrap_or(1));
- let day = values.get(2).map_or(1, |d| d.parse::<u32>().unwrap_or(1));
- let hour = values.get(3).map_or(0, |h| h.parse::<u32>().unwrap_or(0));
- let minute = values.get(4).map_or(0, |m| m.parse::<u32>().unwrap_or(0));
- let second = values.get(5).map_or(0, |s| s.parse::<u32>().unwrap_or(0));
- let microsecond = values.get(6).map_or(0, |ms|
ms.parse::<u32>().unwrap_or(0));
+ let month = parts.next().map_or(1, |m| m.parse::<u32>().unwrap_or(1));
+ let day = parts.next().map_or(1, |d| d.parse::<u32>().unwrap_or(1));
+ let hour = parts.next().map_or(0, |h| h.parse::<u32>().unwrap_or(0));
+ let minute = parts.next().map_or(0, |m| m.parse::<u32>().unwrap_or(0));
+ let second = parts.next().map_or(0, |s| s.parse::<u32>().unwrap_or(0));
+ let microsecond = parts.next().map_or(0, |ms|
ms.parse::<u32>().unwrap_or(0));
let mut timestamp_info = TimeStampInfo::default();
@@ -1041,28 +1059,19 @@ fn parse_timestamp_to_micros<T: TimeZone>(
timestamp_info.second,
);
- // Check if datetime is not None
- let tz_datetime = match datetime.single() {
+ // Spark uses the offset before the daylight saving change, so we need to use
earliest().
+ // Return None for LocalResult::None (which is the invalid time in a DST
spring-forward gap).
+ let tz_datetime = match datetime.earliest() {
Some(dt) => dt
.with_timezone(tz)
.with_nanosecond(timestamp_info.microsecond * 1000),
- None => {
- return Err(SparkError::Internal(
- "Failed to parse timestamp".to_string(),
- ));
- }
- };
-
- let result = match tz_datetime {
- Some(dt) => dt.timestamp_micros(),
- None => {
- return Err(SparkError::Internal(
- "Failed to parse timestamp".to_string(),
- ));
- }
+ None => return Ok(None),
};
- Ok(Some(result))
+ match tz_datetime {
+ Some(dt) => Ok(Some(dt.timestamp_micros())),
+ None => Ok(None),
+ }
}
fn parse_str_to_year_timestamp<T: TimeZone>(value: &str, tz: &T) ->
SparkResult<Option<i64>> {
@@ -1096,21 +1105,6 @@ fn parse_str_to_microsecond_timestamp<T: TimeZone>(
get_timestamp_values(value, "microsecond", tz)
}
-type TimestampPattern<T> = (&'static Regex, fn(&str, &T) ->
SparkResult<Option<i64>>);
-
-static RE_YEAR: LazyLock<Regex> = LazyLock::new(||
Regex::new(r"^\d{4,7}$").unwrap());
-static RE_MONTH: LazyLock<Regex> = LazyLock::new(||
Regex::new(r"^\d{4,7}-\d{2}$").unwrap());
-static RE_DAY: LazyLock<Regex> = LazyLock::new(||
Regex::new(r"^\d{4,7}-\d{2}-\d{2}$").unwrap());
-static RE_HOUR: LazyLock<Regex> =
- LazyLock::new(|| Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{1,2}$").unwrap());
-static RE_MINUTE: LazyLock<Regex> =
- LazyLock::new(||
Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}$").unwrap());
-static RE_SECOND: LazyLock<Regex> =
- LazyLock::new(||
Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$").unwrap());
-static RE_MICROSECOND: LazyLock<Regex> =
- LazyLock::new(||
Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap());
-static RE_TIME_ONLY: LazyLock<Regex> = LazyLock::new(||
Regex::new(r"^T\d{1,2}$").unwrap());
-
fn timestamp_parser<T: TimeZone>(
value: &str,
eval_mode: EvalMode,
@@ -1120,15 +1114,103 @@ fn timestamp_parser<T: TimeZone>(
if value.is_empty() {
return Ok(None);
}
- let patterns: &[TimestampPattern<T>] = &[
- (&RE_YEAR, parse_str_to_year_timestamp),
+
+ // Handle Z or ±HH:MM offset suffix: strip it and parse with the explicit
fixed offset.
+ if let Some((stripped, offset_secs)) = extract_offset_suffix(value) {
+ let fixed_tz = chrono::FixedOffset::east_opt(offset_secs)
+ .ok_or_else(|| SparkError::Internal("Invalid timezone
offset".to_string()))?;
+ return timestamp_parser_with_tz(stripped, eval_mode, &fixed_tz);
+ }
+
+ timestamp_parser_with_tz(value, eval_mode, tz)
+}
+
+/// If `value` ends with a UTC offset suffix (`Z`, `+HH:MM`, or `-HH:MM`),
returns the
+/// stripped string and the offset in seconds. Returns `None` if no offset
suffix is present.
+fn extract_offset_suffix(value: &str) -> Option<(&str, i32)> {
+ if let Some(stripped) = value.strip_suffix('Z') {
+ return Some((stripped, 0));
+ }
+ // Check for ±HH:MM at the end (exactly 6 chars: sign + 2 digits + ':' + 2
digits)
+ if value.len() >= 6 {
+ let suffix_start = value.len() - 6;
+ let suffix = &value[suffix_start..];
+ let sign_byte = suffix.as_bytes()[0];
+ if (sign_byte == b'+' || sign_byte == b'-') && suffix.as_bytes()[3] ==
b':' {
+ if let (Ok(h), Ok(m)) = (suffix[1..3].parse::<i32>(),
suffix[4..6].parse::<i32>()) {
+ let sign = if sign_byte == b'+' { 1i32 } else { -1i32 };
+ return Some((&value[..suffix_start], sign * (h * 3600 + m *
60)));
+ }
+ }
+ }
+ None
+}
+
+type TimestampParsePattern<T> = (&'static Regex, fn(&str, &T) ->
SparkResult<Option<i64>>);
+
+static RE_YEAR: LazyLock<Regex> = LazyLock::new(||
Regex::new(r"^-?\d{4,7}$").unwrap());
+static RE_MONTH: LazyLock<Regex> = LazyLock::new(||
Regex::new(r"^-?\d{4,7}-\d{2}$").unwrap());
+static RE_DAY: LazyLock<Regex> = LazyLock::new(||
Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}$").unwrap());
+static RE_HOUR: LazyLock<Regex> =
+ LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T
]\d{1,2}$").unwrap());
+static RE_MINUTE: LazyLock<Regex> =
+ LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T
]\d{2}:\d{2}$").unwrap());
+static RE_SECOND: LazyLock<Regex> =
+ LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T
]\d{2}:\d{2}:\d{2}$").unwrap());
+static RE_MICROSECOND: LazyLock<Regex> =
+ LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T
]\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap());
+static RE_TIME_ONLY_H: LazyLock<Regex> = LazyLock::new(||
Regex::new(r"^T\d{1,2}$").unwrap());
+static RE_TIME_ONLY_HM: LazyLock<Regex> =
+ LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{2}$").unwrap());
+static RE_TIME_ONLY_HMS: LazyLock<Regex> =
+ LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{2}:\d{2}$").unwrap());
+static RE_TIME_ONLY_HMSU: LazyLock<Regex> =
+ LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{2}:\d{2}\.\d{1,6}$").unwrap());
+static RE_BARE_HM: LazyLock<Regex> = LazyLock::new(||
Regex::new(r"^\d{1,2}:\d{2}$").unwrap());
+static RE_BARE_HMS: LazyLock<Regex> =
+ LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{2}:\d{2}$").unwrap());
+static RE_BARE_HMSU: LazyLock<Regex> =
+ LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{2}:\d{2}\.\d{1,6}$").unwrap());
+
+fn timestamp_parser_with_tz<T: TimeZone>(
+ value: &str,
+ eval_mode: EvalMode,
+ tz: &T,
+) -> SparkResult<Option<i64>> {
+ // Both T-separator and space-separator date-time forms are supported.
+ // Negative years are handled by get_timestamp_values detecting a leading
'-'.
+ let patterns: &[TimestampParsePattern<T>] = &[
+ // Year only: 4-7 digits, optionally negative
+ (
+ &RE_YEAR,
+ parse_str_to_year_timestamp as fn(&str, &T) ->
SparkResult<Option<i64>>,
+ ),
+ // Year-month
(&RE_MONTH, parse_str_to_month_timestamp),
+ // Year-month-day
(&RE_DAY, parse_str_to_day_timestamp),
+ // Date T-or-space hour (1 or 2 digits)
(&RE_HOUR, parse_str_to_hour_timestamp),
+ // Date T-or-space hour:minute
(&RE_MINUTE, parse_str_to_minute_timestamp),
+ // Date T-or-space hour:minute:second
(&RE_SECOND, parse_str_to_second_timestamp),
+ // Date T-or-space hour:minute:second.fraction
(&RE_MICROSECOND, parse_str_to_microsecond_timestamp),
- (&RE_TIME_ONLY, parse_str_to_time_only_timestamp),
+ // Time-only: T hour (1 or 2 digits, no colon)
+ (&RE_TIME_ONLY_H, parse_str_to_time_only_timestamp),
+ // Time-only: T hour:minute
+ (&RE_TIME_ONLY_HM, parse_str_to_time_only_timestamp),
+ // Time-only: T hour:minute:second
+ (&RE_TIME_ONLY_HMS, parse_str_to_time_only_timestamp),
+ // Time-only: T hour:minute:second.fraction
+ (&RE_TIME_ONLY_HMSU, parse_str_to_time_only_timestamp),
+ // Bare time-only: hour:minute (without T prefix)
+ (&RE_BARE_HM, parse_str_to_time_only_timestamp),
+ // Bare time-only: hour:minute:second
+ (&RE_BARE_HMS, parse_str_to_time_only_timestamp),
+ // Bare time-only: hour:minute:second.fraction
+ (&RE_BARE_HMSU, parse_str_to_time_only_timestamp),
];
let mut timestamp = None;
@@ -1157,23 +1239,43 @@ fn timestamp_parser<T: TimeZone>(
}
fn parse_str_to_time_only_timestamp<T: TimeZone>(value: &str, tz: &T) ->
SparkResult<Option<i64>> {
- let values: Vec<&str> = value.split('T').collect();
- let time_values: Vec<u32> = values[1]
- .split(':')
- .map(|v| v.parse::<u32>().unwrap_or(0))
- .collect();
+ // The 'T' is optional in the time format; strip it if specified.
+ let time_part = value.strip_prefix('T').unwrap_or(value);
+
+ // Parse time components: hour[:minute[:second[.fraction]]]
+ // Use splitn(3) so "12:34:56.789" splits into ["12", "34", "56.789"].
+ let colon_parts: Vec<&str> = time_part.splitn(3, ':').collect();
+ let hour: u32 = colon_parts[0].parse().unwrap_or(0);
+ let minute: u32 = colon_parts.get(1).and_then(|s|
s.parse().ok()).unwrap_or(0);
+ let (second, nanosecond) = if let Some(sec_frac) = colon_parts.get(2) {
+ let dot_idx = sec_frac.find('.');
+ let sec: u32 = sec_frac[..dot_idx.unwrap_or(sec_frac.len())]
+ .parse()
+ .unwrap_or(0);
+ let ns: u32 = if let Some(dot) = dot_idx {
+ let frac = &sec_frac[dot + 1..];
+ // Interpret up to 6 digits as microseconds, padding with trailing
zeros.
+ let trimmed = &frac[..frac.len().min(6)];
+ let padded = format!("{:0<6}", trimmed);
+ padded.parse::<u32>().unwrap_or(0) * 1000
+ } else {
+ 0
+ };
+ (sec, ns)
+ } else {
+ (0, 0)
+ };
let datetime = tz.from_utc_datetime(&chrono::Utc::now().naive_utc());
- let timestamp = datetime
+ let result = datetime
.with_timezone(tz)
- .with_hour(time_values.first().copied().unwrap_or_default())
- .and_then(|dt| dt.with_minute(*time_values.get(1).unwrap_or(&0)))
- .and_then(|dt| dt.with_second(*time_values.get(2).unwrap_or(&0)))
- .and_then(|dt| dt.with_nanosecond(*time_values.get(3).unwrap_or(&0) *
1_000))
- .map(|dt| dt.timestamp_micros())
- .unwrap_or_default();
-
- Ok(Some(timestamp))
+ .with_hour(hour)
+ .and_then(|dt| dt.with_minute(minute))
+ .and_then(|dt| dt.with_second(second))
+ .and_then(|dt| dt.with_nanosecond(nanosecond))
+ .map(|dt| dt.timestamp_micros());
+
+ Ok(result)
}
//a string to date parser - port of spark's SparkDateTimeUtils#stringToDate.
@@ -1343,7 +1445,8 @@ mod tests {
eval_mode,
TimestampMicrosecondType,
timestamp_parser,
- tz
+ tz,
+ "UTC"
)
.unwrap();
@@ -1373,7 +1476,8 @@ mod tests {
eval_mode,
TimestampMicrosecondType,
timestamp_parser,
- tz
+ tz,
+ "UTC"
);
assert!(
result.is_err(),
@@ -1497,6 +1601,59 @@ mod tests {
timestamp_parser("10000-01-01T12:34:56.123456", EvalMode::Legacy,
tz).unwrap(),
Some(253402346096123456)
);
+ // Space separator (same values as T separator)
+ assert_eq!(
+ timestamp_parser("2020-01-01 12", EvalMode::Legacy, tz).unwrap(),
+ Some(1577880000000000)
+ );
+ assert_eq!(
+ timestamp_parser("2020-01-01 12:34", EvalMode::Legacy,
tz).unwrap(),
+ Some(1577882040000000)
+ );
+ assert_eq!(
+ timestamp_parser("2020-01-01 12:34:56", EvalMode::Legacy,
tz).unwrap(),
+ Some(1577882096000000)
+ );
+ assert_eq!(
+ timestamp_parser("2020-01-01 12:34:56.123456", EvalMode::Legacy,
tz).unwrap(),
+ Some(1577882096123456)
+ );
+ // Z suffix (UTC)
+ assert_eq!(
+ timestamp_parser("2020-01-01T12:34:56Z", EvalMode::Legacy,
tz).unwrap(),
+ Some(1577882096000000)
+ );
+ // Positive offset suffix
+ assert_eq!(
+ timestamp_parser("2020-01-01T12:34:56+05:30", EvalMode::Legacy,
tz).unwrap(),
+ Some(1577862296000000) // 12:34:56 UTC+5:30 = 07:04:56 UTC
+ );
+ // T-prefixed time-only with colon
+ assert!(timestamp_parser("T12:34", EvalMode::Legacy, tz)
+ .unwrap()
+ .is_some());
+ assert!(timestamp_parser("T12:34:56", EvalMode::Legacy, tz)
+ .unwrap()
+ .is_some());
+ assert!(timestamp_parser("T12:34:56.123456", EvalMode::Legacy, tz)
+ .unwrap()
+ .is_some());
+ // Bare time-only (hour:minute without T prefix)
+ assert!(timestamp_parser("12:34", EvalMode::Legacy, tz)
+ .unwrap()
+ .is_some());
+ assert!(timestamp_parser("12:34:56", EvalMode::Legacy, tz)
+ .unwrap()
+ .is_some());
+ // Negative year
+ assert!(timestamp_parser("-0001", EvalMode::Legacy, tz)
+ .unwrap()
+ .is_some());
+ assert!(
+ timestamp_parser("-0001-01-01T12:34:56", EvalMode::Legacy, tz)
+ .unwrap()
+ .is_some()
+ );
}
#[test]
diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
index f213e90e0..d1d00e339 100644
--- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
@@ -992,8 +992,10 @@ class CometCastSuite extends CometTestBase with
AdaptiveSparkPlanHelper {
}
ignore("cast StringType to TimestampType") {
- // TODO: enable once all Spark timestamp formats are supported natively.
- // Currently missing: time-only formats with colon (e.g. "T12:34", "4:4").
+ // TODO: enable once string→timestamp is marked Compatible in
CometCast.canCastFromString.
+ // All Spark timestamp formats are now supported natively (space
separator, Z/offset suffix,
+ // T-prefixed and bare H:M time-only, negative years). The fuzz filter
below can be removed
+ // when enabling the native path.
withSQLConf((SQLConf.SESSION_LOCAL_TIMEZONE.key, "UTC")) {
val values = Seq("2020-01-01T12:34:56.123456", "T2") ++
gen.generateStrings(
dataSize,
@@ -1045,7 +1047,25 @@ class CometCastSuite extends CometTestBase with
AdaptiveSparkPlanHelper {
"10000-01-01T12",
"10000-01-01T12:34",
"10000-01-01T12:34:56",
- "10000-01-01T12:34:56.123456")
+ "10000-01-01T12:34:56.123456",
+ // Space separator
+ "2020-01-01 12",
+ "2020-01-01 12:34",
+ "2020-01-01 12:34:56",
+ "2020-01-01 12:34:56.123456",
+ // Z and offset suffixes
+ "2020-01-01T12:34:56Z",
+ "2020-01-01T12:34:56+05:30",
+ "2020-01-01T12:34:56-08:00",
+ // T-prefixed time-only with colon
+ "T12:34",
+ "T12:34:56",
+ "T12:34:56.123456",
+ // Bare time-only (hour:minute)
+ "12:34",
+ "12:34:56",
+ // Negative year
+ "-0001-01-01T12:34:56")
castTimestampTest(values.toDF("a"), DataTypes.TimestampType)
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]