Weijun-H commented on code in PR #18025:
URL: https://github.com/apache/datafusion/pull/18025#discussion_r2506788699
##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -42,6 +47,508 @@ pub(crate) fn string_to_timestamp_nanos_shim(s: &str) -> Result<i64> {
string_to_timestamp_nanos(s).map_err(|e| e.into())
}
+#[derive(Clone, Copy, Debug)]
+enum ConfiguredZone {
+ Named(Tz),
+ Offset(FixedOffset),
+}
+
+#[derive(Clone)]
+pub(crate) struct ConfiguredTimeZone {
+ repr: Arc<str>,
+ zone: ConfiguredZone,
+}
+
+impl ConfiguredTimeZone {
+ pub(crate) fn utc() -> Self {
+ Self {
+ repr: Arc::from("+00:00"),
+ zone: ConfiguredZone::Offset(FixedOffset::east_opt(0).unwrap()),
+ }
+ }
+
+ pub(crate) fn parse(tz: &str) -> Result<Option<Self>> {
+ let tz = tz.trim();
+ if tz.is_empty() {
+ return Ok(None);
+ }
+
+ if let Ok(named) = Tz::from_str(tz) {
+ return Ok(Some(Self {
+ repr: Arc::from(tz),
+ zone: ConfiguredZone::Named(named),
+ }));
+ }
+
+ if let Some(offset) = parse_fixed_offset(tz) {
+ return Ok(Some(Self {
+ repr: Arc::from(tz),
+ zone: ConfiguredZone::Offset(offset),
+ }));
+ }
+
+ Err(exec_datafusion_err!(
+ "Invalid execution timezone '{tz}'. Please provide an IANA timezone name (e.g. 'America/New_York') or an offset in the form '+HH:MM'."
+ ))
+ }
+
+ pub(crate) fn from_config(config: &ConfigOptions) -> Self {
+ match Self::parse(config.execution.time_zone.as_deref().unwrap_or("")) {
+ Ok(Some(tz)) => tz,
+ _ => Self::utc(),
+ }
+ }
+
+ fn timestamp_from_naive(&self, naive: &NaiveDateTime) -> Result<i64> {
+ match self.zone {
+ ConfiguredZone::Named(tz) => {
+ local_datetime_to_timestamp(tz.from_local_datetime(naive), &self.repr)
+ }
+ ConfiguredZone::Offset(offset) => {
+ local_datetime_to_timestamp(offset.from_local_datetime(naive), &self.repr)
+ }
+ }
+ }
+
+ fn datetime_from_formatted(&self, s: &str, format: &str) -> Result<DateTime<Utc>> {
+ let datetime = match self.zone {
+ ConfiguredZone::Named(tz) => {
+ string_to_datetime_formatted(&tz, s, format)?.with_timezone(&Utc)
+ }
+ ConfiguredZone::Offset(offset) => {
+ string_to_datetime_formatted(&offset, s, format)?.with_timezone(&Utc)
+ }
+ };
+ Ok(datetime)
+ }
+}
+
+impl fmt::Debug for ConfiguredTimeZone {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ f.debug_struct("ConfiguredTimeZone")
+ .field("repr", &self.repr)
+ .finish()
+ }
+}
+
+impl PartialEq for ConfiguredTimeZone {
+ fn eq(&self, other: &Self) -> bool {
+ self.repr == other.repr
+ }
+}
+
+impl Eq for ConfiguredTimeZone {}
+
+impl Hash for ConfiguredTimeZone {
+ fn hash<H: Hasher>(&self, state: &mut H) {
+ self.repr.hash(state);
+ }
+}
+
+fn parse_fixed_offset(tz: &str) -> Option<FixedOffset> {
+ let tz = tz.trim();
+ if tz.eq_ignore_ascii_case("utc") || tz.eq_ignore_ascii_case("z") {
+ return FixedOffset::east_opt(0);
+ }
+
+ let (sign, rest) = if let Some(rest) = tz.strip_prefix('+') {
+ (1, rest)
+ } else if let Some(rest) = tz.strip_prefix('-') {
+ (-1, rest)
+ } else {
+ return None;
+ };
+
+ let (hours, minutes) = if let Some((hours, minutes)) = rest.split_once(':') {
+ (hours, minutes)
+ } else if rest.len() == 4 {
+ rest.split_at(2)
+ } else {
+ return None;
+ };
+
+ let hours: i32 = hours.parse().ok()?;
+ let minutes: i32 = minutes.parse().ok()?;
+ if hours > 23 || minutes > 59 {
+ return None;
+ }
+
+ let total_minutes = hours * 60 + minutes;
+ let total_seconds = sign * total_minutes * 60;
+ FixedOffset::east_opt(total_seconds)
+}
+
+/// Converts a local datetime result to a UTC timestamp in nanoseconds.
+///
+/// # DST Transition Behavior
+///
+/// This function handles daylight saving time (DST) transitions by returning an error
+/// when the local time is ambiguous or invalid:
+///
+/// ## Ambiguous Times (Fall Back)
+/// When clocks "fall back" (e.g., 2:00 AM becomes 1:00 AM), times in the repeated hour
+/// exist twice. For example, in America/New_York on 2024-11-03:
+/// - `2024-11-03 01:30:00` occurs both at UTC 05:30 (EDT) and UTC 06:30 (EST)
+///
+/// DataFusion returns an error rather than silently choosing one interpretation,
+/// ensuring users are aware of the ambiguity.
+///
+/// ## Invalid Times (Spring Forward)
+/// When clocks "spring forward" (e.g., 2:00 AM becomes 3:00 AM), times in the skipped hour
+/// don't exist. For example, in America/New_York on 2024-03-10:
+/// - `2024-03-10 02:30:00` never occurred (clocks jumped from 02:00 to 03:00)
+///
+/// DataFusion returns an error for these non-existent times.
+///
+/// ## Workarounds
+/// To avoid ambiguity errors:
+/// 1. Use timestamps with explicit timezone offsets (e.g., `2024-11-03 01:30:00-05:00`)
+/// 2. Convert to UTC before processing
+/// 3. Use a timezone without DST (e.g., UTC, `America/Phoenix`)
+fn local_datetime_to_timestamp<T: TimeZone>(
+ result: LocalResult<DateTime<T>>,
+ tz_repr: &str,
+) -> Result<i64> {
+ match result {
+ Single(dt) => datetime_to_timestamp(dt.with_timezone(&Utc)),
+ LocalResult::Ambiguous(dt1, dt2) => Err(exec_datafusion_err!(
+ "The local time '{:?}' is ambiguous in timezone '{tz_repr}' (also corresponds to '{:?}').",
+ dt1.naive_local(),
+ dt2.naive_local()
+ )),
+ LocalResult::None => Err(exec_datafusion_err!(
+ "The local time is invalid in timezone '{tz_repr}'."
+ )),
+ }
+}
+
+fn datetime_to_timestamp(datetime: DateTime<Utc>) -> Result<i64> {
+ datetime
+ .timestamp_nanos_opt()
+ .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))
+}
+
+fn timestamp_to_naive(value: i64) -> Result<NaiveDateTime> {
+ let secs = value.div_euclid(1_000_000_000);
+ let nanos = value.rem_euclid(1_000_000_000) as u32;
+ DateTime::<Utc>::from_timestamp(secs, nanos)
+ .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))
+ .map(|dt| dt.naive_utc())
+}
+
+/// Detects whether a timestamp string contains explicit timezone information.
+///
+/// This function performs a single-pass scan to check for:
+/// 1. RFC3339-compatible format (via Arrow's parser)
+/// 2. Timezone offset markers (e.g., `+05:00`, `-0800`, `+05`)
+/// 3. Trailing 'Z' or 'z' suffix (UTC indicator)
+/// 4. Named timezone identifiers (e.g., `UTC`, `America/New_York`)
+///
+/// # Performance Considerations
+/// This function is called for every string value during timestamp parsing.
+/// The implementation uses a single-pass byte-level scan for efficiency.
+///
+/// # Examples
+/// ```ignore
+/// assert!(has_explicit_timezone("2020-09-08T13:42:29Z"));
+/// assert!(has_explicit_timezone("2020-09-08T13:42:29+05:00"));
+/// assert!(has_explicit_timezone("2020-09-08T13:42:29 UTC"));
+/// assert!(!has_explicit_timezone("2020-09-08T13:42:29"));
+/// ```
+fn has_explicit_timezone(value: &str) -> bool {
+ // Fast path: try RFC3339 parsing first
+ if has_rfc3339_timezone(value) {
+ return true;
+ }
Review Comment:
Maybe we could skip the full parse for non-RFC3339 formats?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]