This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new feeee04601 fix: unambiguously truncate time in date_trunc function 
(#9068)
feeee04601 is described below

commit feeee046010ac53e1f6baf6aa1a696a26c7c74d7
Author: Martin Hilton <[email protected]>
AuthorDate: Wed Jan 31 11:01:26 2024 +0000

    fix: unambiguously truncate time in date_trunc function (#9068)
    
    * fix: unambiguously truncate time in date_trunc function
    
    When date_trunc is truncating a timestamp with a geographic
    timezone it would previously get stuck if the local reprentation of
    the time could be ambiguously interpretted. This happens when the
    clocks "go back". The update here is to use the original timestamp
    offset as the tie-breaker when the local representation of the
    truncated time could be ambiguous.
    
    * fix: fmt
    
    * chore: review suggestion
    
    * fix: fmt
    
    * Add date_trunc test to slt
    
    * Update test for fixed fode
    
    * feat: additional tests
    
    Add test for the historical America/Sao_Paulo timezone which changed
    in and out of DST at midnight.
    
    * chore: review suggestion
    
    * fix: date_trunc support days starting at 1am
    
    Historically Sao Paulo, and possibly other places, have had daylight
    savings time that started at midnight. This causes the day to start
    at 1am. The naive method used by date_trunc to truncate to 'day' will
    create a non-existent time in these circumstances. Adjust the
    timestamps produced by date_trunc in this case to be valid within
    the required timezone.
    
    * fix: fmt
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 .../physical-expr/src/datetime_expressions.rs      | 236 ++++++++++++++++++++-
 datafusion/sqllogictest/test_files/timestamps.slt  | 167 +++++++++++++++
 2 files changed, 400 insertions(+), 3 deletions(-)

diff --git a/datafusion/physical-expr/src/datetime_expressions.rs 
b/datafusion/physical-expr/src/datetime_expressions.rs
index 430220faf8..c40a89b0ba 100644
--- a/datafusion/physical-expr/src/datetime_expressions.rs
+++ b/datafusion/physical-expr/src/datetime_expressions.rs
@@ -17,6 +17,7 @@
 
 //! DateTime expressions
 
+use std::ops::{Add, Sub};
 use std::str::FromStr;
 use std::sync::Arc;
 
@@ -43,7 +44,7 @@ use arrow_array::types::{ArrowTimestampType, Date32Type, 
Int32Type};
 use arrow_array::GenericStringArray;
 use chrono::prelude::*;
 use chrono::LocalResult::Single;
-use chrono::{Duration, Months, NaiveDate};
+use chrono::{Duration, LocalResult, Months, NaiveDate};
 use itertools::Either;
 
 use datafusion_common::cast::{
@@ -662,8 +663,42 @@ fn _date_trunc_coarse_with_tz(
     granularity: &str,
     value: Option<DateTime<Tz>>,
 ) -> Result<Option<i64>> {
-    let value = _date_trunc_coarse::<DateTime<Tz>>(granularity, value)?;
-    Ok(value.and_then(|value| value.timestamp_nanos_opt()))
+    if let Some(value) = value {
+        let local = value.naive_local();
+        let truncated = _date_trunc_coarse::<NaiveDateTime>(granularity, 
Some(local))?;
+        let truncated = truncated.and_then(|truncated| {
+            match truncated.and_local_timezone(value.timezone()) {
+                LocalResult::None => {
+                    // This can happen if the date_trunc operation moves the 
time into
+                    // an hour that doesn't exist due to daylight savings. On 
known example where
+                    // this can happen is with historic dates in the 
America/Sao_Paulo time zone.
+                    // To account for this adjust the time by a few hours, 
convert to local time,
+                    // and then adjust the time back.
+                    truncated
+                        .sub(Duration::hours(3))
+                        .and_local_timezone(value.timezone())
+                        .single()
+                        .map(|v| v.add(Duration::hours(3)))
+                }
+                LocalResult::Single(datetime) => Some(datetime),
+                LocalResult::Ambiguous(datetime1, datetime2) => {
+                    // Because we are truncating from an equally or more 
specific time
+                    // the original time must have been within the ambiguous 
local time
+                    // period. Therefore the offset of one of these times 
should match the
+                    // offset of the original time.
+                    if datetime1.offset().fix() == value.offset().fix() {
+                        Some(datetime1)
+                    } else {
+                        Some(datetime2)
+                    }
+                }
+            }
+        });
+        Ok(truncated.and_then(|value| value.timestamp_nanos_opt()))
+    } else {
+        _date_trunc_coarse::<NaiveDateTime>(granularity, None)?;
+        Ok(None)
+    }
 }
 
 fn _date_trunc_coarse_without_tz(
@@ -1784,6 +1819,44 @@ mod tests {
                     "2020-09-08T00:00:00+08",
                 ],
             ),
+            (
+                vec![
+                    "2024-10-26T23:00:00Z",
+                    "2024-10-27T00:00:00Z",
+                    "2024-10-27T01:00:00Z",
+                    "2024-10-27T02:00:00Z",
+                ],
+                Some("Europe/Berlin".into()),
+                vec![
+                    "2024-10-27T00:00:00+02",
+                    "2024-10-27T00:00:00+02",
+                    "2024-10-27T00:00:00+02",
+                    "2024-10-27T00:00:00+02",
+                ],
+            ),
+            (
+                vec![
+                    "2018-02-18T00:00:00Z",
+                    "2018-02-18T01:00:00Z",
+                    "2018-02-18T02:00:00Z",
+                    "2018-02-18T03:00:00Z",
+                    "2018-11-04T01:00:00Z",
+                    "2018-11-04T02:00:00Z",
+                    "2018-11-04T03:00:00Z",
+                    "2018-11-04T04:00:00Z",
+                ],
+                Some("America/Sao_Paulo".into()),
+                vec![
+                    "2018-02-17T00:00:00-02",
+                    "2018-02-17T00:00:00-02",
+                    "2018-02-17T00:00:00-02",
+                    "2018-02-18T00:00:00-03",
+                    "2018-11-03T00:00:00-03",
+                    "2018-11-03T00:00:00-03",
+                    "2018-11-04T01:00:00-02",
+                    "2018-11-04T01:00:00-02",
+                ],
+            ),
         ];
 
         cases.iter().for_each(|(original, tz_opt, expected)| {
@@ -1815,6 +1888,163 @@ mod tests {
         });
     }
 
+    #[test]
+    fn test_date_trunc_hour_timezones() {
+        let cases = vec![
+            (
+                vec![
+                    "2020-09-08T00:30:00Z",
+                    "2020-09-08T01:30:00Z",
+                    "2020-09-08T02:30:00Z",
+                    "2020-09-08T03:30:00Z",
+                    "2020-09-08T04:30:00Z",
+                ],
+                Some("+00".into()),
+                vec![
+                    "2020-09-08T00:00:00Z",
+                    "2020-09-08T01:00:00Z",
+                    "2020-09-08T02:00:00Z",
+                    "2020-09-08T03:00:00Z",
+                    "2020-09-08T04:00:00Z",
+                ],
+            ),
+            (
+                vec![
+                    "2020-09-08T00:30:00Z",
+                    "2020-09-08T01:30:00Z",
+                    "2020-09-08T02:30:00Z",
+                    "2020-09-08T03:30:00Z",
+                    "2020-09-08T04:30:00Z",
+                ],
+                None,
+                vec![
+                    "2020-09-08T00:00:00Z",
+                    "2020-09-08T01:00:00Z",
+                    "2020-09-08T02:00:00Z",
+                    "2020-09-08T03:00:00Z",
+                    "2020-09-08T04:00:00Z",
+                ],
+            ),
+            (
+                vec![
+                    "2020-09-08T00:30:00Z",
+                    "2020-09-08T01:30:00Z",
+                    "2020-09-08T02:30:00Z",
+                    "2020-09-08T03:30:00Z",
+                    "2020-09-08T04:30:00Z",
+                ],
+                Some("-02".into()),
+                vec![
+                    "2020-09-08T00:00:00Z",
+                    "2020-09-08T01:00:00Z",
+                    "2020-09-08T02:00:00Z",
+                    "2020-09-08T03:00:00Z",
+                    "2020-09-08T04:00:00Z",
+                ],
+            ),
+            (
+                vec![
+                    "2020-09-08T00:30:00+05",
+                    "2020-09-08T01:30:00+05",
+                    "2020-09-08T02:30:00+05",
+                    "2020-09-08T03:30:00+05",
+                    "2020-09-08T04:30:00+05",
+                ],
+                Some("+05".into()),
+                vec![
+                    "2020-09-08T00:00:00+05",
+                    "2020-09-08T01:00:00+05",
+                    "2020-09-08T02:00:00+05",
+                    "2020-09-08T03:00:00+05",
+                    "2020-09-08T04:00:00+05",
+                ],
+            ),
+            (
+                vec![
+                    "2020-09-08T00:30:00+08",
+                    "2020-09-08T01:30:00+08",
+                    "2020-09-08T02:30:00+08",
+                    "2020-09-08T03:30:00+08",
+                    "2020-09-08T04:30:00+08",
+                ],
+                Some("+08".into()),
+                vec![
+                    "2020-09-08T00:00:00+08",
+                    "2020-09-08T01:00:00+08",
+                    "2020-09-08T02:00:00+08",
+                    "2020-09-08T03:00:00+08",
+                    "2020-09-08T04:00:00+08",
+                ],
+            ),
+            (
+                vec![
+                    "2024-10-26T23:30:00Z",
+                    "2024-10-27T00:30:00Z",
+                    "2024-10-27T01:30:00Z",
+                    "2024-10-27T02:30:00Z",
+                ],
+                Some("Europe/Berlin".into()),
+                vec![
+                    "2024-10-27T01:00:00+02",
+                    "2024-10-27T02:00:00+02",
+                    "2024-10-27T02:00:00+01",
+                    "2024-10-27T03:00:00+01",
+                ],
+            ),
+            (
+                vec![
+                    "2018-02-18T00:30:00Z",
+                    "2018-02-18T01:30:00Z",
+                    "2018-02-18T02:30:00Z",
+                    "2018-02-18T03:30:00Z",
+                    "2018-11-04T01:00:00Z",
+                    "2018-11-04T02:00:00Z",
+                    "2018-11-04T03:00:00Z",
+                    "2018-11-04T04:00:00Z",
+                ],
+                Some("America/Sao_Paulo".into()),
+                vec![
+                    "2018-02-17T22:00:00-02",
+                    "2018-02-17T23:00:00-02",
+                    "2018-02-17T23:00:00-03",
+                    "2018-02-18T00:00:00-03",
+                    "2018-11-03T22:00:00-03",
+                    "2018-11-03T23:00:00-03",
+                    "2018-11-04T01:00:00-02",
+                    "2018-11-04T02:00:00-02",
+                ],
+            ),
+        ];
+
+        cases.iter().for_each(|(original, tz_opt, expected)| {
+            let input = original
+                .iter()
+                .map(|s| Some(string_to_timestamp_nanos(s).unwrap()))
+                .collect::<TimestampNanosecondArray>()
+                .with_timezone_opt(tz_opt.clone());
+            let right = expected
+                .iter()
+                .map(|s| Some(string_to_timestamp_nanos(s).unwrap()))
+                .collect::<TimestampNanosecondArray>()
+                .with_timezone_opt(tz_opt.clone());
+            let result = date_trunc(&[
+                ColumnarValue::Scalar(ScalarValue::from("hour")),
+                ColumnarValue::Array(Arc::new(input)),
+            ])
+            .unwrap();
+            if let ColumnarValue::Array(result) = result {
+                assert_eq!(
+                    result.data_type(),
+                    &DataType::Timestamp(TimeUnit::Nanosecond, tz_opt.clone())
+                );
+                let left = 
as_primitive_array::<TimestampNanosecondType>(&result);
+                assert_eq!(left, &right);
+            } else {
+                panic!("unexpected column type");
+            }
+        });
+    }
+
     #[test]
     fn test_date_bin_single() {
         use chrono::Duration;
diff --git a/datafusion/sqllogictest/test_files/timestamps.slt 
b/datafusion/sqllogictest/test_files/timestamps.slt
index 8f565453a1..980545e922 100644
--- a/datafusion/sqllogictest/test_files/timestamps.slt
+++ b/datafusion/sqllogictest/test_files/timestamps.slt
@@ -1262,6 +1262,173 @@ SELECT DATE_TRUNC('day', arrow_cast(TIMESTAMP 
'2023-08-03 14:38:50Z', 'Timestamp
 2023-08-03T14:38:50
 2023-08-03T14:38:50
 
+# date_trunc with data with timezones
+statement ok
+CREATE TABLE timestamp_strings(ts varchar)
+AS VALUES
+('2024-10-27 00:00:00'),
+('2024-10-27 00:30:00'),
+('2024-10-27 01:30:00'),
+('2024-10-27 02:00:00'), -- Daylight Savings Time happens here in Berlin
+('2024-10-27 02:30:00'),
+('2024-10-27 03:00:00'),
+('2024-10-27 03:30:00')
+;
+
+statement ok
+create view timestamp_utc as
+select
+  arrow_cast(ts, 'Timestamp(Nanosecond, Some("UTC"))') as ts
+from timestamp_strings;
+
+statement ok
+create view timestamp_berlin as
+select
+  arrow_cast(ts, 'Timestamp(Nanosecond, Some("Europe/Berlin"))') as ts
+from timestamp_utc; -- have to convert to utc prior to converting to berlin
+
+query PT
+select ts, arrow_typeof(ts) from timestamp_utc order by ts;
+----
+2024-10-27T00:00:00Z Timestamp(Nanosecond, Some("UTC"))
+2024-10-27T00:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2024-10-27T01:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2024-10-27T02:00:00Z Timestamp(Nanosecond, Some("UTC"))
+2024-10-27T02:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2024-10-27T03:00:00Z Timestamp(Nanosecond, Some("UTC"))
+2024-10-27T03:30:00Z Timestamp(Nanosecond, Some("UTC"))
+
+query PT
+select ts, arrow_typeof(ts) from timestamp_berlin order by ts;
+----
+2024-10-27T02:00:00+02:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+2024-10-27T02:30:00+02:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+2024-10-27T02:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+2024-10-27T03:00:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+2024-10-27T03:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+2024-10-27T04:00:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+2024-10-27T04:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+
+#  date trunc in utc with DST
+query PPPP
+select ts, date_trunc('month', ts), date_trunc('day', ts), date_trunc('hour', 
ts)
+from timestamp_utc order by ts;
+----
+2024-10-27T00:00:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 
2024-10-27T00:00:00Z
+2024-10-27T00:30:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 
2024-10-27T00:00:00Z
+2024-10-27T01:30:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 
2024-10-27T01:00:00Z
+2024-10-27T02:00:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 
2024-10-27T02:00:00Z
+2024-10-27T02:30:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 
2024-10-27T02:00:00Z
+2024-10-27T03:00:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 
2024-10-27T03:00:00Z
+2024-10-27T03:30:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z 
2024-10-27T03:00:00Z
+
+
+# date trunc in a timezone with DST across DST boundary (note the date-trunc 
hour value repeats)
+# Test for https://github.com/apache/arrow-datafusion/issues/8899
+query PPPP
+select ts, date_trunc('month', ts), date_trunc('day', ts), date_trunc('hour', 
ts)
+from timestamp_berlin order by ts;
+----
+2024-10-27T02:00:00+02:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 
2024-10-27T02:00:00+02:00
+2024-10-27T02:30:00+02:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 
2024-10-27T02:00:00+02:00
+2024-10-27T02:30:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 
2024-10-27T02:00:00+01:00
+2024-10-27T03:00:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 
2024-10-27T03:00:00+01:00
+2024-10-27T03:30:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 
2024-10-27T03:00:00+01:00
+2024-10-27T04:00:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 
2024-10-27T04:00:00+01:00
+2024-10-27T04:30:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00 
2024-10-27T04:00:00+01:00
+
+statement ok
+drop table timestamp_strings;
+
+statement ok
+drop view timestamp_utc;
+
+statement ok
+drop view timestamp_berlin;
+
+# date_trunc with data with timezones where transition happens at midnight
+statement ok
+CREATE TABLE timestamp_strings(ts varchar)
+AS VALUES
+('2018-11-04 01:00:00'),
+('2018-11-04 01:30:00'),
+('2018-11-04 02:30:00'),
+('2018-11-04 03:00:00'), -- Daylight Savings Time started here in Sao Paulo
+('2018-11-04 03:30:00'),
+('2018-11-04 04:00:00'),
+('2018-11-04 04:30:00')
+;
+
+statement ok
+create view timestamp_utc as
+select
+  arrow_cast(ts, 'Timestamp(Nanosecond, Some("UTC"))') as ts
+from timestamp_strings;
+
+statement ok
+create view timestamp_sao_paulo as
+select
+  arrow_cast(ts, 'Timestamp(Nanosecond, Some("America/Sao_Paulo"))') as ts
+from timestamp_utc; -- have to convert to utc prior to converting to Sau Paulo
+
+query PT
+select ts, arrow_typeof(ts) from timestamp_utc order by ts;
+----
+2018-11-04T01:00:00Z Timestamp(Nanosecond, Some("UTC"))
+2018-11-04T01:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2018-11-04T02:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2018-11-04T03:00:00Z Timestamp(Nanosecond, Some("UTC"))
+2018-11-04T03:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2018-11-04T04:00:00Z Timestamp(Nanosecond, Some("UTC"))
+2018-11-04T04:30:00Z Timestamp(Nanosecond, Some("UTC"))
+
+query PT
+select ts, arrow_typeof(ts) from timestamp_sao_paulo order by ts;
+----
+2018-11-03T22:00:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+2018-11-03T22:30:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+2018-11-03T23:30:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+2018-11-04T01:00:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+2018-11-04T01:30:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+2018-11-04T02:00:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+2018-11-04T02:30:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+
+#  date trunc in utc with DST
+query PPPP
+select ts, date_trunc('month', ts), date_trunc('day', ts), date_trunc('hour', 
ts)
+from timestamp_utc order by ts;
+----
+2018-11-04T01:00:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 
2018-11-04T01:00:00Z
+2018-11-04T01:30:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 
2018-11-04T01:00:00Z
+2018-11-04T02:30:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 
2018-11-04T02:00:00Z
+2018-11-04T03:00:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 
2018-11-04T03:00:00Z
+2018-11-04T03:30:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 
2018-11-04T03:00:00Z
+2018-11-04T04:00:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 
2018-11-04T04:00:00Z
+2018-11-04T04:30:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z 
2018-11-04T04:00:00Z
+
+
+# date trunc in a timezone with DST across DST boundary (note there is no 
midnight on 2018-11-04)
+# Test for https://github.com/apache/arrow-datafusion/issues/8899
+query PPPP
+select ts, date_trunc('month', ts), date_trunc('day', ts), date_trunc('hour', 
ts)
+from timestamp_sao_paulo order by ts;
+----
+2018-11-03T22:00:00-03:00 2018-11-01T00:00:00-03:00 2018-11-03T00:00:00-03:00 
2018-11-03T22:00:00-03:00
+2018-11-03T22:30:00-03:00 2018-11-01T00:00:00-03:00 2018-11-03T00:00:00-03:00 
2018-11-03T22:00:00-03:00
+2018-11-03T23:30:00-03:00 2018-11-01T00:00:00-03:00 2018-11-03T00:00:00-03:00 
2018-11-03T23:00:00-03:00
+2018-11-04T01:00:00-02:00 2018-11-01T00:00:00-03:00 2018-11-04T01:00:00-02:00 
2018-11-04T01:00:00-02:00
+2018-11-04T01:30:00-02:00 2018-11-01T00:00:00-03:00 2018-11-04T01:00:00-02:00 
2018-11-04T01:00:00-02:00
+2018-11-04T02:00:00-02:00 2018-11-01T00:00:00-03:00 2018-11-04T01:00:00-02:00 
2018-11-04T02:00:00-02:00
+2018-11-04T02:30:00-02:00 2018-11-01T00:00:00-03:00 2018-11-04T01:00:00-02:00 
2018-11-04T02:00:00-02:00
+
+statement ok
+drop table timestamp_strings;
+
+statement ok
+drop view timestamp_utc;
+
+statement ok
+drop view timestamp_sao_paulo;
 
 # Demonstrate that strings are automatically coerced to timestamps (don't use 
TIMESTAMP)
 

Reply via email to