This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new feeee04601 fix: unambiguously truncate time in date_trunc function
(#9068)
feeee04601 is described below
commit feeee046010ac53e1f6baf6aa1a696a26c7c74d7
Author: Martin Hilton <[email protected]>
AuthorDate: Wed Jan 31 11:01:26 2024 +0000
fix: unambiguously truncate time in date_trunc function (#9068)
* fix: unambiguously truncate time in date_trunc function
When date_trunc is truncating a timestamp with a geographic
timezone it would previously get stuck if the local reprentation of
the time could be ambiguously interpretted. This happens when the
clocks "go back". The update here is to use the original timestamp
offset as the tie-breaker when the local representation of the
truncated time could be ambiguous.
* fix: fmt
* chore: review suggestion
* fix: fmt
* Add date_trunc test to slt
* Update test for fixed fode
* feat: additional tests
Add test for the historical America/Sao_Paulo timezone which changed
in and out of DST at midnight.
* chore: review suggestion
* fix: date_trunc support days starting at 1am
Historically Sao Paulo, and possibly other places, have had daylight
savings time that started at midnight. This causes the day to start
at 1am. The naive method used by date_trunc to truncate to 'day' will
create a non-existent time in these circumstances. Adjust the
timestamps produced by date_trunc in this case to be valid within
the required timezone.
* fix: fmt
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
.../physical-expr/src/datetime_expressions.rs | 236 ++++++++++++++++++++-
datafusion/sqllogictest/test_files/timestamps.slt | 167 +++++++++++++++
2 files changed, 400 insertions(+), 3 deletions(-)
diff --git a/datafusion/physical-expr/src/datetime_expressions.rs
b/datafusion/physical-expr/src/datetime_expressions.rs
index 430220faf8..c40a89b0ba 100644
--- a/datafusion/physical-expr/src/datetime_expressions.rs
+++ b/datafusion/physical-expr/src/datetime_expressions.rs
@@ -17,6 +17,7 @@
//! DateTime expressions
+use std::ops::{Add, Sub};
use std::str::FromStr;
use std::sync::Arc;
@@ -43,7 +44,7 @@ use arrow_array::types::{ArrowTimestampType, Date32Type,
Int32Type};
use arrow_array::GenericStringArray;
use chrono::prelude::*;
use chrono::LocalResult::Single;
-use chrono::{Duration, Months, NaiveDate};
+use chrono::{Duration, LocalResult, Months, NaiveDate};
use itertools::Either;
use datafusion_common::cast::{
@@ -662,8 +663,42 @@ fn _date_trunc_coarse_with_tz(
granularity: &str,
value: Option<DateTime<Tz>>,
) -> Result<Option<i64>> {
- let value = _date_trunc_coarse::<DateTime<Tz>>(granularity, value)?;
- Ok(value.and_then(|value| value.timestamp_nanos_opt()))
+ if let Some(value) = value {
+ let local = value.naive_local();
+ let truncated = _date_trunc_coarse::<NaiveDateTime>(granularity,
Some(local))?;
+ let truncated = truncated.and_then(|truncated| {
+ match truncated.and_local_timezone(value.timezone()) {
+ LocalResult::None => {
+ // This can happen if the date_trunc operation moves the
time into
+ // an hour that doesn't exist due to daylight savings. On
known example where
+ // this can happen is with historic dates in the
America/Sao_Paulo time zone.
+ // To account for this adjust the time by a few hours,
convert to local time,
+ // and then adjust the time back.
+ truncated
+ .sub(Duration::hours(3))
+ .and_local_timezone(value.timezone())
+ .single()
+ .map(|v| v.add(Duration::hours(3)))
+ }
+ LocalResult::Single(datetime) => Some(datetime),
+ LocalResult::Ambiguous(datetime1, datetime2) => {
+ // Because we are truncating from an equally or more
specific time
+ // the original time must have been within the ambiguous
local time
+ // period. Therefore the offset of one of these times
should match the
+ // offset of the original time.
+ if datetime1.offset().fix() == value.offset().fix() {
+ Some(datetime1)
+ } else {
+ Some(datetime2)
+ }
+ }
+ }
+ });
+ Ok(truncated.and_then(|value| value.timestamp_nanos_opt()))
+ } else {
+ _date_trunc_coarse::<NaiveDateTime>(granularity, None)?;
+ Ok(None)
+ }
}
fn _date_trunc_coarse_without_tz(
@@ -1784,6 +1819,44 @@ mod tests {
"2020-09-08T00:00:00+08",
],
),
+ (
+ vec![
+ "2024-10-26T23:00:00Z",
+ "2024-10-27T00:00:00Z",
+ "2024-10-27T01:00:00Z",
+ "2024-10-27T02:00:00Z",
+ ],
+ Some("Europe/Berlin".into()),
+ vec![
+ "2024-10-27T00:00:00+02",
+ "2024-10-27T00:00:00+02",
+ "2024-10-27T00:00:00+02",
+ "2024-10-27T00:00:00+02",
+ ],
+ ),
+ (
+ vec![
+ "2018-02-18T00:00:00Z",
+ "2018-02-18T01:00:00Z",
+ "2018-02-18T02:00:00Z",
+ "2018-02-18T03:00:00Z",
+ "2018-11-04T01:00:00Z",
+ "2018-11-04T02:00:00Z",
+ "2018-11-04T03:00:00Z",
+ "2018-11-04T04:00:00Z",
+ ],
+ Some("America/Sao_Paulo".into()),
+ vec![
+ "2018-02-17T00:00:00-02",
+ "2018-02-17T00:00:00-02",
+ "2018-02-17T00:00:00-02",
+ "2018-02-18T00:00:00-03",
+ "2018-11-03T00:00:00-03",
+ "2018-11-03T00:00:00-03",
+ "2018-11-04T01:00:00-02",
+ "2018-11-04T01:00:00-02",
+ ],
+ ),
];
cases.iter().for_each(|(original, tz_opt, expected)| {
@@ -1815,6 +1888,163 @@ mod tests {
});
}
+ #[test]
+ fn test_date_trunc_hour_timezones() {
+ let cases = vec![
+ (
+ vec![
+ "2020-09-08T00:30:00Z",
+ "2020-09-08T01:30:00Z",
+ "2020-09-08T02:30:00Z",
+ "2020-09-08T03:30:00Z",
+ "2020-09-08T04:30:00Z",
+ ],
+ Some("+00".into()),
+ vec![
+ "2020-09-08T00:00:00Z",
+ "2020-09-08T01:00:00Z",
+ "2020-09-08T02:00:00Z",
+ "2020-09-08T03:00:00Z",
+ "2020-09-08T04:00:00Z",
+ ],
+ ),
+ (
+ vec![
+ "2020-09-08T00:30:00Z",
+ "2020-09-08T01:30:00Z",
+ "2020-09-08T02:30:00Z",
+ "2020-09-08T03:30:00Z",
+ "2020-09-08T04:30:00Z",
+ ],
+ None,
+ vec![
+ "2020-09-08T00:00:00Z",
+ "2020-09-08T01:00:00Z",
+ "2020-09-08T02:00:00Z",
+ "2020-09-08T03:00:00Z",
+ "2020-09-08T04:00:00Z",
+ ],
+ ),
+ (
+ vec![
+ "2020-09-08T00:30:00Z",
+ "2020-09-08T01:30:00Z",
+ "2020-09-08T02:30:00Z",
+ "2020-09-08T03:30:00Z",
+ "2020-09-08T04:30:00Z",
+ ],
+ Some("-02".into()),
+ vec![
+ "2020-09-08T00:00:00Z",
+ "2020-09-08T01:00:00Z",
+ "2020-09-08T02:00:00Z",
+ "2020-09-08T03:00:00Z",
+ "2020-09-08T04:00:00Z",
+ ],
+ ),
+ (
+ vec![
+ "2020-09-08T00:30:00+05",
+ "2020-09-08T01:30:00+05",
+ "2020-09-08T02:30:00+05",
+ "2020-09-08T03:30:00+05",
+ "2020-09-08T04:30:00+05",
+ ],
+ Some("+05".into()),
+ vec![
+ "2020-09-08T00:00:00+05",
+ "2020-09-08T01:00:00+05",
+ "2020-09-08T02:00:00+05",
+ "2020-09-08T03:00:00+05",
+ "2020-09-08T04:00:00+05",
+ ],
+ ),
+ (
+ vec![
+ "2020-09-08T00:30:00+08",
+ "2020-09-08T01:30:00+08",
+ "2020-09-08T02:30:00+08",
+ "2020-09-08T03:30:00+08",
+ "2020-09-08T04:30:00+08",
+ ],
+ Some("+08".into()),
+ vec![
+ "2020-09-08T00:00:00+08",
+ "2020-09-08T01:00:00+08",
+ "2020-09-08T02:00:00+08",
+ "2020-09-08T03:00:00+08",
+ "2020-09-08T04:00:00+08",
+ ],
+ ),
+ (
+ vec![
+ "2024-10-26T23:30:00Z",
+ "2024-10-27T00:30:00Z",
+ "2024-10-27T01:30:00Z",
+ "2024-10-27T02:30:00Z",
+ ],
+ Some("Europe/Berlin".into()),
+ vec![
+ "2024-10-27T01:00:00+02",
+ "2024-10-27T02:00:00+02",
+ "2024-10-27T02:00:00+01",
+ "2024-10-27T03:00:00+01",
+ ],
+ ),
+ (
+ vec![
+ "2018-02-18T00:30:00Z",
+ "2018-02-18T01:30:00Z",
+ "2018-02-18T02:30:00Z",
+ "2018-02-18T03:30:00Z",
+ "2018-11-04T01:00:00Z",
+ "2018-11-04T02:00:00Z",
+ "2018-11-04T03:00:00Z",
+ "2018-11-04T04:00:00Z",
+ ],
+ Some("America/Sao_Paulo".into()),
+ vec![
+ "2018-02-17T22:00:00-02",
+ "2018-02-17T23:00:00-02",
+ "2018-02-17T23:00:00-03",
+ "2018-02-18T00:00:00-03",
+ "2018-11-03T22:00:00-03",
+ "2018-11-03T23:00:00-03",
+ "2018-11-04T01:00:00-02",
+ "2018-11-04T02:00:00-02",
+ ],
+ ),
+ ];
+
+ cases.iter().for_each(|(original, tz_opt, expected)| {
+ let input = original
+ .iter()
+ .map(|s| Some(string_to_timestamp_nanos(s).unwrap()))
+ .collect::<TimestampNanosecondArray>()
+ .with_timezone_opt(tz_opt.clone());
+ let right = expected
+ .iter()
+ .map(|s| Some(string_to_timestamp_nanos(s).unwrap()))
+ .collect::<TimestampNanosecondArray>()
+ .with_timezone_opt(tz_opt.clone());
+ let result = date_trunc(&[
+ ColumnarValue::Scalar(ScalarValue::from("hour")),
+ ColumnarValue::Array(Arc::new(input)),
+ ])
+ .unwrap();
+ if let ColumnarValue::Array(result) = result {
+ assert_eq!(
+ result.data_type(),
+ &DataType::Timestamp(TimeUnit::Nanosecond, tz_opt.clone())
+ );
+ let left =
as_primitive_array::<TimestampNanosecondType>(&result);
+ assert_eq!(left, &right);
+ } else {
+ panic!("unexpected column type");
+ }
+ });
+ }
+
#[test]
fn test_date_bin_single() {
use chrono::Duration;
diff --git a/datafusion/sqllogictest/test_files/timestamps.slt
b/datafusion/sqllogictest/test_files/timestamps.slt
index 8f565453a1..980545e922 100644
--- a/datafusion/sqllogictest/test_files/timestamps.slt
+++ b/datafusion/sqllogictest/test_files/timestamps.slt
@@ -1262,6 +1262,173 @@ SELECT DATE_TRUNC('day', arrow_cast(TIMESTAMP
'2023-08-03 14:38:50Z', 'Timestamp
2023-08-03T14:38:50
2023-08-03T14:38:50
+# date_trunc with data with timezones
+statement ok
+CREATE TABLE timestamp_strings(ts varchar)
+AS VALUES
+('2024-10-27 00:00:00'),
+('2024-10-27 00:30:00'),
+('2024-10-27 01:30:00'),
+('2024-10-27 02:00:00'), -- Daylight Savings Time happens here in Berlin
+('2024-10-27 02:30:00'),
+('2024-10-27 03:00:00'),
+('2024-10-27 03:30:00')
+;
+
+statement ok
+create view timestamp_utc as
+select
+ arrow_cast(ts, 'Timestamp(Nanosecond, Some("UTC"))') as ts
+from timestamp_strings;
+
+statement ok
+create view timestamp_berlin as
+select
+ arrow_cast(ts, 'Timestamp(Nanosecond, Some("Europe/Berlin"))') as ts
+from timestamp_utc; -- have to convert to utc prior to converting to berlin
+
+query PT
+select ts, arrow_typeof(ts) from timestamp_utc order by ts;
+----
+2024-10-27T00:00:00Z Timestamp(Nanosecond, Some("UTC"))
+2024-10-27T00:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2024-10-27T01:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2024-10-27T02:00:00Z Timestamp(Nanosecond, Some("UTC"))
+2024-10-27T02:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2024-10-27T03:00:00Z Timestamp(Nanosecond, Some("UTC"))
+2024-10-27T03:30:00Z Timestamp(Nanosecond, Some("UTC"))
+
+query PT
+select ts, arrow_typeof(ts) from timestamp_berlin order by ts;
+----
+2024-10-27T02:00:00+02:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+2024-10-27T02:30:00+02:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+2024-10-27T02:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+2024-10-27T03:00:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+2024-10-27T03:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+2024-10-27T04:00:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+2024-10-27T04:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin"))
+
+# date trunc in utc with DST
+query PPPP
+select ts, date_trunc('month', ts), date_trunc('day', ts), date_trunc('hour',
ts)
+from timestamp_utc order by ts;
+----
+2024-10-27T00:00:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z
2024-10-27T00:00:00Z
+2024-10-27T00:30:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z
2024-10-27T00:00:00Z
+2024-10-27T01:30:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z
2024-10-27T01:00:00Z
+2024-10-27T02:00:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z
2024-10-27T02:00:00Z
+2024-10-27T02:30:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z
2024-10-27T02:00:00Z
+2024-10-27T03:00:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z
2024-10-27T03:00:00Z
+2024-10-27T03:30:00Z 2024-10-01T00:00:00Z 2024-10-27T00:00:00Z
2024-10-27T03:00:00Z
+
+
+# date trunc in a timezone with DST across DST boundary (note the date-trunc
hour value repeats)
+# Test for https://github.com/apache/arrow-datafusion/issues/8899
+query PPPP
+select ts, date_trunc('month', ts), date_trunc('day', ts), date_trunc('hour',
ts)
+from timestamp_berlin order by ts;
+----
+2024-10-27T02:00:00+02:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00
2024-10-27T02:00:00+02:00
+2024-10-27T02:30:00+02:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00
2024-10-27T02:00:00+02:00
+2024-10-27T02:30:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00
2024-10-27T02:00:00+01:00
+2024-10-27T03:00:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00
2024-10-27T03:00:00+01:00
+2024-10-27T03:30:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00
2024-10-27T03:00:00+01:00
+2024-10-27T04:00:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00
2024-10-27T04:00:00+01:00
+2024-10-27T04:30:00+01:00 2024-10-01T00:00:00+02:00 2024-10-27T00:00:00+02:00
2024-10-27T04:00:00+01:00
+
+statement ok
+drop table timestamp_strings;
+
+statement ok
+drop view timestamp_utc;
+
+statement ok
+drop view timestamp_berlin;
+
+# date_trunc with data with timezones where transition happens at midnight
+statement ok
+CREATE TABLE timestamp_strings(ts varchar)
+AS VALUES
+('2018-11-04 01:00:00'),
+('2018-11-04 01:30:00'),
+('2018-11-04 02:30:00'),
+('2018-11-04 03:00:00'), -- Daylight Savings Time started here in Sao Paulo
+('2018-11-04 03:30:00'),
+('2018-11-04 04:00:00'),
+('2018-11-04 04:30:00')
+;
+
+statement ok
+create view timestamp_utc as
+select
+ arrow_cast(ts, 'Timestamp(Nanosecond, Some("UTC"))') as ts
+from timestamp_strings;
+
+statement ok
+create view timestamp_sao_paulo as
+select
+ arrow_cast(ts, 'Timestamp(Nanosecond, Some("America/Sao_Paulo"))') as ts
+from timestamp_utc; -- have to convert to utc prior to converting to Sau Paulo
+
+query PT
+select ts, arrow_typeof(ts) from timestamp_utc order by ts;
+----
+2018-11-04T01:00:00Z Timestamp(Nanosecond, Some("UTC"))
+2018-11-04T01:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2018-11-04T02:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2018-11-04T03:00:00Z Timestamp(Nanosecond, Some("UTC"))
+2018-11-04T03:30:00Z Timestamp(Nanosecond, Some("UTC"))
+2018-11-04T04:00:00Z Timestamp(Nanosecond, Some("UTC"))
+2018-11-04T04:30:00Z Timestamp(Nanosecond, Some("UTC"))
+
+query PT
+select ts, arrow_typeof(ts) from timestamp_sao_paulo order by ts;
+----
+2018-11-03T22:00:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+2018-11-03T22:30:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+2018-11-03T23:30:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+2018-11-04T01:00:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+2018-11-04T01:30:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+2018-11-04T02:00:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+2018-11-04T02:30:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo"))
+
+# date trunc in utc with DST
+query PPPP
+select ts, date_trunc('month', ts), date_trunc('day', ts), date_trunc('hour',
ts)
+from timestamp_utc order by ts;
+----
+2018-11-04T01:00:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z
2018-11-04T01:00:00Z
+2018-11-04T01:30:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z
2018-11-04T01:00:00Z
+2018-11-04T02:30:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z
2018-11-04T02:00:00Z
+2018-11-04T03:00:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z
2018-11-04T03:00:00Z
+2018-11-04T03:30:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z
2018-11-04T03:00:00Z
+2018-11-04T04:00:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z
2018-11-04T04:00:00Z
+2018-11-04T04:30:00Z 2018-11-01T00:00:00Z 2018-11-04T00:00:00Z
2018-11-04T04:00:00Z
+
+
+# date trunc in a timezone with DST across DST boundary (note there is no
midnight on 2018-11-04)
+# Test for https://github.com/apache/arrow-datafusion/issues/8899
+query PPPP
+select ts, date_trunc('month', ts), date_trunc('day', ts), date_trunc('hour',
ts)
+from timestamp_sao_paulo order by ts;
+----
+2018-11-03T22:00:00-03:00 2018-11-01T00:00:00-03:00 2018-11-03T00:00:00-03:00
2018-11-03T22:00:00-03:00
+2018-11-03T22:30:00-03:00 2018-11-01T00:00:00-03:00 2018-11-03T00:00:00-03:00
2018-11-03T22:00:00-03:00
+2018-11-03T23:30:00-03:00 2018-11-01T00:00:00-03:00 2018-11-03T00:00:00-03:00
2018-11-03T23:00:00-03:00
+2018-11-04T01:00:00-02:00 2018-11-01T00:00:00-03:00 2018-11-04T01:00:00-02:00
2018-11-04T01:00:00-02:00
+2018-11-04T01:30:00-02:00 2018-11-01T00:00:00-03:00 2018-11-04T01:00:00-02:00
2018-11-04T01:00:00-02:00
+2018-11-04T02:00:00-02:00 2018-11-01T00:00:00-03:00 2018-11-04T01:00:00-02:00
2018-11-04T02:00:00-02:00
+2018-11-04T02:30:00-02:00 2018-11-01T00:00:00-03:00 2018-11-04T01:00:00-02:00
2018-11-04T02:00:00-02:00
+
+statement ok
+drop table timestamp_strings;
+
+statement ok
+drop view timestamp_utc;
+
+statement ok
+drop view timestamp_sao_paulo;
# Demonstrate that strings are automatically coerced to timestamps (don't use
TIMESTAMP)