alamb commented on code in PR #7614:
URL: https://github.com/apache/arrow-datafusion/pull/7614#discussion_r1333328646
##########
datafusion/expr/src/type_coercion/functions.rs:
##########
@@ -136,62 +137,100 @@ fn maybe_data_types(
///
/// See the module level documentation for more detail on coercion.
pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool {
- use self::DataType::*;
-
if type_into == type_from {
return true;
}
- // Null can convert to most of types
+ if let Some(coerced) = coerced_from(type_into, type_from) {
+ return coerced == type_into;
+ }
+ false
+}
+
+fn coerced_from<'a>(
+ type_into: &'a DataType,
+ type_from: &'a DataType,
+) -> Option<&'a DataType> {
+ use self::DataType::*;
+
match type_into {
- Int8 => matches!(type_from, Null | Int8),
- Int16 => matches!(type_from, Null | Int8 | Int16 | UInt8),
- Int32 => matches!(type_from, Null | Int8 | Int16 | Int32 | UInt8 |
UInt16),
- Int64 => matches!(
- type_from,
- Null | Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32
- ),
- UInt8 => matches!(type_from, Null | UInt8),
- UInt16 => matches!(type_from, Null | UInt8 | UInt16),
- UInt32 => matches!(type_from, Null | UInt8 | UInt16 | UInt32),
- UInt64 => matches!(type_from, Null | UInt8 | UInt16 | UInt32 | UInt64),
- Float32 => matches!(
- type_from,
- Null | Int8
- | Int16
- | Int32
- | Int64
- | UInt8
- | UInt16
- | UInt32
- | UInt64
- | Float32
- ),
- Float64 => matches!(
- type_from,
- Null | Int8
- | Int16
- | Int32
- | Int64
- | UInt8
- | UInt16
- | UInt32
- | UInt64
- | Float32
- | Float64
- | Decimal128(_, _)
- ),
- Timestamp(TimeUnit::Nanosecond, _) => {
- matches!(
+ // coerced into type_into
+ Int8 if matches!(type_from, Null | Int8) => Some(type_into),
+ Int16 if matches!(type_from, Null | Int8 | Int16 | UInt8) =>
Some(type_into),
+ Int32 if matches!(type_from, Null | Int8 | Int16 | Int32 | UInt8 |
UInt16) => {
+ Some(type_into)
+ }
+ Int64
+ if matches!(
+ type_from,
+ Null | Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32
+ ) =>
+ {
+ Some(type_into)
+ }
+ UInt8 if matches!(type_from, Null | UInt8) => Some(type_into),
+ UInt16 if matches!(type_from, Null | UInt8 | UInt16) =>
Some(type_into),
+ UInt32 if matches!(type_from, Null | UInt8 | UInt16 | UInt32) =>
Some(type_into),
+ UInt64 if matches!(type_from, Null | UInt8 | UInt16 | UInt32 | UInt64)
=> {
+ Some(type_into)
+ }
+ Float32
+ if matches!(
type_from,
- Null | Timestamp(_, _) | Date32 | Utf8 | LargeUtf8
- )
+ Null | Int8
+ | Int16
+ | Int32
+ | Int64
+ | UInt8
+ | UInt16
+ | UInt32
+ | UInt64
+ | Float32
+ ) =>
+ {
+ Some(type_into)
}
- Interval(_) => {
- matches!(type_from, Utf8 | LargeUtf8)
+ Float64
+ if matches!(
+ type_from,
+ Null | Int8
+ | Int16
+ | Int32
+ | Int64
+ | UInt8
+ | UInt16
+ | UInt32
+ | UInt64
+ | Float32
+ | Float64
+ | Decimal128(_, _)
+ ) =>
+ {
+ Some(type_into)
+ }
+ Timestamp(TimeUnit::Nanosecond, None)
+ if matches!(
+ type_from,
+ Null | Timestamp(_, None) | Date32 | Utf8 | LargeUtf8
+ ) =>
+ {
+ Some(type_into)
}
- Utf8 | LargeUtf8 => true,
- Null => can_cast_types(type_from, type_into),
- _ => false,
+ Interval(_) if matches!(type_from, Utf8 | LargeUtf8) =>
Some(type_into),
+ Utf8 | LargeUtf8 => Some(type_into),
+ Null if can_cast_types(type_from, type_into) => Some(type_into),
+
+ // timestamp coercions, with timezone, accept the type_from timezone
if valid
+ Timestamp(TimeUnit::Nanosecond, Some(_))
+ if matches!(
+ type_from,
+ Timestamp(TimeUnit::Nanosecond, Some(from_tz)) if
arrow_array::timezone::Tz::from_str(from_tz).is_ok()
Review Comment:
I think using the `arrow_array` function to parse strings to a valid
timezone is the correct choice
Also I don't think `arrow_array` is a new dependency as `datafusion_expr`
already depends on the `arrow` crate, which depends on `arrow_array`. Thus I
think adding a new explicit dependency to use `arrow_array::timezone::Tz` is
fine
##########
datafusion/sqllogictest/test_files/timestamps.slt:
##########
@@ -1307,3 +1341,121 @@ drop table ts_data_millis
statement ok
drop table ts_data_secs
+
Review Comment:
👏 -- very nice tests.
##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -208,20 +209,25 @@ pub fn make_current_time(
move |_arg| Ok(ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(nano)))
}
-fn quarter_month(date: &NaiveDateTime) -> u32 {
+fn quarter_month<T: TimeZone>(date: &DateTime<T>) -> u32 {
1 + 3 * ((date.month() - 1) / 3)
}
/// Tuncates the single `value`, expressed in nanoseconds since the
/// epoch, for granularities greater than 1 second, in taking into
/// account that some granularities are not uniform durations of time
/// (e.g. months are not always the same lengths, leap seconds, etc)
-fn date_trunc_coarse(granularity: &str, value: i64) -> Result<i64> {
- // Use chrono NaiveDateTime to clear the various fields
- // correctly accounting for non uniform granularities
- let value = timestamp_ns_to_datetime(value).ok_or_else(|| {
- DataFusionError::Execution(format!("Timestamp {value} out of range"))
- })?;
+fn date_trunc_coarse(
+ granularity: &str,
+ value: i64,
+ tz: &Option<Arc<str>>,
+) -> Result<i64> {
+ // Use chrono DateTime<Tz> to clear the various fields because need to
clear per timezone,
+ // and NaiveDateTime (ISO 8601) has no concept of timezones
+ let tz =
arrow_array::timezone::Tz::from_str(tz.as_deref().unwrap_or("+00"))?;
Review Comment:
I think as written this will invoke `as_datetime_with_timezone` for each
row. This will effectively re-parse the same string for all rows which is
likely to be expensive
This per-row parsing overhead will happen even for timestamps that have a tz
of `None` which the version on master doesn't do and thus I think this change
would result in a performance regression if we merged this code as is.
However I think the fix should be relatively straightforward.
What would you think about parsing the timezone once per batch (basically
parse it once in `date_trunc` and then pass `Option<Tz>` down through
`_date_trunc` rather than `Option<Arc<str>>`?
##########
datafusion/expr/src/type_coercion/functions.rs:
##########
@@ -136,62 +137,100 @@ fn maybe_data_types(
///
/// See the module level documentation for more detail on coercion.
pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool {
- use self::DataType::*;
-
if type_into == type_from {
return true;
}
- // Null can convert to most of types
+ if let Some(coerced) = coerced_from(type_into, type_from) {
+ return coerced == type_into;
+ }
+ false
+}
+
+fn coerced_from<'a>(
+ type_into: &'a DataType,
+ type_from: &'a DataType,
+) -> Option<&'a DataType> {
+ use self::DataType::*;
+
match type_into {
- Int8 => matches!(type_from, Null | Int8),
- Int16 => matches!(type_from, Null | Int8 | Int16 | UInt8),
- Int32 => matches!(type_from, Null | Int8 | Int16 | Int32 | UInt8 |
UInt16),
- Int64 => matches!(
- type_from,
- Null | Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32
- ),
- UInt8 => matches!(type_from, Null | UInt8),
- UInt16 => matches!(type_from, Null | UInt8 | UInt16),
- UInt32 => matches!(type_from, Null | UInt8 | UInt16 | UInt32),
- UInt64 => matches!(type_from, Null | UInt8 | UInt16 | UInt32 | UInt64),
- Float32 => matches!(
- type_from,
- Null | Int8
- | Int16
- | Int32
- | Int64
- | UInt8
- | UInt16
- | UInt32
- | UInt64
- | Float32
- ),
- Float64 => matches!(
- type_from,
- Null | Int8
- | Int16
- | Int32
- | Int64
- | UInt8
- | UInt16
- | UInt32
- | UInt64
- | Float32
- | Float64
- | Decimal128(_, _)
- ),
- Timestamp(TimeUnit::Nanosecond, _) => {
- matches!(
+ // coerced into type_into
+ Int8 if matches!(type_from, Null | Int8) => Some(type_into),
+ Int16 if matches!(type_from, Null | Int8 | Int16 | UInt8) =>
Some(type_into),
+ Int32 if matches!(type_from, Null | Int8 | Int16 | Int32 | UInt8 |
UInt16) => {
+ Some(type_into)
+ }
+ Int64
+ if matches!(
+ type_from,
+ Null | Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32
+ ) =>
+ {
+ Some(type_into)
+ }
+ UInt8 if matches!(type_from, Null | UInt8) => Some(type_into),
+ UInt16 if matches!(type_from, Null | UInt8 | UInt16) =>
Some(type_into),
+ UInt32 if matches!(type_from, Null | UInt8 | UInt16 | UInt32) =>
Some(type_into),
+ UInt64 if matches!(type_from, Null | UInt8 | UInt16 | UInt32 | UInt64)
=> {
+ Some(type_into)
+ }
+ Float32
+ if matches!(
type_from,
- Null | Timestamp(_, _) | Date32 | Utf8 | LargeUtf8
- )
+ Null | Int8
+ | Int16
+ | Int32
+ | Int64
+ | UInt8
+ | UInt16
+ | UInt32
+ | UInt64
+ | Float32
+ ) =>
+ {
+ Some(type_into)
}
- Interval(_) => {
- matches!(type_from, Utf8 | LargeUtf8)
+ Float64
+ if matches!(
+ type_from,
+ Null | Int8
+ | Int16
+ | Int32
+ | Int64
+ | UInt8
+ | UInt16
+ | UInt32
+ | UInt64
+ | Float32
+ | Float64
+ | Decimal128(_, _)
+ ) =>
+ {
+ Some(type_into)
+ }
+ Timestamp(TimeUnit::Nanosecond, None)
+ if matches!(
+ type_from,
+ Null | Timestamp(_, None) | Date32 | Utf8 | LargeUtf8
+ ) =>
+ {
+ Some(type_into)
}
- Utf8 | LargeUtf8 => true,
- Null => can_cast_types(type_from, type_into),
- _ => false,
+ Interval(_) if matches!(type_from, Utf8 | LargeUtf8) =>
Some(type_into),
+ Utf8 | LargeUtf8 => Some(type_into),
+ Null if can_cast_types(type_from, type_into) => Some(type_into),
+
+ // timestamp coercions, with timezone, accept the type_from timezone
if valid
+ Timestamp(TimeUnit::Nanosecond, Some(_))
+ if matches!(
+ type_from,
+ Timestamp(TimeUnit::Nanosecond, Some(from_tz)) if
arrow_array::timezone::Tz::from_str(from_tz).is_ok()
Review Comment:
It would be nice if we didn't have to discard the `ArrowError` (via `is_ok`)
as that causes the work to create the error message to bis discarded.
However, I think this will need a new API in arrow-rs -- so I'll file a new
ticket there prior to merging this PR
##########
datafusion/sqllogictest/test_files/timestamps.slt:
##########
@@ -100,6 +100,40 @@ select * from foo where ts != '2000-02-01T00:00:00';
statement ok
drop table foo;
+
+##########
+## Timezone Handling Tests
+##########
+
+statement ok
+SET TIME ZONE = '+08'
+
+# should use execution timezone
+query P
+SELECT TIMESTAMPTZ '2000-01-01T01:01:01'
Review Comment:
👍
##########
datafusion/sqllogictest/test_files/timestamps.slt:
##########
@@ -1307,3 +1341,121 @@ drop table ts_data_millis
statement ok
drop table ts_data_secs
+
Review Comment:
👏 -- very nice tests.
##########
datafusion/sqllogictest/test_files/timestamps.slt:
##########
@@ -100,6 +100,40 @@ select * from foo where ts != '2000-02-01T00:00:00';
statement ok
drop table foo;
+
+##########
+## Timezone Handling Tests
+##########
+
+statement ok
+SET TIME ZONE = '+08'
+
+# should use execution timezone
+query P
+SELECT TIMESTAMPTZ '2000-01-01T01:01:01'
Review Comment:
👍
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]