kosiew commented on code in PR #19078:
URL: https://github.com/apache/datafusion/pull/19078#discussion_r2621759327
##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -15,31 +15,44 @@
// specific language governing permissions and limitations
// under the License.
-use std::sync::Arc;
+use std::marker::PhantomData;
+use std::sync::{Arc, LazyLock};
+use arrow::array::timezone::Tz;
use arrow::array::{
Array, ArrowPrimitiveType, AsArray, GenericStringArray, PrimitiveArray,
StringArrayType, StringViewArray,
};
-use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos;
-use arrow::datatypes::DataType;
+use arrow::compute::kernels::cast_utils::string_to_datetime;
+use arrow::datatypes::{DataType, TimeUnit};
+use arrow_buffer::ArrowNativeType;
use chrono::LocalResult::Single;
use chrono::format::{Parsed, StrftimeItems, parse};
use chrono::{DateTime, TimeZone, Utc};
-
use datafusion_common::cast::as_generic_string_array;
use datafusion_common::{
- DataFusionError, Result, ScalarType, ScalarValue, exec_datafusion_err,
exec_err,
- unwrap_or_internal_err,
+ DataFusionError, Result, ScalarValue, exec_datafusion_err, exec_err,
+ internal_datafusion_err, unwrap_or_internal_err,
};
use datafusion_expr::ColumnarValue;
+use num_traits::{PrimInt, ToPrimitive};
/// Error message if nanosecond conversion request beyond supported interval
const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented
as nanoseconds have to be between 1677-09-21T00:12:44.0 and
2262-04-11T23:47:16.854775804";
-/// Calls string_to_timestamp_nanos and converts the error type
-pub(crate) fn string_to_timestamp_nanos_shim(s: &str) -> Result<i64> {
- string_to_timestamp_nanos(s).map_err(|e| e.into())
+static UTC: LazyLock<Tz> = LazyLock::new(|| "UTC".parse().expect("UTC is
always valid"));
+
+pub(crate) fn string_to_timestamp_nanos_with_timezone(
+ timezone: &Option<Tz>,
+ s: &str,
+) -> Result<i64> {
+ let tz = timezone.unwrap_or(*UTC);
Review Comment:
perhaps
```
let tz = timezone.as_ref().unwrap_or(&UTC);
```
so
tz is a borrow instead of owning.
##########
datafusion/functions/benches/to_timestamp.rs:
##########
@@ -115,15 +115,17 @@ fn criterion_benchmark(c: &mut Criterion) {
let arg_field = Field::new("a", DataType::Utf8, false).into();
let arg_fields = vec![arg_field];
let config_options = Arc::new(ConfigOptions::default());
+ let to_timestamp_udf = to_timestamp(config_options.as_ref());
Review Comment:
Since the UDF now depends on session config, should we set
`ConfigOptions.execution.time_zone` to a fixed value (e.g. UTC) in the bench
to avoid future default changes affecting comparability?
##########
datafusion/functions/src/datetime/to_timestamp.rs:
##########
@@ -988,13 +1605,21 @@ mod tests {
}
#[test]
- fn test_tz() {
+ fn test_no_tz() {
Review Comment:
hi @Omega359,
I think we should also add a unit test where the UDF instance is not updated
via `with_updated_config` but `ScalarFunctionArgs` contains a `config_options`
with timezone, and verify behavior
##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -89,11 +101,55 @@ pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
)
};
+ let mut datetime_str = s;
+ let mut format = format;
+
+ // Manually handle the most common case of a named timezone at the end of
the timestamp.
+ // Note that %+ handles 'Z' at the end of the string without a space. This
code doesn't
+ // handle named timezones with no preceding space since that would require
writing a
+ // custom parser (or switching to Jiff)
+ let tz: Option<chrono_tz::Tz> = if format.ends_with(" %Z") {
+ // grab the string after the last space as the named timezone
+ if let Some((dt_str, timezone_name)) = datetime_str.rsplit_once(' ') {
Review Comment:
This only splits on a single space. Should we consider trimming and
splitting on any ASCII whitespace?
##########
datafusion/sqllogictest/test_files/to_timestamp_timezone.slt:
##########
@@ -0,0 +1,204 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## to_timestamp timezone tests
+##########
+
+## Reset timezone for other tests
+statement ok
+RESET datafusion.execution.time_zone
+
+## Test 1: Default timezone (None) with naive timestamp
+## Naive timestamps (without explicit timezone) should be interpreted as UTC
by default
+query P
+SELECT to_timestamp('2020-09-08T13:42:29');
+----
+2020-09-08T13:42:29
+
+## Test 2: Explicit UTC timezone ('Z' suffix)
+## Explicit timezone should be respected regardless of session timezone
+query P
+SELECT to_timestamp('2020-09-08T13:42:29Z');
+----
+2020-09-08T13:42:29
+
+## Test 3: Explicit timezone offset (+05:00)
+## Explicit offset should be respected - this is 13:42:29+05:00 which is
08:42:29 UTC
+query P
+SELECT to_timestamp('2020-09-08T13:42:29+05:00');
+----
+2020-09-08T08:42:29
+
+## Test 4: Explicit timezone offset without colon (+0500)
+## Should handle offset formats without colons
+query P
+SELECT to_timestamp('2020-09-08T13:42:29+0500');
+----
+2020-09-08T08:42:29
+
+## Test 5: Negative timezone offset
+query P
+SELECT to_timestamp('2020-09-08T13:42:29-03:30');
+----
+2020-09-08T17:12:29
+
+## Test 6: Configure session timezone to America/New_York
+statement ok
+SET datafusion.execution.time_zone = 'America/New_York';
+
+## Test 7: Naive timestamp with configured timezone
+## '2020-09-08T13:42:29' in America/New_York is EDT (UTC-4)
+## So this should become '2020-09-08T13:42:29-04:00'
+query P
+SELECT to_timestamp('2020-09-08T13:42:29');
+----
+2020-09-08T13:42:29-04:00
+
+## Test 8: Explicit UTC should be transformed to configured timezone
+query P
+SELECT to_timestamp('2020-09-08T13:42:29Z');
+----
+2020-09-08T09:42:29-04:00
+
+## Test 9: Explicit offset should be transformed to configured timezone
+query P
+SELECT to_timestamp('2020-09-08T13:42:29+05:00');
+----
+2020-09-08T04:42:29-04:00
+
+## Test 10: Check arrow_typeof returns timstamp in configured timezone
+## Result should be Timestamp(Nanosecond, "America/New_York") regardless of
input timezone
+query T
+SELECT arrow_typeof(to_timestamp('2020-09-08T13:42:29Z'));
+----
+Timestamp(ns, "America/New_York")
+
+## Test 11: Configure to offset-based timezone
+statement ok
+SET datafusion.execution.time_zone = '+05:30';
+
+## Test 12: Naive timestamp with offset-based configured timezone
+query P
+SELECT to_timestamp('2020-09-08T13:42:29');
+----
+2020-09-08T13:42:29+05:30
+
+## Test 13: Reset to None
+statement ok
+RESET datafusion.execution.time_zone
+
+## Test 14: Naive timestamp
+query P
+SELECT to_timestamp('2020-09-08T13:42:29');
+----
+2020-09-08T13:42:29
+
+query P
+SELECT to_timestamp('2020-09-08T13:42:29Z');
+----
+2020-09-08T13:42:29
+
+query P
+SELECT to_timestamp('2020-09-08T13:42:29+05:00');
+----
+2020-09-08T08:42:29
+
+statement ok
+SET datafusion.execution.time_zone = 'America/New_York';
+
+## Test 15: to_timestamp with format string - naive timestamp with session
timezone
+
+query P
+SELECT to_timestamp('2020-09-08 13:42:29', '%Y-%m-%d %H:%M:%S');
+----
+2020-09-08T13:42:29-04:00
+
+## Test 16: to_timestamp with format string - explicit timezone should be
respected
+statement ok
+SET datafusion.execution.time_zone = 'UTC';
+
+query P
+SELECT to_timestamp('2020-09-08 13:42:29 +0000', '%Y-%m-%d %H:%M:%S %z');
+----
+2020-09-08T13:42:29Z
+
+query P
+SELECT to_timestamp('2020-09-08 13:42:29 America/Toronto', '%Y-%m-%d %H:%M:%S
%Z');
+----
+2020-09-08T17:42:29Z
+
+query error Error parsing timestamp from '2020-09-08 13:42:29America/Toronto'
using format '%Y-%m-%d %H:%M:%S%Z': '%Z' is only supported at the end of the
format string preceded by a space
+SELECT to_timestamp('2020-09-08 13:42:29America/Toronto', '%Y-%m-%d
%H:%M:%S%Z');
+
+## Test 17: Test all precision variants respect timezone
+statement ok
+SET datafusion.execution.time_zone = 'America/New_York';
+
+## to_timestamp_seconds
+query P
+SELECT to_timestamp_seconds('2020-09-08T13:42:29');
+----
+2020-09-08T13:42:29-04:00
+
+## to_timestamp_millis
+query P
+SELECT to_timestamp_millis('2020-09-08T13:42:29.123');
+----
+2020-09-08T13:42:29.123-04:00
+
+## to_timestamp_micros
+query P
+SELECT to_timestamp_micros('2020-09-08T13:42:29.123456');
+----
+2020-09-08T13:42:29.123456-04:00
+
+## to_timestamp_nanos
+query P
+SELECT to_timestamp_nanos('2020-09-08T13:42:29.123456789');
+----
+2020-09-08T13:42:29.123456789-04:00
+
+## test integers
+query T
+select arrow_typeof(to_timestamp_seconds(61))
+----
+Timestamp(s, "America/New_York")
+
+query T
+select arrow_typeof(to_timestamp_millis(61))
+----
+Timestamp(ms, "America/New_York")
+
+query T
+select arrow_typeof(to_timestamp_micros(61))
+----
+Timestamp(µs, "America/New_York")
+
+query T
+select arrow_typeof(to_timestamp_nanos(61))
+----
+Timestamp(ns, "America/New_York")
+
+query T
+select arrow_typeof(to_timestamp(61))
+----
+Timestamp(ns, "America/New_York")
+
+## Reset timezone for other tests
+statement ok
+RESET datafusion.execution.time_zone
Review Comment:
I think you need a newline after this.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]