Github user icexelloss commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20537#discussion_r166975718

    --- Diff: python/pyspark/sql/types.py ---
    @@ -1730,7 +1730,28 @@ def _check_series_convert_timestamps_internal(s, timezone):
         # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
         if is_datetime64_dtype(s.dtype):
             tz = timezone or 'tzlocal()'
    -        return s.dt.tz_localize(tz).dt.tz_convert('UTC')
    +        """
    +        tz_localize with ambiguous=False has the same behavior as pytz.localize
    +        >>> import datetime
    +        >>> import pandas as pd
    +        >>> import pytz
    +        >>>
    +        >>> t = datetime.datetime(2015, 11, 1, 1, 23, 24)
    +        >>> ts = pd.Series([t])
    +        >>> tz = pytz.timezone('America/New_York')
    +        >>>
    +        >>> ts.dt.tz_localize(tz, ambiguous=False)
    +        0   2015-11-01 01:23:24-05:00
    +        dtype: datetime64[ns, America/New_York]
    +        >>>
    +        >>> ts.dt.tz_localize(tz, ambiguous=True)
    +        0   2015-11-01 01:23:24-04:00
    +        dtype: datetime64[ns, America/New_York]
    +        >>>
    +        >>> str(tz.localize(t))
    +        '2015-11-01 01:23:24-05:00'
    +        """
    +        return s.dt.tz_localize(tz, ambiguous=False).dt.tz_convert('UTC')
    --- End diff --

    Yes, will create a new one for `pandas_udf`. It seems `ambiguous=False` is undocumented in the method doc; @jreback, can you please confirm this usage is correct?
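For reference, here is a runnable version of the example quoted in the docstring above. It is not part of the diff and only assumes pandas and pytz are installed; it checks that `ambiguous=False` resolves the repeated hour at the end of DST to the standard-time offset, which matches what `pytz.localize` does with its default `is_dst=False`:

```python
import datetime

import pandas as pd
import pytz

# 2015-11-01 01:23:24 occurs twice in America/New_York (DST fall-back),
# so it can map to either -04:00 (EDT) or -05:00 (EST).
t = datetime.datetime(2015, 11, 1, 1, 23, 24)
ts = pd.Series([t])
tz = pytz.timezone('America/New_York')

# ambiguous=False picks the non-DST interpretation (-05:00) ...
est = ts.dt.tz_localize(tz, ambiguous=False)
# ... matching pytz.localize, whose default is is_dst=False.
assert str(est.iloc[0]) == str(tz.localize(t)) == '2015-11-01 01:23:24-05:00'

# ambiguous=True picks the DST interpretation (-04:00) instead.
edt = ts.dt.tz_localize(tz, ambiguous=True)
assert str(edt.iloc[0]) == '2015-11-01 01:23:24-04:00'

# The patched helper then converts the localized series to UTC.
print(est.dt.tz_convert('UTC'))
```

Note that without an `ambiguous=` argument, `tz_localize` raises `AmbiguousTimeError` for such timestamps (its default is `ambiguous='raise'`), whereas `pytz.localize` silently assumes standard time.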