GitHub user ueshin commented on a diff in the pull request:
https://github.com/apache/spark/pull/19607#discussion_r149582142
--- Diff: python/pyspark/sql/types.py ---
@@ -1629,37 +1629,82 @@ def to_arrow_type(dt):
return arrow_type
-def _check_dataframe_localize_timestamps(pdf):
+def _check_dataframe_localize_timestamps(pdf, timezone):
"""
- Convert timezone aware timestamps to timezone-naive in local time
+ Convert timezone aware timestamps to timezone-naive in the specified
timezone or local timezone
:param pdf: pandas.DataFrame
- :return pandas.DataFrame where any timezone aware columns have be
converted to tz-naive
+ :param timezone: the timezone to convert. if None then use local
timezone
+ :return pandas.DataFrame where any timezone aware columns have been
converted to tz-naive
"""
from pandas.api.types import is_datetime64tz_dtype
+ tz = timezone or 'tzlocal()'
for column, series in pdf.iteritems():
# TODO: handle nested timestamps, such as
ArrayType(TimestampType())?
if is_datetime64tz_dtype(series.dtype):
- pdf[column] =
series.dt.tz_convert('tzlocal()').dt.tz_localize(None)
+ pdf[column] = series.dt.tz_convert(tz).dt.tz_localize(None)
return pdf
-def _check_series_convert_timestamps_internal(s):
+def _check_series_convert_timestamps_internal(s, timezone):
"""
- Convert a tz-naive timestamp in local tz to UTC normalized for Spark
internal storage
+ Convert a tz-naive timestamp in the specified timezone or local
timezone to UTC normalized for
+ Spark internal storage
+
:param s: a pandas.Series
+ :param timezone: the timezone to convert. if None then use local
timezone
:return pandas.Series where if it is a timestamp, has been UTC
normalized without a time zone
"""
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
# TODO: handle nested timestamps, such as ArrayType(TimestampType())?
if is_datetime64_dtype(s.dtype):
- return s.dt.tz_localize('tzlocal()').dt.tz_convert('UTC')
+ tz = timezone or 'tzlocal()'
+ return s.dt.tz_localize(tz).dt.tz_convert('UTC')
elif is_datetime64tz_dtype(s.dtype):
return s.dt.tz_convert('UTC')
else:
return s
+def _check_series_convert_timestamps_localize(s, timezone):
+ """
+ Convert timestamp to timezone-naive in the specified timezone or local
timezone
+
+ :param s: a pandas.Series
+ :param timezone: the timezone to convert. if None then use local
timezone
+ :return pandas.Series where if it is a timestamp, has been converted
to tz-naive
+ """
+ import pandas as pd
+ try:
+ from pandas.api.types import is_datetime64tz_dtype,
is_datetime64_dtype
+ tz = timezone or 'tzlocal()'
+ # TODO: handle nested timestamps, such as
ArrayType(TimestampType())?
+ if is_datetime64tz_dtype(s.dtype):
+ return s.dt.tz_convert(tz).dt.tz_localize(None)
+ elif is_datetime64_dtype(s.dtype) and timezone is not None:
+ # `s.dt.tz_localize('tzlocal()')` doesn't work properly when
including NaT.
+ return s.apply(lambda ts:
ts.tz_localize('tzlocal()').tz_convert(tz).tz_localize(None)
+ if ts is not pd.NaT else pd.NaT)
+ else:
+ return s
+ except ImportError:
--- End diff --
We will be able to remove this block if we decide to support only Pandas
>= 0.19.2.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]