GitHub user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/19607#discussion_r153142413
--- Diff: python/pyspark/sql/types.py ---
@@ -1678,37 +1679,105 @@ def from_arrow_schema(arrow_schema):
for field in arrow_schema])
-def _check_dataframe_localize_timestamps(pdf):
+def _old_pandas_exception_message(e):
+ """ Create an error message for importing old Pandas.
"""
- Convert timezone aware timestamps to timezone-naive in local time
+ msg = "note: Pandas (>=0.19.2) must be installed and available on calling Python process"
+ return "%s\n%s" % (_exception_message(e), msg)
+
+
+def _check_dataframe_localize_timestamps(pdf, timezone):
+ """
+ Convert timezone aware timestamps to timezone-naive in the specified
timezone or local timezone
:param pdf: pandas.DataFrame
- :return pandas.DataFrame where any timezone aware columns have be
converted to tz-naive
+ :param timezone: the timezone to convert. if None then use local
timezone
+ :return pandas.DataFrame where any timezone aware columns have been
converted to tz-naive
"""
- from pandas.api.types import is_datetime64tz_dtype
+ try:
+ from pandas.api.types import is_datetime64tz_dtype
+ except ImportError as e:
+ raise ImportError(_old_pandas_exception_message(e))
+ tz = timezone or 'tzlocal()'
for column, series in pdf.iteritems():
# TODO: handle nested timestamps, such as
ArrayType(TimestampType())?
if is_datetime64tz_dtype(series.dtype):
- pdf[column] = series.dt.tz_convert('tzlocal()').dt.tz_localize(None)
+ pdf[column] = series.dt.tz_convert(tz).dt.tz_localize(None)
return pdf
-def _check_series_convert_timestamps_internal(s):
+def _check_series_convert_timestamps_internal(s, timezone):
"""
- Convert a tz-naive timestamp in local tz to UTC normalized for Spark
internal storage
+ Convert a tz-naive timestamp in the specified timezone or local
timezone to UTC normalized for
+ Spark internal storage
+
:param s: a pandas.Series
+ :param timezone: the timezone to convert. if None then use local
timezone
:return pandas.Series where if it is a timestamp, has been UTC
normalized without a time zone
"""
- from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
+ try:
+ from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
+ except ImportError as e:
+ raise ImportError(_old_pandas_exception_message(e))
# TODO: handle nested timestamps, such as ArrayType(TimestampType())?
if is_datetime64_dtype(s.dtype):
- return s.dt.tz_localize('tzlocal()').dt.tz_convert('UTC')
+ tz = timezone or 'tzlocal()'
+ return s.dt.tz_localize(tz).dt.tz_convert('UTC')
elif is_datetime64tz_dtype(s.dtype):
return s.dt.tz_convert('UTC')
else:
return s
+def _check_series_convert_timestamps_localize(s, fromTimezone, toTimezone):
+ """
+ Convert timestamp to timezone-naive in the specified timezone or local
timezone
+
+ :param s: a pandas.Series
+ :param fromTimezone: the timezone to convert from. if None then use
local timezone
+ :param toTimezone: the timezone to convert to. if None then use local
timezone
+ :return pandas.Series where if it is a timestamp, has been converted
to tz-naive
+ """
+ try:
+ import pandas as pd
+ from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
+ except ImportError as e:
+ raise ImportError(_old_pandas_exception_message(e))
+ fromTz = fromTimezone or 'tzlocal()'
--- End diff --
Ditto.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]