BryanCutler commented on a change in pull request #33980:
URL: https://github.com/apache/spark/pull/33980#discussion_r745214776
##########
File path: python/pyspark/sql/pandas/types.py
##########
@@ -294,8 +303,10 @@ def _check_series_convert_timestamps_localize(
from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
from_tz = from_timezone or _get_local_timezone()
to_tz = to_timezone or _get_local_timezone()
- # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
- if is_datetime64tz_dtype(s.dtype):
+ if datatype == ArrayType.__name__:
Review comment:
I don't think it's a good way to check for data types by using the
`__name__` attribute. Isn't there another way to check that the data type is an
ArrayType?
##########
File path: python/pyspark/sql/pandas/types.py
##########
@@ -356,7 +389,27 @@ def _convert_map_items_to_dict(s: "PandasSeriesLike") ->
"PandasSeriesLike":
return s.apply(lambda m: None if m is None else {k: v for k, v in m})
-def _convert_dict_to_map_items(s: "PandasSeriesLike") -> "PandasSeriesLike":
+def _is_series_contain_timestamp(s):
+ """
+ checks whether the series contain Timstamp object
+ :param s: pd.series
+ :return: True if the series contain timestamp object
+ """
+ from numpy import ndarray
+ import datetime
+ from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
+ if not s.empty and isinstance(s.dtype, object) and \
+ isinstance(s.iloc[0], list) and len(s.iloc[0]) > 0 and \
+ isinstance(s.iloc[0][0], datetime.datetime):
Review comment:
I don't think you can rely on checking the first value to determine the
data type
##########
File path: python/pyspark/sql/pandas/types.py
##########
@@ -306,8 +317,28 @@ def _check_series_convert_timestamps_localize(
return s
+def modify_timestamp_array(data, to_tz: Optional[str], from_tz: Optional[str]
= None
+ , is_utc: bool = False):
+ import pandas as pd
+ if data is None:
+ return [None]
Review comment:
Wouldn't this change a value in the series from None to an array with
one value that is None?
##########
File path: python/pyspark/sql/tests/test_arrow.py
##########
@@ -133,18 +133,29 @@ def test_toPandas_fallback_enabled(self):
user_warns = [
warn.message for warn in warns if
isinstance(warn.message, UserWarning)]
self.assertTrue(len(user_warns) > 0)
- self.assertTrue(
- "Attempting non-optimization" in
str(user_warns[-1]))
- assert_frame_equal(pdf, pd.DataFrame({"a": [[ts]]}))
+ self.assertTrue("Attempting non-optimization" in
str(user_warns[-1]))
+ assert_frame_equal(pdf, pd.DataFrame({"a": [[[ts]]]}))
def test_toPandas_fallback_disabled(self):
- schema = StructType([StructField("a", ArrayType(TimestampType()),
True)])
- df = self.spark.createDataFrame([(None,)], schema=schema)
+ ts = datetime.datetime(2015, 11, 1, 0, 30)
+ schema = StructType([StructField("a",
ArrayType(ArrayType(TimestampType())), True)])
+ df = self.spark.createDataFrame([([[ts]],)], schema=schema)
with QuietTest(self.sc):
with self.warnings_lock:
with self.assertRaisesRegex(Exception, 'Unsupported type'):
df.toPandas()
+ def test_toPandas_array_timestamp(self):
+ schema = StructType([
+ StructField("idx", LongType(), True),
+ StructField("timestamp_array", ArrayType(TimestampType()), True)])
+ data = [(0, [datetime.datetime(1969, 1, 1, 1, 1, 1)]),
+ (2, [datetime.datetime(2100, 3, 3, 3, 3, 3)])]
Review comment:
The data here is a little too simple; there should be multiple values,
including nulls.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]