Github user ueshin commented on a diff in the pull request:
https://github.com/apache/spark/pull/19884#discussion_r155983224
--- Diff: python/pyspark/serializers.py ---
@@ -223,27 +223,13 @@ def _create_batch(series, timezone):
series = [series]
series = ((s, None) if not isinstance(s, (list, tuple)) else s for s in series)
- # If a nullable integer series has been promoted to floating point with NaNs, need to cast
- # NOTE: this is not necessary with Arrow >= 0.7
- def cast_series(s, t):
- if type(t) == pa.TimestampType:
- # NOTE: convert to 'us' with astype here, unit ignored in `from_pandas` see ARROW-1680
- return _check_series_convert_timestamps_internal(s.fillna(0), timezone)\
- .values.astype('datetime64[us]', copy=False)
- # NOTE: can not compare None with pyarrow.DataType(), fixed with Arrow >= 0.7.1
- elif t is not None and t == pa.date32():
- # TODO: this converts the series to Python objects, possibly avoid with Arrow >= 0.8
- return s.dt.date
- elif t is None or s.dtype == t.to_pandas_dtype():
- return s
- else:
- return s.fillna(0).astype(t.to_pandas_dtype(), copy=False)
-
- # Some object types don't support masks in Arrow, see ARROW-1721
def create_array(s, t):
- casted = cast_series(s, t)
- mask = None if casted.dtype == 'object' else s.isnull()
- return pa.Array.from_pandas(casted, mask=mask, type=t)
+ mask = s.isnull()
+ # Workaround for casting timestamp units with timezone, ARROW-1906
--- End diff ---
Will the fix for this workaround be included in Arrow 0.8?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]