Github user ueshin commented on a diff in the pull request:
https://github.com/apache/spark/pull/18664#discussion_r146469109
--- Diff: python/pyspark/serializers.py ---
```diff
@@ -224,7 +225,13 @@ def _create_batch(series):
     # If a nullable integer series has been promoted to floating point with NaNs, need to cast
     # NOTE: this is not necessary with Arrow >= 0.7
     def cast_series(s, t):
-        if t is None or s.dtype == t.to_pandas_dtype():
+        if type(t) == pa.TimestampType:
+            # NOTE: convert to 'us' with astype here, unit ignored in `from_pandas` see ARROW-1680
+            return _series_convert_timestamps_internal(s).values.astype('datetime64[us]')
+        elif t == pa.date32():
+            # TODO: ValueError: Cannot cast DatetimeIndex to dtype datetime64[D]
```
--- End diff ---
How about:
```diff
@@ -229,14 +229,20 @@ def _create_batch(series):
             # NOTE: convert to 'us' with astype here, unit ignored in `from_pandas` see ARROW-1680
             return _series_convert_timestamps_internal(s).values.astype('datetime64[us]')
         elif t == pa.date32():
-            # TODO: ValueError: Cannot cast DatetimeIndex to dtype datetime64[D]
-            return s.dt.values.astype('datetime64[D]')
+            return s.dt.date
         elif t is None or s.dtype == t.to_pandas_dtype():
             return s
         else:
             return s.fillna(0).astype(t.to_pandas_dtype(), copy=False)
-    arrs = [pa.Array.from_pandas(cast_series(s, t), mask=s.isnull(), type=t) for s, t in series]
+    def create_array(s, t):
+        casted = cast_series(s, t)
+        if casted.dtype == 'object':
+            return pa.Array.from_pandas(casted, type=t)
+        else:
+            return pa.Array.from_pandas(casted, mask=s.isnull(), type=t)
+
+    arrs = [create_array(s, t) for s, t in series]
     return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in xrange(len(arrs))])
```
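For context, here is a minimal self-contained sketch of how the suggested `cast_series`/`create_array` pairing behaves outside of Spark. The `convert_timestamps` helper below is a hypothetical stand-in for PySpark's `_series_convert_timestamps_internal` (the real helper also normalizes session-local timezones); everything else uses only public pandas/pyarrow APIs.

```python
import pandas as pd
import pyarrow as pa


def convert_timestamps(s):
    # Hypothetical stand-in for _series_convert_timestamps_internal:
    # the real PySpark helper also converts session-local timestamps
    # to UTC. Here we assume a tz-naive input and pass it through.
    return s


def cast_series(s, t):
    if type(t) == pa.TimestampType:
        # Convert to 'us' with astype; the unit is ignored in
        # from_pandas (see ARROW-1680).
        return convert_timestamps(s).values.astype('datetime64[us]')
    elif t == pa.date32():
        # s.dt.date yields python datetime.date objects, so the result
        # has dtype 'object' with NaT marking missing entries.
        return s.dt.date
    elif t is None or s.dtype == t.to_pandas_dtype():
        return s
    else:
        return s.fillna(0).astype(t.to_pandas_dtype(), copy=False)


def create_array(s, t):
    casted = cast_series(s, t)
    if casted.dtype == 'object':
        # An object-dtype result already carries its own nulls,
        # so skip the explicit validity mask.
        return pa.Array.from_pandas(casted, type=t)
    else:
        return pa.Array.from_pandas(casted, mask=s.isnull(), type=t)


# A column arriving as datetime64[ns] with a missing value:
s = pd.Series(pd.to_datetime(['2017-10-23', None, '2017-10-25']))
dates = create_array(s, pa.date32())          # nulls inferred from the objects
stamps = create_array(s, pa.timestamp('us'))  # nulls come from the mask
print(dates.null_count, stamps.null_count)    # 1 1
```

The split in `create_array` matters because an object-dtype series (as produced by `s.dt.date`) already encodes missing entries as `NaT`/`None`, so `from_pandas` can detect the nulls itself without the explicit mask.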