moskvax commented on a change in pull request #28743:
URL: https://github.com/apache/spark/pull/28743#discussion_r438051160
##########
File path: python/pyspark/sql/pandas/serializers.py
##########
@@ -150,15 +151,22 @@ def _create_batch(self, series):
series = ((s, None) if not isinstance(s, (list, tuple)) else s for s
in series)
def create_array(s, t):
- mask = s.isnull()
+ # Create with __arrow_array__ if the series' backing array
implements it
+ series_array = getattr(s, 'array', s._values)
+ if hasattr(series_array, "__arrow_array__"):
+ return series_array.__arrow_array__(type=t)
+
# Ensure timestamp series are in expected form for Spark internal
representation
if t is not None and pa.types.is_timestamp(t):
s = _check_series_convert_timestamps_internal(s,
self._timezone)
- elif type(s.dtype) == pd.CategoricalDtype:
+ elif is_categorical_dtype(s.dtype):
# Note: This can be removed once minimum pyarrow version is >=
0.16.1
s = s.astype(s.dtypes.categories.dtype)
try:
- array = pa.Array.from_pandas(s, mask=mask, type=t,
safe=self._safecheck)
+ mask = s.isnull()
+ # pass _ndarray_values to avoid potential failed type checks
from pandas array types
Review comment:
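This is a workaround for `IntegerArray` in pandas versions before 1.0.0, which did
not yet implement `__arrow_array__`. Without that protocol, pyarrow requires the
input to be a NumPy array, so passing the series directly fails, while passing
`s._ndarray_values` succeeds: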
```pycon
>>> import pandas as pd
>>> import pyarrow as pa
>>> print(pd.__version__, pa.__version__)
0.25.0 0.17.1
>>> s = pd.Series(range(3), dtype=pd.Int64Dtype())
>>> pa.Array.from_pandas(s)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "pyarrow/array.pxi", line 805, in pyarrow.lib.Array.from_pandas
File "pyarrow/array.pxi", line 265, in pyarrow.lib.array
File "pyarrow/types.pxi", line 76, in pyarrow.lib._datatype_to_pep3118
File "pyarrow/array.pxi", line 64, in pyarrow.lib._ndarray_to_type
File "pyarrow/error.pxi", line 108, in pyarrow.lib.check_status
pyarrow.lib.ArrowTypeError: Did not pass numpy.dtype object
>>> pa.Array.from_pandas(s, type=pa.int64())
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "pyarrow/array.pxi", line 805, in pyarrow.lib.Array.from_pandas
File "pyarrow/array.pxi", line 265, in pyarrow.lib.array
File "pyarrow/array.pxi", line 80, in pyarrow.lib._ndarray_to_array
File "pyarrow/error.pxi", line 85, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Input object was not a NumPy array
>>> pa.Array.from_pandas(s._ndarray_values, type=pa.int64())
<pyarrow.lib.Int64Array object at 0x7fb88007a980>
[
0,
1,
2
]
>>>
```
I'll update the code comment to mention this.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]